1 /*
2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <math.h>
12 #include <stdlib.h>
13 #include <string.h>
14 
15 #include "./vpx_dsp_rtcd.h"
16 #include "vpx_dsp/inv_txfm.h"
17 
void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  /* 4x4 reversible, orthonormal inverse Walsh-Hadamard transform:
     3.5 adds, 0.5 shifts per pixel.  Rows first, then columns, with the
     column result added into the prediction in dest. */
  tran_low_t buf[16];
  tran_high_t t0, t1, t2, t3, mid;
  int r, c;

  // Pass 1: inverse-transform each input row into buf.
  for (r = 0; r < 4; ++r) {
    const tran_low_t *row = input + 4 * r;
    tran_low_t *dst_row = buf + 4 * r;
    t0 = row[0] >> UNIT_QUANT_SHIFT;
    t2 = row[1] >> UNIT_QUANT_SHIFT;
    t3 = row[2] >> UNIT_QUANT_SHIFT;
    t1 = row[3] >> UNIT_QUANT_SHIFT;
    t0 += t2;
    t3 -= t1;
    mid = (t0 - t3) >> 1;
    t1 = mid - t1;
    t2 = mid - t2;
    t0 -= t1;
    t3 += t2;
    dst_row[0] = WRAPLOW(t0);
    dst_row[1] = WRAPLOW(t1);
    dst_row[2] = WRAPLOW(t2);
    dst_row[3] = WRAPLOW(t3);
  }

  // Pass 2: inverse-transform each column of buf and add to dest.
  for (c = 0; c < 4; ++c) {
    const tran_low_t *col = buf + c;
    uint8_t *d = dest + c;
    t0 = col[4 * 0];
    t2 = col[4 * 1];
    t3 = col[4 * 2];
    t1 = col[4 * 3];
    t0 += t2;
    t3 -= t1;
    mid = (t0 - t3) >> 1;
    t1 = mid - t1;
    t2 = mid - t2;
    t0 -= t1;
    t3 += t2;
    d[stride * 0] = clip_pixel_add(d[stride * 0], WRAPLOW(t0));
    d[stride * 1] = clip_pixel_add(d[stride * 1], WRAPLOW(t1));
    d[stride * 2] = clip_pixel_add(d[stride * 2], WRAPLOW(t2));
    d[stride * 3] = clip_pixel_add(d[stride * 3], WRAPLOW(t3));
  }
}
69 
void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  /* DC-only 4x4 inverse WHT: only input[0] is non-zero, so the row pass
     reduces to one split of the DC value; the column pass repeats the
     same split per column and adds into dest. */
  int c;
  tran_high_t dc, half;
  tran_low_t row[4];

  dc = input[0] >> UNIT_QUANT_SHIFT;
  half = dc >> 1;
  dc -= half;
  row[0] = WRAPLOW(dc);
  row[1] = row[2] = row[3] = WRAPLOW(half);

  for (c = 0; c < 4; ++c) {
    const tran_high_t e = row[c] >> 1;
    const tran_high_t a = row[c] - e;
    uint8_t *d = dest + c;
    d[stride * 0] = clip_pixel_add(d[stride * 0], a);
    d[stride * 1] = clip_pixel_add(d[stride * 1], e);
    d[stride * 2] = clip_pixel_add(d[stride * 2], e);
    d[stride * 3] = clip_pixel_add(d[stride * 3], e);
  }
}
95 
void iadst4_c(const tran_low_t *input, tran_low_t *output) {
  /* 4-point inverse ADST (reference implementation). */
  tran_high_t t0, t1, t2, t3, t4, t5, t6, t7;
  const tran_low_t x0 = input[0];
  const tran_low_t x1 = input[1];
  const tran_low_t x2 = input[2];
  const tran_low_t x3 = input[3];

  // All-zero input yields all-zero output; skip the arithmetic.
  if (!(x0 | x1 | x2 | x3)) {
    memset(output, 0, 4 * sizeof(*output));
    return;
  }

  // 32-bit intermediates are wide enough for these products.
  t0 = sinpi_1_9 * x0;
  t1 = sinpi_2_9 * x0;
  t2 = sinpi_3_9 * x1;
  t3 = sinpi_4_9 * x2;
  t4 = sinpi_1_9 * x2;
  t5 = sinpi_2_9 * x3;
  t6 = sinpi_4_9 * x3;
  t7 = WRAPLOW(x0 - x2 + x3);

  t0 = t0 + t3 + t5;
  t1 = t1 - t4 - t6;
  t3 = t2;
  t2 = sinpi_3_9 * t7;

  // 1-D transform scaling factor is sqrt(2).  Dynamic range:
  // 14b input + 14b multiplication scaling + 1b addition = 29b,
  // hence 15b output after the rounding shift.
  output[0] = WRAPLOW(dct_const_round_shift(t0 + t3));
  output[1] = WRAPLOW(dct_const_round_shift(t1 + t3));
  output[2] = WRAPLOW(dct_const_round_shift(t2));
  output[3] = WRAPLOW(dct_const_round_shift(t0 + t1 - t3));
}
132 
void idct4_c(const tran_low_t *input, tran_low_t *output) {
  /* 4-point inverse DCT (reference implementation). */
  int16_t tmp[4];
  tran_high_t p0, p1;

  // stage 1: rotations on the even and odd coefficient pairs.
  p0 = ((int16_t)input[0] + (int16_t)input[2]) * cospi_16_64;
  p1 = ((int16_t)input[0] - (int16_t)input[2]) * cospi_16_64;
  tmp[0] = WRAPLOW(dct_const_round_shift(p0));
  tmp[1] = WRAPLOW(dct_const_round_shift(p1));
  p0 = (int16_t)input[1] * cospi_24_64 - (int16_t)input[3] * cospi_8_64;
  p1 = (int16_t)input[1] * cospi_8_64 + (int16_t)input[3] * cospi_24_64;
  tmp[2] = WRAPLOW(dct_const_round_shift(p0));
  tmp[3] = WRAPLOW(dct_const_round_shift(p1));

  // stage 2: butterfly combine.
  output[0] = WRAPLOW(tmp[0] + tmp[3]);
  output[1] = WRAPLOW(tmp[1] + tmp[2]);
  output[2] = WRAPLOW(tmp[1] - tmp[2]);
  output[3] = WRAPLOW(tmp[0] - tmp[3]);
}
153 
void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  /* Full 4x4 inverse DCT: row pass, column pass, final rounding by 4,
     then add into the prediction in dest. */
  tran_low_t intermediate[4 * 4];
  tran_low_t col_in[4], col_out[4];
  int r, c;

  // Row pass.
  for (r = 0; r < 4; ++r) idct4_c(input + 4 * r, intermediate + 4 * r);

  // Column pass with final rounding and reconstruction add.
  for (c = 0; c < 4; ++c) {
    for (r = 0; r < 4; ++r) col_in[r] = intermediate[r * 4 + c];
    idct4_c(col_in, col_out);
    for (r = 0; r < 4; ++r) {
      uint8_t *p = dest + r * stride + c;
      *p = clip_pixel_add(*p, ROUND_POWER_OF_TWO(col_out[r], 4));
    }
  }
}
177 
void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  /* DC-only 4x4 inverse DCT: both passes reduce to scaling the DC
     coefficient by cospi_16_64; the rounded result is added uniformly. */
  int r, c;
  tran_high_t dc;
  tran_low_t t =
      WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));

  t = WRAPLOW(dct_const_round_shift(t * cospi_16_64));
  dc = ROUND_POWER_OF_TWO(t, 4);

  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c) dest[c] = clip_pixel_add(dest[c], dc);
    dest += stride;
  }
}
195 
// 8-point inverse ADST (reference implementation).  The butterfly network
// below is order-sensitive fixed-point code: the placement of
// dct_const_round_shift/WRAPLOW between stages is part of the bitstream
// contract and must not be rearranged.
void iadst8_c(const tran_low_t *input, tran_low_t *output) {
  int s0, s1, s2, s3, s4, s5, s6, s7;
  // Input permutation required by the ADST flow graph.
  tran_high_t x0 = input[7];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[5];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[3];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[1];
  tran_high_t x7 = input[6];

  // All-zero input yields all-zero output; skip the arithmetic.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    memset(output, 0, 8 * sizeof(*output));
    return;
  }

  // stage 1: rotations by the odd cosine constants.
  s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1);
  s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1);
  s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
  s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
  s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
  s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
  s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7);
  s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7);

  x0 = WRAPLOW(dct_const_round_shift(s0 + s4));
  x1 = WRAPLOW(dct_const_round_shift(s1 + s5));
  x2 = WRAPLOW(dct_const_round_shift(s2 + s6));
  x3 = WRAPLOW(dct_const_round_shift(s3 + s7));
  x4 = WRAPLOW(dct_const_round_shift(s0 - s4));
  x5 = WRAPLOW(dct_const_round_shift(s1 - s5));
  x6 = WRAPLOW(dct_const_round_shift(s2 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s3 - s7));

  // stage 2: top half passes through; bottom half rotates by cospi_8/24.
  s0 = (int)x0;
  s1 = (int)x1;
  s2 = (int)x2;
  s3 = (int)x3;
  s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
  s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
  s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
  s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);

  // Pass-through sums need no rounding shift; rotated terms do.
  x0 = WRAPLOW(s0 + s2);
  x1 = WRAPLOW(s1 + s3);
  x2 = WRAPLOW(s0 - s2);
  x3 = WRAPLOW(s1 - s3);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));

  // stage 3: final cospi_16_64 rotations.
  s2 = (int)(cospi_16_64 * (x2 + x3));
  s3 = (int)(cospi_16_64 * (x2 - x3));
  s6 = (int)(cospi_16_64 * (x6 + x7));
  s7 = (int)(cospi_16_64 * (x6 - x7));

  x2 = WRAPLOW(dct_const_round_shift(s2));
  x3 = WRAPLOW(dct_const_round_shift(s3));
  x6 = WRAPLOW(dct_const_round_shift(s6));
  x7 = WRAPLOW(dct_const_round_shift(s7));

  // Output permutation with sign flips on alternating terms.
  output[0] = WRAPLOW(x0);
  output[1] = WRAPLOW(-x4);
  output[2] = WRAPLOW(x6);
  output[3] = WRAPLOW(-x2);
  output[4] = WRAPLOW(x3);
  output[5] = WRAPLOW(-x7);
  output[6] = WRAPLOW(x5);
  output[7] = WRAPLOW(-x1);
}
270 
// 8-point inverse DCT (reference implementation).  Four-stage butterfly
// network; the dct_const_round_shift/WRAPLOW placement between stages is
// part of the bitstream contract and must not be rearranged.
void idct8_c(const tran_low_t *input, tran_low_t *output) {
  int16_t step1[8], step2[8];
  tran_high_t temp1, temp2;

  // stage 1: even coefficients pass through; odd coefficients rotate.
  step1[0] = (int16_t)input[0];
  step1[2] = (int16_t)input[4];
  step1[1] = (int16_t)input[2];
  step1[3] = (int16_t)input[6];
  temp1 = (int16_t)input[1] * cospi_28_64 - (int16_t)input[7] * cospi_4_64;
  temp2 = (int16_t)input[1] * cospi_4_64 + (int16_t)input[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (int16_t)input[5] * cospi_12_64 - (int16_t)input[3] * cospi_20_64;
  temp2 = (int16_t)input[5] * cospi_20_64 + (int16_t)input[3] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 2: 4-point idct on the even half; add/sub on the odd half.
  temp1 = (step1[0] + step1[2]) * cospi_16_64;
  temp2 = (step1[0] - step1[2]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
  step2[4] = WRAPLOW(step1[4] + step1[5]);
  step2[5] = WRAPLOW(step1[4] - step1[5]);
  step2[6] = WRAPLOW(-step1[6] + step1[7]);
  step2[7] = WRAPLOW(step1[6] + step1[7]);

  // stage 3: combine even half; rotate the middle odd pair.
  step1[0] = WRAPLOW(step2[0] + step2[3]);
  step1[1] = WRAPLOW(step2[1] + step2[2]);
  step1[2] = WRAPLOW(step2[1] - step2[2]);
  step1[3] = WRAPLOW(step2[0] - step2[3]);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
  step1[7] = step2[7];

  // stage 4: final butterfly producing the natural-order output.
  output[0] = WRAPLOW(step1[0] + step1[7]);
  output[1] = WRAPLOW(step1[1] + step1[6]);
  output[2] = WRAPLOW(step1[2] + step1[5]);
  output[3] = WRAPLOW(step1[3] + step1[4]);
  output[4] = WRAPLOW(step1[3] - step1[4]);
  output[5] = WRAPLOW(step1[2] - step1[5]);
  output[6] = WRAPLOW(step1[1] - step1[6]);
  output[7] = WRAPLOW(step1[0] - step1[7]);
}
325 
void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  /* Full 8x8 inverse DCT (all 64 coefficients): row pass, column pass,
     final rounding by 5, then add into the prediction in dest. */
  tran_low_t intermediate[8 * 8];
  tran_low_t col_in[8], col_out[8];
  int r, c;

  // Row pass over all 8 rows.
  for (r = 0; r < 8; ++r) idct8_c(input + 8 * r, intermediate + 8 * r);

  // Column pass with final rounding and reconstruction add.
  for (c = 0; c < 8; ++c) {
    for (r = 0; r < 8; ++r) col_in[r] = intermediate[r * 8 + c];
    idct8_c(col_in, col_out);
    for (r = 0; r < 8; ++r) {
      uint8_t *p = dest + r * stride + c;
      *p = clip_pixel_add(*p, ROUND_POWER_OF_TWO(col_out[r], 5));
    }
  }
}
349 
void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  /* 8x8 inverse DCT when at most 12 coefficients are non-zero: all of
     them lie in the first 4 rows, so only those rows are transformed;
     the rest of the intermediate buffer stays zero. */
  tran_low_t intermediate[8 * 8] = { 0 };
  tran_low_t col_in[8], col_out[8];
  int r, c;

  // Row pass: only the first 4 rows can hold non-zero coefficients.
  for (r = 0; r < 4; ++r) idct8_c(input + 8 * r, intermediate + 8 * r);

  // Column pass over all 8 columns with final rounding by 5.
  for (c = 0; c < 8; ++c) {
    for (r = 0; r < 8; ++r) col_in[r] = intermediate[r * 8 + c];
    idct8_c(col_in, col_out);
    for (r = 0; r < 8; ++r) {
      uint8_t *p = dest + r * stride + c;
      *p = clip_pixel_add(*p, ROUND_POWER_OF_TWO(col_out[r], 5));
    }
  }
}
374 
void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  /* DC-only 8x8 inverse DCT: both passes reduce to scaling the DC
     coefficient by cospi_16_64; the rounded result is added uniformly. */
  int r, c;
  tran_high_t dc;
  tran_low_t t =
      WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));

  t = WRAPLOW(dct_const_round_shift(t * cospi_16_64));
  dc = ROUND_POWER_OF_TWO(t, 5);
  for (r = 0; r < 8; ++r) {
    for (c = 0; c < 8; ++c) dest[c] = clip_pixel_add(dest[c], dc);
    dest += stride;
  }
}
388 
// 16-point inverse ADST (reference implementation).  Four-stage
// order-sensitive fixed-point butterfly network; the placement of
// dct_const_round_shift/WRAPLOW between stages is part of the bitstream
// contract and must not be rearranged.
void iadst16_c(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;
  // Input permutation required by the ADST flow graph.
  tran_high_t x0 = input[15];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[13];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[11];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[9];
  tran_high_t x7 = input[6];
  tran_high_t x8 = input[7];
  tran_high_t x9 = input[8];
  tran_high_t x10 = input[5];
  tran_high_t x11 = input[10];
  tran_high_t x12 = input[3];
  tran_high_t x13 = input[12];
  tran_high_t x14 = input[1];
  tran_high_t x15 = input[14];

  // All-zero input yields all-zero output; skip the arithmetic.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
        x13 | x14 | x15)) {
    memset(output, 0, 16 * sizeof(*output));
    return;
  }

  // stage 1: rotations by the odd cosine constants.
  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;

  x0 = WRAPLOW(dct_const_round_shift(s0 + s8));
  x1 = WRAPLOW(dct_const_round_shift(s1 + s9));
  x2 = WRAPLOW(dct_const_round_shift(s2 + s10));
  x3 = WRAPLOW(dct_const_round_shift(s3 + s11));
  x4 = WRAPLOW(dct_const_round_shift(s4 + s12));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s13));
  x6 = WRAPLOW(dct_const_round_shift(s6 + s14));
  x7 = WRAPLOW(dct_const_round_shift(s7 + s15));
  x8 = WRAPLOW(dct_const_round_shift(s0 - s8));
  x9 = WRAPLOW(dct_const_round_shift(s1 - s9));
  x10 = WRAPLOW(dct_const_round_shift(s2 - s10));
  x11 = WRAPLOW(dct_const_round_shift(s3 - s11));
  x12 = WRAPLOW(dct_const_round_shift(s4 - s12));
  x13 = WRAPLOW(dct_const_round_shift(s5 - s13));
  x14 = WRAPLOW(dct_const_round_shift(s6 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s7 - s15));

  // stage 2: first half passes through; second half rotates by cospi_4/28
  // and cospi_12/20.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;

  // Pass-through sums need no rounding shift; rotated terms do.
  x0 = WRAPLOW(s0 + s4);
  x1 = WRAPLOW(s1 + s5);
  x2 = WRAPLOW(s2 + s6);
  x3 = WRAPLOW(s3 + s7);
  x4 = WRAPLOW(s0 - s4);
  x5 = WRAPLOW(s1 - s5);
  x6 = WRAPLOW(s2 - s6);
  x7 = WRAPLOW(s3 - s7);
  x8 = WRAPLOW(dct_const_round_shift(s8 + s12));
  x9 = WRAPLOW(dct_const_round_shift(s9 + s13));
  x10 = WRAPLOW(dct_const_round_shift(s10 + s14));
  x11 = WRAPLOW(dct_const_round_shift(s11 + s15));
  x12 = WRAPLOW(dct_const_round_shift(s8 - s12));
  x13 = WRAPLOW(dct_const_round_shift(s9 - s13));
  x14 = WRAPLOW(dct_const_round_shift(s10 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s11 - s15));

  // stage 3: cospi_8/24 rotations on the inner quads.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;

  x0 = WRAPLOW(s0 + s2);
  x1 = WRAPLOW(s1 + s3);
  x2 = WRAPLOW(s0 - s2);
  x3 = WRAPLOW(s1 - s3);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
  x8 = WRAPLOW(s8 + s10);
  x9 = WRAPLOW(s9 + s11);
  x10 = WRAPLOW(s8 - s10);
  x11 = WRAPLOW(s9 - s11);
  x12 = WRAPLOW(dct_const_round_shift(s12 + s14));
  x13 = WRAPLOW(dct_const_round_shift(s13 + s15));
  x14 = WRAPLOW(dct_const_round_shift(s12 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s13 - s15));

  // stage 4: final cospi_16_64 rotations.
  s2 = (-cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (-x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (-x10 + x11);
  s14 = (-cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);

  x2 = WRAPLOW(dct_const_round_shift(s2));
  x3 = WRAPLOW(dct_const_round_shift(s3));
  x6 = WRAPLOW(dct_const_round_shift(s6));
  x7 = WRAPLOW(dct_const_round_shift(s7));
  x10 = WRAPLOW(dct_const_round_shift(s10));
  x11 = WRAPLOW(dct_const_round_shift(s11));
  x14 = WRAPLOW(dct_const_round_shift(s14));
  x15 = WRAPLOW(dct_const_round_shift(s15));

  // Output permutation with sign flips on selected terms.
  output[0] = WRAPLOW(x0);
  output[1] = WRAPLOW(-x8);
  output[2] = WRAPLOW(x12);
  output[3] = WRAPLOW(-x4);
  output[4] = WRAPLOW(x6);
  output[5] = WRAPLOW(x14);
  output[6] = WRAPLOW(x10);
  output[7] = WRAPLOW(x2);
  output[8] = WRAPLOW(x3);
  output[9] = WRAPLOW(x11);
  output[10] = WRAPLOW(x15);
  output[11] = WRAPLOW(x7);
  output[12] = WRAPLOW(x5);
  output[13] = WRAPLOW(-x13);
  output[14] = WRAPLOW(x9);
  output[15] = WRAPLOW(-x1);
}
556 
// 16-point inverse DCT (reference implementation).  Seven-stage butterfly
// network; the dct_const_round_shift/WRAPLOW placement between stages is
// part of the bitstream contract and must not be rearranged.
void idct16_c(const tran_low_t *input, tran_low_t *output) {
  int16_t step1[16], step2[16];
  tran_high_t temp1, temp2;

  // stage 1: load in bit-reversed order.  Indices are written as the
  // even coefficient positions of a 32-point transform divided by two —
  // presumably to mirror the 32-point layout; TODO confirm against
  // idct32_c.
  step1[0] = (int16_t)input[0 / 2];
  step1[1] = (int16_t)input[16 / 2];
  step1[2] = (int16_t)input[8 / 2];
  step1[3] = (int16_t)input[24 / 2];
  step1[4] = (int16_t)input[4 / 2];
  step1[5] = (int16_t)input[20 / 2];
  step1[6] = (int16_t)input[12 / 2];
  step1[7] = (int16_t)input[28 / 2];
  step1[8] = (int16_t)input[2 / 2];
  step1[9] = (int16_t)input[18 / 2];
  step1[10] = (int16_t)input[10 / 2];
  step1[11] = (int16_t)input[26 / 2];
  step1[12] = (int16_t)input[6 / 2];
  step1[13] = (int16_t)input[22 / 2];
  step1[14] = (int16_t)input[14 / 2];
  step1[15] = (int16_t)input[30 / 2];

  // stage 2: first half passes through; second half rotates by the odd
  // cosine constants.
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
  step2[15] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 3: cospi_4/28 and cospi_12/20 rotations on indices 4..7;
  // add/sub butterflies on indices 8..15.
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));

  step1[8] = WRAPLOW(step2[8] + step2[9]);
  step1[9] = WRAPLOW(step2[8] - step2[9]);
  step1[10] = WRAPLOW(-step2[10] + step2[11]);
  step1[11] = WRAPLOW(step2[10] + step2[11]);
  step1[12] = WRAPLOW(step2[12] + step2[13]);
  step1[13] = WRAPLOW(step2[12] - step2[13]);
  step1[14] = WRAPLOW(-step2[14] + step2[15]);
  step1[15] = WRAPLOW(step2[14] + step2[15]);

  // stage 4: 4-point idct head on indices 0..3; butterflies on 4..7;
  // cospi_8/24 rotations on 9/14 and 10/13.
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
  step2[4] = WRAPLOW(step1[4] + step1[5]);
  step2[5] = WRAPLOW(step1[4] - step1[5]);
  step2[6] = WRAPLOW(-step1[6] + step1[7]);
  step2[7] = WRAPLOW(step1[6] + step1[7]);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5: combine the even half; cospi_16_64 rotation on 5/6;
  // butterflies on the odd half.
  step1[0] = WRAPLOW(step2[0] + step2[3]);
  step1[1] = WRAPLOW(step2[1] + step2[2]);
  step1[2] = WRAPLOW(step2[1] - step2[2]);
  step1[3] = WRAPLOW(step2[0] - step2[3]);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
  step1[7] = step2[7];

  step1[8] = WRAPLOW(step2[8] + step2[11]);
  step1[9] = WRAPLOW(step2[9] + step2[10]);
  step1[10] = WRAPLOW(step2[9] - step2[10]);
  step1[11] = WRAPLOW(step2[8] - step2[11]);
  step1[12] = WRAPLOW(-step2[12] + step2[15]);
  step1[13] = WRAPLOW(-step2[13] + step2[14]);
  step1[14] = WRAPLOW(step2[13] + step2[14]);
  step1[15] = WRAPLOW(step2[12] + step2[15]);

  // stage 6: 8-point butterfly on the even half; cospi_16_64 rotations
  // on 10/13 and 11/12.
  step2[0] = WRAPLOW(step1[0] + step1[7]);
  step2[1] = WRAPLOW(step1[1] + step1[6]);
  step2[2] = WRAPLOW(step1[2] + step1[5]);
  step2[3] = WRAPLOW(step1[3] + step1[4]);
  step2[4] = WRAPLOW(step1[3] - step1[4]);
  step2[5] = WRAPLOW(step1[2] - step1[5]);
  step2[6] = WRAPLOW(step1[1] - step1[6]);
  step2[7] = WRAPLOW(step1[0] - step1[7]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7: final butterfly producing the natural-order output.
  output[0] = (tran_low_t)WRAPLOW(step2[0] + step2[15]);
  output[1] = (tran_low_t)WRAPLOW(step2[1] + step2[14]);
  output[2] = (tran_low_t)WRAPLOW(step2[2] + step2[13]);
  output[3] = (tran_low_t)WRAPLOW(step2[3] + step2[12]);
  output[4] = (tran_low_t)WRAPLOW(step2[4] + step2[11]);
  output[5] = (tran_low_t)WRAPLOW(step2[5] + step2[10]);
  output[6] = (tran_low_t)WRAPLOW(step2[6] + step2[9]);
  output[7] = (tran_low_t)WRAPLOW(step2[7] + step2[8]);
  output[8] = (tran_low_t)WRAPLOW(step2[7] - step2[8]);
  output[9] = (tran_low_t)WRAPLOW(step2[6] - step2[9]);
  output[10] = (tran_low_t)WRAPLOW(step2[5] - step2[10]);
  output[11] = (tran_low_t)WRAPLOW(step2[4] - step2[11]);
  output[12] = (tran_low_t)WRAPLOW(step2[3] - step2[12]);
  output[13] = (tran_low_t)WRAPLOW(step2[2] - step2[13]);
  output[14] = (tran_low_t)WRAPLOW(step2[1] - step2[14]);
  output[15] = (tran_low_t)WRAPLOW(step2[0] - step2[15]);
}
721 
void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  /* Full 16x16 inverse DCT (all 256 coefficients): row pass, column
     pass, final rounding by 6, then add into the prediction in dest. */
  tran_low_t intermediate[16 * 16];
  tran_low_t col_in[16], col_out[16];
  int r, c;

  // Row pass over all 16 rows.
  for (r = 0; r < 16; ++r) idct16_c(input + 16 * r, intermediate + 16 * r);

  // Column pass with final rounding and reconstruction add.
  for (c = 0; c < 16; ++c) {
    for (r = 0; r < 16; ++r) col_in[r] = intermediate[r * 16 + c];
    idct16_c(col_in, col_out);
    for (r = 0; r < 16; ++r) {
      uint8_t *p = dest + r * stride + c;
      *p = clip_pixel_add(*p, ROUND_POWER_OF_TWO(col_out[r], 6));
    }
  }
}
746 
void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  /* 16x16 inverse DCT when all non-zero coefficients fit in the
     upper-left 8x8 area: only the first 8 rows need the row pass; the
     rest of the intermediate buffer stays zero. */
  tran_low_t intermediate[16 * 16] = { 0 };
  tran_low_t col_in[16], col_out[16];
  int r, c;

  // Row pass: only the first 8 rows can hold non-zero coefficients.
  for (r = 0; r < 8; ++r) idct16_c(input + 16 * r, intermediate + 16 * r);

  // Column pass over all 16 columns with final rounding by 6.
  for (c = 0; c < 16; ++c) {
    for (r = 0; r < 16; ++r) col_in[r] = intermediate[r * 16 + c];
    idct16_c(col_in, col_out);
    for (r = 0; r < 16; ++r) {
      uint8_t *p = dest + r * stride + c;
      *p = clip_pixel_add(*p, ROUND_POWER_OF_TWO(col_out[r], 6));
    }
  }
}
772 
void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  /* 16x16 inverse DCT when all non-zero coefficients fit in the
     upper-left 4x4 area: only the first 4 rows need the row pass; the
     rest of the intermediate buffer stays zero. */
  tran_low_t intermediate[16 * 16] = { 0 };
  tran_low_t col_in[16], col_out[16];
  int r, c;

  // Row pass: only the first 4 rows can hold non-zero coefficients.
  for (r = 0; r < 4; ++r) idct16_c(input + 16 * r, intermediate + 16 * r);

  // Column pass over all 16 columns with final rounding by 6.
  for (c = 0; c < 16; ++c) {
    for (r = 0; r < 16; ++r) col_in[r] = intermediate[r * 16 + c];
    idct16_c(col_in, col_out);
    for (r = 0; r < 16; ++r) {
      uint8_t *p = dest + r * stride + c;
      *p = clip_pixel_add(*p, ROUND_POWER_OF_TWO(col_out[r], 6));
    }
  }
}
798 
void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  /* DC-only 16x16 inverse DCT: both passes reduce to scaling the DC
     coefficient by cospi_16_64; the rounded result is added uniformly. */
  int r, c;
  tran_high_t dc;
  tran_low_t t =
      WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));

  t = WRAPLOW(dct_const_round_shift(t * cospi_16_64));
  dc = ROUND_POWER_OF_TWO(t, 6);
  for (r = 0; r < 16; ++r) {
    for (c = 0; c < 16; ++c) dest[c] = clip_pixel_add(dest[c], dc);
    dest += stride;
  }
}
812 
// 32-point 1-D inverse DCT (one row or column pass of the 2-D transform).
// Stage 1 permutes the inputs and rotates the odd-index pairs; the next six
// stages are butterfly add/sub and cospi rotations feeding the final
// add/sub stage that produces the 32 outputs. Intermediates are held in
// int16_t via WRAPLOW, and every rotation result goes through
// dct_const_round_shift (both defined in vpx_dsp/inv_txfm.h); the exact
// statement order below is load-bearing because step1/step2 are reused
// in place across stages.
void idct32_c(const tran_low_t *input, tran_low_t *output) {
  int16_t step1[32], step2[32];
  tran_high_t temp1, temp2;

  // stage 1: even-index inputs are passed through (bit-reversed placement);
  // odd-index inputs are paired and rotated by cospi_{1,15,9,7,5,11,13,3}.
  step1[0] = (int16_t)input[0];
  step1[1] = (int16_t)input[16];
  step1[2] = (int16_t)input[8];
  step1[3] = (int16_t)input[24];
  step1[4] = (int16_t)input[4];
  step1[5] = (int16_t)input[20];
  step1[6] = (int16_t)input[12];
  step1[7] = (int16_t)input[28];
  step1[8] = (int16_t)input[2];
  step1[9] = (int16_t)input[18];
  step1[10] = (int16_t)input[10];
  step1[11] = (int16_t)input[26];
  step1[12] = (int16_t)input[6];
  step1[13] = (int16_t)input[22];
  step1[14] = (int16_t)input[14];
  step1[15] = (int16_t)input[30];

  temp1 = (int16_t)input[1] * cospi_31_64 - (int16_t)input[31] * cospi_1_64;
  temp2 = (int16_t)input[1] * cospi_1_64 + (int16_t)input[31] * cospi_31_64;
  step1[16] = WRAPLOW(dct_const_round_shift(temp1));
  step1[31] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = (int16_t)input[17] * cospi_15_64 - (int16_t)input[15] * cospi_17_64;
  temp2 = (int16_t)input[17] * cospi_17_64 + (int16_t)input[15] * cospi_15_64;
  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
  step1[30] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = (int16_t)input[9] * cospi_23_64 - (int16_t)input[23] * cospi_9_64;
  temp2 = (int16_t)input[9] * cospi_9_64 + (int16_t)input[23] * cospi_23_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
  step1[29] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = (int16_t)input[25] * cospi_7_64 - (int16_t)input[7] * cospi_25_64;
  temp2 = (int16_t)input[25] * cospi_25_64 + (int16_t)input[7] * cospi_7_64;
  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
  step1[28] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = (int16_t)input[5] * cospi_27_64 - (int16_t)input[27] * cospi_5_64;
  temp2 = (int16_t)input[5] * cospi_5_64 + (int16_t)input[27] * cospi_27_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
  step1[27] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = (int16_t)input[21] * cospi_11_64 - (int16_t)input[11] * cospi_21_64;
  temp2 = (int16_t)input[21] * cospi_21_64 + (int16_t)input[11] * cospi_11_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = (int16_t)input[13] * cospi_19_64 - (int16_t)input[19] * cospi_13_64;
  temp2 = (int16_t)input[13] * cospi_13_64 + (int16_t)input[19] * cospi_19_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
  step1[25] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = (int16_t)input[29] * cospi_3_64 - (int16_t)input[3] * cospi_29_64;
  temp2 = (int16_t)input[29] * cospi_29_64 + (int16_t)input[3] * cospi_3_64;
  step1[23] = WRAPLOW(dct_const_round_shift(temp1));
  step1[24] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 2: rotate indices 8..15; butterfly add/sub pairs in 16..31.
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
  step2[15] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));

  step2[16] = WRAPLOW(step1[16] + step1[17]);
  step2[17] = WRAPLOW(step1[16] - step1[17]);
  step2[18] = WRAPLOW(-step1[18] + step1[19]);
  step2[19] = WRAPLOW(step1[18] + step1[19]);
  step2[20] = WRAPLOW(step1[20] + step1[21]);
  step2[21] = WRAPLOW(step1[20] - step1[21]);
  step2[22] = WRAPLOW(-step1[22] + step1[23]);
  step2[23] = WRAPLOW(step1[22] + step1[23]);
  step2[24] = WRAPLOW(step1[24] + step1[25]);
  step2[25] = WRAPLOW(step1[24] - step1[25]);
  step2[26] = WRAPLOW(-step1[26] + step1[27]);
  step2[27] = WRAPLOW(step1[26] + step1[27]);
  step2[28] = WRAPLOW(step1[28] + step1[29]);
  step2[29] = WRAPLOW(step1[28] - step1[29]);
  step2[30] = WRAPLOW(-step1[30] + step1[31]);
  step2[31] = WRAPLOW(step1[30] + step1[31]);

  // stage 3: rotate 4..7; butterfly 8..15; cospi_4/28 rotations in 16..31.
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));

  step1[8] = WRAPLOW(step2[8] + step2[9]);
  step1[9] = WRAPLOW(step2[8] - step2[9]);
  step1[10] = WRAPLOW(-step2[10] + step2[11]);
  step1[11] = WRAPLOW(step2[10] + step2[11]);
  step1[12] = WRAPLOW(step2[12] + step2[13]);
  step1[13] = WRAPLOW(step2[12] - step2[13]);
  step1[14] = WRAPLOW(-step2[14] + step2[15]);
  step1[15] = WRAPLOW(step2[14] + step2[15]);

  step1[16] = step2[16];
  step1[31] = step2[31];
  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
  step1[30] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
  step1[19] = step2[19];
  step1[20] = step2[20];
  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[27] = step2[27];
  step1[28] = step2[28];

  // stage 4: rotate 0..3; butterfly 4..7; cospi_8/24 rotations in 8..15;
  // butterflies in 16..31.
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
  step2[4] = WRAPLOW(step1[4] + step1[5]);
  step2[5] = WRAPLOW(step1[4] - step1[5]);
  step2[6] = WRAPLOW(-step1[6] + step1[7]);
  step2[7] = WRAPLOW(step1[6] + step1[7]);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  step2[11] = step1[11];
  step2[12] = step1[12];

  step2[16] = WRAPLOW(step1[16] + step1[19]);
  step2[17] = WRAPLOW(step1[17] + step1[18]);
  step2[18] = WRAPLOW(step1[17] - step1[18]);
  step2[19] = WRAPLOW(step1[16] - step1[19]);
  step2[20] = WRAPLOW(-step1[20] + step1[23]);
  step2[21] = WRAPLOW(-step1[21] + step1[22]);
  step2[22] = WRAPLOW(step1[21] + step1[22]);
  step2[23] = WRAPLOW(step1[20] + step1[23]);

  step2[24] = WRAPLOW(step1[24] + step1[27]);
  step2[25] = WRAPLOW(step1[25] + step1[26]);
  step2[26] = WRAPLOW(step1[25] - step1[26]);
  step2[27] = WRAPLOW(step1[24] - step1[27]);
  step2[28] = WRAPLOW(-step1[28] + step1[31]);
  step2[29] = WRAPLOW(-step1[29] + step1[30]);
  step2[30] = WRAPLOW(step1[29] + step1[30]);
  step2[31] = WRAPLOW(step1[28] + step1[31]);

  // stage 5
  step1[0] = WRAPLOW(step2[0] + step2[3]);
  step1[1] = WRAPLOW(step2[1] + step2[2]);
  step1[2] = WRAPLOW(step2[1] - step2[2]);
  step1[3] = WRAPLOW(step2[0] - step2[3]);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
  step1[7] = step2[7];

  step1[8] = WRAPLOW(step2[8] + step2[11]);
  step1[9] = WRAPLOW(step2[9] + step2[10]);
  step1[10] = WRAPLOW(step2[9] - step2[10]);
  step1[11] = WRAPLOW(step2[8] - step2[11]);
  step1[12] = WRAPLOW(-step2[12] + step2[15]);
  step1[13] = WRAPLOW(-step2[13] + step2[14]);
  step1[14] = WRAPLOW(step2[13] + step2[14]);
  step1[15] = WRAPLOW(step2[12] + step2[15]);

  step1[16] = step2[16];
  step1[17] = step2[17];
  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
  step1[28] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
  step1[22] = step2[22];
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[25] = step2[25];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // stage 6
  step2[0] = WRAPLOW(step1[0] + step1[7]);
  step2[1] = WRAPLOW(step1[1] + step1[6]);
  step2[2] = WRAPLOW(step1[2] + step1[5]);
  step2[3] = WRAPLOW(step1[3] + step1[4]);
  step2[4] = WRAPLOW(step1[3] - step1[4]);
  step2[5] = WRAPLOW(step1[2] - step1[5]);
  step2[6] = WRAPLOW(step1[1] - step1[6]);
  step2[7] = WRAPLOW(step1[0] - step1[7]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
  step2[14] = step1[14];
  step2[15] = step1[15];

  step2[16] = WRAPLOW(step1[16] + step1[23]);
  step2[17] = WRAPLOW(step1[17] + step1[22]);
  step2[18] = WRAPLOW(step1[18] + step1[21]);
  step2[19] = WRAPLOW(step1[19] + step1[20]);
  step2[20] = WRAPLOW(step1[19] - step1[20]);
  step2[21] = WRAPLOW(step1[18] - step1[21]);
  step2[22] = WRAPLOW(step1[17] - step1[22]);
  step2[23] = WRAPLOW(step1[16] - step1[23]);

  step2[24] = WRAPLOW(-step1[24] + step1[31]);
  step2[25] = WRAPLOW(-step1[25] + step1[30]);
  step2[26] = WRAPLOW(-step1[26] + step1[29]);
  step2[27] = WRAPLOW(-step1[27] + step1[28]);
  step2[28] = WRAPLOW(step1[27] + step1[28]);
  step2[29] = WRAPLOW(step1[26] + step1[29]);
  step2[30] = WRAPLOW(step1[25] + step1[30]);
  step2[31] = WRAPLOW(step1[24] + step1[31]);

  // stage 7
  step1[0] = WRAPLOW(step2[0] + step2[15]);
  step1[1] = WRAPLOW(step2[1] + step2[14]);
  step1[2] = WRAPLOW(step2[2] + step2[13]);
  step1[3] = WRAPLOW(step2[3] + step2[12]);
  step1[4] = WRAPLOW(step2[4] + step2[11]);
  step1[5] = WRAPLOW(step2[5] + step2[10]);
  step1[6] = WRAPLOW(step2[6] + step2[9]);
  step1[7] = WRAPLOW(step2[7] + step2[8]);
  step1[8] = WRAPLOW(step2[7] - step2[8]);
  step1[9] = WRAPLOW(step2[6] - step2[9]);
  step1[10] = WRAPLOW(step2[5] - step2[10]);
  step1[11] = WRAPLOW(step2[4] - step2[11]);
  step1[12] = WRAPLOW(step2[3] - step2[12]);
  step1[13] = WRAPLOW(step2[2] - step2[13]);
  step1[14] = WRAPLOW(step2[1] - step2[14]);
  step1[15] = WRAPLOW(step2[0] - step2[15]);

  step1[16] = step2[16];
  step1[17] = step2[17];
  step1[18] = step2[18];
  step1[19] = step2[19];
  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
  temp2 = (step2[20] + step2[27]) * cospi_16_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
  temp2 = (step2[21] + step2[26]) * cospi_16_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
  temp2 = (step2[22] + step2[25]) * cospi_16_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
  temp2 = (step2[23] + step2[24]) * cospi_16_64;
  step1[23] = WRAPLOW(dct_const_round_shift(temp1));
  step1[24] = WRAPLOW(dct_const_round_shift(temp2));
  step1[28] = step2[28];
  step1[29] = step2[29];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // final stage: output[k] and output[31-k] are the sum/difference of the
  // mirrored step1 pairs.
  output[0] = WRAPLOW(step1[0] + step1[31]);
  output[1] = WRAPLOW(step1[1] + step1[30]);
  output[2] = WRAPLOW(step1[2] + step1[29]);
  output[3] = WRAPLOW(step1[3] + step1[28]);
  output[4] = WRAPLOW(step1[4] + step1[27]);
  output[5] = WRAPLOW(step1[5] + step1[26]);
  output[6] = WRAPLOW(step1[6] + step1[25]);
  output[7] = WRAPLOW(step1[7] + step1[24]);
  output[8] = WRAPLOW(step1[8] + step1[23]);
  output[9] = WRAPLOW(step1[9] + step1[22]);
  output[10] = WRAPLOW(step1[10] + step1[21]);
  output[11] = WRAPLOW(step1[11] + step1[20]);
  output[12] = WRAPLOW(step1[12] + step1[19]);
  output[13] = WRAPLOW(step1[13] + step1[18]);
  output[14] = WRAPLOW(step1[14] + step1[17]);
  output[15] = WRAPLOW(step1[15] + step1[16]);
  output[16] = WRAPLOW(step1[15] - step1[16]);
  output[17] = WRAPLOW(step1[14] - step1[17]);
  output[18] = WRAPLOW(step1[13] - step1[18]);
  output[19] = WRAPLOW(step1[12] - step1[19]);
  output[20] = WRAPLOW(step1[11] - step1[20]);
  output[21] = WRAPLOW(step1[10] - step1[21]);
  output[22] = WRAPLOW(step1[9] - step1[22]);
  output[23] = WRAPLOW(step1[8] - step1[23]);
  output[24] = WRAPLOW(step1[7] - step1[24]);
  output[25] = WRAPLOW(step1[6] - step1[25]);
  output[26] = WRAPLOW(step1[5] - step1[26]);
  output[27] = WRAPLOW(step1[4] - step1[27]);
  output[28] = WRAPLOW(step1[3] - step1[28]);
  output[29] = WRAPLOW(step1[2] - step1[29]);
  output[30] = WRAPLOW(step1[1] - step1[30]);
  output[31] = WRAPLOW(step1[0] - step1[31]);
}
1179 
// Full 32x32 inverse DCT: a 1-D row pass (with an all-zero-row fast path)
// followed by a 1-D column pass; the rounded result (>> 6) is added into
// the 8-bit destination with clipping.
void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
                              int stride) {
  int i, j;
  tran_low_t out[32 * 32];
  tran_low_t *outptr = out;
  tran_low_t temp_in[32], temp_out[32];

  // Rows
  for (i = 0; i < 32; ++i) {
    // Accumulate in tran_low_t, not int16_t: when CONFIG_VP9_HIGHBITDEPTH
    // widens tran_low_t beyond 16 bits, OR-ing into an int16_t silently
    // drops the upper bits, so a row whose only non-zero coefficients have
    // all-zero low 16 bits would be misclassified as empty.
    tran_low_t zero_coeff = 0;
    for (j = 0; j < 32; ++j) zero_coeff |= input[j];

    if (zero_coeff)
      idct32_c(input, outptr);
    else
      memset(outptr, 0, sizeof(tran_low_t) * 32);
    input += 32;
    outptr += 32;
  }

  // Columns
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
    idct32_c(temp_in, temp_out);
    for (j = 0; j < 32; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
    }
  }
}
1210 
// 32x32 inverse DCT specialized for at most 135 non-zero coefficients,
// all confined to the upper-left 16x16 quadrant. Rows 16..31 of the
// intermediate keep their zero initialization, so only the top 16 rows
// need a row pass; every column still gets a full column pass.
void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  int row, col;
  tran_low_t intermediate[32 * 32] = { 0 };
  tran_low_t col_in[32], col_out[32];

  // Row pass: only the top 16 rows can hold non-zero coefficients.
  for (row = 0; row < 16; ++row) {
    idct32_c(input + row * 32, intermediate + row * 32);
  }

  // Column pass: transform, round by 1 << 6, and add into 'dest'.
  for (col = 0; col < 32; ++col) {
    for (row = 0; row < 32; ++row) {
      col_in[row] = intermediate[row * 32 + col];
    }
    idct32_c(col_in, col_out);
    for (row = 0; row < 32; ++row) {
      dest[row * stride + col] = clip_pixel_add(
          dest[row * stride + col], ROUND_POWER_OF_TWO(col_out[row], 6));
    }
  }
}
1236 
// 32x32 inverse DCT specialized for at most 34 non-zero coefficients,
// all confined to the upper-left 8x8 quadrant. Rows 8..31 of the
// intermediate keep their zero initialization, so only the top 8 rows
// need a row pass; every column still gets a full column pass.
void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  int row, col;
  tran_low_t intermediate[32 * 32] = { 0 };
  tran_low_t col_in[32], col_out[32];

  // Row pass: only the top 8 rows can hold non-zero coefficients.
  for (row = 0; row < 8; ++row) {
    idct32_c(input + row * 32, intermediate + row * 32);
  }

  // Column pass: transform, round by 1 << 6, and add into 'dest'.
  for (col = 0; col < 32; ++col) {
    for (row = 0; row < 32; ++row) {
      col_in[row] = intermediate[row * 32 + col];
    }
    idct32_c(col_in, col_out);
    for (row = 0; row < 32; ++row) {
      dest[row * stride + col] = clip_pixel_add(
          dest[row * stride + col], ROUND_POWER_OF_TWO(col_out[row], 6));
    }
  }
}
1262 
// DC-only 32x32 inverse DCT: reconstructs the block from input[0] alone.
// The DC term goes through the two cospi_16_64 rotations (row + column
// pass equivalents) and a 1 << 6 rounding, then the constant is added to
// all 1024 destination pixels with clipping.
void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int row, col;
  tran_high_t dc_value;
  tran_low_t out =
      WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));

  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
  dc_value = ROUND_POWER_OF_TWO(out, 6);

  for (row = 0; row < 32; ++row) {
    for (col = 0; col < 32; ++col) {
      dest[row * stride + col] =
          clip_pixel_add(dest[row * stride + col], dc_value);
    }
  }
}
1277 
1278 #if CONFIG_VP9_HIGHBITDEPTH
1279 
1280 // 12 signal input bits + 7 2D forward transform amplify bits + 5 1D inverse
1281 // transform amplify bits + 1 bit for contingency in rounding and quantizing
1282 #define HIGHBD_VALID_TXFM_MAGNITUDE_RANGE (1 << 25)
1283 
detect_invalid_highbd_input(const tran_low_t * input,int size)1284 static INLINE int detect_invalid_highbd_input(const tran_low_t *input,
1285                                               int size) {
1286   int i;
1287   for (i = 0; i < size; ++i)
1288     if (abs(input[i]) >= HIGHBD_VALID_TXFM_MAGNITUDE_RANGE) return 1;
1289   return 0;
1290 }
1291 
// High-bitdepth inverse 4x4 Walsh-Hadamard transform over all 16
// coefficients; the result is added into the 16-bit destination with
// clipping against bit depth 'bd'. Row pass first (into 'output'), then
// a column pass that writes directly to 'dest'.
void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest,
                                 int stride, int bd) {
  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
     0.5 shifts per pixel. */
  int i;
  tran_low_t output[16];  // intermediate after the row pass
  tran_high_t a1, b1, c1, d1, e1;
  const tran_low_t *ip = input;
  tran_low_t *op = output;

  // Pass 1: lifting-scheme butterfly applied to each row. Note the
  // permuted load order (a,c,d,b) matches the forward transform layout.
  for (i = 0; i < 4; i++) {
    a1 = ip[0] >> UNIT_QUANT_SHIFT;
    c1 = ip[1] >> UNIT_QUANT_SHIFT;
    d1 = ip[2] >> UNIT_QUANT_SHIFT;
    b1 = ip[3] >> UNIT_QUANT_SHIFT;
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;  // shared lifting term for the two middle outputs
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    op[0] = HIGHBD_WRAPLOW(a1, bd);
    op[1] = HIGHBD_WRAPLOW(b1, bd);
    op[2] = HIGHBD_WRAPLOW(c1, bd);
    op[3] = HIGHBD_WRAPLOW(d1, bd);
    ip += 4;
    op += 4;
  }

  // Pass 2: same butterfly down each column; results are added to the
  // destination pixels and clipped to [0, 2^bd - 1].
  ip = output;
  for (i = 0; i < 4; i++) {
    a1 = ip[4 * 0];
    c1 = ip[4 * 1];
    d1 = ip[4 * 2];
    b1 = ip[4 * 3];
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    dest[stride * 0] =
        highbd_clip_pixel_add(dest[stride * 0], HIGHBD_WRAPLOW(a1, bd), bd);
    dest[stride * 1] =
        highbd_clip_pixel_add(dest[stride * 1], HIGHBD_WRAPLOW(b1, bd), bd);
    dest[stride * 2] =
        highbd_clip_pixel_add(dest[stride * 2], HIGHBD_WRAPLOW(c1, bd), bd);
    dest[stride * 3] =
        highbd_clip_pixel_add(dest[stride * 3], HIGHBD_WRAPLOW(d1, bd), bd);

    ip++;
    dest++;
  }
}
1348 
// High-bitdepth DC-only inverse 4x4 Walsh-Hadamard transform: only
// input[0] is used. The DC term is split across one row, then each column
// repeats the split and the values are added to 'dest' with clipping.
void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest,
                                int stride, int bd) {
  int i;
  tran_high_t a1, e1;
  tran_low_t tmp[4];
  const tran_low_t *ip = input;
  tran_low_t *op = tmp;
  // NOTE(review): bd IS used below by HIGHBD_WRAPLOW and
  // highbd_clip_pixel_add; this cast presumably silences an unused-
  // parameter warning in configurations where HIGHBD_WRAPLOW ignores
  // bd -- confirm against inv_txfm.h.
  (void)bd;

  // Row pass on the DC term: a1 gets ceil(dc/2), e1 gets floor(dc/2).
  a1 = ip[0] >> UNIT_QUANT_SHIFT;
  e1 = a1 >> 1;
  a1 -= e1;
  op[0] = HIGHBD_WRAPLOW(a1, bd);
  op[1] = op[2] = op[3] = HIGHBD_WRAPLOW(e1, bd);

  // Column pass: repeat the split per column, add into the destination.
  ip = tmp;
  for (i = 0; i < 4; i++) {
    e1 = ip[0] >> 1;
    a1 = ip[0] - e1;
    dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd);
    dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], e1, bd);
    dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], e1, bd);
    dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], e1, bd);
    ip++;
    dest++;
  }
}
1376 
// High-bitdepth 4-point inverse ADST (1-D pass) using the sinpi_x_9
// fixed-point constants. Invalid (out-of-range) input and all-zero input
// both produce an all-zero output row.
void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
  tran_low_t x0 = input[0];
  tran_low_t x1 = input[1];
  tran_low_t x2 = input[2];
  tran_low_t x3 = input[3];
  (void)bd;

  // Guard against coefficients the forward transform could never emit;
  // zero the output rather than risk overflow downstream.
  if (detect_invalid_highbd_input(input, 4)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 4);
    return;
  }

  // All-zero fast path.
  if (!(x0 | x1 | x2 | x3)) {
    memset(output, 0, 4 * sizeof(*output));
    return;
  }

  s0 = (tran_high_t)sinpi_1_9 * x0;
  s1 = (tran_high_t)sinpi_2_9 * x0;
  s2 = (tran_high_t)sinpi_3_9 * x1;
  s3 = (tran_high_t)sinpi_4_9 * x2;
  s4 = (tran_high_t)sinpi_1_9 * x2;
  s5 = (tran_high_t)sinpi_2_9 * x3;
  s6 = (tran_high_t)sinpi_4_9 * x3;
  s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd);

  s0 = s0 + s3 + s5;
  s1 = s1 - s4 - s6;
  s3 = s2;  // preserve sinpi_3_9 * x1 before s2 is overwritten below
  s2 = sinpi_3_9 * s7;  // s7 is already tran_high_t, so no cast is needed

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s3), bd);
  output[1] = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s3), bd);
  output[2] = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
  output[3] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s1 - s3), bd);
}
1421 
// High-bitdepth 4-point inverse DCT (1-D pass): one rotation stage
// followed by one butterfly stage. Invalid input zeroes the output.
void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step[4];
  tran_high_t temp1, temp2;
  (void)bd;

  // Guard against coefficients outside the valid high-bitdepth range.
  if (detect_invalid_highbd_input(input, 4)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 4);
    return;
  }

  // stage 1: even part rotated by cospi_16_64, odd part by cospi_8/24_64.
  temp1 = (input[0] + input[2]) * (tran_high_t)cospi_16_64;
  temp2 = (input[0] - input[2]) * (tran_high_t)cospi_16_64;
  step[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 =
      input[1] * (tran_high_t)cospi_24_64 - input[3] * (tran_high_t)cospi_8_64;
  temp2 =
      input[1] * (tran_high_t)cospi_8_64 + input[3] * (tran_high_t)cospi_24_64;
  step[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  // stage 2: butterfly combining even and odd halves.
  output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd);
  output[1] = HIGHBD_WRAPLOW(step[1] + step[2], bd);
  output[2] = HIGHBD_WRAPLOW(step[1] - step[2], bd);
  output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd);
}
1453 
// High-bitdepth full 4x4 inverse DCT: row pass, column pass, then the
// rounded result (>> 4) is added into the 16-bit destination with
// clipping against bit depth 'bd'.
void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest,
                                 int stride, int bd) {
  int row, col;
  tran_low_t intermediate[4 * 4];
  tran_low_t col_in[4], col_out[4];

  // Row pass.
  for (row = 0; row < 4; ++row) {
    vpx_highbd_idct4_c(input + row * 4, intermediate + row * 4, bd);
  }

  // Column pass with final rounding and clipping into 'dest'.
  for (col = 0; col < 4; ++col) {
    for (row = 0; row < 4; ++row) {
      col_in[row] = intermediate[row * 4 + col];
    }
    vpx_highbd_idct4_c(col_in, col_out, bd);
    for (row = 0; row < 4; ++row) {
      dest[row * stride + col] = highbd_clip_pixel_add(
          dest[row * stride + col], ROUND_POWER_OF_TWO(col_out[row], 4), bd);
    }
  }
}
1478 
// High-bitdepth DC-only 4x4 inverse DCT: the DC coefficient is passed
// through two cospi_16_64 rotations and a 1 << 4 rounding, then the
// constant is added to all 16 destination pixels with clipping.
void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest,
                                int stride, int bd) {
  int row, col;
  tran_high_t dc_value;
  tran_low_t out = HIGHBD_WRAPLOW(
      dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);

  out =
      HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd);
  dc_value = ROUND_POWER_OF_TWO(out, 4);

  for (row = 0; row < 4; ++row) {
    for (col = 0; col < 4; ++col) {
      dest[row * stride + col] =
          highbd_clip_pixel_add(dest[row * stride + col], dc_value, bd);
    }
  }
}
1498 
// High-bitdepth 8-point inverse ADST (1-D pass). Inputs are loaded in the
// ADST-specific permuted order, run through three rotation/butterfly
// stages, and written out with alternating sign flips. Invalid or
// all-zero input yields an all-zero output row.
void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
  tran_low_t x0 = input[7];
  tran_low_t x1 = input[0];
  tran_low_t x2 = input[5];
  tran_low_t x3 = input[2];
  tran_low_t x4 = input[3];
  tran_low_t x5 = input[4];
  tran_low_t x6 = input[1];
  tran_low_t x7 = input[6];
  (void)bd;

  // Guard against coefficients outside the valid high-bitdepth range.
  if (detect_invalid_highbd_input(input, 8)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 8);
    return;
  }

  // All-zero fast path.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    memset(output, 0, 8 * sizeof(*output));
    return;
  }

  // stage 1: pairwise rotations by cospi_{2,10,18,26}/{30,22,14,6}.
  s0 = (tran_high_t)cospi_2_64 * x0 + (tran_high_t)cospi_30_64 * x1;
  s1 = (tran_high_t)cospi_30_64 * x0 - (tran_high_t)cospi_2_64 * x1;
  s2 = (tran_high_t)cospi_10_64 * x2 + (tran_high_t)cospi_22_64 * x3;
  s3 = (tran_high_t)cospi_22_64 * x2 - (tran_high_t)cospi_10_64 * x3;
  s4 = (tran_high_t)cospi_18_64 * x4 + (tran_high_t)cospi_14_64 * x5;
  s5 = (tran_high_t)cospi_14_64 * x4 - (tran_high_t)cospi_18_64 * x5;
  s6 = (tran_high_t)cospi_26_64 * x6 + (tran_high_t)cospi_6_64 * x7;
  s7 = (tran_high_t)cospi_6_64 * x6 - (tran_high_t)cospi_26_64 * x7;

  x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s4), bd);
  x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s5), bd);
  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s6), bd);
  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s7), bd);
  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s4), bd);
  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s5), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s7), bd);

  // stage 2: top half passes through; bottom half rotated by cospi_8/24.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = (tran_high_t)cospi_8_64 * x4 + (tran_high_t)cospi_24_64 * x5;
  s5 = (tran_high_t)cospi_24_64 * x4 - (tran_high_t)cospi_8_64 * x5;
  s6 = (tran_high_t)(-cospi_24_64) * x6 + (tran_high_t)cospi_8_64 * x7;
  s7 = (tran_high_t)cospi_8_64 * x6 + (tran_high_t)cospi_24_64 * x7;

  x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
  x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
  x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
  x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);

  // stage 3: final cospi_16_64 rotations on the middle pairs.
  s2 = (tran_high_t)cospi_16_64 * (x2 + x3);
  s3 = (tran_high_t)cospi_16_64 * (x2 - x3);
  s6 = (tran_high_t)cospi_16_64 * (x6 + x7);
  s7 = (tran_high_t)cospi_16_64 * (x6 - x7);

  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);

  // Output with the ADST's alternating negations on odd positions.
  output[0] = HIGHBD_WRAPLOW(x0, bd);
  output[1] = HIGHBD_WRAPLOW(-x4, bd);
  output[2] = HIGHBD_WRAPLOW(x6, bd);
  output[3] = HIGHBD_WRAPLOW(-x2, bd);
  output[4] = HIGHBD_WRAPLOW(x3, bd);
  output[5] = HIGHBD_WRAPLOW(-x7, bd);
  output[6] = HIGHBD_WRAPLOW(x5, bd);
  output[7] = HIGHBD_WRAPLOW(-x1, bd);
}
1582 
// High-bitdepth 8-point inverse DCT (1-D pass). The even half is delegated
// to vpx_highbd_idct4_c; the odd half goes through its own rotation and
// butterfly stages before the final combine. Invalid input zeroes the
// output row.
void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step1[8], step2[8];
  tran_high_t temp1, temp2;

  // Guard against coefficients outside the valid high-bitdepth range.
  if (detect_invalid_highbd_input(input, 8)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 8);
    return;
  }

  // stage 1: gather even inputs into step1[0..3] (in idct4 order) and
  // rotate the odd inputs into step1[4..7].
  step1[0] = input[0];
  step1[2] = input[4];
  step1[1] = input[2];
  step1[3] = input[6];
  temp1 =
      input[1] * (tran_high_t)cospi_28_64 - input[7] * (tran_high_t)cospi_4_64;
  temp2 =
      input[1] * (tran_high_t)cospi_4_64 + input[7] * (tran_high_t)cospi_28_64;
  step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 =
      input[5] * (tran_high_t)cospi_12_64 - input[3] * (tran_high_t)cospi_20_64;
  temp2 =
      input[5] * (tran_high_t)cospi_20_64 + input[3] * (tran_high_t)cospi_12_64;
  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  // stage 2 & stage 3 - even half: 4-point IDCT in place on step1[0..3].
  vpx_highbd_idct4_c(step1, step1, bd);

  // stage 2 - odd half
  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);

  // stage 3 - odd half: cospi_16_64 rotation on the middle pair.
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * (tran_high_t)cospi_16_64;
  temp2 = (step2[5] + step2[6]) * (tran_high_t)cospi_16_64;
  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step1[7] = step2[7];

  // stage 4: combine even and odd halves.
  output[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
  output[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
  output[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
  output[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
  output[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
  output[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
  output[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
  output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
}
1640 
// Full (up to 64 non-zero coefficients) 8x8 high-bitdepth inverse DCT.
// The 2-D transform is separable: a 1-D idct over each row, then over
// each column of the intermediate, with the result rounded (>> 5),
// clipped to |bd| bits and added into |dest|.
void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest,
                                 int stride, int bd) {
  tran_low_t intermediate[8 * 8];
  tran_low_t col_in[8], col_out[8];
  int row, col;

  // Pass 1: row transforms.
  for (row = 0; row < 8; ++row) {
    vpx_highbd_idct8_c(input + 8 * row, intermediate + 8 * row, bd);
  }

  // Pass 2: column transforms, then round, clip and accumulate into dest.
  for (col = 0; col < 8; ++col) {
    for (row = 0; row < 8; ++row) col_in[row] = intermediate[row * 8 + col];
    vpx_highbd_idct8_c(col_in, col_out, bd);
    for (row = 0; row < 8; ++row) {
      dest[row * stride + col] = highbd_clip_pixel_add(
          dest[row * stride + col], ROUND_POWER_OF_TWO(col_out[row], 5), bd);
    }
  }
}
1665 
// 8x8 high-bitdepth inverse DCT for blocks whose non-zero coefficients
// all lie in the first 4 rows: only those rows get a row transform and
// the rest of the intermediate buffer stays zero-initialized. Column
// pass and output accumulation are identical to the full 64-coef path.
void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest,
                                 int stride, int bd) {
  tran_low_t intermediate[8 * 8] = { 0 };
  tran_low_t col_in[8], col_out[8];
  int row, col;

  // Pass 1: row transforms on the 4 rows that can hold coefficients.
  for (row = 0; row < 4; ++row) {
    vpx_highbd_idct8_c(input + 8 * row, intermediate + 8 * row, bd);
  }

  // Pass 2: full column transforms, then round (>> 5), clip and add.
  for (col = 0; col < 8; ++col) {
    for (row = 0; row < 8; ++row) col_in[row] = intermediate[row * 8 + col];
    vpx_highbd_idct8_c(col_in, col_out, bd);
    for (row = 0; row < 8; ++row) {
      dest[row * stride + col] = highbd_clip_pixel_add(
          dest[row * stride + col], ROUND_POWER_OF_TWO(col_out[row], 5), bd);
    }
  }
}
1691 
// DC-only 8x8 high-bitdepth inverse transform: the single coefficient
// reduces to one constant, rounded by 1/32 (>> 5), that is clip-added to
// every pixel of the 8x8 destination block.
void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest,
                                int stride, int bd) {
  int r, c;
  tran_high_t dc;
  // Two successive cospi_16_64 rotations reproduce the row+column DC
  // scaling of the separable transform.
  tran_low_t val = HIGHBD_WRAPLOW(
      dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
  val = HIGHBD_WRAPLOW(dct_const_round_shift(val * (tran_high_t)cospi_16_64),
                       bd);
  dc = ROUND_POWER_OF_TWO(val, 5);
  for (r = 0; r < 8; ++r) {
    for (c = 0; c < 8; ++c) {
      dest[c] = highbd_clip_pixel_add(dest[c], dc, bd);
    }
    dest += stride;
  }
}
1707 
// High-bitdepth 16-point inverse ADST (1-D). Reads 16 coefficients from
// |input| in the interleaved order below and writes 16 samples to
// |output|. Implemented as a 4-stage rotation lattice; each stage's
// products are narrowed with dct_const_round_shift and wrapped by
// HIGHBD_WRAPLOW. Statement order is significant: s* and x* are reused
// across stages.
void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;
  // Input permutation: even-position inputs read from the tail downward,
  // odd-position inputs from the head upward.
  tran_low_t x0 = input[15];
  tran_low_t x1 = input[0];
  tran_low_t x2 = input[13];
  tran_low_t x3 = input[2];
  tran_low_t x4 = input[11];
  tran_low_t x5 = input[4];
  tran_low_t x6 = input[9];
  tran_low_t x7 = input[6];
  tran_low_t x8 = input[7];
  tran_low_t x9 = input[8];
  tran_low_t x10 = input[5];
  tran_low_t x11 = input[10];
  tran_low_t x12 = input[3];
  tran_low_t x13 = input[12];
  tran_low_t x14 = input[1];
  tran_low_t x15 = input[14];
  // NOTE(review): bd is referenced via HIGHBD_WRAPLOW; this cast silences
  // the unused-parameter warning in builds where that macro ignores bd.
  (void)bd;

  // Out-of-range coefficients: emit all zeros rather than propagating
  // invalid values through the lattice.
  if (detect_invalid_highbd_input(input, 16)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 16);
    return;
  }

  // All-zero input short-circuits to an all-zero output.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
        x13 | x14 | x15)) {
    memset(output, 0, 16 * sizeof(*output));
    return;
  }

  // stage 1: rotate the eight (x even, x odd) pairs by odd cospi angles.
  s0 = x0 * (tran_high_t)cospi_1_64 + x1 * (tran_high_t)cospi_31_64;
  s1 = x0 * (tran_high_t)cospi_31_64 - x1 * (tran_high_t)cospi_1_64;
  s2 = x2 * (tran_high_t)cospi_5_64 + x3 * (tran_high_t)cospi_27_64;
  s3 = x2 * (tran_high_t)cospi_27_64 - x3 * (tran_high_t)cospi_5_64;
  s4 = x4 * (tran_high_t)cospi_9_64 + x5 * (tran_high_t)cospi_23_64;
  s5 = x4 * (tran_high_t)cospi_23_64 - x5 * (tran_high_t)cospi_9_64;
  s6 = x6 * (tran_high_t)cospi_13_64 + x7 * (tran_high_t)cospi_19_64;
  s7 = x6 * (tran_high_t)cospi_19_64 - x7 * (tran_high_t)cospi_13_64;
  s8 = x8 * (tran_high_t)cospi_17_64 + x9 * (tran_high_t)cospi_15_64;
  s9 = x8 * (tran_high_t)cospi_15_64 - x9 * (tran_high_t)cospi_17_64;
  s10 = x10 * (tran_high_t)cospi_21_64 + x11 * (tran_high_t)cospi_11_64;
  s11 = x10 * (tran_high_t)cospi_11_64 - x11 * (tran_high_t)cospi_21_64;
  s12 = x12 * (tran_high_t)cospi_25_64 + x13 * (tran_high_t)cospi_7_64;
  s13 = x12 * (tran_high_t)cospi_7_64 - x13 * (tran_high_t)cospi_25_64;
  s14 = x14 * (tran_high_t)cospi_29_64 + x15 * (tran_high_t)cospi_3_64;
  s15 = x14 * (tran_high_t)cospi_3_64 - x15 * (tran_high_t)cospi_29_64;

  x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s8), bd);
  x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s9), bd);
  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s10), bd);
  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s11), bd);
  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s12), bd);
  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s13), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 + s14), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 + s15), bd);
  x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s8), bd);
  x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s9), bd);
  x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s10), bd);
  x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s11), bd);
  x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s12), bd);
  x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s13), bd);
  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 - s14), bd);
  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 - s15), bd);

  // stage 2: first half passes through; second half rotates by
  // cospi_4/cospi_28 and cospi_20/cospi_12 pairs.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 = x8 * (tran_high_t)cospi_4_64 + x9 * (tran_high_t)cospi_28_64;
  s9 = x8 * (tran_high_t)cospi_28_64 - x9 * (tran_high_t)cospi_4_64;
  s10 = x10 * (tran_high_t)cospi_20_64 + x11 * (tran_high_t)cospi_12_64;
  s11 = x10 * (tran_high_t)cospi_12_64 - x11 * (tran_high_t)cospi_20_64;
  s12 = -x12 * (tran_high_t)cospi_28_64 + x13 * (tran_high_t)cospi_4_64;
  s13 = x12 * (tran_high_t)cospi_4_64 + x13 * (tran_high_t)cospi_28_64;
  s14 = -x14 * (tran_high_t)cospi_12_64 + x15 * (tran_high_t)cospi_20_64;
  s15 = x14 * (tran_high_t)cospi_20_64 + x15 * (tran_high_t)cospi_12_64;

  // Pass-through sums need no round-shift; rotated terms do.
  x0 = HIGHBD_WRAPLOW(s0 + s4, bd);
  x1 = HIGHBD_WRAPLOW(s1 + s5, bd);
  x2 = HIGHBD_WRAPLOW(s2 + s6, bd);
  x3 = HIGHBD_WRAPLOW(s3 + s7, bd);
  x4 = HIGHBD_WRAPLOW(s0 - s4, bd);
  x5 = HIGHBD_WRAPLOW(s1 - s5, bd);
  x6 = HIGHBD_WRAPLOW(s2 - s6, bd);
  x7 = HIGHBD_WRAPLOW(s3 - s7, bd);
  x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 + s12), bd);
  x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 + s13), bd);
  x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 + s14), bd);
  x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 + s15), bd);
  x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 - s12), bd);
  x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 - s13), bd);
  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 - s14), bd);
  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 - s15), bd);

  // stage 3: rotate quads 4-7 and 12-15 by cospi_8/cospi_24 pairs.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * (tran_high_t)cospi_8_64 + x5 * (tran_high_t)cospi_24_64;
  s5 = x4 * (tran_high_t)cospi_24_64 - x5 * (tran_high_t)cospi_8_64;
  s6 = -x6 * (tran_high_t)cospi_24_64 + x7 * (tran_high_t)cospi_8_64;
  s7 = x6 * (tran_high_t)cospi_8_64 + x7 * (tran_high_t)cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * (tran_high_t)cospi_8_64 + x13 * (tran_high_t)cospi_24_64;
  s13 = x12 * (tran_high_t)cospi_24_64 - x13 * (tran_high_t)cospi_8_64;
  s14 = -x14 * (tran_high_t)cospi_24_64 + x15 * (tran_high_t)cospi_8_64;
  s15 = x14 * (tran_high_t)cospi_8_64 + x15 * (tran_high_t)cospi_24_64;

  x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
  x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
  x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
  x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);
  x8 = HIGHBD_WRAPLOW(s8 + s10, bd);
  x9 = HIGHBD_WRAPLOW(s9 + s11, bd);
  x10 = HIGHBD_WRAPLOW(s8 - s10, bd);
  x11 = HIGHBD_WRAPLOW(s9 - s11, bd);
  x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 + s14), bd);
  x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 + s15), bd);
  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 - s14), bd);
  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 - s15), bd);

  // stage 4: final +/-45-degree (cospi_16) rotations on four pairs.
  s2 = (tran_high_t)(-cospi_16_64) * (x2 + x3);
  s3 = (tran_high_t)cospi_16_64 * (x2 - x3);
  s6 = (tran_high_t)cospi_16_64 * (x6 + x7);
  s7 = (tran_high_t)cospi_16_64 * (-x6 + x7);
  s10 = (tran_high_t)cospi_16_64 * (x10 + x11);
  s11 = (tran_high_t)cospi_16_64 * (-x10 + x11);
  s14 = (tran_high_t)(-cospi_16_64) * (x14 + x15);
  s15 = (tran_high_t)cospi_16_64 * (x14 - x15);

  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);
  x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10), bd);
  x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11), bd);
  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s14), bd);
  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s15), bd);

  // Output permutation with sign flips on odd positions 1, 3, 13, 15.
  output[0] = HIGHBD_WRAPLOW(x0, bd);
  output[1] = HIGHBD_WRAPLOW(-x8, bd);
  output[2] = HIGHBD_WRAPLOW(x12, bd);
  output[3] = HIGHBD_WRAPLOW(-x4, bd);
  output[4] = HIGHBD_WRAPLOW(x6, bd);
  output[5] = HIGHBD_WRAPLOW(x14, bd);
  output[6] = HIGHBD_WRAPLOW(x10, bd);
  output[7] = HIGHBD_WRAPLOW(x2, bd);
  output[8] = HIGHBD_WRAPLOW(x3, bd);
  output[9] = HIGHBD_WRAPLOW(x11, bd);
  output[10] = HIGHBD_WRAPLOW(x15, bd);
  output[11] = HIGHBD_WRAPLOW(x7, bd);
  output[12] = HIGHBD_WRAPLOW(x5, bd);
  output[13] = HIGHBD_WRAPLOW(-x13, bd);
  output[14] = HIGHBD_WRAPLOW(x9, bd);
  output[15] = HIGHBD_WRAPLOW(-x1, bd);
}
1884 
// High-bitdepth 16-point inverse DCT (1-D). Reads 16 coefficients from
// |input| and writes 16 samples to |output| via a 7-stage butterfly
// network. Intermediates ping-pong between step1[] and step2[]; each
// rotation is narrowed with dct_const_round_shift and wrapped by
// HIGHBD_WRAPLOW.
void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step1[16], step2[16];
  tran_high_t temp1, temp2;
  // NOTE(review): bd is referenced via HIGHBD_WRAPLOW; this cast silences
  // the unused-parameter warning in builds where that macro ignores bd.
  (void)bd;

  // Out-of-range coefficients: emit all zeros rather than propagating
  // invalid values through the butterflies.
  if (detect_invalid_highbd_input(input, 16)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 16);
    return;
  }

  // stage 1
  // Bit-reversed-style input gather; the N/2 spelling mirrors the
  // 32-point ordering used by the idct32 variant of this network.
  step1[0] = input[0 / 2];
  step1[1] = input[16 / 2];
  step1[2] = input[8 / 2];
  step1[3] = input[24 / 2];
  step1[4] = input[4 / 2];
  step1[5] = input[20 / 2];
  step1[6] = input[12 / 2];
  step1[7] = input[28 / 2];
  step1[8] = input[2 / 2];
  step1[9] = input[18 / 2];
  step1[10] = input[10 / 2];
  step1[11] = input[26 / 2];
  step1[12] = input[6 / 2];
  step1[13] = input[22 / 2];
  step1[14] = input[14 / 2];
  step1[15] = input[30 / 2];

  // stage 2: even half passes through; odd half (8..15) rotates by the
  // odd cospi angle pairs.
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 =
      step1[8] * (tran_high_t)cospi_30_64 - step1[15] * (tran_high_t)cospi_2_64;
  temp2 =
      step1[8] * (tran_high_t)cospi_2_64 + step1[15] * (tran_high_t)cospi_30_64;
  step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  temp1 = step1[9] * (tran_high_t)cospi_14_64 -
          step1[14] * (tran_high_t)cospi_18_64;
  temp2 = step1[9] * (tran_high_t)cospi_18_64 +
          step1[14] * (tran_high_t)cospi_14_64;
  step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  temp1 = step1[10] * (tran_high_t)cospi_22_64 -
          step1[13] * (tran_high_t)cospi_10_64;
  temp2 = step1[10] * (tran_high_t)cospi_10_64 +
          step1[13] * (tran_high_t)cospi_22_64;
  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  temp1 = step1[11] * (tran_high_t)cospi_6_64 -
          step1[12] * (tran_high_t)cospi_26_64;
  temp2 = step1[11] * (tran_high_t)cospi_26_64 +
          step1[12] * (tran_high_t)cospi_6_64;
  step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  // stage 3: rotate 4-7 quadrant; butterfly the 8-15 half into pairs.
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 =
      step2[4] * (tran_high_t)cospi_28_64 - step2[7] * (tran_high_t)cospi_4_64;
  temp2 =
      step2[4] * (tran_high_t)cospi_4_64 + step2[7] * (tran_high_t)cospi_28_64;
  step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 =
      step2[5] * (tran_high_t)cospi_12_64 - step2[6] * (tran_high_t)cospi_20_64;
  temp2 =
      step2[5] * (tran_high_t)cospi_20_64 + step2[6] * (tran_high_t)cospi_12_64;
  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
  step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
  step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
  step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
  step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
  step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
  step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
  step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);

  // stage 4: DC/Nyquist rotation on (0,1), cospi_24/cospi_8 on (2,3),
  // butterflies on 4-7, corrective rotations on 9/14 and 10/13.
  temp1 = (step1[0] + step1[1]) * (tran_high_t)cospi_16_64;
  temp2 = (step1[0] - step1[1]) * (tran_high_t)cospi_16_64;
  step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 =
      step1[2] * (tran_high_t)cospi_24_64 - step1[3] * (tran_high_t)cospi_8_64;
  temp2 =
      step1[2] * (tran_high_t)cospi_8_64 + step1[3] * (tran_high_t)cospi_24_64;
  step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * (tran_high_t)cospi_8_64 +
          step1[14] * (tran_high_t)cospi_24_64;
  temp2 =
      step1[9] * (tran_high_t)cospi_24_64 + step1[14] * (tran_high_t)cospi_8_64;
  step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = -step1[10] * (tran_high_t)cospi_24_64 -
          step1[13] * (tran_high_t)cospi_8_64;
  temp2 = -step1[10] * (tran_high_t)cospi_8_64 +
          step1[13] * (tran_high_t)cospi_24_64;
  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5
  step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
  step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
  step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
  step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * (tran_high_t)cospi_16_64;
  temp2 = (step2[5] + step2[6]) * (tran_high_t)cospi_16_64;
  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step1[7] = step2[7];

  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
  step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
  step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
  step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
  step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
  step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
  step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
  step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);

  // stage 6: finish the even half's butterfly; 45-degree rotations on
  // the 10/13 and 11/12 pairs.
  step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
  step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
  step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
  step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
  step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
  step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
  step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
  step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * (tran_high_t)cospi_16_64;
  temp2 = (step1[10] + step1[13]) * (tran_high_t)cospi_16_64;
  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = (-step1[11] + step1[12]) * (tran_high_t)cospi_16_64;
  temp2 = (step1[11] + step1[12]) * (tran_high_t)cospi_16_64;
  step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7: final butterfly — sums in the first half, mirrored
  // differences in the second half.
  output[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
  output[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
  output[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
  output[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
  output[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
  output[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
  output[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
  output[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
  output[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
  output[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
  output[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
  output[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
  output[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
  output[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
  output[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
  output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
}
2076 
// Full (up to 256 non-zero coefficients) 16x16 high-bitdepth inverse DCT.
// Separable 2-D transform: 1-D idct on each row, then on each column of
// the intermediate, with results rounded (>> 6), clipped to |bd| bits and
// added into |dest|.
void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest,
                                    int stride, int bd) {
  tran_low_t intermediate[16 * 16];
  tran_low_t col_in[16], col_out[16];
  int row, col;

  // Pass 1: row transforms.
  for (row = 0; row < 16; ++row) {
    vpx_highbd_idct16_c(input + 16 * row, intermediate + 16 * row, bd);
  }

  // Pass 2: column transforms, then round, clip and accumulate into dest.
  for (col = 0; col < 16; ++col) {
    for (row = 0; row < 16; ++row) col_in[row] = intermediate[row * 16 + col];
    vpx_highbd_idct16_c(col_in, col_out, bd);
    for (row = 0; row < 16; ++row) {
      dest[row * stride + col] = highbd_clip_pixel_add(
          dest[row * stride + col], ROUND_POWER_OF_TWO(col_out[row], 6), bd);
    }
  }
}
2101 
// 16x16 high-bitdepth inverse DCT for blocks whose non-zero coefficients
// all lie in the upper-left 8x8 area: only the first 8 rows get a row
// transform; the remaining rows of the intermediate stay zero.
void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest,
                                   int stride, int bd) {
  tran_low_t intermediate[16 * 16] = { 0 };
  tran_low_t col_in[16], col_out[16];
  int row, col;

  // Pass 1: row transforms on the 8 rows that can hold coefficients.
  for (row = 0; row < 8; ++row) {
    vpx_highbd_idct16_c(input + 16 * row, intermediate + 16 * row, bd);
  }

  // Pass 2: full column transforms, then round (>> 6), clip and add.
  for (col = 0; col < 16; ++col) {
    for (row = 0; row < 16; ++row) col_in[row] = intermediate[row * 16 + col];
    vpx_highbd_idct16_c(col_in, col_out, bd);
    for (row = 0; row < 16; ++row) {
      dest[row * stride + col] = highbd_clip_pixel_add(
          dest[row * stride + col], ROUND_POWER_OF_TWO(col_out[row], 6), bd);
    }
  }
}
2129 
// 16x16 high-bitdepth inverse DCT for blocks whose non-zero coefficients
// all lie in the upper-left 4x4 area: only the first 4 rows get a row
// transform; the remaining rows of the intermediate stay zero.
void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest,
                                   int stride, int bd) {
  tran_low_t intermediate[16 * 16] = { 0 };
  tran_low_t col_in[16], col_out[16];
  int row, col;

  // Pass 1: row transforms on the 4 rows that can hold coefficients.
  for (row = 0; row < 4; ++row) {
    vpx_highbd_idct16_c(input + 16 * row, intermediate + 16 * row, bd);
  }

  // Pass 2: full column transforms, then round (>> 6), clip and add.
  for (col = 0; col < 16; ++col) {
    for (row = 0; row < 16; ++row) col_in[row] = intermediate[row * 16 + col];
    vpx_highbd_idct16_c(col_in, col_out, bd);
    for (row = 0; row < 16; ++row) {
      dest[row * stride + col] = highbd_clip_pixel_add(
          dest[row * stride + col], ROUND_POWER_OF_TWO(col_out[row], 6), bd);
    }
  }
}
2155 
// DC-only 16x16 high-bitdepth inverse transform: the single coefficient
// reduces to one constant, rounded by 1/64 (>> 6), that is clip-added to
// every pixel of the 16x16 destination block.
void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest,
                                  int stride, int bd) {
  int r, c;
  tran_high_t dc;
  // Two successive cospi_16_64 rotations reproduce the row+column DC
  // scaling of the separable transform.
  tran_low_t val = HIGHBD_WRAPLOW(
      dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
  val = HIGHBD_WRAPLOW(dct_const_round_shift(val * (tran_high_t)cospi_16_64),
                       bd);
  dc = ROUND_POWER_OF_TWO(val, 6);
  for (r = 0; r < 16; ++r) {
    for (c = 0; c < 16; ++c) {
      dest[c] = highbd_clip_pixel_add(dest[c], dc, bd);
    }
    dest += stride;
  }
}
2171 
highbd_idct32_c(const tran_low_t * input,tran_low_t * output,int bd)2172 static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
2173                             int bd) {
2174   tran_low_t step1[32], step2[32];
2175   tran_high_t temp1, temp2;
2176   (void)bd;
2177 
2178   if (detect_invalid_highbd_input(input, 32)) {
2179 #if CONFIG_COEFFICIENT_RANGE_CHECKING
2180     assert(0 && "invalid highbd txfm input");
2181 #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
2182     memset(output, 0, sizeof(*output) * 32);
2183     return;
2184   }
2185 
2186   // stage 1
2187   step1[0] = input[0];
2188   step1[1] = input[16];
2189   step1[2] = input[8];
2190   step1[3] = input[24];
2191   step1[4] = input[4];
2192   step1[5] = input[20];
2193   step1[6] = input[12];
2194   step1[7] = input[28];
2195   step1[8] = input[2];
2196   step1[9] = input[18];
2197   step1[10] = input[10];
2198   step1[11] = input[26];
2199   step1[12] = input[6];
2200   step1[13] = input[22];
2201   step1[14] = input[14];
2202   step1[15] = input[30];
2203 
2204   temp1 =
2205       input[1] * (tran_high_t)cospi_31_64 - input[31] * (tran_high_t)cospi_1_64;
2206   temp2 =
2207       input[1] * (tran_high_t)cospi_1_64 + input[31] * (tran_high_t)cospi_31_64;
2208   step1[16] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2209   step1[31] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2210 
2211   temp1 = input[17] * (tran_high_t)cospi_15_64 -
2212           input[15] * (tran_high_t)cospi_17_64;
2213   temp2 = input[17] * (tran_high_t)cospi_17_64 +
2214           input[15] * (tran_high_t)cospi_15_64;
2215   step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2216   step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2217 
2218   temp1 =
2219       input[9] * (tran_high_t)cospi_23_64 - input[23] * (tran_high_t)cospi_9_64;
2220   temp2 =
2221       input[9] * (tran_high_t)cospi_9_64 + input[23] * (tran_high_t)cospi_23_64;
2222   step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2223   step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2224 
2225   temp1 =
2226       input[25] * (tran_high_t)cospi_7_64 - input[7] * (tran_high_t)cospi_25_64;
2227   temp2 =
2228       input[25] * (tran_high_t)cospi_25_64 + input[7] * (tran_high_t)cospi_7_64;
2229   step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2230   step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2231 
2232   temp1 =
2233       input[5] * (tran_high_t)cospi_27_64 - input[27] * (tran_high_t)cospi_5_64;
2234   temp2 =
2235       input[5] * (tran_high_t)cospi_5_64 + input[27] * (tran_high_t)cospi_27_64;
2236   step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2237   step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2238 
2239   temp1 = input[21] * (tran_high_t)cospi_11_64 -
2240           input[11] * (tran_high_t)cospi_21_64;
2241   temp2 = input[21] * (tran_high_t)cospi_21_64 +
2242           input[11] * (tran_high_t)cospi_11_64;
2243   step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2244   step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2245 
2246   temp1 = input[13] * (tran_high_t)cospi_19_64 -
2247           input[19] * (tran_high_t)cospi_13_64;
2248   temp2 = input[13] * (tran_high_t)cospi_13_64 +
2249           input[19] * (tran_high_t)cospi_19_64;
2250   step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2251   step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2252 
2253   temp1 =
2254       input[29] * (tran_high_t)cospi_3_64 - input[3] * (tran_high_t)cospi_29_64;
2255   temp2 =
2256       input[29] * (tran_high_t)cospi_29_64 + input[3] * (tran_high_t)cospi_3_64;
2257   step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2258   step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2259 
2260   // stage 2
2261   step2[0] = step1[0];
2262   step2[1] = step1[1];
2263   step2[2] = step1[2];
2264   step2[3] = step1[3];
2265   step2[4] = step1[4];
2266   step2[5] = step1[5];
2267   step2[6] = step1[6];
2268   step2[7] = step1[7];
2269 
2270   temp1 =
2271       step1[8] * (tran_high_t)cospi_30_64 - step1[15] * (tran_high_t)cospi_2_64;
2272   temp2 =
2273       step1[8] * (tran_high_t)cospi_2_64 + step1[15] * (tran_high_t)cospi_30_64;
2274   step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2275   step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2276 
2277   temp1 = step1[9] * (tran_high_t)cospi_14_64 -
2278           step1[14] * (tran_high_t)cospi_18_64;
2279   temp2 = step1[9] * (tran_high_t)cospi_18_64 +
2280           step1[14] * (tran_high_t)cospi_14_64;
2281   step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2282   step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2283 
2284   temp1 = step1[10] * (tran_high_t)cospi_22_64 -
2285           step1[13] * (tran_high_t)cospi_10_64;
2286   temp2 = step1[10] * (tran_high_t)cospi_10_64 +
2287           step1[13] * (tran_high_t)cospi_22_64;
2288   step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2289   step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2290 
2291   temp1 = step1[11] * (tran_high_t)cospi_6_64 -
2292           step1[12] * (tran_high_t)cospi_26_64;
2293   temp2 = step1[11] * (tran_high_t)cospi_26_64 +
2294           step1[12] * (tran_high_t)cospi_6_64;
2295   step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2296   step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2297 
2298   step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd);
2299   step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd);
2300   step2[18] = HIGHBD_WRAPLOW(-step1[18] + step1[19], bd);
2301   step2[19] = HIGHBD_WRAPLOW(step1[18] + step1[19], bd);
2302   step2[20] = HIGHBD_WRAPLOW(step1[20] + step1[21], bd);
2303   step2[21] = HIGHBD_WRAPLOW(step1[20] - step1[21], bd);
2304   step2[22] = HIGHBD_WRAPLOW(-step1[22] + step1[23], bd);
2305   step2[23] = HIGHBD_WRAPLOW(step1[22] + step1[23], bd);
2306   step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[25], bd);
2307   step2[25] = HIGHBD_WRAPLOW(step1[24] - step1[25], bd);
2308   step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[27], bd);
2309   step2[27] = HIGHBD_WRAPLOW(step1[26] + step1[27], bd);
2310   step2[28] = HIGHBD_WRAPLOW(step1[28] + step1[29], bd);
2311   step2[29] = HIGHBD_WRAPLOW(step1[28] - step1[29], bd);
2312   step2[30] = HIGHBD_WRAPLOW(-step1[30] + step1[31], bd);
2313   step2[31] = HIGHBD_WRAPLOW(step1[30] + step1[31], bd);
2314 
2315   // stage 3
2316   step1[0] = step2[0];
2317   step1[1] = step2[1];
2318   step1[2] = step2[2];
2319   step1[3] = step2[3];
2320 
2321   temp1 =
2322       step2[4] * (tran_high_t)cospi_28_64 - step2[7] * (tran_high_t)cospi_4_64;
2323   temp2 =
2324       step2[4] * (tran_high_t)cospi_4_64 + step2[7] * (tran_high_t)cospi_28_64;
2325   step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2326   step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2327   temp1 =
2328       step2[5] * (tran_high_t)cospi_12_64 - step2[6] * (tran_high_t)cospi_20_64;
2329   temp2 =
2330       step2[5] * (tran_high_t)cospi_20_64 + step2[6] * (tran_high_t)cospi_12_64;
2331   step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2332   step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2333 
2334   step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
2335   step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
2336   step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
2337   step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
2338   step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
2339   step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
2340   step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
2341   step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
2342 
2343   step1[16] = step2[16];
2344   step1[31] = step2[31];
2345   temp1 = -step2[17] * (tran_high_t)cospi_4_64 +
2346           step2[30] * (tran_high_t)cospi_28_64;
2347   temp2 = step2[17] * (tran_high_t)cospi_28_64 +
2348           step2[30] * (tran_high_t)cospi_4_64;
2349   step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2350   step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2351   temp1 = -step2[18] * (tran_high_t)cospi_28_64 -
2352           step2[29] * (tran_high_t)cospi_4_64;
2353   temp2 = -step2[18] * (tran_high_t)cospi_4_64 +
2354           step2[29] * (tran_high_t)cospi_28_64;
2355   step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2356   step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2357   step1[19] = step2[19];
2358   step1[20] = step2[20];
2359   temp1 = -step2[21] * (tran_high_t)cospi_20_64 +
2360           step2[26] * (tran_high_t)cospi_12_64;
2361   temp2 = step2[21] * (tran_high_t)cospi_12_64 +
2362           step2[26] * (tran_high_t)cospi_20_64;
2363   step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2364   step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2365   temp1 = -step2[22] * (tran_high_t)cospi_12_64 -
2366           step2[25] * (tran_high_t)cospi_20_64;
2367   temp2 = -step2[22] * (tran_high_t)cospi_20_64 +
2368           step2[25] * (tran_high_t)cospi_12_64;
2369   step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2370   step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2371   step1[23] = step2[23];
2372   step1[24] = step2[24];
2373   step1[27] = step2[27];
2374   step1[28] = step2[28];
2375 
2376   // stage 4
2377   temp1 = (step1[0] + step1[1]) * (tran_high_t)cospi_16_64;
2378   temp2 = (step1[0] - step1[1]) * (tran_high_t)cospi_16_64;
2379   step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2380   step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2381   temp1 =
2382       step1[2] * (tran_high_t)cospi_24_64 - step1[3] * (tran_high_t)cospi_8_64;
2383   temp2 =
2384       step1[2] * (tran_high_t)cospi_8_64 + step1[3] * (tran_high_t)cospi_24_64;
2385   step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2386   step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2387   step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
2388   step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
2389   step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
2390   step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
2391 
2392   step2[8] = step1[8];
2393   step2[15] = step1[15];
2394   temp1 = -step1[9] * (tran_high_t)cospi_8_64 +
2395           step1[14] * (tran_high_t)cospi_24_64;
2396   temp2 =
2397       step1[9] * (tran_high_t)cospi_24_64 + step1[14] * (tran_high_t)cospi_8_64;
2398   step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2399   step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2400   temp1 = -step1[10] * (tran_high_t)cospi_24_64 -
2401           step1[13] * (tran_high_t)cospi_8_64;
2402   temp2 = -step1[10] * (tran_high_t)cospi_8_64 +
2403           step1[13] * (tran_high_t)cospi_24_64;
2404   step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2405   step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2406   step2[11] = step1[11];
2407   step2[12] = step1[12];
2408 
2409   step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[19], bd);
2410   step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[18], bd);
2411   step2[18] = HIGHBD_WRAPLOW(step1[17] - step1[18], bd);
2412   step2[19] = HIGHBD_WRAPLOW(step1[16] - step1[19], bd);
2413   step2[20] = HIGHBD_WRAPLOW(-step1[20] + step1[23], bd);
2414   step2[21] = HIGHBD_WRAPLOW(-step1[21] + step1[22], bd);
2415   step2[22] = HIGHBD_WRAPLOW(step1[21] + step1[22], bd);
2416   step2[23] = HIGHBD_WRAPLOW(step1[20] + step1[23], bd);
2417 
2418   step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[27], bd);
2419   step2[25] = HIGHBD_WRAPLOW(step1[25] + step1[26], bd);
2420   step2[26] = HIGHBD_WRAPLOW(step1[25] - step1[26], bd);
2421   step2[27] = HIGHBD_WRAPLOW(step1[24] - step1[27], bd);
2422   step2[28] = HIGHBD_WRAPLOW(-step1[28] + step1[31], bd);
2423   step2[29] = HIGHBD_WRAPLOW(-step1[29] + step1[30], bd);
2424   step2[30] = HIGHBD_WRAPLOW(step1[29] + step1[30], bd);
2425   step2[31] = HIGHBD_WRAPLOW(step1[28] + step1[31], bd);
2426 
2427   // stage 5
2428   step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
2429   step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
2430   step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
2431   step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
2432   step1[4] = step2[4];
2433   temp1 = (step2[6] - step2[5]) * (tran_high_t)cospi_16_64;
2434   temp2 = (step2[5] + step2[6]) * (tran_high_t)cospi_16_64;
2435   step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2436   step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2437   step1[7] = step2[7];
2438 
2439   step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
2440   step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
2441   step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
2442   step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
2443   step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
2444   step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
2445   step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
2446   step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
2447 
2448   step1[16] = step2[16];
2449   step1[17] = step2[17];
2450   temp1 = -step2[18] * (tran_high_t)cospi_8_64 +
2451           step2[29] * (tran_high_t)cospi_24_64;
2452   temp2 = step2[18] * (tran_high_t)cospi_24_64 +
2453           step2[29] * (tran_high_t)cospi_8_64;
2454   step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2455   step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2456   temp1 = -step2[19] * (tran_high_t)cospi_8_64 +
2457           step2[28] * (tran_high_t)cospi_24_64;
2458   temp2 = step2[19] * (tran_high_t)cospi_24_64 +
2459           step2[28] * (tran_high_t)cospi_8_64;
2460   step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2461   step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2462   temp1 = -step2[20] * (tran_high_t)cospi_24_64 -
2463           step2[27] * (tran_high_t)cospi_8_64;
2464   temp2 = -step2[20] * (tran_high_t)cospi_8_64 +
2465           step2[27] * (tran_high_t)cospi_24_64;
2466   step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2467   step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2468   temp1 = -step2[21] * (tran_high_t)cospi_24_64 -
2469           step2[26] * (tran_high_t)cospi_8_64;
2470   temp2 = -step2[21] * (tran_high_t)cospi_8_64 +
2471           step2[26] * (tran_high_t)cospi_24_64;
2472   step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2473   step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2474   step1[22] = step2[22];
2475   step1[23] = step2[23];
2476   step1[24] = step2[24];
2477   step1[25] = step2[25];
2478   step1[30] = step2[30];
2479   step1[31] = step2[31];
2480 
2481   // stage 6
2482   step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
2483   step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
2484   step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
2485   step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
2486   step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
2487   step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
2488   step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
2489   step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
2490   step2[8] = step1[8];
2491   step2[9] = step1[9];
2492   temp1 = (-step1[10] + step1[13]) * (tran_high_t)cospi_16_64;
2493   temp2 = (step1[10] + step1[13]) * (tran_high_t)cospi_16_64;
2494   step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2495   step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2496   temp1 = (-step1[11] + step1[12]) * (tran_high_t)cospi_16_64;
2497   temp2 = (step1[11] + step1[12]) * (tran_high_t)cospi_16_64;
2498   step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2499   step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2500   step2[14] = step1[14];
2501   step2[15] = step1[15];
2502 
2503   step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[23], bd);
2504   step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[22], bd);
2505   step2[18] = HIGHBD_WRAPLOW(step1[18] + step1[21], bd);
2506   step2[19] = HIGHBD_WRAPLOW(step1[19] + step1[20], bd);
2507   step2[20] = HIGHBD_WRAPLOW(step1[19] - step1[20], bd);
2508   step2[21] = HIGHBD_WRAPLOW(step1[18] - step1[21], bd);
2509   step2[22] = HIGHBD_WRAPLOW(step1[17] - step1[22], bd);
2510   step2[23] = HIGHBD_WRAPLOW(step1[16] - step1[23], bd);
2511 
2512   step2[24] = HIGHBD_WRAPLOW(-step1[24] + step1[31], bd);
2513   step2[25] = HIGHBD_WRAPLOW(-step1[25] + step1[30], bd);
2514   step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[29], bd);
2515   step2[27] = HIGHBD_WRAPLOW(-step1[27] + step1[28], bd);
2516   step2[28] = HIGHBD_WRAPLOW(step1[27] + step1[28], bd);
2517   step2[29] = HIGHBD_WRAPLOW(step1[26] + step1[29], bd);
2518   step2[30] = HIGHBD_WRAPLOW(step1[25] + step1[30], bd);
2519   step2[31] = HIGHBD_WRAPLOW(step1[24] + step1[31], bd);
2520 
2521   // stage 7
2522   step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
2523   step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
2524   step1[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
2525   step1[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
2526   step1[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
2527   step1[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
2528   step1[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
2529   step1[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
2530   step1[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
2531   step1[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
2532   step1[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
2533   step1[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
2534   step1[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
2535   step1[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
2536   step1[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
2537   step1[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
2538 
2539   step1[16] = step2[16];
2540   step1[17] = step2[17];
2541   step1[18] = step2[18];
2542   step1[19] = step2[19];
2543   temp1 = (-step2[20] + step2[27]) * (tran_high_t)cospi_16_64;
2544   temp2 = (step2[20] + step2[27]) * (tran_high_t)cospi_16_64;
2545   step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2546   step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2547   temp1 = (-step2[21] + step2[26]) * (tran_high_t)cospi_16_64;
2548   temp2 = (step2[21] + step2[26]) * (tran_high_t)cospi_16_64;
2549   step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2550   step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2551   temp1 = (-step2[22] + step2[25]) * (tran_high_t)cospi_16_64;
2552   temp2 = (step2[22] + step2[25]) * (tran_high_t)cospi_16_64;
2553   step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2554   step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2555   temp1 = (-step2[23] + step2[24]) * (tran_high_t)cospi_16_64;
2556   temp2 = (step2[23] + step2[24]) * (tran_high_t)cospi_16_64;
2557   step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2558   step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2559   step1[28] = step2[28];
2560   step1[29] = step2[29];
2561   step1[30] = step2[30];
2562   step1[31] = step2[31];
2563 
2564   // final stage
2565   output[0] = HIGHBD_WRAPLOW(step1[0] + step1[31], bd);
2566   output[1] = HIGHBD_WRAPLOW(step1[1] + step1[30], bd);
2567   output[2] = HIGHBD_WRAPLOW(step1[2] + step1[29], bd);
2568   output[3] = HIGHBD_WRAPLOW(step1[3] + step1[28], bd);
2569   output[4] = HIGHBD_WRAPLOW(step1[4] + step1[27], bd);
2570   output[5] = HIGHBD_WRAPLOW(step1[5] + step1[26], bd);
2571   output[6] = HIGHBD_WRAPLOW(step1[6] + step1[25], bd);
2572   output[7] = HIGHBD_WRAPLOW(step1[7] + step1[24], bd);
2573   output[8] = HIGHBD_WRAPLOW(step1[8] + step1[23], bd);
2574   output[9] = HIGHBD_WRAPLOW(step1[9] + step1[22], bd);
2575   output[10] = HIGHBD_WRAPLOW(step1[10] + step1[21], bd);
2576   output[11] = HIGHBD_WRAPLOW(step1[11] + step1[20], bd);
2577   output[12] = HIGHBD_WRAPLOW(step1[12] + step1[19], bd);
2578   output[13] = HIGHBD_WRAPLOW(step1[13] + step1[18], bd);
2579   output[14] = HIGHBD_WRAPLOW(step1[14] + step1[17], bd);
2580   output[15] = HIGHBD_WRAPLOW(step1[15] + step1[16], bd);
2581   output[16] = HIGHBD_WRAPLOW(step1[15] - step1[16], bd);
2582   output[17] = HIGHBD_WRAPLOW(step1[14] - step1[17], bd);
2583   output[18] = HIGHBD_WRAPLOW(step1[13] - step1[18], bd);
2584   output[19] = HIGHBD_WRAPLOW(step1[12] - step1[19], bd);
2585   output[20] = HIGHBD_WRAPLOW(step1[11] - step1[20], bd);
2586   output[21] = HIGHBD_WRAPLOW(step1[10] - step1[21], bd);
2587   output[22] = HIGHBD_WRAPLOW(step1[9] - step1[22], bd);
2588   output[23] = HIGHBD_WRAPLOW(step1[8] - step1[23], bd);
2589   output[24] = HIGHBD_WRAPLOW(step1[7] - step1[24], bd);
2590   output[25] = HIGHBD_WRAPLOW(step1[6] - step1[25], bd);
2591   output[26] = HIGHBD_WRAPLOW(step1[5] - step1[26], bd);
2592   output[27] = HIGHBD_WRAPLOW(step1[4] - step1[27], bd);
2593   output[28] = HIGHBD_WRAPLOW(step1[3] - step1[28], bd);
2594   output[29] = HIGHBD_WRAPLOW(step1[2] - step1[29], bd);
2595   output[30] = HIGHBD_WRAPLOW(step1[1] - step1[30], bd);
2596   output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd);
2597 }
2598 
vpx_highbd_idct32x32_1024_add_c(const tran_low_t * input,uint16_t * dest,int stride,int bd)2599 void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest,
2600                                      int stride, int bd) {
2601   int i, j;
2602   tran_low_t out[32 * 32];
2603   tran_low_t *outptr = out;
2604   tran_low_t temp_in[32], temp_out[32];
2605 
2606   // Rows
2607   for (i = 0; i < 32; ++i) {
2608     tran_low_t zero_coeff = 0;
2609     for (j = 0; j < 32; ++j) zero_coeff |= input[j];
2610 
2611     if (zero_coeff)
2612       highbd_idct32_c(input, outptr, bd);
2613     else
2614       memset(outptr, 0, sizeof(tran_low_t) * 32);
2615     input += 32;
2616     outptr += 32;
2617   }
2618 
2619   // Columns
2620   for (i = 0; i < 32; ++i) {
2621     for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
2622     highbd_idct32_c(temp_in, temp_out, bd);
2623     for (j = 0; j < 32; ++j) {
2624       dest[j * stride + i] = highbd_clip_pixel_add(
2625           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2626     }
2627   }
2628 }
2629 
void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest,
                                    int stride, int bd) {
  int row, col;
  tran_low_t intermediate[32 * 32] = { 0 };
  tran_low_t *row_out = intermediate;
  tran_low_t col_in[32], col_out[32];

  // Rows: with at most 135 non-zero coefficients, only the upper-left
  // 16x16 region is populated, so only the first 16 rows need transforming.
  // The remaining rows of the intermediate buffer stay zero-initialized.
  for (row = 0; row < 16; ++row) {
    highbd_idct32_c(input, row_out, bd);
    input += 32;
    row_out += 32;
  }

  // Columns: transform every column, round by 6 bits, and add the result
  // into the destination with bit-depth clipping.
  for (col = 0; col < 32; ++col) {
    uint16_t *dst_col = dest;
    for (row = 0; row < 32; ++row) col_in[row] = intermediate[row * 32 + col];
    highbd_idct32_c(col_in, col_out, bd);
    for (row = 0; row < 32; ++row) {
      dst_col[col] = highbd_clip_pixel_add(
          dst_col[col], ROUND_POWER_OF_TWO(col_out[row], 6), bd);
      dst_col += stride;
    }
  }
}
2657 
void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest,
                                   int stride, int bd) {
  int row, col;
  tran_low_t intermediate[32 * 32] = { 0 };
  tran_low_t *row_out = intermediate;
  tran_low_t col_in[32], col_out[32];

  // Rows: with at most 34 non-zero coefficients, only the upper-left 8x8
  // region is populated, so only the first 8 rows need transforming. The
  // remaining rows of the intermediate buffer stay zero-initialized.
  for (row = 0; row < 8; ++row) {
    highbd_idct32_c(input, row_out, bd);
    input += 32;
    row_out += 32;
  }

  // Columns: transform every column, round by 6 bits, and add the result
  // into the destination with bit-depth clipping.
  for (col = 0; col < 32; ++col) {
    for (row = 0; row < 32; ++row) col_in[row] = intermediate[row * 32 + col];
    highbd_idct32_c(col_in, col_out, bd);
    for (row = 0; row < 32; ++row) {
      const int idx = row * stride + col;
      dest[idx] = highbd_clip_pixel_add(
          dest[idx], ROUND_POWER_OF_TWO(col_out[row], 6), bd);
    }
  }
}
2683 
vpx_highbd_idct32x32_1_add_c(const tran_low_t * input,uint16_t * dest,int stride,int bd)2684 void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest,
2685                                   int stride, int bd) {
2686   int i, j;
2687   int a1;
2688   tran_low_t out = HIGHBD_WRAPLOW(
2689       dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
2690 
2691   out =
2692       HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd);
2693   a1 = ROUND_POWER_OF_TWO(out, 6);
2694 
2695   for (j = 0; j < 32; ++j) {
2696     for (i = 0; i < 32; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
2697     dest += stride;
2698   }
2699 }
2700 
2701 #endif  // CONFIG_VP9_HIGHBITDEPTH
2702