1 /*
2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <math.h>
12 #include <stdlib.h>
13 #include <string.h>
14 
15 #include "./vpx_dsp_rtcd.h"
16 #include "vpx_dsp/inv_txfm.h"
17 
vpx_iwht4x4_16_add_c(const tran_low_t * input,uint8_t * dest,int stride)18 void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
19   /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
20      0.5 shifts per pixel. */
21   int i;
22   tran_low_t output[16];
23   tran_high_t a1, b1, c1, d1, e1;
24   const tran_low_t *ip = input;
25   tran_low_t *op = output;
26 
27   for (i = 0; i < 4; i++) {
28     a1 = ip[0] >> UNIT_QUANT_SHIFT;
29     c1 = ip[1] >> UNIT_QUANT_SHIFT;
30     d1 = ip[2] >> UNIT_QUANT_SHIFT;
31     b1 = ip[3] >> UNIT_QUANT_SHIFT;
32     a1 += c1;
33     d1 -= b1;
34     e1 = (a1 - d1) >> 1;
35     b1 = e1 - b1;
36     c1 = e1 - c1;
37     a1 -= b1;
38     d1 += c1;
39     op[0] = WRAPLOW(a1);
40     op[1] = WRAPLOW(b1);
41     op[2] = WRAPLOW(c1);
42     op[3] = WRAPLOW(d1);
43     ip += 4;
44     op += 4;
45   }
46 
47   ip = output;
48   for (i = 0; i < 4; i++) {
49     a1 = ip[4 * 0];
50     c1 = ip[4 * 1];
51     d1 = ip[4 * 2];
52     b1 = ip[4 * 3];
53     a1 += c1;
54     d1 -= b1;
55     e1 = (a1 - d1) >> 1;
56     b1 = e1 - b1;
57     c1 = e1 - c1;
58     a1 -= b1;
59     d1 += c1;
60     dest[stride * 0] = clip_pixel_add(dest[stride * 0], WRAPLOW(a1));
61     dest[stride * 1] = clip_pixel_add(dest[stride * 1], WRAPLOW(b1));
62     dest[stride * 2] = clip_pixel_add(dest[stride * 2], WRAPLOW(c1));
63     dest[stride * 3] = clip_pixel_add(dest[stride * 3], WRAPLOW(d1));
64 
65     ip++;
66     dest++;
67   }
68 }
69 
vpx_iwht4x4_1_add_c(const tran_low_t * in,uint8_t * dest,int stride)70 void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int stride) {
71   int i;
72   tran_high_t a1, e1;
73   tran_low_t tmp[4];
74   const tran_low_t *ip = in;
75   tran_low_t *op = tmp;
76 
77   a1 = ip[0] >> UNIT_QUANT_SHIFT;
78   e1 = a1 >> 1;
79   a1 -= e1;
80   op[0] = WRAPLOW(a1);
81   op[1] = op[2] = op[3] = WRAPLOW(e1);
82 
83   ip = tmp;
84   for (i = 0; i < 4; i++) {
85     e1 = ip[0] >> 1;
86     a1 = ip[0] - e1;
87     dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1);
88     dest[stride * 1] = clip_pixel_add(dest[stride * 1], e1);
89     dest[stride * 2] = clip_pixel_add(dest[stride * 2], e1);
90     dest[stride * 3] = clip_pixel_add(dest[stride * 3], e1);
91     ip++;
92     dest++;
93   }
94 }
95 
// 4-point inverse ADST of one coefficient vector.
//
//   input  - 4 coefficients.
//   output - receives 4 results; it is simply zero-filled when every input
//            coefficient is zero.
void iadst4_c(const tran_low_t *input, tran_low_t *output) {
  const tran_low_t in0 = input[0];
  const tran_low_t in1 = input[1];
  const tran_low_t in2 = input[2];
  const tran_low_t in3 = input[3];
  tran_high_t m0, m1, m2, m3, m4, m5, m6, m7;
  tran_high_t acc0, acc1, acc2, base;

  // All-zero input: nothing to compute.
  if ((in0 | in1 | in2 | in3) == 0) {
    memset(output, 0, 4 * sizeof(*output));
    return;
  }

  // Individual products with the sinpi_k_9 constants.
  m0 = sinpi_1_9 * in0;
  m1 = sinpi_2_9 * in0;
  m2 = sinpi_3_9 * in1;
  m3 = sinpi_4_9 * in2;
  m4 = sinpi_1_9 * in2;
  m5 = sinpi_2_9 * in3;
  m6 = sinpi_4_9 * in3;
  m7 = WRAPLOW(in0 - in2 + in3);

  // Combine the products into the three accumulators plus the shared
  // in1 term.
  acc0 = m0 + m3 + m5;
  acc1 = m1 - m4 - m6;
  base = m2;
  acc2 = sinpi_3_9 * m7;

  // 1-D transform scaling factor is sqrt(2).
  // Dynamic range: 14b (input) + 14b (multiplication scaling) + 1b (addition)
  // = 29b, so the output bit depth is 15b.
  output[0] = WRAPLOW(dct_const_round_shift(acc0 + base));
  output[1] = WRAPLOW(dct_const_round_shift(acc1 + base));
  output[2] = WRAPLOW(dct_const_round_shift(acc2));
  output[3] = WRAPLOW(dct_const_round_shift(acc0 + acc1 - base));
}
131 
// 4-point inverse DCT of one coefficient vector.
//
//   input  - 4 coefficients; all of them are read before output is written.
//   output - receives the 4 results.
void idct4_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t even_sum, even_diff, odd_lo, odd_hi;
  tran_high_t acc;

  // stage 1: even inputs (0, 2) share the cospi_16_64 scale, odd inputs
  // (1, 3) go through a cospi_8_64 / cospi_24_64 rotation.
  acc = (input[0] + input[2]) * cospi_16_64;
  even_sum = WRAPLOW(dct_const_round_shift(acc));
  acc = (input[0] - input[2]) * cospi_16_64;
  even_diff = WRAPLOW(dct_const_round_shift(acc));

  acc = input[1] * cospi_24_64 - input[3] * cospi_8_64;
  odd_lo = WRAPLOW(dct_const_round_shift(acc));
  acc = input[1] * cospi_8_64 + input[3] * cospi_24_64;
  odd_hi = WRAPLOW(dct_const_round_shift(acc));

  // stage 2: mirrored butterfly between the even and odd halves.
  output[0] = WRAPLOW(even_sum + odd_hi);
  output[1] = WRAPLOW(even_diff + odd_lo);
  output[2] = WRAPLOW(even_diff - odd_lo);
  output[3] = WRAPLOW(even_sum - odd_hi);
}
152 
// 2-D 4x4 inverse DCT: idct4_c() on every row, then on every column, with
// the column results rounded by 4 bits and merged into dest through
// clip_pixel_add().
//
//   input  - 16 coefficients, four rows of four.
//   dest   - top-left pixel of the 4x4 block that is updated in place.
//   stride - offset, in uint8_t elements, between vertically adjacent pixels.
void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t rows[4 * 4];
  tran_low_t col_in[4], col_out[4];
  int r, c;

  // Row pass into the scratch block.
  for (r = 0; r < 4; ++r) idct4_c(input + 4 * r, rows + 4 * r);

  // Column pass: gather one column, transform it, add it to the picture.
  for (c = 0; c < 4; ++c) {
    uint8_t *const px = dest + c;

    for (r = 0; r < 4; ++r) col_in[r] = rows[4 * r + c];
    idct4_c(col_in, col_out);
    for (r = 0; r < 4; ++r) {
      px[r * stride] =
          clip_pixel_add(px[r * stride], ROUND_POWER_OF_TWO(col_out[r], 4));
    }
  }
}
176 
vpx_idct4x4_1_add_c(const tran_low_t * input,uint8_t * dest,int stride)177 void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
178   int i;
179   tran_high_t a1;
180   tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
181 
182   out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
183   a1 = ROUND_POWER_OF_TWO(out, 4);
184 
185   for (i = 0; i < 4; i++) {
186     dest[0] = clip_pixel_add(dest[0], a1);
187     dest[1] = clip_pixel_add(dest[1], a1);
188     dest[2] = clip_pixel_add(dest[2], a1);
189     dest[3] = clip_pixel_add(dest[3], a1);
190     dest += stride;
191   }
192 }
193 
// 8-point inverse ADST of one coefficient vector.
//
//   input  - 8 coefficients.
//   output - receives 8 results; it is simply zero-filled when every input
//            coefficient is zero.
//
// The rotation products are deliberately held in plain int variables, as in
// the explicit (int) casts below.
void iadst8_c(const tran_low_t *input, tran_low_t *output) {
  int t0, t1, t2, t3, t4, t5, t6, t7;
  // Coefficients are picked up in a permuted order.
  tran_high_t v0 = input[7];
  tran_high_t v1 = input[0];
  tran_high_t v2 = input[5];
  tran_high_t v3 = input[2];
  tran_high_t v4 = input[3];
  tran_high_t v5 = input[4];
  tran_high_t v6 = input[1];
  tran_high_t v7 = input[6];

  // All-zero input: nothing to compute.
  if ((v0 | v1 | v2 | v3 | v4 | v5 | v6 | v7) == 0) {
    memset(output, 0, 8 * sizeof(*output));
    return;
  }

  // stage 1: four rotations ...
  t0 = (int)(cospi_2_64 * v0 + cospi_30_64 * v1);
  t1 = (int)(cospi_30_64 * v0 - cospi_2_64 * v1);
  t2 = (int)(cospi_10_64 * v2 + cospi_22_64 * v3);
  t3 = (int)(cospi_22_64 * v2 - cospi_10_64 * v3);
  t4 = (int)(cospi_18_64 * v4 + cospi_14_64 * v5);
  t5 = (int)(cospi_14_64 * v4 - cospi_18_64 * v5);
  t6 = (int)(cospi_26_64 * v6 + cospi_6_64 * v7);
  t7 = (int)(cospi_6_64 * v6 - cospi_26_64 * v7);

  // ... followed by sum/difference pairs between the two halves.
  v0 = WRAPLOW(dct_const_round_shift(t0 + t4));
  v4 = WRAPLOW(dct_const_round_shift(t0 - t4));
  v1 = WRAPLOW(dct_const_round_shift(t1 + t5));
  v5 = WRAPLOW(dct_const_round_shift(t1 - t5));
  v2 = WRAPLOW(dct_const_round_shift(t2 + t6));
  v6 = WRAPLOW(dct_const_round_shift(t2 - t6));
  v3 = WRAPLOW(dct_const_round_shift(t3 + t7));
  v7 = WRAPLOW(dct_const_round_shift(t3 - t7));

  // stage 2: lower half passes through, upper half is rotated.
  t0 = (int)v0;
  t1 = (int)v1;
  t2 = (int)v2;
  t3 = (int)v3;
  t4 = (int)(cospi_8_64 * v4 + cospi_24_64 * v5);
  t5 = (int)(cospi_24_64 * v4 - cospi_8_64 * v5);
  t6 = (int)(-cospi_24_64 * v6 + cospi_8_64 * v7);
  t7 = (int)(cospi_8_64 * v6 + cospi_24_64 * v7);

  v0 = WRAPLOW(t0 + t2);
  v2 = WRAPLOW(t0 - t2);
  v1 = WRAPLOW(t1 + t3);
  v3 = WRAPLOW(t1 - t3);
  v4 = WRAPLOW(dct_const_round_shift(t4 + t6));
  v6 = WRAPLOW(dct_const_round_shift(t4 - t6));
  v5 = WRAPLOW(dct_const_round_shift(t5 + t7));
  v7 = WRAPLOW(dct_const_round_shift(t5 - t7));

  // stage 3: cospi_16_64 scaling of the (2,3) and (6,7) pairs.
  t2 = (int)(cospi_16_64 * (v2 + v3));
  t3 = (int)(cospi_16_64 * (v2 - v3));
  v2 = WRAPLOW(dct_const_round_shift(t2));
  v3 = WRAPLOW(dct_const_round_shift(t3));

  t6 = (int)(cospi_16_64 * (v6 + v7));
  t7 = (int)(cospi_16_64 * (v6 - v7));
  v6 = WRAPLOW(dct_const_round_shift(t6));
  v7 = WRAPLOW(dct_const_round_shift(t7));

  // Final permutation; every odd output position is negated.
  output[0] = WRAPLOW(v0);
  output[1] = WRAPLOW(-v4);
  output[2] = WRAPLOW(v6);
  output[3] = WRAPLOW(-v2);
  output[4] = WRAPLOW(v3);
  output[5] = WRAPLOW(-v7);
  output[6] = WRAPLOW(v5);
  output[7] = WRAPLOW(-v1);
}
268 
// 8-point inverse DCT of one coefficient vector.
//
//   input  - 8 coefficients; all of them are read in stage 1, before any
//            element of output is written.
//   output - receives the 8 results.
void idct8_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t a[8], b[8];
  tran_high_t lo, hi;
  int i;

  // stage 1: even-indexed inputs are only reordered, odd-indexed inputs are
  // rotated in pairs (1,7) and (5,3).
  a[0] = input[0];
  a[1] = input[2];
  a[2] = input[4];
  a[3] = input[6];

  lo = input[1] * cospi_28_64 - input[7] * cospi_4_64;
  hi = input[1] * cospi_4_64 + input[7] * cospi_28_64;
  a[4] = WRAPLOW(dct_const_round_shift(lo));
  a[7] = WRAPLOW(dct_const_round_shift(hi));

  lo = input[5] * cospi_12_64 - input[3] * cospi_20_64;
  hi = input[5] * cospi_20_64 + input[3] * cospi_12_64;
  a[5] = WRAPLOW(dct_const_round_shift(lo));
  a[6] = WRAPLOW(dct_const_round_shift(hi));

  // stage 2: first stage of the embedded 4-point transform on a[0..3] and
  // butterflies on a[4..7].
  lo = (a[0] + a[2]) * cospi_16_64;
  hi = (a[0] - a[2]) * cospi_16_64;
  b[0] = WRAPLOW(dct_const_round_shift(lo));
  b[1] = WRAPLOW(dct_const_round_shift(hi));

  lo = a[1] * cospi_24_64 - a[3] * cospi_8_64;
  hi = a[1] * cospi_8_64 + a[3] * cospi_24_64;
  b[2] = WRAPLOW(dct_const_round_shift(lo));
  b[3] = WRAPLOW(dct_const_round_shift(hi));

  b[4] = WRAPLOW(a[4] + a[5]);
  b[5] = WRAPLOW(a[4] - a[5]);
  b[6] = WRAPLOW(-a[6] + a[7]);
  b[7] = WRAPLOW(a[6] + a[7]);

  // stage 3: finish the 4-point part; rescale the middle pair (5,6).
  a[0] = WRAPLOW(b[0] + b[3]);
  a[1] = WRAPLOW(b[1] + b[2]);
  a[2] = WRAPLOW(b[1] - b[2]);
  a[3] = WRAPLOW(b[0] - b[3]);

  a[4] = b[4];
  lo = (b[6] - b[5]) * cospi_16_64;
  hi = (b[5] + b[6]) * cospi_16_64;
  a[5] = WRAPLOW(dct_const_round_shift(lo));
  a[6] = WRAPLOW(dct_const_round_shift(hi));
  a[7] = b[7];

  // stage 4: mirrored butterfly, output[i] and output[7 - i] come from the
  // sum and the difference of a[i] and a[7 - i].
  for (i = 0; i < 4; ++i) {
    output[i] = WRAPLOW(a[i] + a[7 - i]);
    output[7 - i] = WRAPLOW(a[i] - a[7 - i]);
  }
}
323 
// 2-D 8x8 inverse DCT over all 64 coefficients: idct8_c() on every row, then
// on every column, with the column results rounded by 5 bits and merged into
// dest through clip_pixel_add().
//
//   input  - 64 coefficients, eight rows of eight.
//   dest   - top-left pixel of the 8x8 block that is updated in place.
//   stride - offset, in uint8_t elements, between vertically adjacent pixels.
void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t rows[8 * 8];
  tran_low_t col_in[8], col_out[8];
  int r, c;

  // Row pass into the scratch block.
  for (r = 0; r < 8; ++r) idct8_c(input + 8 * r, rows + 8 * r);

  // Column pass: gather one column, transform it, add it to the picture.
  for (c = 0; c < 8; ++c) {
    uint8_t *const px = dest + c;

    for (r = 0; r < 8; ++r) col_in[r] = rows[8 * r + c];
    idct8_c(col_in, col_out);
    for (r = 0; r < 8; ++r) {
      px[r * stride] =
          clip_pixel_add(px[r * stride], ROUND_POWER_OF_TWO(col_out[r], 5));
    }
  }
}
347 
// 2-D 8x8 inverse DCT for blocks whose non-zero coefficients all sit in the
// first four rows. Only those rows are transformed; the remaining rows of the
// scratch block stay at their zero initial value.
//
//   input  - coefficient buffer; the first 32 entries (four rows of eight)
//            are read.
//   dest   - top-left pixel of the 8x8 block that is updated in place.
//   stride - offset, in uint8_t elements, between vertically adjacent pixels.
void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t rows[8 * 8] = { 0 };
  tran_low_t col_in[8], col_out[8];
  int r, c;

  // Row pass, limited to the four rows that can hold non-zero coefficients.
  for (r = 0; r < 4; ++r) idct8_c(input + 8 * r, rows + 8 * r);

  // Column pass over the full block, rounded by 5 bits.
  for (c = 0; c < 8; ++c) {
    uint8_t *const px = dest + c;

    for (r = 0; r < 8; ++r) col_in[r] = rows[8 * r + c];
    idct8_c(col_in, col_out);
    for (r = 0; r < 8; ++r) {
      px[r * stride] =
          clip_pixel_add(px[r * stride], ROUND_POWER_OF_TWO(col_out[r], 5));
    }
  }
}
372 
vpx_idct8x8_1_add_c(const tran_low_t * input,uint8_t * dest,int stride)373 void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
374   int i, j;
375   tran_high_t a1;
376   tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
377 
378   out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
379   a1 = ROUND_POWER_OF_TWO(out, 5);
380   for (j = 0; j < 8; ++j) {
381     for (i = 0; i < 8; ++i) dest[i] = clip_pixel_add(dest[i], a1);
382     dest += stride;
383   }
384 }
385 
// 16-point inverse ADST of one coefficient vector.
//
//   input  - 16 coefficients.
//   output - receives 16 results; it is simply zero-filled when every input
//            coefficient is zero.
//
// Each stage first forms rotation products (t*) from the current values (v*)
// and then replaces the values with sums and differences of those products.
void iadst16_c(const tran_low_t *input, tran_low_t *output) {
  tran_high_t t0, t1, t2, t3, t4, t5, t6, t7;
  tran_high_t t8, t9, t10, t11, t12, t13, t14, t15;
  // Coefficients are picked up in a permuted order.
  tran_high_t v0 = input[15];
  tran_high_t v1 = input[0];
  tran_high_t v2 = input[13];
  tran_high_t v3 = input[2];
  tran_high_t v4 = input[11];
  tran_high_t v5 = input[4];
  tran_high_t v6 = input[9];
  tran_high_t v7 = input[6];
  tran_high_t v8 = input[7];
  tran_high_t v9 = input[8];
  tran_high_t v10 = input[5];
  tran_high_t v11 = input[10];
  tran_high_t v12 = input[3];
  tran_high_t v13 = input[12];
  tran_high_t v14 = input[1];
  tran_high_t v15 = input[14];

  // All-zero input: nothing to compute.
  if ((v0 | v1 | v2 | v3 | v4 | v5 | v6 | v7 | v8 | v9 | v10 | v11 | v12 |
       v13 | v14 | v15) == 0) {
    memset(output, 0, 16 * sizeof(*output));
    return;
  }

  // stage 1: eight rotations on neighbouring pairs.
  t0 = v0 * cospi_1_64 + v1 * cospi_31_64;
  t1 = v0 * cospi_31_64 - v1 * cospi_1_64;
  t2 = v2 * cospi_5_64 + v3 * cospi_27_64;
  t3 = v2 * cospi_27_64 - v3 * cospi_5_64;
  t4 = v4 * cospi_9_64 + v5 * cospi_23_64;
  t5 = v4 * cospi_23_64 - v5 * cospi_9_64;
  t6 = v6 * cospi_13_64 + v7 * cospi_19_64;
  t7 = v6 * cospi_19_64 - v7 * cospi_13_64;
  t8 = v8 * cospi_17_64 + v9 * cospi_15_64;
  t9 = v8 * cospi_15_64 - v9 * cospi_17_64;
  t10 = v10 * cospi_21_64 + v11 * cospi_11_64;
  t11 = v10 * cospi_11_64 - v11 * cospi_21_64;
  t12 = v12 * cospi_25_64 + v13 * cospi_7_64;
  t13 = v12 * cospi_7_64 - v13 * cospi_25_64;
  t14 = v14 * cospi_29_64 + v15 * cospi_3_64;
  t15 = v14 * cospi_3_64 - v15 * cospi_29_64;

  // Sum/difference between product k and product k + 8.
  v0 = WRAPLOW(dct_const_round_shift(t0 + t8));
  v8 = WRAPLOW(dct_const_round_shift(t0 - t8));
  v1 = WRAPLOW(dct_const_round_shift(t1 + t9));
  v9 = WRAPLOW(dct_const_round_shift(t1 - t9));
  v2 = WRAPLOW(dct_const_round_shift(t2 + t10));
  v10 = WRAPLOW(dct_const_round_shift(t2 - t10));
  v3 = WRAPLOW(dct_const_round_shift(t3 + t11));
  v11 = WRAPLOW(dct_const_round_shift(t3 - t11));
  v4 = WRAPLOW(dct_const_round_shift(t4 + t12));
  v12 = WRAPLOW(dct_const_round_shift(t4 - t12));
  v5 = WRAPLOW(dct_const_round_shift(t5 + t13));
  v13 = WRAPLOW(dct_const_round_shift(t5 - t13));
  v6 = WRAPLOW(dct_const_round_shift(t6 + t14));
  v14 = WRAPLOW(dct_const_round_shift(t6 - t14));
  v7 = WRAPLOW(dct_const_round_shift(t7 + t15));
  v15 = WRAPLOW(dct_const_round_shift(t7 - t15));

  // stage 2: values 0..7 pass through, values 8..15 are rotated.
  t0 = v0;
  t1 = v1;
  t2 = v2;
  t3 = v3;
  t4 = v4;
  t5 = v5;
  t6 = v6;
  t7 = v7;
  t8 = v8 * cospi_4_64 + v9 * cospi_28_64;
  t9 = v8 * cospi_28_64 - v9 * cospi_4_64;
  t10 = v10 * cospi_20_64 + v11 * cospi_12_64;
  t11 = v10 * cospi_12_64 - v11 * cospi_20_64;
  t12 = -v12 * cospi_28_64 + v13 * cospi_4_64;
  t13 = v12 * cospi_4_64 + v13 * cospi_28_64;
  t14 = -v14 * cospi_12_64 + v15 * cospi_20_64;
  t15 = v14 * cospi_20_64 + v15 * cospi_12_64;

  // Sum/difference between entry k and entry k + 4 inside each half.
  v0 = WRAPLOW(t0 + t4);
  v4 = WRAPLOW(t0 - t4);
  v1 = WRAPLOW(t1 + t5);
  v5 = WRAPLOW(t1 - t5);
  v2 = WRAPLOW(t2 + t6);
  v6 = WRAPLOW(t2 - t6);
  v3 = WRAPLOW(t3 + t7);
  v7 = WRAPLOW(t3 - t7);
  v8 = WRAPLOW(dct_const_round_shift(t8 + t12));
  v12 = WRAPLOW(dct_const_round_shift(t8 - t12));
  v9 = WRAPLOW(dct_const_round_shift(t9 + t13));
  v13 = WRAPLOW(dct_const_round_shift(t9 - t13));
  v10 = WRAPLOW(dct_const_round_shift(t10 + t14));
  v14 = WRAPLOW(dct_const_round_shift(t10 - t14));
  v11 = WRAPLOW(dct_const_round_shift(t11 + t15));
  v15 = WRAPLOW(dct_const_round_shift(t11 - t15));

  // stage 3: quarters 0..3 and 8..11 pass through, quarters 4..7 and 12..15
  // are rotated with cospi_8_64 / cospi_24_64.
  t0 = v0;
  t1 = v1;
  t2 = v2;
  t3 = v3;
  t4 = v4 * cospi_8_64 + v5 * cospi_24_64;
  t5 = v4 * cospi_24_64 - v5 * cospi_8_64;
  t6 = -v6 * cospi_24_64 + v7 * cospi_8_64;
  t7 = v6 * cospi_8_64 + v7 * cospi_24_64;
  t8 = v8;
  t9 = v9;
  t10 = v10;
  t11 = v11;
  t12 = v12 * cospi_8_64 + v13 * cospi_24_64;
  t13 = v12 * cospi_24_64 - v13 * cospi_8_64;
  t14 = -v14 * cospi_24_64 + v15 * cospi_8_64;
  t15 = v14 * cospi_8_64 + v15 * cospi_24_64;

  // Sum/difference between entry k and entry k + 2 inside each quarter.
  v0 = WRAPLOW(t0 + t2);
  v2 = WRAPLOW(t0 - t2);
  v1 = WRAPLOW(t1 + t3);
  v3 = WRAPLOW(t1 - t3);
  v4 = WRAPLOW(dct_const_round_shift(t4 + t6));
  v6 = WRAPLOW(dct_const_round_shift(t4 - t6));
  v5 = WRAPLOW(dct_const_round_shift(t5 + t7));
  v7 = WRAPLOW(dct_const_round_shift(t5 - t7));
  v8 = WRAPLOW(t8 + t10);
  v10 = WRAPLOW(t8 - t10);
  v9 = WRAPLOW(t9 + t11);
  v11 = WRAPLOW(t9 - t11);
  v12 = WRAPLOW(dct_const_round_shift(t12 + t14));
  v14 = WRAPLOW(dct_const_round_shift(t12 - t14));
  v13 = WRAPLOW(dct_const_round_shift(t13 + t15));
  v15 = WRAPLOW(dct_const_round_shift(t13 - t15));

  // stage 4: cospi_16_64 scaling of the pairs (2,3), (6,7), (10,11), (14,15).
  t2 = (-cospi_16_64) * (v2 + v3);
  t3 = cospi_16_64 * (v2 - v3);
  t6 = cospi_16_64 * (v6 + v7);
  t7 = cospi_16_64 * (-v6 + v7);
  t10 = cospi_16_64 * (v10 + v11);
  t11 = cospi_16_64 * (-v10 + v11);
  t14 = (-cospi_16_64) * (v14 + v15);
  t15 = cospi_16_64 * (v14 - v15);

  v2 = WRAPLOW(dct_const_round_shift(t2));
  v3 = WRAPLOW(dct_const_round_shift(t3));
  v6 = WRAPLOW(dct_const_round_shift(t6));
  v7 = WRAPLOW(dct_const_round_shift(t7));
  v10 = WRAPLOW(dct_const_round_shift(t10));
  v11 = WRAPLOW(dct_const_round_shift(t11));
  v14 = WRAPLOW(dct_const_round_shift(t14));
  v15 = WRAPLOW(dct_const_round_shift(t15));

  // Final permutation; positions 1, 3, 13 and 15 are negated.
  output[0] = WRAPLOW(v0);
  output[1] = WRAPLOW(-v8);
  output[2] = WRAPLOW(v12);
  output[3] = WRAPLOW(-v4);
  output[4] = WRAPLOW(v6);
  output[5] = WRAPLOW(v14);
  output[6] = WRAPLOW(v10);
  output[7] = WRAPLOW(v2);
  output[8] = WRAPLOW(v3);
  output[9] = WRAPLOW(v11);
  output[10] = WRAPLOW(v15);
  output[11] = WRAPLOW(v7);
  output[12] = WRAPLOW(v5);
  output[13] = WRAPLOW(-v13);
  output[14] = WRAPLOW(v9);
  output[15] = WRAPLOW(-v1);
}
553 
// 16-point inverse DCT of one coefficient vector.
//
//   input  - 16 coefficients; all of them are read in stage 1, before any
//            element of output is written.
//   output - receives the 16 results.
//
// Two scratch arrays (u and v) alternate as source and destination from one
// stage to the next.
void idct16_c(const tran_low_t *input, tran_low_t *output) {
  // Order in which stage 1 picks up the input coefficients.
  static const int kInputOrder[16] = { 0, 8, 4, 12, 2, 10, 6, 14,
                                       1, 9, 5, 13, 3, 11, 7, 15 };
  tran_low_t u[16], v[16];
  tran_high_t lo, hi;
  int i;

  // stage 1: pure reordering of the input.
  for (i = 0; i < 16; ++i) u[i] = input[kInputOrder[i]];

  // stage 2: entries 0..7 pass through, entries 8..15 are rotated in the
  // mirrored pairs (8,15), (9,14), (10,13), (11,12).
  for (i = 0; i < 8; ++i) v[i] = u[i];

  lo = u[8] * cospi_30_64 - u[15] * cospi_2_64;
  hi = u[8] * cospi_2_64 + u[15] * cospi_30_64;
  v[8] = WRAPLOW(dct_const_round_shift(lo));
  v[15] = WRAPLOW(dct_const_round_shift(hi));

  lo = u[9] * cospi_14_64 - u[14] * cospi_18_64;
  hi = u[9] * cospi_18_64 + u[14] * cospi_14_64;
  v[9] = WRAPLOW(dct_const_round_shift(lo));
  v[14] = WRAPLOW(dct_const_round_shift(hi));

  lo = u[10] * cospi_22_64 - u[13] * cospi_10_64;
  hi = u[10] * cospi_10_64 + u[13] * cospi_22_64;
  v[10] = WRAPLOW(dct_const_round_shift(lo));
  v[13] = WRAPLOW(dct_const_round_shift(hi));

  lo = u[11] * cospi_6_64 - u[12] * cospi_26_64;
  hi = u[11] * cospi_26_64 + u[12] * cospi_6_64;
  v[11] = WRAPLOW(dct_const_round_shift(lo));
  v[12] = WRAPLOW(dct_const_round_shift(hi));

  // stage 3: entries 0..3 pass through, 4..7 are rotated, 8..15 go through
  // neighbour butterflies.
  for (i = 0; i < 4; ++i) u[i] = v[i];

  lo = v[4] * cospi_28_64 - v[7] * cospi_4_64;
  hi = v[4] * cospi_4_64 + v[7] * cospi_28_64;
  u[4] = WRAPLOW(dct_const_round_shift(lo));
  u[7] = WRAPLOW(dct_const_round_shift(hi));

  lo = v[5] * cospi_12_64 - v[6] * cospi_20_64;
  hi = v[5] * cospi_20_64 + v[6] * cospi_12_64;
  u[5] = WRAPLOW(dct_const_round_shift(lo));
  u[6] = WRAPLOW(dct_const_round_shift(hi));

  u[8] = WRAPLOW(v[8] + v[9]);
  u[9] = WRAPLOW(v[8] - v[9]);
  u[10] = WRAPLOW(-v[10] + v[11]);
  u[11] = WRAPLOW(v[10] + v[11]);
  u[12] = WRAPLOW(v[12] + v[13]);
  u[13] = WRAPLOW(v[12] - v[13]);
  u[14] = WRAPLOW(-v[14] + v[15]);
  u[15] = WRAPLOW(v[14] + v[15]);

  // stage 4
  lo = (u[0] + u[1]) * cospi_16_64;
  hi = (u[0] - u[1]) * cospi_16_64;
  v[0] = WRAPLOW(dct_const_round_shift(lo));
  v[1] = WRAPLOW(dct_const_round_shift(hi));

  lo = u[2] * cospi_24_64 - u[3] * cospi_8_64;
  hi = u[2] * cospi_8_64 + u[3] * cospi_24_64;
  v[2] = WRAPLOW(dct_const_round_shift(lo));
  v[3] = WRAPLOW(dct_const_round_shift(hi));

  v[4] = WRAPLOW(u[4] + u[5]);
  v[5] = WRAPLOW(u[4] - u[5]);
  v[6] = WRAPLOW(-u[6] + u[7]);
  v[7] = WRAPLOW(u[6] + u[7]);

  v[8] = u[8];
  v[15] = u[15];

  lo = -u[9] * cospi_8_64 + u[14] * cospi_24_64;
  hi = u[9] * cospi_24_64 + u[14] * cospi_8_64;
  v[9] = WRAPLOW(dct_const_round_shift(lo));
  v[14] = WRAPLOW(dct_const_round_shift(hi));

  lo = -u[10] * cospi_24_64 - u[13] * cospi_8_64;
  hi = -u[10] * cospi_8_64 + u[13] * cospi_24_64;
  v[10] = WRAPLOW(dct_const_round_shift(lo));
  v[13] = WRAPLOW(dct_const_round_shift(hi));

  v[11] = u[11];
  v[12] = u[12];

  // stage 5
  u[0] = WRAPLOW(v[0] + v[3]);
  u[1] = WRAPLOW(v[1] + v[2]);
  u[2] = WRAPLOW(v[1] - v[2]);
  u[3] = WRAPLOW(v[0] - v[3]);

  u[4] = v[4];
  lo = (v[6] - v[5]) * cospi_16_64;
  hi = (v[5] + v[6]) * cospi_16_64;
  u[5] = WRAPLOW(dct_const_round_shift(lo));
  u[6] = WRAPLOW(dct_const_round_shift(hi));
  u[7] = v[7];

  u[8] = WRAPLOW(v[8] + v[11]);
  u[9] = WRAPLOW(v[9] + v[10]);
  u[10] = WRAPLOW(v[9] - v[10]);
  u[11] = WRAPLOW(v[8] - v[11]);
  u[12] = WRAPLOW(-v[12] + v[15]);
  u[13] = WRAPLOW(-v[13] + v[14]);
  u[14] = WRAPLOW(v[13] + v[14]);
  u[15] = WRAPLOW(v[12] + v[15]);

  // stage 6: mirrored butterfly over entries 0..7, cospi_16_64 rescale of
  // the pairs (10,13) and (11,12); 8, 9, 14 and 15 pass through.
  for (i = 0; i < 4; ++i) {
    v[i] = WRAPLOW(u[i] + u[7 - i]);
    v[7 - i] = WRAPLOW(u[i] - u[7 - i]);
  }

  v[8] = u[8];
  v[9] = u[9];

  lo = (-u[10] + u[13]) * cospi_16_64;
  hi = (u[10] + u[13]) * cospi_16_64;
  v[10] = WRAPLOW(dct_const_round_shift(lo));
  v[13] = WRAPLOW(dct_const_round_shift(hi));

  lo = (-u[11] + u[12]) * cospi_16_64;
  hi = (u[11] + u[12]) * cospi_16_64;
  v[11] = WRAPLOW(dct_const_round_shift(lo));
  v[12] = WRAPLOW(dct_const_round_shift(hi));

  v[14] = u[14];
  v[15] = u[15];

  // stage 7: mirrored butterfly over all 16 entries, output[i] and
  // output[15 - i] come from the sum and difference of v[i] and v[15 - i].
  for (i = 0; i < 8; ++i) {
    output[i] = WRAPLOW(v[i] + v[15 - i]);
    output[15 - i] = WRAPLOW(v[i] - v[15 - i]);
  }
}
718 
// 2-D 16x16 inverse DCT over all 256 coefficients: idct16_c() on every row,
// then on every column, with the column results rounded by 6 bits and merged
// into dest through clip_pixel_add().
//
//   input  - 256 coefficients, sixteen rows of sixteen.
//   dest   - top-left pixel of the 16x16 block that is updated in place.
//   stride - offset, in uint8_t elements, between vertically adjacent pixels.
void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  tran_low_t rows[16 * 16];
  tran_low_t col_in[16], col_out[16];
  int r, c;

  // Row pass into the scratch block.
  for (r = 0; r < 16; ++r) idct16_c(input + 16 * r, rows + 16 * r);

  // Column pass: gather one column, transform it, add it to the picture.
  for (c = 0; c < 16; ++c) {
    uint8_t *const px = dest + c;

    for (r = 0; r < 16; ++r) col_in[r] = rows[16 * r + c];
    idct16_c(col_in, col_out);
    for (r = 0; r < 16; ++r) {
      px[r * stride] =
          clip_pixel_add(px[r * stride], ROUND_POWER_OF_TWO(col_out[r], 6));
    }
  }
}
743 
// 2-D 16x16 inverse DCT for blocks whose non-zero coefficients all lie in the
// upper-left 8x8 area. Only the first eight rows are transformed; the rest of
// the scratch block keeps its zero initial value.
//
//   input  - coefficient buffer; the first 128 entries (eight rows of
//            sixteen) are read.
//   dest   - top-left pixel of the 16x16 block that is updated in place.
//   stride - offset, in uint8_t elements, between vertically adjacent pixels.
void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  tran_low_t rows[16 * 16] = { 0 };
  tran_low_t col_in[16], col_out[16];
  int r, c;

  // Row pass, limited to the eight rows that can hold non-zero coefficients.
  for (r = 0; r < 8; ++r) idct16_c(input + 16 * r, rows + 16 * r);

  // Column pass over the full block, rounded by 6 bits.
  for (c = 0; c < 16; ++c) {
    uint8_t *const px = dest + c;

    for (r = 0; r < 16; ++r) col_in[r] = rows[16 * r + c];
    idct16_c(col_in, col_out);
    for (r = 0; r < 16; ++r) {
      px[r * stride] =
          clip_pixel_add(px[r * stride], ROUND_POWER_OF_TWO(col_out[r], 6));
    }
  }
}
769 
// 2-D 16x16 inverse DCT for blocks whose non-zero coefficients all lie in the
// upper-left 4x4 area. Only the first four rows are transformed; the rest of
// the scratch block keeps its zero initial value.
//
//   input  - coefficient buffer; the first 64 entries (four rows of sixteen)
//            are read.
//   dest   - top-left pixel of the 16x16 block that is updated in place.
//   stride - offset, in uint8_t elements, between vertically adjacent pixels.
void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  tran_low_t rows[16 * 16] = { 0 };
  tran_low_t col_in[16], col_out[16];
  int r, c;

  // Row pass, limited to the four rows that can hold non-zero coefficients.
  for (r = 0; r < 4; ++r) idct16_c(input + 16 * r, rows + 16 * r);

  // Column pass over the full block, rounded by 6 bits.
  for (c = 0; c < 16; ++c) {
    uint8_t *const px = dest + c;

    for (r = 0; r < 16; ++r) col_in[r] = rows[16 * r + c];
    idct16_c(col_in, col_out);
    for (r = 0; r < 16; ++r) {
      px[r * stride] =
          clip_pixel_add(px[r * stride], ROUND_POWER_OF_TWO(col_out[r], 6));
    }
  }
}
795 
vpx_idct16x16_1_add_c(const tran_low_t * input,uint8_t * dest,int stride)796 void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
797   int i, j;
798   tran_high_t a1;
799   tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
800 
801   out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
802   a1 = ROUND_POWER_OF_TWO(out, 6);
803   for (j = 0; j < 16; ++j) {
804     for (i = 0; i < 16; ++i) dest[i] = clip_pixel_add(dest[i], a1);
805     dest += stride;
806   }
807 }
808 
// 32-point 1-D inverse transform used by the vpx_idct32x32_*_add_c()
// functions for both their row pass and their column pass.
//   input  - 32 coefficients; they are read only during stage 1.
//   output - 32 results; they are written only in the final stage.
// The computation ping-pongs between the step1[] and step2[] scratch arrays:
// each stage reads the array that the previous stage wrote. Weighted sums are
// formed in the tran_high_t temporaries temp1/temp2 and are passed through
// dct_const_round_shift() and WRAPLOW() before being stored; plain
// add/subtract butterflies go through WRAPLOW() only.
void idct32_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[32], step2[32];
  tran_high_t temp1, temp2;

  // stage 1
  // Even-indexed inputs are copied, in permuted order, into step1[0..15].
  step1[0] = input[0];
  step1[1] = input[16];
  step1[2] = input[8];
  step1[3] = input[24];
  step1[4] = input[4];
  step1[5] = input[20];
  step1[6] = input[12];
  step1[7] = input[28];
  step1[8] = input[2];
  step1[9] = input[18];
  step1[10] = input[10];
  step1[11] = input[26];
  step1[12] = input[6];
  step1[13] = input[22];
  step1[14] = input[14];
  step1[15] = input[30];

  // Odd-indexed inputs are combined in pairs with cospi_* weights; each pair
  // produces the two mirrored entries step1[16 + k] and step1[31 - k].
  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
  step1[16] = WRAPLOW(dct_const_round_shift(temp1));
  step1[31] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
  step1[30] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
  step1[29] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
  step1[28] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
  step1[27] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
  step1[25] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
  step1[23] = WRAPLOW(dct_const_round_shift(temp1));
  step1[24] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 2
  // Entries 0..7 pass through unchanged.
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  // Entries 8..15: weighted pairs (8,15), (9,14), (10,13), (11,12).
  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
  step2[15] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));

  // Entries 16..31: add/subtract butterflies on adjacent pairs.
  step2[16] = WRAPLOW(step1[16] + step1[17]);
  step2[17] = WRAPLOW(step1[16] - step1[17]);
  step2[18] = WRAPLOW(-step1[18] + step1[19]);
  step2[19] = WRAPLOW(step1[18] + step1[19]);
  step2[20] = WRAPLOW(step1[20] + step1[21]);
  step2[21] = WRAPLOW(step1[20] - step1[21]);
  step2[22] = WRAPLOW(-step1[22] + step1[23]);
  step2[23] = WRAPLOW(step1[22] + step1[23]);
  step2[24] = WRAPLOW(step1[24] + step1[25]);
  step2[25] = WRAPLOW(step1[24] - step1[25]);
  step2[26] = WRAPLOW(-step1[26] + step1[27]);
  step2[27] = WRAPLOW(step1[26] + step1[27]);
  step2[28] = WRAPLOW(step1[28] + step1[29]);
  step2[29] = WRAPLOW(step1[28] - step1[29]);
  step2[30] = WRAPLOW(-step1[30] + step1[31]);
  step2[31] = WRAPLOW(step1[30] + step1[31]);

  // stage 3
  // Entries 0..3 pass through unchanged.
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  // Entries 4..7: weighted pairs (4,7) and (5,6).
  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));

  // Entries 8..15: add/subtract butterflies on adjacent pairs.
  step1[8] = WRAPLOW(step2[8] + step2[9]);
  step1[9] = WRAPLOW(step2[8] - step2[9]);
  step1[10] = WRAPLOW(-step2[10] + step2[11]);
  step1[11] = WRAPLOW(step2[10] + step2[11]);
  step1[12] = WRAPLOW(step2[12] + step2[13]);
  step1[13] = WRAPLOW(step2[12] - step2[13]);
  step1[14] = WRAPLOW(-step2[14] + step2[15]);
  step1[15] = WRAPLOW(step2[14] + step2[15]);

  // Entries 16..31: weighted pairs (17,30), (18,29), (21,26), (22,25); the
  // remaining entries pass through unchanged.
  step1[16] = step2[16];
  step1[31] = step2[31];
  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
  step1[30] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
  step1[19] = step2[19];
  step1[20] = step2[20];
  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[27] = step2[27];
  step1[28] = step2[28];

  // stage 4
  // Entries 0..3: sum/difference of (0,1) scaled by cospi_16_64, and the
  // weighted pair (2,3).
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
  // Entries 4..7: add/subtract butterflies on adjacent pairs.
  step2[4] = WRAPLOW(step1[4] + step1[5]);
  step2[5] = WRAPLOW(step1[4] - step1[5]);
  step2[6] = WRAPLOW(-step1[6] + step1[7]);
  step2[7] = WRAPLOW(step1[6] + step1[7]);

  // Entries 8..15: weighted pairs (9,14) and (10,13); 8, 11, 12 and 15 pass
  // through unchanged.
  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  step2[11] = step1[11];
  step2[12] = step1[12];

  // Entries 16..31: add/subtract butterflies inside each group of four
  // (16..19, 20..23, 24..27, 28..31).
  step2[16] = WRAPLOW(step1[16] + step1[19]);
  step2[17] = WRAPLOW(step1[17] + step1[18]);
  step2[18] = WRAPLOW(step1[17] - step1[18]);
  step2[19] = WRAPLOW(step1[16] - step1[19]);
  step2[20] = WRAPLOW(-step1[20] + step1[23]);
  step2[21] = WRAPLOW(-step1[21] + step1[22]);
  step2[22] = WRAPLOW(step1[21] + step1[22]);
  step2[23] = WRAPLOW(step1[20] + step1[23]);

  step2[24] = WRAPLOW(step1[24] + step1[27]);
  step2[25] = WRAPLOW(step1[25] + step1[26]);
  step2[26] = WRAPLOW(step1[25] - step1[26]);
  step2[27] = WRAPLOW(step1[24] - step1[27]);
  step2[28] = WRAPLOW(-step1[28] + step1[31]);
  step2[29] = WRAPLOW(-step1[29] + step1[30]);
  step2[30] = WRAPLOW(step1[29] + step1[30]);
  step2[31] = WRAPLOW(step1[28] + step1[31]);

  // stage 5
  // Entries 0..3: add/subtract butterflies on (0,3) and (1,2).
  step1[0] = WRAPLOW(step2[0] + step2[3]);
  step1[1] = WRAPLOW(step2[1] + step2[2]);
  step1[2] = WRAPLOW(step2[1] - step2[2]);
  step1[3] = WRAPLOW(step2[0] - step2[3]);
  // Entries 4..7: 4 and 7 pass through; (5,6) is a difference/sum scaled by
  // cospi_16_64.
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
  step1[7] = step2[7];

  // Entries 8..15: add/subtract butterflies inside each group of four.
  step1[8] = WRAPLOW(step2[8] + step2[11]);
  step1[9] = WRAPLOW(step2[9] + step2[10]);
  step1[10] = WRAPLOW(step2[9] - step2[10]);
  step1[11] = WRAPLOW(step2[8] - step2[11]);
  step1[12] = WRAPLOW(-step2[12] + step2[15]);
  step1[13] = WRAPLOW(-step2[13] + step2[14]);
  step1[14] = WRAPLOW(step2[13] + step2[14]);
  step1[15] = WRAPLOW(step2[12] + step2[15]);

  // Entries 16..31: weighted pairs (18,29), (19,28), (20,27), (21,26) using
  // cospi_8_64 / cospi_24_64; the remaining entries pass through unchanged.
  step1[16] = step2[16];
  step1[17] = step2[17];
  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
  step1[28] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
  step1[22] = step2[22];
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[25] = step2[25];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // stage 6
  // Entries 0..7: add/subtract butterflies pairing entry k with entry 7 - k.
  step2[0] = WRAPLOW(step1[0] + step1[7]);
  step2[1] = WRAPLOW(step1[1] + step1[6]);
  step2[2] = WRAPLOW(step1[2] + step1[5]);
  step2[3] = WRAPLOW(step1[3] + step1[4]);
  step2[4] = WRAPLOW(step1[3] - step1[4]);
  step2[5] = WRAPLOW(step1[2] - step1[5]);
  step2[6] = WRAPLOW(step1[1] - step1[6]);
  step2[7] = WRAPLOW(step1[0] - step1[7]);
  // Entries 8..15: 8, 9, 14 and 15 pass through; (10,13) and (11,12) are
  // difference/sum pairs scaled by cospi_16_64.
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
  step2[14] = step1[14];
  step2[15] = step1[15];

  // Entries 16..23: add/subtract butterflies pairing entry k with 39 - k.
  step2[16] = WRAPLOW(step1[16] + step1[23]);
  step2[17] = WRAPLOW(step1[17] + step1[22]);
  step2[18] = WRAPLOW(step1[18] + step1[21]);
  step2[19] = WRAPLOW(step1[19] + step1[20]);
  step2[20] = WRAPLOW(step1[19] - step1[20]);
  step2[21] = WRAPLOW(step1[18] - step1[21]);
  step2[22] = WRAPLOW(step1[17] - step1[22]);
  step2[23] = WRAPLOW(step1[16] - step1[23]);

  // Entries 24..31: add/subtract butterflies pairing entry k with 55 - k.
  step2[24] = WRAPLOW(-step1[24] + step1[31]);
  step2[25] = WRAPLOW(-step1[25] + step1[30]);
  step2[26] = WRAPLOW(-step1[26] + step1[29]);
  step2[27] = WRAPLOW(-step1[27] + step1[28]);
  step2[28] = WRAPLOW(step1[27] + step1[28]);
  step2[29] = WRAPLOW(step1[26] + step1[29]);
  step2[30] = WRAPLOW(step1[25] + step1[30]);
  step2[31] = WRAPLOW(step1[24] + step1[31]);

  // stage 7
  // Entries 0..15: add/subtract butterflies pairing entry k with 15 - k.
  step1[0] = WRAPLOW(step2[0] + step2[15]);
  step1[1] = WRAPLOW(step2[1] + step2[14]);
  step1[2] = WRAPLOW(step2[2] + step2[13]);
  step1[3] = WRAPLOW(step2[3] + step2[12]);
  step1[4] = WRAPLOW(step2[4] + step2[11]);
  step1[5] = WRAPLOW(step2[5] + step2[10]);
  step1[6] = WRAPLOW(step2[6] + step2[9]);
  step1[7] = WRAPLOW(step2[7] + step2[8]);
  step1[8] = WRAPLOW(step2[7] - step2[8]);
  step1[9] = WRAPLOW(step2[6] - step2[9]);
  step1[10] = WRAPLOW(step2[5] - step2[10]);
  step1[11] = WRAPLOW(step2[4] - step2[11]);
  step1[12] = WRAPLOW(step2[3] - step2[12]);
  step1[13] = WRAPLOW(step2[2] - step2[13]);
  step1[14] = WRAPLOW(step2[1] - step2[14]);
  step1[15] = WRAPLOW(step2[0] - step2[15]);

  // Entries 16..31: 16..19 and 28..31 pass through; (20,27), (21,26),
  // (22,25) and (23,24) are difference/sum pairs scaled by cospi_16_64.
  step1[16] = step2[16];
  step1[17] = step2[17];
  step1[18] = step2[18];
  step1[19] = step2[19];
  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
  temp2 = (step2[20] + step2[27]) * cospi_16_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
  temp2 = (step2[21] + step2[26]) * cospi_16_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
  temp2 = (step2[22] + step2[25]) * cospi_16_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
  temp2 = (step2[23] + step2[24]) * cospi_16_64;
  step1[23] = WRAPLOW(dct_const_round_shift(temp1));
  step1[24] = WRAPLOW(dct_const_round_shift(temp2));
  step1[28] = step2[28];
  step1[29] = step2[29];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // final stage
  // For k in 0..15: output[k] = step1[k] + step1[31 - k] and
  // output[31 - k] = step1[k] - step1[31 - k].
  output[0] = WRAPLOW(step1[0] + step1[31]);
  output[1] = WRAPLOW(step1[1] + step1[30]);
  output[2] = WRAPLOW(step1[2] + step1[29]);
  output[3] = WRAPLOW(step1[3] + step1[28]);
  output[4] = WRAPLOW(step1[4] + step1[27]);
  output[5] = WRAPLOW(step1[5] + step1[26]);
  output[6] = WRAPLOW(step1[6] + step1[25]);
  output[7] = WRAPLOW(step1[7] + step1[24]);
  output[8] = WRAPLOW(step1[8] + step1[23]);
  output[9] = WRAPLOW(step1[9] + step1[22]);
  output[10] = WRAPLOW(step1[10] + step1[21]);
  output[11] = WRAPLOW(step1[11] + step1[20]);
  output[12] = WRAPLOW(step1[12] + step1[19]);
  output[13] = WRAPLOW(step1[13] + step1[18]);
  output[14] = WRAPLOW(step1[14] + step1[17]);
  output[15] = WRAPLOW(step1[15] + step1[16]);
  output[16] = WRAPLOW(step1[15] - step1[16]);
  output[17] = WRAPLOW(step1[14] - step1[17]);
  output[18] = WRAPLOW(step1[13] - step1[18]);
  output[19] = WRAPLOW(step1[12] - step1[19]);
  output[20] = WRAPLOW(step1[11] - step1[20]);
  output[21] = WRAPLOW(step1[10] - step1[21]);
  output[22] = WRAPLOW(step1[9] - step1[22]);
  output[23] = WRAPLOW(step1[8] - step1[23]);
  output[24] = WRAPLOW(step1[7] - step1[24]);
  output[25] = WRAPLOW(step1[6] - step1[25]);
  output[26] = WRAPLOW(step1[5] - step1[26]);
  output[27] = WRAPLOW(step1[4] - step1[27]);
  output[28] = WRAPLOW(step1[3] - step1[28]);
  output[29] = WRAPLOW(step1[2] - step1[29]);
  output[30] = WRAPLOW(step1[1] - step1[30]);
  output[31] = WRAPLOW(step1[0] - step1[31]);
}
1175 
vpx_idct32x32_1024_add_c(const tran_low_t * input,uint8_t * dest,int stride)1176 void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
1177                               int stride) {
1178   int i, j;
1179   tran_low_t out[32 * 32];
1180   tran_low_t *outptr = out;
1181   tran_low_t temp_in[32], temp_out[32];
1182 
1183   // Rows
1184   for (i = 0; i < 32; ++i) {
1185     int16_t zero_coeff = 0;
1186     for (j = 0; j < 32; ++j) zero_coeff |= input[j];
1187 
1188     if (zero_coeff)
1189       idct32_c(input, outptr);
1190     else
1191       memset(outptr, 0, sizeof(tran_low_t) * 32);
1192     input += 32;
1193     outptr += 32;
1194   }
1195 
1196   // Columns
1197   for (i = 0; i < 32; ++i) {
1198     for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
1199     idct32_c(temp_in, temp_out);
1200     for (j = 0; j < 32; ++j) {
1201       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
1202                                             ROUND_POWER_OF_TWO(temp_out[j], 6));
1203     }
1204   }
1205 }
1206 
// 32x32 inverse transform + add for blocks whose non-zero coefficients all
// lie in the upper-left 16x16 corner. Only the first 16 rows go through the
// row pass; the other rows of the intermediate buffer keep the zeros from
// its initializer. The column pass and the final rounding (6 bits) cover the
// whole block.
void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  tran_low_t rows[32 * 32] = { 0 };
  tran_low_t column[32], residual[32];
  int r, c;

  // Row pass over the 16 rows that can hold non-zero coefficients.
  for (r = 0; r < 16; ++r) idct32_c(input + 32 * r, rows + 32 * r);

  // Column pass: gather, transform, round and accumulate into dest.
  for (c = 0; c < 32; ++c) {
    for (r = 0; r < 32; ++r) column[r] = rows[32 * r + c];
    idct32_c(column, residual);
    for (r = 0; r < 32; ++r) {
      uint8_t *const px = &dest[r * stride + c];
      *px = clip_pixel_add(*px, ROUND_POWER_OF_TWO(residual[r], 6));
    }
  }
}
1232 
// 32x32 inverse transform + add for blocks whose non-zero coefficients all
// lie in the upper-left 8x8 corner. Only the first 8 rows go through the row
// pass; the other rows of the intermediate buffer keep the zeros from its
// initializer. The column pass and the final rounding (6 bits) cover the
// whole block.
void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  tran_low_t rows[32 * 32] = { 0 };
  tran_low_t column[32], residual[32];
  int r, c;

  // Row pass over the 8 rows that can hold non-zero coefficients.
  for (r = 0; r < 8; ++r) idct32_c(input + 32 * r, rows + 32 * r);

  // Column pass: gather, transform, round and accumulate into dest.
  for (c = 0; c < 32; ++c) {
    for (r = 0; r < 32; ++r) column[r] = rows[32 * r + c];
    idct32_c(column, residual);
    for (r = 0; r < 32; ++r) {
      uint8_t *const px = &dest[r * stride + c];
      *px = clip_pixel_add(*px, ROUND_POWER_OF_TWO(residual[r], 6));
    }
  }
}
1258 
vpx_idct32x32_1_add_c(const tran_low_t * input,uint8_t * dest,int stride)1259 void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
1260   int i, j;
1261   tran_high_t a1;
1262   tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
1263 
1264   out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
1265   a1 = ROUND_POWER_OF_TWO(out, 6);
1266 
1267   for (j = 0; j < 32; ++j) {
1268     for (i = 0; i < 32; ++i) dest[i] = clip_pixel_add(dest[i], a1);
1269     dest += stride;
1270   }
1271 }
1272 
1273 #if CONFIG_VP9_HIGHBITDEPTH
1274 
// Magnitude bound for high-bitdepth inverse transform input. The 25-bit
// budget is the sum of:
//   12 signal input bits
// +  7 2D forward transform amplify bits
// +  5 1D inverse transform amplify bits
// +  1 bit for contingency in rounding and quantizing
// detect_invalid_highbd_input() reports any coefficient whose magnitude is
// greater than or equal to this value.
#define HIGHBD_VALID_TXFM_MAGNITUDE_RANGE (1 << 25)
1278 
detect_invalid_highbd_input(const tran_low_t * input,int size)1279 static INLINE int detect_invalid_highbd_input(const tran_low_t *input,
1280                                               int size) {
1281   int i;
1282   for (i = 0; i < size; ++i)
1283     if (abs(input[i]) >= HIGHBD_VALID_TXFM_MAGNITUDE_RANGE) return 1;
1284   return 0;
1285 }
1286 
vpx_highbd_iwht4x4_16_add_c(const tran_low_t * input,uint16_t * dest,int stride,int bd)1287 void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest,
1288                                  int stride, int bd) {
1289   /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
1290      0.5 shifts per pixel. */
1291   int i;
1292   tran_low_t output[16];
1293   tran_high_t a1, b1, c1, d1, e1;
1294   const tran_low_t *ip = input;
1295   tran_low_t *op = output;
1296 
1297   for (i = 0; i < 4; i++) {
1298     a1 = ip[0] >> UNIT_QUANT_SHIFT;
1299     c1 = ip[1] >> UNIT_QUANT_SHIFT;
1300     d1 = ip[2] >> UNIT_QUANT_SHIFT;
1301     b1 = ip[3] >> UNIT_QUANT_SHIFT;
1302     a1 += c1;
1303     d1 -= b1;
1304     e1 = (a1 - d1) >> 1;
1305     b1 = e1 - b1;
1306     c1 = e1 - c1;
1307     a1 -= b1;
1308     d1 += c1;
1309     op[0] = HIGHBD_WRAPLOW(a1, bd);
1310     op[1] = HIGHBD_WRAPLOW(b1, bd);
1311     op[2] = HIGHBD_WRAPLOW(c1, bd);
1312     op[3] = HIGHBD_WRAPLOW(d1, bd);
1313     ip += 4;
1314     op += 4;
1315   }
1316 
1317   ip = output;
1318   for (i = 0; i < 4; i++) {
1319     a1 = ip[4 * 0];
1320     c1 = ip[4 * 1];
1321     d1 = ip[4 * 2];
1322     b1 = ip[4 * 3];
1323     a1 += c1;
1324     d1 -= b1;
1325     e1 = (a1 - d1) >> 1;
1326     b1 = e1 - b1;
1327     c1 = e1 - c1;
1328     a1 -= b1;
1329     d1 += c1;
1330     dest[stride * 0] =
1331         highbd_clip_pixel_add(dest[stride * 0], HIGHBD_WRAPLOW(a1, bd), bd);
1332     dest[stride * 1] =
1333         highbd_clip_pixel_add(dest[stride * 1], HIGHBD_WRAPLOW(b1, bd), bd);
1334     dest[stride * 2] =
1335         highbd_clip_pixel_add(dest[stride * 2], HIGHBD_WRAPLOW(c1, bd), bd);
1336     dest[stride * 3] =
1337         highbd_clip_pixel_add(dest[stride * 3], HIGHBD_WRAPLOW(d1, bd), bd);
1338 
1339     ip++;
1340     dest++;
1341   }
1342 }
1343 
vpx_highbd_iwht4x4_1_add_c(const tran_low_t * in,uint16_t * dest,int stride,int bd)1344 void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint16_t *dest,
1345                                 int stride, int bd) {
1346   int i;
1347   tran_high_t a1, e1;
1348   tran_low_t tmp[4];
1349   const tran_low_t *ip = in;
1350   tran_low_t *op = tmp;
1351   (void)bd;
1352 
1353   a1 = ip[0] >> UNIT_QUANT_SHIFT;
1354   e1 = a1 >> 1;
1355   a1 -= e1;
1356   op[0] = HIGHBD_WRAPLOW(a1, bd);
1357   op[1] = op[2] = op[3] = HIGHBD_WRAPLOW(e1, bd);
1358 
1359   ip = tmp;
1360   for (i = 0; i < 4; i++) {
1361     e1 = ip[0] >> 1;
1362     a1 = ip[0] - e1;
1363     dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd);
1364     dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], e1, bd);
1365     dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], e1, bd);
1366     dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], e1, bd);
1367     ip++;
1368     dest++;
1369   }
1370 }
1371 
// High-bitdepth 4-point 1-D inverse ADST.
//   input  - 4 coefficients.
//   output - 4 results. Zero-filled when detect_invalid_highbd_input()
//            rejects the input or when all four coefficients are zero.
//   bd     - bit depth forwarded to HIGHBD_WRAPLOW.
void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
  const tran_low_t in0 = input[0];
  const tran_low_t in1 = input[1];
  const tran_low_t in2 = input[2];
  const tran_low_t in3 = input[3];
  tran_high_t p0, p1, p2, p3, p4, p5, p6;
  tran_high_t mix, sum_a, sum_b, mid;
  (void)bd;

  if (detect_invalid_highbd_input(input, 4)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, 4 * sizeof(*output));
    return;
  }

  // Nothing to transform: all-zero input yields all-zero output.
  if ((in0 | in1 | in2 | in3) == 0) {
    memset(output, 0, 4 * sizeof(*output));
    return;
  }

  // Individual sinpi-weighted products, each held in a tran_high_t.
  p0 = sinpi_1_9 * in0;
  p1 = sinpi_2_9 * in0;
  p2 = sinpi_3_9 * in1;
  p3 = sinpi_4_9 * in2;
  p4 = sinpi_1_9 * in2;
  p5 = sinpi_2_9 * in3;
  p6 = sinpi_4_9 * in3;
  mix = (tran_high_t)HIGHBD_WRAPLOW(in0 - in2 + in3, bd);

  sum_a = p0 + p3 + p5;
  sum_b = p1 - p4 - p6;
  mid = sinpi_3_9 * mix;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = HIGHBD_WRAPLOW(dct_const_round_shift(sum_a + p2), bd);
  output[1] = HIGHBD_WRAPLOW(dct_const_round_shift(sum_b + p2), bd);
  output[2] = HIGHBD_WRAPLOW(dct_const_round_shift(mid), bd);
  output[3] = HIGHBD_WRAPLOW(dct_const_round_shift(sum_a + sum_b - p2), bd);
}
1416 
// High-bitdepth 4-point 1-D inverse DCT.
//   input  - 4 coefficients.
//   output - 4 results; zero-filled when detect_invalid_highbd_input()
//            rejects the input.
//   bd     - bit depth forwarded to HIGHBD_WRAPLOW.
// Every input element is consumed before the first output element is stored,
// which is what allows vpx_highbd_idct8_c() to call this in place.
void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t even_sum, even_diff, odd_lo, odd_hi;
  tran_high_t prod_a, prod_b;
  (void)bd;

  if (detect_invalid_highbd_input(input, 4)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 4);
    return;
  }

  // Stage 1, inputs 0 and 2: sum and difference scaled by cospi_16_64.
  prod_a = (input[0] + input[2]) * cospi_16_64;
  prod_b = (input[0] - input[2]) * cospi_16_64;
  even_sum = HIGHBD_WRAPLOW(dct_const_round_shift(prod_a), bd);
  even_diff = HIGHBD_WRAPLOW(dct_const_round_shift(prod_b), bd);

  // Stage 1, inputs 1 and 3: weighted pair with cospi_24_64 / cospi_8_64.
  prod_a = input[1] * cospi_24_64 - input[3] * cospi_8_64;
  prod_b = input[1] * cospi_8_64 + input[3] * cospi_24_64;
  odd_lo = HIGHBD_WRAPLOW(dct_const_round_shift(prod_a), bd);
  odd_hi = HIGHBD_WRAPLOW(dct_const_round_shift(prod_b), bd);

  // Stage 2: final add/subtract butterflies.
  output[0] = HIGHBD_WRAPLOW(even_sum + odd_hi, bd);
  output[1] = HIGHBD_WRAPLOW(even_diff + odd_lo, bd);
  output[2] = HIGHBD_WRAPLOW(even_diff - odd_lo, bd);
  output[3] = HIGHBD_WRAPLOW(even_sum - odd_hi, bd);
}
1446 
// High-bitdepth 4x4 inverse DCT + add: vpx_highbd_idct4_c() over the four
// rows, then over the four columns; each result is reduced by 4 bits with
// rounding and accumulated into dest through highbd_clip_pixel_add().
void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest,
                                 int stride, int bd) {
  tran_low_t rows[4 * 4];
  tran_low_t column[4], residual[4];
  int r, c;

  // Row pass.
  for (r = 0; r < 4; ++r) vpx_highbd_idct4_c(input + 4 * r, rows + 4 * r, bd);

  // Column pass.
  for (c = 0; c < 4; ++c) {
    for (r = 0; r < 4; ++r) column[r] = rows[4 * r + c];
    vpx_highbd_idct4_c(column, residual, bd);
    for (r = 0; r < 4; ++r) {
      uint16_t *const px = &dest[r * stride + c];
      *px = highbd_clip_pixel_add(*px, ROUND_POWER_OF_TWO(residual[r], 4), bd);
    }
  }
}
1471 
vpx_highbd_idct4x4_1_add_c(const tran_low_t * input,uint16_t * dest,int stride,int bd)1472 void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest,
1473                                 int stride, int bd) {
1474   int i;
1475   tran_high_t a1;
1476   tran_low_t out =
1477       HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
1478 
1479   out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
1480   a1 = ROUND_POWER_OF_TWO(out, 4);
1481 
1482   for (i = 0; i < 4; i++) {
1483     dest[0] = highbd_clip_pixel_add(dest[0], a1, bd);
1484     dest[1] = highbd_clip_pixel_add(dest[1], a1, bd);
1485     dest[2] = highbd_clip_pixel_add(dest[2], a1, bd);
1486     dest[3] = highbd_clip_pixel_add(dest[3], a1, bd);
1487     dest += stride;
1488   }
1489 }
1490 
// High-bitdepth 8-point 1-D inverse ADST.
//   input  - 8 coefficients.
//   output - 8 results. Zero-filled when detect_invalid_highbd_input()
//            rejects the input or when all eight coefficients are zero.
//   bd     - bit depth forwarded to HIGHBD_WRAPLOW.
// x0..x7 hold the tran_low_t state between stages and s0..s7 hold the
// tran_high_t intermediate sums; both sets are overwritten by each stage, so
// the statement order matters.
void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
  // The inputs are loaded in the permuted order 7, 0, 5, 2, 3, 4, 1, 6.
  tran_low_t x0 = input[7];
  tran_low_t x1 = input[0];
  tran_low_t x2 = input[5];
  tran_low_t x3 = input[2];
  tran_low_t x4 = input[3];
  tran_low_t x5 = input[4];
  tran_low_t x6 = input[1];
  tran_low_t x7 = input[6];
  // NOTE(review): bd is still passed to HIGHBD_WRAPLOW below; presumably this
  // cast only silences an unused-parameter warning in configurations where
  // the macro ignores it - confirm against the macro definition.
  (void)bd;

  if (detect_invalid_highbd_input(input, 8)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 8);
    return;
  }

  // All-zero input: the result is all zero, so skip the arithmetic.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    memset(output, 0, 8 * sizeof(*output));
    return;
  }

  // stage 1
  // Weighted pairs (x0,x1), (x2,x3), (x4,x5), (x6,x7) ...
  s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
  s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
  s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
  s7 = cospi_6_64 * x6 - cospi_26_64 * x7;

  // ... followed by sums (x0..x3) and differences (x4..x7) of s[k] and
  // s[k + 4], rounded back to tran_low_t.
  x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s4), bd);
  x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s5), bd);
  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s6), bd);
  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s7), bd);
  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s4), bd);
  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s5), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s7), bd);

  // stage 2
  // x0..x3 are carried over unscaled; x4..x7 are weighted with
  // cospi_8_64 / cospi_24_64.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
  s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
  s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
  s7 = cospi_8_64 * x6 + cospi_24_64 * x7;

  // Only the weighted half (x4..x7) needs dct_const_round_shift().
  x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
  x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
  x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
  x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);

  // stage 3
  // Sum/difference of (x2,x3) and (x6,x7), scaled by cospi_16_64; x0, x1, x4
  // and x5 are left as they are.
  s2 = cospi_16_64 * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (x6 - x7);

  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);

  // Output permutation; the odd-indexed outputs are negated.
  output[0] = HIGHBD_WRAPLOW(x0, bd);
  output[1] = HIGHBD_WRAPLOW(-x4, bd);
  output[2] = HIGHBD_WRAPLOW(x6, bd);
  output[3] = HIGHBD_WRAPLOW(-x2, bd);
  output[4] = HIGHBD_WRAPLOW(x3, bd);
  output[5] = HIGHBD_WRAPLOW(-x7, bd);
  output[6] = HIGHBD_WRAPLOW(x5, bd);
  output[7] = HIGHBD_WRAPLOW(-x1, bd);
}
1574 
// High-bitdepth 8-point 1-D inverse DCT.
//   input  - 8 coefficients.
//   output - 8 results; zero-filled when detect_invalid_highbd_input()
//            rejects the input.
//   bd     - bit depth forwarded to HIGHBD_WRAPLOW and vpx_highbd_idct4_c().
// The even-indexed coefficients are handed to the 4-point transform; the
// odd-indexed ones go through three butterfly stages of their own before
// both halves are merged.
void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t even[4];
  tran_low_t odd[4], mixed[4];
  tran_high_t prod_a, prod_b;
  int k;

  if (detect_invalid_highbd_input(input, 8)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 8);
    return;
  }

  // Stage 1, even half: gather coefficients 0, 2, 4 and 6.
  even[0] = input[0];
  even[1] = input[2];
  even[2] = input[4];
  even[3] = input[6];

  // Stage 1, odd half: weighted pairs (1,7) and (5,3).
  prod_a = input[1] * cospi_28_64 - input[7] * cospi_4_64;
  prod_b = input[1] * cospi_4_64 + input[7] * cospi_28_64;
  odd[0] = HIGHBD_WRAPLOW(dct_const_round_shift(prod_a), bd);
  odd[3] = HIGHBD_WRAPLOW(dct_const_round_shift(prod_b), bd);
  prod_a = input[5] * cospi_12_64 - input[3] * cospi_20_64;
  prod_b = input[5] * cospi_20_64 + input[3] * cospi_12_64;
  odd[1] = HIGHBD_WRAPLOW(dct_const_round_shift(prod_a), bd);
  odd[2] = HIGHBD_WRAPLOW(dct_const_round_shift(prod_b), bd);

  // Stages 2 and 3, even half: 4-point transform performed in place.
  vpx_highbd_idct4_c(even, even, bd);

  // Stage 2, odd half: add/subtract butterflies on adjacent pairs.
  mixed[0] = HIGHBD_WRAPLOW(odd[0] + odd[1], bd);
  mixed[1] = HIGHBD_WRAPLOW(odd[0] - odd[1], bd);
  mixed[2] = HIGHBD_WRAPLOW(-odd[2] + odd[3], bd);
  mixed[3] = HIGHBD_WRAPLOW(odd[2] + odd[3], bd);

  // Stage 3, odd half: the outer entries pass through, the inner pair is a
  // difference/sum scaled by cospi_16_64.
  prod_a = (mixed[2] - mixed[1]) * cospi_16_64;
  prod_b = (mixed[1] + mixed[2]) * cospi_16_64;
  odd[0] = mixed[0];
  odd[1] = HIGHBD_WRAPLOW(dct_const_round_shift(prod_a), bd);
  odd[2] = HIGHBD_WRAPLOW(dct_const_round_shift(prod_b), bd);
  odd[3] = mixed[3];

  // Stage 4: merge the halves. The first four outputs are sums, the last four
  // are the mirrored differences.
  for (k = 0; k < 4; ++k) {
    output[k] = HIGHBD_WRAPLOW(even[k] + odd[3 - k], bd);
  }
  for (k = 0; k < 4; ++k) {
    output[4 + k] = HIGHBD_WRAPLOW(even[3 - k] - odd[k], bd);
  }
}
1628 
// High-bitdepth full 8x8 inverse DCT + add: vpx_highbd_idct8_c() over all
// eight rows, then over all eight columns; each result is reduced by 5 bits
// with rounding and accumulated into dest through highbd_clip_pixel_add().
void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest,
                                 int stride, int bd) {
  tran_low_t rows[8 * 8];
  tran_low_t column[8], residual[8];
  int r, c;

  // Row pass.
  for (r = 0; r < 8; ++r) vpx_highbd_idct8_c(input + 8 * r, rows + 8 * r, bd);

  // Column pass.
  for (c = 0; c < 8; ++c) {
    for (r = 0; r < 8; ++r) column[r] = rows[8 * r + c];
    vpx_highbd_idct8_c(column, residual, bd);
    for (r = 0; r < 8; ++r) {
      uint16_t *const px = &dest[r * stride + c];
      *px = highbd_clip_pixel_add(*px, ROUND_POWER_OF_TWO(residual[r], 5), bd);
    }
  }
}
1653 
// Reduced 8x8 inverse transform for blocks whose non-zero coefficients are
// confined to the first 4 rows. Only those rows go through the row pass; the
// remaining rows of the intermediate buffer keep their zero initialisation.
// The column pass then covers all 8 columns, rounds by 5 bits and accumulates
// into `dest` through highbd_clip_pixel_add.
//   input  - 8x8 coefficient block, rows stored back to back
//   dest   - destination samples, updated in place
//   stride - distance in samples between consecutive rows of dest
//   bd     - bit depth, forwarded to the 1-D transform and the pixel add
void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest,
                                 int stride, int bd) {
  tran_low_t row_pass[8 * 8] = { 0 };
  tran_low_t col_in[8], col_out[8];
  int r, c;

  // Pass 1: only rows 0..3 carry non-zero coefficients.
  for (r = 0; r < 4; ++r) {
    vpx_highbd_idct8_c(input + 8 * r, row_pass + 8 * r, bd);
  }

  // Pass 2: full column transform over the (partly zero) intermediate block.
  for (c = 0; c < 8; ++c) {
    for (r = 0; r < 8; ++r) col_in[r] = row_pass[8 * r + c];
    vpx_highbd_idct8_c(col_in, col_out, bd);
    for (r = 0; r < 8; ++r) {
      const int pos = r * stride + c;
      dest[pos] = highbd_clip_pixel_add(
          dest[pos], ROUND_POWER_OF_TWO(col_out[r], 5), bd);
    }
  }
}
1679 
vpx_highbd_idct8x8_1_add_c(const tran_low_t * input,uint16_t * dest,int stride,int bd)1680 void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest,
1681                                 int stride, int bd) {
1682   int i, j;
1683   tran_high_t a1;
1684   tran_low_t out =
1685       HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
1686 
1687   out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
1688   a1 = ROUND_POWER_OF_TWO(out, 5);
1689   for (j = 0; j < 8; ++j) {
1690     for (i = 0; i < 8; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
1691     dest += stride;
1692   }
1693 }
1694 
// 1-D 16-point high-bitdepth inverse ADST (per the function name), computed
// as a four-stage butterfly.
//   input  - 16 coefficients
//   output - 16 results; set to all zeros when the input is rejected by
//            detect_invalid_highbd_input() or when every coefficient is zero
//   bd     - bit depth, passed to HIGHBD_WRAPLOW
// The x* variables hold the current stage's values and are overwritten at the
// end of every stage; the s* variables hold the wider intermediate products.
// NOTE(review): the cospi_*_64 constants and dct_const_round_shift() are
// defined elsewhere; presumably fixed-point cosines and the matching rounding
// shift - confirm against their definitions.
void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;
  // Input permutation: even-numbered x take input[15], [13], ..., [1]
  // (descending odd indices); odd-numbered x take input[0], [2], ..., [14]
  // (ascending even indices).
  tran_low_t x0 = input[15];
  tran_low_t x1 = input[0];
  tran_low_t x2 = input[13];
  tran_low_t x3 = input[2];
  tran_low_t x4 = input[11];
  tran_low_t x5 = input[4];
  tran_low_t x6 = input[9];
  tran_low_t x7 = input[6];
  tran_low_t x8 = input[7];
  tran_low_t x9 = input[8];
  tran_low_t x10 = input[5];
  tran_low_t x11 = input[10];
  tran_low_t x12 = input[3];
  tran_low_t x13 = input[12];
  tran_low_t x14 = input[1];
  tran_low_t x15 = input[14];
  // bd is only referenced below through HIGHBD_WRAPLOW; presumably this cast
  // keeps builds warning-free when that macro discards it - confirm.
  (void)bd;

  // Reject out-of-range input: assert in range-checking builds, otherwise
  // produce an all-zero output.
  if (detect_invalid_highbd_input(input, 16)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 16);
    return;
  }

  // Shortcut: an all-zero input yields an all-zero output.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
        x13 | x14 | x15)) {
    memset(output, 0, 16 * sizeof(*output));
    return;
  }

  // stage 1
  // Each adjacent pair (x[2k], x[2k+1]) is combined with a pair of cospi
  // constants into (s[2k], s[2k+1]).
  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;

  // Sums (x0..x7) and differences (x8..x15) of s[k] and s[k + 8], each
  // round-shifted and wrapped.
  x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s8), bd);
  x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s9), bd);
  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s10), bd);
  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s11), bd);
  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s12), bd);
  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s13), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 + s14), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 + s15), bd);
  x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s8), bd);
  x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s9), bd);
  x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s10), bd);
  x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s11), bd);
  x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s12), bd);
  x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s13), bd);
  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 - s14), bd);
  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 - s15), bd);

  // stage 2
  // The lower half (s0..s7) is copied through unscaled; only the upper half
  // (s8..s15) is multiplied by cospi constants.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;

  // Sums/differences of s[k] and s[k + 4] within each half. The lower half
  // was not multiplied in this stage, so it gets no round shift.
  x0 = HIGHBD_WRAPLOW(s0 + s4, bd);
  x1 = HIGHBD_WRAPLOW(s1 + s5, bd);
  x2 = HIGHBD_WRAPLOW(s2 + s6, bd);
  x3 = HIGHBD_WRAPLOW(s3 + s7, bd);
  x4 = HIGHBD_WRAPLOW(s0 - s4, bd);
  x5 = HIGHBD_WRAPLOW(s1 - s5, bd);
  x6 = HIGHBD_WRAPLOW(s2 - s6, bd);
  x7 = HIGHBD_WRAPLOW(s3 - s7, bd);
  x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 + s12), bd);
  x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 + s13), bd);
  x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 + s14), bd);
  x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 + s15), bd);
  x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 - s12), bd);
  x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 - s13), bd);
  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 - s14), bd);
  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 - s15), bd);

  // stage 3
  // Within each group of eight, the first four values are copied through and
  // the last four are multiplied by cospi_8_64 / cospi_24_64.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;

  // Sums/differences of s[k] and s[k + 2] within each group of four; only the
  // groups multiplied above are round-shifted.
  x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
  x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
  x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
  x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);
  x8 = HIGHBD_WRAPLOW(s8 + s10, bd);
  x9 = HIGHBD_WRAPLOW(s9 + s11, bd);
  x10 = HIGHBD_WRAPLOW(s8 - s10, bd);
  x11 = HIGHBD_WRAPLOW(s9 - s11, bd);
  x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 + s14), bd);
  x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 + s15), bd);
  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 - s14), bd);
  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 - s15), bd);

  // stage 4
  // Only the pairs (x2,x3), (x6,x7), (x10,x11) and (x14,x15) are touched:
  // each becomes a cospi_16_64-scaled sum/difference of the pair. Note the
  // negated constant on s2 and s14 and the negated first operand on s7, s11.
  s2 = (-cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (-x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (-x10 + x11);
  s14 = (-cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);

  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);
  x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10), bd);
  x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11), bd);
  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s14), bd);
  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s15), bd);

  // Output permutation; x8, x4, x13 and x1 are written with their sign
  // flipped.
  output[0] = HIGHBD_WRAPLOW(x0, bd);
  output[1] = HIGHBD_WRAPLOW(-x8, bd);
  output[2] = HIGHBD_WRAPLOW(x12, bd);
  output[3] = HIGHBD_WRAPLOW(-x4, bd);
  output[4] = HIGHBD_WRAPLOW(x6, bd);
  output[5] = HIGHBD_WRAPLOW(x14, bd);
  output[6] = HIGHBD_WRAPLOW(x10, bd);
  output[7] = HIGHBD_WRAPLOW(x2, bd);
  output[8] = HIGHBD_WRAPLOW(x3, bd);
  output[9] = HIGHBD_WRAPLOW(x11, bd);
  output[10] = HIGHBD_WRAPLOW(x15, bd);
  output[11] = HIGHBD_WRAPLOW(x7, bd);
  output[12] = HIGHBD_WRAPLOW(x5, bd);
  output[13] = HIGHBD_WRAPLOW(-x13, bd);
  output[14] = HIGHBD_WRAPLOW(x9, bd);
  output[15] = HIGHBD_WRAPLOW(-x1, bd);
}
1871 
// 1-D 16-point high-bitdepth inverse DCT (per the function name), computed as
// a seven-stage butterfly that ping-pongs between the step1 and step2
// scratch arrays.
//   input  - 16 coefficients
//   output - 16 results; set to all zeros when the input is rejected by
//            detect_invalid_highbd_input()
//   bd     - bit depth, passed to HIGHBD_WRAPLOW
// NOTE(review): the cospi_*_64 constants and dct_const_round_shift() are
// defined elsewhere; presumably fixed-point cosines and the matching rounding
// shift - confirm against their definitions.
void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step1[16], step2[16];
  tran_high_t temp1, temp2;
  // bd is only referenced below through HIGHBD_WRAPLOW; presumably this cast
  // keeps builds warning-free when that macro discards it - confirm.
  (void)bd;

  // Reject out-of-range input: assert in range-checking builds, otherwise
  // produce an all-zero output.
  if (detect_invalid_highbd_input(input, 16)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 16);
    return;
  }

  // stage 1
  // Load the inputs in permuted order:
  // 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15.
  // The indices are spelled as n / 2 with even n.
  step1[0] = input[0 / 2];
  step1[1] = input[16 / 2];
  step1[2] = input[8 / 2];
  step1[3] = input[24 / 2];
  step1[4] = input[4 / 2];
  step1[5] = input[20 / 2];
  step1[6] = input[12 / 2];
  step1[7] = input[28 / 2];
  step1[8] = input[2 / 2];
  step1[9] = input[18 / 2];
  step1[10] = input[10 / 2];
  step1[11] = input[26 / 2];
  step1[12] = input[6 / 2];
  step1[13] = input[22 / 2];
  step1[14] = input[14 / 2];
  step1[15] = input[30 / 2];

  // stage 2
  // Elements 0..7 pass through; elements 8..15 are combined in mirrored pairs
  // (8,15), (9,14), (10,13), (11,12) with cospi constants.
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  // stage 3
  // Elements 0..3 pass through; pairs (4,7) and (5,6) are combined with cospi
  // constants; elements 8..15 are combined by plain add/subtract of
  // neighbours.
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
  step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
  step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
  step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
  step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
  step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
  step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
  step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);

  // stage 4
  // Pairs (0,1) and (2,3) are scaled by cospi constants; elements 4..7 are
  // combined by add/subtract; in the upper half, pairs (9,14) and (10,13) are
  // scaled while 8, 11, 12 and 15 pass through.
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5
  // Elements 0..3 are combined by add/subtract; pair (5,6) is scaled by
  // cospi_16_64 while 4 and 7 pass through; elements 8..15 are combined by
  // add/subtract.
  step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
  step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
  step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
  step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step1[7] = step2[7];

  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
  step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
  step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
  step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
  step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
  step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
  step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
  step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);

  // stage 6
  // Elements 0..7 are combined in mirrored pairs (k, 7 - k); in the upper
  // half, pairs (10,13) and (11,12) are scaled by cospi_16_64 while 8, 9, 14
  // and 15 pass through.
  step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
  step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
  step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
  step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
  step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
  step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
  step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
  step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
  // Final mirrored add/subtract: output[k] = step2[k] + step2[15 - k] for
  // k = 0..7, and output[15 - k] = step2[k] - step2[15 - k].
  output[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
  output[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
  output[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
  output[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
  output[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
  output[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
  output[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
  output[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
  output[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
  output[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
  output[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
  output[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
  output[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
  output[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
  output[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
  output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
}
2045 
// 2-D 16x16 inverse transform built from two passes of vpx_highbd_idct16_c.
// All 16 rows of `input` (256 coefficients, 16 per row) are transformed, then
// every column of the intermediate result is transformed, rounded by 6 bits
// and accumulated into `dest` through highbd_clip_pixel_add.
//   input  - 16x16 coefficient block, rows stored back to back
//   dest   - destination samples, updated in place
//   stride - distance in samples between consecutive rows of dest
//   bd     - bit depth, forwarded to the 1-D transform and the pixel add
void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest,
                                    int stride, int bd) {
  tran_low_t row_pass[16 * 16];
  tran_low_t col_in[16], col_out[16];
  int r, c;

  // Pass 1: 1-D transform of each input row into the intermediate buffer.
  for (r = 0; r < 16; ++r) {
    vpx_highbd_idct16_c(input + 16 * r, row_pass + 16 * r, bd);
  }

  // Pass 2: gather each column, transform it, then round and add to dest.
  for (c = 0; c < 16; ++c) {
    for (r = 0; r < 16; ++r) col_in[r] = row_pass[16 * r + c];
    vpx_highbd_idct16_c(col_in, col_out, bd);
    for (r = 0; r < 16; ++r) {
      const int pos = r * stride + c;
      dest[pos] = highbd_clip_pixel_add(
          dest[pos], ROUND_POWER_OF_TWO(col_out[r], 6), bd);
    }
  }
}
2070 
// Reduced 16x16 inverse transform for blocks whose non-zero coefficients all
// lie in the upper-left 8x8 area. Only the first 8 rows go through the row
// pass; the remaining rows of the intermediate buffer keep their zero
// initialisation. The column pass then covers all 16 columns, rounds by
// 6 bits and accumulates into `dest` through highbd_clip_pixel_add.
//   input  - 16x16 coefficient block, rows stored back to back
//   dest   - destination samples, updated in place
//   stride - distance in samples between consecutive rows of dest
//   bd     - bit depth, forwarded to the 1-D transform and the pixel add
void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest,
                                   int stride, int bd) {
  tran_low_t row_pass[16 * 16] = { 0 };
  tran_low_t col_in[16], col_out[16];
  int r, c;

  // Pass 1: rows 8..15 hold no coefficients, so only rows 0..7 are computed.
  for (r = 0; r < 8; ++r) {
    vpx_highbd_idct16_c(input + 16 * r, row_pass + 16 * r, bd);
  }

  // Pass 2: full column transform over the (partly zero) intermediate block.
  for (c = 0; c < 16; ++c) {
    for (r = 0; r < 16; ++r) col_in[r] = row_pass[16 * r + c];
    vpx_highbd_idct16_c(col_in, col_out, bd);
    for (r = 0; r < 16; ++r) {
      const int pos = r * stride + c;
      dest[pos] = highbd_clip_pixel_add(
          dest[pos], ROUND_POWER_OF_TWO(col_out[r], 6), bd);
    }
  }
}
2098 
// Reduced 16x16 inverse transform for blocks whose non-zero coefficients all
// lie in the upper-left 4x4 area. Only the first 4 rows go through the row
// pass; the remaining rows of the intermediate buffer keep their zero
// initialisation. The column pass then covers all 16 columns, rounds by
// 6 bits and accumulates into `dest` through highbd_clip_pixel_add.
//   input  - 16x16 coefficient block, rows stored back to back
//   dest   - destination samples, updated in place
//   stride - distance in samples between consecutive rows of dest
//   bd     - bit depth, forwarded to the 1-D transform and the pixel add
void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest,
                                   int stride, int bd) {
  tran_low_t row_pass[16 * 16] = { 0 };
  tran_low_t col_in[16], col_out[16];
  int r, c;

  // Pass 1: rows 4..15 hold no coefficients, so only rows 0..3 are computed.
  for (r = 0; r < 4; ++r) {
    vpx_highbd_idct16_c(input + 16 * r, row_pass + 16 * r, bd);
  }

  // Pass 2: full column transform over the (mostly zero) intermediate block.
  for (c = 0; c < 16; ++c) {
    for (r = 0; r < 16; ++r) col_in[r] = row_pass[16 * r + c];
    vpx_highbd_idct16_c(col_in, col_out, bd);
    for (r = 0; r < 16; ++r) {
      const int pos = r * stride + c;
      dest[pos] = highbd_clip_pixel_add(
          dest[pos], ROUND_POWER_OF_TWO(col_out[r], 6), bd);
    }
  }
}
2124 
vpx_highbd_idct16x16_1_add_c(const tran_low_t * input,uint16_t * dest,int stride,int bd)2125 void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest,
2126                                   int stride, int bd) {
2127   int i, j;
2128   tran_high_t a1;
2129   tran_low_t out =
2130       HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
2131 
2132   out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
2133   a1 = ROUND_POWER_OF_TWO(out, 6);
2134   for (j = 0; j < 16; ++j) {
2135     for (i = 0; i < 16; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
2136     dest += stride;
2137   }
2138 }
2139 
highbd_idct32_c(const tran_low_t * input,tran_low_t * output,int bd)2140 static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
2141                             int bd) {
2142   tran_low_t step1[32], step2[32];
2143   tran_high_t temp1, temp2;
2144   (void)bd;
2145 
2146   if (detect_invalid_highbd_input(input, 32)) {
2147 #if CONFIG_COEFFICIENT_RANGE_CHECKING
2148     assert(0 && "invalid highbd txfm input");
2149 #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
2150     memset(output, 0, sizeof(*output) * 32);
2151     return;
2152   }
2153 
2154   // stage 1
2155   step1[0] = input[0];
2156   step1[1] = input[16];
2157   step1[2] = input[8];
2158   step1[3] = input[24];
2159   step1[4] = input[4];
2160   step1[5] = input[20];
2161   step1[6] = input[12];
2162   step1[7] = input[28];
2163   step1[8] = input[2];
2164   step1[9] = input[18];
2165   step1[10] = input[10];
2166   step1[11] = input[26];
2167   step1[12] = input[6];
2168   step1[13] = input[22];
2169   step1[14] = input[14];
2170   step1[15] = input[30];
2171 
2172   temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
2173   temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
2174   step1[16] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2175   step1[31] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2176 
2177   temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
2178   temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
2179   step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2180   step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2181 
2182   temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
2183   temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
2184   step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2185   step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2186 
2187   temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
2188   temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
2189   step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2190   step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2191 
2192   temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
2193   temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
2194   step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2195   step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2196 
2197   temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
2198   temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
2199   step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2200   step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2201 
2202   temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
2203   temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
2204   step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2205   step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2206 
2207   temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
2208   temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
2209   step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2210   step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2211 
2212   // stage 2
2213   step2[0] = step1[0];
2214   step2[1] = step1[1];
2215   step2[2] = step1[2];
2216   step2[3] = step1[3];
2217   step2[4] = step1[4];
2218   step2[5] = step1[5];
2219   step2[6] = step1[6];
2220   step2[7] = step1[7];
2221 
2222   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
2223   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
2224   step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2225   step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2226 
2227   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
2228   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
2229   step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2230   step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2231 
2232   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
2233   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
2234   step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2235   step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2236 
2237   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
2238   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
2239   step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2240   step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2241 
2242   step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd);
2243   step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd);
2244   step2[18] = HIGHBD_WRAPLOW(-step1[18] + step1[19], bd);
2245   step2[19] = HIGHBD_WRAPLOW(step1[18] + step1[19], bd);
2246   step2[20] = HIGHBD_WRAPLOW(step1[20] + step1[21], bd);
2247   step2[21] = HIGHBD_WRAPLOW(step1[20] - step1[21], bd);
2248   step2[22] = HIGHBD_WRAPLOW(-step1[22] + step1[23], bd);
2249   step2[23] = HIGHBD_WRAPLOW(step1[22] + step1[23], bd);
2250   step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[25], bd);
2251   step2[25] = HIGHBD_WRAPLOW(step1[24] - step1[25], bd);
2252   step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[27], bd);
2253   step2[27] = HIGHBD_WRAPLOW(step1[26] + step1[27], bd);
2254   step2[28] = HIGHBD_WRAPLOW(step1[28] + step1[29], bd);
2255   step2[29] = HIGHBD_WRAPLOW(step1[28] - step1[29], bd);
2256   step2[30] = HIGHBD_WRAPLOW(-step1[30] + step1[31], bd);
2257   step2[31] = HIGHBD_WRAPLOW(step1[30] + step1[31], bd);
2258 
2259   // stage 3
2260   step1[0] = step2[0];
2261   step1[1] = step2[1];
2262   step1[2] = step2[2];
2263   step1[3] = step2[3];
2264 
2265   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
2266   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
2267   step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2268   step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2269   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
2270   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
2271   step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2272   step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2273 
2274   step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
2275   step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
2276   step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
2277   step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
2278   step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
2279   step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
2280   step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
2281   step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
2282 
2283   step1[16] = step2[16];
2284   step1[31] = step2[31];
2285   temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
2286   temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
2287   step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2288   step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2289   temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
2290   temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
2291   step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2292   step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2293   step1[19] = step2[19];
2294   step1[20] = step2[20];
2295   temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
2296   temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
2297   step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2298   step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2299   temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
2300   temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
2301   step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2302   step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2303   step1[23] = step2[23];
2304   step1[24] = step2[24];
2305   step1[27] = step2[27];
2306   step1[28] = step2[28];
2307 
2308   // stage 4
2309   temp1 = (step1[0] + step1[1]) * cospi_16_64;
2310   temp2 = (step1[0] - step1[1]) * cospi_16_64;
2311   step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2312   step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2313   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
2314   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
2315   step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2316   step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2317   step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
2318   step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
2319   step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
2320   step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
2321 
2322   step2[8] = step1[8];
2323   step2[15] = step1[15];
2324   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
2325   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
2326   step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2327   step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2328   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
2329   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
2330   step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2331   step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2332   step2[11] = step1[11];
2333   step2[12] = step1[12];
2334 
2335   step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[19], bd);
2336   step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[18], bd);
2337   step2[18] = HIGHBD_WRAPLOW(step1[17] - step1[18], bd);
2338   step2[19] = HIGHBD_WRAPLOW(step1[16] - step1[19], bd);
2339   step2[20] = HIGHBD_WRAPLOW(-step1[20] + step1[23], bd);
2340   step2[21] = HIGHBD_WRAPLOW(-step1[21] + step1[22], bd);
2341   step2[22] = HIGHBD_WRAPLOW(step1[21] + step1[22], bd);
2342   step2[23] = HIGHBD_WRAPLOW(step1[20] + step1[23], bd);
2343 
2344   step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[27], bd);
2345   step2[25] = HIGHBD_WRAPLOW(step1[25] + step1[26], bd);
2346   step2[26] = HIGHBD_WRAPLOW(step1[25] - step1[26], bd);
2347   step2[27] = HIGHBD_WRAPLOW(step1[24] - step1[27], bd);
2348   step2[28] = HIGHBD_WRAPLOW(-step1[28] + step1[31], bd);
2349   step2[29] = HIGHBD_WRAPLOW(-step1[29] + step1[30], bd);
2350   step2[30] = HIGHBD_WRAPLOW(step1[29] + step1[30], bd);
2351   step2[31] = HIGHBD_WRAPLOW(step1[28] + step1[31], bd);
2352 
2353   // stage 5
2354   step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
2355   step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
2356   step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
2357   step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
2358   step1[4] = step2[4];
2359   temp1 = (step2[6] - step2[5]) * cospi_16_64;
2360   temp2 = (step2[5] + step2[6]) * cospi_16_64;
2361   step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2362   step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2363   step1[7] = step2[7];
2364 
2365   step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
2366   step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
2367   step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
2368   step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
2369   step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
2370   step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
2371   step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
2372   step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
2373 
2374   step1[16] = step2[16];
2375   step1[17] = step2[17];
2376   temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
2377   temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
2378   step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2379   step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2380   temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
2381   temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
2382   step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2383   step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2384   temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
2385   temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
2386   step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2387   step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2388   temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
2389   temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
2390   step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2391   step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2392   step1[22] = step2[22];
2393   step1[23] = step2[23];
2394   step1[24] = step2[24];
2395   step1[25] = step2[25];
2396   step1[30] = step2[30];
2397   step1[31] = step2[31];
2398 
2399   // stage 6
2400   step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
2401   step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
2402   step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
2403   step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
2404   step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
2405   step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
2406   step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
2407   step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
2408   step2[8] = step1[8];
2409   step2[9] = step1[9];
2410   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
2411   temp2 = (step1[10] + step1[13]) * cospi_16_64;
2412   step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2413   step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2414   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
2415   temp2 = (step1[11] + step1[12]) * cospi_16_64;
2416   step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2417   step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2418   step2[14] = step1[14];
2419   step2[15] = step1[15];
2420 
2421   step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[23], bd);
2422   step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[22], bd);
2423   step2[18] = HIGHBD_WRAPLOW(step1[18] + step1[21], bd);
2424   step2[19] = HIGHBD_WRAPLOW(step1[19] + step1[20], bd);
2425   step2[20] = HIGHBD_WRAPLOW(step1[19] - step1[20], bd);
2426   step2[21] = HIGHBD_WRAPLOW(step1[18] - step1[21], bd);
2427   step2[22] = HIGHBD_WRAPLOW(step1[17] - step1[22], bd);
2428   step2[23] = HIGHBD_WRAPLOW(step1[16] - step1[23], bd);
2429 
2430   step2[24] = HIGHBD_WRAPLOW(-step1[24] + step1[31], bd);
2431   step2[25] = HIGHBD_WRAPLOW(-step1[25] + step1[30], bd);
2432   step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[29], bd);
2433   step2[27] = HIGHBD_WRAPLOW(-step1[27] + step1[28], bd);
2434   step2[28] = HIGHBD_WRAPLOW(step1[27] + step1[28], bd);
2435   step2[29] = HIGHBD_WRAPLOW(step1[26] + step1[29], bd);
2436   step2[30] = HIGHBD_WRAPLOW(step1[25] + step1[30], bd);
2437   step2[31] = HIGHBD_WRAPLOW(step1[24] + step1[31], bd);
2438 
2439   // stage 7
2440   step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
2441   step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
2442   step1[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
2443   step1[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
2444   step1[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
2445   step1[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
2446   step1[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
2447   step1[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
2448   step1[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
2449   step1[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
2450   step1[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
2451   step1[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
2452   step1[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
2453   step1[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
2454   step1[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
2455   step1[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
2456 
2457   step1[16] = step2[16];
2458   step1[17] = step2[17];
2459   step1[18] = step2[18];
2460   step1[19] = step2[19];
2461   temp1 = (-step2[20] + step2[27]) * cospi_16_64;
2462   temp2 = (step2[20] + step2[27]) * cospi_16_64;
2463   step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2464   step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2465   temp1 = (-step2[21] + step2[26]) * cospi_16_64;
2466   temp2 = (step2[21] + step2[26]) * cospi_16_64;
2467   step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2468   step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2469   temp1 = (-step2[22] + step2[25]) * cospi_16_64;
2470   temp2 = (step2[22] + step2[25]) * cospi_16_64;
2471   step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2472   step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2473   temp1 = (-step2[23] + step2[24]) * cospi_16_64;
2474   temp2 = (step2[23] + step2[24]) * cospi_16_64;
2475   step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2476   step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2477   step1[28] = step2[28];
2478   step1[29] = step2[29];
2479   step1[30] = step2[30];
2480   step1[31] = step2[31];
2481 
2482   // final stage
2483   output[0] = HIGHBD_WRAPLOW(step1[0] + step1[31], bd);
2484   output[1] = HIGHBD_WRAPLOW(step1[1] + step1[30], bd);
2485   output[2] = HIGHBD_WRAPLOW(step1[2] + step1[29], bd);
2486   output[3] = HIGHBD_WRAPLOW(step1[3] + step1[28], bd);
2487   output[4] = HIGHBD_WRAPLOW(step1[4] + step1[27], bd);
2488   output[5] = HIGHBD_WRAPLOW(step1[5] + step1[26], bd);
2489   output[6] = HIGHBD_WRAPLOW(step1[6] + step1[25], bd);
2490   output[7] = HIGHBD_WRAPLOW(step1[7] + step1[24], bd);
2491   output[8] = HIGHBD_WRAPLOW(step1[8] + step1[23], bd);
2492   output[9] = HIGHBD_WRAPLOW(step1[9] + step1[22], bd);
2493   output[10] = HIGHBD_WRAPLOW(step1[10] + step1[21], bd);
2494   output[11] = HIGHBD_WRAPLOW(step1[11] + step1[20], bd);
2495   output[12] = HIGHBD_WRAPLOW(step1[12] + step1[19], bd);
2496   output[13] = HIGHBD_WRAPLOW(step1[13] + step1[18], bd);
2497   output[14] = HIGHBD_WRAPLOW(step1[14] + step1[17], bd);
2498   output[15] = HIGHBD_WRAPLOW(step1[15] + step1[16], bd);
2499   output[16] = HIGHBD_WRAPLOW(step1[15] - step1[16], bd);
2500   output[17] = HIGHBD_WRAPLOW(step1[14] - step1[17], bd);
2501   output[18] = HIGHBD_WRAPLOW(step1[13] - step1[18], bd);
2502   output[19] = HIGHBD_WRAPLOW(step1[12] - step1[19], bd);
2503   output[20] = HIGHBD_WRAPLOW(step1[11] - step1[20], bd);
2504   output[21] = HIGHBD_WRAPLOW(step1[10] - step1[21], bd);
2505   output[22] = HIGHBD_WRAPLOW(step1[9] - step1[22], bd);
2506   output[23] = HIGHBD_WRAPLOW(step1[8] - step1[23], bd);
2507   output[24] = HIGHBD_WRAPLOW(step1[7] - step1[24], bd);
2508   output[25] = HIGHBD_WRAPLOW(step1[6] - step1[25], bd);
2509   output[26] = HIGHBD_WRAPLOW(step1[5] - step1[26], bd);
2510   output[27] = HIGHBD_WRAPLOW(step1[4] - step1[27], bd);
2511   output[28] = HIGHBD_WRAPLOW(step1[3] - step1[28], bd);
2512   output[29] = HIGHBD_WRAPLOW(step1[2] - step1[29], bd);
2513   output[30] = HIGHBD_WRAPLOW(step1[1] - step1[30], bd);
2514   output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd);
2515 }
2516 
/* Complete 32x32 high-bitdepth inverse transform followed by reconstruction.
 *
 * input  - 32x32 coefficients, read row by row (32 values per row).
 * dest   - samples updated in place; row r, column c is dest[r * stride + c].
 * stride - distance between consecutive rows of dest, in uint16_t elements.
 * bd     - forwarded unchanged to highbd_idct32_c() and
 *          highbd_clip_pixel_add().
 */
void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest,
                                     int stride, int bd) {
  tran_low_t row_pass[32 * 32];
  tran_low_t col_in[32], col_out[32];
  int r, c;

  // Row pass. Every row of row_pass is written exactly once, so the buffer
  // needs no initializer.
  for (r = 0; r < 32; ++r) {
    const tran_low_t *const coeff_row = input + 32 * r;
    tran_low_t *const result_row = row_pass + 32 * r;
    tran_low_t nonzero_bits = 0;

    // OR all coefficients together: the result is 0 only for an all-zero row.
    for (c = 0; c < 32; ++c) nonzero_bits |= coeff_row[c];

    if (nonzero_bits == 0) {
      // All-zero rows are stored as zeros directly, without running the
      // 1-D transform.
      memset(result_row, 0, 32 * sizeof(*result_row));
    } else {
      highbd_idct32_c(coeff_row, result_row, bd);
    }
  }

  // Column pass: gather one column of the row-pass result, transform it, then
  // round by 6 bits and fold each value into the matching dest sample.
  for (c = 0; c < 32; ++c) {
    for (r = 0; r < 32; ++r) col_in[r] = row_pass[r * 32 + c];
    highbd_idct32_c(col_in, col_out, bd);
    for (r = 0; r < 32; ++r) {
      uint16_t *const px = &dest[r * stride + c];
      *px = highbd_clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[r], 6), bd);
    }
  }
}
2547 
/* 32x32 high-bitdepth inverse transform + reconstruction for blocks whose
 * non-zero coefficients are confined to the upper-left 16x16 corner (this is
 * the caller's contract; it is not checked here).
 *
 * input  - 32x32 coefficients, 32 values per row; only rows 0..15 are read.
 * dest   - samples updated in place, one row every `stride` elements.
 * stride - distance between consecutive rows of dest, in uint16_t elements.
 * bd     - forwarded unchanged to highbd_idct32_c() and
 *          highbd_clip_pixel_add().
 */
void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest,
                                    int stride, int bd) {
  // Zero-initialized: rows 16..31 are never written by the row pass and must
  // read as zero in the column pass.
  tran_low_t row_pass[32 * 32] = { 0 };
  tran_low_t col_in[32], col_out[32];
  int r, c;

  // Row pass over the first 16 rows only.
  for (r = 0; r < 16; ++r) {
    highbd_idct32_c(&input[32 * r], &row_pass[32 * r], bd);
  }

  // Column pass over all 32 columns.
  for (c = 0; c < 32; ++c) {
    uint16_t *px = dest + c;  // top sample of column c

    for (r = 0; r < 32; ++r) col_in[r] = row_pass[r * 32 + c];
    highbd_idct32_c(col_in, col_out, bd);

    // Walk down the column one row (stride elements) at a time.
    for (r = 0; r < 32; ++r) {
      *px = highbd_clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[r], 6), bd);
      px += stride;
    }
  }
}
2575 
/* 32x32 high-bitdepth inverse transform + reconstruction for blocks whose
 * non-zero coefficients are confined to the upper-left 8x8 corner (this is
 * the caller's contract; it is not checked here).
 *
 * input  - 32x32 coefficients, 32 values per row; only rows 0..7 are read.
 * dest   - samples updated in place; row r, column c is dest[r * stride + c].
 * stride - distance between consecutive rows of dest, in uint16_t elements.
 * bd     - forwarded unchanged to highbd_idct32_c() and
 *          highbd_clip_pixel_add().
 */
void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest,
                                   int stride, int bd) {
  // Zero-initialized: rows 8..31 are never written by the row pass and must
  // read as zero in the column pass.
  tran_low_t row_pass[32 * 32] = { 0 };
  tran_low_t col_in[32], col_out[32];
  int r, c;

  // Row pass over the first 8 rows only.
  for (r = 0; r < 8; ++r) {
    highbd_idct32_c(input + 32 * r, row_pass + 32 * r, bd);
  }

  // Column pass: transform each of the 32 columns, round by 6 bits and fold
  // the result into dest.
  for (c = 0; c < 32; ++c) {
    for (r = 0; r < 32; ++r) col_in[r] = row_pass[r * 32 + c];
    highbd_idct32_c(col_in, col_out, bd);
    for (r = 0; r < 32; ++r) {
      uint16_t *const px = &dest[r * stride + c];
      *px = highbd_clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[r], 6), bd);
    }
  }
}
2601 
vpx_highbd_idct32x32_1_add_c(const tran_low_t * input,uint16_t * dest,int stride,int bd)2602 void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest,
2603                                   int stride, int bd) {
2604   int i, j;
2605   int a1;
2606   tran_low_t out =
2607       HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
2608 
2609   out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
2610   a1 = ROUND_POWER_OF_TWO(out, 6);
2611 
2612   for (j = 0; j < 32; ++j) {
2613     for (i = 0; i < 32; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
2614     dest += stride;
2615   }
2616 }
2617 
2618 #endif  // CONFIG_VP9_HIGHBITDEPTH
2619