/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10 
11 #include <math.h>
12 #include <string.h>
13 
14 #include "vpx_dsp/inv_txfm.h"
15 
// Full (all 16 coefficients) inverse 4x4 Walsh-Hadamard transform; the
// reconstructed residual is added into the prediction in |dest| (row pitch
// |stride|) with per-pixel clipping via clip_pixel_add().
void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
     0.5 shifts per pixel. */
  int i;
  tran_low_t output[16];
  tran_high_t a1, b1, c1, d1, e1;
  const tran_low_t *ip = input;
  tran_low_t *op = output;

  // First pass: transform each row into the intermediate buffer.
  for (i = 0; i < 4; i++) {
    a1 = ip[0] >> UNIT_QUANT_SHIFT;
    c1 = ip[1] >> UNIT_QUANT_SHIFT;
    d1 = ip[2] >> UNIT_QUANT_SHIFT;
    b1 = ip[3] >> UNIT_QUANT_SHIFT;
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    op[0] = WRAPLOW(a1, 8);
    op[1] = WRAPLOW(b1, 8);
    op[2] = WRAPLOW(c1, 8);
    op[3] = WRAPLOW(d1, 8);
    ip += 4;
    op += 4;
  }

  // Second pass: transform each column and accumulate into |dest|.
  ip = output;
  for (i = 0; i < 4; i++) {
    a1 = ip[4 * 0];
    c1 = ip[4 * 1];
    d1 = ip[4 * 2];
    b1 = ip[4 * 3];
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1);
    dest[stride * 1] = clip_pixel_add(dest[stride * 1], b1);
    dest[stride * 2] = clip_pixel_add(dest[stride * 2], c1);
    dest[stride * 3] = clip_pixel_add(dest[stride * 3], d1);

    ip++;
    dest++;
  }
}
67 
// DC-only inverse 4x4 Walsh-Hadamard transform: only in[0] is read; the
// result is added into the prediction in |dest| with pixel clipping.
void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {
  int i;
  tran_high_t a1, e1;
  tran_low_t tmp[4];
  const tran_low_t *ip = in;
  tran_low_t *op = tmp;

  // Row pass for a DC-only input collapses to a split of the DC value.
  a1 = ip[0] >> UNIT_QUANT_SHIFT;
  e1 = a1 >> 1;
  a1 -= e1;
  op[0] = WRAPLOW(a1, 8);
  op[1] = op[2] = op[3] = WRAPLOW(e1, 8);

  // Column pass: same split per column, accumulated into |dest|.
  ip = tmp;
  for (i = 0; i < 4; i++) {
    e1 = ip[0] >> 1;
    a1 = ip[0] - e1;
    dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a1);
    dest[dest_stride * 1] = clip_pixel_add(dest[dest_stride * 1], e1);
    dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e1);
    dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e1);
    ip++;
    dest++;
  }
}
93 
// 1-D 4-point inverse DCT. Each multiply is rounded with
// dct_const_round_shift() and wrapped to the pipeline range via WRAPLOW(., 8).
void idct4_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step[4];
  tran_high_t temp1, temp2;
  // stage 1: even butterfly (inputs 0,2) and odd rotation (inputs 1,3)
  temp1 = (input[0] + input[2]) * cospi_16_64;
  temp2 = (input[0] - input[2]) * cospi_16_64;
  step[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
  step[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step[3] = WRAPLOW(dct_const_round_shift(temp2), 8);

  // stage 2: final butterflies combining even and odd halves
  output[0] = WRAPLOW(step[0] + step[3], 8);
  output[1] = WRAPLOW(step[1] + step[2], 8);
  output[2] = WRAPLOW(step[1] - step[2], 8);
  output[3] = WRAPLOW(step[0] - step[3], 8);
}
113 
// Full inverse 4x4 DCT (rows then columns); the result is rounded by
// ROUND_POWER_OF_TWO(., 4) and added into |dest| with pixel clipping.
void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t out[4 * 4];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[4], temp_out[4];

  // Rows
  for (i = 0; i < 4; ++i) {
    idct4_c(input, outptr);
    input += 4;
    outptr += 4;
  }

  // Columns
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < 4; ++j)
      temp_in[j] = out[j * 4 + i];
    idct4_c(temp_in, temp_out);
    for (j = 0; j < 4; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 4));
    }
  }
}
138 
// DC-only inverse 4x4 DCT: only input[0] is read. The two cospi_16_64
// multiplies apply the row and column DC scaling; the single result |a1|
// is added to every pixel of the 4x4 block.
void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
                         int dest_stride) {
  int i;
  tran_high_t a1;
  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
  a1 = ROUND_POWER_OF_TWO(out, 4);

  for (i = 0; i < 4; i++) {
    dest[0] = clip_pixel_add(dest[0], a1);
    dest[1] = clip_pixel_add(dest[1], a1);
    dest[2] = clip_pixel_add(dest[2], a1);
    dest[3] = clip_pixel_add(dest[3], a1);
    dest += dest_stride;
  }
}
155 
// 1-D 8-point inverse DCT, four butterfly/rotation stages. Intermediates are
// rounded with dct_const_round_shift() and wrapped via WRAPLOW(., 8).
void idct8_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[8], step2[8];
  tran_high_t temp1, temp2;
  // stage 1: reorder even inputs, rotate odd inputs (1,7) and (5,3)
  step1[0] = input[0];
  step1[2] = input[4];
  step1[1] = input[2];
  step1[3] = input[6];
  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);

  // stage 2: 4-point idct on the even half, butterflies on the odd half
  temp1 = (step1[0] + step1[2]) * cospi_16_64;
  temp2 = (step1[0] - step1[2]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
  step2[7] = WRAPLOW(step1[6] + step1[7], 8);

  // stage 3: finish even half; rotate middle odd pair (5,6)
  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step1[7] = step2[7];

  // stage 4: final butterflies combining even and odd halves
  output[0] = WRAPLOW(step1[0] + step1[7], 8);
  output[1] = WRAPLOW(step1[1] + step1[6], 8);
  output[2] = WRAPLOW(step1[2] + step1[5], 8);
  output[3] = WRAPLOW(step1[3] + step1[4], 8);
  output[4] = WRAPLOW(step1[3] - step1[4], 8);
  output[5] = WRAPLOW(step1[2] - step1[5], 8);
  output[6] = WRAPLOW(step1[1] - step1[6], 8);
  output[7] = WRAPLOW(step1[0] - step1[7], 8);
}
209 
// Full inverse 8x8 DCT (all 64 coefficients): rows then columns, rounded by
// ROUND_POWER_OF_TWO(., 5) and added into |dest| with pixel clipping.
void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t out[8 * 8];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[8], temp_out[8];

  // First transform rows
  for (i = 0; i < 8; ++i) {
    idct8_c(input, outptr);
    input += 8;
    outptr += 8;
  }

  // Then transform columns
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j)
      temp_in[j] = out[j * 8 + i];
    idct8_c(temp_in, temp_out);
    for (j = 0; j < 8; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
    }
  }
}
234 
// DC-only inverse 8x8 DCT: only input[0] is read; the scaled DC value |a1|
// is added to every pixel of the 8x8 block with clipping.
void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int i, j;
  tran_high_t a1;
  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
  a1 = ROUND_POWER_OF_TWO(out, 5);
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i)
      dest[i] = clip_pixel_add(dest[i], a1);
    dest += stride;
  }
}
247 
// 1-D 4-point inverse ADST (asymmetric discrete sine transform). An all-zero
// input short-circuits to an all-zero output.
void iadst4_c(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

  tran_low_t x0 = input[0];
  tran_low_t x1 = input[1];
  tran_low_t x2 = input[2];
  tran_low_t x3 = input[3];

  // Fast path: nothing to do for an all-zero input row/column.
  if (!(x0 | x1 | x2 | x3)) {
    output[0] = output[1] = output[2] = output[3] = 0;
    return;
  }

  s0 = sinpi_1_9 * x0;
  s1 = sinpi_2_9 * x0;
  s2 = sinpi_3_9 * x1;
  s3 = sinpi_4_9 * x2;
  s4 = sinpi_1_9 * x2;
  s5 = sinpi_2_9 * x3;
  s6 = sinpi_4_9 * x3;
  s7 = x0 - x2 + x3;

  s0 = s0 + s3 + s5;
  s1 = s1 - s4 - s6;
  s3 = s2;
  s2 = sinpi_3_9 * s7;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = WRAPLOW(dct_const_round_shift(s0 + s3), 8);
  output[1] = WRAPLOW(dct_const_round_shift(s1 + s3), 8);
  output[2] = WRAPLOW(dct_const_round_shift(s2), 8);
  output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3), 8);
}
284 
// 1-D 8-point inverse ADST. Inputs are read in the transform's butterfly
// order; an all-zero input short-circuits to an all-zero output. The (int)
// casts on the stage products mirror the original fixed-point pipeline.
void iadst8_c(const tran_low_t *input, tran_low_t *output) {
  int s0, s1, s2, s3, s4, s5, s6, s7;

  tran_high_t x0 = input[7];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[5];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[3];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[1];
  tran_high_t x7 = input[6];

  // Fast path: nothing to do for an all-zero input row/column.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    output[0] = output[1] = output[2] = output[3] = output[4]
              = output[5] = output[6] = output[7] = 0;
    return;
  }

  // stage 1
  s0 = (int)(cospi_2_64  * x0 + cospi_30_64 * x1);
  s1 = (int)(cospi_30_64 * x0 - cospi_2_64  * x1);
  s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
  s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
  s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
  s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
  s6 = (int)(cospi_26_64 * x6 + cospi_6_64  * x7);
  s7 = (int)(cospi_6_64  * x6 - cospi_26_64 * x7);

  x0 = WRAPLOW(dct_const_round_shift(s0 + s4), 8);
  x1 = WRAPLOW(dct_const_round_shift(s1 + s5), 8);
  x2 = WRAPLOW(dct_const_round_shift(s2 + s6), 8);
  x3 = WRAPLOW(dct_const_round_shift(s3 + s7), 8);
  x4 = WRAPLOW(dct_const_round_shift(s0 - s4), 8);
  x5 = WRAPLOW(dct_const_round_shift(s1 - s5), 8);
  x6 = WRAPLOW(dct_const_round_shift(s2 - s6), 8);
  x7 = WRAPLOW(dct_const_round_shift(s3 - s7), 8);

  // stage 2
  s0 = (int)x0;
  s1 = (int)x1;
  s2 = (int)x2;
  s3 = (int)x3;
  s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
  s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
  s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
  s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);

  x0 = WRAPLOW(s0 + s2, 8);
  x1 = WRAPLOW(s1 + s3, 8);
  x2 = WRAPLOW(s0 - s2, 8);
  x3 = WRAPLOW(s1 - s3, 8);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);

  // stage 3
  s2 = (int)(cospi_16_64 * (x2 + x3));
  s3 = (int)(cospi_16_64 * (x2 - x3));
  s6 = (int)(cospi_16_64 * (x6 + x7));
  s7 = (int)(cospi_16_64 * (x6 - x7));

  x2 = WRAPLOW(dct_const_round_shift(s2), 8);
  x3 = WRAPLOW(dct_const_round_shift(s3), 8);
  x6 = WRAPLOW(dct_const_round_shift(s6), 8);
  x7 = WRAPLOW(dct_const_round_shift(s7), 8);

  // Final output permutation with alternating sign flips.
  output[0] = WRAPLOW(x0, 8);
  output[1] = WRAPLOW(-x4, 8);
  output[2] = WRAPLOW(x6, 8);
  output[3] = WRAPLOW(-x2, 8);
  output[4] = WRAPLOW(x3, 8);
  output[5] = WRAPLOW(-x7, 8);
  output[6] = WRAPLOW(x5, 8);
  output[7] = WRAPLOW(-x1, 8);
}
361 
// Sparse inverse 8x8 DCT for blocks with at most 12 non-zero coefficients:
// the row pass only processes the first 4 rows (the rest of |out| stays
// zero-initialized); the column pass is full.
void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t out[8 * 8] = { 0 };
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[8], temp_out[8];

  // First transform rows
  // only the first 4 rows have non-zero coefficients
  for (i = 0; i < 4; ++i) {
    idct8_c(input, outptr);
    input += 8;
    outptr += 8;
  }

  // Then transform columns
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j)
      temp_in[j] = out[j * 8 + i];
    idct8_c(temp_in, temp_out);
    for (j = 0; j < 8; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
    }
  }
}
387 
// 1-D 16-point inverse DCT, seven butterfly/rotation stages. The stage-1
// indices are written as even-constant/2 to mirror the 32-point layout.
void idct16_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[16], step2[16];
  tran_high_t temp1, temp2;

  // stage 1: bit-reversed-style input reordering
  step1[0] = input[0/2];
  step1[1] = input[16/2];
  step1[2] = input[8/2];
  step1[3] = input[24/2];
  step1[4] = input[4/2];
  step1[5] = input[20/2];
  step1[6] = input[12/2];
  step1[7] = input[28/2];
  step1[8] = input[2/2];
  step1[9] = input[18/2];
  step1[10] = input[10/2];
  step1[11] = input[26/2];
  step1[12] = input[6/2];
  step1[13] = input[22/2];
  step1[14] = input[14/2];
  step1[15] = input[30/2];

  // stage 2: rotations on the odd half (indices 8..15)
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);

  // stage 3: rotations on indices 4..7; butterflies on 8..15
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);

  step1[8] = WRAPLOW(step2[8] + step2[9], 8);
  step1[9] = WRAPLOW(step2[8] - step2[9], 8);
  step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
  step1[11] = WRAPLOW(step2[10] + step2[11], 8);
  step1[12] = WRAPLOW(step2[12] + step2[13], 8);
  step1[13] = WRAPLOW(step2[12] - step2[13], 8);
  step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
  step1[15] = WRAPLOW(step2[14] + step2[15], 8);

  // stage 4: 4-point idct core on 0..3; butterflies on 4..7; rotations on 9/14
  // and 10/13
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
  step2[7] = WRAPLOW(step1[6] + step1[7], 8);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5: butterflies on 0..3 and 8..15; rotation on 5/6
  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step1[7] = step2[7];

  step1[8] = WRAPLOW(step2[8] + step2[11], 8);
  step1[9] = WRAPLOW(step2[9] + step2[10], 8);
  step1[10] = WRAPLOW(step2[9] - step2[10], 8);
  step1[11] = WRAPLOW(step2[8] - step2[11], 8);
  step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
  step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
  step1[14] = WRAPLOW(step2[13] + step2[14], 8);
  step1[15] = WRAPLOW(step2[12] + step2[15], 8);

  // stage 6: butterflies on 0..7; rotations on 10/13 and 11/12
  step2[0] = WRAPLOW(step1[0] + step1[7], 8);
  step2[1] = WRAPLOW(step1[1] + step1[6], 8);
  step2[2] = WRAPLOW(step1[2] + step1[5], 8);
  step2[3] = WRAPLOW(step1[3] + step1[4], 8);
  step2[4] = WRAPLOW(step1[3] - step1[4], 8);
  step2[5] = WRAPLOW(step1[2] - step1[5], 8);
  step2[6] = WRAPLOW(step1[1] - step1[6], 8);
  step2[7] = WRAPLOW(step1[0] - step1[7], 8);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7: final butterflies combining halves
  output[0] = WRAPLOW(step2[0] + step2[15], 8);
  output[1] = WRAPLOW(step2[1] + step2[14], 8);
  output[2] = WRAPLOW(step2[2] + step2[13], 8);
  output[3] = WRAPLOW(step2[3] + step2[12], 8);
  output[4] = WRAPLOW(step2[4] + step2[11], 8);
  output[5] = WRAPLOW(step2[5] + step2[10], 8);
  output[6] = WRAPLOW(step2[6] + step2[9], 8);
  output[7] = WRAPLOW(step2[7] + step2[8], 8);
  output[8] = WRAPLOW(step2[7] - step2[8], 8);
  output[9] = WRAPLOW(step2[6] - step2[9], 8);
  output[10] = WRAPLOW(step2[5] - step2[10], 8);
  output[11] = WRAPLOW(step2[4] - step2[11], 8);
  output[12] = WRAPLOW(step2[3] - step2[12], 8);
  output[13] = WRAPLOW(step2[2] - step2[13], 8);
  output[14] = WRAPLOW(step2[1] - step2[14], 8);
  output[15] = WRAPLOW(step2[0] - step2[15], 8);
}
552 
// Full inverse 16x16 DCT (all 256 coefficients): rows then columns, rounded
// by ROUND_POWER_OF_TWO(., 6) and added into |dest| with pixel clipping.
void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  tran_low_t out[16 * 16];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[16], temp_out[16];

  // First transform rows
  for (i = 0; i < 16; ++i) {
    idct16_c(input, outptr);
    input += 16;
    outptr += 16;
  }

  // Then transform columns
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j)
      temp_in[j] = out[j * 16 + i];
    idct16_c(temp_in, temp_out);
    for (j = 0; j < 16; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
    }
  }
}
578 
// 1-D 16-point inverse ADST. Inputs are read in the transform's butterfly
// order; an all-zero input short-circuits to an all-zero output. Stage 3
// additions use check_range() before wrapping.
void iadst16_c(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;

  tran_high_t x0 = input[15];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[13];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[11];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[9];
  tran_high_t x7 = input[6];
  tran_high_t x8 = input[7];
  tran_high_t x9 = input[8];
  tran_high_t x10 = input[5];
  tran_high_t x11 = input[10];
  tran_high_t x12 = input[3];
  tran_high_t x13 = input[12];
  tran_high_t x14 = input[1];
  tran_high_t x15 = input[14];

  // Fast path: nothing to do for an all-zero input row/column.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
    output[0] = output[1] = output[2] = output[3] = output[4]
              = output[5] = output[6] = output[7] = output[8]
              = output[9] = output[10] = output[11] = output[12]
              = output[13] = output[14] = output[15] = 0;
    return;
  }

  // stage 1
  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;

  x0 = WRAPLOW(dct_const_round_shift(s0 + s8), 8);
  x1 = WRAPLOW(dct_const_round_shift(s1 + s9), 8);
  x2 = WRAPLOW(dct_const_round_shift(s2 + s10), 8);
  x3 = WRAPLOW(dct_const_round_shift(s3 + s11), 8);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s12), 8);
  x5 = WRAPLOW(dct_const_round_shift(s5 + s13), 8);
  x6 = WRAPLOW(dct_const_round_shift(s6 + s14), 8);
  x7 = WRAPLOW(dct_const_round_shift(s7 + s15), 8);
  x8 = WRAPLOW(dct_const_round_shift(s0 - s8), 8);
  x9 = WRAPLOW(dct_const_round_shift(s1 - s9), 8);
  x10 = WRAPLOW(dct_const_round_shift(s2 - s10), 8);
  x11 = WRAPLOW(dct_const_round_shift(s3 - s11), 8);
  x12 = WRAPLOW(dct_const_round_shift(s4 - s12), 8);
  x13 = WRAPLOW(dct_const_round_shift(s5 - s13), 8);
  x14 = WRAPLOW(dct_const_round_shift(s6 - s14), 8);
  x15 = WRAPLOW(dct_const_round_shift(s7 - s15), 8);

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;

  x0 = WRAPLOW(s0 + s4, 8);
  x1 = WRAPLOW(s1 + s5, 8);
  x2 = WRAPLOW(s2 + s6, 8);
  x3 = WRAPLOW(s3 + s7, 8);
  x4 = WRAPLOW(s0 - s4, 8);
  x5 = WRAPLOW(s1 - s5, 8);
  x6 = WRAPLOW(s2 - s6, 8);
  x7 = WRAPLOW(s3 - s7, 8);
  x8 = WRAPLOW(dct_const_round_shift(s8 + s12), 8);
  x9 = WRAPLOW(dct_const_round_shift(s9 + s13), 8);
  x10 = WRAPLOW(dct_const_round_shift(s10 + s14), 8);
  x11 = WRAPLOW(dct_const_round_shift(s11 + s15), 8);
  x12 = WRAPLOW(dct_const_round_shift(s8 - s12), 8);
  x13 = WRAPLOW(dct_const_round_shift(s9 - s13), 8);
  x14 = WRAPLOW(dct_const_round_shift(s10 - s14), 8);
  x15 = WRAPLOW(dct_const_round_shift(s11 - s15), 8);

  // stage 3
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;

  x0 = WRAPLOW(check_range(s0 + s2), 8);
  x1 = WRAPLOW(check_range(s1 + s3), 8);
  x2 = WRAPLOW(check_range(s0 - s2), 8);
  x3 = WRAPLOW(check_range(s1 - s3), 8);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
  x8 = WRAPLOW(check_range(s8 + s10), 8);
  x9 = WRAPLOW(check_range(s9 + s11), 8);
  x10 = WRAPLOW(check_range(s8 - s10), 8);
  x11 = WRAPLOW(check_range(s9 - s11), 8);
  x12 = WRAPLOW(dct_const_round_shift(s12 + s14), 8);
  x13 = WRAPLOW(dct_const_round_shift(s13 + s15), 8);
  x14 = WRAPLOW(dct_const_round_shift(s12 - s14), 8);
  x15 = WRAPLOW(dct_const_round_shift(s13 - s15), 8);

  // stage 4
  s2 = (-cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (-x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (-x10 + x11);
  s14 = (-cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);

  x2 = WRAPLOW(dct_const_round_shift(s2), 8);
  x3 = WRAPLOW(dct_const_round_shift(s3), 8);
  x6 = WRAPLOW(dct_const_round_shift(s6), 8);
  x7 = WRAPLOW(dct_const_round_shift(s7), 8);
  x10 = WRAPLOW(dct_const_round_shift(s10), 8);
  x11 = WRAPLOW(dct_const_round_shift(s11), 8);
  x14 = WRAPLOW(dct_const_round_shift(s14), 8);
  x15 = WRAPLOW(dct_const_round_shift(s15), 8);

  // Final output permutation with alternating sign flips.
  output[0] = WRAPLOW(x0, 8);
  output[1] = WRAPLOW(-x8, 8);
  output[2] = WRAPLOW(x12, 8);
  output[3] = WRAPLOW(-x4, 8);
  output[4] = WRAPLOW(x6, 8);
  output[5] = WRAPLOW(x14, 8);
  output[6] = WRAPLOW(x10, 8);
  output[7] = WRAPLOW(x2, 8);
  output[8] = WRAPLOW(x3, 8);
  output[9] = WRAPLOW(x11, 8);
  output[10] = WRAPLOW(x15, 8);
  output[11] = WRAPLOW(x7, 8);
  output[12] = WRAPLOW(x5, 8);
  output[13] = WRAPLOW(-x13, 8);
  output[14] = WRAPLOW(x9, 8);
  output[15] = WRAPLOW(-x1, 8);
}
750 
// Sparse inverse 16x16 DCT for blocks with at most 10 non-zero coefficients
// (all within the upper-left 4x4): the row pass only processes the first 4
// rows (the rest of |out| stays zero-initialized); the column pass is full.
void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  tran_low_t out[16 * 16] = { 0 };
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[16], temp_out[16];

  // First transform rows. Since all non-zero dct coefficients are in
  // upper-left 4x4 area, we only need to calculate first 4 rows here.
  for (i = 0; i < 4; ++i) {
    idct16_c(input, outptr);
    input += 16;
    outptr += 16;
  }

  // Then transform columns
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j)
      temp_in[j] = out[j * 16 + i];
    idct16_c(temp_in, temp_out);
    for (j = 0; j < 16; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
    }
  }
}
777 
// DC-only inverse 16x16 DCT: only input[0] is read; the scaled DC value |a1|
// is added to every pixel of the 16x16 block with clipping.
void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int i, j;
  tran_high_t a1;
  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
  a1 = ROUND_POWER_OF_TWO(out, 6);
  for (j = 0; j < 16; ++j) {
    for (i = 0; i < 16; ++i)
      dest[i] = clip_pixel_add(dest[i], a1);
    dest += stride;
  }
}
790 
// 1-D 32-point inverse DCT on a single row or column of coefficients.
// Implemented as a seven-stage butterfly network followed by a final
// combining stage. Every intermediate result is narrowed with WRAPLOW(x, 8)
// (and cross-multiplies rounded via dct_const_round_shift) to stay bit-exact
// with the fixed-point reference pipeline.
void idct32_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[32], step2[32];
  tran_high_t temp1, temp2;

  // stage 1
  // Even half: reorder the even-indexed inputs; odd half: rotate the
  // odd-indexed input pairs by the cospi_k_64 constants.
  step1[0] = input[0];
  step1[1] = input[16];
  step1[2] = input[8];
  step1[3] = input[24];
  step1[4] = input[4];
  step1[5] = input[20];
  step1[6] = input[12];
  step1[7] = input[28];
  step1[8] = input[2];
  step1[9] = input[18];
  step1[10] = input[10];
  step1[11] = input[26];
  step1[12] = input[6];
  step1[13] = input[22];
  step1[14] = input[14];
  step1[15] = input[30];

  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
  step1[16] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[31] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
  step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
  step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
  step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);

  // stage 2
  // 0..7 pass through; 8..15 are rotated; 16..31 form add/sub butterflies.
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);

  step2[16] = WRAPLOW(step1[16] + step1[17], 8);
  step2[17] = WRAPLOW(step1[16] - step1[17], 8);
  step2[18] = WRAPLOW(-step1[18] + step1[19], 8);
  step2[19] = WRAPLOW(step1[18] + step1[19], 8);
  step2[20] = WRAPLOW(step1[20] + step1[21], 8);
  step2[21] = WRAPLOW(step1[20] - step1[21], 8);
  step2[22] = WRAPLOW(-step1[22] + step1[23], 8);
  step2[23] = WRAPLOW(step1[22] + step1[23], 8);
  step2[24] = WRAPLOW(step1[24] + step1[25], 8);
  step2[25] = WRAPLOW(step1[24] - step1[25], 8);
  step2[26] = WRAPLOW(-step1[26] + step1[27], 8);
  step2[27] = WRAPLOW(step1[26] + step1[27], 8);
  step2[28] = WRAPLOW(step1[28] + step1[29], 8);
  step2[29] = WRAPLOW(step1[28] - step1[29], 8);
  step2[30] = WRAPLOW(-step1[30] + step1[31], 8);
  step2[31] = WRAPLOW(step1[30] + step1[31], 8);

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);

  step1[8] = WRAPLOW(step2[8] + step2[9], 8);
  step1[9] = WRAPLOW(step2[8] - step2[9], 8);
  step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
  step1[11] = WRAPLOW(step2[10] + step2[11], 8);
  step1[12] = WRAPLOW(step2[12] + step2[13], 8);
  step1[13] = WRAPLOW(step2[12] - step2[13], 8);
  step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
  step1[15] = WRAPLOW(step2[14] + step2[15], 8);

  step1[16] = step2[16];
  step1[31] = step2[31];
  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
  step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step1[19] = step2[19];
  step1[20] = step2[20];
  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[27] = step2[27];
  step1[28] = step2[28];

  // stage 4
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
  step2[7] = WRAPLOW(step1[6] + step1[7], 8);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step2[11] = step1[11];
  step2[12] = step1[12];

  step2[16] = WRAPLOW(step1[16] + step1[19], 8);
  step2[17] = WRAPLOW(step1[17] + step1[18], 8);
  step2[18] = WRAPLOW(step1[17] - step1[18], 8);
  step2[19] = WRAPLOW(step1[16] - step1[19], 8);
  step2[20] = WRAPLOW(-step1[20] + step1[23], 8);
  step2[21] = WRAPLOW(-step1[21] + step1[22], 8);
  step2[22] = WRAPLOW(step1[21] + step1[22], 8);
  step2[23] = WRAPLOW(step1[20] + step1[23], 8);

  step2[24] = WRAPLOW(step1[24] + step1[27], 8);
  step2[25] = WRAPLOW(step1[25] + step1[26], 8);
  step2[26] = WRAPLOW(step1[25] - step1[26], 8);
  step2[27] = WRAPLOW(step1[24] - step1[27], 8);
  step2[28] = WRAPLOW(-step1[28] + step1[31], 8);
  step2[29] = WRAPLOW(-step1[29] + step1[30], 8);
  step2[30] = WRAPLOW(step1[29] + step1[30], 8);
  step2[31] = WRAPLOW(step1[28] + step1[31], 8);

  // stage 5
  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step1[7] = step2[7];

  step1[8] = WRAPLOW(step2[8] + step2[11], 8);
  step1[9] = WRAPLOW(step2[9] + step2[10], 8);
  step1[10] = WRAPLOW(step2[9] - step2[10], 8);
  step1[11] = WRAPLOW(step2[8] - step2[11], 8);
  step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
  step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
  step1[14] = WRAPLOW(step2[13] + step2[14], 8);
  step1[15] = WRAPLOW(step2[12] + step2[15], 8);

  step1[16] = step2[16];
  step1[17] = step2[17];
  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
  step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step1[22] = step2[22];
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[25] = step2[25];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // stage 6
  step2[0] = WRAPLOW(step1[0] + step1[7], 8);
  step2[1] = WRAPLOW(step1[1] + step1[6], 8);
  step2[2] = WRAPLOW(step1[2] + step1[5], 8);
  step2[3] = WRAPLOW(step1[3] + step1[4], 8);
  step2[4] = WRAPLOW(step1[3] - step1[4], 8);
  step2[5] = WRAPLOW(step1[2] - step1[5], 8);
  step2[6] = WRAPLOW(step1[1] - step1[6], 8);
  step2[7] = WRAPLOW(step1[0] - step1[7], 8);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step2[14] = step1[14];
  step2[15] = step1[15];

  step2[16] = WRAPLOW(step1[16] + step1[23], 8);
  step2[17] = WRAPLOW(step1[17] + step1[22], 8);
  step2[18] = WRAPLOW(step1[18] + step1[21], 8);
  step2[19] = WRAPLOW(step1[19] + step1[20], 8);
  step2[20] = WRAPLOW(step1[19] - step1[20], 8);
  step2[21] = WRAPLOW(step1[18] - step1[21], 8);
  step2[22] = WRAPLOW(step1[17] - step1[22], 8);
  step2[23] = WRAPLOW(step1[16] - step1[23], 8);

  step2[24] = WRAPLOW(-step1[24] + step1[31], 8);
  step2[25] = WRAPLOW(-step1[25] + step1[30], 8);
  step2[26] = WRAPLOW(-step1[26] + step1[29], 8);
  step2[27] = WRAPLOW(-step1[27] + step1[28], 8);
  step2[28] = WRAPLOW(step1[27] + step1[28], 8);
  step2[29] = WRAPLOW(step1[26] + step1[29], 8);
  step2[30] = WRAPLOW(step1[25] + step1[30], 8);
  step2[31] = WRAPLOW(step1[24] + step1[31], 8);

  // stage 7
  step1[0] = WRAPLOW(step2[0] + step2[15], 8);
  step1[1] = WRAPLOW(step2[1] + step2[14], 8);
  step1[2] = WRAPLOW(step2[2] + step2[13], 8);
  step1[3] = WRAPLOW(step2[3] + step2[12], 8);
  step1[4] = WRAPLOW(step2[4] + step2[11], 8);
  step1[5] = WRAPLOW(step2[5] + step2[10], 8);
  step1[6] = WRAPLOW(step2[6] + step2[9], 8);
  step1[7] = WRAPLOW(step2[7] + step2[8], 8);
  step1[8] = WRAPLOW(step2[7] - step2[8], 8);
  step1[9] = WRAPLOW(step2[6] - step2[9], 8);
  step1[10] = WRAPLOW(step2[5] - step2[10], 8);
  step1[11] = WRAPLOW(step2[4] - step2[11], 8);
  step1[12] = WRAPLOW(step2[3] - step2[12], 8);
  step1[13] = WRAPLOW(step2[2] - step2[13], 8);
  step1[14] = WRAPLOW(step2[1] - step2[14], 8);
  step1[15] = WRAPLOW(step2[0] - step2[15], 8);

  step1[16] = step2[16];
  step1[17] = step2[17];
  step1[18] = step2[18];
  step1[19] = step2[19];
  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
  temp2 = (step2[20] + step2[27]) * cospi_16_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
  temp2 = (step2[21] + step2[26]) * cospi_16_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
  temp2 = (step2[22] + step2[25]) * cospi_16_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
  temp2 = (step2[23] + step2[24]) * cospi_16_64;
  step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step1[28] = step2[28];
  step1[29] = step2[29];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // final stage
  // Mirror-combine the two halves into the 32 outputs.
  output[0] = WRAPLOW(step1[0] + step1[31], 8);
  output[1] = WRAPLOW(step1[1] + step1[30], 8);
  output[2] = WRAPLOW(step1[2] + step1[29], 8);
  output[3] = WRAPLOW(step1[3] + step1[28], 8);
  output[4] = WRAPLOW(step1[4] + step1[27], 8);
  output[5] = WRAPLOW(step1[5] + step1[26], 8);
  output[6] = WRAPLOW(step1[6] + step1[25], 8);
  output[7] = WRAPLOW(step1[7] + step1[24], 8);
  output[8] = WRAPLOW(step1[8] + step1[23], 8);
  output[9] = WRAPLOW(step1[9] + step1[22], 8);
  output[10] = WRAPLOW(step1[10] + step1[21], 8);
  output[11] = WRAPLOW(step1[11] + step1[20], 8);
  output[12] = WRAPLOW(step1[12] + step1[19], 8);
  output[13] = WRAPLOW(step1[13] + step1[18], 8);
  output[14] = WRAPLOW(step1[14] + step1[17], 8);
  output[15] = WRAPLOW(step1[15] + step1[16], 8);
  output[16] = WRAPLOW(step1[15] - step1[16], 8);
  output[17] = WRAPLOW(step1[14] - step1[17], 8);
  output[18] = WRAPLOW(step1[13] - step1[18], 8);
  output[19] = WRAPLOW(step1[12] - step1[19], 8);
  output[20] = WRAPLOW(step1[11] - step1[20], 8);
  output[21] = WRAPLOW(step1[10] - step1[21], 8);
  output[22] = WRAPLOW(step1[9] - step1[22], 8);
  output[23] = WRAPLOW(step1[8] - step1[23], 8);
  output[24] = WRAPLOW(step1[7] - step1[24], 8);
  output[25] = WRAPLOW(step1[6] - step1[25], 8);
  output[26] = WRAPLOW(step1[5] - step1[26], 8);
  output[27] = WRAPLOW(step1[4] - step1[27], 8);
  output[28] = WRAPLOW(step1[3] - step1[28], 8);
  output[29] = WRAPLOW(step1[2] - step1[29], 8);
  output[30] = WRAPLOW(step1[1] - step1[30], 8);
  output[31] = WRAPLOW(step1[0] - step1[31], 8);
}
1157 
// 2-D 32x32 inverse DCT: row transforms into an intermediate buffer, then
// column transforms whose rounded results (>> 6) are added into dest with
// pixel clipping. Rows that are entirely zero skip the transform.
void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
                              int stride) {
  tran_low_t out[32 * 32];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[32], temp_out[32];

  // Rows
  for (i = 0; i < 32; ++i) {
    // OR all 32 coefficients together: nonzero iff any coefficient is
    // nonzero. Accumulate in tran_low_t (not int16_t) so that, in
    // high-bitdepth builds where tran_low_t is 32 bits wide, coefficients
    // whose low 16 bits happen to be zero are not truncated away (which
    // would wrongly skip the row transform).
    tran_low_t zero_coeff = 0;
    for (j = 0; j < 32; ++j)
      zero_coeff |= input[j];

    if (zero_coeff)
      idct32_c(input, outptr);
    else
      memset(outptr, 0, sizeof(tran_low_t) * 32);
    input += 32;
    outptr += 32;
  }

  // Columns
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j)
      temp_in[j] = out[j * 32 + i];
    idct32_c(temp_in, temp_out);
    for (j = 0; j < 32; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
    }
  }
}
1196 
// 2-D 32x32 inverse DCT for blocks with at most 34 non-zero coefficients:
// only the upper-left 8x8 corner of the input can be non-zero, so only the
// first 8 row transforms are computed; the rest of the intermediate buffer
// stays zero-initialized.
void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  int row, col, k;
  tran_low_t buf[32 * 32] = {0};
  tran_low_t col_in[32], col_out[32];
  tran_low_t *row_dst = buf;

  // Rows: only the first 8 carry data.
  for (row = 0; row < 8; ++row) {
    idct32_c(input, row_dst);
    input += 32;
    row_dst += 32;
  }

  // Columns: transform each column, round (>> 6) and accumulate into dest
  // with pixel clipping.
  for (col = 0; col < 32; ++col) {
    for (k = 0; k < 32; ++k)
      col_in[k] = buf[k * 32 + col];
    idct32_c(col_in, col_out);
    for (k = 0; k < 32; ++k) {
      dest[k * stride + col] = clip_pixel_add(
          dest[k * stride + col], ROUND_POWER_OF_TWO(col_out[k], 6));
    }
  }
}
1223 
// DC-only 32x32 inverse transform: the 2-D transform collapses to two
// successive scalings of input[0] by cospi_16_64, a final rounding shift
// by 6, and a constant add over the whole 32x32 destination block.
void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int r, c;
  tran_high_t dc_value;
  tran_low_t out;

  out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
  dc_value = ROUND_POWER_OF_TWO(out, 6);

  for (r = 0; r < 32; ++r) {
    for (c = 0; c < 32; ++c)
      dest[c] = clip_pixel_add(dest[c], dc_value);
    dest += stride;
  }
}
1238 
1239 #if CONFIG_VP9_HIGHBITDEPTH
void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int bd) {
  /* 4-point reversible, orthonormal inverse Walsh-Hadamard transform,
     3.5 adds and 0.5 shifts per pixel; high-bitdepth destination. */
  int k;
  tran_low_t intermediate[16];
  tran_high_t t0, t1, t2, t3, mid;
  const tran_low_t *src = input;
  tran_low_t *dst = intermediate;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // Pass 1: transform each row into the intermediate buffer.
  for (k = 0; k < 4; k++) {
    t0 = src[0] >> UNIT_QUANT_SHIFT;
    t2 = src[1] >> UNIT_QUANT_SHIFT;
    t3 = src[2] >> UNIT_QUANT_SHIFT;
    t1 = src[3] >> UNIT_QUANT_SHIFT;
    t0 += t2;
    t3 -= t1;
    mid = (t0 - t3) >> 1;
    t1 = mid - t1;
    t2 = mid - t2;
    t0 -= t1;
    t3 += t2;
    dst[0] = WRAPLOW(t0, bd);
    dst[1] = WRAPLOW(t1, bd);
    dst[2] = WRAPLOW(t2, bd);
    dst[3] = WRAPLOW(t3, bd);
    src += 4;
    dst += 4;
  }

  // Pass 2: transform each column and accumulate into the destination
  // with high-bitdepth clipping.
  src = intermediate;
  for (k = 0; k < 4; k++) {
    t0 = src[4 * 0];
    t2 = src[4 * 1];
    t3 = src[4 * 2];
    t1 = src[4 * 3];
    t0 += t2;
    t3 -= t1;
    mid = (t0 - t3) >> 1;
    t1 = mid - t1;
    t2 = mid - t2;
    t0 -= t1;
    t3 += t2;
    dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], t0, bd);
    dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], t1, bd);
    dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], t2, bd);
    dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], t3, bd);

    src++;
    dest++;
  }
}
1293 
// DC-only inverse Walsh-Hadamard, high-bitdepth destination.
void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
                                int dest_stride, int bd) {
  int k;
  tran_high_t dc, half;
  tran_low_t row_out[4];
  const tran_low_t *src = in;
  tran_low_t *dst = row_out;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  (void) bd;  // NOTE: bd may be unused depending on how WRAPLOW is configured.

  // Row pass: only the DC coefficient is present; split it into dc/half.
  dc = src[0] >> UNIT_QUANT_SHIFT;
  half = dc >> 1;
  dc -= half;
  dst[0] = WRAPLOW(dc, bd);
  dst[1] = dst[2] = dst[3] = WRAPLOW(half, bd);

  // Column pass: apply the same split down each column and add the results
  // into the destination with high-bitdepth clipping.
  src = row_out;
  for (k = 0; k < 4; k++) {
    half = src[0] >> 1;
    dc = src[0] - half;
    dest[dest_stride * 0] = highbd_clip_pixel_add(
        dest[dest_stride * 0], dc, bd);
    dest[dest_stride * 1] = highbd_clip_pixel_add(
        dest[dest_stride * 1], half, bd);
    dest[dest_stride * 2] = highbd_clip_pixel_add(
        dest[dest_stride * 2], half, bd);
    dest[dest_stride * 3] = highbd_clip_pixel_add(
        dest[dest_stride * 3], half, bd);
    src++;
    dest++;
  }
}
1326 
// 1-D 4-point inverse DCT, high-bitdepth variant: one stage of rotations
// followed by one stage of add/sub butterflies.
void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t even0, even1, odd0, odd1;
  tran_low_t step[4];
  (void) bd;  // NOTE: bd may be unused depending on how WRAPLOW is configured.

  // stage 1
  even0 = (input[0] + input[2]) * cospi_16_64;
  even1 = (input[0] - input[2]) * cospi_16_64;
  step[0] = WRAPLOW(highbd_dct_const_round_shift(even0, bd), bd);
  step[1] = WRAPLOW(highbd_dct_const_round_shift(even1, bd), bd);
  odd0 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
  odd1 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
  step[2] = WRAPLOW(highbd_dct_const_round_shift(odd0, bd), bd);
  step[3] = WRAPLOW(highbd_dct_const_round_shift(odd1, bd), bd);

  // stage 2
  output[0] = WRAPLOW(step[0] + step[3], bd);
  output[1] = WRAPLOW(step[1] + step[2], bd);
  output[2] = WRAPLOW(step[1] - step[2], bd);
  output[3] = WRAPLOW(step[0] - step[3], bd);
}
1347 
// 2-D 4x4 inverse DCT, high-bitdepth: rows then columns, final
// rounding shift by 4, result added into dest with clipping.
void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int bd) {
  int r, c, k;
  tran_low_t buf[4 * 4];
  tran_low_t col_in[4], col_out[4];
  tran_low_t *row_dst = buf;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // Rows
  for (r = 0; r < 4; ++r) {
    vpx_highbd_idct4_c(input, row_dst, bd);
    input += 4;
    row_dst += 4;
  }

  // Columns
  for (c = 0; c < 4; ++c) {
    for (k = 0; k < 4; ++k)
      col_in[k] = buf[k * 4 + c];
    vpx_highbd_idct4_c(col_in, col_out, bd);
    for (k = 0; k < 4; ++k) {
      dest[k * stride + c] = highbd_clip_pixel_add(
          dest[k * stride + c], ROUND_POWER_OF_TWO(col_out[k], 4), bd);
    }
  }
}
1374 
// DC-only 4x4 inverse DCT, high-bitdepth: two scalings by cospi_16_64,
// rounding shift by 4, constant add over the 4x4 block.
void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
                                int dest_stride, int bd) {
  int r;
  tran_high_t dc;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  tran_low_t scaled = WRAPLOW(
      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);

  scaled = WRAPLOW(highbd_dct_const_round_shift(scaled * cospi_16_64, bd), bd);
  dc = ROUND_POWER_OF_TWO(scaled, 4);

  for (r = 0; r < 4; r++) {
    dest[0] = highbd_clip_pixel_add(dest[0], dc, bd);
    dest[1] = highbd_clip_pixel_add(dest[1], dc, bd);
    dest[2] = highbd_clip_pixel_add(dest[2], dc, bd);
    dest[3] = highbd_clip_pixel_add(dest[3], dc, bd);
    dest += dest_stride;
  }
}
1394 
// 1-D 8-point inverse DCT, high-bitdepth variant. Reuses the 4-point
// transform for the even half and butterflies/rotations for the odd half.
void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step1[8], step2[8];
  tran_high_t temp1, temp2;
  // stage 1
  // Even inputs are reordered for the 4-point transform below; odd inputs
  // (1, 7) and (5, 3) are rotated by the cospi constants.
  step1[0] = input[0];
  step1[2] = input[4];
  step1[1] = input[2];
  step1[3] = input[6];
  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  // stage 2 & stage 3 - even half
  // In-place 4-point inverse DCT on step1[0..3].
  vpx_highbd_idct4_c(step1, step1, bd);

  // stage 2 - odd half
  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
  step2[7] = WRAPLOW(step1[6] + step1[7], bd);

  // stage 3 - odd half
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step1[7] = step2[7];

  // stage 4
  // Mirror-combine even and odd halves into the 8 outputs.
  output[0] = WRAPLOW(step1[0] + step1[7], bd);
  output[1] = WRAPLOW(step1[1] + step1[6], bd);
  output[2] = WRAPLOW(step1[2] + step1[5], bd);
  output[3] = WRAPLOW(step1[3] + step1[4], bd);
  output[4] = WRAPLOW(step1[3] - step1[4], bd);
  output[5] = WRAPLOW(step1[2] - step1[5], bd);
  output[6] = WRAPLOW(step1[1] - step1[6], bd);
  output[7] = WRAPLOW(step1[0] - step1[7], bd);
}
1439 
// 2-D 8x8 inverse DCT, high-bitdepth: rows then columns, final rounding
// shift by 5, result added into dest with clipping.
void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int bd) {
  int r, c, k;
  tran_low_t buf[8 * 8];
  tran_low_t col_in[8], col_out[8];
  tran_low_t *row_dst = buf;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // First transform rows.
  for (r = 0; r < 8; ++r) {
    vpx_highbd_idct8_c(input, row_dst, bd);
    input += 8;
    row_dst += 8;
  }

  // Then transform columns.
  for (c = 0; c < 8; ++c) {
    for (k = 0; k < 8; ++k)
      col_in[k] = buf[k * 8 + c];
    vpx_highbd_idct8_c(col_in, col_out, bd);
    for (k = 0; k < 8; ++k) {
      dest[k * stride + c] = highbd_clip_pixel_add(
          dest[k * stride + c], ROUND_POWER_OF_TWO(col_out[k], 5), bd);
    }
  }
}
1466 
// DC-only 8x8 inverse DCT, high-bitdepth: two scalings by cospi_16_64,
// rounding shift by 5, constant add over the 8x8 block.
void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
                                int stride, int bd) {
  int r, c;
  tran_high_t dc;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  tran_low_t scaled = WRAPLOW(
      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);

  scaled = WRAPLOW(highbd_dct_const_round_shift(scaled * cospi_16_64, bd), bd);
  dc = ROUND_POWER_OF_TWO(scaled, 5);
  for (r = 0; r < 8; ++r) {
    for (c = 0; c < 8; ++c)
      dest[c] = highbd_clip_pixel_add(dest[c], dc, bd);
    dest += stride;
  }
}
1482 
// 1-D 4-point inverse ADST, high-bitdepth variant.
void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t t0, t1, t2, t3, t4, t5, t6, t7;
  tran_low_t x0 = input[0];
  tran_low_t x1 = input[1];
  tran_low_t x2 = input[2];
  tran_low_t x3 = input[3];
  (void) bd;  // NOTE: bd may be unused depending on how WRAPLOW is configured.

  // All-zero input produces all-zero output; skip the arithmetic.
  if (!(x0 | x1 | x2 | x3)) {
    memset(output, 0, 4 * sizeof(*output));
    return;
  }

  t0 = sinpi_1_9 * x0;
  t1 = sinpi_2_9 * x0;
  t2 = sinpi_3_9 * x1;
  t3 = sinpi_4_9 * x2;
  t4 = sinpi_1_9 * x2;
  t5 = sinpi_2_9 * x3;
  t6 = sinpi_4_9 * x3;
  t7 = (tran_high_t)(x0 - x2 + x3);

  t0 = t0 + t3 + t5;
  t1 = t1 - t4 - t6;
  t3 = t2;
  t2 = sinpi_3_9 * t7;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = WRAPLOW(highbd_dct_const_round_shift(t0 + t3, bd), bd);
  output[1] = WRAPLOW(highbd_dct_const_round_shift(t1 + t3, bd), bd);
  output[2] = WRAPLOW(highbd_dct_const_round_shift(t2, bd), bd);
  output[3] = WRAPLOW(highbd_dct_const_round_shift(t0 + t1 - t3, bd), bd);
}
1520 
// 1-D 8-point inverse ADST, high-bitdepth variant: permuted inputs run
// through three butterfly/rotation stages, with sign flips applied on the
// odd outputs in the final write-out.
void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

  // Input permutation specific to the inverse ADST.
  tran_low_t x0 = input[7];
  tran_low_t x1 = input[0];
  tran_low_t x2 = input[5];
  tran_low_t x3 = input[2];
  tran_low_t x4 = input[3];
  tran_low_t x5 = input[4];
  tran_low_t x6 = input[1];
  tran_low_t x7 = input[6];
  (void) bd;  // NOTE: bd may be unused depending on how WRAPLOW is configured.

  // All-zero input produces all-zero output; skip the arithmetic.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    memset(output, 0, 8 * sizeof(*output));
    return;
  }

  // stage 1
  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;

  x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s4, bd), bd);
  x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s5, bd), bd);
  x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s6, bd), bd);
  x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s7, bd), bd);
  x4 = WRAPLOW(highbd_dct_const_round_shift(s0 - s4, bd), bd);
  x5 = WRAPLOW(highbd_dct_const_round_shift(s1 - s5, bd), bd);
  x6 = WRAPLOW(highbd_dct_const_round_shift(s2 - s6, bd), bd);
  x7 = WRAPLOW(highbd_dct_const_round_shift(s3 - s7, bd), bd);

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;
  s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;
  s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
  s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;

  x0 = WRAPLOW(s0 + s2, bd);
  x1 = WRAPLOW(s1 + s3, bd);
  x2 = WRAPLOW(s0 - s2, bd);
  x3 = WRAPLOW(s1 - s3, bd);
  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
  x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
  x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);

  // stage 3
  s2 = cospi_16_64 * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (x6 - x7);

  x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
  x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
  x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
  x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);

  // Final output permutation with sign flips on odd positions.
  output[0] = WRAPLOW(x0, bd);
  output[1] = WRAPLOW(-x4, bd);
  output[2] = WRAPLOW(x6, bd);
  output[3] = WRAPLOW(-x2, bd);
  output[4] = WRAPLOW(x3, bd);
  output[5] = WRAPLOW(-x7, bd);
  output[6] = WRAPLOW(x5, bd);
  output[7] = WRAPLOW(-x1, bd);
}
1597 
void vpx_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int bd) {
  // High bitdepth 8x8 inverse DCT for the sparse case: all non-zero
  // coefficients lie in the first 4 rows, so the row pass only needs to
  // transform rows 0..3 (rows 4..7 of |intermediate| stay zero).
  int r, c;
  tran_low_t intermediate[8 * 8] = { 0 };
  tran_low_t col_in[8], col_out[8];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // Row pass: one 1-D transform per (possibly) non-zero coefficient row.
  for (r = 0; r < 4; ++r) {
    vpx_highbd_idct8_c(input + 8 * r, intermediate + 8 * r, bd);
  }

  // Column pass: gather a column, transform it, then round by 1/32 and
  // add into the destination with bitdepth-aware clipping.
  for (c = 0; c < 8; ++c) {
    for (r = 0; r < 8; ++r) col_in[r] = intermediate[r * 8 + c];
    vpx_highbd_idct8_c(col_in, col_out, bd);
    for (r = 0; r < 8; ++r) {
      dest[r * stride + c] = highbd_clip_pixel_add(
          dest[r * stride + c], ROUND_POWER_OF_TWO(col_out[r], 5), bd);
    }
  }
}
1624 
void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
  /* 1-D 16-point high bitdepth inverse DCT butterfly (7 stages).
   * Reads 16 coefficients from |input| and writes 16 samples to |output|.
   * Products of tran_low_t values and cospi_* constants are accumulated in
   * tran_high_t, then brought back into range with
   * highbd_dct_const_round_shift() + WRAPLOW() after each multiply stage.
   */
  tran_low_t step1[16], step2[16];
  tran_high_t temp1, temp2;
  // NOTE(review): |bd| is referenced only through the WRAPLOW /
  // highbd_dct_const_round_shift macros; presumably those ignore it in some
  // build configurations, so this cast silences unused-parameter warnings.
  (void) bd;

  // stage 1: load inputs in butterfly order. Indices are written as n/2,
  // apparently mirroring the 32-point table this ordering was derived
  // from — the effective indices are 0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15.
  step1[0] = input[0/2];
  step1[1] = input[16/2];
  step1[2] = input[8/2];
  step1[3] = input[24/2];
  step1[4] = input[4/2];
  step1[5] = input[20/2];
  step1[6] = input[12/2];
  step1[7] = input[28/2];
  step1[8] = input[2/2];
  step1[9] = input[18/2];
  step1[10] = input[10/2];
  step1[11] = input[26/2];
  step1[12] = input[6/2];
  step1[13] = input[22/2];
  step1[14] = input[14/2];
  step1[15] = input[30/2];

  // stage 2: pass 0..7 through; rotate the four odd-half pairs
  // (8,15) (9,14) (10,13) (11,12) by their cospi constants.
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  // stage 3: rotate (4,7) and (5,6); butterfly add/sub on 8..15.
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  step1[8] = WRAPLOW(step2[8] + step2[9], bd);
  step1[9] = WRAPLOW(step2[8] - step2[9], bd);
  step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
  step1[11] = WRAPLOW(step2[10] + step2[11], bd);
  step1[12] = WRAPLOW(step2[12] + step2[13], bd);
  step1[13] = WRAPLOW(step2[12] - step2[13], bd);
  step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
  step1[15] = WRAPLOW(step2[14] + step2[15], bd);

  // stage 4: DC/Nyquist rotation on (0,1), rotation on (2,3);
  // butterflies on 4..7 and rotations on (9,14), (10,13).
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
  step2[7] = WRAPLOW(step1[6] + step1[7], bd);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5: butterflies on 0..3 and 8..15; equalizing rotation on (5,6).
  step1[0] = WRAPLOW(step2[0] + step2[3], bd);
  step1[1] = WRAPLOW(step2[1] + step2[2], bd);
  step1[2] = WRAPLOW(step2[1] - step2[2], bd);
  step1[3] = WRAPLOW(step2[0] - step2[3], bd);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step1[7] = step2[7];

  step1[8] = WRAPLOW(step2[8] + step2[11], bd);
  step1[9] = WRAPLOW(step2[9] + step2[10], bd);
  step1[10] = WRAPLOW(step2[9] - step2[10], bd);
  step1[11] = WRAPLOW(step2[8] - step2[11], bd);
  step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
  step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
  step1[14] = WRAPLOW(step2[13] + step2[14], bd);
  step1[15] = WRAPLOW(step2[12] + step2[15], bd);

  // stage 6: butterflies on 0..7; equalizing rotations on (10,13), (11,12).
  step2[0] = WRAPLOW(step1[0] + step1[7], bd);
  step2[1] = WRAPLOW(step1[1] + step1[6], bd);
  step2[2] = WRAPLOW(step1[2] + step1[5], bd);
  step2[3] = WRAPLOW(step1[3] + step1[4], bd);
  step2[4] = WRAPLOW(step1[3] - step1[4], bd);
  step2[5] = WRAPLOW(step1[2] - step1[5], bd);
  step2[6] = WRAPLOW(step1[1] - step1[6], bd);
  step2[7] = WRAPLOW(step1[0] - step1[7], bd);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7: final butterfly pairing k with 15-k produces the output.
  output[0] = WRAPLOW(step2[0] + step2[15], bd);
  output[1] = WRAPLOW(step2[1] + step2[14], bd);
  output[2] = WRAPLOW(step2[2] + step2[13], bd);
  output[3] = WRAPLOW(step2[3] + step2[12], bd);
  output[4] = WRAPLOW(step2[4] + step2[11], bd);
  output[5] = WRAPLOW(step2[5] + step2[10], bd);
  output[6] = WRAPLOW(step2[6] + step2[9], bd);
  output[7] = WRAPLOW(step2[7] + step2[8], bd);
  output[8] = WRAPLOW(step2[7] - step2[8], bd);
  output[9] = WRAPLOW(step2[6] - step2[9], bd);
  output[10] = WRAPLOW(step2[5] - step2[10], bd);
  output[11] = WRAPLOW(step2[4] - step2[11], bd);
  output[12] = WRAPLOW(step2[3] - step2[12], bd);
  output[13] = WRAPLOW(step2[2] - step2[13], bd);
  output[14] = WRAPLOW(step2[1] - step2[14], bd);
  output[15] = WRAPLOW(step2[0] - step2[15], bd);
}
1790 
void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
                                    int stride, int bd) {
  // Full 16x16 high bitdepth inverse DCT (dense coefficient case),
  // reconstructed as 16 row transforms followed by 16 column transforms,
  // with the result rounded (1/64) and clip-added into the destination.
  int row, col;
  tran_low_t buffer[16 * 16];
  tran_low_t col_in[16], col_out[16];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // Row pass.
  for (row = 0; row < 16; ++row) {
    vpx_highbd_idct16_c(input + 16 * row, buffer + 16 * row, bd);
  }

  // Column pass.
  for (col = 0; col < 16; ++col) {
    for (row = 0; row < 16; ++row) col_in[row] = buffer[row * 16 + col];
    vpx_highbd_idct16_c(col_in, col_out, bd);
    for (row = 0; row < 16; ++row) {
      dest[row * stride + col] = highbd_clip_pixel_add(
          dest[row * stride + col], ROUND_POWER_OF_TWO(col_out[row], 6), bd);
    }
  }
}
1817 
void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
  /* 1-D 16-point high bitdepth inverse ADST (asymmetric DST).
   * Reads 16 coefficients from |input| in the permuted order below and
   * writes 16 samples to |output|. Products with cospi_* constants are
   * accumulated in tran_high_t and folded back into range with
   * highbd_dct_const_round_shift() + WRAPLOW().
   */
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;

  // Input permutation: even x-slots take inputs from the tail downward,
  // odd x-slots from the head upward.
  tran_low_t x0 = input[15];
  tran_low_t x1 = input[0];
  tran_low_t x2 = input[13];
  tran_low_t x3 = input[2];
  tran_low_t x4 = input[11];
  tran_low_t x5 = input[4];
  tran_low_t x6 = input[9];
  tran_low_t x7 = input[6];
  tran_low_t x8 = input[7];
  tran_low_t x9 = input[8];
  tran_low_t x10 = input[5];
  tran_low_t x11 = input[10];
  tran_low_t x12 = input[3];
  tran_low_t x13 = input[12];
  tran_low_t x14 = input[1];
  tran_low_t x15 = input[14];
  // NOTE(review): |bd| is referenced only through the WRAPLOW /
  // highbd_dct_const_round_shift macros; presumably those ignore it in some
  // build configurations, so this cast silences unused-parameter warnings.
  (void) bd;

  // Fast path: an all-zero input transforms to an all-zero output.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
    memset(output, 0, 16 * sizeof(*output));
    return;
  }

  // stage 1: rotate the eight input pairs by odd cospi angles.
  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;

  // Cross-combine halves (k with k+8), rounding each sum/difference.
  x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s8, bd), bd);
  x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s9, bd), bd);
  x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s10, bd), bd);
  x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s11, bd), bd);
  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s12, bd), bd);
  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s13, bd), bd);
  x6 = WRAPLOW(highbd_dct_const_round_shift(s6 + s14, bd), bd);
  x7 = WRAPLOW(highbd_dct_const_round_shift(s7 + s15, bd), bd);
  x8  = WRAPLOW(highbd_dct_const_round_shift(s0 - s8, bd), bd);
  x9  = WRAPLOW(highbd_dct_const_round_shift(s1 - s9, bd), bd);
  x10 = WRAPLOW(highbd_dct_const_round_shift(s2 - s10, bd), bd);
  x11 = WRAPLOW(highbd_dct_const_round_shift(s3 - s11, bd), bd);
  x12 = WRAPLOW(highbd_dct_const_round_shift(s4 - s12, bd), bd);
  x13 = WRAPLOW(highbd_dct_const_round_shift(s5 - s13, bd), bd);
  x14 = WRAPLOW(highbd_dct_const_round_shift(s6 - s14, bd), bd);
  x15 = WRAPLOW(highbd_dct_const_round_shift(s7 - s15, bd), bd);

  // stage 2: pass 0..7 through; rotate pairs in 8..15 by cospi_4/12/20/28.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;

  // Quarter-width butterflies: 0..7 need no rounding (no multiply above),
  // 8..15 carry products and are round-shifted.
  x0 = WRAPLOW(s0 + s4, bd);
  x1 = WRAPLOW(s1 + s5, bd);
  x2 = WRAPLOW(s2 + s6, bd);
  x3 = WRAPLOW(s3 + s7, bd);
  x4 = WRAPLOW(s0 - s4, bd);
  x5 = WRAPLOW(s1 - s5, bd);
  x6 = WRAPLOW(s2 - s6, bd);
  x7 = WRAPLOW(s3 - s7, bd);
  x8 = WRAPLOW(highbd_dct_const_round_shift(s8 + s12, bd), bd);
  x9 = WRAPLOW(highbd_dct_const_round_shift(s9 + s13, bd), bd);
  x10 = WRAPLOW(highbd_dct_const_round_shift(s10 + s14, bd), bd);
  x11 = WRAPLOW(highbd_dct_const_round_shift(s11 + s15, bd), bd);
  x12 = WRAPLOW(highbd_dct_const_round_shift(s8 - s12, bd), bd);
  x13 = WRAPLOW(highbd_dct_const_round_shift(s9 - s13, bd), bd);
  x14 = WRAPLOW(highbd_dct_const_round_shift(s10 - s14, bd), bd);
  x15 = WRAPLOW(highbd_dct_const_round_shift(s11 - s15, bd), bd);

  // stage 3: cospi_8/24 rotations on (4,5), (6,7), (12,13), (14,15).
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;

  x0 = WRAPLOW(s0 + s2, bd);
  x1 = WRAPLOW(s1 + s3, bd);
  x2 = WRAPLOW(s0 - s2, bd);
  x3 = WRAPLOW(s1 - s3, bd);
  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
  x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
  x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
  x8 = WRAPLOW(s8 + s10, bd);
  x9 = WRAPLOW(s9 + s11, bd);
  x10 = WRAPLOW(s8 - s10, bd);
  x11 = WRAPLOW(s9 - s11, bd);
  x12 = WRAPLOW(highbd_dct_const_round_shift(s12 + s14, bd), bd);
  x13 = WRAPLOW(highbd_dct_const_round_shift(s13 + s15, bd), bd);
  x14 = WRAPLOW(highbd_dct_const_round_shift(s12 - s14, bd), bd);
  x15 = WRAPLOW(highbd_dct_const_round_shift(s13 - s15, bd), bd);

  // stage 4: final +/-45-degree (cospi_16) rotations; note the negated
  // forms on s2/s14 — the sign pattern matters for the output mapping.
  s2 = (- cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (-x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (-x10 + x11);
  s14 = (- cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);

  x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
  x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
  x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
  x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
  x10 = WRAPLOW(highbd_dct_const_round_shift(s10, bd), bd);
  x11 = WRAPLOW(highbd_dct_const_round_shift(s11, bd), bd);
  x14 = WRAPLOW(highbd_dct_const_round_shift(s14, bd), bd);
  x15 = WRAPLOW(highbd_dct_const_round_shift(s15, bd), bd);

  // Output permutation with sign flips on slots 1, 3, 13 and 15.
  output[0] = WRAPLOW(x0, bd);
  output[1] = WRAPLOW(-x8, bd);
  output[2] = WRAPLOW(x12, bd);
  output[3] = WRAPLOW(-x4, bd);
  output[4] = WRAPLOW(x6, bd);
  output[5] = WRAPLOW(x14, bd);
  output[6] = WRAPLOW(x10, bd);
  output[7] = WRAPLOW(x2, bd);
  output[8] = WRAPLOW(x3, bd);
  output[9] = WRAPLOW(x11, bd);
  output[10] = WRAPLOW(x15, bd);
  output[11] = WRAPLOW(x7, bd);
  output[12] = WRAPLOW(x5, bd);
  output[13] = WRAPLOW(-x13, bd);
  output[14] = WRAPLOW(x9, bd);
  output[15] = WRAPLOW(-x1, bd);
}
1987 
void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
                                   int stride, int bd) {
  // 16x16 high bitdepth inverse DCT for the sparse case: all non-zero
  // coefficients live in the upper-left 4x4 corner, so the row pass only
  // needs to transform rows 0..3 (rows 4..15 of |buffer| stay zero).
  int row, col;
  tran_low_t buffer[16 * 16] = { 0 };
  tran_low_t col_in[16], col_out[16];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // Row pass over the first four rows only.
  for (row = 0; row < 4; ++row) {
    vpx_highbd_idct16_c(input + 16 * row, buffer + 16 * row, bd);
  }

  // Column pass over all 16 columns: transform, round by 1/64, clip-add.
  for (col = 0; col < 16; ++col) {
    for (row = 0; row < 16; ++row) col_in[row] = buffer[row * 16 + col];
    vpx_highbd_idct16_c(col_in, col_out, bd);
    for (row = 0; row < 16; ++row) {
      dest[row * stride + col] = highbd_clip_pixel_add(
          dest[row * stride + col], ROUND_POWER_OF_TWO(col_out[row], 6), bd);
    }
  }
}
2015 
void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
                                  int stride, int bd) {
  // DC-only 16x16 inverse: the single DC coefficient is scaled by
  // cospi_16_64 twice (once per 1-D pass), rounded by 1/64, and the
  // resulting constant is clip-added to every pixel of the block.
  int r, c;
  tran_high_t a1;
  tran_low_t out = WRAPLOW(
      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
  a1 = ROUND_POWER_OF_TWO(out, 6);
  for (r = 0; r < 16; ++r) {
    for (c = 0; c < 16; ++c) {
      dest[c] = highbd_clip_pixel_add(dest[c], a1, bd);
    }
    dest += stride;
  }
}
2032 
highbd_idct32_c(const tran_low_t * input,tran_low_t * output,int bd)2033 static void highbd_idct32_c(const tran_low_t *input,
2034                             tran_low_t *output, int bd) {
2035   tran_low_t step1[32], step2[32];
2036   tran_high_t temp1, temp2;
2037   (void) bd;
2038 
2039   // stage 1
2040   step1[0] = input[0];
2041   step1[1] = input[16];
2042   step1[2] = input[8];
2043   step1[3] = input[24];
2044   step1[4] = input[4];
2045   step1[5] = input[20];
2046   step1[6] = input[12];
2047   step1[7] = input[28];
2048   step1[8] = input[2];
2049   step1[9] = input[18];
2050   step1[10] = input[10];
2051   step1[11] = input[26];
2052   step1[12] = input[6];
2053   step1[13] = input[22];
2054   step1[14] = input[14];
2055   step1[15] = input[30];
2056 
2057   temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
2058   temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
2059   step1[16] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2060   step1[31] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2061 
2062   temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
2063   temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
2064   step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2065   step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2066 
2067   temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
2068   temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
2069   step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2070   step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2071 
2072   temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
2073   temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
2074   step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2075   step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2076 
2077   temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
2078   temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
2079   step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2080   step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2081 
2082   temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
2083   temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
2084   step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2085   step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2086 
2087   temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
2088   temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
2089   step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2090   step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2091 
2092   temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
2093   temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
2094   step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2095   step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2096 
2097   // stage 2
2098   step2[0] = step1[0];
2099   step2[1] = step1[1];
2100   step2[2] = step1[2];
2101   step2[3] = step1[3];
2102   step2[4] = step1[4];
2103   step2[5] = step1[5];
2104   step2[6] = step1[6];
2105   step2[7] = step1[7];
2106 
2107   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
2108   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
2109   step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2110   step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2111 
2112   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
2113   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
2114   step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2115   step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2116 
2117   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
2118   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
2119   step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2120   step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2121 
2122   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
2123   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
2124   step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2125   step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2126 
2127   step2[16] = WRAPLOW(step1[16] + step1[17], bd);
2128   step2[17] = WRAPLOW(step1[16] - step1[17], bd);
2129   step2[18] = WRAPLOW(-step1[18] + step1[19], bd);
2130   step2[19] = WRAPLOW(step1[18] + step1[19], bd);
2131   step2[20] = WRAPLOW(step1[20] + step1[21], bd);
2132   step2[21] = WRAPLOW(step1[20] - step1[21], bd);
2133   step2[22] = WRAPLOW(-step1[22] + step1[23], bd);
2134   step2[23] = WRAPLOW(step1[22] + step1[23], bd);
2135   step2[24] = WRAPLOW(step1[24] + step1[25], bd);
2136   step2[25] = WRAPLOW(step1[24] - step1[25], bd);
2137   step2[26] = WRAPLOW(-step1[26] + step1[27], bd);
2138   step2[27] = WRAPLOW(step1[26] + step1[27], bd);
2139   step2[28] = WRAPLOW(step1[28] + step1[29], bd);
2140   step2[29] = WRAPLOW(step1[28] - step1[29], bd);
2141   step2[30] = WRAPLOW(-step1[30] + step1[31], bd);
2142   step2[31] = WRAPLOW(step1[30] + step1[31], bd);
2143 
2144   // stage 3
2145   step1[0] = step2[0];
2146   step1[1] = step2[1];
2147   step1[2] = step2[2];
2148   step1[3] = step2[3];
2149 
2150   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
2151   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
2152   step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2153   step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2154   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
2155   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
2156   step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2157   step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2158 
2159   step1[8] = WRAPLOW(step2[8] + step2[9], bd);
2160   step1[9] = WRAPLOW(step2[8] - step2[9], bd);
2161   step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
2162   step1[11] = WRAPLOW(step2[10] + step2[11], bd);
2163   step1[12] = WRAPLOW(step2[12] + step2[13], bd);
2164   step1[13] = WRAPLOW(step2[12] - step2[13], bd);
2165   step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
2166   step1[15] = WRAPLOW(step2[14] + step2[15], bd);
2167 
2168   step1[16] = step2[16];
2169   step1[31] = step2[31];
2170   temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
2171   temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
2172   step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2173   step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2174   temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
2175   temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
2176   step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2177   step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2178   step1[19] = step2[19];
2179   step1[20] = step2[20];
2180   temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
2181   temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
2182   step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2183   step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2184   temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
2185   temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
2186   step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2187   step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2188   step1[23] = step2[23];
2189   step1[24] = step2[24];
2190   step1[27] = step2[27];
2191   step1[28] = step2[28];
2192 
2193   // stage 4
2194   temp1 = (step1[0] + step1[1]) * cospi_16_64;
2195   temp2 = (step1[0] - step1[1]) * cospi_16_64;
2196   step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2197   step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2198   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
2199   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
2200   step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2201   step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2202   step2[4] = WRAPLOW(step1[4] + step1[5], bd);
2203   step2[5] = WRAPLOW(step1[4] - step1[5], bd);
2204   step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
2205   step2[7] = WRAPLOW(step1[6] + step1[7], bd);
2206 
2207   step2[8] = step1[8];
2208   step2[15] = step1[15];
2209   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
2210   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
2211   step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2212   step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2213   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
2214   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
2215   step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2216   step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2217   step2[11] = step1[11];
2218   step2[12] = step1[12];
2219 
2220   step2[16] = WRAPLOW(step1[16] + step1[19], bd);
2221   step2[17] = WRAPLOW(step1[17] + step1[18], bd);
2222   step2[18] = WRAPLOW(step1[17] - step1[18], bd);
2223   step2[19] = WRAPLOW(step1[16] - step1[19], bd);
2224   step2[20] = WRAPLOW(-step1[20] + step1[23], bd);
2225   step2[21] = WRAPLOW(-step1[21] + step1[22], bd);
2226   step2[22] = WRAPLOW(step1[21] + step1[22], bd);
2227   step2[23] = WRAPLOW(step1[20] + step1[23], bd);
2228 
2229   step2[24] = WRAPLOW(step1[24] + step1[27], bd);
2230   step2[25] = WRAPLOW(step1[25] + step1[26], bd);
2231   step2[26] = WRAPLOW(step1[25] - step1[26], bd);
2232   step2[27] = WRAPLOW(step1[24] - step1[27], bd);
2233   step2[28] = WRAPLOW(-step1[28] + step1[31], bd);
2234   step2[29] = WRAPLOW(-step1[29] + step1[30], bd);
2235   step2[30] = WRAPLOW(step1[29] + step1[30], bd);
2236   step2[31] = WRAPLOW(step1[28] + step1[31], bd);
2237 
2238   // stage 5
2239   step1[0] = WRAPLOW(step2[0] + step2[3], bd);
2240   step1[1] = WRAPLOW(step2[1] + step2[2], bd);
2241   step1[2] = WRAPLOW(step2[1] - step2[2], bd);
2242   step1[3] = WRAPLOW(step2[0] - step2[3], bd);
2243   step1[4] = step2[4];
2244   temp1 = (step2[6] - step2[5]) * cospi_16_64;
2245   temp2 = (step2[5] + step2[6]) * cospi_16_64;
2246   step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2247   step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2248   step1[7] = step2[7];
2249 
2250   step1[8] = WRAPLOW(step2[8] + step2[11], bd);
2251   step1[9] = WRAPLOW(step2[9] + step2[10], bd);
2252   step1[10] = WRAPLOW(step2[9] - step2[10], bd);
2253   step1[11] = WRAPLOW(step2[8] - step2[11], bd);
2254   step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
2255   step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
2256   step1[14] = WRAPLOW(step2[13] + step2[14], bd);
2257   step1[15] = WRAPLOW(step2[12] + step2[15], bd);
2258 
2259   step1[16] = step2[16];
2260   step1[17] = step2[17];
2261   temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
2262   temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
2263   step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2264   step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2265   temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
2266   temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
2267   step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2268   step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2269   temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
2270   temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
2271   step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2272   step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2273   temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
2274   temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
2275   step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2276   step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2277   step1[22] = step2[22];
2278   step1[23] = step2[23];
2279   step1[24] = step2[24];
2280   step1[25] = step2[25];
2281   step1[30] = step2[30];
2282   step1[31] = step2[31];
2283 
2284   // stage 6
2285   step2[0] = WRAPLOW(step1[0] + step1[7], bd);
2286   step2[1] = WRAPLOW(step1[1] + step1[6], bd);
2287   step2[2] = WRAPLOW(step1[2] + step1[5], bd);
2288   step2[3] = WRAPLOW(step1[3] + step1[4], bd);
2289   step2[4] = WRAPLOW(step1[3] - step1[4], bd);
2290   step2[5] = WRAPLOW(step1[2] - step1[5], bd);
2291   step2[6] = WRAPLOW(step1[1] - step1[6], bd);
2292   step2[7] = WRAPLOW(step1[0] - step1[7], bd);
2293   step2[8] = step1[8];
2294   step2[9] = step1[9];
2295   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
2296   temp2 = (step1[10] + step1[13]) * cospi_16_64;
2297   step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2298   step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2299   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
2300   temp2 = (step1[11] + step1[12]) * cospi_16_64;
2301   step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2302   step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2303   step2[14] = step1[14];
2304   step2[15] = step1[15];
2305 
2306   step2[16] = WRAPLOW(step1[16] + step1[23], bd);
2307   step2[17] = WRAPLOW(step1[17] + step1[22], bd);
2308   step2[18] = WRAPLOW(step1[18] + step1[21], bd);
2309   step2[19] = WRAPLOW(step1[19] + step1[20], bd);
2310   step2[20] = WRAPLOW(step1[19] - step1[20], bd);
2311   step2[21] = WRAPLOW(step1[18] - step1[21], bd);
2312   step2[22] = WRAPLOW(step1[17] - step1[22], bd);
2313   step2[23] = WRAPLOW(step1[16] - step1[23], bd);
2314 
2315   step2[24] = WRAPLOW(-step1[24] + step1[31], bd);
2316   step2[25] = WRAPLOW(-step1[25] + step1[30], bd);
2317   step2[26] = WRAPLOW(-step1[26] + step1[29], bd);
2318   step2[27] = WRAPLOW(-step1[27] + step1[28], bd);
2319   step2[28] = WRAPLOW(step1[27] + step1[28], bd);
2320   step2[29] = WRAPLOW(step1[26] + step1[29], bd);
2321   step2[30] = WRAPLOW(step1[25] + step1[30], bd);
2322   step2[31] = WRAPLOW(step1[24] + step1[31], bd);
2323 
2324   // stage 7
2325   step1[0] = WRAPLOW(step2[0] + step2[15], bd);
2326   step1[1] = WRAPLOW(step2[1] + step2[14], bd);
2327   step1[2] = WRAPLOW(step2[2] + step2[13], bd);
2328   step1[3] = WRAPLOW(step2[3] + step2[12], bd);
2329   step1[4] = WRAPLOW(step2[4] + step2[11], bd);
2330   step1[5] = WRAPLOW(step2[5] + step2[10], bd);
2331   step1[6] = WRAPLOW(step2[6] + step2[9], bd);
2332   step1[7] = WRAPLOW(step2[7] + step2[8], bd);
2333   step1[8] = WRAPLOW(step2[7] - step2[8], bd);
2334   step1[9] = WRAPLOW(step2[6] - step2[9], bd);
2335   step1[10] = WRAPLOW(step2[5] - step2[10], bd);
2336   step1[11] = WRAPLOW(step2[4] - step2[11], bd);
2337   step1[12] = WRAPLOW(step2[3] - step2[12], bd);
2338   step1[13] = WRAPLOW(step2[2] - step2[13], bd);
2339   step1[14] = WRAPLOW(step2[1] - step2[14], bd);
2340   step1[15] = WRAPLOW(step2[0] - step2[15], bd);
2341 
2342   step1[16] = step2[16];
2343   step1[17] = step2[17];
2344   step1[18] = step2[18];
2345   step1[19] = step2[19];
2346   temp1 = (-step2[20] + step2[27]) * cospi_16_64;
2347   temp2 = (step2[20] + step2[27]) * cospi_16_64;
2348   step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2349   step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2350   temp1 = (-step2[21] + step2[26]) * cospi_16_64;
2351   temp2 = (step2[21] + step2[26]) * cospi_16_64;
2352   step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2353   step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2354   temp1 = (-step2[22] + step2[25]) * cospi_16_64;
2355   temp2 = (step2[22] + step2[25]) * cospi_16_64;
2356   step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2357   step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2358   temp1 = (-step2[23] + step2[24]) * cospi_16_64;
2359   temp2 = (step2[23] + step2[24]) * cospi_16_64;
2360   step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2361   step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2362   step1[28] = step2[28];
2363   step1[29] = step2[29];
2364   step1[30] = step2[30];
2365   step1[31] = step2[31];
2366 
2367   // final stage
2368   output[0] = WRAPLOW(step1[0] + step1[31], bd);
2369   output[1] = WRAPLOW(step1[1] + step1[30], bd);
2370   output[2] = WRAPLOW(step1[2] + step1[29], bd);
2371   output[3] = WRAPLOW(step1[3] + step1[28], bd);
2372   output[4] = WRAPLOW(step1[4] + step1[27], bd);
2373   output[5] = WRAPLOW(step1[5] + step1[26], bd);
2374   output[6] = WRAPLOW(step1[6] + step1[25], bd);
2375   output[7] = WRAPLOW(step1[7] + step1[24], bd);
2376   output[8] = WRAPLOW(step1[8] + step1[23], bd);
2377   output[9] = WRAPLOW(step1[9] + step1[22], bd);
2378   output[10] = WRAPLOW(step1[10] + step1[21], bd);
2379   output[11] = WRAPLOW(step1[11] + step1[20], bd);
2380   output[12] = WRAPLOW(step1[12] + step1[19], bd);
2381   output[13] = WRAPLOW(step1[13] + step1[18], bd);
2382   output[14] = WRAPLOW(step1[14] + step1[17], bd);
2383   output[15] = WRAPLOW(step1[15] + step1[16], bd);
2384   output[16] = WRAPLOW(step1[15] - step1[16], bd);
2385   output[17] = WRAPLOW(step1[14] - step1[17], bd);
2386   output[18] = WRAPLOW(step1[13] - step1[18], bd);
2387   output[19] = WRAPLOW(step1[12] - step1[19], bd);
2388   output[20] = WRAPLOW(step1[11] - step1[20], bd);
2389   output[21] = WRAPLOW(step1[10] - step1[21], bd);
2390   output[22] = WRAPLOW(step1[9] - step1[22], bd);
2391   output[23] = WRAPLOW(step1[8] - step1[23], bd);
2392   output[24] = WRAPLOW(step1[7] - step1[24], bd);
2393   output[25] = WRAPLOW(step1[6] - step1[25], bd);
2394   output[26] = WRAPLOW(step1[5] - step1[26], bd);
2395   output[27] = WRAPLOW(step1[4] - step1[27], bd);
2396   output[28] = WRAPLOW(step1[3] - step1[28], bd);
2397   output[29] = WRAPLOW(step1[2] - step1[29], bd);
2398   output[30] = WRAPLOW(step1[1] - step1[30], bd);
2399   output[31] = WRAPLOW(step1[0] - step1[31], bd);
2400 }
2401 
void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
                                     int stride, int bd) {
  tran_low_t out[32 * 32];
  tran_low_t *row = out;
  int r, c;
  tran_low_t col_in[32], col_out[32];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // First pass: 1-D inverse transform on each of the 32 rows.
  for (r = 0; r < 32; ++r) {
    // OR every coefficient of the row together; the result is non-zero
    // exactly when the row has at least one non-zero coefficient.
    tran_low_t any_nonzero = 0;
    for (c = 0; c < 32; ++c) any_nonzero |= input[c];

    if (any_nonzero)
      highbd_idct32_c(input, row, bd);
    else
      memset(row, 0, sizeof(tran_low_t) * 32);  // all-zero row: skip transform
    input += 32;
    row += 32;
  }

  // Second pass: 1-D inverse transform on each column, then round by 6 bits
  // (the total transform scaling) and add to the destination with clipping.
  for (c = 0; c < 32; ++c) {
    for (r = 0; r < 32; ++r) col_in[r] = out[r * 32 + c];
    highbd_idct32_c(col_in, col_out, bd);
    for (r = 0; r < 32; ++r) {
      dest[r * stride + c] = highbd_clip_pixel_add(
          dest[r * stride + c], ROUND_POWER_OF_TWO(col_out[r], 6), bd);
    }
  }
}
2441 
void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
                                   int stride, int bd) {
  tran_low_t out[32 * 32] = {0};
  tran_low_t *row = out;
  int r, c;
  tran_low_t col_in[32], col_out[32];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // First pass (rows): with at most 34 coefficients, only the upper-left
  // 8x8 region can be non-zero, so just the first 8 rows are transformed.
  // The remaining rows of |out| stay zero from the aggregate initializer.
  for (r = 0; r < 8; ++r) {
    highbd_idct32_c(input, row, bd);
    input += 32;
    row += 32;
  }

  // Second pass (columns): transform every column, round by 6 bits and
  // accumulate into the destination with clipping.
  for (c = 0; c < 32; ++c) {
    for (r = 0; r < 32; ++r) col_in[r] = out[r * 32 + c];
    highbd_idct32_c(col_in, col_out, bd);
    for (r = 0; r < 32; ++r) {
      dest[r * stride + c] = highbd_clip_pixel_add(
          dest[r * stride + c], ROUND_POWER_OF_TWO(col_out[r], 6), bd);
    }
  }
}
2468 
void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
                                  int stride, int bd) {
  int row, col;
  int a1;
  tran_low_t dc;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // DC-only case: the 2-D inverse transform collapses to scaling the single
  // DC coefficient by cospi_16_64 twice (once per 1-D pass), then rounding
  // by the 6-bit transform scale.
  dc = WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
  dc = WRAPLOW(highbd_dct_const_round_shift(dc * cospi_16_64, bd), bd);
  a1 = ROUND_POWER_OF_TWO(dc, 6);

  // Add the constant to every pixel of the 32x32 block with clipping.
  for (row = 0; row < 32; ++row, dest += stride) {
    for (col = 0; col < 32; ++col)
      dest[col] = highbd_clip_pixel_add(dest[col], a1, bd);
  }
}
2486 #endif  // CONFIG_VP9_HIGHBITDEPTH
2487