1 /*
2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <assert.h>
12 #include <math.h>
13 
14 #include "./vpx_config.h"
15 #include "./vp9_rtcd.h"
16 
17 #include "vp9/common/vp9_blockd.h"
18 #include "vp9/common/vp9_idct.h"
19 #include "vp9/common/vp9_systemdependent.h"
20 
fdct_round_shift(int input)21 static INLINE int fdct_round_shift(int input) {
22   int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
23   assert(INT16_MIN <= rv && rv <= INT16_MAX);
24   return rv;
25 }
26 
fdct4(const int16_t * input,int16_t * output)27 static void fdct4(const int16_t *input, int16_t *output) {
28   int16_t step[4];
29   int temp1, temp2;
30 
31   step[0] = input[0] + input[3];
32   step[1] = input[1] + input[2];
33   step[2] = input[1] - input[2];
34   step[3] = input[0] - input[3];
35 
36   temp1 = (step[0] + step[1]) * cospi_16_64;
37   temp2 = (step[0] - step[1]) * cospi_16_64;
38   output[0] = fdct_round_shift(temp1);
39   output[2] = fdct_round_shift(temp2);
40   temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
41   temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
42   output[1] = fdct_round_shift(temp1);
43   output[3] = fdct_round_shift(temp2);
44 }
45 
vp9_fdct4x4_c(const int16_t * input,int16_t * output,int stride)46 void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) {
47   // The 2D transform is done with two passes which are actually pretty
48   // similar. In the first one, we transform the columns and transpose
49   // the results. In the second one, we transform the rows. To achieve that,
50   // as the first pass results are transposed, we transpose the columns (that
51   // is the transposed rows) and transpose the results (so that it goes back
52   // in normal/row positions).
53   int pass;
54   // We need an intermediate buffer between passes.
55   int16_t intermediate[4 * 4];
56   const int16_t *in = input;
57   int16_t *out = intermediate;
58   // Do the two transform/transpose passes
59   for (pass = 0; pass < 2; ++pass) {
60     /*canbe16*/ int input[4];
61     /*canbe16*/ int step[4];
62     /*needs32*/ int temp1, temp2;
63     int i;
64     for (i = 0; i < 4; ++i) {
65       // Load inputs.
66       if (0 == pass) {
67         input[0] = in[0 * stride] * 16;
68         input[1] = in[1 * stride] * 16;
69         input[2] = in[2 * stride] * 16;
70         input[3] = in[3 * stride] * 16;
71         if (i == 0 && input[0]) {
72           input[0] += 1;
73         }
74       } else {
75         input[0] = in[0 * 4];
76         input[1] = in[1 * 4];
77         input[2] = in[2 * 4];
78         input[3] = in[3 * 4];
79       }
80       // Transform.
81       step[0] = input[0] + input[3];
82       step[1] = input[1] + input[2];
83       step[2] = input[1] - input[2];
84       step[3] = input[0] - input[3];
85       temp1 = (step[0] + step[1]) * cospi_16_64;
86       temp2 = (step[0] - step[1]) * cospi_16_64;
87       out[0] = fdct_round_shift(temp1);
88       out[2] = fdct_round_shift(temp2);
89       temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
90       temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
91       out[1] = fdct_round_shift(temp1);
92       out[3] = fdct_round_shift(temp2);
93       // Do next column (which is a transposed row in second/horizontal pass)
94       in++;
95       out += 4;
96     }
97     // Setup in/out for next pass.
98     in = intermediate;
99     out = output;
100   }
101 
102   {
103     int i, j;
104     for (i = 0; i < 4; ++i) {
105       for (j = 0; j < 4; ++j)
106         output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
107     }
108   }
109 }
110 
fadst4(const int16_t * input,int16_t * output)111 static void fadst4(const int16_t *input, int16_t *output) {
112   int x0, x1, x2, x3;
113   int s0, s1, s2, s3, s4, s5, s6, s7;
114 
115   x0 = input[0];
116   x1 = input[1];
117   x2 = input[2];
118   x3 = input[3];
119 
120   if (!(x0 | x1 | x2 | x3)) {
121     output[0] = output[1] = output[2] = output[3] = 0;
122     return;
123   }
124 
125   s0 = sinpi_1_9 * x0;
126   s1 = sinpi_4_9 * x0;
127   s2 = sinpi_2_9 * x1;
128   s3 = sinpi_1_9 * x1;
129   s4 = sinpi_3_9 * x2;
130   s5 = sinpi_4_9 * x3;
131   s6 = sinpi_2_9 * x3;
132   s7 = x0 + x1 - x3;
133 
134   x0 = s0 + s2 + s5;
135   x1 = sinpi_3_9 * s7;
136   x2 = s1 - s3 + s6;
137   x3 = s4;
138 
139   s0 = x0 + x3;
140   s1 = x1;
141   s2 = x2 - x3;
142   s3 = x2 - x0 + x3;
143 
144   // 1-D transform scaling factor is sqrt(2).
145   output[0] = fdct_round_shift(s0);
146   output[1] = fdct_round_shift(s1);
147   output[2] = fdct_round_shift(s2);
148   output[3] = fdct_round_shift(s3);
149 }
150 
151 static const transform_2d FHT_4[] = {
152   { fdct4,  fdct4  },  // DCT_DCT  = 0
153   { fadst4, fdct4  },  // ADST_DCT = 1
154   { fdct4,  fadst4 },  // DCT_ADST = 2
155   { fadst4, fadst4 }   // ADST_ADST = 3
156 };
157 
vp9_fht4x4_c(const int16_t * input,int16_t * output,int stride,int tx_type)158 void vp9_fht4x4_c(const int16_t *input, int16_t *output,
159                   int stride, int tx_type) {
160   if (tx_type == DCT_DCT) {
161     vp9_fdct4x4_c(input, output, stride);
162   } else {
163     int16_t out[4 * 4];
164     int16_t *outptr = &out[0];
165     int i, j;
166     int16_t temp_in[4], temp_out[4];
167     const transform_2d ht = FHT_4[tx_type];
168 
169     // Columns
170     for (i = 0; i < 4; ++i) {
171       for (j = 0; j < 4; ++j)
172         temp_in[j] = input[j * stride + i] * 16;
173       if (i == 0 && temp_in[0])
174         temp_in[0] += 1;
175       ht.cols(temp_in, temp_out);
176       for (j = 0; j < 4; ++j)
177         outptr[j * 4 + i] = temp_out[j];
178     }
179 
180     // Rows
181     for (i = 0; i < 4; ++i) {
182       for (j = 0; j < 4; ++j)
183         temp_in[j] = out[j + i * 4];
184       ht.rows(temp_in, temp_out);
185       for (j = 0; j < 4; ++j)
186         output[j + i * 4] = (temp_out[j] + 1) >> 2;
187     }
188   }
189 }
190 
fdct8(const int16_t * input,int16_t * output)191 static void fdct8(const int16_t *input, int16_t *output) {
192   /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
193   /*needs32*/ int t0, t1, t2, t3;
194   /*canbe16*/ int x0, x1, x2, x3;
195 
196   // stage 1
197   s0 = input[0] + input[7];
198   s1 = input[1] + input[6];
199   s2 = input[2] + input[5];
200   s3 = input[3] + input[4];
201   s4 = input[3] - input[4];
202   s5 = input[2] - input[5];
203   s6 = input[1] - input[6];
204   s7 = input[0] - input[7];
205 
206   // fdct4(step, step);
207   x0 = s0 + s3;
208   x1 = s1 + s2;
209   x2 = s1 - s2;
210   x3 = s0 - s3;
211   t0 = (x0 + x1) * cospi_16_64;
212   t1 = (x0 - x1) * cospi_16_64;
213   t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
214   t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
215   output[0] = fdct_round_shift(t0);
216   output[2] = fdct_round_shift(t2);
217   output[4] = fdct_round_shift(t1);
218   output[6] = fdct_round_shift(t3);
219 
220   // Stage 2
221   t0 = (s6 - s5) * cospi_16_64;
222   t1 = (s6 + s5) * cospi_16_64;
223   t2 = fdct_round_shift(t0);
224   t3 = fdct_round_shift(t1);
225 
226   // Stage 3
227   x0 = s4 + t2;
228   x1 = s4 - t2;
229   x2 = s7 - t3;
230   x3 = s7 + t3;
231 
232   // Stage 4
233   t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
234   t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
235   t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
236   t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
237   output[1] = fdct_round_shift(t0);
238   output[3] = fdct_round_shift(t2);
239   output[5] = fdct_round_shift(t1);
240   output[7] = fdct_round_shift(t3);
241 }
242 
vp9_fdct8x8_c(const int16_t * input,int16_t * final_output,int stride)243 void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) {
244   int i, j;
245   int16_t intermediate[64];
246 
247   // Transform columns
248   {
249     int16_t *output = intermediate;
250     /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
251     /*needs32*/ int t0, t1, t2, t3;
252     /*canbe16*/ int x0, x1, x2, x3;
253 
254     int i;
255     for (i = 0; i < 8; i++) {
256       // stage 1
257       s0 = (input[0 * stride] + input[7 * stride]) * 4;
258       s1 = (input[1 * stride] + input[6 * stride]) * 4;
259       s2 = (input[2 * stride] + input[5 * stride]) * 4;
260       s3 = (input[3 * stride] + input[4 * stride]) * 4;
261       s4 = (input[3 * stride] - input[4 * stride]) * 4;
262       s5 = (input[2 * stride] - input[5 * stride]) * 4;
263       s6 = (input[1 * stride] - input[6 * stride]) * 4;
264       s7 = (input[0 * stride] - input[7 * stride]) * 4;
265 
266       // fdct4(step, step);
267       x0 = s0 + s3;
268       x1 = s1 + s2;
269       x2 = s1 - s2;
270       x3 = s0 - s3;
271       t0 = (x0 + x1) * cospi_16_64;
272       t1 = (x0 - x1) * cospi_16_64;
273       t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
274       t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
275       output[0 * 8] = fdct_round_shift(t0);
276       output[2 * 8] = fdct_round_shift(t2);
277       output[4 * 8] = fdct_round_shift(t1);
278       output[6 * 8] = fdct_round_shift(t3);
279 
280       // Stage 2
281       t0 = (s6 - s5) * cospi_16_64;
282       t1 = (s6 + s5) * cospi_16_64;
283       t2 = fdct_round_shift(t0);
284       t3 = fdct_round_shift(t1);
285 
286       // Stage 3
287       x0 = s4 + t2;
288       x1 = s4 - t2;
289       x2 = s7 - t3;
290       x3 = s7 + t3;
291 
292       // Stage 4
293       t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
294       t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
295       t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
296       t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
297       output[1 * 8] = fdct_round_shift(t0);
298       output[3 * 8] = fdct_round_shift(t2);
299       output[5 * 8] = fdct_round_shift(t1);
300       output[7 * 8] = fdct_round_shift(t3);
301       input++;
302       output++;
303     }
304   }
305 
306   // Rows
307   for (i = 0; i < 8; ++i) {
308     fdct8(&intermediate[i * 8], &final_output[i * 8]);
309     for (j = 0; j < 8; ++j)
310       final_output[j + i * 8] /= 2;
311   }
312 }
313 
vp9_fdct16x16_c(const int16_t * input,int16_t * output,int stride)314 void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {
315   // The 2D transform is done with two passes which are actually pretty
316   // similar. In the first one, we transform the columns and transpose
317   // the results. In the second one, we transform the rows. To achieve that,
318   // as the first pass results are transposed, we transpose the columns (that
319   // is the transposed rows) and transpose the results (so that it goes back
320   // in normal/row positions).
321   int pass;
322   // We need an intermediate buffer between passes.
323   int16_t intermediate[256];
324   const int16_t *in = input;
325   int16_t *out = intermediate;
326   // Do the two transform/transpose passes
327   for (pass = 0; pass < 2; ++pass) {
328     /*canbe16*/ int step1[8];
329     /*canbe16*/ int step2[8];
330     /*canbe16*/ int step3[8];
331     /*canbe16*/ int input[8];
332     /*needs32*/ int temp1, temp2;
333     int i;
334     for (i = 0; i < 16; i++) {
335       if (0 == pass) {
336         // Calculate input for the first 8 results.
337         input[0] = (in[0 * stride] + in[15 * stride]) * 4;
338         input[1] = (in[1 * stride] + in[14 * stride]) * 4;
339         input[2] = (in[2 * stride] + in[13 * stride]) * 4;
340         input[3] = (in[3 * stride] + in[12 * stride]) * 4;
341         input[4] = (in[4 * stride] + in[11 * stride]) * 4;
342         input[5] = (in[5 * stride] + in[10 * stride]) * 4;
343         input[6] = (in[6 * stride] + in[ 9 * stride]) * 4;
344         input[7] = (in[7 * stride] + in[ 8 * stride]) * 4;
345         // Calculate input for the next 8 results.
346         step1[0] = (in[7 * stride] - in[ 8 * stride]) * 4;
347         step1[1] = (in[6 * stride] - in[ 9 * stride]) * 4;
348         step1[2] = (in[5 * stride] - in[10 * stride]) * 4;
349         step1[3] = (in[4 * stride] - in[11 * stride]) * 4;
350         step1[4] = (in[3 * stride] - in[12 * stride]) * 4;
351         step1[5] = (in[2 * stride] - in[13 * stride]) * 4;
352         step1[6] = (in[1 * stride] - in[14 * stride]) * 4;
353         step1[7] = (in[0 * stride] - in[15 * stride]) * 4;
354       } else {
355         // Calculate input for the first 8 results.
356         input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2);
357         input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2);
358         input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2);
359         input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2);
360         input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2);
361         input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2);
362         input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2);
363         input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2);
364         // Calculate input for the next 8 results.
365         step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2);
366         step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2);
367         step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2);
368         step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2);
369         step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2);
370         step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2);
371         step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2);
372         step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2);
373       }
374       // Work on the first eight values; fdct8(input, even_results);
375       {
376         /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
377         /*needs32*/ int t0, t1, t2, t3;
378         /*canbe16*/ int x0, x1, x2, x3;
379 
380         // stage 1
381         s0 = input[0] + input[7];
382         s1 = input[1] + input[6];
383         s2 = input[2] + input[5];
384         s3 = input[3] + input[4];
385         s4 = input[3] - input[4];
386         s5 = input[2] - input[5];
387         s6 = input[1] - input[6];
388         s7 = input[0] - input[7];
389 
390         // fdct4(step, step);
391         x0 = s0 + s3;
392         x1 = s1 + s2;
393         x2 = s1 - s2;
394         x3 = s0 - s3;
395         t0 = (x0 + x1) * cospi_16_64;
396         t1 = (x0 - x1) * cospi_16_64;
397         t2 = x3 * cospi_8_64  + x2 * cospi_24_64;
398         t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
399         out[0] = fdct_round_shift(t0);
400         out[4] = fdct_round_shift(t2);
401         out[8] = fdct_round_shift(t1);
402         out[12] = fdct_round_shift(t3);
403 
404         // Stage 2
405         t0 = (s6 - s5) * cospi_16_64;
406         t1 = (s6 + s5) * cospi_16_64;
407         t2 = fdct_round_shift(t0);
408         t3 = fdct_round_shift(t1);
409 
410         // Stage 3
411         x0 = s4 + t2;
412         x1 = s4 - t2;
413         x2 = s7 - t3;
414         x3 = s7 + t3;
415 
416         // Stage 4
417         t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
418         t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
419         t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
420         t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
421         out[2] = fdct_round_shift(t0);
422         out[6] = fdct_round_shift(t2);
423         out[10] = fdct_round_shift(t1);
424         out[14] = fdct_round_shift(t3);
425       }
426       // Work on the next eight values; step1 -> odd_results
427       {
428         // step 2
429         temp1 = (step1[5] - step1[2]) * cospi_16_64;
430         temp2 = (step1[4] - step1[3]) * cospi_16_64;
431         step2[2] = fdct_round_shift(temp1);
432         step2[3] = fdct_round_shift(temp2);
433         temp1 = (step1[4] + step1[3]) * cospi_16_64;
434         temp2 = (step1[5] + step1[2]) * cospi_16_64;
435         step2[4] = fdct_round_shift(temp1);
436         step2[5] = fdct_round_shift(temp2);
437         // step 3
438         step3[0] = step1[0] + step2[3];
439         step3[1] = step1[1] + step2[2];
440         step3[2] = step1[1] - step2[2];
441         step3[3] = step1[0] - step2[3];
442         step3[4] = step1[7] - step2[4];
443         step3[5] = step1[6] - step2[5];
444         step3[6] = step1[6] + step2[5];
445         step3[7] = step1[7] + step2[4];
446         // step 4
447         temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;
448         temp2 = step3[2] * -cospi_24_64 - step3[5] *  cospi_8_64;
449         step2[1] = fdct_round_shift(temp1);
450         step2[2] = fdct_round_shift(temp2);
451         temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64;
452         temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;
453         step2[5] = fdct_round_shift(temp1);
454         step2[6] = fdct_round_shift(temp2);
455         // step 5
456         step1[0] = step3[0] + step2[1];
457         step1[1] = step3[0] - step2[1];
458         step1[2] = step3[3] - step2[2];
459         step1[3] = step3[3] + step2[2];
460         step1[4] = step3[4] + step2[5];
461         step1[5] = step3[4] - step2[5];
462         step1[6] = step3[7] - step2[6];
463         step1[7] = step3[7] + step2[6];
464         // step 6
465         temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;
466         temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
467         out[1] = fdct_round_shift(temp1);
468         out[9] = fdct_round_shift(temp2);
469         temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
470         temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;
471         out[5] = fdct_round_shift(temp1);
472         out[13] = fdct_round_shift(temp2);
473         temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;
474         temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
475         out[3] = fdct_round_shift(temp1);
476         out[11] = fdct_round_shift(temp2);
477         temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
478         temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;
479         out[7] = fdct_round_shift(temp1);
480         out[15] = fdct_round_shift(temp2);
481       }
482       // Do next column (which is a transposed row in second/horizontal pass)
483       in++;
484       out += 16;
485     }
486     // Setup in/out for next pass.
487     in = intermediate;
488     out = output;
489   }
490 }
491 
fadst8(const int16_t * input,int16_t * output)492 static void fadst8(const int16_t *input, int16_t *output) {
493   int s0, s1, s2, s3, s4, s5, s6, s7;
494 
495   int x0 = input[7];
496   int x1 = input[0];
497   int x2 = input[5];
498   int x3 = input[2];
499   int x4 = input[3];
500   int x5 = input[4];
501   int x6 = input[1];
502   int x7 = input[6];
503 
504   // stage 1
505   s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
506   s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
507   s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
508   s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
509   s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
510   s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
511   s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
512   s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
513 
514   x0 = fdct_round_shift(s0 + s4);
515   x1 = fdct_round_shift(s1 + s5);
516   x2 = fdct_round_shift(s2 + s6);
517   x3 = fdct_round_shift(s3 + s7);
518   x4 = fdct_round_shift(s0 - s4);
519   x5 = fdct_round_shift(s1 - s5);
520   x6 = fdct_round_shift(s2 - s6);
521   x7 = fdct_round_shift(s3 - s7);
522 
523   // stage 2
524   s0 = x0;
525   s1 = x1;
526   s2 = x2;
527   s3 = x3;
528   s4 = cospi_8_64  * x4 + cospi_24_64 * x5;
529   s5 = cospi_24_64 * x4 - cospi_8_64  * x5;
530   s6 = - cospi_24_64 * x6 + cospi_8_64  * x7;
531   s7 =   cospi_8_64  * x6 + cospi_24_64 * x7;
532 
533   x0 = s0 + s2;
534   x1 = s1 + s3;
535   x2 = s0 - s2;
536   x3 = s1 - s3;
537   x4 = fdct_round_shift(s4 + s6);
538   x5 = fdct_round_shift(s5 + s7);
539   x6 = fdct_round_shift(s4 - s6);
540   x7 = fdct_round_shift(s5 - s7);
541 
542   // stage 3
543   s2 = cospi_16_64 * (x2 + x3);
544   s3 = cospi_16_64 * (x2 - x3);
545   s6 = cospi_16_64 * (x6 + x7);
546   s7 = cospi_16_64 * (x6 - x7);
547 
548   x2 = fdct_round_shift(s2);
549   x3 = fdct_round_shift(s3);
550   x6 = fdct_round_shift(s6);
551   x7 = fdct_round_shift(s7);
552 
553   output[0] =   x0;
554   output[1] = - x4;
555   output[2] =   x6;
556   output[3] = - x2;
557   output[4] =   x3;
558   output[5] = - x7;
559   output[6] =   x5;
560   output[7] = - x1;
561 }
562 
563 static const transform_2d FHT_8[] = {
564   { fdct8,  fdct8  },  // DCT_DCT  = 0
565   { fadst8, fdct8  },  // ADST_DCT = 1
566   { fdct8,  fadst8 },  // DCT_ADST = 2
567   { fadst8, fadst8 }   // ADST_ADST = 3
568 };
569 
vp9_fht8x8_c(const int16_t * input,int16_t * output,int stride,int tx_type)570 void vp9_fht8x8_c(const int16_t *input, int16_t *output,
571                   int stride, int tx_type) {
572   if (tx_type == DCT_DCT) {
573     vp9_fdct8x8_c(input, output, stride);
574   } else {
575     int16_t out[64];
576     int16_t *outptr = &out[0];
577     int i, j;
578     int16_t temp_in[8], temp_out[8];
579     const transform_2d ht = FHT_8[tx_type];
580 
581     // Columns
582     for (i = 0; i < 8; ++i) {
583       for (j = 0; j < 8; ++j)
584         temp_in[j] = input[j * stride + i] * 4;
585       ht.cols(temp_in, temp_out);
586       for (j = 0; j < 8; ++j)
587         outptr[j * 8 + i] = temp_out[j];
588     }
589 
590     // Rows
591     for (i = 0; i < 8; ++i) {
592       for (j = 0; j < 8; ++j)
593         temp_in[j] = out[j + i * 8];
594       ht.rows(temp_in, temp_out);
595       for (j = 0; j < 8; ++j)
596         output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
597     }
598   }
599 }
600 
601 /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
602    pixel. */
vp9_fwht4x4_c(const int16_t * input,int16_t * output,int stride)603 void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride) {
604   int i;
605   int a1, b1, c1, d1, e1;
606   const int16_t *ip = input;
607   int16_t *op = output;
608 
609   for (i = 0; i < 4; i++) {
610     a1 = ip[0 * stride];
611     b1 = ip[1 * stride];
612     c1 = ip[2 * stride];
613     d1 = ip[3 * stride];
614 
615     a1 += b1;
616     d1 = d1 - c1;
617     e1 = (a1 - d1) >> 1;
618     b1 = e1 - b1;
619     c1 = e1 - c1;
620     a1 -= c1;
621     d1 += b1;
622     op[0] = a1;
623     op[4] = c1;
624     op[8] = d1;
625     op[12] = b1;
626 
627     ip++;
628     op++;
629   }
630   ip = output;
631   op = output;
632 
633   for (i = 0; i < 4; i++) {
634     a1 = ip[0];
635     b1 = ip[1];
636     c1 = ip[2];
637     d1 = ip[3];
638 
639     a1 += b1;
640     d1 -= c1;
641     e1 = (a1 - d1) >> 1;
642     b1 = e1 - b1;
643     c1 = e1 - c1;
644     a1 -= c1;
645     d1 += b1;
646     op[0] = a1 * UNIT_QUANT_FACTOR;
647     op[1] = c1 * UNIT_QUANT_FACTOR;
648     op[2] = d1 * UNIT_QUANT_FACTOR;
649     op[3] = b1 * UNIT_QUANT_FACTOR;
650 
651     ip += 4;
652     op += 4;
653   }
654 }
655 
656 // Rewrote to use same algorithm as others.
fdct16(const int16_t in[16],int16_t out[16])657 static void fdct16(const int16_t in[16], int16_t out[16]) {
658   /*canbe16*/ int step1[8];
659   /*canbe16*/ int step2[8];
660   /*canbe16*/ int step3[8];
661   /*canbe16*/ int input[8];
662   /*needs32*/ int temp1, temp2;
663 
664   // step 1
665   input[0] = in[0] + in[15];
666   input[1] = in[1] + in[14];
667   input[2] = in[2] + in[13];
668   input[3] = in[3] + in[12];
669   input[4] = in[4] + in[11];
670   input[5] = in[5] + in[10];
671   input[6] = in[6] + in[ 9];
672   input[7] = in[7] + in[ 8];
673 
674   step1[0] = in[7] - in[ 8];
675   step1[1] = in[6] - in[ 9];
676   step1[2] = in[5] - in[10];
677   step1[3] = in[4] - in[11];
678   step1[4] = in[3] - in[12];
679   step1[5] = in[2] - in[13];
680   step1[6] = in[1] - in[14];
681   step1[7] = in[0] - in[15];
682 
683   // fdct8(step, step);
684   {
685     /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
686     /*needs32*/ int t0, t1, t2, t3;
687     /*canbe16*/ int x0, x1, x2, x3;
688 
689     // stage 1
690     s0 = input[0] + input[7];
691     s1 = input[1] + input[6];
692     s2 = input[2] + input[5];
693     s3 = input[3] + input[4];
694     s4 = input[3] - input[4];
695     s5 = input[2] - input[5];
696     s6 = input[1] - input[6];
697     s7 = input[0] - input[7];
698 
699     // fdct4(step, step);
700     x0 = s0 + s3;
701     x1 = s1 + s2;
702     x2 = s1 - s2;
703     x3 = s0 - s3;
704     t0 = (x0 + x1) * cospi_16_64;
705     t1 = (x0 - x1) * cospi_16_64;
706     t2 = x3 * cospi_8_64  + x2 * cospi_24_64;
707     t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
708     out[0] = fdct_round_shift(t0);
709     out[4] = fdct_round_shift(t2);
710     out[8] = fdct_round_shift(t1);
711     out[12] = fdct_round_shift(t3);
712 
713     // Stage 2
714     t0 = (s6 - s5) * cospi_16_64;
715     t1 = (s6 + s5) * cospi_16_64;
716     t2 = fdct_round_shift(t0);
717     t3 = fdct_round_shift(t1);
718 
719     // Stage 3
720     x0 = s4 + t2;
721     x1 = s4 - t2;
722     x2 = s7 - t3;
723     x3 = s7 + t3;
724 
725     // Stage 4
726     t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
727     t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
728     t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
729     t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
730     out[2] = fdct_round_shift(t0);
731     out[6] = fdct_round_shift(t2);
732     out[10] = fdct_round_shift(t1);
733     out[14] = fdct_round_shift(t3);
734   }
735 
736   // step 2
737   temp1 = (step1[5] - step1[2]) * cospi_16_64;
738   temp2 = (step1[4] - step1[3]) * cospi_16_64;
739   step2[2] = fdct_round_shift(temp1);
740   step2[3] = fdct_round_shift(temp2);
741   temp1 = (step1[4] + step1[3]) * cospi_16_64;
742   temp2 = (step1[5] + step1[2]) * cospi_16_64;
743   step2[4] = fdct_round_shift(temp1);
744   step2[5] = fdct_round_shift(temp2);
745 
746   // step 3
747   step3[0] = step1[0] + step2[3];
748   step3[1] = step1[1] + step2[2];
749   step3[2] = step1[1] - step2[2];
750   step3[3] = step1[0] - step2[3];
751   step3[4] = step1[7] - step2[4];
752   step3[5] = step1[6] - step2[5];
753   step3[6] = step1[6] + step2[5];
754   step3[7] = step1[7] + step2[4];
755 
756   // step 4
757   temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;
758   temp2 = step3[2] * -cospi_24_64 - step3[5] *  cospi_8_64;
759   step2[1] = fdct_round_shift(temp1);
760   step2[2] = fdct_round_shift(temp2);
761   temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64;
762   temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;
763   step2[5] = fdct_round_shift(temp1);
764   step2[6] = fdct_round_shift(temp2);
765 
766   // step 5
767   step1[0] = step3[0] + step2[1];
768   step1[1] = step3[0] - step2[1];
769   step1[2] = step3[3] - step2[2];
770   step1[3] = step3[3] + step2[2];
771   step1[4] = step3[4] + step2[5];
772   step1[5] = step3[4] - step2[5];
773   step1[6] = step3[7] - step2[6];
774   step1[7] = step3[7] + step2[6];
775 
776   // step 6
777   temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;
778   temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
779   out[1] = fdct_round_shift(temp1);
780   out[9] = fdct_round_shift(temp2);
781 
782   temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
783   temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;
784   out[5] = fdct_round_shift(temp1);
785   out[13] = fdct_round_shift(temp2);
786 
787   temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;
788   temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
789   out[3] = fdct_round_shift(temp1);
790   out[11] = fdct_round_shift(temp2);
791 
792   temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
793   temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;
794   out[7] = fdct_round_shift(temp1);
795   out[15] = fdct_round_shift(temp2);
796 }
797 
fadst16(const int16_t * input,int16_t * output)798 static void fadst16(const int16_t *input, int16_t *output) {
799   int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
800 
801   int x0 = input[15];
802   int x1 = input[0];
803   int x2 = input[13];
804   int x3 = input[2];
805   int x4 = input[11];
806   int x5 = input[4];
807   int x6 = input[9];
808   int x7 = input[6];
809   int x8 = input[7];
810   int x9 = input[8];
811   int x10 = input[5];
812   int x11 = input[10];
813   int x12 = input[3];
814   int x13 = input[12];
815   int x14 = input[1];
816   int x15 = input[14];
817 
818   // stage 1
819   s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
820   s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
821   s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
822   s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
823   s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
824   s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
825   s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
826   s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
827   s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
828   s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
829   s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
830   s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
831   s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
832   s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
833   s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
834   s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
835 
836   x0 = fdct_round_shift(s0 + s8);
837   x1 = fdct_round_shift(s1 + s9);
838   x2 = fdct_round_shift(s2 + s10);
839   x3 = fdct_round_shift(s3 + s11);
840   x4 = fdct_round_shift(s4 + s12);
841   x5 = fdct_round_shift(s5 + s13);
842   x6 = fdct_round_shift(s6 + s14);
843   x7 = fdct_round_shift(s7 + s15);
844   x8  = fdct_round_shift(s0 - s8);
845   x9  = fdct_round_shift(s1 - s9);
846   x10 = fdct_round_shift(s2 - s10);
847   x11 = fdct_round_shift(s3 - s11);
848   x12 = fdct_round_shift(s4 - s12);
849   x13 = fdct_round_shift(s5 - s13);
850   x14 = fdct_round_shift(s6 - s14);
851   x15 = fdct_round_shift(s7 - s15);
852 
853   // stage 2
854   s0 = x0;
855   s1 = x1;
856   s2 = x2;
857   s3 = x3;
858   s4 = x4;
859   s5 = x5;
860   s6 = x6;
861   s7 = x7;
862   s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
863   s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
864   s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
865   s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
866   s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
867   s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
868   s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
869   s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;
870 
871   x0 = s0 + s4;
872   x1 = s1 + s5;
873   x2 = s2 + s6;
874   x3 = s3 + s7;
875   x4 = s0 - s4;
876   x5 = s1 - s5;
877   x6 = s2 - s6;
878   x7 = s3 - s7;
879   x8 = fdct_round_shift(s8 + s12);
880   x9 = fdct_round_shift(s9 + s13);
881   x10 = fdct_round_shift(s10 + s14);
882   x11 = fdct_round_shift(s11 + s15);
883   x12 = fdct_round_shift(s8 - s12);
884   x13 = fdct_round_shift(s9 - s13);
885   x14 = fdct_round_shift(s10 - s14);
886   x15 = fdct_round_shift(s11 - s15);
887 
888   // stage 3
889   s0 = x0;
890   s1 = x1;
891   s2 = x2;
892   s3 = x3;
893   s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
894   s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
895   s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
896   s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
897   s8 = x8;
898   s9 = x9;
899   s10 = x10;
900   s11 = x11;
901   s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
902   s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
903   s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
904   s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;
905 
906   x0 = s0 + s2;
907   x1 = s1 + s3;
908   x2 = s0 - s2;
909   x3 = s1 - s3;
910   x4 = fdct_round_shift(s4 + s6);
911   x5 = fdct_round_shift(s5 + s7);
912   x6 = fdct_round_shift(s4 - s6);
913   x7 = fdct_round_shift(s5 - s7);
914   x8 = s8 + s10;
915   x9 = s9 + s11;
916   x10 = s8 - s10;
917   x11 = s9 - s11;
918   x12 = fdct_round_shift(s12 + s14);
919   x13 = fdct_round_shift(s13 + s15);
920   x14 = fdct_round_shift(s12 - s14);
921   x15 = fdct_round_shift(s13 - s15);
922 
923   // stage 4
924   s2 = (- cospi_16_64) * (x2 + x3);
925   s3 = cospi_16_64 * (x2 - x3);
926   s6 = cospi_16_64 * (x6 + x7);
927   s7 = cospi_16_64 * (- x6 + x7);
928   s10 = cospi_16_64 * (x10 + x11);
929   s11 = cospi_16_64 * (- x10 + x11);
930   s14 = (- cospi_16_64) * (x14 + x15);
931   s15 = cospi_16_64 * (x14 - x15);
932 
933   x2 = fdct_round_shift(s2);
934   x3 = fdct_round_shift(s3);
935   x6 = fdct_round_shift(s6);
936   x7 = fdct_round_shift(s7);
937   x10 = fdct_round_shift(s10);
938   x11 = fdct_round_shift(s11);
939   x14 = fdct_round_shift(s14);
940   x15 = fdct_round_shift(s15);
941 
942   output[0] = x0;
943   output[1] = - x8;
944   output[2] = x12;
945   output[3] = - x4;
946   output[4] = x6;
947   output[5] = x14;
948   output[6] = x10;
949   output[7] = x2;
950   output[8] = x3;
951   output[9] =  x11;
952   output[10] = x15;
953   output[11] = x7;
954   output[12] = x5;
955   output[13] = - x13;
956   output[14] = x9;
957   output[15] = - x1;
958 }
959 
960 static const transform_2d FHT_16[] = {
961   { fdct16,  fdct16  },  // DCT_DCT  = 0
962   { fadst16, fdct16  },  // ADST_DCT = 1
963   { fdct16,  fadst16 },  // DCT_ADST = 2
964   { fadst16, fadst16 }   // ADST_ADST = 3
965 };
966 
vp9_fht16x16_c(const int16_t * input,int16_t * output,int stride,int tx_type)967 void vp9_fht16x16_c(const int16_t *input, int16_t *output,
968                     int stride, int tx_type) {
969   if (tx_type == DCT_DCT) {
970     vp9_fdct16x16_c(input, output, stride);
971   } else {
972     int16_t out[256];
973     int16_t *outptr = &out[0];
974     int i, j;
975     int16_t temp_in[16], temp_out[16];
976     const transform_2d ht = FHT_16[tx_type];
977 
978     // Columns
979     for (i = 0; i < 16; ++i) {
980       for (j = 0; j < 16; ++j)
981         temp_in[j] = input[j * stride + i] * 4;
982       ht.cols(temp_in, temp_out);
983       for (j = 0; j < 16; ++j)
984         outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
985     }
986 
987     // Rows
988     for (i = 0; i < 16; ++i) {
989       for (j = 0; j < 16; ++j)
990         temp_in[j] = out[j + i * 16];
991       ht.rows(temp_in, temp_out);
992       for (j = 0; j < 16; ++j)
993         output[j + i * 16] = temp_out[j];
994     }
995   }
996 }
997 
dct_32_round(int input)998 static INLINE int dct_32_round(int input) {
999   int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
1000   assert(-131072 <= rv && rv <= 131071);
1001   return rv;
1002 }
1003 
half_round_shift(int input)1004 static INLINE int half_round_shift(int input) {
1005   int rv = (input + 1 + (input < 0)) >> 2;
1006   return rv;
1007 }
1008 
fdct32(const int * input,int * output,int round)1009 static void fdct32(const int *input, int *output, int round) {
1010   int step[32];
1011   // Stage 1
1012   step[0] = input[0] + input[(32 - 1)];
1013   step[1] = input[1] + input[(32 - 2)];
1014   step[2] = input[2] + input[(32 - 3)];
1015   step[3] = input[3] + input[(32 - 4)];
1016   step[4] = input[4] + input[(32 - 5)];
1017   step[5] = input[5] + input[(32 - 6)];
1018   step[6] = input[6] + input[(32 - 7)];
1019   step[7] = input[7] + input[(32 - 8)];
1020   step[8] = input[8] + input[(32 - 9)];
1021   step[9] = input[9] + input[(32 - 10)];
1022   step[10] = input[10] + input[(32 - 11)];
1023   step[11] = input[11] + input[(32 - 12)];
1024   step[12] = input[12] + input[(32 - 13)];
1025   step[13] = input[13] + input[(32 - 14)];
1026   step[14] = input[14] + input[(32 - 15)];
1027   step[15] = input[15] + input[(32 - 16)];
1028   step[16] = -input[16] + input[(32 - 17)];
1029   step[17] = -input[17] + input[(32 - 18)];
1030   step[18] = -input[18] + input[(32 - 19)];
1031   step[19] = -input[19] + input[(32 - 20)];
1032   step[20] = -input[20] + input[(32 - 21)];
1033   step[21] = -input[21] + input[(32 - 22)];
1034   step[22] = -input[22] + input[(32 - 23)];
1035   step[23] = -input[23] + input[(32 - 24)];
1036   step[24] = -input[24] + input[(32 - 25)];
1037   step[25] = -input[25] + input[(32 - 26)];
1038   step[26] = -input[26] + input[(32 - 27)];
1039   step[27] = -input[27] + input[(32 - 28)];
1040   step[28] = -input[28] + input[(32 - 29)];
1041   step[29] = -input[29] + input[(32 - 30)];
1042   step[30] = -input[30] + input[(32 - 31)];
1043   step[31] = -input[31] + input[(32 - 32)];
1044 
1045   // Stage 2
1046   output[0] = step[0] + step[16 - 1];
1047   output[1] = step[1] + step[16 - 2];
1048   output[2] = step[2] + step[16 - 3];
1049   output[3] = step[3] + step[16 - 4];
1050   output[4] = step[4] + step[16 - 5];
1051   output[5] = step[5] + step[16 - 6];
1052   output[6] = step[6] + step[16 - 7];
1053   output[7] = step[7] + step[16 - 8];
1054   output[8] = -step[8] + step[16 - 9];
1055   output[9] = -step[9] + step[16 - 10];
1056   output[10] = -step[10] + step[16 - 11];
1057   output[11] = -step[11] + step[16 - 12];
1058   output[12] = -step[12] + step[16 - 13];
1059   output[13] = -step[13] + step[16 - 14];
1060   output[14] = -step[14] + step[16 - 15];
1061   output[15] = -step[15] + step[16 - 16];
1062 
1063   output[16] = step[16];
1064   output[17] = step[17];
1065   output[18] = step[18];
1066   output[19] = step[19];
1067 
1068   output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64);
1069   output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64);
1070   output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64);
1071   output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64);
1072 
1073   output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64);
1074   output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64);
1075   output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64);
1076   output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64);
1077 
1078   output[28] = step[28];
1079   output[29] = step[29];
1080   output[30] = step[30];
1081   output[31] = step[31];
1082 
1083   // dump the magnitude by 4, hence the intermediate values are within
1084   // the range of 16 bits.
1085   if (round) {
1086     output[0] = half_round_shift(output[0]);
1087     output[1] = half_round_shift(output[1]);
1088     output[2] = half_round_shift(output[2]);
1089     output[3] = half_round_shift(output[3]);
1090     output[4] = half_round_shift(output[4]);
1091     output[5] = half_round_shift(output[5]);
1092     output[6] = half_round_shift(output[6]);
1093     output[7] = half_round_shift(output[7]);
1094     output[8] = half_round_shift(output[8]);
1095     output[9] = half_round_shift(output[9]);
1096     output[10] = half_round_shift(output[10]);
1097     output[11] = half_round_shift(output[11]);
1098     output[12] = half_round_shift(output[12]);
1099     output[13] = half_round_shift(output[13]);
1100     output[14] = half_round_shift(output[14]);
1101     output[15] = half_round_shift(output[15]);
1102 
1103     output[16] = half_round_shift(output[16]);
1104     output[17] = half_round_shift(output[17]);
1105     output[18] = half_round_shift(output[18]);
1106     output[19] = half_round_shift(output[19]);
1107     output[20] = half_round_shift(output[20]);
1108     output[21] = half_round_shift(output[21]);
1109     output[22] = half_round_shift(output[22]);
1110     output[23] = half_round_shift(output[23]);
1111     output[24] = half_round_shift(output[24]);
1112     output[25] = half_round_shift(output[25]);
1113     output[26] = half_round_shift(output[26]);
1114     output[27] = half_round_shift(output[27]);
1115     output[28] = half_round_shift(output[28]);
1116     output[29] = half_round_shift(output[29]);
1117     output[30] = half_round_shift(output[30]);
1118     output[31] = half_round_shift(output[31]);
1119   }
1120 
1121   // Stage 3
1122   step[0] = output[0] + output[(8 - 1)];
1123   step[1] = output[1] + output[(8 - 2)];
1124   step[2] = output[2] + output[(8 - 3)];
1125   step[3] = output[3] + output[(8 - 4)];
1126   step[4] = -output[4] + output[(8 - 5)];
1127   step[5] = -output[5] + output[(8 - 6)];
1128   step[6] = -output[6] + output[(8 - 7)];
1129   step[7] = -output[7] + output[(8 - 8)];
1130   step[8] = output[8];
1131   step[9] = output[9];
1132   step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64);
1133   step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64);
1134   step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64);
1135   step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64);
1136   step[14] = output[14];
1137   step[15] = output[15];
1138 
1139   step[16] = output[16] + output[23];
1140   step[17] = output[17] + output[22];
1141   step[18] = output[18] + output[21];
1142   step[19] = output[19] + output[20];
1143   step[20] = -output[20] + output[19];
1144   step[21] = -output[21] + output[18];
1145   step[22] = -output[22] + output[17];
1146   step[23] = -output[23] + output[16];
1147   step[24] = -output[24] + output[31];
1148   step[25] = -output[25] + output[30];
1149   step[26] = -output[26] + output[29];
1150   step[27] = -output[27] + output[28];
1151   step[28] = output[28] + output[27];
1152   step[29] = output[29] + output[26];
1153   step[30] = output[30] + output[25];
1154   step[31] = output[31] + output[24];
1155 
1156   // Stage 4
1157   output[0] = step[0] + step[3];
1158   output[1] = step[1] + step[2];
1159   output[2] = -step[2] + step[1];
1160   output[3] = -step[3] + step[0];
1161   output[4] = step[4];
1162   output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64);
1163   output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64);
1164   output[7] = step[7];
1165   output[8] = step[8] + step[11];
1166   output[9] = step[9] + step[10];
1167   output[10] = -step[10] + step[9];
1168   output[11] = -step[11] + step[8];
1169   output[12] = -step[12] + step[15];
1170   output[13] = -step[13] + step[14];
1171   output[14] = step[14] + step[13];
1172   output[15] = step[15] + step[12];
1173 
1174   output[16] = step[16];
1175   output[17] = step[17];
1176   output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64);
1177   output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64);
1178   output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64);
1179   output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64);
1180   output[22] = step[22];
1181   output[23] = step[23];
1182   output[24] = step[24];
1183   output[25] = step[25];
1184   output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64);
1185   output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64);
1186   output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64);
1187   output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64);
1188   output[30] = step[30];
1189   output[31] = step[31];
1190 
1191   // Stage 5
1192   step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64);
1193   step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64);
1194   step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64);
1195   step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64);
1196   step[4] = output[4] + output[5];
1197   step[5] = -output[5] + output[4];
1198   step[6] = -output[6] + output[7];
1199   step[7] = output[7] + output[6];
1200   step[8] = output[8];
1201   step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64);
1202   step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64);
1203   step[11] = output[11];
1204   step[12] = output[12];
1205   step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64);
1206   step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64);
1207   step[15] = output[15];
1208 
1209   step[16] = output[16] + output[19];
1210   step[17] = output[17] + output[18];
1211   step[18] = -output[18] + output[17];
1212   step[19] = -output[19] + output[16];
1213   step[20] = -output[20] + output[23];
1214   step[21] = -output[21] + output[22];
1215   step[22] = output[22] + output[21];
1216   step[23] = output[23] + output[20];
1217   step[24] = output[24] + output[27];
1218   step[25] = output[25] + output[26];
1219   step[26] = -output[26] + output[25];
1220   step[27] = -output[27] + output[24];
1221   step[28] = -output[28] + output[31];
1222   step[29] = -output[29] + output[30];
1223   step[30] = output[30] + output[29];
1224   step[31] = output[31] + output[28];
1225 
1226   // Stage 6
1227   output[0] = step[0];
1228   output[1] = step[1];
1229   output[2] = step[2];
1230   output[3] = step[3];
1231   output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64);
1232   output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64);
1233   output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64);
1234   output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64);
1235   output[8] = step[8] + step[9];
1236   output[9] = -step[9] + step[8];
1237   output[10] = -step[10] + step[11];
1238   output[11] = step[11] + step[10];
1239   output[12] = step[12] + step[13];
1240   output[13] = -step[13] + step[12];
1241   output[14] = -step[14] + step[15];
1242   output[15] = step[15] + step[14];
1243 
1244   output[16] = step[16];
1245   output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64);
1246   output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64);
1247   output[19] = step[19];
1248   output[20] = step[20];
1249   output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64);
1250   output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64);
1251   output[23] = step[23];
1252   output[24] = step[24];
1253   output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64);
1254   output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64);
1255   output[27] = step[27];
1256   output[28] = step[28];
1257   output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64);
1258   output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64);
1259   output[31] = step[31];
1260 
1261   // Stage 7
1262   step[0] = output[0];
1263   step[1] = output[1];
1264   step[2] = output[2];
1265   step[3] = output[3];
1266   step[4] = output[4];
1267   step[5] = output[5];
1268   step[6] = output[6];
1269   step[7] = output[7];
1270   step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64);
1271   step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64);
1272   step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64);
1273   step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64);
1274   step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64);
1275   step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64);
1276   step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64);
1277   step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64);
1278 
1279   step[16] = output[16] + output[17];
1280   step[17] = -output[17] + output[16];
1281   step[18] = -output[18] + output[19];
1282   step[19] = output[19] + output[18];
1283   step[20] = output[20] + output[21];
1284   step[21] = -output[21] + output[20];
1285   step[22] = -output[22] + output[23];
1286   step[23] = output[23] + output[22];
1287   step[24] = output[24] + output[25];
1288   step[25] = -output[25] + output[24];
1289   step[26] = -output[26] + output[27];
1290   step[27] = output[27] + output[26];
1291   step[28] = output[28] + output[29];
1292   step[29] = -output[29] + output[28];
1293   step[30] = -output[30] + output[31];
1294   step[31] = output[31] + output[30];
1295 
1296   // Final stage --- outputs indices are bit-reversed.
1297   output[0]  = step[0];
1298   output[16] = step[1];
1299   output[8]  = step[2];
1300   output[24] = step[3];
1301   output[4]  = step[4];
1302   output[20] = step[5];
1303   output[12] = step[6];
1304   output[28] = step[7];
1305   output[2]  = step[8];
1306   output[18] = step[9];
1307   output[10] = step[10];
1308   output[26] = step[11];
1309   output[6]  = step[12];
1310   output[22] = step[13];
1311   output[14] = step[14];
1312   output[30] = step[15];
1313 
1314   output[1]  = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64);
1315   output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64);
1316   output[9]  = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64);
1317   output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64);
1318   output[5]  = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64);
1319   output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64);
1320   output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64);
1321   output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64);
1322   output[3]  = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64);
1323   output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64);
1324   output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64);
1325   output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64);
1326   output[7]  = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64);
1327   output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64);
1328   output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64);
1329   output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
1330 }
1331 
vp9_fdct32x32_c(const int16_t * input,int16_t * out,int stride)1332 void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) {
1333   int i, j;
1334   int output[32 * 32];
1335 
1336   // Columns
1337   for (i = 0; i < 32; ++i) {
1338     int temp_in[32], temp_out[32];
1339     for (j = 0; j < 32; ++j)
1340       temp_in[j] = input[j * stride + i] * 4;
1341     fdct32(temp_in, temp_out, 0);
1342     for (j = 0; j < 32; ++j)
1343       output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
1344   }
1345 
1346   // Rows
1347   for (i = 0; i < 32; ++i) {
1348     int temp_in[32], temp_out[32];
1349     for (j = 0; j < 32; ++j)
1350       temp_in[j] = output[j + i * 32];
1351     fdct32(temp_in, temp_out, 0);
1352     for (j = 0; j < 32; ++j)
1353       out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
1354   }
1355 }
1356 
1357 // Note that although we use dct_32_round in dct32 computation flow,
1358 // this 2d fdct32x32 for rate-distortion optimization loop is operating
1359 // within 16 bits precision.
vp9_fdct32x32_rd_c(const int16_t * input,int16_t * out,int stride)1360 void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) {
1361   int i, j;
1362   int output[32 * 32];
1363 
1364   // Columns
1365   for (i = 0; i < 32; ++i) {
1366     int temp_in[32], temp_out[32];
1367     for (j = 0; j < 32; ++j)
1368       temp_in[j] = input[j * stride + i] * 4;
1369     fdct32(temp_in, temp_out, 0);
1370     for (j = 0; j < 32; ++j)
1371       // TODO(cd): see quality impact of only doing
1372       //           output[j * 32 + i] = (temp_out[j] + 1) >> 2;
1373       //           PS: also change code in vp9/encoder/x86/vp9_dct_sse2.c
1374       output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
1375   }
1376 
1377   // Rows
1378   for (i = 0; i < 32; ++i) {
1379     int temp_in[32], temp_out[32];
1380     for (j = 0; j < 32; ++j)
1381       temp_in[j] = output[j + i * 32];
1382     fdct32(temp_in, temp_out, 1);
1383     for (j = 0; j < 32; ++j)
1384       out[j + i * 32] = temp_out[j];
1385   }
1386 }
1387