1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <assert.h>
12 #include <math.h>
13
14 #include "./vpx_config.h"
15 #include "./vp9_rtcd.h"
16
17 #include "vp9/common/vp9_blockd.h"
18 #include "vp9/common/vp9_idct.h"
19 #include "vp9/common/vp9_systemdependent.h"
20
fdct_round_shift(int input)21 static INLINE int fdct_round_shift(int input) {
22 int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
23 assert(INT16_MIN <= rv && rv <= INT16_MAX);
24 return rv;
25 }
26
fdct4(const int16_t * input,int16_t * output)27 static void fdct4(const int16_t *input, int16_t *output) {
28 int16_t step[4];
29 int temp1, temp2;
30
31 step[0] = input[0] + input[3];
32 step[1] = input[1] + input[2];
33 step[2] = input[1] - input[2];
34 step[3] = input[0] - input[3];
35
36 temp1 = (step[0] + step[1]) * cospi_16_64;
37 temp2 = (step[0] - step[1]) * cospi_16_64;
38 output[0] = fdct_round_shift(temp1);
39 output[2] = fdct_round_shift(temp2);
40 temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
41 temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
42 output[1] = fdct_round_shift(temp1);
43 output[3] = fdct_round_shift(temp2);
44 }
45
vp9_fdct4x4_c(const int16_t * input,int16_t * output,int stride)46 void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) {
47 // The 2D transform is done with two passes which are actually pretty
48 // similar. In the first one, we transform the columns and transpose
49 // the results. In the second one, we transform the rows. To achieve that,
50 // as the first pass results are transposed, we transpose the columns (that
51 // is the transposed rows) and transpose the results (so that it goes back
52 // in normal/row positions).
53 int pass;
54 // We need an intermediate buffer between passes.
55 int16_t intermediate[4 * 4];
56 const int16_t *in = input;
57 int16_t *out = intermediate;
58 // Do the two transform/transpose passes
59 for (pass = 0; pass < 2; ++pass) {
60 /*canbe16*/ int input[4];
61 /*canbe16*/ int step[4];
62 /*needs32*/ int temp1, temp2;
63 int i;
64 for (i = 0; i < 4; ++i) {
65 // Load inputs.
66 if (0 == pass) {
67 input[0] = in[0 * stride] * 16;
68 input[1] = in[1 * stride] * 16;
69 input[2] = in[2 * stride] * 16;
70 input[3] = in[3 * stride] * 16;
71 if (i == 0 && input[0]) {
72 input[0] += 1;
73 }
74 } else {
75 input[0] = in[0 * 4];
76 input[1] = in[1 * 4];
77 input[2] = in[2 * 4];
78 input[3] = in[3 * 4];
79 }
80 // Transform.
81 step[0] = input[0] + input[3];
82 step[1] = input[1] + input[2];
83 step[2] = input[1] - input[2];
84 step[3] = input[0] - input[3];
85 temp1 = (step[0] + step[1]) * cospi_16_64;
86 temp2 = (step[0] - step[1]) * cospi_16_64;
87 out[0] = fdct_round_shift(temp1);
88 out[2] = fdct_round_shift(temp2);
89 temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
90 temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
91 out[1] = fdct_round_shift(temp1);
92 out[3] = fdct_round_shift(temp2);
93 // Do next column (which is a transposed row in second/horizontal pass)
94 in++;
95 out += 4;
96 }
97 // Setup in/out for next pass.
98 in = intermediate;
99 out = output;
100 }
101
102 {
103 int i, j;
104 for (i = 0; i < 4; ++i) {
105 for (j = 0; j < 4; ++j)
106 output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
107 }
108 }
109 }
110
fadst4(const int16_t * input,int16_t * output)111 static void fadst4(const int16_t *input, int16_t *output) {
112 int x0, x1, x2, x3;
113 int s0, s1, s2, s3, s4, s5, s6, s7;
114
115 x0 = input[0];
116 x1 = input[1];
117 x2 = input[2];
118 x3 = input[3];
119
120 if (!(x0 | x1 | x2 | x3)) {
121 output[0] = output[1] = output[2] = output[3] = 0;
122 return;
123 }
124
125 s0 = sinpi_1_9 * x0;
126 s1 = sinpi_4_9 * x0;
127 s2 = sinpi_2_9 * x1;
128 s3 = sinpi_1_9 * x1;
129 s4 = sinpi_3_9 * x2;
130 s5 = sinpi_4_9 * x3;
131 s6 = sinpi_2_9 * x3;
132 s7 = x0 + x1 - x3;
133
134 x0 = s0 + s2 + s5;
135 x1 = sinpi_3_9 * s7;
136 x2 = s1 - s3 + s6;
137 x3 = s4;
138
139 s0 = x0 + x3;
140 s1 = x1;
141 s2 = x2 - x3;
142 s3 = x2 - x0 + x3;
143
144 // 1-D transform scaling factor is sqrt(2).
145 output[0] = fdct_round_shift(s0);
146 output[1] = fdct_round_shift(s1);
147 output[2] = fdct_round_shift(s2);
148 output[3] = fdct_round_shift(s3);
149 }
150
151 static const transform_2d FHT_4[] = {
152 { fdct4, fdct4 }, // DCT_DCT = 0
153 { fadst4, fdct4 }, // ADST_DCT = 1
154 { fdct4, fadst4 }, // DCT_ADST = 2
155 { fadst4, fadst4 } // ADST_ADST = 3
156 };
157
vp9_fht4x4_c(const int16_t * input,int16_t * output,int stride,int tx_type)158 void vp9_fht4x4_c(const int16_t *input, int16_t *output,
159 int stride, int tx_type) {
160 if (tx_type == DCT_DCT) {
161 vp9_fdct4x4_c(input, output, stride);
162 } else {
163 int16_t out[4 * 4];
164 int16_t *outptr = &out[0];
165 int i, j;
166 int16_t temp_in[4], temp_out[4];
167 const transform_2d ht = FHT_4[tx_type];
168
169 // Columns
170 for (i = 0; i < 4; ++i) {
171 for (j = 0; j < 4; ++j)
172 temp_in[j] = input[j * stride + i] * 16;
173 if (i == 0 && temp_in[0])
174 temp_in[0] += 1;
175 ht.cols(temp_in, temp_out);
176 for (j = 0; j < 4; ++j)
177 outptr[j * 4 + i] = temp_out[j];
178 }
179
180 // Rows
181 for (i = 0; i < 4; ++i) {
182 for (j = 0; j < 4; ++j)
183 temp_in[j] = out[j + i * 4];
184 ht.rows(temp_in, temp_out);
185 for (j = 0; j < 4; ++j)
186 output[j + i * 4] = (temp_out[j] + 1) >> 2;
187 }
188 }
189 }
190
fdct8(const int16_t * input,int16_t * output)191 static void fdct8(const int16_t *input, int16_t *output) {
192 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
193 /*needs32*/ int t0, t1, t2, t3;
194 /*canbe16*/ int x0, x1, x2, x3;
195
196 // stage 1
197 s0 = input[0] + input[7];
198 s1 = input[1] + input[6];
199 s2 = input[2] + input[5];
200 s3 = input[3] + input[4];
201 s4 = input[3] - input[4];
202 s5 = input[2] - input[5];
203 s6 = input[1] - input[6];
204 s7 = input[0] - input[7];
205
206 // fdct4(step, step);
207 x0 = s0 + s3;
208 x1 = s1 + s2;
209 x2 = s1 - s2;
210 x3 = s0 - s3;
211 t0 = (x0 + x1) * cospi_16_64;
212 t1 = (x0 - x1) * cospi_16_64;
213 t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
214 t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
215 output[0] = fdct_round_shift(t0);
216 output[2] = fdct_round_shift(t2);
217 output[4] = fdct_round_shift(t1);
218 output[6] = fdct_round_shift(t3);
219
220 // Stage 2
221 t0 = (s6 - s5) * cospi_16_64;
222 t1 = (s6 + s5) * cospi_16_64;
223 t2 = fdct_round_shift(t0);
224 t3 = fdct_round_shift(t1);
225
226 // Stage 3
227 x0 = s4 + t2;
228 x1 = s4 - t2;
229 x2 = s7 - t3;
230 x3 = s7 + t3;
231
232 // Stage 4
233 t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
234 t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
235 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
236 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
237 output[1] = fdct_round_shift(t0);
238 output[3] = fdct_round_shift(t2);
239 output[5] = fdct_round_shift(t1);
240 output[7] = fdct_round_shift(t3);
241 }
242
vp9_fdct8x8_c(const int16_t * input,int16_t * final_output,int stride)243 void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) {
244 int i, j;
245 int16_t intermediate[64];
246
247 // Transform columns
248 {
249 int16_t *output = intermediate;
250 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
251 /*needs32*/ int t0, t1, t2, t3;
252 /*canbe16*/ int x0, x1, x2, x3;
253
254 int i;
255 for (i = 0; i < 8; i++) {
256 // stage 1
257 s0 = (input[0 * stride] + input[7 * stride]) * 4;
258 s1 = (input[1 * stride] + input[6 * stride]) * 4;
259 s2 = (input[2 * stride] + input[5 * stride]) * 4;
260 s3 = (input[3 * stride] + input[4 * stride]) * 4;
261 s4 = (input[3 * stride] - input[4 * stride]) * 4;
262 s5 = (input[2 * stride] - input[5 * stride]) * 4;
263 s6 = (input[1 * stride] - input[6 * stride]) * 4;
264 s7 = (input[0 * stride] - input[7 * stride]) * 4;
265
266 // fdct4(step, step);
267 x0 = s0 + s3;
268 x1 = s1 + s2;
269 x2 = s1 - s2;
270 x3 = s0 - s3;
271 t0 = (x0 + x1) * cospi_16_64;
272 t1 = (x0 - x1) * cospi_16_64;
273 t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
274 t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
275 output[0 * 8] = fdct_round_shift(t0);
276 output[2 * 8] = fdct_round_shift(t2);
277 output[4 * 8] = fdct_round_shift(t1);
278 output[6 * 8] = fdct_round_shift(t3);
279
280 // Stage 2
281 t0 = (s6 - s5) * cospi_16_64;
282 t1 = (s6 + s5) * cospi_16_64;
283 t2 = fdct_round_shift(t0);
284 t3 = fdct_round_shift(t1);
285
286 // Stage 3
287 x0 = s4 + t2;
288 x1 = s4 - t2;
289 x2 = s7 - t3;
290 x3 = s7 + t3;
291
292 // Stage 4
293 t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
294 t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
295 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
296 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
297 output[1 * 8] = fdct_round_shift(t0);
298 output[3 * 8] = fdct_round_shift(t2);
299 output[5 * 8] = fdct_round_shift(t1);
300 output[7 * 8] = fdct_round_shift(t3);
301 input++;
302 output++;
303 }
304 }
305
306 // Rows
307 for (i = 0; i < 8; ++i) {
308 fdct8(&intermediate[i * 8], &final_output[i * 8]);
309 for (j = 0; j < 8; ++j)
310 final_output[j + i * 8] /= 2;
311 }
312 }
313
vp9_fdct16x16_c(const int16_t * input,int16_t * output,int stride)314 void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {
315 // The 2D transform is done with two passes which are actually pretty
316 // similar. In the first one, we transform the columns and transpose
317 // the results. In the second one, we transform the rows. To achieve that,
318 // as the first pass results are transposed, we transpose the columns (that
319 // is the transposed rows) and transpose the results (so that it goes back
320 // in normal/row positions).
321 int pass;
322 // We need an intermediate buffer between passes.
323 int16_t intermediate[256];
324 const int16_t *in = input;
325 int16_t *out = intermediate;
326 // Do the two transform/transpose passes
327 for (pass = 0; pass < 2; ++pass) {
328 /*canbe16*/ int step1[8];
329 /*canbe16*/ int step2[8];
330 /*canbe16*/ int step3[8];
331 /*canbe16*/ int input[8];
332 /*needs32*/ int temp1, temp2;
333 int i;
334 for (i = 0; i < 16; i++) {
335 if (0 == pass) {
336 // Calculate input for the first 8 results.
337 input[0] = (in[0 * stride] + in[15 * stride]) * 4;
338 input[1] = (in[1 * stride] + in[14 * stride]) * 4;
339 input[2] = (in[2 * stride] + in[13 * stride]) * 4;
340 input[3] = (in[3 * stride] + in[12 * stride]) * 4;
341 input[4] = (in[4 * stride] + in[11 * stride]) * 4;
342 input[5] = (in[5 * stride] + in[10 * stride]) * 4;
343 input[6] = (in[6 * stride] + in[ 9 * stride]) * 4;
344 input[7] = (in[7 * stride] + in[ 8 * stride]) * 4;
345 // Calculate input for the next 8 results.
346 step1[0] = (in[7 * stride] - in[ 8 * stride]) * 4;
347 step1[1] = (in[6 * stride] - in[ 9 * stride]) * 4;
348 step1[2] = (in[5 * stride] - in[10 * stride]) * 4;
349 step1[3] = (in[4 * stride] - in[11 * stride]) * 4;
350 step1[4] = (in[3 * stride] - in[12 * stride]) * 4;
351 step1[5] = (in[2 * stride] - in[13 * stride]) * 4;
352 step1[6] = (in[1 * stride] - in[14 * stride]) * 4;
353 step1[7] = (in[0 * stride] - in[15 * stride]) * 4;
354 } else {
355 // Calculate input for the first 8 results.
356 input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2);
357 input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2);
358 input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2);
359 input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2);
360 input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2);
361 input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2);
362 input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2);
363 input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2);
364 // Calculate input for the next 8 results.
365 step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2);
366 step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2);
367 step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2);
368 step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2);
369 step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2);
370 step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2);
371 step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2);
372 step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2);
373 }
374 // Work on the first eight values; fdct8(input, even_results);
375 {
376 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
377 /*needs32*/ int t0, t1, t2, t3;
378 /*canbe16*/ int x0, x1, x2, x3;
379
380 // stage 1
381 s0 = input[0] + input[7];
382 s1 = input[1] + input[6];
383 s2 = input[2] + input[5];
384 s3 = input[3] + input[4];
385 s4 = input[3] - input[4];
386 s5 = input[2] - input[5];
387 s6 = input[1] - input[6];
388 s7 = input[0] - input[7];
389
390 // fdct4(step, step);
391 x0 = s0 + s3;
392 x1 = s1 + s2;
393 x2 = s1 - s2;
394 x3 = s0 - s3;
395 t0 = (x0 + x1) * cospi_16_64;
396 t1 = (x0 - x1) * cospi_16_64;
397 t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
398 t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
399 out[0] = fdct_round_shift(t0);
400 out[4] = fdct_round_shift(t2);
401 out[8] = fdct_round_shift(t1);
402 out[12] = fdct_round_shift(t3);
403
404 // Stage 2
405 t0 = (s6 - s5) * cospi_16_64;
406 t1 = (s6 + s5) * cospi_16_64;
407 t2 = fdct_round_shift(t0);
408 t3 = fdct_round_shift(t1);
409
410 // Stage 3
411 x0 = s4 + t2;
412 x1 = s4 - t2;
413 x2 = s7 - t3;
414 x3 = s7 + t3;
415
416 // Stage 4
417 t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
418 t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
419 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
420 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
421 out[2] = fdct_round_shift(t0);
422 out[6] = fdct_round_shift(t2);
423 out[10] = fdct_round_shift(t1);
424 out[14] = fdct_round_shift(t3);
425 }
426 // Work on the next eight values; step1 -> odd_results
427 {
428 // step 2
429 temp1 = (step1[5] - step1[2]) * cospi_16_64;
430 temp2 = (step1[4] - step1[3]) * cospi_16_64;
431 step2[2] = fdct_round_shift(temp1);
432 step2[3] = fdct_round_shift(temp2);
433 temp1 = (step1[4] + step1[3]) * cospi_16_64;
434 temp2 = (step1[5] + step1[2]) * cospi_16_64;
435 step2[4] = fdct_round_shift(temp1);
436 step2[5] = fdct_round_shift(temp2);
437 // step 3
438 step3[0] = step1[0] + step2[3];
439 step3[1] = step1[1] + step2[2];
440 step3[2] = step1[1] - step2[2];
441 step3[3] = step1[0] - step2[3];
442 step3[4] = step1[7] - step2[4];
443 step3[5] = step1[6] - step2[5];
444 step3[6] = step1[6] + step2[5];
445 step3[7] = step1[7] + step2[4];
446 // step 4
447 temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
448 temp2 = step3[2] * -cospi_24_64 - step3[5] * cospi_8_64;
449 step2[1] = fdct_round_shift(temp1);
450 step2[2] = fdct_round_shift(temp2);
451 temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64;
452 temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
453 step2[5] = fdct_round_shift(temp1);
454 step2[6] = fdct_round_shift(temp2);
455 // step 5
456 step1[0] = step3[0] + step2[1];
457 step1[1] = step3[0] - step2[1];
458 step1[2] = step3[3] - step2[2];
459 step1[3] = step3[3] + step2[2];
460 step1[4] = step3[4] + step2[5];
461 step1[5] = step3[4] - step2[5];
462 step1[6] = step3[7] - step2[6];
463 step1[7] = step3[7] + step2[6];
464 // step 6
465 temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
466 temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
467 out[1] = fdct_round_shift(temp1);
468 out[9] = fdct_round_shift(temp2);
469 temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
470 temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
471 out[5] = fdct_round_shift(temp1);
472 out[13] = fdct_round_shift(temp2);
473 temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
474 temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
475 out[3] = fdct_round_shift(temp1);
476 out[11] = fdct_round_shift(temp2);
477 temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
478 temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
479 out[7] = fdct_round_shift(temp1);
480 out[15] = fdct_round_shift(temp2);
481 }
482 // Do next column (which is a transposed row in second/horizontal pass)
483 in++;
484 out += 16;
485 }
486 // Setup in/out for next pass.
487 in = intermediate;
488 out = output;
489 }
490 }
491
fadst8(const int16_t * input,int16_t * output)492 static void fadst8(const int16_t *input, int16_t *output) {
493 int s0, s1, s2, s3, s4, s5, s6, s7;
494
495 int x0 = input[7];
496 int x1 = input[0];
497 int x2 = input[5];
498 int x3 = input[2];
499 int x4 = input[3];
500 int x5 = input[4];
501 int x6 = input[1];
502 int x7 = input[6];
503
504 // stage 1
505 s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
506 s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
507 s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
508 s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
509 s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
510 s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
511 s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
512 s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
513
514 x0 = fdct_round_shift(s0 + s4);
515 x1 = fdct_round_shift(s1 + s5);
516 x2 = fdct_round_shift(s2 + s6);
517 x3 = fdct_round_shift(s3 + s7);
518 x4 = fdct_round_shift(s0 - s4);
519 x5 = fdct_round_shift(s1 - s5);
520 x6 = fdct_round_shift(s2 - s6);
521 x7 = fdct_round_shift(s3 - s7);
522
523 // stage 2
524 s0 = x0;
525 s1 = x1;
526 s2 = x2;
527 s3 = x3;
528 s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
529 s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
530 s6 = - cospi_24_64 * x6 + cospi_8_64 * x7;
531 s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
532
533 x0 = s0 + s2;
534 x1 = s1 + s3;
535 x2 = s0 - s2;
536 x3 = s1 - s3;
537 x4 = fdct_round_shift(s4 + s6);
538 x5 = fdct_round_shift(s5 + s7);
539 x6 = fdct_round_shift(s4 - s6);
540 x7 = fdct_round_shift(s5 - s7);
541
542 // stage 3
543 s2 = cospi_16_64 * (x2 + x3);
544 s3 = cospi_16_64 * (x2 - x3);
545 s6 = cospi_16_64 * (x6 + x7);
546 s7 = cospi_16_64 * (x6 - x7);
547
548 x2 = fdct_round_shift(s2);
549 x3 = fdct_round_shift(s3);
550 x6 = fdct_round_shift(s6);
551 x7 = fdct_round_shift(s7);
552
553 output[0] = x0;
554 output[1] = - x4;
555 output[2] = x6;
556 output[3] = - x2;
557 output[4] = x3;
558 output[5] = - x7;
559 output[6] = x5;
560 output[7] = - x1;
561 }
562
563 static const transform_2d FHT_8[] = {
564 { fdct8, fdct8 }, // DCT_DCT = 0
565 { fadst8, fdct8 }, // ADST_DCT = 1
566 { fdct8, fadst8 }, // DCT_ADST = 2
567 { fadst8, fadst8 } // ADST_ADST = 3
568 };
569
vp9_fht8x8_c(const int16_t * input,int16_t * output,int stride,int tx_type)570 void vp9_fht8x8_c(const int16_t *input, int16_t *output,
571 int stride, int tx_type) {
572 if (tx_type == DCT_DCT) {
573 vp9_fdct8x8_c(input, output, stride);
574 } else {
575 int16_t out[64];
576 int16_t *outptr = &out[0];
577 int i, j;
578 int16_t temp_in[8], temp_out[8];
579 const transform_2d ht = FHT_8[tx_type];
580
581 // Columns
582 for (i = 0; i < 8; ++i) {
583 for (j = 0; j < 8; ++j)
584 temp_in[j] = input[j * stride + i] * 4;
585 ht.cols(temp_in, temp_out);
586 for (j = 0; j < 8; ++j)
587 outptr[j * 8 + i] = temp_out[j];
588 }
589
590 // Rows
591 for (i = 0; i < 8; ++i) {
592 for (j = 0; j < 8; ++j)
593 temp_in[j] = out[j + i * 8];
594 ht.rows(temp_in, temp_out);
595 for (j = 0; j < 8; ++j)
596 output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
597 }
598 }
599 }
600
601 /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
602 pixel. */
vp9_fwht4x4_c(const int16_t * input,int16_t * output,int stride)603 void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride) {
604 int i;
605 int a1, b1, c1, d1, e1;
606 const int16_t *ip = input;
607 int16_t *op = output;
608
609 for (i = 0; i < 4; i++) {
610 a1 = ip[0 * stride];
611 b1 = ip[1 * stride];
612 c1 = ip[2 * stride];
613 d1 = ip[3 * stride];
614
615 a1 += b1;
616 d1 = d1 - c1;
617 e1 = (a1 - d1) >> 1;
618 b1 = e1 - b1;
619 c1 = e1 - c1;
620 a1 -= c1;
621 d1 += b1;
622 op[0] = a1;
623 op[4] = c1;
624 op[8] = d1;
625 op[12] = b1;
626
627 ip++;
628 op++;
629 }
630 ip = output;
631 op = output;
632
633 for (i = 0; i < 4; i++) {
634 a1 = ip[0];
635 b1 = ip[1];
636 c1 = ip[2];
637 d1 = ip[3];
638
639 a1 += b1;
640 d1 -= c1;
641 e1 = (a1 - d1) >> 1;
642 b1 = e1 - b1;
643 c1 = e1 - c1;
644 a1 -= c1;
645 d1 += b1;
646 op[0] = a1 * UNIT_QUANT_FACTOR;
647 op[1] = c1 * UNIT_QUANT_FACTOR;
648 op[2] = d1 * UNIT_QUANT_FACTOR;
649 op[3] = b1 * UNIT_QUANT_FACTOR;
650
651 ip += 4;
652 op += 4;
653 }
654 }
655
656 // Rewrote to use same algorithm as others.
fdct16(const int16_t in[16],int16_t out[16])657 static void fdct16(const int16_t in[16], int16_t out[16]) {
658 /*canbe16*/ int step1[8];
659 /*canbe16*/ int step2[8];
660 /*canbe16*/ int step3[8];
661 /*canbe16*/ int input[8];
662 /*needs32*/ int temp1, temp2;
663
664 // step 1
665 input[0] = in[0] + in[15];
666 input[1] = in[1] + in[14];
667 input[2] = in[2] + in[13];
668 input[3] = in[3] + in[12];
669 input[4] = in[4] + in[11];
670 input[5] = in[5] + in[10];
671 input[6] = in[6] + in[ 9];
672 input[7] = in[7] + in[ 8];
673
674 step1[0] = in[7] - in[ 8];
675 step1[1] = in[6] - in[ 9];
676 step1[2] = in[5] - in[10];
677 step1[3] = in[4] - in[11];
678 step1[4] = in[3] - in[12];
679 step1[5] = in[2] - in[13];
680 step1[6] = in[1] - in[14];
681 step1[7] = in[0] - in[15];
682
683 // fdct8(step, step);
684 {
685 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
686 /*needs32*/ int t0, t1, t2, t3;
687 /*canbe16*/ int x0, x1, x2, x3;
688
689 // stage 1
690 s0 = input[0] + input[7];
691 s1 = input[1] + input[6];
692 s2 = input[2] + input[5];
693 s3 = input[3] + input[4];
694 s4 = input[3] - input[4];
695 s5 = input[2] - input[5];
696 s6 = input[1] - input[6];
697 s7 = input[0] - input[7];
698
699 // fdct4(step, step);
700 x0 = s0 + s3;
701 x1 = s1 + s2;
702 x2 = s1 - s2;
703 x3 = s0 - s3;
704 t0 = (x0 + x1) * cospi_16_64;
705 t1 = (x0 - x1) * cospi_16_64;
706 t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
707 t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
708 out[0] = fdct_round_shift(t0);
709 out[4] = fdct_round_shift(t2);
710 out[8] = fdct_round_shift(t1);
711 out[12] = fdct_round_shift(t3);
712
713 // Stage 2
714 t0 = (s6 - s5) * cospi_16_64;
715 t1 = (s6 + s5) * cospi_16_64;
716 t2 = fdct_round_shift(t0);
717 t3 = fdct_round_shift(t1);
718
719 // Stage 3
720 x0 = s4 + t2;
721 x1 = s4 - t2;
722 x2 = s7 - t3;
723 x3 = s7 + t3;
724
725 // Stage 4
726 t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
727 t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
728 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
729 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
730 out[2] = fdct_round_shift(t0);
731 out[6] = fdct_round_shift(t2);
732 out[10] = fdct_round_shift(t1);
733 out[14] = fdct_round_shift(t3);
734 }
735
736 // step 2
737 temp1 = (step1[5] - step1[2]) * cospi_16_64;
738 temp2 = (step1[4] - step1[3]) * cospi_16_64;
739 step2[2] = fdct_round_shift(temp1);
740 step2[3] = fdct_round_shift(temp2);
741 temp1 = (step1[4] + step1[3]) * cospi_16_64;
742 temp2 = (step1[5] + step1[2]) * cospi_16_64;
743 step2[4] = fdct_round_shift(temp1);
744 step2[5] = fdct_round_shift(temp2);
745
746 // step 3
747 step3[0] = step1[0] + step2[3];
748 step3[1] = step1[1] + step2[2];
749 step3[2] = step1[1] - step2[2];
750 step3[3] = step1[0] - step2[3];
751 step3[4] = step1[7] - step2[4];
752 step3[5] = step1[6] - step2[5];
753 step3[6] = step1[6] + step2[5];
754 step3[7] = step1[7] + step2[4];
755
756 // step 4
757 temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
758 temp2 = step3[2] * -cospi_24_64 - step3[5] * cospi_8_64;
759 step2[1] = fdct_round_shift(temp1);
760 step2[2] = fdct_round_shift(temp2);
761 temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64;
762 temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
763 step2[5] = fdct_round_shift(temp1);
764 step2[6] = fdct_round_shift(temp2);
765
766 // step 5
767 step1[0] = step3[0] + step2[1];
768 step1[1] = step3[0] - step2[1];
769 step1[2] = step3[3] - step2[2];
770 step1[3] = step3[3] + step2[2];
771 step1[4] = step3[4] + step2[5];
772 step1[5] = step3[4] - step2[5];
773 step1[6] = step3[7] - step2[6];
774 step1[7] = step3[7] + step2[6];
775
776 // step 6
777 temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
778 temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
779 out[1] = fdct_round_shift(temp1);
780 out[9] = fdct_round_shift(temp2);
781
782 temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
783 temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
784 out[5] = fdct_round_shift(temp1);
785 out[13] = fdct_round_shift(temp2);
786
787 temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
788 temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
789 out[3] = fdct_round_shift(temp1);
790 out[11] = fdct_round_shift(temp2);
791
792 temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
793 temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
794 out[7] = fdct_round_shift(temp1);
795 out[15] = fdct_round_shift(temp2);
796 }
797
fadst16(const int16_t * input,int16_t * output)798 static void fadst16(const int16_t *input, int16_t *output) {
799 int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
800
801 int x0 = input[15];
802 int x1 = input[0];
803 int x2 = input[13];
804 int x3 = input[2];
805 int x4 = input[11];
806 int x5 = input[4];
807 int x6 = input[9];
808 int x7 = input[6];
809 int x8 = input[7];
810 int x9 = input[8];
811 int x10 = input[5];
812 int x11 = input[10];
813 int x12 = input[3];
814 int x13 = input[12];
815 int x14 = input[1];
816 int x15 = input[14];
817
818 // stage 1
819 s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
820 s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
821 s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
822 s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
823 s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
824 s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
825 s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
826 s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
827 s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
828 s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
829 s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
830 s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
831 s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
832 s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
833 s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
834 s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
835
836 x0 = fdct_round_shift(s0 + s8);
837 x1 = fdct_round_shift(s1 + s9);
838 x2 = fdct_round_shift(s2 + s10);
839 x3 = fdct_round_shift(s3 + s11);
840 x4 = fdct_round_shift(s4 + s12);
841 x5 = fdct_round_shift(s5 + s13);
842 x6 = fdct_round_shift(s6 + s14);
843 x7 = fdct_round_shift(s7 + s15);
844 x8 = fdct_round_shift(s0 - s8);
845 x9 = fdct_round_shift(s1 - s9);
846 x10 = fdct_round_shift(s2 - s10);
847 x11 = fdct_round_shift(s3 - s11);
848 x12 = fdct_round_shift(s4 - s12);
849 x13 = fdct_round_shift(s5 - s13);
850 x14 = fdct_round_shift(s6 - s14);
851 x15 = fdct_round_shift(s7 - s15);
852
853 // stage 2
854 s0 = x0;
855 s1 = x1;
856 s2 = x2;
857 s3 = x3;
858 s4 = x4;
859 s5 = x5;
860 s6 = x6;
861 s7 = x7;
862 s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
863 s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
864 s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
865 s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
866 s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
867 s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
868 s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
869 s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
870
871 x0 = s0 + s4;
872 x1 = s1 + s5;
873 x2 = s2 + s6;
874 x3 = s3 + s7;
875 x4 = s0 - s4;
876 x5 = s1 - s5;
877 x6 = s2 - s6;
878 x7 = s3 - s7;
879 x8 = fdct_round_shift(s8 + s12);
880 x9 = fdct_round_shift(s9 + s13);
881 x10 = fdct_round_shift(s10 + s14);
882 x11 = fdct_round_shift(s11 + s15);
883 x12 = fdct_round_shift(s8 - s12);
884 x13 = fdct_round_shift(s9 - s13);
885 x14 = fdct_round_shift(s10 - s14);
886 x15 = fdct_round_shift(s11 - s15);
887
888 // stage 3
889 s0 = x0;
890 s1 = x1;
891 s2 = x2;
892 s3 = x3;
893 s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
894 s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
895 s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
896 s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
897 s8 = x8;
898 s9 = x9;
899 s10 = x10;
900 s11 = x11;
901 s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
902 s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
903 s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
904 s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
905
906 x0 = s0 + s2;
907 x1 = s1 + s3;
908 x2 = s0 - s2;
909 x3 = s1 - s3;
910 x4 = fdct_round_shift(s4 + s6);
911 x5 = fdct_round_shift(s5 + s7);
912 x6 = fdct_round_shift(s4 - s6);
913 x7 = fdct_round_shift(s5 - s7);
914 x8 = s8 + s10;
915 x9 = s9 + s11;
916 x10 = s8 - s10;
917 x11 = s9 - s11;
918 x12 = fdct_round_shift(s12 + s14);
919 x13 = fdct_round_shift(s13 + s15);
920 x14 = fdct_round_shift(s12 - s14);
921 x15 = fdct_round_shift(s13 - s15);
922
923 // stage 4
924 s2 = (- cospi_16_64) * (x2 + x3);
925 s3 = cospi_16_64 * (x2 - x3);
926 s6 = cospi_16_64 * (x6 + x7);
927 s7 = cospi_16_64 * (- x6 + x7);
928 s10 = cospi_16_64 * (x10 + x11);
929 s11 = cospi_16_64 * (- x10 + x11);
930 s14 = (- cospi_16_64) * (x14 + x15);
931 s15 = cospi_16_64 * (x14 - x15);
932
933 x2 = fdct_round_shift(s2);
934 x3 = fdct_round_shift(s3);
935 x6 = fdct_round_shift(s6);
936 x7 = fdct_round_shift(s7);
937 x10 = fdct_round_shift(s10);
938 x11 = fdct_round_shift(s11);
939 x14 = fdct_round_shift(s14);
940 x15 = fdct_round_shift(s15);
941
942 output[0] = x0;
943 output[1] = - x8;
944 output[2] = x12;
945 output[3] = - x4;
946 output[4] = x6;
947 output[5] = x14;
948 output[6] = x10;
949 output[7] = x2;
950 output[8] = x3;
951 output[9] = x11;
952 output[10] = x15;
953 output[11] = x7;
954 output[12] = x5;
955 output[13] = - x13;
956 output[14] = x9;
957 output[15] = - x1;
958 }
959
960 static const transform_2d FHT_16[] = {
961 { fdct16, fdct16 }, // DCT_DCT = 0
962 { fadst16, fdct16 }, // ADST_DCT = 1
963 { fdct16, fadst16 }, // DCT_ADST = 2
964 { fadst16, fadst16 } // ADST_ADST = 3
965 };
966
vp9_fht16x16_c(const int16_t * input,int16_t * output,int stride,int tx_type)967 void vp9_fht16x16_c(const int16_t *input, int16_t *output,
968 int stride, int tx_type) {
969 if (tx_type == DCT_DCT) {
970 vp9_fdct16x16_c(input, output, stride);
971 } else {
972 int16_t out[256];
973 int16_t *outptr = &out[0];
974 int i, j;
975 int16_t temp_in[16], temp_out[16];
976 const transform_2d ht = FHT_16[tx_type];
977
978 // Columns
979 for (i = 0; i < 16; ++i) {
980 for (j = 0; j < 16; ++j)
981 temp_in[j] = input[j * stride + i] * 4;
982 ht.cols(temp_in, temp_out);
983 for (j = 0; j < 16; ++j)
984 outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
985 }
986
987 // Rows
988 for (i = 0; i < 16; ++i) {
989 for (j = 0; j < 16; ++j)
990 temp_in[j] = out[j + i * 16];
991 ht.rows(temp_in, temp_out);
992 for (j = 0; j < 16; ++j)
993 output[j + i * 16] = temp_out[j];
994 }
995 }
996 }
997
dct_32_round(int input)998 static INLINE int dct_32_round(int input) {
999 int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
1000 assert(-131072 <= rv && rv <= 131071);
1001 return rv;
1002 }
1003
half_round_shift(int input)1004 static INLINE int half_round_shift(int input) {
1005 int rv = (input + 1 + (input < 0)) >> 2;
1006 return rv;
1007 }
1008
fdct32(const int * input,int * output,int round)1009 static void fdct32(const int *input, int *output, int round) {
1010 int step[32];
1011 // Stage 1
1012 step[0] = input[0] + input[(32 - 1)];
1013 step[1] = input[1] + input[(32 - 2)];
1014 step[2] = input[2] + input[(32 - 3)];
1015 step[3] = input[3] + input[(32 - 4)];
1016 step[4] = input[4] + input[(32 - 5)];
1017 step[5] = input[5] + input[(32 - 6)];
1018 step[6] = input[6] + input[(32 - 7)];
1019 step[7] = input[7] + input[(32 - 8)];
1020 step[8] = input[8] + input[(32 - 9)];
1021 step[9] = input[9] + input[(32 - 10)];
1022 step[10] = input[10] + input[(32 - 11)];
1023 step[11] = input[11] + input[(32 - 12)];
1024 step[12] = input[12] + input[(32 - 13)];
1025 step[13] = input[13] + input[(32 - 14)];
1026 step[14] = input[14] + input[(32 - 15)];
1027 step[15] = input[15] + input[(32 - 16)];
1028 step[16] = -input[16] + input[(32 - 17)];
1029 step[17] = -input[17] + input[(32 - 18)];
1030 step[18] = -input[18] + input[(32 - 19)];
1031 step[19] = -input[19] + input[(32 - 20)];
1032 step[20] = -input[20] + input[(32 - 21)];
1033 step[21] = -input[21] + input[(32 - 22)];
1034 step[22] = -input[22] + input[(32 - 23)];
1035 step[23] = -input[23] + input[(32 - 24)];
1036 step[24] = -input[24] + input[(32 - 25)];
1037 step[25] = -input[25] + input[(32 - 26)];
1038 step[26] = -input[26] + input[(32 - 27)];
1039 step[27] = -input[27] + input[(32 - 28)];
1040 step[28] = -input[28] + input[(32 - 29)];
1041 step[29] = -input[29] + input[(32 - 30)];
1042 step[30] = -input[30] + input[(32 - 31)];
1043 step[31] = -input[31] + input[(32 - 32)];
1044
1045 // Stage 2
1046 output[0] = step[0] + step[16 - 1];
1047 output[1] = step[1] + step[16 - 2];
1048 output[2] = step[2] + step[16 - 3];
1049 output[3] = step[3] + step[16 - 4];
1050 output[4] = step[4] + step[16 - 5];
1051 output[5] = step[5] + step[16 - 6];
1052 output[6] = step[6] + step[16 - 7];
1053 output[7] = step[7] + step[16 - 8];
1054 output[8] = -step[8] + step[16 - 9];
1055 output[9] = -step[9] + step[16 - 10];
1056 output[10] = -step[10] + step[16 - 11];
1057 output[11] = -step[11] + step[16 - 12];
1058 output[12] = -step[12] + step[16 - 13];
1059 output[13] = -step[13] + step[16 - 14];
1060 output[14] = -step[14] + step[16 - 15];
1061 output[15] = -step[15] + step[16 - 16];
1062
1063 output[16] = step[16];
1064 output[17] = step[17];
1065 output[18] = step[18];
1066 output[19] = step[19];
1067
1068 output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64);
1069 output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64);
1070 output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64);
1071 output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64);
1072
1073 output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64);
1074 output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64);
1075 output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64);
1076 output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64);
1077
1078 output[28] = step[28];
1079 output[29] = step[29];
1080 output[30] = step[30];
1081 output[31] = step[31];
1082
1083 // dump the magnitude by 4, hence the intermediate values are within
1084 // the range of 16 bits.
1085 if (round) {
1086 output[0] = half_round_shift(output[0]);
1087 output[1] = half_round_shift(output[1]);
1088 output[2] = half_round_shift(output[2]);
1089 output[3] = half_round_shift(output[3]);
1090 output[4] = half_round_shift(output[4]);
1091 output[5] = half_round_shift(output[5]);
1092 output[6] = half_round_shift(output[6]);
1093 output[7] = half_round_shift(output[7]);
1094 output[8] = half_round_shift(output[8]);
1095 output[9] = half_round_shift(output[9]);
1096 output[10] = half_round_shift(output[10]);
1097 output[11] = half_round_shift(output[11]);
1098 output[12] = half_round_shift(output[12]);
1099 output[13] = half_round_shift(output[13]);
1100 output[14] = half_round_shift(output[14]);
1101 output[15] = half_round_shift(output[15]);
1102
1103 output[16] = half_round_shift(output[16]);
1104 output[17] = half_round_shift(output[17]);
1105 output[18] = half_round_shift(output[18]);
1106 output[19] = half_round_shift(output[19]);
1107 output[20] = half_round_shift(output[20]);
1108 output[21] = half_round_shift(output[21]);
1109 output[22] = half_round_shift(output[22]);
1110 output[23] = half_round_shift(output[23]);
1111 output[24] = half_round_shift(output[24]);
1112 output[25] = half_round_shift(output[25]);
1113 output[26] = half_round_shift(output[26]);
1114 output[27] = half_round_shift(output[27]);
1115 output[28] = half_round_shift(output[28]);
1116 output[29] = half_round_shift(output[29]);
1117 output[30] = half_round_shift(output[30]);
1118 output[31] = half_round_shift(output[31]);
1119 }
1120
1121 // Stage 3
1122 step[0] = output[0] + output[(8 - 1)];
1123 step[1] = output[1] + output[(8 - 2)];
1124 step[2] = output[2] + output[(8 - 3)];
1125 step[3] = output[3] + output[(8 - 4)];
1126 step[4] = -output[4] + output[(8 - 5)];
1127 step[5] = -output[5] + output[(8 - 6)];
1128 step[6] = -output[6] + output[(8 - 7)];
1129 step[7] = -output[7] + output[(8 - 8)];
1130 step[8] = output[8];
1131 step[9] = output[9];
1132 step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64);
1133 step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64);
1134 step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64);
1135 step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64);
1136 step[14] = output[14];
1137 step[15] = output[15];
1138
1139 step[16] = output[16] + output[23];
1140 step[17] = output[17] + output[22];
1141 step[18] = output[18] + output[21];
1142 step[19] = output[19] + output[20];
1143 step[20] = -output[20] + output[19];
1144 step[21] = -output[21] + output[18];
1145 step[22] = -output[22] + output[17];
1146 step[23] = -output[23] + output[16];
1147 step[24] = -output[24] + output[31];
1148 step[25] = -output[25] + output[30];
1149 step[26] = -output[26] + output[29];
1150 step[27] = -output[27] + output[28];
1151 step[28] = output[28] + output[27];
1152 step[29] = output[29] + output[26];
1153 step[30] = output[30] + output[25];
1154 step[31] = output[31] + output[24];
1155
1156 // Stage 4
1157 output[0] = step[0] + step[3];
1158 output[1] = step[1] + step[2];
1159 output[2] = -step[2] + step[1];
1160 output[3] = -step[3] + step[0];
1161 output[4] = step[4];
1162 output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64);
1163 output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64);
1164 output[7] = step[7];
1165 output[8] = step[8] + step[11];
1166 output[9] = step[9] + step[10];
1167 output[10] = -step[10] + step[9];
1168 output[11] = -step[11] + step[8];
1169 output[12] = -step[12] + step[15];
1170 output[13] = -step[13] + step[14];
1171 output[14] = step[14] + step[13];
1172 output[15] = step[15] + step[12];
1173
1174 output[16] = step[16];
1175 output[17] = step[17];
1176 output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64);
1177 output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64);
1178 output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64);
1179 output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64);
1180 output[22] = step[22];
1181 output[23] = step[23];
1182 output[24] = step[24];
1183 output[25] = step[25];
1184 output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64);
1185 output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64);
1186 output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64);
1187 output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64);
1188 output[30] = step[30];
1189 output[31] = step[31];
1190
1191 // Stage 5
1192 step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64);
1193 step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64);
1194 step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64);
1195 step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64);
1196 step[4] = output[4] + output[5];
1197 step[5] = -output[5] + output[4];
1198 step[6] = -output[6] + output[7];
1199 step[7] = output[7] + output[6];
1200 step[8] = output[8];
1201 step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64);
1202 step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64);
1203 step[11] = output[11];
1204 step[12] = output[12];
1205 step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64);
1206 step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64);
1207 step[15] = output[15];
1208
1209 step[16] = output[16] + output[19];
1210 step[17] = output[17] + output[18];
1211 step[18] = -output[18] + output[17];
1212 step[19] = -output[19] + output[16];
1213 step[20] = -output[20] + output[23];
1214 step[21] = -output[21] + output[22];
1215 step[22] = output[22] + output[21];
1216 step[23] = output[23] + output[20];
1217 step[24] = output[24] + output[27];
1218 step[25] = output[25] + output[26];
1219 step[26] = -output[26] + output[25];
1220 step[27] = -output[27] + output[24];
1221 step[28] = -output[28] + output[31];
1222 step[29] = -output[29] + output[30];
1223 step[30] = output[30] + output[29];
1224 step[31] = output[31] + output[28];
1225
1226 // Stage 6
1227 output[0] = step[0];
1228 output[1] = step[1];
1229 output[2] = step[2];
1230 output[3] = step[3];
1231 output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64);
1232 output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64);
1233 output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64);
1234 output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64);
1235 output[8] = step[8] + step[9];
1236 output[9] = -step[9] + step[8];
1237 output[10] = -step[10] + step[11];
1238 output[11] = step[11] + step[10];
1239 output[12] = step[12] + step[13];
1240 output[13] = -step[13] + step[12];
1241 output[14] = -step[14] + step[15];
1242 output[15] = step[15] + step[14];
1243
1244 output[16] = step[16];
1245 output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64);
1246 output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64);
1247 output[19] = step[19];
1248 output[20] = step[20];
1249 output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64);
1250 output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64);
1251 output[23] = step[23];
1252 output[24] = step[24];
1253 output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64);
1254 output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64);
1255 output[27] = step[27];
1256 output[28] = step[28];
1257 output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64);
1258 output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64);
1259 output[31] = step[31];
1260
1261 // Stage 7
1262 step[0] = output[0];
1263 step[1] = output[1];
1264 step[2] = output[2];
1265 step[3] = output[3];
1266 step[4] = output[4];
1267 step[5] = output[5];
1268 step[6] = output[6];
1269 step[7] = output[7];
1270 step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64);
1271 step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64);
1272 step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64);
1273 step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64);
1274 step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64);
1275 step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64);
1276 step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64);
1277 step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64);
1278
1279 step[16] = output[16] + output[17];
1280 step[17] = -output[17] + output[16];
1281 step[18] = -output[18] + output[19];
1282 step[19] = output[19] + output[18];
1283 step[20] = output[20] + output[21];
1284 step[21] = -output[21] + output[20];
1285 step[22] = -output[22] + output[23];
1286 step[23] = output[23] + output[22];
1287 step[24] = output[24] + output[25];
1288 step[25] = -output[25] + output[24];
1289 step[26] = -output[26] + output[27];
1290 step[27] = output[27] + output[26];
1291 step[28] = output[28] + output[29];
1292 step[29] = -output[29] + output[28];
1293 step[30] = -output[30] + output[31];
1294 step[31] = output[31] + output[30];
1295
1296 // Final stage --- outputs indices are bit-reversed.
1297 output[0] = step[0];
1298 output[16] = step[1];
1299 output[8] = step[2];
1300 output[24] = step[3];
1301 output[4] = step[4];
1302 output[20] = step[5];
1303 output[12] = step[6];
1304 output[28] = step[7];
1305 output[2] = step[8];
1306 output[18] = step[9];
1307 output[10] = step[10];
1308 output[26] = step[11];
1309 output[6] = step[12];
1310 output[22] = step[13];
1311 output[14] = step[14];
1312 output[30] = step[15];
1313
1314 output[1] = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64);
1315 output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64);
1316 output[9] = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64);
1317 output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64);
1318 output[5] = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64);
1319 output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64);
1320 output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64);
1321 output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64);
1322 output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64);
1323 output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64);
1324 output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64);
1325 output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64);
1326 output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64);
1327 output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64);
1328 output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64);
1329 output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
1330 }
1331
vp9_fdct32x32_c(const int16_t * input,int16_t * out,int stride)1332 void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) {
1333 int i, j;
1334 int output[32 * 32];
1335
1336 // Columns
1337 for (i = 0; i < 32; ++i) {
1338 int temp_in[32], temp_out[32];
1339 for (j = 0; j < 32; ++j)
1340 temp_in[j] = input[j * stride + i] * 4;
1341 fdct32(temp_in, temp_out, 0);
1342 for (j = 0; j < 32; ++j)
1343 output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
1344 }
1345
1346 // Rows
1347 for (i = 0; i < 32; ++i) {
1348 int temp_in[32], temp_out[32];
1349 for (j = 0; j < 32; ++j)
1350 temp_in[j] = output[j + i * 32];
1351 fdct32(temp_in, temp_out, 0);
1352 for (j = 0; j < 32; ++j)
1353 out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
1354 }
1355 }
1356
1357 // Note that although we use dct_32_round in dct32 computation flow,
1358 // this 2d fdct32x32 for rate-distortion optimization loop is operating
1359 // within 16 bits precision.
vp9_fdct32x32_rd_c(const int16_t * input,int16_t * out,int stride)1360 void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) {
1361 int i, j;
1362 int output[32 * 32];
1363
1364 // Columns
1365 for (i = 0; i < 32; ++i) {
1366 int temp_in[32], temp_out[32];
1367 for (j = 0; j < 32; ++j)
1368 temp_in[j] = input[j * stride + i] * 4;
1369 fdct32(temp_in, temp_out, 0);
1370 for (j = 0; j < 32; ++j)
1371 // TODO(cd): see quality impact of only doing
1372 // output[j * 32 + i] = (temp_out[j] + 1) >> 2;
1373 // PS: also change code in vp9/encoder/x86/vp9_dct_sse2.c
1374 output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
1375 }
1376
1377 // Rows
1378 for (i = 0; i < 32; ++i) {
1379 int temp_in[32], temp_out[32];
1380 for (j = 0; j < 32; ++j)
1381 temp_in[j] = output[j + i * 32];
1382 fdct32(temp_in, temp_out, 1);
1383 for (j = 0; j < 32; ++j)
1384 out[j + i * 32] = temp_out[j];
1385 }
1386 }
1387