1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <assert.h>
12 #include <math.h>
13
14 #include "./vpx_config.h"
15 #include "./vp9_rtcd.h"
16
17 #include "vp9/common/vp9_blockd.h"
18 #include "vp9/common/vp9_idct.h"
19 #include "vp9/common/vp9_systemdependent.h"
20
fdct_round_shift(int input)21 static INLINE int fdct_round_shift(int input) {
22 int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
23 assert(INT16_MIN <= rv && rv <= INT16_MAX);
24 return rv;
25 }
26
fdct4(const int16_t * input,int16_t * output)27 static void fdct4(const int16_t *input, int16_t *output) {
28 int16_t step[4];
29 int temp1, temp2;
30
31 step[0] = input[0] + input[3];
32 step[1] = input[1] + input[2];
33 step[2] = input[1] - input[2];
34 step[3] = input[0] - input[3];
35
36 temp1 = (step[0] + step[1]) * cospi_16_64;
37 temp2 = (step[0] - step[1]) * cospi_16_64;
38 output[0] = fdct_round_shift(temp1);
39 output[2] = fdct_round_shift(temp2);
40 temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
41 temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
42 output[1] = fdct_round_shift(temp1);
43 output[3] = fdct_round_shift(temp2);
44 }
45
vp9_fdct4x4_1_c(const int16_t * input,int16_t * output,int stride)46 void vp9_fdct4x4_1_c(const int16_t *input, int16_t *output, int stride) {
47 int r, c;
48 int16_t sum = 0;
49 for (r = 0; r < 4; ++r)
50 for (c = 0; c < 4; ++c)
51 sum += input[r * stride + c];
52
53 output[0] = sum << 1;
54 output[1] = 0;
55 }
56
vp9_fdct4x4_c(const int16_t * input,int16_t * output,int stride)57 void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) {
58 // The 2D transform is done with two passes which are actually pretty
59 // similar. In the first one, we transform the columns and transpose
60 // the results. In the second one, we transform the rows. To achieve that,
61 // as the first pass results are transposed, we transpose the columns (that
62 // is the transposed rows) and transpose the results (so that it goes back
63 // in normal/row positions).
64 int pass;
65 // We need an intermediate buffer between passes.
66 int16_t intermediate[4 * 4];
67 const int16_t *in = input;
68 int16_t *out = intermediate;
69 // Do the two transform/transpose passes
70 for (pass = 0; pass < 2; ++pass) {
71 /*canbe16*/ int input[4];
72 /*canbe16*/ int step[4];
73 /*needs32*/ int temp1, temp2;
74 int i;
75 for (i = 0; i < 4; ++i) {
76 // Load inputs.
77 if (0 == pass) {
78 input[0] = in[0 * stride] * 16;
79 input[1] = in[1 * stride] * 16;
80 input[2] = in[2 * stride] * 16;
81 input[3] = in[3 * stride] * 16;
82 if (i == 0 && input[0]) {
83 input[0] += 1;
84 }
85 } else {
86 input[0] = in[0 * 4];
87 input[1] = in[1 * 4];
88 input[2] = in[2 * 4];
89 input[3] = in[3 * 4];
90 }
91 // Transform.
92 step[0] = input[0] + input[3];
93 step[1] = input[1] + input[2];
94 step[2] = input[1] - input[2];
95 step[3] = input[0] - input[3];
96 temp1 = (step[0] + step[1]) * cospi_16_64;
97 temp2 = (step[0] - step[1]) * cospi_16_64;
98 out[0] = fdct_round_shift(temp1);
99 out[2] = fdct_round_shift(temp2);
100 temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
101 temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
102 out[1] = fdct_round_shift(temp1);
103 out[3] = fdct_round_shift(temp2);
104 // Do next column (which is a transposed row in second/horizontal pass)
105 in++;
106 out += 4;
107 }
108 // Setup in/out for next pass.
109 in = intermediate;
110 out = output;
111 }
112
113 {
114 int i, j;
115 for (i = 0; i < 4; ++i) {
116 for (j = 0; j < 4; ++j)
117 output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
118 }
119 }
120 }
121
fadst4(const int16_t * input,int16_t * output)122 static void fadst4(const int16_t *input, int16_t *output) {
123 int x0, x1, x2, x3;
124 int s0, s1, s2, s3, s4, s5, s6, s7;
125
126 x0 = input[0];
127 x1 = input[1];
128 x2 = input[2];
129 x3 = input[3];
130
131 if (!(x0 | x1 | x2 | x3)) {
132 output[0] = output[1] = output[2] = output[3] = 0;
133 return;
134 }
135
136 s0 = sinpi_1_9 * x0;
137 s1 = sinpi_4_9 * x0;
138 s2 = sinpi_2_9 * x1;
139 s3 = sinpi_1_9 * x1;
140 s4 = sinpi_3_9 * x2;
141 s5 = sinpi_4_9 * x3;
142 s6 = sinpi_2_9 * x3;
143 s7 = x0 + x1 - x3;
144
145 x0 = s0 + s2 + s5;
146 x1 = sinpi_3_9 * s7;
147 x2 = s1 - s3 + s6;
148 x3 = s4;
149
150 s0 = x0 + x3;
151 s1 = x1;
152 s2 = x2 - x3;
153 s3 = x2 - x0 + x3;
154
155 // 1-D transform scaling factor is sqrt(2).
156 output[0] = fdct_round_shift(s0);
157 output[1] = fdct_round_shift(s1);
158 output[2] = fdct_round_shift(s2);
159 output[3] = fdct_round_shift(s3);
160 }
161
162 static const transform_2d FHT_4[] = {
163 { fdct4, fdct4 }, // DCT_DCT = 0
164 { fadst4, fdct4 }, // ADST_DCT = 1
165 { fdct4, fadst4 }, // DCT_ADST = 2
166 { fadst4, fadst4 } // ADST_ADST = 3
167 };
168
vp9_fht4x4_c(const int16_t * input,int16_t * output,int stride,int tx_type)169 void vp9_fht4x4_c(const int16_t *input, int16_t *output,
170 int stride, int tx_type) {
171 if (tx_type == DCT_DCT) {
172 vp9_fdct4x4_c(input, output, stride);
173 } else {
174 int16_t out[4 * 4];
175 int16_t *outptr = &out[0];
176 int i, j;
177 int16_t temp_in[4], temp_out[4];
178 const transform_2d ht = FHT_4[tx_type];
179
180 // Columns
181 for (i = 0; i < 4; ++i) {
182 for (j = 0; j < 4; ++j)
183 temp_in[j] = input[j * stride + i] * 16;
184 if (i == 0 && temp_in[0])
185 temp_in[0] += 1;
186 ht.cols(temp_in, temp_out);
187 for (j = 0; j < 4; ++j)
188 outptr[j * 4 + i] = temp_out[j];
189 }
190
191 // Rows
192 for (i = 0; i < 4; ++i) {
193 for (j = 0; j < 4; ++j)
194 temp_in[j] = out[j + i * 4];
195 ht.rows(temp_in, temp_out);
196 for (j = 0; j < 4; ++j)
197 output[j + i * 4] = (temp_out[j] + 1) >> 2;
198 }
199 }
200 }
201
fdct8(const int16_t * input,int16_t * output)202 static void fdct8(const int16_t *input, int16_t *output) {
203 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
204 /*needs32*/ int t0, t1, t2, t3;
205 /*canbe16*/ int x0, x1, x2, x3;
206
207 // stage 1
208 s0 = input[0] + input[7];
209 s1 = input[1] + input[6];
210 s2 = input[2] + input[5];
211 s3 = input[3] + input[4];
212 s4 = input[3] - input[4];
213 s5 = input[2] - input[5];
214 s6 = input[1] - input[6];
215 s7 = input[0] - input[7];
216
217 // fdct4(step, step);
218 x0 = s0 + s3;
219 x1 = s1 + s2;
220 x2 = s1 - s2;
221 x3 = s0 - s3;
222 t0 = (x0 + x1) * cospi_16_64;
223 t1 = (x0 - x1) * cospi_16_64;
224 t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
225 t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
226 output[0] = fdct_round_shift(t0);
227 output[2] = fdct_round_shift(t2);
228 output[4] = fdct_round_shift(t1);
229 output[6] = fdct_round_shift(t3);
230
231 // Stage 2
232 t0 = (s6 - s5) * cospi_16_64;
233 t1 = (s6 + s5) * cospi_16_64;
234 t2 = fdct_round_shift(t0);
235 t3 = fdct_round_shift(t1);
236
237 // Stage 3
238 x0 = s4 + t2;
239 x1 = s4 - t2;
240 x2 = s7 - t3;
241 x3 = s7 + t3;
242
243 // Stage 4
244 t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
245 t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
246 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
247 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
248 output[1] = fdct_round_shift(t0);
249 output[3] = fdct_round_shift(t2);
250 output[5] = fdct_round_shift(t1);
251 output[7] = fdct_round_shift(t3);
252 }
253
vp9_fdct8x8_1_c(const int16_t * input,int16_t * output,int stride)254 void vp9_fdct8x8_1_c(const int16_t *input, int16_t *output, int stride) {
255 int r, c;
256 int16_t sum = 0;
257 for (r = 0; r < 8; ++r)
258 for (c = 0; c < 8; ++c)
259 sum += input[r * stride + c];
260
261 output[0] = sum;
262 output[1] = 0;
263 }
264
vp9_fdct8x8_c(const int16_t * input,int16_t * final_output,int stride)265 void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) {
266 int i, j;
267 int16_t intermediate[64];
268
269 // Transform columns
270 {
271 int16_t *output = intermediate;
272 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
273 /*needs32*/ int t0, t1, t2, t3;
274 /*canbe16*/ int x0, x1, x2, x3;
275
276 int i;
277 for (i = 0; i < 8; i++) {
278 // stage 1
279 s0 = (input[0 * stride] + input[7 * stride]) * 4;
280 s1 = (input[1 * stride] + input[6 * stride]) * 4;
281 s2 = (input[2 * stride] + input[5 * stride]) * 4;
282 s3 = (input[3 * stride] + input[4 * stride]) * 4;
283 s4 = (input[3 * stride] - input[4 * stride]) * 4;
284 s5 = (input[2 * stride] - input[5 * stride]) * 4;
285 s6 = (input[1 * stride] - input[6 * stride]) * 4;
286 s7 = (input[0 * stride] - input[7 * stride]) * 4;
287
288 // fdct4(step, step);
289 x0 = s0 + s3;
290 x1 = s1 + s2;
291 x2 = s1 - s2;
292 x3 = s0 - s3;
293 t0 = (x0 + x1) * cospi_16_64;
294 t1 = (x0 - x1) * cospi_16_64;
295 t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
296 t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
297 output[0 * 8] = fdct_round_shift(t0);
298 output[2 * 8] = fdct_round_shift(t2);
299 output[4 * 8] = fdct_round_shift(t1);
300 output[6 * 8] = fdct_round_shift(t3);
301
302 // Stage 2
303 t0 = (s6 - s5) * cospi_16_64;
304 t1 = (s6 + s5) * cospi_16_64;
305 t2 = fdct_round_shift(t0);
306 t3 = fdct_round_shift(t1);
307
308 // Stage 3
309 x0 = s4 + t2;
310 x1 = s4 - t2;
311 x2 = s7 - t3;
312 x3 = s7 + t3;
313
314 // Stage 4
315 t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
316 t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
317 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
318 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
319 output[1 * 8] = fdct_round_shift(t0);
320 output[3 * 8] = fdct_round_shift(t2);
321 output[5 * 8] = fdct_round_shift(t1);
322 output[7 * 8] = fdct_round_shift(t3);
323 input++;
324 output++;
325 }
326 }
327
328 // Rows
329 for (i = 0; i < 8; ++i) {
330 fdct8(&intermediate[i * 8], &final_output[i * 8]);
331 for (j = 0; j < 8; ++j)
332 final_output[j + i * 8] /= 2;
333 }
334 }
335
vp9_fdct16x16_1_c(const int16_t * input,int16_t * output,int stride)336 void vp9_fdct16x16_1_c(const int16_t *input, int16_t *output, int stride) {
337 int r, c;
338 int16_t sum = 0;
339 for (r = 0; r < 16; ++r)
340 for (c = 0; c < 16; ++c)
341 sum += input[r * stride + c];
342
343 output[0] = sum >> 1;
344 output[1] = 0;
345 }
346
vp9_fdct16x16_c(const int16_t * input,int16_t * output,int stride)347 void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {
348 // The 2D transform is done with two passes which are actually pretty
349 // similar. In the first one, we transform the columns and transpose
350 // the results. In the second one, we transform the rows. To achieve that,
351 // as the first pass results are transposed, we transpose the columns (that
352 // is the transposed rows) and transpose the results (so that it goes back
353 // in normal/row positions).
354 int pass;
355 // We need an intermediate buffer between passes.
356 int16_t intermediate[256];
357 const int16_t *in = input;
358 int16_t *out = intermediate;
359 // Do the two transform/transpose passes
360 for (pass = 0; pass < 2; ++pass) {
361 /*canbe16*/ int step1[8];
362 /*canbe16*/ int step2[8];
363 /*canbe16*/ int step3[8];
364 /*canbe16*/ int input[8];
365 /*needs32*/ int temp1, temp2;
366 int i;
367 for (i = 0; i < 16; i++) {
368 if (0 == pass) {
369 // Calculate input for the first 8 results.
370 input[0] = (in[0 * stride] + in[15 * stride]) * 4;
371 input[1] = (in[1 * stride] + in[14 * stride]) * 4;
372 input[2] = (in[2 * stride] + in[13 * stride]) * 4;
373 input[3] = (in[3 * stride] + in[12 * stride]) * 4;
374 input[4] = (in[4 * stride] + in[11 * stride]) * 4;
375 input[5] = (in[5 * stride] + in[10 * stride]) * 4;
376 input[6] = (in[6 * stride] + in[ 9 * stride]) * 4;
377 input[7] = (in[7 * stride] + in[ 8 * stride]) * 4;
378 // Calculate input for the next 8 results.
379 step1[0] = (in[7 * stride] - in[ 8 * stride]) * 4;
380 step1[1] = (in[6 * stride] - in[ 9 * stride]) * 4;
381 step1[2] = (in[5 * stride] - in[10 * stride]) * 4;
382 step1[3] = (in[4 * stride] - in[11 * stride]) * 4;
383 step1[4] = (in[3 * stride] - in[12 * stride]) * 4;
384 step1[5] = (in[2 * stride] - in[13 * stride]) * 4;
385 step1[6] = (in[1 * stride] - in[14 * stride]) * 4;
386 step1[7] = (in[0 * stride] - in[15 * stride]) * 4;
387 } else {
388 // Calculate input for the first 8 results.
389 input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2);
390 input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2);
391 input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2);
392 input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2);
393 input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2);
394 input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2);
395 input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2);
396 input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2);
397 // Calculate input for the next 8 results.
398 step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2);
399 step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2);
400 step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2);
401 step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2);
402 step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2);
403 step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2);
404 step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2);
405 step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2);
406 }
407 // Work on the first eight values; fdct8(input, even_results);
408 {
409 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
410 /*needs32*/ int t0, t1, t2, t3;
411 /*canbe16*/ int x0, x1, x2, x3;
412
413 // stage 1
414 s0 = input[0] + input[7];
415 s1 = input[1] + input[6];
416 s2 = input[2] + input[5];
417 s3 = input[3] + input[4];
418 s4 = input[3] - input[4];
419 s5 = input[2] - input[5];
420 s6 = input[1] - input[6];
421 s7 = input[0] - input[7];
422
423 // fdct4(step, step);
424 x0 = s0 + s3;
425 x1 = s1 + s2;
426 x2 = s1 - s2;
427 x3 = s0 - s3;
428 t0 = (x0 + x1) * cospi_16_64;
429 t1 = (x0 - x1) * cospi_16_64;
430 t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
431 t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
432 out[0] = fdct_round_shift(t0);
433 out[4] = fdct_round_shift(t2);
434 out[8] = fdct_round_shift(t1);
435 out[12] = fdct_round_shift(t3);
436
437 // Stage 2
438 t0 = (s6 - s5) * cospi_16_64;
439 t1 = (s6 + s5) * cospi_16_64;
440 t2 = fdct_round_shift(t0);
441 t3 = fdct_round_shift(t1);
442
443 // Stage 3
444 x0 = s4 + t2;
445 x1 = s4 - t2;
446 x2 = s7 - t3;
447 x3 = s7 + t3;
448
449 // Stage 4
450 t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
451 t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
452 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
453 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
454 out[2] = fdct_round_shift(t0);
455 out[6] = fdct_round_shift(t2);
456 out[10] = fdct_round_shift(t1);
457 out[14] = fdct_round_shift(t3);
458 }
459 // Work on the next eight values; step1 -> odd_results
460 {
461 // step 2
462 temp1 = (step1[5] - step1[2]) * cospi_16_64;
463 temp2 = (step1[4] - step1[3]) * cospi_16_64;
464 step2[2] = fdct_round_shift(temp1);
465 step2[3] = fdct_round_shift(temp2);
466 temp1 = (step1[4] + step1[3]) * cospi_16_64;
467 temp2 = (step1[5] + step1[2]) * cospi_16_64;
468 step2[4] = fdct_round_shift(temp1);
469 step2[5] = fdct_round_shift(temp2);
470 // step 3
471 step3[0] = step1[0] + step2[3];
472 step3[1] = step1[1] + step2[2];
473 step3[2] = step1[1] - step2[2];
474 step3[3] = step1[0] - step2[3];
475 step3[4] = step1[7] - step2[4];
476 step3[5] = step1[6] - step2[5];
477 step3[6] = step1[6] + step2[5];
478 step3[7] = step1[7] + step2[4];
479 // step 4
480 temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
481 temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64;
482 step2[1] = fdct_round_shift(temp1);
483 step2[2] = fdct_round_shift(temp2);
484 temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
485 temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
486 step2[5] = fdct_round_shift(temp1);
487 step2[6] = fdct_round_shift(temp2);
488 // step 5
489 step1[0] = step3[0] + step2[1];
490 step1[1] = step3[0] - step2[1];
491 step1[2] = step3[3] + step2[2];
492 step1[3] = step3[3] - step2[2];
493 step1[4] = step3[4] - step2[5];
494 step1[5] = step3[4] + step2[5];
495 step1[6] = step3[7] - step2[6];
496 step1[7] = step3[7] + step2[6];
497 // step 6
498 temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
499 temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
500 out[1] = fdct_round_shift(temp1);
501 out[9] = fdct_round_shift(temp2);
502 temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
503 temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
504 out[5] = fdct_round_shift(temp1);
505 out[13] = fdct_round_shift(temp2);
506 temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
507 temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
508 out[3] = fdct_round_shift(temp1);
509 out[11] = fdct_round_shift(temp2);
510 temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
511 temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
512 out[7] = fdct_round_shift(temp1);
513 out[15] = fdct_round_shift(temp2);
514 }
515 // Do next column (which is a transposed row in second/horizontal pass)
516 in++;
517 out += 16;
518 }
519 // Setup in/out for next pass.
520 in = intermediate;
521 out = output;
522 }
523 }
524
fadst8(const int16_t * input,int16_t * output)525 static void fadst8(const int16_t *input, int16_t *output) {
526 int s0, s1, s2, s3, s4, s5, s6, s7;
527
528 int x0 = input[7];
529 int x1 = input[0];
530 int x2 = input[5];
531 int x3 = input[2];
532 int x4 = input[3];
533 int x5 = input[4];
534 int x6 = input[1];
535 int x7 = input[6];
536
537 // stage 1
538 s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
539 s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
540 s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
541 s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
542 s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
543 s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
544 s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
545 s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
546
547 x0 = fdct_round_shift(s0 + s4);
548 x1 = fdct_round_shift(s1 + s5);
549 x2 = fdct_round_shift(s2 + s6);
550 x3 = fdct_round_shift(s3 + s7);
551 x4 = fdct_round_shift(s0 - s4);
552 x5 = fdct_round_shift(s1 - s5);
553 x6 = fdct_round_shift(s2 - s6);
554 x7 = fdct_round_shift(s3 - s7);
555
556 // stage 2
557 s0 = x0;
558 s1 = x1;
559 s2 = x2;
560 s3 = x3;
561 s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
562 s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
563 s6 = - cospi_24_64 * x6 + cospi_8_64 * x7;
564 s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
565
566 x0 = s0 + s2;
567 x1 = s1 + s3;
568 x2 = s0 - s2;
569 x3 = s1 - s3;
570 x4 = fdct_round_shift(s4 + s6);
571 x5 = fdct_round_shift(s5 + s7);
572 x6 = fdct_round_shift(s4 - s6);
573 x7 = fdct_round_shift(s5 - s7);
574
575 // stage 3
576 s2 = cospi_16_64 * (x2 + x3);
577 s3 = cospi_16_64 * (x2 - x3);
578 s6 = cospi_16_64 * (x6 + x7);
579 s7 = cospi_16_64 * (x6 - x7);
580
581 x2 = fdct_round_shift(s2);
582 x3 = fdct_round_shift(s3);
583 x6 = fdct_round_shift(s6);
584 x7 = fdct_round_shift(s7);
585
586 output[0] = x0;
587 output[1] = - x4;
588 output[2] = x6;
589 output[3] = - x2;
590 output[4] = x3;
591 output[5] = - x7;
592 output[6] = x5;
593 output[7] = - x1;
594 }
595
596 static const transform_2d FHT_8[] = {
597 { fdct8, fdct8 }, // DCT_DCT = 0
598 { fadst8, fdct8 }, // ADST_DCT = 1
599 { fdct8, fadst8 }, // DCT_ADST = 2
600 { fadst8, fadst8 } // ADST_ADST = 3
601 };
602
vp9_fht8x8_c(const int16_t * input,int16_t * output,int stride,int tx_type)603 void vp9_fht8x8_c(const int16_t *input, int16_t *output,
604 int stride, int tx_type) {
605 if (tx_type == DCT_DCT) {
606 vp9_fdct8x8_c(input, output, stride);
607 } else {
608 int16_t out[64];
609 int16_t *outptr = &out[0];
610 int i, j;
611 int16_t temp_in[8], temp_out[8];
612 const transform_2d ht = FHT_8[tx_type];
613
614 // Columns
615 for (i = 0; i < 8; ++i) {
616 for (j = 0; j < 8; ++j)
617 temp_in[j] = input[j * stride + i] * 4;
618 ht.cols(temp_in, temp_out);
619 for (j = 0; j < 8; ++j)
620 outptr[j * 8 + i] = temp_out[j];
621 }
622
623 // Rows
624 for (i = 0; i < 8; ++i) {
625 for (j = 0; j < 8; ++j)
626 temp_in[j] = out[j + i * 8];
627 ht.rows(temp_in, temp_out);
628 for (j = 0; j < 8; ++j)
629 output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
630 }
631 }
632 }
633
634 /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
635 pixel. */
vp9_fwht4x4_c(const int16_t * input,int16_t * output,int stride)636 void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride) {
637 int i;
638 int a1, b1, c1, d1, e1;
639 const int16_t *ip = input;
640 int16_t *op = output;
641
642 for (i = 0; i < 4; i++) {
643 a1 = ip[0 * stride];
644 b1 = ip[1 * stride];
645 c1 = ip[2 * stride];
646 d1 = ip[3 * stride];
647
648 a1 += b1;
649 d1 = d1 - c1;
650 e1 = (a1 - d1) >> 1;
651 b1 = e1 - b1;
652 c1 = e1 - c1;
653 a1 -= c1;
654 d1 += b1;
655 op[0] = a1;
656 op[4] = c1;
657 op[8] = d1;
658 op[12] = b1;
659
660 ip++;
661 op++;
662 }
663 ip = output;
664 op = output;
665
666 for (i = 0; i < 4; i++) {
667 a1 = ip[0];
668 b1 = ip[1];
669 c1 = ip[2];
670 d1 = ip[3];
671
672 a1 += b1;
673 d1 -= c1;
674 e1 = (a1 - d1) >> 1;
675 b1 = e1 - b1;
676 c1 = e1 - c1;
677 a1 -= c1;
678 d1 += b1;
679 op[0] = a1 * UNIT_QUANT_FACTOR;
680 op[1] = c1 * UNIT_QUANT_FACTOR;
681 op[2] = d1 * UNIT_QUANT_FACTOR;
682 op[3] = b1 * UNIT_QUANT_FACTOR;
683
684 ip += 4;
685 op += 4;
686 }
687 }
688
689 // Rewrote to use same algorithm as others.
fdct16(const int16_t in[16],int16_t out[16])690 static void fdct16(const int16_t in[16], int16_t out[16]) {
691 /*canbe16*/ int step1[8];
692 /*canbe16*/ int step2[8];
693 /*canbe16*/ int step3[8];
694 /*canbe16*/ int input[8];
695 /*needs32*/ int temp1, temp2;
696
697 // step 1
698 input[0] = in[0] + in[15];
699 input[1] = in[1] + in[14];
700 input[2] = in[2] + in[13];
701 input[3] = in[3] + in[12];
702 input[4] = in[4] + in[11];
703 input[5] = in[5] + in[10];
704 input[6] = in[6] + in[ 9];
705 input[7] = in[7] + in[ 8];
706
707 step1[0] = in[7] - in[ 8];
708 step1[1] = in[6] - in[ 9];
709 step1[2] = in[5] - in[10];
710 step1[3] = in[4] - in[11];
711 step1[4] = in[3] - in[12];
712 step1[5] = in[2] - in[13];
713 step1[6] = in[1] - in[14];
714 step1[7] = in[0] - in[15];
715
716 // fdct8(step, step);
717 {
718 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
719 /*needs32*/ int t0, t1, t2, t3;
720 /*canbe16*/ int x0, x1, x2, x3;
721
722 // stage 1
723 s0 = input[0] + input[7];
724 s1 = input[1] + input[6];
725 s2 = input[2] + input[5];
726 s3 = input[3] + input[4];
727 s4 = input[3] - input[4];
728 s5 = input[2] - input[5];
729 s6 = input[1] - input[6];
730 s7 = input[0] - input[7];
731
732 // fdct4(step, step);
733 x0 = s0 + s3;
734 x1 = s1 + s2;
735 x2 = s1 - s2;
736 x3 = s0 - s3;
737 t0 = (x0 + x1) * cospi_16_64;
738 t1 = (x0 - x1) * cospi_16_64;
739 t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
740 t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
741 out[0] = fdct_round_shift(t0);
742 out[4] = fdct_round_shift(t2);
743 out[8] = fdct_round_shift(t1);
744 out[12] = fdct_round_shift(t3);
745
746 // Stage 2
747 t0 = (s6 - s5) * cospi_16_64;
748 t1 = (s6 + s5) * cospi_16_64;
749 t2 = fdct_round_shift(t0);
750 t3 = fdct_round_shift(t1);
751
752 // Stage 3
753 x0 = s4 + t2;
754 x1 = s4 - t2;
755 x2 = s7 - t3;
756 x3 = s7 + t3;
757
758 // Stage 4
759 t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
760 t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
761 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
762 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
763 out[2] = fdct_round_shift(t0);
764 out[6] = fdct_round_shift(t2);
765 out[10] = fdct_round_shift(t1);
766 out[14] = fdct_round_shift(t3);
767 }
768
769 // step 2
770 temp1 = (step1[5] - step1[2]) * cospi_16_64;
771 temp2 = (step1[4] - step1[3]) * cospi_16_64;
772 step2[2] = fdct_round_shift(temp1);
773 step2[3] = fdct_round_shift(temp2);
774 temp1 = (step1[4] + step1[3]) * cospi_16_64;
775 temp2 = (step1[5] + step1[2]) * cospi_16_64;
776 step2[4] = fdct_round_shift(temp1);
777 step2[5] = fdct_round_shift(temp2);
778
779 // step 3
780 step3[0] = step1[0] + step2[3];
781 step3[1] = step1[1] + step2[2];
782 step3[2] = step1[1] - step2[2];
783 step3[3] = step1[0] - step2[3];
784 step3[4] = step1[7] - step2[4];
785 step3[5] = step1[6] - step2[5];
786 step3[6] = step1[6] + step2[5];
787 step3[7] = step1[7] + step2[4];
788
789 // step 4
790 temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
791 temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64;
792 step2[1] = fdct_round_shift(temp1);
793 step2[2] = fdct_round_shift(temp2);
794 temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
795 temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
796 step2[5] = fdct_round_shift(temp1);
797 step2[6] = fdct_round_shift(temp2);
798
799 // step 5
800 step1[0] = step3[0] + step2[1];
801 step1[1] = step3[0] - step2[1];
802 step1[2] = step3[3] + step2[2];
803 step1[3] = step3[3] - step2[2];
804 step1[4] = step3[4] - step2[5];
805 step1[5] = step3[4] + step2[5];
806 step1[6] = step3[7] - step2[6];
807 step1[7] = step3[7] + step2[6];
808
809 // step 6
810 temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
811 temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
812 out[1] = fdct_round_shift(temp1);
813 out[9] = fdct_round_shift(temp2);
814
815 temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
816 temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
817 out[5] = fdct_round_shift(temp1);
818 out[13] = fdct_round_shift(temp2);
819
820 temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
821 temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
822 out[3] = fdct_round_shift(temp1);
823 out[11] = fdct_round_shift(temp2);
824
825 temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
826 temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
827 out[7] = fdct_round_shift(temp1);
828 out[15] = fdct_round_shift(temp2);
829 }
830
fadst16(const int16_t * input,int16_t * output)831 static void fadst16(const int16_t *input, int16_t *output) {
832 int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
833
834 int x0 = input[15];
835 int x1 = input[0];
836 int x2 = input[13];
837 int x3 = input[2];
838 int x4 = input[11];
839 int x5 = input[4];
840 int x6 = input[9];
841 int x7 = input[6];
842 int x8 = input[7];
843 int x9 = input[8];
844 int x10 = input[5];
845 int x11 = input[10];
846 int x12 = input[3];
847 int x13 = input[12];
848 int x14 = input[1];
849 int x15 = input[14];
850
851 // stage 1
852 s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
853 s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
854 s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
855 s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
856 s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
857 s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
858 s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
859 s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
860 s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
861 s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
862 s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
863 s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
864 s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
865 s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
866 s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
867 s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
868
869 x0 = fdct_round_shift(s0 + s8);
870 x1 = fdct_round_shift(s1 + s9);
871 x2 = fdct_round_shift(s2 + s10);
872 x3 = fdct_round_shift(s3 + s11);
873 x4 = fdct_round_shift(s4 + s12);
874 x5 = fdct_round_shift(s5 + s13);
875 x6 = fdct_round_shift(s6 + s14);
876 x7 = fdct_round_shift(s7 + s15);
877 x8 = fdct_round_shift(s0 - s8);
878 x9 = fdct_round_shift(s1 - s9);
879 x10 = fdct_round_shift(s2 - s10);
880 x11 = fdct_round_shift(s3 - s11);
881 x12 = fdct_round_shift(s4 - s12);
882 x13 = fdct_round_shift(s5 - s13);
883 x14 = fdct_round_shift(s6 - s14);
884 x15 = fdct_round_shift(s7 - s15);
885
886 // stage 2
887 s0 = x0;
888 s1 = x1;
889 s2 = x2;
890 s3 = x3;
891 s4 = x4;
892 s5 = x5;
893 s6 = x6;
894 s7 = x7;
895 s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
896 s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
897 s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
898 s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
899 s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
900 s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
901 s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
902 s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
903
904 x0 = s0 + s4;
905 x1 = s1 + s5;
906 x2 = s2 + s6;
907 x3 = s3 + s7;
908 x4 = s0 - s4;
909 x5 = s1 - s5;
910 x6 = s2 - s6;
911 x7 = s3 - s7;
912 x8 = fdct_round_shift(s8 + s12);
913 x9 = fdct_round_shift(s9 + s13);
914 x10 = fdct_round_shift(s10 + s14);
915 x11 = fdct_round_shift(s11 + s15);
916 x12 = fdct_round_shift(s8 - s12);
917 x13 = fdct_round_shift(s9 - s13);
918 x14 = fdct_round_shift(s10 - s14);
919 x15 = fdct_round_shift(s11 - s15);
920
921 // stage 3
922 s0 = x0;
923 s1 = x1;
924 s2 = x2;
925 s3 = x3;
926 s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
927 s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
928 s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
929 s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
930 s8 = x8;
931 s9 = x9;
932 s10 = x10;
933 s11 = x11;
934 s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
935 s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
936 s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
937 s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
938
939 x0 = s0 + s2;
940 x1 = s1 + s3;
941 x2 = s0 - s2;
942 x3 = s1 - s3;
943 x4 = fdct_round_shift(s4 + s6);
944 x5 = fdct_round_shift(s5 + s7);
945 x6 = fdct_round_shift(s4 - s6);
946 x7 = fdct_round_shift(s5 - s7);
947 x8 = s8 + s10;
948 x9 = s9 + s11;
949 x10 = s8 - s10;
950 x11 = s9 - s11;
951 x12 = fdct_round_shift(s12 + s14);
952 x13 = fdct_round_shift(s13 + s15);
953 x14 = fdct_round_shift(s12 - s14);
954 x15 = fdct_round_shift(s13 - s15);
955
956 // stage 4
957 s2 = (- cospi_16_64) * (x2 + x3);
958 s3 = cospi_16_64 * (x2 - x3);
959 s6 = cospi_16_64 * (x6 + x7);
960 s7 = cospi_16_64 * (- x6 + x7);
961 s10 = cospi_16_64 * (x10 + x11);
962 s11 = cospi_16_64 * (- x10 + x11);
963 s14 = (- cospi_16_64) * (x14 + x15);
964 s15 = cospi_16_64 * (x14 - x15);
965
966 x2 = fdct_round_shift(s2);
967 x3 = fdct_round_shift(s3);
968 x6 = fdct_round_shift(s6);
969 x7 = fdct_round_shift(s7);
970 x10 = fdct_round_shift(s10);
971 x11 = fdct_round_shift(s11);
972 x14 = fdct_round_shift(s14);
973 x15 = fdct_round_shift(s15);
974
975 output[0] = x0;
976 output[1] = - x8;
977 output[2] = x12;
978 output[3] = - x4;
979 output[4] = x6;
980 output[5] = x14;
981 output[6] = x10;
982 output[7] = x2;
983 output[8] = x3;
984 output[9] = x11;
985 output[10] = x15;
986 output[11] = x7;
987 output[12] = x5;
988 output[13] = - x13;
989 output[14] = x9;
990 output[15] = - x1;
991 }
992
993 static const transform_2d FHT_16[] = {
994 { fdct16, fdct16 }, // DCT_DCT = 0
995 { fadst16, fdct16 }, // ADST_DCT = 1
996 { fdct16, fadst16 }, // DCT_ADST = 2
997 { fadst16, fadst16 } // ADST_ADST = 3
998 };
999
vp9_fht16x16_c(const int16_t * input,int16_t * output,int stride,int tx_type)1000 void vp9_fht16x16_c(const int16_t *input, int16_t *output,
1001 int stride, int tx_type) {
1002 if (tx_type == DCT_DCT) {
1003 vp9_fdct16x16_c(input, output, stride);
1004 } else {
1005 int16_t out[256];
1006 int16_t *outptr = &out[0];
1007 int i, j;
1008 int16_t temp_in[16], temp_out[16];
1009 const transform_2d ht = FHT_16[tx_type];
1010
1011 // Columns
1012 for (i = 0; i < 16; ++i) {
1013 for (j = 0; j < 16; ++j)
1014 temp_in[j] = input[j * stride + i] * 4;
1015 ht.cols(temp_in, temp_out);
1016 for (j = 0; j < 16; ++j)
1017 outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
1018 }
1019
1020 // Rows
1021 for (i = 0; i < 16; ++i) {
1022 for (j = 0; j < 16; ++j)
1023 temp_in[j] = out[j + i * 16];
1024 ht.rows(temp_in, temp_out);
1025 for (j = 0; j < 16; ++j)
1026 output[j + i * 16] = temp_out[j];
1027 }
1028 }
1029 }
1030
dct_32_round(int input)1031 static INLINE int dct_32_round(int input) {
1032 int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
1033 assert(-131072 <= rv && rv <= 131071);
1034 return rv;
1035 }
1036
half_round_shift(int input)1037 static INLINE int half_round_shift(int input) {
1038 int rv = (input + 1 + (input < 0)) >> 2;
1039 return rv;
1040 }
1041
fdct32(const int * input,int * output,int round)1042 static void fdct32(const int *input, int *output, int round) {
1043 int step[32];
1044 // Stage 1
1045 step[0] = input[0] + input[(32 - 1)];
1046 step[1] = input[1] + input[(32 - 2)];
1047 step[2] = input[2] + input[(32 - 3)];
1048 step[3] = input[3] + input[(32 - 4)];
1049 step[4] = input[4] + input[(32 - 5)];
1050 step[5] = input[5] + input[(32 - 6)];
1051 step[6] = input[6] + input[(32 - 7)];
1052 step[7] = input[7] + input[(32 - 8)];
1053 step[8] = input[8] + input[(32 - 9)];
1054 step[9] = input[9] + input[(32 - 10)];
1055 step[10] = input[10] + input[(32 - 11)];
1056 step[11] = input[11] + input[(32 - 12)];
1057 step[12] = input[12] + input[(32 - 13)];
1058 step[13] = input[13] + input[(32 - 14)];
1059 step[14] = input[14] + input[(32 - 15)];
1060 step[15] = input[15] + input[(32 - 16)];
1061 step[16] = -input[16] + input[(32 - 17)];
1062 step[17] = -input[17] + input[(32 - 18)];
1063 step[18] = -input[18] + input[(32 - 19)];
1064 step[19] = -input[19] + input[(32 - 20)];
1065 step[20] = -input[20] + input[(32 - 21)];
1066 step[21] = -input[21] + input[(32 - 22)];
1067 step[22] = -input[22] + input[(32 - 23)];
1068 step[23] = -input[23] + input[(32 - 24)];
1069 step[24] = -input[24] + input[(32 - 25)];
1070 step[25] = -input[25] + input[(32 - 26)];
1071 step[26] = -input[26] + input[(32 - 27)];
1072 step[27] = -input[27] + input[(32 - 28)];
1073 step[28] = -input[28] + input[(32 - 29)];
1074 step[29] = -input[29] + input[(32 - 30)];
1075 step[30] = -input[30] + input[(32 - 31)];
1076 step[31] = -input[31] + input[(32 - 32)];
1077
1078 // Stage 2
1079 output[0] = step[0] + step[16 - 1];
1080 output[1] = step[1] + step[16 - 2];
1081 output[2] = step[2] + step[16 - 3];
1082 output[3] = step[3] + step[16 - 4];
1083 output[4] = step[4] + step[16 - 5];
1084 output[5] = step[5] + step[16 - 6];
1085 output[6] = step[6] + step[16 - 7];
1086 output[7] = step[7] + step[16 - 8];
1087 output[8] = -step[8] + step[16 - 9];
1088 output[9] = -step[9] + step[16 - 10];
1089 output[10] = -step[10] + step[16 - 11];
1090 output[11] = -step[11] + step[16 - 12];
1091 output[12] = -step[12] + step[16 - 13];
1092 output[13] = -step[13] + step[16 - 14];
1093 output[14] = -step[14] + step[16 - 15];
1094 output[15] = -step[15] + step[16 - 16];
1095
1096 output[16] = step[16];
1097 output[17] = step[17];
1098 output[18] = step[18];
1099 output[19] = step[19];
1100
1101 output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64);
1102 output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64);
1103 output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64);
1104 output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64);
1105
1106 output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64);
1107 output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64);
1108 output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64);
1109 output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64);
1110
1111 output[28] = step[28];
1112 output[29] = step[29];
1113 output[30] = step[30];
1114 output[31] = step[31];
1115
1116 // dump the magnitude by 4, hence the intermediate values are within
1117 // the range of 16 bits.
1118 if (round) {
1119 output[0] = half_round_shift(output[0]);
1120 output[1] = half_round_shift(output[1]);
1121 output[2] = half_round_shift(output[2]);
1122 output[3] = half_round_shift(output[3]);
1123 output[4] = half_round_shift(output[4]);
1124 output[5] = half_round_shift(output[5]);
1125 output[6] = half_round_shift(output[6]);
1126 output[7] = half_round_shift(output[7]);
1127 output[8] = half_round_shift(output[8]);
1128 output[9] = half_round_shift(output[9]);
1129 output[10] = half_round_shift(output[10]);
1130 output[11] = half_round_shift(output[11]);
1131 output[12] = half_round_shift(output[12]);
1132 output[13] = half_round_shift(output[13]);
1133 output[14] = half_round_shift(output[14]);
1134 output[15] = half_round_shift(output[15]);
1135
1136 output[16] = half_round_shift(output[16]);
1137 output[17] = half_round_shift(output[17]);
1138 output[18] = half_round_shift(output[18]);
1139 output[19] = half_round_shift(output[19]);
1140 output[20] = half_round_shift(output[20]);
1141 output[21] = half_round_shift(output[21]);
1142 output[22] = half_round_shift(output[22]);
1143 output[23] = half_round_shift(output[23]);
1144 output[24] = half_round_shift(output[24]);
1145 output[25] = half_round_shift(output[25]);
1146 output[26] = half_round_shift(output[26]);
1147 output[27] = half_round_shift(output[27]);
1148 output[28] = half_round_shift(output[28]);
1149 output[29] = half_round_shift(output[29]);
1150 output[30] = half_round_shift(output[30]);
1151 output[31] = half_round_shift(output[31]);
1152 }
1153
1154 // Stage 3
1155 step[0] = output[0] + output[(8 - 1)];
1156 step[1] = output[1] + output[(8 - 2)];
1157 step[2] = output[2] + output[(8 - 3)];
1158 step[3] = output[3] + output[(8 - 4)];
1159 step[4] = -output[4] + output[(8 - 5)];
1160 step[5] = -output[5] + output[(8 - 6)];
1161 step[6] = -output[6] + output[(8 - 7)];
1162 step[7] = -output[7] + output[(8 - 8)];
1163 step[8] = output[8];
1164 step[9] = output[9];
1165 step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64);
1166 step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64);
1167 step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64);
1168 step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64);
1169 step[14] = output[14];
1170 step[15] = output[15];
1171
1172 step[16] = output[16] + output[23];
1173 step[17] = output[17] + output[22];
1174 step[18] = output[18] + output[21];
1175 step[19] = output[19] + output[20];
1176 step[20] = -output[20] + output[19];
1177 step[21] = -output[21] + output[18];
1178 step[22] = -output[22] + output[17];
1179 step[23] = -output[23] + output[16];
1180 step[24] = -output[24] + output[31];
1181 step[25] = -output[25] + output[30];
1182 step[26] = -output[26] + output[29];
1183 step[27] = -output[27] + output[28];
1184 step[28] = output[28] + output[27];
1185 step[29] = output[29] + output[26];
1186 step[30] = output[30] + output[25];
1187 step[31] = output[31] + output[24];
1188
1189 // Stage 4
1190 output[0] = step[0] + step[3];
1191 output[1] = step[1] + step[2];
1192 output[2] = -step[2] + step[1];
1193 output[3] = -step[3] + step[0];
1194 output[4] = step[4];
1195 output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64);
1196 output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64);
1197 output[7] = step[7];
1198 output[8] = step[8] + step[11];
1199 output[9] = step[9] + step[10];
1200 output[10] = -step[10] + step[9];
1201 output[11] = -step[11] + step[8];
1202 output[12] = -step[12] + step[15];
1203 output[13] = -step[13] + step[14];
1204 output[14] = step[14] + step[13];
1205 output[15] = step[15] + step[12];
1206
1207 output[16] = step[16];
1208 output[17] = step[17];
1209 output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64);
1210 output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64);
1211 output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64);
1212 output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64);
1213 output[22] = step[22];
1214 output[23] = step[23];
1215 output[24] = step[24];
1216 output[25] = step[25];
1217 output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64);
1218 output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64);
1219 output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64);
1220 output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64);
1221 output[30] = step[30];
1222 output[31] = step[31];
1223
1224 // Stage 5
1225 step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64);
1226 step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64);
1227 step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64);
1228 step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64);
1229 step[4] = output[4] + output[5];
1230 step[5] = -output[5] + output[4];
1231 step[6] = -output[6] + output[7];
1232 step[7] = output[7] + output[6];
1233 step[8] = output[8];
1234 step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64);
1235 step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64);
1236 step[11] = output[11];
1237 step[12] = output[12];
1238 step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64);
1239 step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64);
1240 step[15] = output[15];
1241
1242 step[16] = output[16] + output[19];
1243 step[17] = output[17] + output[18];
1244 step[18] = -output[18] + output[17];
1245 step[19] = -output[19] + output[16];
1246 step[20] = -output[20] + output[23];
1247 step[21] = -output[21] + output[22];
1248 step[22] = output[22] + output[21];
1249 step[23] = output[23] + output[20];
1250 step[24] = output[24] + output[27];
1251 step[25] = output[25] + output[26];
1252 step[26] = -output[26] + output[25];
1253 step[27] = -output[27] + output[24];
1254 step[28] = -output[28] + output[31];
1255 step[29] = -output[29] + output[30];
1256 step[30] = output[30] + output[29];
1257 step[31] = output[31] + output[28];
1258
1259 // Stage 6
1260 output[0] = step[0];
1261 output[1] = step[1];
1262 output[2] = step[2];
1263 output[3] = step[3];
1264 output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64);
1265 output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64);
1266 output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64);
1267 output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64);
1268 output[8] = step[8] + step[9];
1269 output[9] = -step[9] + step[8];
1270 output[10] = -step[10] + step[11];
1271 output[11] = step[11] + step[10];
1272 output[12] = step[12] + step[13];
1273 output[13] = -step[13] + step[12];
1274 output[14] = -step[14] + step[15];
1275 output[15] = step[15] + step[14];
1276
1277 output[16] = step[16];
1278 output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64);
1279 output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64);
1280 output[19] = step[19];
1281 output[20] = step[20];
1282 output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64);
1283 output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64);
1284 output[23] = step[23];
1285 output[24] = step[24];
1286 output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64);
1287 output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64);
1288 output[27] = step[27];
1289 output[28] = step[28];
1290 output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64);
1291 output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64);
1292 output[31] = step[31];
1293
1294 // Stage 7
1295 step[0] = output[0];
1296 step[1] = output[1];
1297 step[2] = output[2];
1298 step[3] = output[3];
1299 step[4] = output[4];
1300 step[5] = output[5];
1301 step[6] = output[6];
1302 step[7] = output[7];
1303 step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64);
1304 step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64);
1305 step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64);
1306 step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64);
1307 step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64);
1308 step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64);
1309 step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64);
1310 step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64);
1311
1312 step[16] = output[16] + output[17];
1313 step[17] = -output[17] + output[16];
1314 step[18] = -output[18] + output[19];
1315 step[19] = output[19] + output[18];
1316 step[20] = output[20] + output[21];
1317 step[21] = -output[21] + output[20];
1318 step[22] = -output[22] + output[23];
1319 step[23] = output[23] + output[22];
1320 step[24] = output[24] + output[25];
1321 step[25] = -output[25] + output[24];
1322 step[26] = -output[26] + output[27];
1323 step[27] = output[27] + output[26];
1324 step[28] = output[28] + output[29];
1325 step[29] = -output[29] + output[28];
1326 step[30] = -output[30] + output[31];
1327 step[31] = output[31] + output[30];
1328
1329 // Final stage --- outputs indices are bit-reversed.
1330 output[0] = step[0];
1331 output[16] = step[1];
1332 output[8] = step[2];
1333 output[24] = step[3];
1334 output[4] = step[4];
1335 output[20] = step[5];
1336 output[12] = step[6];
1337 output[28] = step[7];
1338 output[2] = step[8];
1339 output[18] = step[9];
1340 output[10] = step[10];
1341 output[26] = step[11];
1342 output[6] = step[12];
1343 output[22] = step[13];
1344 output[14] = step[14];
1345 output[30] = step[15];
1346
1347 output[1] = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64);
1348 output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64);
1349 output[9] = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64);
1350 output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64);
1351 output[5] = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64);
1352 output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64);
1353 output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64);
1354 output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64);
1355 output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64);
1356 output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64);
1357 output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64);
1358 output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64);
1359 output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64);
1360 output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64);
1361 output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64);
1362 output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
1363 }
1364
vp9_fdct32x32_1_c(const int16_t * input,int16_t * output,int stride)1365 void vp9_fdct32x32_1_c(const int16_t *input, int16_t *output, int stride) {
1366 int r, c;
1367 int16_t sum = 0;
1368 for (r = 0; r < 32; ++r)
1369 for (c = 0; c < 32; ++c)
1370 sum += input[r * stride + c];
1371
1372 output[0] = sum >> 3;
1373 output[1] = 0;
1374 }
1375
vp9_fdct32x32_c(const int16_t * input,int16_t * out,int stride)1376 void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) {
1377 int i, j;
1378 int output[32 * 32];
1379
1380 // Columns
1381 for (i = 0; i < 32; ++i) {
1382 int temp_in[32], temp_out[32];
1383 for (j = 0; j < 32; ++j)
1384 temp_in[j] = input[j * stride + i] * 4;
1385 fdct32(temp_in, temp_out, 0);
1386 for (j = 0; j < 32; ++j)
1387 output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
1388 }
1389
1390 // Rows
1391 for (i = 0; i < 32; ++i) {
1392 int temp_in[32], temp_out[32];
1393 for (j = 0; j < 32; ++j)
1394 temp_in[j] = output[j + i * 32];
1395 fdct32(temp_in, temp_out, 0);
1396 for (j = 0; j < 32; ++j)
1397 out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
1398 }
1399 }
1400
1401 // Note that although we use dct_32_round in dct32 computation flow,
1402 // this 2d fdct32x32 for rate-distortion optimization loop is operating
1403 // within 16 bits precision.
vp9_fdct32x32_rd_c(const int16_t * input,int16_t * out,int stride)1404 void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) {
1405 int i, j;
1406 int output[32 * 32];
1407
1408 // Columns
1409 for (i = 0; i < 32; ++i) {
1410 int temp_in[32], temp_out[32];
1411 for (j = 0; j < 32; ++j)
1412 temp_in[j] = input[j * stride + i] * 4;
1413 fdct32(temp_in, temp_out, 0);
1414 for (j = 0; j < 32; ++j)
1415 // TODO(cd): see quality impact of only doing
1416 // output[j * 32 + i] = (temp_out[j] + 1) >> 2;
1417 // PS: also change code in vp9/encoder/x86/vp9_dct_sse2.c
1418 output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
1419 }
1420
1421 // Rows
1422 for (i = 0; i < 32; ++i) {
1423 int temp_in[32], temp_out[32];
1424 for (j = 0; j < 32; ++j)
1425 temp_in[j] = output[j + i * 32];
1426 fdct32(temp_in, temp_out, 1);
1427 for (j = 0; j < 32; ++j)
1428 out[j + i * 32] = temp_out[j];
1429 }
1430 }
1431