1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <assert.h>
12 #include <math.h>
13
14 #include "./vpx_config.h"
15 #include "./vp9_rtcd.h"
16 #include "vp9/common/vp9_systemdependent.h"
17 #include "vp9/common/vp9_blockd.h"
18 #include "vp9/common/vp9_common.h"
19 #include "vp9/common/vp9_idct.h"
20
void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
     0.5 shifts per pixel. */
  int16_t buf[16];
  int r, c;
  int a1, b1, c1, d1, e1;

  // Pass 1: horizontal inverse WHT on each row of the downshifted input.
  for (r = 0; r < 4; ++r) {
    a1 = input[4 * r + 0] >> UNIT_QUANT_SHIFT;
    c1 = input[4 * r + 1] >> UNIT_QUANT_SHIFT;
    d1 = input[4 * r + 2] >> UNIT_QUANT_SHIFT;
    b1 = input[4 * r + 3] >> UNIT_QUANT_SHIFT;
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    buf[4 * r + 0] = a1;
    buf[4 * r + 1] = b1;
    buf[4 * r + 2] = c1;
    buf[4 * r + 3] = d1;
  }

  // Pass 2: vertical inverse WHT on each column, accumulating into dest
  // with clamping to the valid pixel range.
  for (c = 0; c < 4; ++c) {
    a1 = buf[4 * 0 + c];
    c1 = buf[4 * 1 + c];
    d1 = buf[4 * 2 + c];
    b1 = buf[4 * 3 + c];
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    dest[stride * 0 + c] = clip_pixel(dest[stride * 0 + c] + a1);
    dest[stride * 1 + c] = clip_pixel(dest[stride * 1 + c] + b1);
    dest[stride * 2 + c] = clip_pixel(dest[stride * 2 + c] + c1);
    dest[stride * 3 + c] = clip_pixel(dest[stride * 3 + c] + d1);
  }
}
72
void vp9_iwht4x4_1_add_c(const int16_t *in, uint8_t *dest, int dest_stride) {
  // DC-only inverse WHT: with a single non-zero coefficient the row pass
  // reduces to splitting the downshifted DC value between column 0 and
  // the remaining three columns.
  int c;
  int a1, e1;
  int16_t row[4];

  a1 = in[0] >> UNIT_QUANT_SHIFT;
  e1 = a1 >> 1;
  a1 -= e1;
  row[0] = a1;
  row[1] = row[2] = row[3] = e1;

  // Column pass: split each row-pass value the same way and add the
  // results into the 4x4 destination block.
  for (c = 0; c < 4; ++c) {
    e1 = row[c] >> 1;
    a1 = row[c] - e1;
    dest[dest_stride * 0 + c] = clip_pixel(dest[dest_stride * 0 + c] + a1);
    dest[dest_stride * 1 + c] = clip_pixel(dest[dest_stride * 1 + c] + e1);
    dest[dest_stride * 2 + c] = clip_pixel(dest[dest_stride * 2 + c] + e1);
    dest[dest_stride * 3 + c] = clip_pixel(dest[dest_stride * 3 + c] + e1);
  }
}
98
// 4-point 1-D inverse DCT: two butterfly stages over 4 coefficients.
static void idct4(const int16_t *input, int16_t *output) {
  int16_t s[4];
  int t0, t1;

  // stage 1: even part (inputs 0, 2) and odd part (inputs 1, 3)
  t0 = (input[0] + input[2]) * cospi_16_64;
  t1 = (input[0] - input[2]) * cospi_16_64;
  s[0] = dct_const_round_shift(t0);
  s[1] = dct_const_round_shift(t1);
  t0 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
  t1 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
  s[2] = dct_const_round_shift(t0);
  s[3] = dct_const_round_shift(t1);

  // stage 2: recombine even/odd halves into the output samples
  output[0] = s[0] + s[3];
  output[1] = s[1] + s[2];
  output[2] = s[1] - s[2];
  output[3] = s[0] - s[3];
}
118
// Full 4x4 inverse DCT: row pass into a scratch buffer, column pass with
// final rounding (by 1 << 4) and clamped add into the prediction block.
void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int16_t buf[4 * 4];
  int16_t col_in[4], col_out[4];
  int r, c;

  // Row transforms.
  for (r = 0; r < 4; ++r)
    idct4(input + 4 * r, buf + 4 * r);

  // Column transforms, then round, shift and add to dest.
  for (c = 0; c < 4; ++c) {
    for (r = 0; r < 4; ++r)
      col_in[r] = buf[4 * r + c];
    idct4(col_in, col_out);
    for (r = 0; r < 4; ++r)
      dest[r * stride + c] = clip_pixel(ROUND_POWER_OF_TWO(col_out[r], 4)
                                        + dest[r * stride + c]);
  }
}
142
// DC-only 4x4 inverse DCT: both 1-D passes collapse to a scale by
// cospi_16_64, so the whole transform is a single clamped DC add.
void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride) {
  int r, c;
  int a1;
  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
  out = dct_const_round_shift(out * cospi_16_64);
  a1 = ROUND_POWER_OF_TWO(out, 4);

  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c)
      dest[c] = clip_pixel(dest[c] + a1);
    dest += dest_stride;
  }
}
158
// 8-point 1-D inverse DCT. The even half (inputs 0, 2, 4, 6) is handled by
// reusing idct4; the odd half (inputs 1, 3, 5, 7) goes through its own
// butterfly stages before the final recombination.
static void idct8(const int16_t *input, int16_t *output) {
  int16_t step1[8], step2[8];
  int temp1, temp2;
  // stage 1: gather even inputs into step1[0..3] (reordered for idct4)
  // and rotate the odd inputs into step1[4..7].
  step1[0] = input[0];
  step1[2] = input[4];
  step1[1] = input[2];
  step1[3] = input[6];
  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
  step1[4] = dct_const_round_shift(temp1);
  step1[7] = dct_const_round_shift(temp2);
  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
  step1[5] = dct_const_round_shift(temp1);
  step1[6] = dct_const_round_shift(temp2);

  // stage 2 & stage 3 - even half
  // NOTE: idct4 is called with input == output; it buffers into a local
  // step[] array first, so this in-place aliasing is safe.
  idct4(step1, step1);

  // stage 2 - odd half
  step2[4] = step1[4] + step1[5];
  step2[5] = step1[4] - step1[5];
  step2[6] = -step1[6] + step1[7];
  step2[7] = step1[6] + step1[7];

  // stage 3 -odd half
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = dct_const_round_shift(temp1);
  step1[6] = dct_const_round_shift(temp2);
  step1[7] = step2[7];

  // stage 4: combine even half (step1[0..3]) with odd half (step1[4..7])
  output[0] = step1[0] + step1[7];
  output[1] = step1[1] + step1[6];
  output[2] = step1[2] + step1[5];
  output[3] = step1[3] + step1[4];
  output[4] = step1[3] - step1[4];
  output[5] = step1[2] - step1[5];
  output[6] = step1[1] - step1[6];
  output[7] = step1[0] - step1[7];
}
203
// Full 8x8 inverse DCT: row pass into a scratch buffer, column pass with
// final rounding (by 1 << 5) and clamped add into the prediction block.
void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int16_t buf[8 * 8];
  int16_t col_in[8], col_out[8];
  int r, c;

  // Row transforms.
  for (r = 0; r < 8; ++r)
    idct8(input + 8 * r, buf + 8 * r);

  // Column transforms, then round, shift and add to dest.
  for (c = 0; c < 8; ++c) {
    for (r = 0; r < 8; ++r)
      col_in[r] = buf[8 * r + c];
    idct8(col_in, col_out);
    for (r = 0; r < 8; ++r)
      dest[r * stride + c] = clip_pixel(ROUND_POWER_OF_TWO(col_out[r], 5)
                                        + dest[r * stride + c]);
  }
}
227
// DC-only 8x8 inverse DCT: a single clamped DC add over the 8x8 block.
void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int r, c;
  int a1;
  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
  out = dct_const_round_shift(out * cospi_16_64);
  a1 = ROUND_POWER_OF_TWO(out, 5);
  for (r = 0; r < 8; ++r) {
    uint8_t *row = dest + r * stride;
    for (c = 0; c < 8; ++c)
      row[c] = clip_pixel(row[c] + a1);
  }
}
240
// 4-point 1-D inverse ADST (asymmetric DST), used for transform types that
// pair ADST with DCT. All-zero input short-circuits to all-zero output.
static void iadst4(const int16_t *input, int16_t *output) {
  int s0, s1, s2, s3, s4, s5, s6, s7;

  int x0 = input[0];
  int x1 = input[1];
  int x2 = input[2];
  int x3 = input[3];

  // Fast path: nothing to do when every coefficient is zero.
  if (!(x0 | x1 | x2 | x3)) {
    output[0] = output[1] = output[2] = output[3] = 0;
    return;
  }

  s0 = sinpi_1_9 * x0;
  s1 = sinpi_2_9 * x0;
  s2 = sinpi_3_9 * x1;
  s3 = sinpi_4_9 * x2;
  s4 = sinpi_1_9 * x2;
  s5 = sinpi_2_9 * x3;
  s6 = sinpi_4_9 * x3;
  s7 = x0 - x2 + x3;

  x0 = s0 + s3 + s5;
  x1 = s1 - s4 - s6;
  x2 = sinpi_3_9 * s7;
  x3 = s2;

  s0 = x0 + x3;
  s1 = x1 + x3;
  s2 = x2;
  s3 = x0 + x1 - x3;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = dct_const_round_shift(s0);
  output[1] = dct_const_round_shift(s1);
  output[2] = dct_const_round_shift(s2);
  output[3] = dct_const_round_shift(s3);
}
282
vp9_iht4x4_16_add_c(const int16_t * input,uint8_t * dest,int stride,int tx_type)283 void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride,
284 int tx_type) {
285 const transform_2d IHT_4[] = {
286 { idct4, idct4 }, // DCT_DCT = 0
287 { iadst4, idct4 }, // ADST_DCT = 1
288 { idct4, iadst4 }, // DCT_ADST = 2
289 { iadst4, iadst4 } // ADST_ADST = 3
290 };
291
292 int i, j;
293 int16_t out[4 * 4];
294 int16_t *outptr = out;
295 int16_t temp_in[4], temp_out[4];
296
297 // inverse transform row vectors
298 for (i = 0; i < 4; ++i) {
299 IHT_4[tx_type].rows(input, outptr);
300 input += 4;
301 outptr += 4;
302 }
303
304 // inverse transform column vectors
305 for (i = 0; i < 4; ++i) {
306 for (j = 0; j < 4; ++j)
307 temp_in[j] = out[j * 4 + i];
308 IHT_4[tx_type].cols(temp_in, temp_out);
309 for (j = 0; j < 4; ++j)
310 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
311 + dest[j * stride + i]);
312 }
313 }
// 8-point 1-D inverse ADST. Inputs are consumed in the permuted order
// below; all-zero input short-circuits to all-zero output.
static void iadst8(const int16_t *input, int16_t *output) {
  int s0, s1, s2, s3, s4, s5, s6, s7;

  int x0 = input[7];
  int x1 = input[0];
  int x2 = input[5];
  int x3 = input[2];
  int x4 = input[3];
  int x5 = input[4];
  int x6 = input[1];
  int x7 = input[6];

  // Fast path: nothing to do when every coefficient is zero.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    output[0] = output[1] = output[2] = output[3] = output[4]
              = output[5] = output[6] = output[7] = 0;
    return;
  }

  // stage 1
  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;

  x0 = dct_const_round_shift(s0 + s4);
  x1 = dct_const_round_shift(s1 + s5);
  x2 = dct_const_round_shift(s2 + s6);
  x3 = dct_const_round_shift(s3 + s7);
  x4 = dct_const_round_shift(s0 - s4);
  x5 = dct_const_round_shift(s1 - s5);
  x6 = dct_const_round_shift(s2 - s6);
  x7 = dct_const_round_shift(s3 - s7);

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = cospi_8_64  * x4 + cospi_24_64 * x5;
  s5 = cospi_24_64 * x4 - cospi_8_64  * x5;
  s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
  s7 = cospi_8_64  * x6 + cospi_24_64 * x7;

  x0 = s0 + s2;
  x1 = s1 + s3;
  x2 = s0 - s2;
  x3 = s1 - s3;
  x4 = dct_const_round_shift(s4 + s6);
  x5 = dct_const_round_shift(s5 + s7);
  x6 = dct_const_round_shift(s4 - s6);
  x7 = dct_const_round_shift(s5 - s7);

  // stage 3
  s2 = cospi_16_64 * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (x6 - x7);

  x2 = dct_const_round_shift(s2);
  x3 = dct_const_round_shift(s3);
  x6 = dct_const_round_shift(s6);
  x7 = dct_const_round_shift(s7);

  // Final output permutation with alternating sign flips.
  output[0] = x0;
  output[1] = -x4;
  output[2] = x6;
  output[3] = -x2;
  output[4] = x3;
  output[5] = -x7;
  output[6] = x5;
  output[7] = -x1;
}
390
// 1-D row/column transform pairs for 8x8 blocks, indexed by tx_type.
static const transform_2d IHT_8[] = {
  { idct8, idct8 },    // DCT_DCT = 0
  { iadst8, idct8 },   // ADST_DCT = 1
  { idct8, iadst8 },   // DCT_ADST = 2
  { iadst8, iadst8 }   // ADST_ADST = 3
};
397
// 8x8 inverse hybrid transform: tx_type selects DCT/ADST per direction.
// Row pass into a scratch buffer, column pass with final rounding
// (by 1 << 5) and clamped add into the prediction block.
void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride,
                         int tx_type) {
  int r, c;
  int16_t buf[8 * 8];
  int16_t col_in[8], col_out[8];
  const transform_2d ht = IHT_8[tx_type];

  // Row transforms.
  for (r = 0; r < 8; ++r)
    ht.rows(input + 8 * r, buf + 8 * r);

  // Column transforms, then round, shift and add to dest.
  for (c = 0; c < 8; ++c) {
    for (r = 0; r < 8; ++r)
      col_in[r] = buf[8 * r + c];
    ht.cols(col_in, col_out);
    for (r = 0; r < 8; ++r)
      dest[r * stride + c] = clip_pixel(ROUND_POWER_OF_TWO(col_out[r], 5)
                                        + dest[r * stride + c]);
  }
}
423
// 8x8 inverse DCT for blocks whose non-zero coefficients all lie in the
// first 4 rows (eob <= 12): only those rows need a row transform; the
// scratch buffer is zero-initialized so the remaining rows contribute 0.
void vp9_idct8x8_12_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int16_t buf[8 * 8] = { 0 };
  int16_t col_in[8], col_out[8];
  int r, c;

  // Row transforms — first 4 rows only.
  for (r = 0; r < 4; ++r)
    idct8(input + 8 * r, buf + 8 * r);

  // Column transforms, then round, shift and add to dest.
  for (c = 0; c < 8; ++c) {
    for (r = 0; r < 8; ++r)
      col_in[r] = buf[8 * r + c];
    idct8(col_in, col_out);
    for (r = 0; r < 8; ++r)
      dest[r * stride + c] = clip_pixel(ROUND_POWER_OF_TWO(col_out[r], 5)
                                        + dest[r * stride + c]);
  }
}
448
// 16-point 1-D inverse DCT: seven butterfly stages. Stage 1 loads the
// inputs in bit-reversed-style order (indices written as N/2 to mirror
// the 32-point table layout this was derived from).
static void idct16(const int16_t *input, int16_t *output) {
  int16_t step1[16], step2[16];
  int temp1, temp2;

  // stage 1
  step1[0] = input[0/2];
  step1[1] = input[16/2];
  step1[2] = input[8/2];
  step1[3] = input[24/2];
  step1[4] = input[4/2];
  step1[5] = input[20/2];
  step1[6] = input[12/2];
  step1[7] = input[28/2];
  step1[8] = input[2/2];
  step1[9] = input[18/2];
  step1[10] = input[10/2];
  step1[11] = input[26/2];
  step1[12] = input[6/2];
  step1[13] = input[22/2];
  step1[14] = input[14/2];
  step1[15] = input[30/2];

  // stage 2: pass through the even half; rotate pairs in the odd half.
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = dct_const_round_shift(temp1);
  step2[15] = dct_const_round_shift(temp2);

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = dct_const_round_shift(temp1);
  step2[14] = dct_const_round_shift(temp2);

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = dct_const_round_shift(temp1);
  step2[13] = dct_const_round_shift(temp2);

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = dct_const_round_shift(temp1);
  step2[12] = dct_const_round_shift(temp2);

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = dct_const_round_shift(temp1);
  step1[7] = dct_const_round_shift(temp2);
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = dct_const_round_shift(temp1);
  step1[6] = dct_const_round_shift(temp2);

  step1[8] = step2[8] + step2[9];
  step1[9] = step2[8] - step2[9];
  step1[10] = -step2[10] + step2[11];
  step1[11] = step2[10] + step2[11];
  step1[12] = step2[12] + step2[13];
  step1[13] = step2[12] - step2[13];
  step1[14] = -step2[14] + step2[15];
  step1[15] = step2[14] + step2[15];

  // stage 4
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = dct_const_round_shift(temp1);
  step2[1] = dct_const_round_shift(temp2);
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = dct_const_round_shift(temp1);
  step2[3] = dct_const_round_shift(temp2);
  step2[4] = step1[4] + step1[5];
  step2[5] = step1[4] - step1[5];
  step2[6] = -step1[6] + step1[7];
  step2[7] = step1[6] + step1[7];

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = dct_const_round_shift(temp1);
  step2[14] = dct_const_round_shift(temp2);
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = dct_const_round_shift(temp1);
  step2[13] = dct_const_round_shift(temp2);
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5
  step1[0] = step2[0] + step2[3];
  step1[1] = step2[1] + step2[2];
  step1[2] = step2[1] - step2[2];
  step1[3] = step2[0] - step2[3];
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = dct_const_round_shift(temp1);
  step1[6] = dct_const_round_shift(temp2);
  step1[7] = step2[7];

  step1[8] = step2[8] + step2[11];
  step1[9] = step2[9] + step2[10];
  step1[10] = step2[9] - step2[10];
  step1[11] = step2[8] - step2[11];
  step1[12] = -step2[12] + step2[15];
  step1[13] = -step2[13] + step2[14];
  step1[14] = step2[13] + step2[14];
  step1[15] = step2[12] + step2[15];

  // stage 6
  step2[0] = step1[0] + step1[7];
  step2[1] = step1[1] + step1[6];
  step2[2] = step1[2] + step1[5];
  step2[3] = step1[3] + step1[4];
  step2[4] = step1[3] - step1[4];
  step2[5] = step1[2] - step1[5];
  step2[6] = step1[1] - step1[6];
  step2[7] = step1[0] - step1[7];
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = dct_const_round_shift(temp1);
  step2[13] = dct_const_round_shift(temp2);
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = dct_const_round_shift(temp1);
  step2[12] = dct_const_round_shift(temp2);
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7: final combination of low and high halves.
  output[0] = step2[0] + step2[15];
  output[1] = step2[1] + step2[14];
  output[2] = step2[2] + step2[13];
  output[3] = step2[3] + step2[12];
  output[4] = step2[4] + step2[11];
  output[5] = step2[5] + step2[10];
  output[6] = step2[6] + step2[9];
  output[7] = step2[7] + step2[8];
  output[8] = step2[7] - step2[8];
  output[9] = step2[6] - step2[9];
  output[10] = step2[5] - step2[10];
  output[11] = step2[4] - step2[11];
  output[12] = step2[3] - step2[12];
  output[13] = step2[2] - step2[13];
  output[14] = step2[1] - step2[14];
  output[15] = step2[0] - step2[15];
}
613
// Full 16x16 inverse DCT: row pass into a scratch buffer, column pass with
// final rounding (by 1 << 6) and clamped add into the prediction block.
void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int16_t buf[16 * 16];
  int16_t col_in[16], col_out[16];
  int r, c;

  // Row transforms.
  for (r = 0; r < 16; ++r)
    idct16(input + 16 * r, buf + 16 * r);

  // Column transforms, then round, shift and add to dest.
  for (c = 0; c < 16; ++c) {
    for (r = 0; r < 16; ++r)
      col_in[r] = buf[16 * r + c];
    idct16(col_in, col_out);
    for (r = 0; r < 16; ++r)
      dest[r * stride + c] = clip_pixel(ROUND_POWER_OF_TWO(col_out[r], 6)
                                        + dest[r * stride + c]);
  }
}
637
// 16-point 1-D inverse ADST. Inputs are consumed in the permuted order
// below; all-zero input short-circuits to all-zero output.
static void iadst16(const int16_t *input, int16_t *output) {
  int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;

  int x0 = input[15];
  int x1 = input[0];
  int x2 = input[13];
  int x3 = input[2];
  int x4 = input[11];
  int x5 = input[4];
  int x6 = input[9];
  int x7 = input[6];
  int x8 = input[7];
  int x9 = input[8];
  int x10 = input[5];
  int x11 = input[10];
  int x12 = input[3];
  int x13 = input[12];
  int x14 = input[1];
  int x15 = input[14];

  // Fast path: nothing to do when every coefficient is zero.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
    output[0] = output[1] = output[2] = output[3] = output[4]
              = output[5] = output[6] = output[7] = output[8]
              = output[9] = output[10] = output[11] = output[12]
              = output[13] = output[14] = output[15] = 0;
    return;
  }

  // stage 1
  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;

  x0 = dct_const_round_shift(s0 + s8);
  x1 = dct_const_round_shift(s1 + s9);
  x2 = dct_const_round_shift(s2 + s10);
  x3 = dct_const_round_shift(s3 + s11);
  x4 = dct_const_round_shift(s4 + s12);
  x5 = dct_const_round_shift(s5 + s13);
  x6 = dct_const_round_shift(s6 + s14);
  x7 = dct_const_round_shift(s7 + s15);
  x8 = dct_const_round_shift(s0 - s8);
  x9 = dct_const_round_shift(s1 - s9);
  x10 = dct_const_round_shift(s2 - s10);
  x11 = dct_const_round_shift(s3 - s11);
  x12 = dct_const_round_shift(s4 - s12);
  x13 = dct_const_round_shift(s5 - s13);
  x14 = dct_const_round_shift(s6 - s14);
  x15 = dct_const_round_shift(s7 - s15);

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;

  x0 = s0 + s4;
  x1 = s1 + s5;
  x2 = s2 + s6;
  x3 = s3 + s7;
  x4 = s0 - s4;
  x5 = s1 - s5;
  x6 = s2 - s6;
  x7 = s3 - s7;
  x8 = dct_const_round_shift(s8 + s12);
  x9 = dct_const_round_shift(s9 + s13);
  x10 = dct_const_round_shift(s10 + s14);
  x11 = dct_const_round_shift(s11 + s15);
  x12 = dct_const_round_shift(s8 - s12);
  x13 = dct_const_round_shift(s9 - s13);
  x14 = dct_const_round_shift(s10 - s14);
  x15 = dct_const_round_shift(s11 - s15);

  // stage 3
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;

  x0 = s0 + s2;
  x1 = s1 + s3;
  x2 = s0 - s2;
  x3 = s1 - s3;
  x4 = dct_const_round_shift(s4 + s6);
  x5 = dct_const_round_shift(s5 + s7);
  x6 = dct_const_round_shift(s4 - s6);
  x7 = dct_const_round_shift(s5 - s7);
  x8 = s8 + s10;
  x9 = s9 + s11;
  x10 = s8 - s10;
  x11 = s9 - s11;
  x12 = dct_const_round_shift(s12 + s14);
  x13 = dct_const_round_shift(s13 + s15);
  x14 = dct_const_round_shift(s12 - s14);
  x15 = dct_const_round_shift(s13 - s15);

  // stage 4
  s2 = (- cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (- x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (- x10 + x11);
  s14 = (- cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);

  x2 = dct_const_round_shift(s2);
  x3 = dct_const_round_shift(s3);
  x6 = dct_const_round_shift(s6);
  x7 = dct_const_round_shift(s7);
  x10 = dct_const_round_shift(s10);
  x11 = dct_const_round_shift(s11);
  x14 = dct_const_round_shift(s14);
  x15 = dct_const_round_shift(s15);

  // Final output permutation with alternating sign flips.
  output[0] = x0;
  output[1] = -x8;
  output[2] = x12;
  output[3] = -x4;
  output[4] = x6;
  output[5] = x14;
  output[6] = x10;
  output[7] = x2;
  output[8] = x3;
  output[9] = x11;
  output[10] = x15;
  output[11] = x7;
  output[12] = x5;
  output[13] = -x13;
  output[14] = x9;
  output[15] = -x1;
}
808
// 1-D row/column transform pairs for 16x16 blocks, indexed by tx_type.
static const transform_2d IHT_16[] = {
  { idct16, idct16 },    // DCT_DCT = 0
  { iadst16, idct16 },   // ADST_DCT = 1
  { idct16, iadst16 },   // DCT_ADST = 2
  { iadst16, iadst16 }   // ADST_ADST = 3
};
815
// 16x16 inverse hybrid transform: tx_type selects DCT/ADST per direction.
// Row pass into a scratch buffer, column pass with final rounding
// (by 1 << 6) and clamped add into the prediction block.
void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride,
                            int tx_type) {
  int r, c;
  int16_t buf[16 * 16];
  int16_t col_in[16], col_out[16];
  const transform_2d ht = IHT_16[tx_type];

  // Row transforms.
  for (r = 0; r < 16; ++r)
    ht.rows(input + 16 * r, buf + 16 * r);

  // Column transforms, then round, shift and add to dest.
  for (c = 0; c < 16; ++c) {
    for (r = 0; r < 16; ++r)
      col_in[r] = buf[16 * r + c];
    ht.cols(col_in, col_out);
    for (r = 0; r < 16; ++r)
      dest[r * stride + c] = clip_pixel(ROUND_POWER_OF_TWO(col_out[r], 6)
                                        + dest[r * stride + c]);
  }
}
841
// 16x16 inverse DCT for blocks whose non-zero coefficients all lie in the
// upper-left 4x4 area (eob <= 10): only the first 4 rows need a row
// transform; the scratch buffer is zero-initialized so the rest read 0.
void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int16_t buf[16 * 16] = { 0 };
  int16_t col_in[16], col_out[16];
  int r, c;

  // Row transforms — first 4 rows only.
  for (r = 0; r < 4; ++r)
    idct16(input + 16 * r, buf + 16 * r);

  // Column transforms, then round, shift and add to dest.
  for (c = 0; c < 16; ++c) {
    for (r = 0; r < 16; ++r)
      col_in[r] = buf[16 * r + c];
    idct16(col_in, col_out);
    for (r = 0; r < 16; ++r)
      dest[r * stride + c] = clip_pixel(ROUND_POWER_OF_TWO(col_out[r], 6)
                                        + dest[r * stride + c]);
  }
}
866
// DC-only 16x16 inverse DCT: a single clamped DC add over the 16x16 block.
void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int r, c;
  int a1;
  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
  out = dct_const_round_shift(out * cospi_16_64);
  a1 = ROUND_POWER_OF_TWO(out, 6);
  for (r = 0; r < 16; ++r) {
    uint8_t *row = dest + r * stride;
    for (c = 0; c < 16; ++c)
      row[c] = clip_pixel(row[c] + a1);
  }
}
879
idct32(const int16_t * input,int16_t * output)880 static void idct32(const int16_t *input, int16_t *output) {
881 int16_t step1[32], step2[32];
882 int temp1, temp2;
883
884 // stage 1
885 step1[0] = input[0];
886 step1[1] = input[16];
887 step1[2] = input[8];
888 step1[3] = input[24];
889 step1[4] = input[4];
890 step1[5] = input[20];
891 step1[6] = input[12];
892 step1[7] = input[28];
893 step1[8] = input[2];
894 step1[9] = input[18];
895 step1[10] = input[10];
896 step1[11] = input[26];
897 step1[12] = input[6];
898 step1[13] = input[22];
899 step1[14] = input[14];
900 step1[15] = input[30];
901
902 temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
903 temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
904 step1[16] = dct_const_round_shift(temp1);
905 step1[31] = dct_const_round_shift(temp2);
906
907 temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
908 temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
909 step1[17] = dct_const_round_shift(temp1);
910 step1[30] = dct_const_round_shift(temp2);
911
912 temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
913 temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
914 step1[18] = dct_const_round_shift(temp1);
915 step1[29] = dct_const_round_shift(temp2);
916
917 temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
918 temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
919 step1[19] = dct_const_round_shift(temp1);
920 step1[28] = dct_const_round_shift(temp2);
921
922 temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
923 temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
924 step1[20] = dct_const_round_shift(temp1);
925 step1[27] = dct_const_round_shift(temp2);
926
927 temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
928 temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
929 step1[21] = dct_const_round_shift(temp1);
930 step1[26] = dct_const_round_shift(temp2);
931
932 temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
933 temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
934 step1[22] = dct_const_round_shift(temp1);
935 step1[25] = dct_const_round_shift(temp2);
936
937 temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
938 temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
939 step1[23] = dct_const_round_shift(temp1);
940 step1[24] = dct_const_round_shift(temp2);
941
942 // stage 2
943 step2[0] = step1[0];
944 step2[1] = step1[1];
945 step2[2] = step1[2];
946 step2[3] = step1[3];
947 step2[4] = step1[4];
948 step2[5] = step1[5];
949 step2[6] = step1[6];
950 step2[7] = step1[7];
951
952 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
953 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
954 step2[8] = dct_const_round_shift(temp1);
955 step2[15] = dct_const_round_shift(temp2);
956
957 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
958 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
959 step2[9] = dct_const_round_shift(temp1);
960 step2[14] = dct_const_round_shift(temp2);
961
962 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
963 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
964 step2[10] = dct_const_round_shift(temp1);
965 step2[13] = dct_const_round_shift(temp2);
966
967 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
968 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
969 step2[11] = dct_const_round_shift(temp1);
970 step2[12] = dct_const_round_shift(temp2);
971
972 step2[16] = step1[16] + step1[17];
973 step2[17] = step1[16] - step1[17];
974 step2[18] = -step1[18] + step1[19];
975 step2[19] = step1[18] + step1[19];
976 step2[20] = step1[20] + step1[21];
977 step2[21] = step1[20] - step1[21];
978 step2[22] = -step1[22] + step1[23];
979 step2[23] = step1[22] + step1[23];
980 step2[24] = step1[24] + step1[25];
981 step2[25] = step1[24] - step1[25];
982 step2[26] = -step1[26] + step1[27];
983 step2[27] = step1[26] + step1[27];
984 step2[28] = step1[28] + step1[29];
985 step2[29] = step1[28] - step1[29];
986 step2[30] = -step1[30] + step1[31];
987 step2[31] = step1[30] + step1[31];
988
989 // stage 3
990 step1[0] = step2[0];
991 step1[1] = step2[1];
992 step1[2] = step2[2];
993 step1[3] = step2[3];
994
995 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
996 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
997 step1[4] = dct_const_round_shift(temp1);
998 step1[7] = dct_const_round_shift(temp2);
999 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
1000 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
1001 step1[5] = dct_const_round_shift(temp1);
1002 step1[6] = dct_const_round_shift(temp2);
1003
1004 step1[8] = step2[8] + step2[9];
1005 step1[9] = step2[8] - step2[9];
1006 step1[10] = -step2[10] + step2[11];
1007 step1[11] = step2[10] + step2[11];
1008 step1[12] = step2[12] + step2[13];
1009 step1[13] = step2[12] - step2[13];
1010 step1[14] = -step2[14] + step2[15];
1011 step1[15] = step2[14] + step2[15];
1012
1013 step1[16] = step2[16];
1014 step1[31] = step2[31];
1015 temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
1016 temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
1017 step1[17] = dct_const_round_shift(temp1);
1018 step1[30] = dct_const_round_shift(temp2);
1019 temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
1020 temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
1021 step1[18] = dct_const_round_shift(temp1);
1022 step1[29] = dct_const_round_shift(temp2);
1023 step1[19] = step2[19];
1024 step1[20] = step2[20];
1025 temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
1026 temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
1027 step1[21] = dct_const_round_shift(temp1);
1028 step1[26] = dct_const_round_shift(temp2);
1029 temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
1030 temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
1031 step1[22] = dct_const_round_shift(temp1);
1032 step1[25] = dct_const_round_shift(temp2);
1033 step1[23] = step2[23];
1034 step1[24] = step2[24];
1035 step1[27] = step2[27];
1036 step1[28] = step2[28];
1037
1038 // stage 4
1039 temp1 = (step1[0] + step1[1]) * cospi_16_64;
1040 temp2 = (step1[0] - step1[1]) * cospi_16_64;
1041 step2[0] = dct_const_round_shift(temp1);
1042 step2[1] = dct_const_round_shift(temp2);
1043 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
1044 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
1045 step2[2] = dct_const_round_shift(temp1);
1046 step2[3] = dct_const_round_shift(temp2);
1047 step2[4] = step1[4] + step1[5];
1048 step2[5] = step1[4] - step1[5];
1049 step2[6] = -step1[6] + step1[7];
1050 step2[7] = step1[6] + step1[7];
1051
1052 step2[8] = step1[8];
1053 step2[15] = step1[15];
1054 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
1055 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
1056 step2[9] = dct_const_round_shift(temp1);
1057 step2[14] = dct_const_round_shift(temp2);
1058 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
1059 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
1060 step2[10] = dct_const_round_shift(temp1);
1061 step2[13] = dct_const_round_shift(temp2);
1062 step2[11] = step1[11];
1063 step2[12] = step1[12];
1064
1065 step2[16] = step1[16] + step1[19];
1066 step2[17] = step1[17] + step1[18];
1067 step2[18] = step1[17] - step1[18];
1068 step2[19] = step1[16] - step1[19];
1069 step2[20] = -step1[20] + step1[23];
1070 step2[21] = -step1[21] + step1[22];
1071 step2[22] = step1[21] + step1[22];
1072 step2[23] = step1[20] + step1[23];
1073
1074 step2[24] = step1[24] + step1[27];
1075 step2[25] = step1[25] + step1[26];
1076 step2[26] = step1[25] - step1[26];
1077 step2[27] = step1[24] - step1[27];
1078 step2[28] = -step1[28] + step1[31];
1079 step2[29] = -step1[29] + step1[30];
1080 step2[30] = step1[29] + step1[30];
1081 step2[31] = step1[28] + step1[31];
1082
1083 // stage 5
1084 step1[0] = step2[0] + step2[3];
1085 step1[1] = step2[1] + step2[2];
1086 step1[2] = step2[1] - step2[2];
1087 step1[3] = step2[0] - step2[3];
1088 step1[4] = step2[4];
1089 temp1 = (step2[6] - step2[5]) * cospi_16_64;
1090 temp2 = (step2[5] + step2[6]) * cospi_16_64;
1091 step1[5] = dct_const_round_shift(temp1);
1092 step1[6] = dct_const_round_shift(temp2);
1093 step1[7] = step2[7];
1094
1095 step1[8] = step2[8] + step2[11];
1096 step1[9] = step2[9] + step2[10];
1097 step1[10] = step2[9] - step2[10];
1098 step1[11] = step2[8] - step2[11];
1099 step1[12] = -step2[12] + step2[15];
1100 step1[13] = -step2[13] + step2[14];
1101 step1[14] = step2[13] + step2[14];
1102 step1[15] = step2[12] + step2[15];
1103
1104 step1[16] = step2[16];
1105 step1[17] = step2[17];
1106 temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
1107 temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
1108 step1[18] = dct_const_round_shift(temp1);
1109 step1[29] = dct_const_round_shift(temp2);
1110 temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
1111 temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
1112 step1[19] = dct_const_round_shift(temp1);
1113 step1[28] = dct_const_round_shift(temp2);
1114 temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
1115 temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
1116 step1[20] = dct_const_round_shift(temp1);
1117 step1[27] = dct_const_round_shift(temp2);
1118 temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
1119 temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
1120 step1[21] = dct_const_round_shift(temp1);
1121 step1[26] = dct_const_round_shift(temp2);
1122 step1[22] = step2[22];
1123 step1[23] = step2[23];
1124 step1[24] = step2[24];
1125 step1[25] = step2[25];
1126 step1[30] = step2[30];
1127 step1[31] = step2[31];
1128
1129 // stage 6
1130 step2[0] = step1[0] + step1[7];
1131 step2[1] = step1[1] + step1[6];
1132 step2[2] = step1[2] + step1[5];
1133 step2[3] = step1[3] + step1[4];
1134 step2[4] = step1[3] - step1[4];
1135 step2[5] = step1[2] - step1[5];
1136 step2[6] = step1[1] - step1[6];
1137 step2[7] = step1[0] - step1[7];
1138 step2[8] = step1[8];
1139 step2[9] = step1[9];
1140 temp1 = (-step1[10] + step1[13]) * cospi_16_64;
1141 temp2 = (step1[10] + step1[13]) * cospi_16_64;
1142 step2[10] = dct_const_round_shift(temp1);
1143 step2[13] = dct_const_round_shift(temp2);
1144 temp1 = (-step1[11] + step1[12]) * cospi_16_64;
1145 temp2 = (step1[11] + step1[12]) * cospi_16_64;
1146 step2[11] = dct_const_round_shift(temp1);
1147 step2[12] = dct_const_round_shift(temp2);
1148 step2[14] = step1[14];
1149 step2[15] = step1[15];
1150
1151 step2[16] = step1[16] + step1[23];
1152 step2[17] = step1[17] + step1[22];
1153 step2[18] = step1[18] + step1[21];
1154 step2[19] = step1[19] + step1[20];
1155 step2[20] = step1[19] - step1[20];
1156 step2[21] = step1[18] - step1[21];
1157 step2[22] = step1[17] - step1[22];
1158 step2[23] = step1[16] - step1[23];
1159
1160 step2[24] = -step1[24] + step1[31];
1161 step2[25] = -step1[25] + step1[30];
1162 step2[26] = -step1[26] + step1[29];
1163 step2[27] = -step1[27] + step1[28];
1164 step2[28] = step1[27] + step1[28];
1165 step2[29] = step1[26] + step1[29];
1166 step2[30] = step1[25] + step1[30];
1167 step2[31] = step1[24] + step1[31];
1168
1169 // stage 7
1170 step1[0] = step2[0] + step2[15];
1171 step1[1] = step2[1] + step2[14];
1172 step1[2] = step2[2] + step2[13];
1173 step1[3] = step2[3] + step2[12];
1174 step1[4] = step2[4] + step2[11];
1175 step1[5] = step2[5] + step2[10];
1176 step1[6] = step2[6] + step2[9];
1177 step1[7] = step2[7] + step2[8];
1178 step1[8] = step2[7] - step2[8];
1179 step1[9] = step2[6] - step2[9];
1180 step1[10] = step2[5] - step2[10];
1181 step1[11] = step2[4] - step2[11];
1182 step1[12] = step2[3] - step2[12];
1183 step1[13] = step2[2] - step2[13];
1184 step1[14] = step2[1] - step2[14];
1185 step1[15] = step2[0] - step2[15];
1186
1187 step1[16] = step2[16];
1188 step1[17] = step2[17];
1189 step1[18] = step2[18];
1190 step1[19] = step2[19];
1191 temp1 = (-step2[20] + step2[27]) * cospi_16_64;
1192 temp2 = (step2[20] + step2[27]) * cospi_16_64;
1193 step1[20] = dct_const_round_shift(temp1);
1194 step1[27] = dct_const_round_shift(temp2);
1195 temp1 = (-step2[21] + step2[26]) * cospi_16_64;
1196 temp2 = (step2[21] + step2[26]) * cospi_16_64;
1197 step1[21] = dct_const_round_shift(temp1);
1198 step1[26] = dct_const_round_shift(temp2);
1199 temp1 = (-step2[22] + step2[25]) * cospi_16_64;
1200 temp2 = (step2[22] + step2[25]) * cospi_16_64;
1201 step1[22] = dct_const_round_shift(temp1);
1202 step1[25] = dct_const_round_shift(temp2);
1203 temp1 = (-step2[23] + step2[24]) * cospi_16_64;
1204 temp2 = (step2[23] + step2[24]) * cospi_16_64;
1205 step1[23] = dct_const_round_shift(temp1);
1206 step1[24] = dct_const_round_shift(temp2);
1207 step1[28] = step2[28];
1208 step1[29] = step2[29];
1209 step1[30] = step2[30];
1210 step1[31] = step2[31];
1211
1212 // final stage
1213 output[0] = step1[0] + step1[31];
1214 output[1] = step1[1] + step1[30];
1215 output[2] = step1[2] + step1[29];
1216 output[3] = step1[3] + step1[28];
1217 output[4] = step1[4] + step1[27];
1218 output[5] = step1[5] + step1[26];
1219 output[6] = step1[6] + step1[25];
1220 output[7] = step1[7] + step1[24];
1221 output[8] = step1[8] + step1[23];
1222 output[9] = step1[9] + step1[22];
1223 output[10] = step1[10] + step1[21];
1224 output[11] = step1[11] + step1[20];
1225 output[12] = step1[12] + step1[19];
1226 output[13] = step1[13] + step1[18];
1227 output[14] = step1[14] + step1[17];
1228 output[15] = step1[15] + step1[16];
1229 output[16] = step1[15] - step1[16];
1230 output[17] = step1[14] - step1[17];
1231 output[18] = step1[13] - step1[18];
1232 output[19] = step1[12] - step1[19];
1233 output[20] = step1[11] - step1[20];
1234 output[21] = step1[10] - step1[21];
1235 output[22] = step1[9] - step1[22];
1236 output[23] = step1[8] - step1[23];
1237 output[24] = step1[7] - step1[24];
1238 output[25] = step1[6] - step1[25];
1239 output[26] = step1[5] - step1[26];
1240 output[27] = step1[4] - step1[27];
1241 output[28] = step1[3] - step1[28];
1242 output[29] = step1[2] - step1[29];
1243 output[30] = step1[1] - step1[30];
1244 output[31] = step1[0] - step1[31];
1245 }
1246
void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int16_t out[32 * 32];
  int16_t temp_in[32], temp_out[32];
  int r, c;

  // Row pass: run the 1D transform on each row of coefficients, but skip
  // rows that are entirely zero (common for sparse quantized blocks) and
  // just clear the corresponding row of the intermediate buffer.
  for (r = 0; r < 32; ++r) {
    int16_t row_or = 0;
    for (c = 0; c < 32; ++c)
      row_or |= input[r * 32 + c];

    if (row_or)
      idct32(input + r * 32, out + r * 32);
    else
      vpx_memset(out + r * 32, 0, sizeof(int16_t) * 32);
  }

  // Column pass: transform each column of the intermediate result, then
  // round (shift by 6 to drop the transform's fixed-point scaling), clamp,
  // and accumulate into the destination frame buffer.
  for (c = 0; c < 32; ++c) {
    for (r = 0; r < 32; ++r)
      temp_in[r] = out[r * 32 + c];
    idct32(temp_in, temp_out);
    for (r = 0; r < 32; ++r)
      dest[r * stride + c] =
          clip_pixel(dest[r * stride + c] + ROUND_POWER_OF_TWO(temp_out[r], 6));
  }
}
1283
void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int16_t out[32 * 32] = {0};
  int16_t temp_in[32], temp_out[32];
  int r, c;

  // Row pass. With at most 34 coefficients, only the upper-left 8x8 of the
  // input can be non-zero, so the 1D transform is needed for the first 8
  // rows only; the rest of |out| stays zero from the initializer.
  for (r = 0; r < 8; ++r)
    idct32(input + r * 32, out + r * 32);

  // Column pass: transform every column, then round (drop 6 bits of
  // fixed-point scaling), clamp, and add into the destination.
  for (c = 0; c < 32; ++c) {
    for (r = 0; r < 32; ++r)
      temp_in[r] = out[r * 32 + c];
    idct32(temp_in, temp_out);
    for (r = 0; r < 32; ++r)
      dest[r * stride + c] =
          clip_pixel(dest[r * stride + c] + ROUND_POWER_OF_TWO(temp_out[r], 6));
  }
}
1308
void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int r, c;
  int a1;
  int16_t out;

  // DC-only block: both 1D passes collapse to scaling input[0] by
  // cospi_16_64 with rounding, applied twice, followed by the final
  // rounding shift of 6.  The result is a single offset added to every
  // pixel of the 32x32 destination area.
  out = dct_const_round_shift(input[0] * cospi_16_64);
  out = dct_const_round_shift(out * cospi_16_64);
  a1 = ROUND_POWER_OF_TWO(out, 6);

  for (r = 0; r < 32; ++r) {
    for (c = 0; c < 32; ++c)
      dest[c] = clip_pixel(dest[c] + a1);
    dest += stride;
  }
}
1323
1324 // idct
void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
  // Dispatch on the end-of-block index: eob <= 1 means only the DC
  // coefficient is present, so the cheaper DC-only path can be used.
  if (eob > 1) {
    vp9_idct4x4_16_add(input, dest, stride);
  } else {
    vp9_idct4x4_1_add(input, dest, stride);
  }
}
1331
1332
void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
  // Dispatch on the end-of-block index: eob <= 1 means only the DC
  // coefficient is present, so the cheaper DC-only Walsh-Hadamard path
  // can be used.
  if (eob > 1) {
    vp9_iwht4x4_16_add(input, dest, stride);
  } else {
    vp9_iwht4x4_1_add(input, dest, stride);
  }
}
1339
void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
  // If dc is 1, then input[0] is the reconstructed value, do not need
  // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.

  // The calculation can be simplified if there are not many non-zero dct
  // coefficients. Use eobs to decide what to do.
  // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
  // Combine that with code here.
  if (eob == 1) {
    // DC only DCT coefficient
    vp9_idct8x8_1_add(input, dest, stride);
    return;
  }
  if (eob <= 12) {
    // Few enough coefficients for the reduced 12-coefficient path.
    vp9_idct8x8_12_add(input, dest, stride);
    return;
  }
  vp9_idct8x8_64_add(input, dest, stride);
}
1356
void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride,
                       int eob) {
  /* The calculation can be simplified if there are not many non-zero dct
   * coefficients. Use eobs to separate different cases. */
  if (eob == 1) {
    /* DC only DCT coefficient. */
    vp9_idct16x16_1_add(input, dest, stride);
    return;
  }
  if (eob <= 10) {
    /* Few enough coefficients for the reduced 10-coefficient path. */
    vp9_idct16x16_10_add(input, dest, stride);
    return;
  }
  vp9_idct16x16_256_add(input, dest, stride);
}
1369
void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride,
                       int eob) {
  // Pick the cheapest inverse transform the coefficient count allows.
  if (eob == 1) {
    // DC only DCT coefficient.
    vp9_idct32x32_1_add(input, dest, stride);
    return;
  }
  if (eob <= 34) {
    // non-zero coeff only in upper-left 8x8
    vp9_idct32x32_34_add(input, dest, stride);
    return;
  }
  vp9_idct32x32_1024_add(input, dest, stride);
}
1380
1381 // iht
void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
                    int stride, int eob) {
  // Hybrid transforms (any type other than DCT_DCT) always run the full
  // 16-coefficient path; pure DCT can exploit eob-based shortcuts.
  if (tx_type != DCT_DCT) {
    vp9_iht4x4_16_add(input, dest, stride, tx_type);
    return;
  }
  vp9_idct4x4_add(input, dest, stride, eob);
}
1389
void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
                    int stride, int eob) {
  // Hybrid transforms (any type other than DCT_DCT) always run the full
  // 64-coefficient path; pure DCT can exploit eob-based shortcuts.
  if (tx_type != DCT_DCT) {
    vp9_iht8x8_64_add(input, dest, stride, tx_type);
    return;
  }
  vp9_idct8x8_add(input, dest, stride, eob);
}
1398
void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
                      int stride, int eob) {
  // Hybrid transforms (any type other than DCT_DCT) always run the full
  // 256-coefficient path; pure DCT can exploit eob-based shortcuts.
  if (tx_type != DCT_DCT) {
    vp9_iht16x16_256_add(input, dest, stride, tx_type);
    return;
  }
  vp9_idct16x16_add(input, dest, stride, eob);
}
1407