1 /* ------------------------------------------------------------------
2 * Copyright (C) 1998-2009 PacketVideo
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
13 * express or implied.
14 * See the License for the specific language governing permissions
15 * and limitations under the License.
16 * -------------------------------------------------------------------
17 */
18 #include "mp4def.h"
19 #include "idct.h"
20 #include "motion_comp.h"
21
22 #ifdef FAST_IDCT
23
24 /****************************************************************
25 * vca_idct.c : created 6/1/99 for several options
26 * of hard-coded reduced idct function (using nz_coefs)
27 ******************************************************************/
28
29 /*****************************************************/
30 //pretested version
idctrow0(int16 *,uint8 *,uint8 *,int)31 void idctrow0(int16 *, uint8 *, uint8 *, int)
32 {
33 return ;
34 }
idctcol0(int16 *)35 void idctcol0(int16 *)
36 {
37 return ;
38 }
39
idctrow1(int16 * blk,uint8 * pred,uint8 * dst,int width)40 void idctrow1(int16 *blk, uint8 *pred, uint8 *dst, int width)
41 {
42 /* shortcut */
43 int tmp;
44 int i = 8;
45 uint32 pred_word, dst_word;
46 int res, res2;
47
48 /* preset the offset, such that we can take advantage pre-offset addressing mode */
49 width -= 4;
50 dst -= width;
51 pred -= 12;
52 blk -= 8;
53
54 while (i--)
55 {
56 tmp = (*(blk += 8) + 32) >> 6;
57 *blk = 0;
58
59 pred_word = *((uint32*)(pred += 12)); /* read 4 bytes from pred */
60 res = tmp + (pred_word & 0xFF);
61 CLIP_RESULT(res);
62 res2 = tmp + ((pred_word >> 8) & 0xFF);
63 CLIP_RESULT(res2);
64 dst_word = (res2 << 8) | res;
65 res = tmp + ((pred_word >> 16) & 0xFF);
66 CLIP_RESULT(res);
67 dst_word |= (res << 16);
68 res = tmp + ((pred_word >> 24) & 0xFF);
69 CLIP_RESULT(res);
70 dst_word |= (res << 24);
71 *((uint32*)(dst += width)) = dst_word; /* save 4 bytes to dst */
72
73 pred_word = *((uint32*)(pred += 4)); /* read 4 bytes from pred */
74 res = tmp + (pred_word & 0xFF);
75 CLIP_RESULT(res);
76 res2 = tmp + ((pred_word >> 8) & 0xFF);
77 CLIP_RESULT(res2);
78 dst_word = (res2 << 8) | res;
79 res = tmp + ((pred_word >> 16) & 0xFF);
80 CLIP_RESULT(res);
81 dst_word |= (res << 16);
82 res = tmp + ((pred_word >> 24) & 0xFF);
83 CLIP_RESULT(res);
84 dst_word |= (res << 24);
85 *((uint32*)(dst += 4)) = dst_word; /* save 4 bytes to dst */
86 }
87 return;
88 }
89
idctcol1(int16 * blk)90 void idctcol1(int16 *blk)
91 { /* shortcut */
92 blk[0] = blk[8] = blk[16] = blk[24] = blk[32] = blk[40] = blk[48] = blk[56] =
93 blk[0] << 3;
94 return;
95 }
96
idctrow2(int16 * blk,uint8 * pred,uint8 * dst,int width)97 void idctrow2(int16 *blk, uint8 *pred, uint8 *dst, int width)
98 {
99 int32 x0, x1, x2, x4, x5;
100 int i = 8;
101 uint32 pred_word, dst_word;
102 int res, res2;
103
104 /* preset the offset, such that we can take advantage pre-offset addressing mode */
105 width -= 4;
106 dst -= width;
107 pred -= 12;
108 blk -= 8;
109
110 while (i--)
111 {
112 /* shortcut */
113 x4 = blk[9];
114 blk[9] = 0;
115 x0 = ((*(blk += 8)) << 8) + 8192;
116 *blk = 0; /* for proper rounding in the fourth stage */
117
118 /* first stage */
119 x5 = (W7 * x4 + 4) >> 3;
120 x4 = (W1 * x4 + 4) >> 3;
121
122 /* third stage */
123 x2 = (181 * (x4 + x5) + 128) >> 8;
124 x1 = (181 * (x4 - x5) + 128) >> 8;
125
126 /* fourth stage */
127 pred_word = *((uint32*)(pred += 12)); /* read 4 bytes from pred */
128 res = (x0 + x4) >> 14;
129 ADD_AND_CLIP1(res);
130 res2 = (x0 + x2) >> 14;
131 ADD_AND_CLIP2(res2);
132 dst_word = (res2 << 8) | res;
133 res = (x0 + x1) >> 14;
134 ADD_AND_CLIP3(res);
135 dst_word |= (res << 16);
136 res = (x0 + x5) >> 14;
137 ADD_AND_CLIP4(res);
138 dst_word |= (res << 24);
139 *((uint32*)(dst += width)) = dst_word; /* save 4 bytes to dst */
140
141 pred_word = *((uint32*)(pred += 4)); /* read 4 bytes from pred */
142 res = (x0 - x5) >> 14;
143 ADD_AND_CLIP1(res);
144 res2 = (x0 - x1) >> 14;
145 ADD_AND_CLIP2(res2);
146 dst_word = (res2 << 8) | res;
147 res = (x0 - x2) >> 14;
148 ADD_AND_CLIP3(res);
149 dst_word |= (res << 16);
150 res = (x0 - x4) >> 14;
151 ADD_AND_CLIP4(res);
152 dst_word |= (res << 24);
153 *((uint32*)(dst += 4)) = dst_word; /* save 4 bytes to dst */
154 }
155 return ;
156 }
157
idctcol2(int16 * blk)158 void idctcol2(int16 *blk)
159 {
160 int32 x0, x1, x3, x5, x7;//, x8;
161
162 x1 = blk[8];
163 x0 = ((int32)blk[0] << 11) + 128;
164 /* both upper and lower*/
165
166 x7 = W7 * x1;
167 x1 = W1 * x1;
168
169 x3 = x7;
170 x5 = (181 * (x1 - x7) + 128) >> 8;
171 x7 = (181 * (x1 + x7) + 128) >> 8;
172
173 blk[0] = (x0 + x1) >> 8;
174 blk[8] = (x0 + x7) >> 8;
175 blk[16] = (x0 + x5) >> 8;
176 blk[24] = (x0 + x3) >> 8;
177 blk[56] = (x0 - x1) >> 8;
178 blk[48] = (x0 - x7) >> 8;
179 blk[40] = (x0 - x5) >> 8;
180 blk[32] = (x0 - x3) >> 8;
181
182 return ;
183 }
184
idctrow3(int16 * blk,uint8 * pred,uint8 * dst,int width)185 void idctrow3(int16 *blk, uint8 *pred, uint8 *dst, int width)
186 {
187 int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
188 int i = 8;
189 uint32 pred_word, dst_word;
190 int res, res2;
191
192 /* preset the offset, such that we can take advantage pre-offset addressing mode */
193 width -= 4;
194 dst -= width;
195 pred -= 12;
196 blk -= 8;
197
198 while (i--)
199 {
200 x2 = blk[10];
201 blk[10] = 0;
202 x1 = blk[9];
203 blk[9] = 0;
204 x0 = ((*(blk += 8)) << 8) + 8192;
205 *blk = 0; /* for proper rounding in the fourth stage */
206 /* both upper and lower*/
207 /* both x2orx6 and x0orx4 */
208
209 x4 = x0;
210 x6 = (W6 * x2 + 4) >> 3;
211 x2 = (W2 * x2 + 4) >> 3;
212 x8 = x0 - x2;
213 x0 += x2;
214 x2 = x8;
215 x8 = x4 - x6;
216 x4 += x6;
217 x6 = x8;
218
219 x7 = (W7 * x1 + 4) >> 3;
220 x1 = (W1 * x1 + 4) >> 3;
221 x3 = x7;
222 x5 = (181 * (x1 - x7) + 128) >> 8;
223 x7 = (181 * (x1 + x7) + 128) >> 8;
224
225 pred_word = *((uint32*)(pred += 12)); /* read 4 bytes from pred */
226 res = (x0 + x1) >> 14;
227 ADD_AND_CLIP1(res);
228 res2 = (x4 + x7) >> 14;
229 ADD_AND_CLIP2(res2);
230 dst_word = (res2 << 8) | res;
231 res = (x6 + x5) >> 14;
232 ADD_AND_CLIP3(res);
233 dst_word |= (res << 16);
234 res = (x2 + x3) >> 14;
235 ADD_AND_CLIP4(res);
236 dst_word |= (res << 24);
237 *((uint32*)(dst += width)) = dst_word; /* save 4 bytes to dst */
238
239 pred_word = *((uint32*)(pred += 4)); /* read 4 bytes from pred */
240 res = (x2 - x3) >> 14;
241 ADD_AND_CLIP1(res);
242 res2 = (x6 - x5) >> 14;
243 ADD_AND_CLIP2(res2);
244 dst_word = (res2 << 8) | res;
245 res = (x4 - x7) >> 14;
246 ADD_AND_CLIP3(res);
247 dst_word |= (res << 16);
248 res = (x0 - x1) >> 14;
249 ADD_AND_CLIP4(res);
250 dst_word |= (res << 24);
251 *((uint32*)(dst += 4)) = dst_word; /* save 4 bytes to dst */
252 }
253
254 return ;
255 }
256
idctcol3(int16 * blk)257 void idctcol3(int16 *blk)
258 {
259 int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
260
261 x2 = blk[16];
262 x1 = blk[8];
263 x0 = ((int32)blk[0] << 11) + 128;
264
265 x4 = x0;
266 x6 = W6 * x2;
267 x2 = W2 * x2;
268 x8 = x0 - x2;
269 x0 += x2;
270 x2 = x8;
271 x8 = x4 - x6;
272 x4 += x6;
273 x6 = x8;
274
275 x7 = W7 * x1;
276 x1 = W1 * x1;
277 x3 = x7;
278 x5 = (181 * (x1 - x7) + 128) >> 8;
279 x7 = (181 * (x1 + x7) + 128) >> 8;
280
281 blk[0] = (x0 + x1) >> 8;
282 blk[8] = (x4 + x7) >> 8;
283 blk[16] = (x6 + x5) >> 8;
284 blk[24] = (x2 + x3) >> 8;
285 blk[56] = (x0 - x1) >> 8;
286 blk[48] = (x4 - x7) >> 8;
287 blk[40] = (x6 - x5) >> 8;
288 blk[32] = (x2 - x3) >> 8;
289
290 return;
291 }
292
293
idctrow4(int16 * blk,uint8 * pred,uint8 * dst,int width)294 void idctrow4(int16 *blk, uint8 *pred, uint8 *dst, int width)
295 {
296 int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
297 int i = 8;
298 uint32 pred_word, dst_word;
299 int res, res2;
300
301 /* preset the offset, such that we can take advantage pre-offset addressing mode */
302 width -= 4;
303 dst -= width;
304 pred -= 12;
305 blk -= 8;
306
307 while (i--)
308 {
309 x2 = blk[10];
310 blk[10] = 0;
311 x1 = blk[9];
312 blk[9] = 0;
313 x3 = blk[11];
314 blk[11] = 0;
315 x0 = ((*(blk += 8)) << 8) + 8192;
316 *blk = 0; /* for proper rounding in the fourth stage */
317
318 x4 = x0;
319 x6 = (W6 * x2 + 4) >> 3;
320 x2 = (W2 * x2 + 4) >> 3;
321 x8 = x0 - x2;
322 x0 += x2;
323 x2 = x8;
324 x8 = x4 - x6;
325 x4 += x6;
326 x6 = x8;
327
328 x7 = (W7 * x1 + 4) >> 3;
329 x1 = (W1 * x1 + 4) >> 3;
330 x5 = (W3 * x3 + 4) >> 3;
331 x3 = (- W5 * x3 + 4) >> 3;
332 x8 = x1 - x5;
333 x1 += x5;
334 x5 = x8;
335 x8 = x7 - x3;
336 x3 += x7;
337 x7 = (181 * (x5 + x8) + 128) >> 8;
338 x5 = (181 * (x5 - x8) + 128) >> 8;
339
340 pred_word = *((uint32*)(pred += 12)); /* read 4 bytes from pred */
341 res = (x0 + x1) >> 14;
342 ADD_AND_CLIP1(res);
343 res2 = (x4 + x7) >> 14;
344 ADD_AND_CLIP2(res2);
345 dst_word = (res2 << 8) | res;
346 res = (x6 + x5) >> 14;
347 ADD_AND_CLIP3(res);
348 dst_word |= (res << 16);
349 res = (x2 + x3) >> 14;
350 ADD_AND_CLIP4(res);
351 dst_word |= (res << 24);
352 *((uint32*)(dst += width)) = dst_word; /* save 4 bytes to dst */
353
354 pred_word = *((uint32*)(pred += 4)); /* read 4 bytes from pred */
355 res = (x2 - x3) >> 14;
356 ADD_AND_CLIP1(res);
357 res2 = (x6 - x5) >> 14;
358 ADD_AND_CLIP2(res2);
359 dst_word = (res2 << 8) | res;
360 res = (x4 - x7) >> 14;
361 ADD_AND_CLIP3(res);
362 dst_word |= (res << 16);
363 res = (x0 - x1) >> 14;
364 ADD_AND_CLIP4(res);
365 dst_word |= (res << 24);
366 *((uint32*)(dst += 4)) = dst_word; /* save 4 bytes to dst */
367 }
368 return ;
369 }
370
idctcol4(int16 * blk)371 void idctcol4(int16 *blk)
372 {
373 int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
374 x2 = blk[16];
375 x1 = blk[8];
376 x3 = blk[24];
377 x0 = ((int32)blk[0] << 11) + 128;
378
379 x4 = x0;
380 x6 = W6 * x2;
381 x2 = W2 * x2;
382 x8 = x0 - x2;
383 x0 += x2;
384 x2 = x8;
385 x8 = x4 - x6;
386 x4 += x6;
387 x6 = x8;
388
389 x7 = W7 * x1;
390 x1 = W1 * x1;
391 x5 = W3 * x3;
392 x3 = -W5 * x3;
393 x8 = x1 - x5;
394 x1 += x5;
395 x5 = x8;
396 x8 = x7 - x3;
397 x3 += x7;
398 x7 = (181 * (x5 + x8) + 128) >> 8;
399 x5 = (181 * (x5 - x8) + 128) >> 8;
400
401
402 blk[0] = (x0 + x1) >> 8;
403 blk[8] = (x4 + x7) >> 8;
404 blk[16] = (x6 + x5) >> 8;
405 blk[24] = (x2 + x3) >> 8;
406 blk[56] = (x0 - x1) >> 8;
407 blk[48] = (x4 - x7) >> 8;
408 blk[40] = (x6 - x5) >> 8;
409 blk[32] = (x2 - x3) >> 8;
410
411 return ;
412 }
413
idctrow0_intra(int16 *,PIXEL *,int)414 void idctrow0_intra(int16 *, PIXEL *, int)
415 {
416 return ;
417 }
418
idctrow1_intra(int16 * blk,PIXEL * comp,int width)419 void idctrow1_intra(int16 *blk, PIXEL *comp, int width)
420 {
421 /* shortcut */
422 int32 tmp;
423 int i = 8;
424 int offset = width;
425 uint32 word;
426
427 comp -= offset;
428 while (i--)
429 {
430 tmp = ((blk[0] + 32) >> 6);
431 blk[0] = 0;
432 CLIP_RESULT(tmp)
433
434 word = (tmp << 8) | tmp;
435 word = (word << 16) | word;
436
437 *((uint32*)(comp += offset)) = word;
438 *((uint32*)(comp + 4)) = word;
439
440
441
442
443 blk += B_SIZE;
444 }
445 return;
446 }
447
idctrow2_intra(int16 * blk,PIXEL * comp,int width)448 void idctrow2_intra(int16 *blk, PIXEL *comp, int width)
449 {
450 int32 x0, x1, x2, x4, x5, temp;
451 int i = 8;
452 int offset = width;
453 int32 word;
454
455 comp -= offset;
456 while (i--)
457 {
458 /* shortcut */
459 x4 = blk[1];
460 blk[1] = 0;
461 x0 = ((int32)blk[0] << 8) + 8192;
462 blk[0] = 0; /* for proper rounding in the fourth stage */
463
464 /* first stage */
465 x5 = (W7 * x4 + 4) >> 3;
466 x4 = (W1 * x4 + 4) >> 3;
467
468 /* third stage */
469 x2 = (181 * (x4 + x5) + 128) >> 8;
470 x1 = (181 * (x4 - x5) + 128) >> 8;
471
472 /* fourth stage */
473 word = ((x0 + x4) >> 14);
474 CLIP_RESULT(word)
475
476 temp = ((x0 + x2) >> 14);
477 CLIP_RESULT(temp)
478 word = word | (temp << 8);
479 temp = ((x0 + x1) >> 14);
480 CLIP_RESULT(temp)
481 word = word | (temp << 16);
482 temp = ((x0 + x5) >> 14);
483 CLIP_RESULT(temp)
484 word = word | (temp << 24);
485 *((int32*)(comp += offset)) = word;
486
487 word = ((x0 - x5) >> 14);
488 CLIP_RESULT(word)
489 temp = ((x0 - x1) >> 14);
490 CLIP_RESULT(temp)
491 word = word | (temp << 8);
492 temp = ((x0 - x2) >> 14);
493 CLIP_RESULT(temp)
494 word = word | (temp << 16);
495 temp = ((x0 - x4) >> 14);
496 CLIP_RESULT(temp)
497 word = word | (temp << 24);
498 *((int32*)(comp + 4)) = word;
499
500 blk += B_SIZE;
501 }
502 return ;
503 }
504
idctrow3_intra(int16 * blk,PIXEL * comp,int width)505 void idctrow3_intra(int16 *blk, PIXEL *comp, int width)
506 {
507 int32 x0, x1, x2, x3, x4, x5, x6, x7, x8, temp;
508 int i = 8;
509 int offset = width;
510 int32 word;
511
512 comp -= offset;
513
514 while (i--)
515 {
516 x2 = blk[2];
517 blk[2] = 0;
518 x1 = blk[1];
519 blk[1] = 0;
520 x0 = ((int32)blk[0] << 8) + 8192;
521 blk[0] = 0;/* for proper rounding in the fourth stage */
522 /* both upper and lower*/
523 /* both x2orx6 and x0orx4 */
524
525 x4 = x0;
526 x6 = (W6 * x2 + 4) >> 3;
527 x2 = (W2 * x2 + 4) >> 3;
528 x8 = x0 - x2;
529 x0 += x2;
530 x2 = x8;
531 x8 = x4 - x6;
532 x4 += x6;
533 x6 = x8;
534
535 x7 = (W7 * x1 + 4) >> 3;
536 x1 = (W1 * x1 + 4) >> 3;
537 x3 = x7;
538 x5 = (181 * (x1 - x7) + 128) >> 8;
539 x7 = (181 * (x1 + x7) + 128) >> 8;
540
541 word = ((x0 + x1) >> 14);
542 CLIP_RESULT(word)
543 temp = ((x4 + x7) >> 14);
544 CLIP_RESULT(temp)
545 word = word | (temp << 8);
546
547
548 temp = ((x6 + x5) >> 14);
549 CLIP_RESULT(temp)
550 word = word | (temp << 16);
551
552 temp = ((x2 + x3) >> 14);
553 CLIP_RESULT(temp)
554 word = word | (temp << 24);
555 *((int32*)(comp += offset)) = word;
556
557 word = ((x2 - x3) >> 14);
558 CLIP_RESULT(word)
559
560 temp = ((x6 - x5) >> 14);
561 CLIP_RESULT(temp)
562 word = word | (temp << 8);
563
564 temp = ((x4 - x7) >> 14);
565 CLIP_RESULT(temp)
566 word = word | (temp << 16);
567
568 temp = ((x0 - x1) >> 14);
569 CLIP_RESULT(temp)
570 word = word | (temp << 24);
571 *((int32*)(comp + 4)) = word;
572
573 blk += B_SIZE;
574 }
575 return ;
576 }
577
idctrow4_intra(int16 * blk,PIXEL * comp,int width)578 void idctrow4_intra(int16 *blk, PIXEL *comp, int width)
579 {
580 int32 x0, x1, x2, x3, x4, x5, x6, x7, x8, temp;
581 int i = 8;
582 int offset = width;
583 int32 word;
584
585 comp -= offset;
586
587 while (i--)
588 {
589 x2 = blk[2];
590 blk[2] = 0;
591 x1 = blk[1];
592 blk[1] = 0;
593 x3 = blk[3];
594 blk[3] = 0;
595 x0 = ((int32)blk[0] << 8) + 8192;
596 blk[0] = 0;/* for proper rounding in the fourth stage */
597
598 x4 = x0;
599 x6 = (W6 * x2 + 4) >> 3;
600 x2 = (W2 * x2 + 4) >> 3;
601 x8 = x0 - x2;
602 x0 += x2;
603 x2 = x8;
604 x8 = x4 - x6;
605 x4 += x6;
606 x6 = x8;
607
608 x7 = (W7 * x1 + 4) >> 3;
609 x1 = (W1 * x1 + 4) >> 3;
610 x5 = (W3 * x3 + 4) >> 3;
611 x3 = (- W5 * x3 + 4) >> 3;
612 x8 = x1 - x5;
613 x1 += x5;
614 x5 = x8;
615 x8 = x7 - x3;
616 x3 += x7;
617 x7 = (181 * (x5 + x8) + 128) >> 8;
618 x5 = (181 * (x5 - x8) + 128) >> 8;
619
620 word = ((x0 + x1) >> 14);
621 CLIP_RESULT(word)
622
623 temp = ((x4 + x7) >> 14);
624 CLIP_RESULT(temp)
625 word = word | (temp << 8);
626
627
628 temp = ((x6 + x5) >> 14);
629 CLIP_RESULT(temp)
630 word = word | (temp << 16);
631
632 temp = ((x2 + x3) >> 14);
633 CLIP_RESULT(temp)
634 word = word | (temp << 24);
635 *((int32*)(comp += offset)) = word;
636
637 word = ((x2 - x3) >> 14);
638 CLIP_RESULT(word)
639
640 temp = ((x6 - x5) >> 14);
641 CLIP_RESULT(temp)
642 word = word | (temp << 8);
643
644 temp = ((x4 - x7) >> 14);
645 CLIP_RESULT(temp)
646 word = word | (temp << 16);
647
648 temp = ((x0 - x1) >> 14);
649 CLIP_RESULT(temp)
650 word = word | (temp << 24);
651 *((int32*)(comp + 4)) = word;
652
653 blk += B_SIZE;
654 }
655
656 return ;
657 }
658
659 #endif
660
661