1 /* ------------------------------------------------------------------
2  * Copyright (C) 1998-2009 PacketVideo
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
13  * express or implied.
14  * See the License for the specific language governing permissions
15  * and limitations under the License.
16  * -------------------------------------------------------------------
17  */
18 #include "mp4def.h"
19 #include "idct.h"
20 #include "motion_comp.h"
21 
22 #ifdef FAST_IDCT
23 
24 /****************************************************************
25 *       vca_idct.c : created 6/1/99 for several options
26 *                     of hard-coded reduced idct function (using nz_coefs)
27 ******************************************************************/
28 
29 /*****************************************************/
30 //pretested version
idctrow0(int16 *,uint8 *,uint8 *,int)31 void idctrow0(int16 *, uint8 *, uint8 *, int)
32 {
33     return ;
34 }
idctcol0(int16 *)35 void idctcol0(int16 *)
36 {
37     return ;
38 }
39 
idctrow1(int16 * blk,uint8 * pred,uint8 * dst,int width)40 void idctrow1(int16 *blk, uint8 *pred, uint8 *dst, int width)
41 {
42     /* shortcut */
43     int tmp;
44     int i = 8;
45     uint32 pred_word, dst_word;
46     int res, res2;
47 
48     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
49     width -= 4;
50     dst -= width;
51     pred -= 12;
52     blk -= 8;
53 
54     while (i--)
55     {
56         tmp = (*(blk += 8) + 32) >> 6;
57         *blk = 0;
58 
59         pred_word = *((uint32*)(pred += 12)); /* read 4 bytes from pred */
60         res = tmp + (pred_word & 0xFF);
61         CLIP_RESULT(res);
62         res2 = tmp + ((pred_word >> 8) & 0xFF);
63         CLIP_RESULT(res2);
64         dst_word = (res2 << 8) | res;
65         res = tmp + ((pred_word >> 16) & 0xFF);
66         CLIP_RESULT(res);
67         dst_word |= (res << 16);
68         res = tmp + ((pred_word >> 24) & 0xFF);
69         CLIP_RESULT(res);
70         dst_word |= (res << 24);
71         *((uint32*)(dst += width)) = dst_word; /* save 4 bytes to dst */
72 
73         pred_word = *((uint32*)(pred += 4)); /* read 4 bytes from pred */
74         res = tmp + (pred_word & 0xFF);
75         CLIP_RESULT(res);
76         res2 = tmp + ((pred_word >> 8) & 0xFF);
77         CLIP_RESULT(res2);
78         dst_word = (res2 << 8) | res;
79         res = tmp + ((pred_word >> 16) & 0xFF);
80         CLIP_RESULT(res);
81         dst_word |= (res << 16);
82         res = tmp + ((pred_word >> 24) & 0xFF);
83         CLIP_RESULT(res);
84         dst_word |= (res << 24);
85         *((uint32*)(dst += 4)) = dst_word; /* save 4 bytes to dst */
86     }
87     return;
88 }
89 
idctcol1(int16 * blk)90 void idctcol1(int16 *blk)
91 { /* shortcut */
92     blk[0] = blk[8] = blk[16] = blk[24] = blk[32] = blk[40] = blk[48] = blk[56] =
93                                               blk[0] << 3;
94     return;
95 }
96 
idctrow2(int16 * blk,uint8 * pred,uint8 * dst,int width)97 void idctrow2(int16 *blk, uint8 *pred, uint8 *dst, int width)
98 {
99     int32 x0, x1, x2, x4, x5;
100     int i = 8;
101     uint32 pred_word, dst_word;
102     int res, res2;
103 
104     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
105     width -= 4;
106     dst -= width;
107     pred -= 12;
108     blk -= 8;
109 
110     while (i--)
111     {
112         /* shortcut */
113         x4 = blk[9];
114         blk[9] = 0;
115         x0 = ((*(blk += 8)) << 8) + 8192;
116         *blk = 0;  /* for proper rounding in the fourth stage */
117 
118         /* first stage */
119         x5 = (W7 * x4 + 4) >> 3;
120         x4 = (W1 * x4 + 4) >> 3;
121 
122         /* third stage */
123         x2 = (181 * (x4 + x5) + 128) >> 8;
124         x1 = (181 * (x4 - x5) + 128) >> 8;
125 
126         /* fourth stage */
127         pred_word = *((uint32*)(pred += 12)); /* read 4 bytes from pred */
128         res = (x0 + x4) >> 14;
129         ADD_AND_CLIP1(res);
130         res2 = (x0 + x2) >> 14;
131         ADD_AND_CLIP2(res2);
132         dst_word = (res2 << 8) | res;
133         res = (x0 + x1) >> 14;
134         ADD_AND_CLIP3(res);
135         dst_word |= (res << 16);
136         res = (x0 + x5) >> 14;
137         ADD_AND_CLIP4(res);
138         dst_word |= (res << 24);
139         *((uint32*)(dst += width)) = dst_word; /* save 4 bytes to dst */
140 
141         pred_word = *((uint32*)(pred += 4)); /* read 4 bytes from pred */
142         res = (x0 - x5) >> 14;
143         ADD_AND_CLIP1(res);
144         res2 = (x0 - x1) >> 14;
145         ADD_AND_CLIP2(res2);
146         dst_word = (res2 << 8) | res;
147         res = (x0 - x2) >> 14;
148         ADD_AND_CLIP3(res);
149         dst_word |= (res << 16);
150         res = (x0 - x4) >> 14;
151         ADD_AND_CLIP4(res);
152         dst_word |= (res << 24);
153         *((uint32*)(dst += 4)) = dst_word; /* save 4 bytes to dst */
154     }
155     return ;
156 }
157 
idctcol2(int16 * blk)158 void idctcol2(int16 *blk)
159 {
160     int32 x0, x1, x3, x5, x7;//, x8;
161 
162     x1 = blk[8];
163     x0 = ((int32)blk[0] << 11) + 128;
164     /* both upper and lower*/
165 
166     x7 = W7 * x1;
167     x1 = W1 * x1;
168 
169     x3 = x7;
170     x5 = (181 * (x1 - x7) + 128) >> 8;
171     x7 = (181 * (x1 + x7) + 128) >> 8;
172 
173     blk[0] = (x0 + x1) >> 8;
174     blk[8] = (x0 + x7) >> 8;
175     blk[16] = (x0 + x5) >> 8;
176     blk[24] = (x0 + x3) >> 8;
177     blk[56] = (x0 - x1) >> 8;
178     blk[48] = (x0 - x7) >> 8;
179     blk[40] = (x0 - x5) >> 8;
180     blk[32] = (x0 - x3) >> 8;
181 
182     return ;
183 }
184 
idctrow3(int16 * blk,uint8 * pred,uint8 * dst,int width)185 void idctrow3(int16 *blk, uint8 *pred, uint8 *dst, int width)
186 {
187     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
188     int i = 8;
189     uint32 pred_word, dst_word;
190     int res, res2;
191 
192     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
193     width -= 4;
194     dst -= width;
195     pred -= 12;
196     blk -= 8;
197 
198     while (i--)
199     {
200         x2 = blk[10];
201         blk[10] = 0;
202         x1 = blk[9];
203         blk[9] = 0;
204         x0 = ((*(blk += 8)) << 8) + 8192;
205         *blk = 0;   /* for proper rounding in the fourth stage */
206         /* both upper and lower*/
207         /* both x2orx6 and x0orx4 */
208 
209         x4 = x0;
210         x6 = (W6 * x2 + 4) >> 3;
211         x2 = (W2 * x2 + 4) >> 3;
212         x8 = x0 - x2;
213         x0 += x2;
214         x2 = x8;
215         x8 = x4 - x6;
216         x4 += x6;
217         x6 = x8;
218 
219         x7 = (W7 * x1 + 4) >> 3;
220         x1 = (W1 * x1 + 4) >> 3;
221         x3 = x7;
222         x5 = (181 * (x1 - x7) + 128) >> 8;
223         x7 = (181 * (x1 + x7) + 128) >> 8;
224 
225         pred_word = *((uint32*)(pred += 12)); /* read 4 bytes from pred */
226         res = (x0 + x1) >> 14;
227         ADD_AND_CLIP1(res);
228         res2 = (x4 + x7) >> 14;
229         ADD_AND_CLIP2(res2);
230         dst_word = (res2 << 8) | res;
231         res = (x6 + x5) >> 14;
232         ADD_AND_CLIP3(res);
233         dst_word |= (res << 16);
234         res = (x2 + x3) >> 14;
235         ADD_AND_CLIP4(res);
236         dst_word |= (res << 24);
237         *((uint32*)(dst += width)) = dst_word; /* save 4 bytes to dst */
238 
239         pred_word = *((uint32*)(pred += 4)); /* read 4 bytes from pred */
240         res = (x2 - x3) >> 14;
241         ADD_AND_CLIP1(res);
242         res2 = (x6 - x5) >> 14;
243         ADD_AND_CLIP2(res2);
244         dst_word = (res2 << 8) | res;
245         res = (x4 - x7) >> 14;
246         ADD_AND_CLIP3(res);
247         dst_word |= (res << 16);
248         res = (x0 - x1) >> 14;
249         ADD_AND_CLIP4(res);
250         dst_word |= (res << 24);
251         *((uint32*)(dst += 4)) = dst_word; /* save 4 bytes to dst */
252     }
253 
254     return ;
255 }
256 
idctcol3(int16 * blk)257 void idctcol3(int16 *blk)
258 {
259     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
260 
261     x2 = blk[16];
262     x1 = blk[8];
263     x0 = ((int32)blk[0] << 11) + 128;
264 
265     x4 = x0;
266     x6 = W6 * x2;
267     x2 = W2 * x2;
268     x8 = x0 - x2;
269     x0 += x2;
270     x2 = x8;
271     x8 = x4 - x6;
272     x4 += x6;
273     x6 = x8;
274 
275     x7 = W7 * x1;
276     x1 = W1 * x1;
277     x3 = x7;
278     x5 = (181 * (x1 - x7) + 128) >> 8;
279     x7 = (181 * (x1 + x7) + 128) >> 8;
280 
281     blk[0] = (x0 + x1) >> 8;
282     blk[8] = (x4 + x7) >> 8;
283     blk[16] = (x6 + x5) >> 8;
284     blk[24] = (x2 + x3) >> 8;
285     blk[56] = (x0 - x1) >> 8;
286     blk[48] = (x4 - x7) >> 8;
287     blk[40] = (x6 - x5) >> 8;
288     blk[32] = (x2 - x3) >> 8;
289 
290     return;
291 }
292 
293 
idctrow4(int16 * blk,uint8 * pred,uint8 * dst,int width)294 void idctrow4(int16 *blk, uint8 *pred, uint8 *dst, int width)
295 {
296     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
297     int i = 8;
298     uint32 pred_word, dst_word;
299     int res, res2;
300 
301     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
302     width -= 4;
303     dst -= width;
304     pred -= 12;
305     blk -= 8;
306 
307     while (i--)
308     {
309         x2 = blk[10];
310         blk[10] = 0;
311         x1 = blk[9];
312         blk[9] = 0;
313         x3 = blk[11];
314         blk[11] = 0;
315         x0 = ((*(blk += 8)) << 8) + 8192;
316         *blk = 0;    /* for proper rounding in the fourth stage */
317 
318         x4 = x0;
319         x6 = (W6 * x2 + 4) >> 3;
320         x2 = (W2 * x2 + 4) >> 3;
321         x8 = x0 - x2;
322         x0 += x2;
323         x2 = x8;
324         x8 = x4 - x6;
325         x4 += x6;
326         x6 = x8;
327 
328         x7 = (W7 * x1 + 4) >> 3;
329         x1 = (W1 * x1 + 4) >> 3;
330         x5 = (W3 * x3 + 4) >> 3;
331         x3 = (- W5 * x3 + 4) >> 3;
332         x8 = x1 - x5;
333         x1 += x5;
334         x5 = x8;
335         x8 = x7 - x3;
336         x3 += x7;
337         x7 = (181 * (x5 + x8) + 128) >> 8;
338         x5 = (181 * (x5 - x8) + 128) >> 8;
339 
340         pred_word = *((uint32*)(pred += 12)); /* read 4 bytes from pred */
341         res = (x0 + x1) >> 14;
342         ADD_AND_CLIP1(res);
343         res2 = (x4 + x7) >> 14;
344         ADD_AND_CLIP2(res2);
345         dst_word = (res2 << 8) | res;
346         res = (x6 + x5) >> 14;
347         ADD_AND_CLIP3(res);
348         dst_word |= (res << 16);
349         res = (x2 + x3) >> 14;
350         ADD_AND_CLIP4(res);
351         dst_word |= (res << 24);
352         *((uint32*)(dst += width)) = dst_word; /* save 4 bytes to dst */
353 
354         pred_word = *((uint32*)(pred += 4)); /* read 4 bytes from pred */
355         res = (x2 - x3) >> 14;
356         ADD_AND_CLIP1(res);
357         res2 = (x6 - x5) >> 14;
358         ADD_AND_CLIP2(res2);
359         dst_word = (res2 << 8) | res;
360         res = (x4 - x7) >> 14;
361         ADD_AND_CLIP3(res);
362         dst_word |= (res << 16);
363         res = (x0 - x1) >> 14;
364         ADD_AND_CLIP4(res);
365         dst_word |= (res << 24);
366         *((uint32*)(dst += 4)) = dst_word; /* save 4 bytes to dst */
367     }
368     return ;
369 }
370 
idctcol4(int16 * blk)371 void idctcol4(int16 *blk)
372 {
373     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
374     x2 = blk[16];
375     x1 = blk[8];
376     x3 = blk[24];
377     x0 = ((int32)blk[0] << 11) + 128;
378 
379     x4 = x0;
380     x6 = W6 * x2;
381     x2 = W2 * x2;
382     x8 = x0 - x2;
383     x0 += x2;
384     x2 = x8;
385     x8 = x4 - x6;
386     x4 += x6;
387     x6 = x8;
388 
389     x7 = W7 * x1;
390     x1 = W1 * x1;
391     x5 = W3 * x3;
392     x3 = -W5 * x3;
393     x8 = x1 - x5;
394     x1 += x5;
395     x5 = x8;
396     x8 = x7 - x3;
397     x3 += x7;
398     x7 = (181 * (x5 + x8) + 128) >> 8;
399     x5 = (181 * (x5 - x8) + 128) >> 8;
400 
401 
402     blk[0] = (x0 + x1) >> 8;
403     blk[8] = (x4 + x7) >> 8;
404     blk[16] = (x6 + x5) >> 8;
405     blk[24] = (x2 + x3) >> 8;
406     blk[56] = (x0 - x1) >> 8;
407     blk[48] = (x4 - x7) >> 8;
408     blk[40] = (x6 - x5) >> 8;
409     blk[32] = (x2 - x3) >> 8;
410 
411     return ;
412 }
413 
idctrow0_intra(int16 *,PIXEL *,int)414 void idctrow0_intra(int16 *, PIXEL *, int)
415 {
416     return ;
417 }
418 
idctrow1_intra(int16 * blk,PIXEL * comp,int width)419 void idctrow1_intra(int16 *blk, PIXEL *comp, int width)
420 {
421     /* shortcut */
422     int32 tmp;
423     int i = 8;
424     int offset = width;
425     uint32 word;
426 
427     comp -= offset;
428     while (i--)
429     {
430         tmp = ((blk[0] + 32) >> 6);
431         blk[0] = 0;
432         CLIP_RESULT(tmp)
433 
434         word = (tmp << 8) | tmp;
435         word = (word << 16) | word;
436 
437         *((uint32*)(comp += offset)) = word;
438         *((uint32*)(comp + 4)) = word;
439 
440 
441 
442 
443         blk += B_SIZE;
444     }
445     return;
446 }
447 
idctrow2_intra(int16 * blk,PIXEL * comp,int width)448 void idctrow2_intra(int16 *blk, PIXEL *comp, int width)
449 {
450     int32 x0, x1, x2, x4, x5, temp;
451     int i = 8;
452     int offset = width;
453     int32 word;
454 
455     comp -= offset;
456     while (i--)
457     {
458         /* shortcut */
459         x4 = blk[1];
460         blk[1] = 0;
461         x0 = ((int32)blk[0] << 8) + 8192;
462         blk[0] = 0;   /* for proper rounding in the fourth stage */
463 
464         /* first stage */
465         x5 = (W7 * x4 + 4) >> 3;
466         x4 = (W1 * x4 + 4) >> 3;
467 
468         /* third stage */
469         x2 = (181 * (x4 + x5) + 128) >> 8;
470         x1 = (181 * (x4 - x5) + 128) >> 8;
471 
472         /* fourth stage */
473         word = ((x0 + x4) >> 14);
474         CLIP_RESULT(word)
475 
476         temp = ((x0 + x2) >> 14);
477         CLIP_RESULT(temp)
478         word = word | (temp << 8);
479         temp = ((x0 + x1) >> 14);
480         CLIP_RESULT(temp)
481         word = word | (temp << 16);
482         temp = ((x0 + x5) >> 14);
483         CLIP_RESULT(temp)
484         word = word | (temp << 24);
485         *((int32*)(comp += offset)) = word;
486 
487         word = ((x0 - x5) >> 14);
488         CLIP_RESULT(word)
489         temp = ((x0 - x1) >> 14);
490         CLIP_RESULT(temp)
491         word = word | (temp << 8);
492         temp = ((x0 - x2) >> 14);
493         CLIP_RESULT(temp)
494         word = word | (temp << 16);
495         temp = ((x0 - x4) >> 14);
496         CLIP_RESULT(temp)
497         word = word | (temp << 24);
498         *((int32*)(comp + 4)) = word;
499 
500         blk += B_SIZE;
501     }
502     return ;
503 }
504 
idctrow3_intra(int16 * blk,PIXEL * comp,int width)505 void idctrow3_intra(int16 *blk, PIXEL *comp, int width)
506 {
507     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8, temp;
508     int i = 8;
509     int offset = width;
510     int32 word;
511 
512     comp -= offset;
513 
514     while (i--)
515     {
516         x2 = blk[2];
517         blk[2] = 0;
518         x1 = blk[1];
519         blk[1] = 0;
520         x0 = ((int32)blk[0] << 8) + 8192;
521         blk[0] = 0;/* for proper rounding in the fourth stage */
522         /* both upper and lower*/
523         /* both x2orx6 and x0orx4 */
524 
525         x4 = x0;
526         x6 = (W6 * x2 + 4) >> 3;
527         x2 = (W2 * x2 + 4) >> 3;
528         x8 = x0 - x2;
529         x0 += x2;
530         x2 = x8;
531         x8 = x4 - x6;
532         x4 += x6;
533         x6 = x8;
534 
535         x7 = (W7 * x1 + 4) >> 3;
536         x1 = (W1 * x1 + 4) >> 3;
537         x3 = x7;
538         x5 = (181 * (x1 - x7) + 128) >> 8;
539         x7 = (181 * (x1 + x7) + 128) >> 8;
540 
541         word = ((x0 + x1) >> 14);
542         CLIP_RESULT(word)
543         temp = ((x4 + x7) >> 14);
544         CLIP_RESULT(temp)
545         word = word | (temp << 8);
546 
547 
548         temp = ((x6 + x5) >> 14);
549         CLIP_RESULT(temp)
550         word = word | (temp << 16);
551 
552         temp = ((x2 + x3) >> 14);
553         CLIP_RESULT(temp)
554         word = word | (temp << 24);
555         *((int32*)(comp += offset)) = word;
556 
557         word = ((x2 - x3) >> 14);
558         CLIP_RESULT(word)
559 
560         temp = ((x6 - x5) >> 14);
561         CLIP_RESULT(temp)
562         word = word | (temp << 8);
563 
564         temp = ((x4 - x7) >> 14);
565         CLIP_RESULT(temp)
566         word = word | (temp << 16);
567 
568         temp = ((x0 - x1) >> 14);
569         CLIP_RESULT(temp)
570         word = word | (temp << 24);
571         *((int32*)(comp + 4)) = word;
572 
573         blk += B_SIZE;
574     }
575     return ;
576 }
577 
idctrow4_intra(int16 * blk,PIXEL * comp,int width)578 void idctrow4_intra(int16 *blk, PIXEL *comp, int width)
579 {
580     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8, temp;
581     int i = 8;
582     int offset = width;
583     int32 word;
584 
585     comp -= offset;
586 
587     while (i--)
588     {
589         x2 = blk[2];
590         blk[2] = 0;
591         x1 = blk[1];
592         blk[1] = 0;
593         x3 = blk[3];
594         blk[3] = 0;
595         x0 = ((int32)blk[0] << 8) + 8192;
596         blk[0] = 0;/* for proper rounding in the fourth stage */
597 
598         x4 = x0;
599         x6 = (W6 * x2 + 4) >> 3;
600         x2 = (W2 * x2 + 4) >> 3;
601         x8 = x0 - x2;
602         x0 += x2;
603         x2 = x8;
604         x8 = x4 - x6;
605         x4 += x6;
606         x6 = x8;
607 
608         x7 = (W7 * x1 + 4) >> 3;
609         x1 = (W1 * x1 + 4) >> 3;
610         x5 = (W3 * x3 + 4) >> 3;
611         x3 = (- W5 * x3 + 4) >> 3;
612         x8 = x1 - x5;
613         x1 += x5;
614         x5 = x8;
615         x8 = x7 - x3;
616         x3 += x7;
617         x7 = (181 * (x5 + x8) + 128) >> 8;
618         x5 = (181 * (x5 - x8) + 128) >> 8;
619 
620         word = ((x0 + x1) >> 14);
621         CLIP_RESULT(word)
622 
623         temp = ((x4 + x7) >> 14);
624         CLIP_RESULT(temp)
625         word = word | (temp << 8);
626 
627 
628         temp = ((x6 + x5) >> 14);
629         CLIP_RESULT(temp)
630         word = word | (temp << 16);
631 
632         temp = ((x2 + x3) >> 14);
633         CLIP_RESULT(temp)
634         word = word | (temp << 24);
635         *((int32*)(comp += offset)) = word;
636 
637         word = ((x2 - x3) >> 14);
638         CLIP_RESULT(word)
639 
640         temp = ((x6 - x5) >> 14);
641         CLIP_RESULT(temp)
642         word = word | (temp << 8);
643 
644         temp = ((x4 - x7) >> 14);
645         CLIP_RESULT(temp)
646         word = word | (temp << 16);
647 
648         temp = ((x0 - x1) >> 14);
649         CLIP_RESULT(temp)
650         word = word | (temp << 24);
651         *((int32*)(comp + 4)) = word;
652 
653         blk += B_SIZE;
654     }
655 
656     return ;
657 }
658 
659 #endif
660 
661