1 /* ------------------------------------------------------------------
2  * Copyright (C) 1998-2009 PacketVideo
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
13  * express or implied.
14  * See the License for the specific language governing permissions
15  * and limitations under the License.
16  * -------------------------------------------------------------------
17  */
18 #include "mp4def.h"
19 #include "idct.h"
20 #include "motion_comp.h"
21 
22 #ifdef FAST_IDCT
23 
24 /****************************************************************
25 *       vca_idct.c : created 6/1/99 for several options
26 *                     of hard-coded reduced idct function (using nz_coefs)
27 ******************************************************************/
28 
29 /*****************************************************/
30 //pretested version
idctrow0(int16 *,uint8 *,uint8 *,int)31 void idctrow0(int16 *, uint8 *, uint8 *, int)
32 {
33     return ;
34 }
idctcol0(int16 *)35 void idctcol0(int16 *)
36 {
37     return ;
38 }
39 
40 __attribute__((no_sanitize("signed-integer-overflow")))
idctrow1(int16 * blk,uint8 * pred,uint8 * dst,int width)41 void idctrow1(int16 *blk, uint8 *pred, uint8 *dst, int width)
42 {
43     /* shortcut */
44     int tmp;
45     int i = 8;
46     uint32 pred_word, dst_word;
47     int res, res2;
48 
49     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
50     width -= 4;
51     dst -= width;
52     pred -= 12;
53     blk -= 8;
54 
55     while (i--)
56     {
57         tmp = (*(blk += 8) + 32) >> 6;
58         *blk = 0;
59 
60         pred_word = *((uint32*)(pred += 12)); /* read 4 bytes from pred */
61         res = tmp + (pred_word & 0xFF);
62         CLIP_RESULT(res);
63         res2 = tmp + ((pred_word >> 8) & 0xFF);
64         CLIP_RESULT(res2);
65         dst_word = (res2 << 8) | res;
66         res = tmp + ((pred_word >> 16) & 0xFF);
67         CLIP_RESULT(res);
68         dst_word |= (res << 16);
69         res = tmp + ((pred_word >> 24) & 0xFF);
70         CLIP_RESULT(res);
71         dst_word |= (res << 24);
72         *((uint32*)(dst += width)) = dst_word; /* save 4 bytes to dst */
73 
74         pred_word = *((uint32*)(pred += 4)); /* read 4 bytes from pred */
75         res = tmp + (pred_word & 0xFF);
76         CLIP_RESULT(res);
77         res2 = tmp + ((pred_word >> 8) & 0xFF);
78         CLIP_RESULT(res2);
79         dst_word = (res2 << 8) | res;
80         res = tmp + ((pred_word >> 16) & 0xFF);
81         CLIP_RESULT(res);
82         dst_word |= (res << 16);
83         res = tmp + ((pred_word >> 24) & 0xFF);
84         CLIP_RESULT(res);
85         dst_word |= (res << 24);
86         *((uint32*)(dst += 4)) = dst_word; /* save 4 bytes to dst */
87     }
88     return;
89 }
90 
idctcol1(int16 * blk)91 void idctcol1(int16 *blk)
92 { /* shortcut */
93     blk[0] = blk[8] = blk[16] = blk[24] = blk[32] = blk[40] = blk[48] = blk[56] =
94                                               blk[0] << 3;
95     return;
96 }
97 
98 __attribute__((no_sanitize("signed-integer-overflow")))
idctrow2(int16 * blk,uint8 * pred,uint8 * dst,int width)99 void idctrow2(int16 *blk, uint8 *pred, uint8 *dst, int width)
100 {
101     int32 x0, x1, x2, x4, x5;
102     int i = 8;
103     uint32 pred_word, dst_word;
104     int res, res2;
105 
106     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
107     width -= 4;
108     dst -= width;
109     pred -= 12;
110     blk -= 8;
111 
112     while (i--)
113     {
114         /* shortcut */
115         x4 = blk[9];
116         blk[9] = 0;
117         x0 = ((*(blk += 8)) << 8) + 8192;
118         *blk = 0;  /* for proper rounding in the fourth stage */
119 
120         /* first stage */
121         x5 = (W7 * x4 + 4) >> 3;
122         x4 = (W1 * x4 + 4) >> 3;
123 
124         /* third stage */
125         x2 = (181 * (x4 + x5) + 128) >> 8;
126         x1 = (181 * (x4 - x5) + 128) >> 8;
127 
128         /* fourth stage */
129         pred_word = *((uint32*)(pred += 12)); /* read 4 bytes from pred */
130         res = (x0 + x4) >> 14;
131         ADD_AND_CLIP1(res);
132         res2 = (x0 + x2) >> 14;
133         ADD_AND_CLIP2(res2);
134         dst_word = (res2 << 8) | res;
135         res = (x0 + x1) >> 14;
136         ADD_AND_CLIP3(res);
137         dst_word |= (res << 16);
138         res = (x0 + x5) >> 14;
139         ADD_AND_CLIP4(res);
140         dst_word |= (res << 24);
141         *((uint32*)(dst += width)) = dst_word; /* save 4 bytes to dst */
142 
143         pred_word = *((uint32*)(pred += 4)); /* read 4 bytes from pred */
144         res = (x0 - x5) >> 14;
145         ADD_AND_CLIP1(res);
146         res2 = (x0 - x1) >> 14;
147         ADD_AND_CLIP2(res2);
148         dst_word = (res2 << 8) | res;
149         res = (x0 - x2) >> 14;
150         ADD_AND_CLIP3(res);
151         dst_word |= (res << 16);
152         res = (x0 - x4) >> 14;
153         ADD_AND_CLIP4(res);
154         dst_word |= (res << 24);
155         *((uint32*)(dst += 4)) = dst_word; /* save 4 bytes to dst */
156     }
157     return ;
158 }
159 
160 __attribute__((no_sanitize("signed-integer-overflow")))
idctcol2(int16 * blk)161 void idctcol2(int16 *blk)
162 {
163     int32 x0, x1, x3, x5, x7;//, x8;
164 
165     x1 = blk[8];
166     x0 = ((int32)blk[0] << 11) + 128;
167     /* both upper and lower*/
168 
169     x7 = W7 * x1;
170     x1 = W1 * x1;
171 
172     x3 = x7;
173     x5 = (181 * (x1 - x7) + 128) >> 8;
174     x7 = (181 * (x1 + x7) + 128) >> 8;
175 
176     blk[0] = (x0 + x1) >> 8;
177     blk[8] = (x0 + x7) >> 8;
178     blk[16] = (x0 + x5) >> 8;
179     blk[24] = (x0 + x3) >> 8;
180     blk[56] = (x0 - x1) >> 8;
181     blk[48] = (x0 - x7) >> 8;
182     blk[40] = (x0 - x5) >> 8;
183     blk[32] = (x0 - x3) >> 8;
184 
185     return ;
186 }
187 
188 __attribute__((no_sanitize("signed-integer-overflow")))
idctrow3(int16 * blk,uint8 * pred,uint8 * dst,int width)189 void idctrow3(int16 *blk, uint8 *pred, uint8 *dst, int width)
190 {
191     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
192     int i = 8;
193     uint32 pred_word, dst_word;
194     int res, res2;
195 
196     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
197     width -= 4;
198     dst -= width;
199     pred -= 12;
200     blk -= 8;
201 
202     while (i--)
203     {
204         x2 = blk[10];
205         blk[10] = 0;
206         x1 = blk[9];
207         blk[9] = 0;
208         x0 = ((*(blk += 8)) << 8) + 8192;
209         *blk = 0;   /* for proper rounding in the fourth stage */
210         /* both upper and lower*/
211         /* both x2orx6 and x0orx4 */
212 
213         x4 = x0;
214         x6 = (W6 * x2 + 4) >> 3;
215         x2 = (W2 * x2 + 4) >> 3;
216         x8 = x0 - x2;
217         x0 += x2;
218         x2 = x8;
219         x8 = x4 - x6;
220         x4 += x6;
221         x6 = x8;
222 
223         x7 = (W7 * x1 + 4) >> 3;
224         x1 = (W1 * x1 + 4) >> 3;
225         x3 = x7;
226         x5 = (181 * (x1 - x7) + 128) >> 8;
227         x7 = (181 * (x1 + x7) + 128) >> 8;
228 
229         pred_word = *((uint32*)(pred += 12)); /* read 4 bytes from pred */
230         res = (x0 + x1) >> 14;
231         ADD_AND_CLIP1(res);
232         res2 = (x4 + x7) >> 14;
233         ADD_AND_CLIP2(res2);
234         dst_word = (res2 << 8) | res;
235         res = (x6 + x5) >> 14;
236         ADD_AND_CLIP3(res);
237         dst_word |= (res << 16);
238         res = (x2 + x3) >> 14;
239         ADD_AND_CLIP4(res);
240         dst_word |= (res << 24);
241         *((uint32*)(dst += width)) = dst_word; /* save 4 bytes to dst */
242 
243         pred_word = *((uint32*)(pred += 4)); /* read 4 bytes from pred */
244         res = (x2 - x3) >> 14;
245         ADD_AND_CLIP1(res);
246         res2 = (x6 - x5) >> 14;
247         ADD_AND_CLIP2(res2);
248         dst_word = (res2 << 8) | res;
249         res = (x4 - x7) >> 14;
250         ADD_AND_CLIP3(res);
251         dst_word |= (res << 16);
252         res = (x0 - x1) >> 14;
253         ADD_AND_CLIP4(res);
254         dst_word |= (res << 24);
255         *((uint32*)(dst += 4)) = dst_word; /* save 4 bytes to dst */
256     }
257 
258     return ;
259 }
260 
261 __attribute__((no_sanitize("signed-integer-overflow")))
idctcol3(int16 * blk)262 void idctcol3(int16 *blk)
263 {
264     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
265 
266     x2 = blk[16];
267     x1 = blk[8];
268     x0 = ((int32)blk[0] << 11) + 128;
269 
270     x4 = x0;
271     x6 = W6 * x2;
272     x2 = W2 * x2;
273     x8 = x0 - x2;
274     x0 += x2;
275     x2 = x8;
276     x8 = x4 - x6;
277     x4 += x6;
278     x6 = x8;
279 
280     x7 = W7 * x1;
281     x1 = W1 * x1;
282     x3 = x7;
283     x5 = (181 * (x1 - x7) + 128) >> 8;
284     x7 = (181 * (x1 + x7) + 128) >> 8;
285 
286     blk[0] = (x0 + x1) >> 8;
287     blk[8] = (x4 + x7) >> 8;
288     blk[16] = (x6 + x5) >> 8;
289     blk[24] = (x2 + x3) >> 8;
290     blk[56] = (x0 - x1) >> 8;
291     blk[48] = (x4 - x7) >> 8;
292     blk[40] = (x6 - x5) >> 8;
293     blk[32] = (x2 - x3) >> 8;
294 
295     return;
296 }
297 
298 
299 __attribute__((no_sanitize("signed-integer-overflow")))
idctrow4(int16 * blk,uint8 * pred,uint8 * dst,int width)300 void idctrow4(int16 *blk, uint8 *pred, uint8 *dst, int width)
301 {
302     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
303     int i = 8;
304     uint32 pred_word, dst_word;
305     int res, res2;
306 
307     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
308     width -= 4;
309     dst -= width;
310     pred -= 12;
311     blk -= 8;
312 
313     while (i--)
314     {
315         x2 = blk[10];
316         blk[10] = 0;
317         x1 = blk[9];
318         blk[9] = 0;
319         x3 = blk[11];
320         blk[11] = 0;
321         x0 = ((*(blk += 8)) << 8) + 8192;
322         *blk = 0;    /* for proper rounding in the fourth stage */
323 
324         x4 = x0;
325         x6 = (W6 * x2 + 4) >> 3;
326         x2 = (W2 * x2 + 4) >> 3;
327         x8 = x0 - x2;
328         x0 += x2;
329         x2 = x8;
330         x8 = x4 - x6;
331         x4 += x6;
332         x6 = x8;
333 
334         x7 = (W7 * x1 + 4) >> 3;
335         x1 = (W1 * x1 + 4) >> 3;
336         x5 = (W3 * x3 + 4) >> 3;
337         x3 = (- W5 * x3 + 4) >> 3;
338         x8 = x1 - x5;
339         x1 += x5;
340         x5 = x8;
341         x8 = x7 - x3;
342         x3 += x7;
343         x7 = (181 * (x5 + x8) + 128) >> 8;
344         x5 = (181 * (x5 - x8) + 128) >> 8;
345 
346         pred_word = *((uint32*)(pred += 12)); /* read 4 bytes from pred */
347         res = (x0 + x1) >> 14;
348         ADD_AND_CLIP1(res);
349         res2 = (x4 + x7) >> 14;
350         ADD_AND_CLIP2(res2);
351         dst_word = (res2 << 8) | res;
352         res = (x6 + x5) >> 14;
353         ADD_AND_CLIP3(res);
354         dst_word |= (res << 16);
355         res = (x2 + x3) >> 14;
356         ADD_AND_CLIP4(res);
357         dst_word |= (res << 24);
358         *((uint32*)(dst += width)) = dst_word; /* save 4 bytes to dst */
359 
360         pred_word = *((uint32*)(pred += 4)); /* read 4 bytes from pred */
361         res = (x2 - x3) >> 14;
362         ADD_AND_CLIP1(res);
363         res2 = (x6 - x5) >> 14;
364         ADD_AND_CLIP2(res2);
365         dst_word = (res2 << 8) | res;
366         res = (x4 - x7) >> 14;
367         ADD_AND_CLIP3(res);
368         dst_word |= (res << 16);
369         res = (x0 - x1) >> 14;
370         ADD_AND_CLIP4(res);
371         dst_word |= (res << 24);
372         *((uint32*)(dst += 4)) = dst_word; /* save 4 bytes to dst */
373     }
374     return ;
375 }
376 
377 __attribute__((no_sanitize("signed-integer-overflow")))
idctcol4(int16 * blk)378 void idctcol4(int16 *blk)
379 {
380     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
381     x2 = blk[16];
382     x1 = blk[8];
383     x3 = blk[24];
384     x0 = ((int32)blk[0] << 11) + 128;
385 
386     x4 = x0;
387     x6 = W6 * x2;
388     x2 = W2 * x2;
389     x8 = x0 - x2;
390     x0 += x2;
391     x2 = x8;
392     x8 = x4 - x6;
393     x4 += x6;
394     x6 = x8;
395 
396     x7 = W7 * x1;
397     x1 = W1 * x1;
398     x5 = W3 * x3;
399     x3 = -W5 * x3;
400     x8 = x1 - x5;
401     x1 += x5;
402     x5 = x8;
403     x8 = x7 - x3;
404     x3 += x7;
405     x7 = (181 * (x5 + x8) + 128) >> 8;
406     x5 = (181 * (x5 - x8) + 128) >> 8;
407 
408 
409     blk[0] = (x0 + x1) >> 8;
410     blk[8] = (x4 + x7) >> 8;
411     blk[16] = (x6 + x5) >> 8;
412     blk[24] = (x2 + x3) >> 8;
413     blk[56] = (x0 - x1) >> 8;
414     blk[48] = (x4 - x7) >> 8;
415     blk[40] = (x6 - x5) >> 8;
416     blk[32] = (x2 - x3) >> 8;
417 
418     return ;
419 }
420 
idctrow0_intra(int16 *,PIXEL *,int)421 void idctrow0_intra(int16 *, PIXEL *, int)
422 {
423     return ;
424 }
425 
idctrow1_intra(int16 * blk,PIXEL * comp,int width)426 void idctrow1_intra(int16 *blk, PIXEL *comp, int width)
427 {
428     /* shortcut */
429     int32 tmp;
430     int i = 8;
431     int offset = width;
432     uint32 word;
433 
434     comp -= offset;
435     while (i--)
436     {
437         tmp = ((blk[0] + 32) >> 6);
438         blk[0] = 0;
439         CLIP_RESULT(tmp)
440 
441         word = (tmp << 8) | tmp;
442         word = (word << 16) | word;
443 
444         *((uint32*)(comp += offset)) = word;
445         *((uint32*)(comp + 4)) = word;
446 
447 
448 
449 
450         blk += B_SIZE;
451     }
452     return;
453 }
454 
455 __attribute__((no_sanitize("signed-integer-overflow")))
idctrow2_intra(int16 * blk,PIXEL * comp,int width)456 void idctrow2_intra(int16 *blk, PIXEL *comp, int width)
457 {
458     int32 x0, x1, x2, x4, x5, temp;
459     int i = 8;
460     int offset = width;
461     int32 word;
462 
463     comp -= offset;
464     while (i--)
465     {
466         /* shortcut */
467         x4 = blk[1];
468         blk[1] = 0;
469         x0 = ((int32)blk[0] << 8) + 8192;
470         blk[0] = 0;   /* for proper rounding in the fourth stage */
471 
472         /* first stage */
473         x5 = (W7 * x4 + 4) >> 3;
474         x4 = (W1 * x4 + 4) >> 3;
475 
476         /* third stage */
477         x2 = (181 * (x4 + x5) + 128) >> 8;
478         x1 = (181 * (x4 - x5) + 128) >> 8;
479 
480         /* fourth stage */
481         word = ((x0 + x4) >> 14);
482         CLIP_RESULT(word)
483 
484         temp = ((x0 + x2) >> 14);
485         CLIP_RESULT(temp)
486         word = word | (temp << 8);
487         temp = ((x0 + x1) >> 14);
488         CLIP_RESULT(temp)
489         word = word | (temp << 16);
490         temp = ((x0 + x5) >> 14);
491         CLIP_RESULT(temp)
492         word = word | (temp << 24);
493         *((int32*)(comp += offset)) = word;
494 
495         word = ((x0 - x5) >> 14);
496         CLIP_RESULT(word)
497         temp = ((x0 - x1) >> 14);
498         CLIP_RESULT(temp)
499         word = word | (temp << 8);
500         temp = ((x0 - x2) >> 14);
501         CLIP_RESULT(temp)
502         word = word | (temp << 16);
503         temp = ((x0 - x4) >> 14);
504         CLIP_RESULT(temp)
505         word = word | (temp << 24);
506         *((int32*)(comp + 4)) = word;
507 
508         blk += B_SIZE;
509     }
510     return ;
511 }
512 
513 __attribute__((no_sanitize("signed-integer-overflow")))
idctrow3_intra(int16 * blk,PIXEL * comp,int width)514 void idctrow3_intra(int16 *blk, PIXEL *comp, int width)
515 {
516     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8, temp;
517     int i = 8;
518     int offset = width;
519     int32 word;
520 
521     comp -= offset;
522 
523     while (i--)
524     {
525         x2 = blk[2];
526         blk[2] = 0;
527         x1 = blk[1];
528         blk[1] = 0;
529         x0 = ((int32)blk[0] << 8) + 8192;
530         blk[0] = 0;/* for proper rounding in the fourth stage */
531         /* both upper and lower*/
532         /* both x2orx6 and x0orx4 */
533 
534         x4 = x0;
535         x6 = (W6 * x2 + 4) >> 3;
536         x2 = (W2 * x2 + 4) >> 3;
537         x8 = x0 - x2;
538         x0 += x2;
539         x2 = x8;
540         x8 = x4 - x6;
541         x4 += x6;
542         x6 = x8;
543 
544         x7 = (W7 * x1 + 4) >> 3;
545         x1 = (W1 * x1 + 4) >> 3;
546         x3 = x7;
547         x5 = (181 * (x1 - x7) + 128) >> 8;
548         x7 = (181 * (x1 + x7) + 128) >> 8;
549 
550         word = ((x0 + x1) >> 14);
551         CLIP_RESULT(word)
552         temp = ((x4 + x7) >> 14);
553         CLIP_RESULT(temp)
554         word = word | (temp << 8);
555 
556 
557         temp = ((x6 + x5) >> 14);
558         CLIP_RESULT(temp)
559         word = word | (temp << 16);
560 
561         temp = ((x2 + x3) >> 14);
562         CLIP_RESULT(temp)
563         word = word | (temp << 24);
564         *((int32*)(comp += offset)) = word;
565 
566         word = ((x2 - x3) >> 14);
567         CLIP_RESULT(word)
568 
569         temp = ((x6 - x5) >> 14);
570         CLIP_RESULT(temp)
571         word = word | (temp << 8);
572 
573         temp = ((x4 - x7) >> 14);
574         CLIP_RESULT(temp)
575         word = word | (temp << 16);
576 
577         temp = ((x0 - x1) >> 14);
578         CLIP_RESULT(temp)
579         word = word | (temp << 24);
580         *((int32*)(comp + 4)) = word;
581 
582         blk += B_SIZE;
583     }
584     return ;
585 }
586 
587 __attribute__((no_sanitize("signed-integer-overflow")))
idctrow4_intra(int16 * blk,PIXEL * comp,int width)588 void idctrow4_intra(int16 *blk, PIXEL *comp, int width)
589 {
590     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8, temp;
591     int i = 8;
592     int offset = width;
593     int32 word;
594 
595     comp -= offset;
596 
597     while (i--)
598     {
599         x2 = blk[2];
600         blk[2] = 0;
601         x1 = blk[1];
602         blk[1] = 0;
603         x3 = blk[3];
604         blk[3] = 0;
605         x0 = ((int32)blk[0] << 8) + 8192;
606         blk[0] = 0;/* for proper rounding in the fourth stage */
607 
608         x4 = x0;
609         x6 = (W6 * x2 + 4) >> 3;
610         x2 = (W2 * x2 + 4) >> 3;
611         x8 = x0 - x2;
612         x0 += x2;
613         x2 = x8;
614         x8 = x4 - x6;
615         x4 += x6;
616         x6 = x8;
617 
618         x7 = (W7 * x1 + 4) >> 3;
619         x1 = (W1 * x1 + 4) >> 3;
620         x5 = (W3 * x3 + 4) >> 3;
621         x3 = (- W5 * x3 + 4) >> 3;
622         x8 = x1 - x5;
623         x1 += x5;
624         x5 = x8;
625         x8 = x7 - x3;
626         x3 += x7;
627         x7 = (181 * (x5 + x8) + 128) >> 8;
628         x5 = (181 * (x5 - x8) + 128) >> 8;
629 
630         word = ((x0 + x1) >> 14);
631         CLIP_RESULT(word)
632 
633         temp = ((x4 + x7) >> 14);
634         CLIP_RESULT(temp)
635         word = word | (temp << 8);
636 
637 
638         temp = ((x6 + x5) >> 14);
639         CLIP_RESULT(temp)
640         word = word | (temp << 16);
641 
642         temp = ((x2 + x3) >> 14);
643         CLIP_RESULT(temp)
644         word = word | (temp << 24);
645         *((int32*)(comp += offset)) = word;
646 
647         word = ((x2 - x3) >> 14);
648         CLIP_RESULT(word)
649 
650         temp = ((x6 - x5) >> 14);
651         CLIP_RESULT(temp)
652         word = word | (temp << 8);
653 
654         temp = ((x4 - x7) >> 14);
655         CLIP_RESULT(temp)
656         word = word | (temp << 16);
657 
658         temp = ((x0 - x1) >> 14);
659         CLIP_RESULT(temp)
660         word = word | (temp << 24);
661         *((int32*)(comp + 4)) = word;
662 
663         blk += B_SIZE;
664     }
665 
666     return ;
667 }
668 
669 #endif
670 
671