1 /* ------------------------------------------------------------------
2 * Copyright (C) 1998-2009 PacketVideo
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
13 * express or implied.
14 * See the License for the specific language governing permissions
15 * and limitations under the License.
16 * -------------------------------------------------------------------
17 */
18 /*
19
20 ------------------------------------------------------------------------------
21 REVISION HISTORY
22 Who: Date: July/2001
23 Description: 1. Optimized BlockIDCT bitmap checking.
24 2. Rearranged functions.
25 3. Do column IDCT first, then row IDCT.
26 4. Combine motion comp and IDCT, require
27 two sets of row IDCTs one for INTRA
28 and one for INTER.
29 5. Add AAN IDCT
30
31 Who: Date: 8/16/01
32 1. Increase the input precision to 8 bits, i.e. change RDCTBITS
33 to 11, have to comment out all in-line assembly since 16 bit
34 multiplication doesn't work. Tried to use a different precision with
35 32-bit multiplication, but this was not finished. Turns out that without in-line
36 assembly the performance doesn't change much (only 1%).
37 Who: Date: 9/04/05
38 1. Replace AAN IDCT with Chen's IDCT to accommodate 16 bit data type.
39
40 */
41 #include "mp4def.h"
42 #include "mp4enc_lib.h"
43 #include "mp4lib_int.h"
44 #include "dct.h"
45
/*
 * Clipping helpers.
 *
 * ADD_CLIP and INTRA_CLIP operate on variables named `tmp`, `mask` and `rec`
 * that must be in scope where the macro is expanded. A value of `tmp` that is
 * negative or larger than `mask` is replaced by 0 or `mask` respectively and
 * then stored through `rec`, which is advanced by one. ADD_CLIP first adds the
 * byte currently at `*rec` to `tmp`.
 */
#define ADD_CLIP    { \
            tmp = *rec + tmp; \
        if((UInt)tmp > mask) tmp = mask&(~(tmp>>31)); \
        *rec++   = tmp;     \
        }

#define INTRA_CLIP  { \
        if((UInt)tmp > mask) tmp = mask&(~(tmp>>31)); \
        *rec++   = tmp;     \
        }


/*
 * CLIP_RESULT(x): clamp the integer lvalue x to 0..255 in place.
 * Viewed as unsigned, any negative x is also "> 0xFF", so one comparison
 * catches both directions. (x >> 31) is then all ones for a negative x
 * (giving 0) and zero for a too-large x (giving 0xFF). This relies on a
 * 32-bit int and an arithmetic right shift of negative values.
 * The macro is a complete `if { }` statement, so it may be used with or
 * without a trailing semicolon. The argument is parenthesized at every use.
 */
#define CLIP_RESULT(x)      if((UInt)(x) > 0xFF){(x) = 0xFF & (~((x)>>31));}

/*
 * ADD_AND_CLIPn(x): add byte n (1 = least significant, 4 = most significant)
 * of the variable `pred_word`, which must be in scope, to x and clamp the sum
 * to 0..255. The two steps are wrapped in one brace block so that the macro
 * behaves as a single statement, e.g. under an unbraced `if`.
 */
#define ADD_AND_CLIP1(x)    {(x) += (pred_word&0xFF); CLIP_RESULT(x);}
#define ADD_AND_CLIP2(x)    {(x) += ((pred_word>>8)&0xFF); CLIP_RESULT(x);}
#define ADD_AND_CLIP3(x)    {(x) += ((pred_word>>16)&0xFF); CLIP_RESULT(x);}
#define ADD_AND_CLIP4(x)    {(x) += ((pred_word>>24)&0xFF); CLIP_RESULT(x);}
63
64
/*
 * Column pass that performs no work: the block pointer is only marked as
 * unused and the data is left untouched.
 */
void idct_col0(Short *blk)
{
    OSCL_UNUSED_ARG(blk);
}
71
/*
 * Column pass that reads only blk[0]: every entry of the column
 * (blk[0], blk[8], ..., blk[56]) is set to blk[0] * 8.
 *
 * The scaling is a multiplication rather than `<< 3` because left-shifting
 * a negative value is undefined behaviour in C; the result is the same.
 */
void idct_col1(Short *blk)
{
    const Short scaled = (Short)(blk[0] * 8);
    int k;

    for (k = 0; k < 64; k += 8)
    {
        blk[k] = scaled;
    }
}
78
/*
 * Column pass that reads only blk[0] and blk[8] and overwrites all eight
 * entries of the column (blk[0], blk[8], ..., blk[56]).
 *
 * blk[0] is scaled by 2048 and biased by 128 so that the final `>> 8`
 * rounds. The scaling is a multiplication because `<< 11` on a negative
 * coefficient is undefined behaviour in C; the result is the same.
 */
void idct_col2(Short *blk)
{
    int32 x0, x1, x3, x5, x7;

    x1 = blk[8];
    x0 = (int32)blk[0] * 2048 + 128;

    /* odd part, single input */
    x7 = W7 * x1;
    x1 = W1 * x1;

    x3 = x7;
    x5 = (181 * (x1 - x7) + 128) >> 8;
    x7 = (181 * (x1 + x7) + 128) >> 8;

    /* output butterflies: entry k and entry 7-k differ only in sign */
    blk[0] = (x0 + x1) >> 8;
    blk[8] = (x0 + x7) >> 8;
    blk[16] = (x0 + x5) >> 8;
    blk[24] = (x0 + x3) >> 8;
    blk[56] = (x0 - x1) >> 8;
    blk[48] = (x0 - x7) >> 8;
    blk[40] = (x0 - x5) >> 8;
    blk[32] = (x0 - x3) >> 8;
}
104
/*
 * Column pass that reads blk[0], blk[8] and blk[16] and overwrites all eight
 * entries of the column (blk[0], blk[8], ..., blk[56]).
 *
 * blk[0] is scaled by 2048 and biased by 128 so that the final `>> 8`
 * rounds. The scaling is a multiplication because `<< 11` on a negative
 * coefficient is undefined behaviour in C; the result is the same.
 */
void idct_col3(Short *blk)
{
    int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;

    x2 = blk[16];
    x1 = blk[8];
    x0 = (int32)blk[0] * 2048 + 128;

    /* even part: combine blk[0] with the W2/W6 products of blk[16] */
    x4 = x0;
    x6 = W6 * x2;
    x2 = W2 * x2;
    x8 = x0 - x2;
    x0 += x2;
    x2 = x8;
    x8 = x4 - x6;
    x4 += x6;
    x6 = x8;

    /* odd part, single input blk[8] */
    x7 = W7 * x1;
    x1 = W1 * x1;
    x3 = x7;
    x5 = (181 * (x1 - x7) + 128) >> 8;
    x7 = (181 * (x1 + x7) + 128) >> 8;

    blk[0] = (x0 + x1) >> 8;
    blk[8] = (x4 + x7) >> 8;
    blk[16] = (x6 + x5) >> 8;
    blk[24] = (x2 + x3) >> 8;
    blk[56] = (x0 - x1) >> 8;
    blk[48] = (x4 - x7) >> 8;
    blk[40] = (x6 - x5) >> 8;
    blk[32] = (x2 - x3) >> 8;
}
139
/*
 * Column pass that reads blk[0], blk[8], blk[16] and blk[24] and overwrites
 * all eight entries of the column (blk[0], blk[8], ..., blk[56]).
 *
 * blk[0] is scaled by 2048 and biased by 128 so that the final `>> 8`
 * rounds. The scaling is a multiplication because `<< 11` on a negative
 * coefficient is undefined behaviour in C; the result is the same.
 */
void idct_col4(Short *blk)
{
    int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;

    x2 = blk[16];
    x1 = blk[8];
    x3 = blk[24];
    x0 = (int32)blk[0] * 2048 + 128;

    /* even part: combine blk[0] with the W2/W6 products of blk[16] */
    x4 = x0;
    x6 = W6 * x2;
    x2 = W2 * x2;
    x8 = x0 - x2;
    x0 += x2;
    x2 = x8;
    x8 = x4 - x6;
    x4 += x6;
    x6 = x8;

    /* odd part: inputs blk[8] and blk[24] */
    x7 = W7 * x1;
    x1 = W1 * x1;
    x5 = W3 * x3;
    x3 = -W5 * x3;
    x8 = x1 - x5;
    x1 += x5;
    x5 = x8;
    x8 = x7 - x3;
    x3 += x7;
    x7 = (181 * (x5 + x8) + 128) >> 8;
    x5 = (181 * (x5 - x8) + 128) >> 8;

    blk[0] = (x0 + x1) >> 8;
    blk[8] = (x4 + x7) >> 8;
    blk[16] = (x6 + x5) >> 8;
    blk[24] = (x2 + x3) >> 8;
    blk[56] = (x0 - x1) >> 8;
    blk[48] = (x4 - x7) >> 8;
    blk[40] = (x6 - x5) >> 8;
    blk[32] = (x2 - x3) >> 8;
}
181
182 #ifndef SMALL_DCT
/*
 * Column pass that reads only blk[8] and overwrites all eight entries of the
 * column (blk[0], blk[8], ..., blk[56]). blk[0] is not read; the constant
 * 128 is the rounding bias for the final `>> 8`.
 */
void idct_col0x40(Short *blk)
{
    const int32 in1 = blk[8];
    const int32 t1 = W1 * in1;
    const int32 t7 = W7 * in1;
    const int32 rot_sum = (181 * (t1 + t7) + 128) >> 8;
    const int32 rot_diff = (181 * (t1 - t7) + 128) >> 8;

    /* entry k and entry 7-k use the same term with opposite sign */
    blk[0] = (128 + t1) >> 8;
    blk[56] = (128 - t1) >> 8;

    blk[8] = (128 + rot_sum) >> 8;
    blk[48] = (128 - rot_sum) >> 8;

    blk[16] = (128 + rot_diff) >> 8;
    blk[40] = (128 - rot_diff) >> 8;

    blk[24] = (128 + t7) >> 8;
    blk[32] = (128 - t7) >> 8;
}
208
/*
 * Column pass that reads only blk[16] and overwrites all eight entries of
 * the column (blk[0], blk[8], ..., blk[56]). The output is mirror-symmetric:
 * entry k and entry 7-k receive the same value. The constant 128 is the
 * rounding bias for the final `>> 8`.
 */
void idct_col0x20(Short *blk)
{
    const int32 in2 = blk[16];
    const int32 t6 = W6 * in2;
    const int32 t2 = W2 * in2;

    const int32 outer = (128 + t2) >> 8;      /* entries 0 and 7 */
    const int32 near_outer = (128 + t6) >> 8; /* entries 1 and 6 */
    const int32 near_inner = (128 - t6) >> 8; /* entries 2 and 5 */
    const int32 inner = (128 - t2) >> 8;      /* entries 3 and 4 */

    blk[0] = outer;
    blk[56] = outer;
    blk[8] = near_outer;
    blk[48] = near_outer;
    blk[16] = near_inner;
    blk[40] = near_inner;
    blk[24] = inner;
    blk[32] = inner;
}
232
/*
 * Column pass that reads only blk[24] and overwrites all eight entries of
 * the column (blk[0], blk[8], ..., blk[56]). The constant 128 is the
 * rounding bias for the final `>> 8`.
 */
void idct_col0x10(Short *blk)
{
    const int32 in3 = blk[24];
    const int32 t3 = W3 * in3;
    const int32 t5 = W5 * in3;
    const int32 rot_a = (181 * (t5 - t3) + 128) >> 8;
    const int32 rot_b = (-181 * (t3 + t5) + 128) >> 8;

    /* entry k and entry 7-k use the same term with opposite sign */
    blk[0] = (128 + t3) >> 8;
    blk[56] = (128 - t3) >> 8;

    blk[8] = (128 + rot_a) >> 8;
    blk[48] = (128 - rot_a) >> 8;

    blk[16] = (128 + rot_b) >> 8;
    blk[40] = (128 - rot_b) >> 8;

    blk[24] = (128 - t5) >> 8;
    blk[32] = (128 + t5) >> 8;
}
256
257 #endif /* SMALL_DCT */
258
/*
 * General column pass: reads all eight entries of the column
 * (blk[0], blk[8], ..., blk[56]) and overwrites them with the transformed
 * values, each rounded by the 128 bias and `>> 8`.
 *
 * blk[0] and blk[32] are scaled by 2048 with a multiplication because
 * `<< 11` on a negative coefficient is undefined behaviour in C; the result
 * is the same.
 */
void idct_col(Short *blk)
{
    int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;

    x1 = (int32)blk[32] * 2048;
    x2 = blk[48];
    x3 = blk[16];
    x4 = blk[8];
    x5 = blk[56];
    x6 = blk[40];
    x7 = blk[24];
    x0 = (int32)blk[0] * 2048 + 128;

    /* first stage */
    x8 = W7 * (x4 + x5);
    x4 = x8 + (W1 - W7) * x4;
    x5 = x8 - (W1 + W7) * x5;
    x8 = W3 * (x6 + x7);
    x6 = x8 - (W3 - W5) * x6;
    x7 = x8 - (W3 + W5) * x7;

    /* second stage */
    x8 = x0 + x1;
    x0 -= x1;
    x1 = W6 * (x3 + x2);
    x2 = x1 - (W2 + W6) * x2;
    x3 = x1 + (W2 - W6) * x3;
    x1 = x4 + x6;
    x4 -= x6;
    x6 = x5 + x7;
    x5 -= x7;

    /* third stage */
    x7 = x8 + x3;
    x8 -= x3;
    x3 = x0 + x2;
    x0 -= x2;
    x2 = (181 * (x4 + x5) + 128) >> 8;
    x4 = (181 * (x4 - x5) + 128) >> 8;

    /* fourth stage */
    blk[0] = (x7 + x1) >> 8;
    blk[8] = (x3 + x2) >> 8;
    blk[16] = (x0 + x4) >> 8;
    blk[24] = (x8 + x6) >> 8;
    blk[32] = (x8 - x6) >> 8;
    blk[40] = (x0 - x4) >> 8;
    blk[48] = (x3 - x2) >> 8;
    blk[56] = (x7 - x1) >> 8;
}
311
312 /* This function should not be called at all ****/
/*
 * Inter row pass that performs no work: all three arguments are only marked
 * as unused and neither buffer is touched.
 */
void idct_row0Inter(Short *srce, UChar *rec, Int lx)
{
    OSCL_UNUSED_ARG(srce);
    OSCL_UNUSED_ARG(rec);
    OSCL_UNUSED_ARG(lx);
}
323
/*
 * Inter row pass that reads only the first coefficient of each row.
 *
 * For each of the 8 rows r: takes (blk[8r] + 32) >> 6, clears blk[8r], adds
 * that value to each of the eight bytes at rec + r*lx, clips every sum to
 * 0..255 and writes the bytes back.
 *
 * Rows are addressed by index instead of first stepping blk/rec to before
 * the start of their buffers, and the output bytes are packed as uint32
 * because `res << 24` overflows a signed int for res >= 128. Results are
 * unchanged.
 *
 * NOTE(review): bytes are read/written as uint32 words with the low-order
 * byte paired with the first output, i.e. this assumes 4-byte-aligned rows
 * and little-endian byte order -- confirm for the target.
 */
void idct_row1Inter(Short *blk, UChar *rec, Int lx)
{
    int tmp;
    int row;
    uint32 pred_word, dst_word;
    int res, res2;

    for (row = 0; row < 8; row++)
    {
        Short *coef = blk + 8 * row;
        UChar *line = rec + row * lx;

        tmp = (coef[0] + 32) >> 6;
        coef[0] = 0;

        pred_word = *((uint32*)line); /* read 4 bytes from pred */
        res = tmp + (pred_word & 0xFF);
        CLIP_RESULT(res);
        res2 = tmp + ((pred_word >> 8) & 0xFF);
        CLIP_RESULT(res2);
        dst_word = (uint32)res | ((uint32)res2 << 8);
        res = tmp + ((pred_word >> 16) & 0xFF);
        CLIP_RESULT(res);
        dst_word |= ((uint32)res << 16);
        res = tmp + ((pred_word >> 24) & 0xFF);
        CLIP_RESULT(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)line) = dst_word; /* save 4 bytes to dst */

        pred_word = *((uint32*)(line + 4)); /* read 4 bytes from pred */
        res = tmp + (pred_word & 0xFF);
        CLIP_RESULT(res);
        res2 = tmp + ((pred_word >> 8) & 0xFF);
        CLIP_RESULT(res2);
        dst_word = (uint32)res | ((uint32)res2 << 8);
        res = tmp + ((pred_word >> 16) & 0xFF);
        CLIP_RESULT(res);
        dst_word |= ((uint32)res << 16);
        res = tmp + ((pred_word >> 24) & 0xFF);
        CLIP_RESULT(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)(line + 4)) = dst_word; /* save 4 bytes to dst */
    }
}
370
/*
 * Inter row pass that reads the first two coefficients of each row.
 *
 * For each of the 8 rows r: reads and clears blk[8r] and blk[8r+1], computes
 * eight values (each `>> 14`), adds them to the eight bytes at rec + r*lx,
 * clips every sum to 0..255 and writes the bytes back.
 *
 * Rows are addressed by index instead of first stepping blk/rec to before
 * the start of their buffers; `* 256` replaces `<< 8` on a possibly negative
 * coefficient; and the output bytes are packed as uint32 because `res << 24`
 * overflows a signed int for res >= 128. Results are unchanged.
 *
 * NOTE(review): bytes are read/written as uint32 words with the low-order
 * byte paired with the first output, i.e. this assumes 4-byte-aligned rows
 * and little-endian byte order -- confirm for the target.
 */
void idct_row2Inter(Short *blk, UChar *rec, Int lx)
{
    int32 x0, x1, x2, x4, x5;
    int row;
    uint32 pred_word, dst_word;
    int res, res2;

    for (row = 0; row < 8; row++)
    {
        Short *coef = blk + 8 * row;
        UChar *line = rec + row * lx;

        /* shortcut */
        x4 = coef[1];
        coef[1] = 0;
        x0 = coef[0] * 256 + 8192; /* 8192: rounding for the final >> 14 */
        coef[0] = 0;

        /* first stage */
        x5 = (W7 * x4 + 4) >> 3;
        x4 = (W1 * x4 + 4) >> 3;

        /* third stage */
        x2 = (181 * (x4 + x5) + 128) >> 8;
        x1 = (181 * (x4 - x5) + 128) >> 8;

        /* fourth stage */
        pred_word = *((uint32*)line); /* read 4 bytes from pred */
        res = (x0 + x4) >> 14;
        ADD_AND_CLIP1(res);
        res2 = (x0 + x2) >> 14;
        ADD_AND_CLIP2(res2);
        dst_word = (uint32)res | ((uint32)res2 << 8);
        res = (x0 + x1) >> 14;
        ADD_AND_CLIP3(res);
        dst_word |= ((uint32)res << 16);
        res = (x0 + x5) >> 14;
        ADD_AND_CLIP4(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)line) = dst_word; /* save 4 bytes to dst */

        pred_word = *((uint32*)(line + 4)); /* read 4 bytes from pred */
        res = (x0 - x5) >> 14;
        ADD_AND_CLIP1(res);
        res2 = (x0 - x1) >> 14;
        ADD_AND_CLIP2(res2);
        dst_word = (uint32)res | ((uint32)res2 << 8);
        res = (x0 - x2) >> 14;
        ADD_AND_CLIP3(res);
        dst_word |= ((uint32)res << 16);
        res = (x0 - x4) >> 14;
        ADD_AND_CLIP4(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)(line + 4)) = dst_word; /* save 4 bytes to dst */
    }
}
429
/*
 * Inter row pass that reads the first three coefficients of each row.
 *
 * For each of the 8 rows r: reads and clears blk[8r], blk[8r+1] and
 * blk[8r+2], computes eight values (each `>> 14`), adds them to the eight
 * bytes at rec + r*lx, clips every sum to 0..255 and writes the bytes back.
 *
 * Rows are addressed by index instead of first stepping blk/rec to before
 * the start of their buffers; `* 256` replaces `<< 8` on a possibly negative
 * coefficient; and the output bytes are packed as uint32 because `res << 24`
 * overflows a signed int for res >= 128. Results are unchanged.
 *
 * NOTE(review): bytes are read/written as uint32 words with the low-order
 * byte paired with the first output, i.e. this assumes 4-byte-aligned rows
 * and little-endian byte order -- confirm for the target.
 */
void idct_row3Inter(Short *blk, UChar *rec, Int lx)
{
    int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
    int row;
    uint32 pred_word, dst_word;
    int res, res2;

    for (row = 0; row < 8; row++)
    {
        Short *coef = blk + 8 * row;
        UChar *line = rec + row * lx;

        x2 = coef[2];
        coef[2] = 0;
        x1 = coef[1];
        coef[1] = 0;
        x0 = coef[0] * 256 + 8192; /* 8192: rounding for the final >> 14 */
        coef[0] = 0;

        /* even part */
        x4 = x0;
        x6 = (W6 * x2 + 4) >> 3;
        x2 = (W2 * x2 + 4) >> 3;
        x8 = x0 - x2;
        x0 += x2;
        x2 = x8;
        x8 = x4 - x6;
        x4 += x6;
        x6 = x8;

        /* odd part */
        x7 = (W7 * x1 + 4) >> 3;
        x1 = (W1 * x1 + 4) >> 3;
        x3 = x7;
        x5 = (181 * (x1 - x7) + 128) >> 8;
        x7 = (181 * (x1 + x7) + 128) >> 8;

        pred_word = *((uint32*)line); /* read 4 bytes from pred */
        res = (x0 + x1) >> 14;
        ADD_AND_CLIP1(res);
        res2 = (x4 + x7) >> 14;
        ADD_AND_CLIP2(res2);
        dst_word = (uint32)res | ((uint32)res2 << 8);
        res = (x6 + x5) >> 14;
        ADD_AND_CLIP3(res);
        dst_word |= ((uint32)res << 16);
        res = (x2 + x3) >> 14;
        ADD_AND_CLIP4(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)line) = dst_word; /* save 4 bytes to dst */

        pred_word = *((uint32*)(line + 4)); /* read 4 bytes from pred */
        res = (x2 - x3) >> 14;
        ADD_AND_CLIP1(res);
        res2 = (x6 - x5) >> 14;
        ADD_AND_CLIP2(res2);
        dst_word = (uint32)res | ((uint32)res2 << 8);
        res = (x4 - x7) >> 14;
        ADD_AND_CLIP3(res);
        dst_word |= ((uint32)res << 16);
        res = (x0 - x1) >> 14;
        ADD_AND_CLIP4(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)(line + 4)) = dst_word; /* save 4 bytes to dst */
    }
}
499
/*
 * Inter row pass that reads the first four coefficients of each row.
 *
 * For each of the 8 rows r: reads and clears blk[8r] .. blk[8r+3], computes
 * eight values (each `>> 14`), adds them to the eight bytes at rec + r*lx,
 * clips every sum to 0..255 and writes the bytes back.
 *
 * Rows are addressed by index instead of first stepping blk/rec to before
 * the start of their buffers; `* 256` replaces `<< 8` on a possibly negative
 * coefficient; and the output bytes are packed as uint32 because `res << 24`
 * overflows a signed int for res >= 128. Results are unchanged.
 *
 * NOTE(review): bytes are read/written as uint32 words with the low-order
 * byte paired with the first output, i.e. this assumes 4-byte-aligned rows
 * and little-endian byte order -- confirm for the target.
 */
void idct_row4Inter(Short *blk, UChar *rec, Int lx)
{
    int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
    int row;
    uint32 pred_word, dst_word;
    int res, res2;

    for (row = 0; row < 8; row++)
    {
        Short *coef = blk + 8 * row;
        UChar *line = rec + row * lx;

        x2 = coef[2];
        coef[2] = 0;
        x1 = coef[1];
        coef[1] = 0;
        x3 = coef[3];
        coef[3] = 0;
        x0 = coef[0] * 256 + 8192; /* 8192: rounding for the final >> 14 */
        coef[0] = 0;

        /* even part */
        x4 = x0;
        x6 = (W6 * x2 + 4) >> 3;
        x2 = (W2 * x2 + 4) >> 3;
        x8 = x0 - x2;
        x0 += x2;
        x2 = x8;
        x8 = x4 - x6;
        x4 += x6;
        x6 = x8;

        /* odd part */
        x7 = (W7 * x1 + 4) >> 3;
        x1 = (W1 * x1 + 4) >> 3;
        x5 = (W3 * x3 + 4) >> 3;
        x3 = (-W5 * x3 + 4) >> 3;
        x8 = x1 - x5;
        x1 += x5;
        x5 = x8;
        x8 = x7 - x3;
        x3 += x7;
        x7 = (181 * (x5 + x8) + 128) >> 8;
        x5 = (181 * (x5 - x8) + 128) >> 8;

        pred_word = *((uint32*)line); /* read 4 bytes from pred */
        res = (x0 + x1) >> 14;
        ADD_AND_CLIP1(res);
        res2 = (x4 + x7) >> 14;
        ADD_AND_CLIP2(res2);
        dst_word = (uint32)res | ((uint32)res2 << 8);
        res = (x6 + x5) >> 14;
        ADD_AND_CLIP3(res);
        dst_word |= ((uint32)res << 16);
        res = (x2 + x3) >> 14;
        ADD_AND_CLIP4(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)line) = dst_word; /* save 4 bytes to dst */

        pred_word = *((uint32*)(line + 4)); /* read 4 bytes from pred */
        res = (x2 - x3) >> 14;
        ADD_AND_CLIP1(res);
        res2 = (x6 - x5) >> 14;
        ADD_AND_CLIP2(res2);
        dst_word = (uint32)res | ((uint32)res2 << 8);
        res = (x4 - x7) >> 14;
        ADD_AND_CLIP3(res);
        dst_word |= ((uint32)res << 16);
        res = (x0 - x1) >> 14;
        ADD_AND_CLIP4(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)(line + 4)) = dst_word; /* save 4 bytes to dst */
    }
}
574
575 #ifndef SMALL_DCT
/*
 * Inter row pass that reads only the second coefficient of each row.
 *
 * For each of the 8 rows r: reads and clears blk[8r+1] (blk[8r] is neither
 * read nor written), computes eight values (each `>> 14`, with 8192 as the
 * rounding term), adds them to the eight bytes at rec + r*lx, clips every
 * sum to 0..255 and writes the bytes back.
 *
 * Rows are addressed by index instead of first stepping rec to before the
 * start of its buffer, and the output bytes are packed as uint32 because
 * `res << 24` overflows a signed int for res >= 128. Results are unchanged.
 *
 * NOTE(review): bytes are read/written as uint32 words with the low-order
 * byte paired with the first output, i.e. this assumes 4-byte-aligned rows
 * and little-endian byte order -- confirm for the target.
 */
void idct_row0x40Inter(Short *blk, UChar *rec, Int lx)
{
    int32 x1, x2, x4, x5;
    int row;
    uint32 pred_word, dst_word;
    int res, res2;

    for (row = 0; row < 8; row++)
    {
        Short *coef = blk + 8 * row;
        UChar *line = rec + row * lx;

        /* shortcut */
        x4 = coef[1];
        coef[1] = 0;

        /* first stage */
        x5 = (W7 * x4 + 4) >> 3;
        x4 = (W1 * x4 + 4) >> 3;

        /* third stage */
        x2 = (181 * (x4 + x5) + 128) >> 8;
        x1 = (181 * (x4 - x5) + 128) >> 8;

        /* fourth stage */
        pred_word = *((uint32*)line); /* read 4 bytes from pred */
        res = (8192 + x4) >> 14;
        ADD_AND_CLIP1(res);
        res2 = (8192 + x2) >> 14;
        ADD_AND_CLIP2(res2);
        dst_word = (uint32)res | ((uint32)res2 << 8);
        res = (8192 + x1) >> 14;
        ADD_AND_CLIP3(res);
        dst_word |= ((uint32)res << 16);
        res = (8192 + x5) >> 14;
        ADD_AND_CLIP4(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)line) = dst_word; /* save 4 bytes to dst */

        pred_word = *((uint32*)(line + 4)); /* read 4 bytes from pred */
        res = (8192 - x5) >> 14;
        ADD_AND_CLIP1(res);
        res2 = (8192 - x1) >> 14;
        ADD_AND_CLIP2(res2);
        dst_word = (uint32)res | ((uint32)res2 << 8);
        res = (8192 - x2) >> 14;
        ADD_AND_CLIP3(res);
        dst_word |= ((uint32)res << 16);
        res = (8192 - x4) >> 14;
        ADD_AND_CLIP4(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)(line + 4)) = dst_word; /* save 4 bytes to dst */
    }
}
632
/*
 * Inter row pass that reads only the third coefficient of each row.
 *
 * For each of the 8 rows r: reads and clears blk[8r+2], computes four
 * distinct values (each `>> 14`, with 8192 as the rounding term) that are
 * applied mirror-symmetrically to the eight bytes at rec + r*lx; each sum is
 * clipped to 0..255 and written back.
 *
 * Rows are addressed by index instead of first stepping rec to before the
 * start of its buffer, and the output bytes are packed as uint32 because
 * `res << 24` overflows a signed int for res >= 128. Results are unchanged.
 *
 * NOTE(review): bytes are read/written as uint32 words with the low-order
 * byte paired with the first output, i.e. this assumes 4-byte-aligned rows
 * and little-endian byte order -- confirm for the target.
 */
void idct_row0x20Inter(Short *blk, UChar *rec, Int lx)
{
    int32 x0, x2, x4, x6;
    int row;
    uint32 pred_word, dst_word;
    int res, res2;

    for (row = 0; row < 8; row++)
    {
        Short *coef = blk + 8 * row;
        UChar *line = rec + row * lx;

        x2 = coef[2];
        coef[2] = 0;

        x6 = (W6 * x2 + 4) >> 3;
        x2 = (W2 * x2 + 4) >> 3;
        x0 = 8192 + x2;
        x2 = 8192 - x2;
        x4 = 8192 + x6;
        x6 = 8192 - x6;

        pred_word = *((uint32*)line); /* read 4 bytes from pred */
        res = (x0) >> 14;
        ADD_AND_CLIP1(res);
        res2 = (x4) >> 14;
        ADD_AND_CLIP2(res2);
        dst_word = (uint32)res | ((uint32)res2 << 8);
        res = (x6) >> 14;
        ADD_AND_CLIP3(res);
        dst_word |= ((uint32)res << 16);
        res = (x2) >> 14;
        ADD_AND_CLIP4(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)line) = dst_word; /* save 4 bytes to dst */

        pred_word = *((uint32*)(line + 4)); /* read 4 bytes from pred */
        res = (x2) >> 14;
        ADD_AND_CLIP1(res);
        res2 = (x6) >> 14;
        ADD_AND_CLIP2(res2);
        dst_word = (uint32)res | ((uint32)res2 << 8);
        res = (x4) >> 14;
        ADD_AND_CLIP3(res);
        dst_word |= ((uint32)res << 16);
        res = (x0) >> 14;
        ADD_AND_CLIP4(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)(line + 4)) = dst_word; /* save 4 bytes to dst */
    }
}
688
/*
 * Inter row pass that reads only the fourth coefficient of each row.
 *
 * For each of the 8 rows r: reads and clears blk[8r+3], computes eight
 * values (each `>> 14`, with 8192 as the rounding term), adds them to the
 * eight bytes at rec + r*lx, clips every sum to 0..255 and writes the bytes
 * back.
 *
 * Rows are addressed by index instead of first stepping rec to before the
 * start of its buffer, and the output bytes are packed as uint32 because
 * `res << 24` overflows a signed int for res >= 128. Results are unchanged.
 *
 * NOTE(review): bytes are read/written as uint32 words with the low-order
 * byte paired with the first output, i.e. this assumes 4-byte-aligned rows
 * and little-endian byte order -- confirm for the target.
 */
void idct_row0x10Inter(Short *blk, UChar *rec, Int lx)
{
    int32 x1, x3, x5, x7;
    int row;
    uint32 pred_word, dst_word;
    int res, res2;

    for (row = 0; row < 8; row++)
    {
        Short *coef = blk + 8 * row;
        UChar *line = rec + row * lx;

        x3 = coef[3];
        coef[3] = 0;

        x1 = (W3 * x3 + 4) >> 3;
        x3 = (-W5 * x3 + 4) >> 3;

        x7 = (-181 * (x3 + x1) + 128) >> 8;
        x5 = (181 * (x3 - x1) + 128) >> 8;

        pred_word = *((uint32*)line); /* read 4 bytes from pred */
        res = (8192 + x1) >> 14;
        ADD_AND_CLIP1(res);
        res2 = (8192 + x7) >> 14;
        ADD_AND_CLIP2(res2);
        dst_word = (uint32)res | ((uint32)res2 << 8);
        res = (8192 + x5) >> 14;
        ADD_AND_CLIP3(res);
        dst_word |= ((uint32)res << 16);
        res = (8192 + x3) >> 14;
        ADD_AND_CLIP4(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)line) = dst_word; /* save 4 bytes to dst */

        pred_word = *((uint32*)(line + 4)); /* read 4 bytes from pred */
        res = (8192 - x3) >> 14;
        ADD_AND_CLIP1(res);
        res2 = (8192 - x5) >> 14;
        ADD_AND_CLIP2(res2);
        dst_word = (uint32)res | ((uint32)res2 << 8);
        res = (8192 - x7) >> 14;
        ADD_AND_CLIP3(res);
        dst_word |= ((uint32)res << 16);
        res = (8192 - x1) >> 14;
        ADD_AND_CLIP4(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)(line + 4)) = dst_word; /* save 4 bytes to dst */
    }
}
741
742 #endif /* SMALL_DCT */
743
/*
 * General inter row pass: uses all eight coefficients of each row.
 *
 * For each of the 8 rows r: reads and clears blk[8r] .. blk[8r+7], computes
 * eight values (each `>> 14`), adds them to the eight bytes at rec + r*lx,
 * clips every sum to 0..255 and writes the bytes back.
 *
 * Rows are addressed by index instead of first stepping blk/rec to before
 * the start of their buffers; `* 256` replaces `<< 8` on possibly negative
 * coefficients; and the output bytes are packed as uint32 because
 * `res << 24` overflows a signed int for res >= 128. Results are unchanged.
 *
 * NOTE(review): bytes are read/written as uint32 words with the low-order
 * byte paired with the first output, i.e. this assumes 4-byte-aligned rows
 * and little-endian byte order -- confirm for the target.
 */
void idct_rowInter(Short *blk, UChar *rec, Int lx)
{
    int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
    int row;
    uint32 pred_word, dst_word;
    int res, res2;

    for (row = 0; row < 8; row++)
    {
        Short *coef = blk + 8 * row;
        UChar *line = rec + row * lx;

        x1 = (int32)coef[4] * 256;
        coef[4] = 0;
        x2 = coef[6];
        coef[6] = 0;
        x3 = coef[2];
        coef[2] = 0;
        x4 = coef[1];
        coef[1] = 0;
        x5 = coef[7];
        coef[7] = 0;
        x6 = coef[5];
        coef[5] = 0;
        x7 = coef[3];
        coef[3] = 0;
        x0 = coef[0] * 256 + 8192; /* 8192: rounding for the final >> 14 */
        coef[0] = 0;

        /* first stage */
        x8 = W7 * (x4 + x5) + 4;
        x4 = (x8 + (W1 - W7) * x4) >> 3;
        x5 = (x8 - (W1 + W7) * x5) >> 3;
        x8 = W3 * (x6 + x7) + 4;
        x6 = (x8 - (W3 - W5) * x6) >> 3;
        x7 = (x8 - (W3 + W5) * x7) >> 3;

        /* second stage */
        x8 = x0 + x1;
        x0 -= x1;
        x1 = W6 * (x3 + x2) + 4;
        x2 = (x1 - (W2 + W6) * x2) >> 3;
        x3 = (x1 + (W2 - W6) * x3) >> 3;
        x1 = x4 + x6;
        x4 -= x6;
        x6 = x5 + x7;
        x5 -= x7;

        /* third stage */
        x7 = x8 + x3;
        x8 -= x3;
        x3 = x0 + x2;
        x0 -= x2;
        x2 = (181 * (x4 + x5) + 128) >> 8;
        x4 = (181 * (x4 - x5) + 128) >> 8;

        /* fourth stage */
        pred_word = *((uint32*)line); /* read 4 bytes from pred */

        res = (x7 + x1) >> 14;
        ADD_AND_CLIP1(res);
        res2 = (x3 + x2) >> 14;
        ADD_AND_CLIP2(res2);
        dst_word = (uint32)res | ((uint32)res2 << 8);
        res = (x0 + x4) >> 14;
        ADD_AND_CLIP3(res);
        dst_word |= ((uint32)res << 16);
        res = (x8 + x6) >> 14;
        ADD_AND_CLIP4(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)line) = dst_word; /* save 4 bytes to dst */

        pred_word = *((uint32*)(line + 4)); /* read 4 bytes from pred */

        res = (x8 - x6) >> 14;
        ADD_AND_CLIP1(res);
        res2 = (x0 - x4) >> 14;
        ADD_AND_CLIP2(res2);
        dst_word = (uint32)res | ((uint32)res2 << 8);
        res = (x3 - x2) >> 14;
        ADD_AND_CLIP3(res);
        dst_word |= ((uint32)res << 16);
        res = (x7 - x1) >> 14;
        ADD_AND_CLIP4(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)(line + 4)) = dst_word; /* save 4 bytes to dst */
    }
}
834
/*
 * Intra row pass that performs no work: all three arguments are only marked
 * as unused and neither buffer is touched.
 */
void idct_row0Intra(Short *srce, UChar *rec, Int lx)
{
    OSCL_UNUSED_ARG(srce);
    OSCL_UNUSED_ARG(rec);
    OSCL_UNUSED_ARG(lx);
}
845
/*
 * Intra row pass that reads only the first coefficient of each row.
 *
 * For each of the 8 rows r: takes (blk[8r] + 32) >> 6, clears blk[8r], clips
 * the value to 0..255 and stores it into all eight bytes at rec + r*lx.
 *
 * The byte is replicated with an unsigned multiply by 0x01010101; the
 * previous `tmp |= tmp << 16` on a signed int32 overflowed for values
 * >= 0x80. Rows are addressed by index instead of first stepping blk/rec to
 * before the start of their buffers. Results are unchanged.
 *
 * NOTE(review): the bytes are written as two uint32 words, which assumes
 * 4-byte-aligned rows -- confirm for the target.
 */
void idct_row1Intra(Short *blk, UChar *rec, Int lx)
{
    int32 tmp;
    uint32 dst_word;
    int row;

    for (row = 0; row < 8; row++)
    {
        Short *coef = blk + 8 * row;
        UChar *line = rec + row * lx;

        tmp = (coef[0] + 32) >> 6;
        coef[0] = 0;
        CLIP_RESULT(tmp);

        /* same byte in all four positions of the word */
        dst_word = (uint32)tmp * 0x01010101u;
        *((uint32*)line) = dst_word;
        *((uint32*)(line + 4)) = dst_word;
    }
}
866
/*
 * Intra row pass that reads the first two coefficients of each row.
 *
 * For each of the 8 rows r: reads and clears blk[8r] and blk[8r+1], computes
 * eight values (each `>> 14`), clips them to 0..255 and stores them as the
 * eight bytes at rec + r*lx.
 *
 * Rows are addressed by index instead of first stepping blk/rec to before
 * the start of their buffers; `* 256` replaces `<< 8` on a possibly negative
 * coefficient; and the output bytes are packed as uint32 because `res << 24`
 * overflows a signed int for res >= 128. Results are unchanged.
 *
 * NOTE(review): bytes are written as uint32 words with the first output in
 * the low-order byte, i.e. this assumes 4-byte-aligned rows and little-endian
 * byte order -- confirm for the target.
 */
void idct_row2Intra(Short *blk, UChar *rec, Int lx)
{
    int32 x0, x1, x2, x4, x5;
    int res, res2;
    uint32 dst_word;
    int row;

    for (row = 0; row < 8; row++)
    {
        Short *coef = blk + 8 * row;
        UChar *line = rec + row * lx;

        /* shortcut */
        x4 = coef[1];
        coef[1] = 0;
        x0 = coef[0] * 256 + 8192; /* 8192: rounding for the final >> 14 */
        coef[0] = 0;

        /* first stage */
        x5 = (W7 * x4 + 4) >> 3;
        x4 = (W1 * x4 + 4) >> 3;

        /* third stage */
        x2 = (181 * (x4 + x5) + 128) >> 8;
        x1 = (181 * (x4 - x5) + 128) >> 8;

        /* fourth stage */
        res = ((x0 + x4) >> 14);
        CLIP_RESULT(res);
        res2 = ((x0 + x2) >> 14);
        CLIP_RESULT(res2);
        dst_word = (uint32)res | ((uint32)res2 << 8);
        res = ((x0 + x1) >> 14);
        CLIP_RESULT(res);
        dst_word |= ((uint32)res << 16);
        res = ((x0 + x5) >> 14);
        CLIP_RESULT(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)line) = dst_word;

        res = ((x0 - x5) >> 14);
        CLIP_RESULT(res);
        res2 = ((x0 - x1) >> 14);
        CLIP_RESULT(res2);
        dst_word = (uint32)res | ((uint32)res2 << 8);
        res = ((x0 - x2) >> 14);
        CLIP_RESULT(res);
        dst_word |= ((uint32)res << 16);
        res = ((x0 - x4) >> 14);
        CLIP_RESULT(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)(line + 4)) = dst_word;
    }
}
921
/*
 * Intra row pass that reads the first three coefficients of each row.
 *
 * For each of the 8 rows r: reads and clears blk[8r], blk[8r+1] and
 * blk[8r+2], computes eight values (each `>> 14`), clips them to 0..255 and
 * stores them as the eight bytes at rec + r*lx.
 *
 * Rows are addressed by index instead of first stepping blk/rec to before
 * the start of their buffers; `* 256` replaces `<< 8` on a possibly negative
 * coefficient; and the output bytes are packed as uint32 because `res << 24`
 * overflows a signed int for res >= 128. Results are unchanged.
 *
 * NOTE(review): bytes are written as uint32 words with the first output in
 * the low-order byte, i.e. this assumes 4-byte-aligned rows and little-endian
 * byte order -- confirm for the target.
 */
void idct_row3Intra(Short *blk, UChar *rec, Int lx)
{
    int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
    int res, res2;
    uint32 dst_word;
    int row;

    for (row = 0; row < 8; row++)
    {
        Short *coef = blk + 8 * row;
        UChar *line = rec + row * lx;

        x2 = coef[2];
        coef[2] = 0;
        x1 = coef[1];
        coef[1] = 0;
        x0 = coef[0] * 256 + 8192; /* 8192: rounding for the final >> 14 */
        coef[0] = 0;

        /* even part */
        x4 = x0;
        x6 = (W6 * x2 + 4) >> 3;
        x2 = (W2 * x2 + 4) >> 3;
        x8 = x0 - x2;
        x0 += x2;
        x2 = x8;
        x8 = x4 - x6;
        x4 += x6;
        x6 = x8;

        /* odd part */
        x7 = (W7 * x1 + 4) >> 3;
        x1 = (W1 * x1 + 4) >> 3;
        x3 = x7;
        x5 = (181 * (x1 - x7) + 128) >> 8;
        x7 = (181 * (x1 + x7) + 128) >> 8;

        res = ((x0 + x1) >> 14);
        CLIP_RESULT(res);
        res2 = ((x4 + x7) >> 14);
        CLIP_RESULT(res2);
        dst_word = (uint32)res | ((uint32)res2 << 8);
        res = ((x6 + x5) >> 14);
        CLIP_RESULT(res);
        dst_word |= ((uint32)res << 16);
        res = ((x2 + x3) >> 14);
        CLIP_RESULT(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)line) = dst_word;

        res = ((x2 - x3) >> 14);
        CLIP_RESULT(res);
        res2 = ((x6 - x5) >> 14);
        CLIP_RESULT(res2);
        dst_word = (uint32)res | ((uint32)res2 << 8);
        res = ((x4 - x7) >> 14);
        CLIP_RESULT(res);
        dst_word |= ((uint32)res << 16);
        res = ((x0 - x1) >> 14);
        CLIP_RESULT(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)(line + 4)) = dst_word;
    }
}
987
/*
 * Intra row pass that reads the first four coefficients of each row.
 *
 * For each of the 8 rows r: reads and clears blk[8r] .. blk[8r+3], computes
 * eight values (each `>> 14`), clips them to 0..255 and stores them as the
 * eight bytes at rec + r*lx.
 *
 * Rows are addressed by index instead of first stepping blk/rec to before
 * the start of their buffers; `* 256` replaces `<< 8` on a possibly negative
 * coefficient; and the output bytes are packed as uint32 because `res << 24`
 * overflows a signed int for res >= 128. Results are unchanged.
 *
 * NOTE(review): bytes are written as uint32 words with the first output in
 * the low-order byte, i.e. this assumes 4-byte-aligned rows and little-endian
 * byte order -- confirm for the target.
 */
void idct_row4Intra(Short *blk, UChar *rec, Int lx)
{
    int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
    int res, res2;
    uint32 dst_word;
    int row;

    for (row = 0; row < 8; row++)
    {
        Short *coef = blk + 8 * row;
        UChar *line = rec + row * lx;

        x2 = coef[2];
        coef[2] = 0;
        x1 = coef[1];
        coef[1] = 0;
        x3 = coef[3];
        coef[3] = 0;
        x0 = coef[0] * 256 + 8192; /* 8192: rounding for the final >> 14 */
        coef[0] = 0;

        /* even part */
        x4 = x0;
        x6 = (W6 * x2 + 4) >> 3;
        x2 = (W2 * x2 + 4) >> 3;
        x8 = x0 - x2;
        x0 += x2;
        x2 = x8;
        x8 = x4 - x6;
        x4 += x6;
        x6 = x8;

        /* odd part */
        x7 = (W7 * x1 + 4) >> 3;
        x1 = (W1 * x1 + 4) >> 3;
        x5 = (W3 * x3 + 4) >> 3;
        x3 = (-W5 * x3 + 4) >> 3;
        x8 = x1 - x5;
        x1 += x5;
        x5 = x8;
        x8 = x7 - x3;
        x3 += x7;
        x7 = (181 * (x5 + x8) + 128) >> 8;
        x5 = (181 * (x5 - x8) + 128) >> 8;

        res = ((x0 + x1) >> 14);
        CLIP_RESULT(res);
        res2 = ((x4 + x7) >> 14);
        CLIP_RESULT(res2);
        dst_word = (uint32)res | ((uint32)res2 << 8);
        res = ((x6 + x5) >> 14);
        CLIP_RESULT(res);
        dst_word |= ((uint32)res << 16);
        res = ((x2 + x3) >> 14);
        CLIP_RESULT(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)line) = dst_word;

        res = ((x2 - x3) >> 14);
        CLIP_RESULT(res);
        res2 = ((x6 - x5) >> 14);
        CLIP_RESULT(res2);
        dst_word = (uint32)res | ((uint32)res2 << 8);
        res = ((x4 - x7) >> 14);
        CLIP_RESULT(res);
        dst_word |= ((uint32)res << 16);
        res = ((x0 - x1) >> 14);
        CLIP_RESULT(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)(line + 4)) = dst_word;
    }
}
1059
1060 #ifndef SMALL_DCT
/*
 * Intra row pass that reads only the second coefficient of each row.
 *
 * For each of the 8 rows r: reads and clears blk[8r+1] (blk[8r] is neither
 * read nor written), computes eight values (each `>> 14`, with 8192 as the
 * rounding term), clips them to 0..255 and stores them as the eight bytes at
 * rec + r*lx.
 *
 * Rows are addressed by index instead of first stepping rec to before the
 * start of its buffer, and the output bytes are packed as uint32 because
 * `res << 24` overflows a signed int for res >= 128. Results are unchanged.
 *
 * NOTE(review): bytes are written as uint32 words with the first output in
 * the low-order byte, i.e. this assumes 4-byte-aligned rows and little-endian
 * byte order -- confirm for the target.
 */
void idct_row0x40Intra(Short *blk, UChar *rec, Int lx)
{
    int32 x1, x2, x4, x5;
    int res, res2;
    uint32 dst_word;
    int row;

    for (row = 0; row < 8; row++)
    {
        Short *coef = blk + 8 * row;
        UChar *line = rec + row * lx;

        /* shortcut */
        x4 = coef[1];
        coef[1] = 0;

        /* first stage */
        x5 = (W7 * x4 + 4) >> 3;
        x4 = (W1 * x4 + 4) >> 3;

        /* third stage */
        x2 = (181 * (x4 + x5) + 128) >> 8;
        x1 = (181 * (x4 - x5) + 128) >> 8;

        /* fourth stage */
        res = ((8192 + x4) >> 14);
        CLIP_RESULT(res);
        res2 = ((8192 + x2) >> 14);
        CLIP_RESULT(res2);
        dst_word = (uint32)res | ((uint32)res2 << 8);
        res = ((8192 + x1) >> 14);
        CLIP_RESULT(res);
        dst_word |= ((uint32)res << 16);
        res = ((8192 + x5) >> 14);
        CLIP_RESULT(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)line) = dst_word;

        res = ((8192 - x5) >> 14);
        CLIP_RESULT(res);
        res2 = ((8192 - x1) >> 14);
        CLIP_RESULT(res2);
        dst_word = (uint32)res | ((uint32)res2 << 8);
        res = ((8192 - x2) >> 14);
        CLIP_RESULT(res);
        dst_word |= ((uint32)res << 16);
        res = ((8192 - x4) >> 14);
        CLIP_RESULT(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)(line + 4)) = dst_word;
    }
}
1115
/*
 * Intra row pass that reads only the third coefficient of each row.
 *
 * For each of the 8 rows r: reads and clears blk[8r+2], computes four
 * distinct values (each `>> 14`, with 8192 as the rounding term), clips them
 * to 0..255 and stores them mirror-symmetrically as the eight bytes at
 * rec + r*lx.
 *
 * Rows are addressed by index instead of first stepping rec to before the
 * start of its buffer, and the output bytes are packed as uint32 because
 * `res << 24` overflows a signed int for res >= 128. Results are unchanged.
 *
 * NOTE(review): bytes are written as uint32 words with the first output in
 * the low-order byte, i.e. this assumes 4-byte-aligned rows and little-endian
 * byte order -- confirm for the target.
 */
void idct_row0x20Intra(Short *blk, UChar *rec, Int lx)
{
    int32 x0, x2, x4, x6;
    int res, res2;
    uint32 dst_word;
    int row;

    for (row = 0; row < 8; row++)
    {
        Short *coef = blk + 8 * row;
        UChar *line = rec + row * lx;

        x2 = coef[2];
        coef[2] = 0;

        x6 = (W6 * x2 + 4) >> 3;
        x2 = (W2 * x2 + 4) >> 3;
        x0 = 8192 + x2;
        x2 = 8192 - x2;
        x4 = 8192 + x6;
        x6 = 8192 - x6;

        res = ((x0) >> 14);
        CLIP_RESULT(res);
        res2 = ((x4) >> 14);
        CLIP_RESULT(res2);
        dst_word = (uint32)res | ((uint32)res2 << 8);
        res = ((x6) >> 14);
        CLIP_RESULT(res);
        dst_word |= ((uint32)res << 16);
        res = ((x2) >> 14);
        CLIP_RESULT(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)line) = dst_word;

        res = ((x2) >> 14);
        CLIP_RESULT(res);
        res2 = ((x6) >> 14);
        CLIP_RESULT(res2);
        dst_word = (uint32)res | ((uint32)res2 << 8);
        res = ((x4) >> 14);
        CLIP_RESULT(res);
        dst_word |= ((uint32)res << 16);
        res = ((x0) >> 14);
        CLIP_RESULT(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)(line + 4)) = dst_word;
    }
}
1168
/*
 * Intra row pass that reads only the fourth coefficient of each row.
 *
 * For each of the 8 rows r: reads and clears blk[8r+3], computes eight
 * values (each `>> 14`, with 8192 as the rounding term), clips them to
 * 0..255 and stores them as the eight bytes at rec + r*lx.
 *
 * Rows are addressed by index instead of first stepping rec to before the
 * start of its buffer, and the output bytes are packed as uint32 because
 * `res << 24` overflows a signed int for res >= 128. Results are unchanged.
 *
 * NOTE(review): bytes are written as uint32 words with the first output in
 * the low-order byte, i.e. this assumes 4-byte-aligned rows and little-endian
 * byte order -- confirm for the target.
 */
void idct_row0x10Intra(Short *blk, UChar *rec, Int lx)
{
    int32 x1, x3, x5, x7;
    int res, res2;
    uint32 dst_word;
    int row;

    for (row = 0; row < 8; row++)
    {
        Short *coef = blk + 8 * row;
        UChar *line = rec + row * lx;

        x3 = coef[3];
        coef[3] = 0;

        x1 = (W3 * x3 + 4) >> 3;
        x3 = (W5 * x3 + 4) >> 3;

        x7 = (181 * (x3 - x1) + 128) >> 8;
        x5 = (-181 * (x1 + x3) + 128) >> 8;

        res = ((8192 + x1) >> 14);
        CLIP_RESULT(res);
        res2 = ((8192 + x7) >> 14);
        CLIP_RESULT(res2);
        dst_word = (uint32)res | ((uint32)res2 << 8);
        res = ((8192 + x5) >> 14);
        CLIP_RESULT(res);
        dst_word |= ((uint32)res << 16);
        res = ((8192 - x3) >> 14);
        CLIP_RESULT(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)line) = dst_word;

        res = ((8192 + x3) >> 14);
        CLIP_RESULT(res);
        res2 = ((8192 - x5) >> 14);
        CLIP_RESULT(res2);
        dst_word = (uint32)res | ((uint32)res2 << 8);
        res = ((8192 - x7) >> 14);
        CLIP_RESULT(res);
        dst_word |= ((uint32)res << 16);
        res = ((8192 - x1) >> 14);
        CLIP_RESULT(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)(line + 4)) = dst_word;
    }
}
1219
1220 #endif /* SMALL_DCT */
/*
 * General intra row pass: uses all eight coefficients of each row.
 *
 * For each of the 8 rows r: reads and clears blk[8r] .. blk[8r+7], computes
 * eight values (each `>> 14`), clips them to 0..255 and stores them as the
 * eight bytes at rec + r*lx.
 *
 * Rows are addressed by index instead of first stepping blk/rec to before
 * the start of their buffers; `* 256` replaces `<< 8` on possibly negative
 * coefficients; and the output bytes are packed as uint32 because
 * `res << 24` overflows a signed int for res >= 128. Results are unchanged.
 *
 * NOTE(review): bytes are written as uint32 words with the first output in
 * the low-order byte, i.e. this assumes 4-byte-aligned rows and little-endian
 * byte order -- confirm for the target.
 */
void idct_rowIntra(Short *blk, UChar *rec, Int lx)
{
    int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
    int row;
    int res, res2;
    uint32 dst_word;

    for (row = 0; row < 8; row++)
    {
        Short *coef = blk + 8 * row;
        UChar *line = rec + row * lx;

        x1 = (int32)coef[4] * 256;
        coef[4] = 0;
        x2 = coef[6];
        coef[6] = 0;
        x3 = coef[2];
        coef[2] = 0;
        x4 = coef[1];
        coef[1] = 0;
        x5 = coef[7];
        coef[7] = 0;
        x6 = coef[5];
        coef[5] = 0;
        x7 = coef[3];
        coef[3] = 0;
        x0 = coef[0] * 256 + 8192; /* 8192: rounding for the final >> 14 */
        coef[0] = 0;

        /* first stage */
        x8 = W7 * (x4 + x5) + 4;
        x4 = (x8 + (W1 - W7) * x4) >> 3;
        x5 = (x8 - (W1 + W7) * x5) >> 3;
        x8 = W3 * (x6 + x7) + 4;
        x6 = (x8 - (W3 - W5) * x6) >> 3;
        x7 = (x8 - (W3 + W5) * x7) >> 3;

        /* second stage */
        x8 = x0 + x1;
        x0 -= x1;
        x1 = W6 * (x3 + x2) + 4;
        x2 = (x1 - (W2 + W6) * x2) >> 3;
        x3 = (x1 + (W2 - W6) * x3) >> 3;
        x1 = x4 + x6;
        x4 -= x6;
        x6 = x5 + x7;
        x5 -= x7;

        /* third stage */
        x7 = x8 + x3;
        x8 -= x3;
        x3 = x0 + x2;
        x0 -= x2;
        x2 = (181 * (x4 + x5) + 128) >> 8;
        x4 = (181 * (x4 - x5) + 128) >> 8;

        /* fourth stage */
        res = ((x7 + x1) >> 14);
        CLIP_RESULT(res);
        res2 = ((x3 + x2) >> 14);
        CLIP_RESULT(res2);
        dst_word = (uint32)res | ((uint32)res2 << 8);
        res = ((x0 + x4) >> 14);
        CLIP_RESULT(res);
        dst_word |= ((uint32)res << 16);
        res = ((x8 + x6) >> 14);
        CLIP_RESULT(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)line) = dst_word;

        res = ((x8 - x6) >> 14);
        CLIP_RESULT(res);
        res2 = ((x0 - x4) >> 14);
        CLIP_RESULT(res2);
        dst_word = (uint32)res | ((uint32)res2 << 8);
        res = ((x3 - x2) >> 14);
        CLIP_RESULT(res);
        dst_word |= ((uint32)res << 16);
        res = ((x7 - x1) >> 14);
        CLIP_RESULT(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)(line + 4)) = dst_word;
    }
}
1306
1307
1308 /* This function should not be called at all ****/
idct_row0zmv(Short * srce,UChar * rec,UChar * pred,Int lx)1309 void idct_row0zmv(Short *srce, UChar *rec, UChar *pred, Int lx)
1310 {
1311 OSCL_UNUSED_ARG(srce);
1312 OSCL_UNUSED_ARG(rec);
1313 OSCL_UNUSED_ARG(pred);
1314 OSCL_UNUSED_ARG(lx);
1315
1316 return;
1317 }
1318
/*
 * idct_row1zmv
 *
 * Row pass for blocks whose rows carry only their first coefficient.  For each
 * of the 8 rows the rounded value (blk[0] + 32) >> 6 is added to all eight
 * prediction pixels of that row, the sums are clipped to 0..255 and stored.
 *
 *   blk   8 rows of 8 coefficients; only element 0 of each row is read, and it
 *         is zeroed afterwards.
 *   rec   destination pixels; row r starts at rec + r * lx.
 *   pred  prediction pixels; row r starts at pred + 16 * r.
 *   lx    distance in bytes between consecutive destination rows.
 *
 * Pixels are moved four at a time as 32-bit words, first pixel in the
 * least-significant byte.
 * NOTE(review): assumes rec and pred are suitably aligned for uint32 access --
 * confirm against the callers.
 *
 * Rows are addressed from the unmodified base pointers (no pointer before the
 * start of a buffer is formed) and bytes are packed through unsigned
 * arithmetic so that shifting a pixel >= 128 by 24 cannot overflow an int.
 */
void idct_row1zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
{
    int tmp;
    int row, half;
    uint32 pred_word, dst_word;
    int res, res2;

    for (row = 0; row < 8; row++)
    {
        Short *coef = blk + 8 * row;
        UChar *src = pred + 16 * row;
        UChar *dst = rec + row * lx;

        tmp = (coef[0] + 32) >> 6;
        coef[0] = 0;

        /* Both 4-pixel halves of the row get the identical treatment. */
        for (half = 0; half < 8; half += 4)
        {
            pred_word = *((uint32*)(src + half)); /* read 4 bytes from pred */
            res = tmp + (pred_word & 0xFF);
            CLIP_RESULT(res);
            res2 = tmp + ((pred_word >> 8) & 0xFF);
            CLIP_RESULT(res2);
            dst_word = ((uint32)res2 << 8) | (uint32)res;
            res = tmp + ((pred_word >> 16) & 0xFF);
            CLIP_RESULT(res);
            dst_word |= ((uint32)res << 16);
            res = tmp + ((pred_word >> 24) & 0xFF);
            CLIP_RESULT(res);
            dst_word |= ((uint32)res << 24);
            *((uint32*)(dst + half)) = dst_word; /* save 4 bytes to dst */
        }
    }
    return;
}
1366
/*
 * idct_row2zmv
 *
 * Reduced row pass used when only the first two coefficients of each row can
 * be non-zero.  The row IDCT of elements 0 and 1 is computed, the prediction
 * is added, and the clipped result is written to rec.
 *
 *   blk   8 rows of 8 coefficients; elements 0 and 1 of each row are read and
 *         then zeroed.
 *   rec   destination pixels; row r starts at rec + r * lx.
 *   pred  prediction pixels; row r starts at pred + 16 * r.
 *   lx    distance in bytes between consecutive destination rows.
 *
 * ADD_AND_CLIP1..4 are defined outside this chunk; presumably they add byte
 * 0..3 of the local `pred_word` to their argument and clamp it to 0..255 --
 * confirm in the header.  The local must therefore keep the name `pred_word`.
 * NOTE(review): assumes rec and pred are suitably aligned for uint32 access.
 *
 * Rows are addressed from the unmodified base pointers (no pointer before the
 * start of a buffer is formed) and bytes are packed through unsigned
 * arithmetic so that shifting a pixel >= 128 by 24 cannot overflow an int.
 */
void idct_row2zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
{
    int32 x0, x1, x2, x4, x5;
    int row;
    uint32 pred_word, dst_word;
    int res, res2;

    for (row = 0; row < 8; row++)
    {
        Short *coef = blk + 8 * row;
        UChar *src = pred + 16 * row;
        UChar *dst = rec + row * lx;

        /* shortcut: only two inputs */
        x4 = coef[1];
        coef[1] = 0;
        /* multiply, not shift (value may be negative); 8192 rounds the >> 14 */
        x0 = (int32)coef[0] * 256 + 8192;
        coef[0] = 0;

        /* first stage */
        x5 = (W7 * x4 + 4) >> 3;
        x4 = (W1 * x4 + 4) >> 3;

        /* third stage */
        x2 = (181 * (x4 + x5) + 128) >> 8;
        x1 = (181 * (x4 - x5) + 128) >> 8;

        /* fourth stage: pixels 0..3 */
        pred_word = *((uint32*)src); /* read 4 bytes from pred */
        res = (x0 + x4) >> 14;
        ADD_AND_CLIP1(res);
        res2 = (x0 + x2) >> 14;
        ADD_AND_CLIP2(res2);
        dst_word = ((uint32)res2 << 8) | (uint32)res;
        res = (x0 + x1) >> 14;
        ADD_AND_CLIP3(res);
        dst_word |= ((uint32)res << 16);
        res = (x0 + x5) >> 14;
        ADD_AND_CLIP4(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)dst) = dst_word; /* save 4 bytes to dst */

        /* fourth stage: pixels 4..7 */
        pred_word = *((uint32*)(src + 4)); /* read 4 bytes from pred */
        res = (x0 - x5) >> 14;
        ADD_AND_CLIP1(res);
        res2 = (x0 - x1) >> 14;
        ADD_AND_CLIP2(res2);
        dst_word = ((uint32)res2 << 8) | (uint32)res;
        res = (x0 - x2) >> 14;
        ADD_AND_CLIP3(res);
        dst_word |= ((uint32)res << 16);
        res = (x0 - x4) >> 14;
        ADD_AND_CLIP4(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)(dst + 4)) = dst_word; /* save 4 bytes to dst */
    }
    return ;
}
1426
/*
 * idct_row3zmv
 *
 * Reduced row pass used when only the first three coefficients of each row can
 * be non-zero.  The row IDCT of elements 0, 1 and 2 is computed, the
 * prediction is added, and the clipped result is written to rec.
 *
 *   blk   8 rows of 8 coefficients; elements 0..2 of each row are read and
 *         then zeroed.
 *   rec   destination pixels; row r starts at rec + r * lx.
 *   pred  prediction pixels; row r starts at pred + 16 * r.
 *   lx    distance in bytes between consecutive destination rows.
 *
 * ADD_AND_CLIP1..4 are defined outside this chunk; presumably they add byte
 * 0..3 of the local `pred_word` to their argument and clamp it to 0..255 --
 * confirm in the header.  The local must therefore keep the name `pred_word`.
 * NOTE(review): assumes rec and pred are suitably aligned for uint32 access.
 *
 * Rows are addressed from the unmodified base pointers (no pointer before the
 * start of a buffer is formed) and bytes are packed through unsigned
 * arithmetic so that shifting a pixel >= 128 by 24 cannot overflow an int.
 */
void idct_row3zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
{
    int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
    int row;
    uint32 pred_word, dst_word;
    int res, res2;

    for (row = 0; row < 8; row++)
    {
        Short *coef = blk + 8 * row;
        UChar *src = pred + 16 * row;
        UChar *dst = rec + row * lx;

        x2 = coef[2];
        coef[2] = 0;
        x1 = coef[1];
        coef[1] = 0;
        /* multiply, not shift (value may be negative); 8192 rounds the >> 14 */
        x0 = (int32)coef[0] * 256 + 8192;
        coef[0] = 0;

        /* even part: x0/x2 carry +/- the W2 term, x4/x6 carry +/- the W6 term */
        x4 = x0;
        x6 = (W6 * x2 + 4) >> 3;
        x2 = (W2 * x2 + 4) >> 3;
        x8 = x0 - x2;
        x0 += x2;
        x2 = x8;
        x8 = x4 - x6;
        x4 += x6;
        x6 = x8;

        /* odd part, driven by element 1 only */
        x7 = (W7 * x1 + 4) >> 3;
        x1 = (W1 * x1 + 4) >> 3;
        x3 = x7;
        x5 = (181 * (x1 - x7) + 128) >> 8;
        x7 = (181 * (x1 + x7) + 128) >> 8;

        /* pixels 0..3 */
        pred_word = *((uint32*)src); /* read 4 bytes from pred */
        res = (x0 + x1) >> 14;
        ADD_AND_CLIP1(res);
        res2 = (x4 + x7) >> 14;
        ADD_AND_CLIP2(res2);
        dst_word = ((uint32)res2 << 8) | (uint32)res;
        res = (x6 + x5) >> 14;
        ADD_AND_CLIP3(res);
        dst_word |= ((uint32)res << 16);
        res = (x2 + x3) >> 14;
        ADD_AND_CLIP4(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)dst) = dst_word; /* save 4 bytes to dst */

        /* pixels 4..7 */
        pred_word = *((uint32*)(src + 4)); /* read 4 bytes from pred */
        res = (x2 - x3) >> 14;
        ADD_AND_CLIP1(res);
        res2 = (x6 - x5) >> 14;
        ADD_AND_CLIP2(res2);
        dst_word = ((uint32)res2 << 8) | (uint32)res;
        res = (x4 - x7) >> 14;
        ADD_AND_CLIP3(res);
        dst_word |= ((uint32)res << 16);
        res = (x0 - x1) >> 14;
        ADD_AND_CLIP4(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)(dst + 4)) = dst_word; /* save 4 bytes to dst */
    }

    return ;
}
1497
/*
 * idct_row4zmv
 *
 * Reduced row pass used when only the first four coefficients of each row can
 * be non-zero.  The row IDCT of elements 0..3 is computed, the prediction is
 * added, and the clipped result is written to rec.
 *
 *   blk   8 rows of 8 coefficients; elements 0..3 of each row are read and
 *         then zeroed.
 *   rec   destination pixels; row r starts at rec + r * lx.
 *   pred  prediction pixels; row r starts at pred + 16 * r.
 *   lx    distance in bytes between consecutive destination rows.
 *
 * ADD_AND_CLIP1..4 are defined outside this chunk; presumably they add byte
 * 0..3 of the local `pred_word` to their argument and clamp it to 0..255 --
 * confirm in the header.  The local must therefore keep the name `pred_word`.
 * NOTE(review): assumes rec and pred are suitably aligned for uint32 access.
 *
 * Rows are addressed from the unmodified base pointers (no pointer before the
 * start of a buffer is formed) and bytes are packed through unsigned
 * arithmetic so that shifting a pixel >= 128 by 24 cannot overflow an int.
 */
void idct_row4zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
{
    int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
    int row;
    uint32 pred_word, dst_word;
    int res, res2;

    for (row = 0; row < 8; row++)
    {
        Short *coef = blk + 8 * row;
        UChar *src = pred + 16 * row;
        UChar *dst = rec + row * lx;

        x2 = coef[2];
        coef[2] = 0;
        x1 = coef[1];
        coef[1] = 0;
        x3 = coef[3];
        coef[3] = 0;
        /* multiply, not shift (value may be negative); 8192 rounds the >> 14 */
        x0 = (int32)coef[0] * 256 + 8192;
        coef[0] = 0;

        /* even part: x0/x2 carry +/- the W2 term, x4/x6 carry +/- the W6 term */
        x4 = x0;
        x6 = (W6 * x2 + 4) >> 3;
        x2 = (W2 * x2 + 4) >> 3;
        x8 = x0 - x2;
        x0 += x2;
        x2 = x8;
        x8 = x4 - x6;
        x4 += x6;
        x6 = x8;

        /* odd part, driven by elements 1 and 3 */
        x7 = (W7 * x1 + 4) >> 3;
        x1 = (W1 * x1 + 4) >> 3;
        x5 = (W3 * x3 + 4) >> 3;
        x3 = (- W5 * x3 + 4) >> 3;
        x8 = x1 - x5;
        x1 += x5;
        x5 = x8;
        x8 = x7 - x3;
        x3 += x7;
        x7 = (181 * (x5 + x8) + 128) >> 8;
        x5 = (181 * (x5 - x8) + 128) >> 8;

        /* pixels 0..3 */
        pred_word = *((uint32*)src); /* read 4 bytes from pred */
        res = (x0 + x1) >> 14;
        ADD_AND_CLIP1(res);
        res2 = (x4 + x7) >> 14;
        ADD_AND_CLIP2(res2);
        dst_word = ((uint32)res2 << 8) | (uint32)res;
        res = (x6 + x5) >> 14;
        ADD_AND_CLIP3(res);
        dst_word |= ((uint32)res << 16);
        res = (x2 + x3) >> 14;
        ADD_AND_CLIP4(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)dst) = dst_word; /* save 4 bytes to dst */

        /* pixels 4..7 */
        pred_word = *((uint32*)(src + 4)); /* read 4 bytes from pred */
        res = (x2 - x3) >> 14;
        ADD_AND_CLIP1(res);
        res2 = (x6 - x5) >> 14;
        ADD_AND_CLIP2(res2);
        dst_word = ((uint32)res2 << 8) | (uint32)res;
        res = (x4 - x7) >> 14;
        ADD_AND_CLIP3(res);
        dst_word |= ((uint32)res << 16);
        res = (x0 - x1) >> 14;
        ADD_AND_CLIP4(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)(dst + 4)) = dst_word; /* save 4 bytes to dst */
    }
    return ;
}
1573
1574 #ifndef SMALL_DCT
/*
 * idct_row0x40zmv
 *
 * Reduced row pass that reads only element 1 of each row (element 0 is not
 * read and is treated as zero, so the even part collapses to the rounding
 * constant 8192).  The prediction is added and the clipped result is written
 * to rec.
 *
 *   blk   8 rows of 8 coefficients; element 1 of each row is read and zeroed.
 *   rec   destination pixels; row r starts at rec + r * lx.
 *   pred  prediction pixels; row r starts at pred + 16 * r.
 *   lx    distance in bytes between consecutive destination rows.
 *
 * ADD_AND_CLIP1..4 are defined outside this chunk; presumably they add byte
 * 0..3 of the local `pred_word` to their argument and clamp it to 0..255 --
 * confirm in the header.  The local must therefore keep the name `pred_word`.
 * NOTE(review): assumes rec and pred are suitably aligned for uint32 access.
 *
 * Rows are addressed from the unmodified base pointers (no pointer before the
 * start of rec/pred is formed) and bytes are packed through unsigned
 * arithmetic so that shifting a pixel >= 128 by 24 cannot overflow an int.
 */
void idct_row0x40zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
{
    int32 x1, x2, x4, x5;
    int row;
    uint32 pred_word, dst_word;
    int res, res2;

    for (row = 0; row < 8; row++)
    {
        Short *coef = blk + 8 * row;
        UChar *src = pred + 16 * row;
        UChar *dst = rec + row * lx;

        /* shortcut: single input */
        x4 = coef[1];
        coef[1] = 0;

        /* first stage */
        x5 = (W7 * x4 + 4) >> 3;
        x4 = (W1 * x4 + 4) >> 3;

        /* third stage */
        x2 = (181 * (x4 + x5) + 128) >> 8;
        x1 = (181 * (x4 - x5) + 128) >> 8;

        /* fourth stage: 8192 is the rounding term for the >> 14; pixels 0..3 */
        pred_word = *((uint32*)src); /* read 4 bytes from pred */
        res = (8192 + x4) >> 14;
        ADD_AND_CLIP1(res);
        res2 = (8192 + x2) >> 14;
        ADD_AND_CLIP2(res2);
        dst_word = ((uint32)res2 << 8) | (uint32)res;
        res = (8192 + x1) >> 14;
        ADD_AND_CLIP3(res);
        dst_word |= ((uint32)res << 16);
        res = (8192 + x5) >> 14;
        ADD_AND_CLIP4(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)dst) = dst_word; /* save 4 bytes to dst */

        /* pixels 4..7 */
        pred_word = *((uint32*)(src + 4)); /* read 4 bytes from pred */
        res = (8192 - x5) >> 14;
        ADD_AND_CLIP1(res);
        res2 = (8192 - x1) >> 14;
        ADD_AND_CLIP2(res2);
        dst_word = ((uint32)res2 << 8) | (uint32)res;
        res = (8192 - x2) >> 14;
        ADD_AND_CLIP3(res);
        dst_word |= ((uint32)res << 16);
        res = (8192 - x4) >> 14;
        ADD_AND_CLIP4(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)(dst + 4)) = dst_word; /* save 4 bytes to dst */
    }
    return ;
}
1632
/*
 * idct_row0x20zmv
 *
 * Reduced row pass that reads only element 2 of each row (all other elements
 * are not read and are treated as zero).  The prediction is added and the
 * clipped result is written to rec.
 *
 *   blk   8 rows of 8 coefficients; element 2 of each row is read and zeroed.
 *   rec   destination pixels; row r starts at rec + r * lx.
 *   pred  prediction pixels; row r starts at pred + 16 * r.
 *   lx    distance in bytes between consecutive destination rows.
 *
 * ADD_AND_CLIP1..4 are defined outside this chunk; presumably they add byte
 * 0..3 of the local `pred_word` to their argument and clamp it to 0..255 --
 * confirm in the header.  The local must therefore keep the name `pred_word`.
 * NOTE(review): assumes rec and pred are suitably aligned for uint32 access.
 *
 * Rows are addressed from the unmodified base pointers (no pointer before the
 * start of rec/pred is formed) and bytes are packed through unsigned
 * arithmetic so that shifting a pixel >= 128 by 24 cannot overflow an int.
 */
void idct_row0x20zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
{
    int32 x0, x2, x4, x6;
    int row;
    uint32 pred_word, dst_word;
    int res, res2;

    for (row = 0; row < 8; row++)
    {
        Short *coef = blk + 8 * row;
        UChar *src = pred + 16 * row;
        UChar *dst = rec + row * lx;

        x2 = coef[2];
        coef[2] = 0;

        /* 8192 is the rounding term for the final >> 14 */
        x6 = (W6 * x2 + 4) >> 3;
        x2 = (W2 * x2 + 4) >> 3;
        x0 = 8192 + x2;
        x2 = 8192 - x2;
        x4 = 8192 + x6;
        x6 = 8192 - x6;

        /* pixels 0..3 */
        pred_word = *((uint32*)src); /* read 4 bytes from pred */
        res = (x0) >> 14;
        ADD_AND_CLIP1(res);
        res2 = (x4) >> 14;
        ADD_AND_CLIP2(res2);
        dst_word = ((uint32)res2 << 8) | (uint32)res;
        res = (x6) >> 14;
        ADD_AND_CLIP3(res);
        dst_word |= ((uint32)res << 16);
        res = (x2) >> 14;
        ADD_AND_CLIP4(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)dst) = dst_word; /* save 4 bytes to dst */

        /* pixels 4..7: same four values in reverse order */
        pred_word = *((uint32*)(src + 4)); /* read 4 bytes from pred */
        res = (x2) >> 14;
        ADD_AND_CLIP1(res);
        res2 = (x6) >> 14;
        ADD_AND_CLIP2(res2);
        dst_word = ((uint32)res2 << 8) | (uint32)res;
        res = (x4) >> 14;
        ADD_AND_CLIP3(res);
        dst_word |= ((uint32)res << 16);
        res = (x0) >> 14;
        ADD_AND_CLIP4(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)(dst + 4)) = dst_word; /* save 4 bytes to dst */
    }

    return ;
}
1689
/*
 * idct_row0x10zmv
 *
 * Reduced row pass that reads only element 3 of each row (all other elements
 * are not read and are treated as zero).  The prediction is added and the
 * clipped result is written to rec.
 *
 *   blk   8 rows of 8 coefficients; element 3 of each row is read and zeroed.
 *   rec   destination pixels; row r starts at rec + r * lx.
 *   pred  prediction pixels; row r starts at pred + 16 * r.
 *   lx    distance in bytes between consecutive destination rows.
 *
 * ADD_AND_CLIP1..4 are defined outside this chunk; presumably they add byte
 * 0..3 of the local `pred_word` to their argument and clamp it to 0..255 --
 * confirm in the header.  The local must therefore keep the name `pred_word`.
 * NOTE(review): assumes rec and pred are suitably aligned for uint32 access.
 *
 * Rows are addressed from the unmodified base pointers (no pointer before the
 * start of rec/pred is formed) and bytes are packed through unsigned
 * arithmetic so that shifting a pixel >= 128 by 24 cannot overflow an int.
 */
void idct_row0x10zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
{
    int32 x1, x3, x5, x7;
    int row;
    uint32 pred_word, dst_word;
    int res, res2;

    for (row = 0; row < 8; row++)
    {
        Short *coef = blk + 8 * row;
        UChar *src = pred + 16 * row;
        UChar *dst = rec + row * lx;

        x3 = coef[3];
        coef[3] = 0;

        x1 = (W3 * x3 + 4) >> 3;
        x3 = (-W5 * x3 + 4) >> 3;

        x7 = (-181 * (x3 + x1) + 128) >> 8;
        x5 = (181 * (x3 - x1) + 128) >> 8;

        /* 8192 is the rounding term for the final >> 14; pixels 0..3 */
        pred_word = *((uint32*)src); /* read 4 bytes from pred */
        res = (8192 + x1) >> 14;
        ADD_AND_CLIP1(res);
        res2 = (8192 + x7) >> 14;
        ADD_AND_CLIP2(res2);
        dst_word = ((uint32)res2 << 8) | (uint32)res;
        res = (8192 + x5) >> 14;
        ADD_AND_CLIP3(res);
        dst_word |= ((uint32)res << 16);
        res = (8192 + x3) >> 14;
        ADD_AND_CLIP4(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)dst) = dst_word; /* save 4 bytes to dst */

        /* pixels 4..7 */
        pred_word = *((uint32*)(src + 4)); /* read 4 bytes from pred */
        res = (8192 - x3) >> 14;
        ADD_AND_CLIP1(res);
        res2 = (8192 - x5) >> 14;
        ADD_AND_CLIP2(res2);
        dst_word = ((uint32)res2 << 8) | (uint32)res;
        res = (8192 - x7) >> 14;
        ADD_AND_CLIP3(res);
        dst_word |= ((uint32)res << 16);
        res = (8192 - x1) >> 14;
        ADD_AND_CLIP4(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)(dst + 4)) = dst_word; /* save 4 bytes to dst */
    }
    return ;
}
1743
1744 #endif /* SMALL_DCT */
1745
/*
 * idct_rowzmv
 *
 * General-case row pass of the 8x8 inverse DCT for predicted blocks: all eight
 * coefficients of every row are read and transformed, the prediction is added,
 * and the clipped result is written to rec.
 *
 *   blk   8 rows of 8 coefficients; every coefficient is zeroed after it is
 *         read.
 *   rec   destination pixels; row r starts at rec + r * lx.
 *   pred  prediction pixels; row r starts at pred + 16 * r.
 *   lx    distance in bytes between consecutive destination rows.
 *
 * ADD_AND_CLIP1..4 are defined outside this chunk; presumably they add byte
 * 0..3 of the local `pred_word` to their argument and clamp it to 0..255 --
 * confirm in the header.  The local must therefore keep the name `pred_word`.
 * NOTE(review): assumes rec and pred are suitably aligned for uint32 access.
 *
 * Rows are addressed from the unmodified base pointers (no pointer before the
 * start of a buffer is formed) and bytes are packed through unsigned
 * arithmetic so that shifting a pixel >= 128 by 24 cannot overflow an int.
 */
void idct_rowzmv(Short *blk, UChar *rec, UChar *pred, Int lx)
{
    int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
    int row;
    uint32 pred_word, dst_word;
    int res, res2;

    for (row = 0; row < 8; row++)
    {
        Short *coef = blk + 8 * row;
        UChar *src = pred + 16 * row;
        UChar *dst = rec + row * lx;

        /* Multiply instead of shifting: coefficients may be negative. */
        x1 = (int32)coef[4] * 256;
        coef[4] = 0;
        x2 = coef[6];
        coef[6] = 0;
        x3 = coef[2];
        coef[2] = 0;
        x4 = coef[1];
        coef[1] = 0;
        x5 = coef[7];
        coef[7] = 0;
        x6 = coef[5];
        coef[5] = 0;
        x7 = coef[3];
        coef[3] = 0;
        /* 8192 is half of the final 1 << 14 divisor: rounding for stage four */
        x0 = (int32)coef[0] * 256 + 8192;
        coef[0] = 0;

        /* first stage */
        x8 = W7 * (x4 + x5) + 4;
        x4 = (x8 + (W1 - W7) * x4) >> 3;
        x5 = (x8 - (W1 + W7) * x5) >> 3;
        x8 = W3 * (x6 + x7) + 4;
        x6 = (x8 - (W3 - W5) * x6) >> 3;
        x7 = (x8 - (W3 + W5) * x7) >> 3;

        /* second stage */
        x8 = x0 + x1;
        x0 -= x1;
        x1 = W6 * (x3 + x2) + 4;
        x2 = (x1 - (W2 + W6) * x2) >> 3;
        x3 = (x1 + (W2 - W6) * x3) >> 3;
        x1 = x4 + x6;
        x4 -= x6;
        x6 = x5 + x7;
        x5 -= x7;

        /* third stage */
        x7 = x8 + x3;
        x8 -= x3;
        x3 = x0 + x2;
        x0 -= x2;
        x2 = (181 * (x4 + x5) + 128) >> 8;
        x4 = (181 * (x4 - x5) + 128) >> 8;

        /* fourth stage: pixels 0..3 */
        pred_word = *((uint32*)src); /* read 4 bytes from pred */

        res = (x7 + x1) >> 14;
        ADD_AND_CLIP1(res);
        res2 = (x3 + x2) >> 14;
        ADD_AND_CLIP2(res2);
        dst_word = ((uint32)res2 << 8) | (uint32)res;
        res = (x0 + x4) >> 14;
        ADD_AND_CLIP3(res);
        dst_word |= ((uint32)res << 16);
        res = (x8 + x6) >> 14;
        ADD_AND_CLIP4(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)dst) = dst_word; /* save 4 bytes to dst */

        /* fourth stage: pixels 4..7 (mirror of the first half) */
        pred_word = *((uint32*)(src + 4)); /* read 4 bytes from pred */

        res = (x8 - x6) >> 14;
        ADD_AND_CLIP1(res);
        res2 = (x0 - x4) >> 14;
        ADD_AND_CLIP2(res2);
        dst_word = ((uint32)res2 << 8) | (uint32)res;
        res = (x3 - x2) >> 14;
        ADD_AND_CLIP3(res);
        dst_word |= ((uint32)res << 16);
        res = (x7 - x1) >> 14;
        ADD_AND_CLIP4(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)(dst + 4)) = dst_word; /* save 4 bytes to dst */
    }
    return;
}
1837
1838 /*----------------------------------------------------------------------------
1839 ; End Function: idctcol
1840 ----------------------------------------------------------------------------*/
1841 /* ======================================================================== */
1842 /* Function : BlockIDCTMotionComp */
1843 /* Date : 10/16/2000 */
1844 /* Purpose : fast IDCT routine */
1845 /* In/out : */
1846 /* Int* coeff_in Dequantized coefficient
1847 Int block_out output IDCT coefficient
1848 Int maxval clip value */
1849 /* Modified : 7/31/01, add checking for all-zero and DC-only block. */
1850 /* do 8 columns at a time */
1851 /* 8/2/01, do column first then row-IDCT. */
1852 /* 8/2/01, remove clipping (included in motion comp). */
1853 /* 8/7/01, combine with motion comp. */
1854 /* 8/8/01, use AAN IDCT */
1855 /* 9/4/05, use Chen's IDCT and 16 bit block */
1856 /* ======================================================================== */
BlockIDCTMotionComp(Short * block,UChar * bitmapcol,UChar bitmaprow,Int dctMode,UChar * rec,UChar * pred,Int lx_intra)1857 void BlockIDCTMotionComp(Short *block, UChar *bitmapcol, UChar bitmaprow,
1858 Int dctMode, UChar *rec, UChar *pred, Int lx_intra)
1859 {
1860 Int i;
1861 Int tmp, tmp2;
1862 ULong tmp4;
1863 Int bmap;
1864 Short *ptr = block;
1865 UChar *endcol;
1866 UInt mask = 0xFF;
1867 Int lx = lx_intra >> 1;
1868 Int intra = (lx_intra & 1);
1869
1870 /* all-zero block */
1871 if (dctMode == 0 || bitmaprow == 0)
1872 {
1873 if (intra)
1874 {
1875 *((ULong*)rec) = *((ULong*)(rec + 4)) = 0;
1876 *((ULong*)(rec += lx)) = 0;
1877 *((ULong*)(rec + 4)) = 0;
1878 *((ULong*)(rec += lx)) = 0;
1879 *((ULong*)(rec + 4)) = 0;
1880 *((ULong*)(rec += lx)) = 0;
1881 *((ULong*)(rec + 4)) = 0;
1882 *((ULong*)(rec += lx)) = 0;
1883 *((ULong*)(rec + 4)) = 0;
1884 *((ULong*)(rec += lx)) = 0;
1885 *((ULong*)(rec + 4)) = 0;
1886 *((ULong*)(rec += lx)) = 0;
1887 *((ULong*)(rec + 4)) = 0;
1888 *((ULong*)(rec += lx)) = 0;
1889 *((ULong*)(rec + 4)) = 0;
1890 return ;
1891 }
1892 else /* copy from previous frame */
1893 {
1894 *((ULong*)rec) = *((ULong*)pred);
1895 *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
1896 *((ULong*)(rec += lx)) = *((ULong*)(pred += 16));
1897 *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
1898 *((ULong*)(rec += lx)) = *((ULong*)(pred += 16));
1899 *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
1900 *((ULong*)(rec += lx)) = *((ULong*)(pred += 16));
1901 *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
1902 *((ULong*)(rec += lx)) = *((ULong*)(pred += 16));
1903 *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
1904 *((ULong*)(rec += lx)) = *((ULong*)(pred += 16));
1905 *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
1906 *((ULong*)(rec += lx)) = *((ULong*)(pred += 16));
1907 *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
1908 *((ULong*)(rec += lx)) = *((ULong*)(pred += 16));
1909 *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
1910 return ;
1911 }
1912 }
1913
1914 /* Test for DC only block */
1915 if (dctMode == 1 || (bitmaprow == 0x80 && bitmapcol[0] == 0x80))
1916 {
1917 i = ((block[0] << 3) + 32) >> 6;
1918 block[0] = 0;
1919 if (intra)
1920 {
1921 if ((UInt)i > mask) i = mask & (~(i >> 31));
1922
1923 tmp = i | (i << 8);
1924 tmp |= (tmp << 16);
1925
1926 *((ULong*)rec) = *((ULong*)(rec + 4)) = tmp;
1927 *((ULong*)(rec += lx)) = tmp;
1928 *((ULong*)(rec + 4)) = tmp;
1929 *((ULong*)(rec += lx)) = tmp;
1930 *((ULong*)(rec + 4)) = tmp;
1931 *((ULong*)(rec += lx)) = tmp;
1932 *((ULong*)(rec + 4)) = tmp;
1933 *((ULong*)(rec += lx)) = tmp;
1934 *((ULong*)(rec + 4)) = tmp;
1935 *((ULong*)(rec += lx)) = tmp;
1936 *((ULong*)(rec + 4)) = tmp;
1937 *((ULong*)(rec += lx)) = tmp;
1938 *((ULong*)(rec + 4)) = tmp;
1939 *((ULong*)(rec += lx)) = tmp;
1940 *((ULong*)(rec + 4)) = tmp;
1941
1942 return ;
1943 }
1944 else
1945 {
1946 endcol = rec + (lx << 3);
1947 do
1948 {
1949 tmp4 = *((ULong*)pred);
1950 tmp2 = tmp4 & 0xFF;
1951 tmp2 += i;
1952 if ((UInt)tmp2 > mask) tmp2 = mask & (~(tmp2 >> 31));
1953 tmp = (tmp4 >> 8) & 0xFF;
1954 tmp += i;
1955 if ((UInt)tmp > mask) tmp = mask & (~(tmp >> 31));
1956 tmp2 |= (tmp << 8);
1957 tmp = (tmp4 >> 16) & 0xFF;
1958 tmp += i;
1959 if ((UInt)tmp > mask) tmp = mask & (~(tmp >> 31));
1960 tmp2 |= (tmp << 16);
1961 tmp = (tmp4 >> 24) & 0xFF;
1962 tmp += i;
1963 if ((UInt)tmp > mask) tmp = mask & (~(tmp >> 31));
1964 tmp2 |= (tmp << 24);
1965 *((ULong*)rec) = tmp2;
1966
1967 tmp4 = *((ULong*)(pred + 4));
1968 tmp2 = tmp4 & 0xFF;
1969 tmp2 += i;
1970 if ((UInt)tmp2 > mask) tmp2 = mask & (~(tmp2 >> 31));
1971 tmp = (tmp4 >> 8) & 0xFF;
1972 tmp += i;
1973 if ((UInt)tmp > mask) tmp = mask & (~(tmp >> 31));
1974 tmp2 |= (tmp << 8);
1975 tmp = (tmp4 >> 16) & 0xFF;
1976 tmp += i;
1977 if ((UInt)tmp > mask) tmp = mask & (~(tmp >> 31));
1978 tmp2 |= (tmp << 16);
1979 tmp = (tmp4 >> 24) & 0xFF;
1980 tmp += i;
1981 if ((UInt)tmp > mask) tmp = mask & (~(tmp >> 31));
1982 tmp2 |= (tmp << 24);
1983 *((ULong*)(rec + 4)) = tmp2;
1984
1985 rec += lx;
1986 pred += 16;
1987 }
1988 while (rec < endcol);
1989 return ;
1990 }
1991 }
1992
1993 for (i = 0; i < dctMode; i++)
1994 {
1995 bmap = (Int)bitmapcol[i];
1996 if (bmap)
1997 {
1998 if ((bmap&0xf) == 0)
1999 (*(idctcolVCA[bmap>>4]))(ptr);
2000 else
2001 idct_col(ptr);
2002 }
2003 ptr++;
2004 }
2005
2006 if ((bitmaprow&0xf) == 0)
2007 {
2008 if (intra)
2009 (*(idctrowVCAIntra[(Int)(bitmaprow>>4)]))(block, rec, lx);
2010 else
2011 (*(idctrowVCAzmv[(Int)(bitmaprow>>4)]))(block, rec, pred, lx);
2012 }
2013 else
2014 {
2015 if (intra)
2016 idct_rowIntra(block, rec, lx);
2017 else
2018 idct_rowzmv(block, rec, pred, lx);
2019 }
2020 }
2021