1 /*
2  * Copyright (C) 2009 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 /*------------------------------------------------------------------------------
18 
19     Table of contents
20 
21      1. Include headers
22      2. External compiler flags
23      3. Module defines
24      4. Local function prototypes
25      5. Functions
26 
27 ------------------------------------------------------------------------------*/
28 
29 /*------------------------------------------------------------------------------
30     1. Include headers
31 ------------------------------------------------------------------------------*/
32 
33 #include "basetype.h"
34 #include "h264bsd_reconstruct.h"
35 #include "h264bsd_macroblock_layer.h"
36 #include "h264bsd_image.h"
37 #include "h264bsd_util.h"
38 
39 #ifdef H264DEC_OMXDL
40 #include "omxtypes.h"
41 #include "omxVC.h"
42 #include "armVC.h"
43 #endif /* H264DEC_OMXDL */
44 
45 #define UNUSED(x) (void)(x)
46 
47 /*------------------------------------------------------------------------------
48     2. External compiler flags
49 --------------------------------------------------------------------------------
50 
51 --------------------------------------------------------------------------------
52     3. Module defines
53 ------------------------------------------------------------------------------*/
54 
55 /* Switch off the following Lint messages for this file:
56  * Info 701: Shift left of signed quantity (int)
57  * Info 702: Shift right of signed quantity (int)
58  */
59 /*lint -e701 -e702 */
60 
61 /* Luma fractional-sample positions
62  *
63  *  G a b c H
64  *  d e f g
65  *  h i j k m
66  *  n p q r
67  *  M   s   N
68  *
69  *  G, H, M and N are integer sample positions
70  *  a-s are fractional samples that need to be interpolated.
71  */
72 #ifndef H264DEC_OMXDL
73 static const u32 lumaFracPos[4][4] = {
74   /* G  d  h  n    a  e  i  p    b  f  j   q     c   g   k   r */
75     {0, 1, 2, 3}, {4, 5, 6, 7}, {8, 9, 10, 11}, {12, 13, 14, 15}};
76 #endif /* H264DEC_OMXDL */
77 
78 /* clipping table, defined in h264bsd_intra_prediction.c */
79 extern const u8 h264bsdClip[];
80 
81 /*------------------------------------------------------------------------------
82     4. Local function prototypes
83 ------------------------------------------------------------------------------*/
84 
85 #ifndef H264DEC_OMXDL
86 
87 /*------------------------------------------------------------------------------
88 
89     Function: h264bsdInterpolateChromaHor
90 
91         Functional description:
92           This function performs chroma interpolation in horizontal direction.
93           Overfilling is done only if needed. Reference image (pRef) is
94           read at correct position and the predicted part is written to
95           macroblock's chrominance (predPartChroma)
96         Inputs:
97           pRef              pointer to reference frame Cb top-left corner
98           x0                integer x-coordinate for prediction
99           y0                integer y-coordinate for prediction
100           width             width of the reference frame chrominance in pixels
101           height            height of the reference frame chrominance in pixels
102           xFrac             horizontal fraction for prediction in 1/8 pixels
103           chromaPartWidth   width of the predicted part in pixels
104           chromaPartHeight  height of the predicted part in pixels
105         Outputs:
106           predPartChroma    pointer where predicted part is written
107 
108 ------------------------------------------------------------------------------*/
109 #ifndef H264DEC_ARM11
void h264bsdInterpolateChromaHor(
  u8 *pRef,
  u8 *predPartChroma,
  i32 x0,
  i32 y0,
  u32 width,
  u32 height,
  u32 xFrac,
  u32 chromaPartWidth,
  u32 chromaPartHeight)
{

/* Variables */

    /* scratch copy of both chroma components, stored back to back; used only
     * when the block to be read does not lie completely inside the picture */
    u8 block[9*8*2];
    u32 comp, rowPair, colPair;
    u32 leftWeight;
    u32 up0, up1, up2, low0, low1, low2;
    const u8 *srcUpper, *srcLower;
    u8 *dstUpper, *dstLower;

/* Code */

    ASSERT(predPartChroma);
    ASSERT(chromaPartWidth);
    ASSERT(chromaPartHeight);
    ASSERT(xFrac < 8);
    ASSERT(pRef);

    /* the filter reads one column to the right of the part, hence the +1 */
    if ((x0 < 0) || ((u32)x0+chromaPartWidth+1 > width) ||
        (y0 < 0) || ((u32)y0+chromaPartHeight > height))
    {
        const u32 fillWidth = chromaPartWidth + 1;

        /* first component */
        h264bsdFillBlock(pRef, block, x0, y0, width, height,
            fillWidth, chromaPartHeight, fillWidth);
        /* second component starts width*height bytes after the first */
        pRef += width * height;
        h264bsdFillBlock(pRef, block + fillWidth*chromaPartHeight,
            x0, y0, width, height, fillWidth,
            chromaPartHeight, fillWidth);

        /* from here on the scratch copy acts as the reference picture */
        pRef = block;
        x0 = 0;
        y0 = 0;
        width = fillWidth;
        height = chromaPartHeight;
    }

    leftWeight = 8 - xFrac;

    for (comp = 0; comp < 2; comp++)
    {

        srcUpper = pRef + (comp * height + (u32)y0) * width + x0;
        srcLower = srcUpper + width;
        /* each component occupies an 8x8 area of the output, row stride 8 */
        dstUpper = predPartChroma + comp * 8 * 8;
        dstLower = dstUpper + 8;

        /* every pass of the inner loop produces a 2x2 group of samples:
         * out = (8 * ((8 - xFrac) * left + xFrac * right) + 32) >> 6 */
        for (rowPair = (chromaPartHeight >> 1); rowPair; rowPair--)
        {
            for (colPair = (chromaPartWidth >> 1); colPair; colPair--)
            {
                low0 = srcLower[0];
                up0  = srcUpper[0];
                low1 = srcLower[1];
                up1  = srcUpper[1];
                dstLower[0] =
                    (u8)((((leftWeight * low0 + xFrac * low1) << 3) + 32) >> 6);
                dstUpper[0] =
                    (u8)((((leftWeight * up0 + xFrac * up1) << 3) + 32) >> 6);

                low2 = srcLower[2];
                up2  = srcUpper[2];
                dstLower[1] =
                    (u8)((((leftWeight * low1 + xFrac * low2) << 3) + 32) >> 6);
                dstUpper[1] =
                    (u8)((((leftWeight * up1 + xFrac * up2) << 3) + 32) >> 6);

                srcUpper += 2;
                srcLower += 2;
                dstUpper += 2;
                dstLower += 2;
            }
            /* step to the next pair of rows */
            dstUpper += 2*8 - chromaPartWidth;
            dstLower += 2*8 - chromaPartWidth;
            srcUpper += 2*width - chromaPartWidth;
            srcLower += 2*width - chromaPartWidth;
        }
    }

}
193 
194 /*------------------------------------------------------------------------------
195 
196     Function: h264bsdInterpolateChromaVer
197 
198         Functional description:
199           This function performs chroma interpolation in vertical direction.
200           Overfilling is done only if needed. Reference image (pRef) is
201           read at correct position and the predicted part is written to
202           macroblock's chrominance (predPartChroma)
203 
204 ------------------------------------------------------------------------------*/
205 
h264bsdInterpolateChromaVer(u8 * pRef,u8 * predPartChroma,i32 x0,i32 y0,u32 width,u32 height,u32 yFrac,u32 chromaPartWidth,u32 chromaPartHeight)206 void h264bsdInterpolateChromaVer(
207   u8 *pRef,
208   u8 *predPartChroma,
209   i32 x0,
210   i32 y0,
211   u32 width,
212   u32 height,
213   u32 yFrac,
214   u32 chromaPartWidth,
215   u32 chromaPartHeight)
216 {
217 
218 /* Variables */
219 
220     u32 x, y, tmp1, tmp2, tmp3, c, val;
221     u8 *ptrA, *cbr;
222     u32 comp;
223     u8 block[9*8*2];
224 
225 /* Code */
226 
227     ASSERT(predPartChroma);
228     ASSERT(chromaPartWidth);
229     ASSERT(chromaPartHeight);
230     ASSERT(yFrac < 8);
231     ASSERT(pRef);
232 
233     if ((x0 < 0) || ((u32)x0+chromaPartWidth > width) ||
234         (y0 < 0) || ((u32)y0+chromaPartHeight+1 > height))
235     {
236         h264bsdFillBlock(pRef, block, x0, y0, width, height, chromaPartWidth,
237             chromaPartHeight + 1, chromaPartWidth);
238         pRef += width * height;
239         h264bsdFillBlock(pRef, block + chromaPartWidth*(chromaPartHeight+1),
240             x0, y0, width, height, chromaPartWidth,
241             chromaPartHeight + 1, chromaPartWidth);
242 
243         pRef = block;
244         x0 = 0;
245         y0 = 0;
246         width = chromaPartWidth;
247         height = chromaPartHeight+1;
248     }
249 
250     val = 8 - yFrac;
251 
252     for (comp = 0; comp <= 1; comp++)
253     {
254 
255         ptrA = pRef + (comp * height + (u32)y0) * width + x0;
256         cbr = predPartChroma + comp * 8 * 8;
257 
258         /* 2x2 pels per iteration
259          * bilinear vertical interpolation */
260         for (y = (chromaPartHeight >> 1); y; y--)
261         {
262             for (x = (chromaPartWidth >> 1); x; x--)
263             {
264                 tmp3 = ptrA[width*2];
265                 tmp2 = ptrA[width];
266                 tmp1 = *ptrA++;
267                 c = ((val * tmp2 + yFrac * tmp3) << 3) + 32;
268                 c >>= 6;
269                 cbr[8] = (u8)c;
270                 c = ((val * tmp1 + yFrac * tmp2) << 3) + 32;
271                 c >>= 6;
272                 *cbr++ = (u8)c;
273                 tmp3 = ptrA[width*2];
274                 tmp2 = ptrA[width];
275                 tmp1 = *ptrA++;
276                 c = ((val * tmp2 + yFrac * tmp3) << 3) + 32;
277                 c >>= 6;
278                 cbr[8] = (u8)c;
279                 c = ((val * tmp1 + yFrac * tmp2) << 3) + 32;
280                 c >>= 6;
281                 *cbr++ = (u8)c;
282             }
283             cbr += 2*8 - chromaPartWidth;
284             ptrA += 2*width - chromaPartWidth;
285         }
286     }
287 
288 }
289 #endif
290 /*------------------------------------------------------------------------------
291 
292     Function: h264bsdInterpolateChromaHorVer
293 
294         Functional description:
295           This function performs chroma interpolation in horizontal and
296           vertical direction. Overfilling is done only if needed. Reference
297           image (ref) is read at correct position and the predicted part
298           is written to macroblock's chrominance (predPartChroma)
299 
300 ------------------------------------------------------------------------------*/
301 
h264bsdInterpolateChromaHorVer(u8 * ref,u8 * predPartChroma,i32 x0,i32 y0,u32 width,u32 height,u32 xFrac,u32 yFrac,u32 chromaPartWidth,u32 chromaPartHeight)302 void h264bsdInterpolateChromaHorVer(
303   u8 *ref,
304   u8 *predPartChroma,
305   i32 x0,
306   i32 y0,
307   u32 width,
308   u32 height,
309   u32 xFrac,
310   u32 yFrac,
311   u32 chromaPartWidth,
312   u32 chromaPartHeight)
313 {
314     u8 block[9*9*2];
315     u32 x, y, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, valX, valY, plus32 = 32;
316     u32 comp;
317     u8 *ptrA, *cbr;
318 
319 /* Code */
320 
321     ASSERT(predPartChroma);
322     ASSERT(chromaPartWidth);
323     ASSERT(chromaPartHeight);
324     ASSERT(xFrac < 8);
325     ASSERT(yFrac < 8);
326     ASSERT(ref);
327 
328     if ((x0 < 0) || ((u32)x0+chromaPartWidth+1 > width) ||
329         (y0 < 0) || ((u32)y0+chromaPartHeight+1 > height))
330     {
331         h264bsdFillBlock(ref, block, x0, y0, width, height,
332             chromaPartWidth + 1, chromaPartHeight + 1, chromaPartWidth + 1);
333         ref += width * height;
334         h264bsdFillBlock(ref, block + (chromaPartWidth+1)*(chromaPartHeight+1),
335             x0, y0, width, height, chromaPartWidth + 1,
336             chromaPartHeight + 1, chromaPartWidth + 1);
337 
338         ref = block;
339         x0 = 0;
340         y0 = 0;
341         width = chromaPartWidth+1;
342         height = chromaPartHeight+1;
343     }
344 
345     valX = 8 - xFrac;
346     valY = 8 - yFrac;
347 
348     for (comp = 0; comp <= 1; comp++)
349     {
350 
351         ptrA = ref + (comp * height + (u32)y0) * width + x0;
352         cbr = predPartChroma + comp * 8 * 8;
353 
354         /* 2x2 pels per iteration
355          * bilinear vertical and horizontal interpolation */
356         for (y = (chromaPartHeight >> 1); y; y--)
357         {
358             tmp1 = *ptrA;
359             tmp3 = ptrA[width];
360             tmp5 = ptrA[width*2];
361             tmp1 *= valY;
362             tmp1 += tmp3 * yFrac;
363             tmp3 *= valY;
364             tmp3 += tmp5 * yFrac;
365             for (x = (chromaPartWidth >> 1); x; x--)
366             {
367                 tmp2 = *++ptrA;
368                 tmp4 = ptrA[width];
369                 tmp6 = ptrA[width*2];
370                 tmp2 *= valY;
371                 tmp2 += tmp4 * yFrac;
372                 tmp4 *= valY;
373                 tmp4 += tmp6 * yFrac;
374                 tmp1 = tmp1 * valX + plus32;
375                 tmp3 = tmp3 * valX + plus32;
376                 tmp1 += tmp2 * xFrac;
377                 tmp1 >>= 6;
378                 tmp3 += tmp4 * xFrac;
379                 tmp3 >>= 6;
380                 cbr[8] = (u8)tmp3;
381                 *cbr++ = (u8)tmp1;
382 
383                 tmp1 = *++ptrA;
384                 tmp3 = ptrA[width];
385                 tmp5 = ptrA[width*2];
386                 tmp1 *= valY;
387                 tmp1 += tmp3 * yFrac;
388                 tmp3 *= valY;
389                 tmp3 += tmp5 * yFrac;
390                 tmp2 = tmp2 * valX + plus32;
391                 tmp4 = tmp4 * valX + plus32;
392                 tmp2 += tmp1 * xFrac;
393                 tmp2 >>= 6;
394                 tmp4 += tmp3 * xFrac;
395                 tmp4 >>= 6;
396                 cbr[8] = (u8)tmp4;
397                 *cbr++ = (u8)tmp2;
398             }
399             cbr += 2*8 - chromaPartWidth;
400             ptrA += 2*width - chromaPartWidth;
401         }
402     }
403 
404 }
405 
406 /*------------------------------------------------------------------------------
407 
408     Function: PredictChroma
409 
410         Functional description:
411           Top level chroma prediction function that calls the appropriate
412           interpolation function. The output is written to macroblock array.
413 
414 ------------------------------------------------------------------------------*/
415 
PredictChroma(u8 * mbPartChroma,u32 xAL,u32 yAL,u32 partWidth,u32 partHeight,mv_t * mv,image_t * refPic)416 static void PredictChroma(
417   u8 *mbPartChroma,
418   u32 xAL,
419   u32 yAL,
420   u32 partWidth,
421   u32 partHeight,
422   mv_t *mv,
423   image_t *refPic)
424 {
425 
426 /* Variables */
427 
428     u32 xFrac, yFrac, width, height, chromaPartWidth, chromaPartHeight;
429     i32 xInt, yInt;
430     u8 *ref;
431 
432 /* Code */
433 
434     ASSERT(mv);
435     ASSERT(refPic);
436     ASSERT(refPic->data);
437     ASSERT(refPic->width);
438     ASSERT(refPic->height);
439 
440     width  = 8 * refPic->width;
441     height = 8 * refPic->height;
442 
443     xInt = (xAL >> 1) + (mv->hor >> 3);
444     yInt = (yAL >> 1) + (mv->ver >> 3);
445     xFrac = mv->hor & 0x7;
446     yFrac = mv->ver & 0x7;
447 
448     chromaPartWidth  = partWidth >> 1;
449     chromaPartHeight = partHeight >> 1;
450     ref = refPic->data + 256 * refPic->width * refPic->height;
451 
452     if (xFrac && yFrac)
453     {
454         h264bsdInterpolateChromaHorVer(ref, mbPartChroma, xInt, yInt, width,
455                 height, xFrac, yFrac, chromaPartWidth, chromaPartHeight);
456     }
457     else if (xFrac)
458     {
459         h264bsdInterpolateChromaHor(ref, mbPartChroma, xInt, yInt, width,
460                 height, xFrac, chromaPartWidth, chromaPartHeight);
461     }
462     else if (yFrac)
463     {
464         h264bsdInterpolateChromaVer(ref, mbPartChroma, xInt, yInt, width,
465                 height, yFrac, chromaPartWidth, chromaPartHeight);
466     }
467     else
468     {
469         h264bsdFillBlock(ref, mbPartChroma, xInt, yInt, width, height,
470             chromaPartWidth, chromaPartHeight, 8);
471         ref += width * height;
472         h264bsdFillBlock(ref, mbPartChroma + 8*8, xInt, yInt, width, height,
473             chromaPartWidth, chromaPartHeight, 8);
474     }
475 
476 }
477 
478 
479 /*------------------------------------------------------------------------------
480 
481     Function: h264bsdInterpolateVerHalf
482 
483         Functional description:
484           Function to perform vertical interpolation of pixel position 'h'
485           for a block. Overfilling is done only if needed. Reference
486           image (ref) is read at correct position and the predicted part
487           is written to macroblock array (mb)
488 
489 ------------------------------------------------------------------------------*/
490 #ifndef H264DEC_ARM11
void h264bsdInterpolateVerHalf(
  u8 *ref,
  u8 *mb,
  i32 x0,
  i32 y0,
  u32 width,
  u32 height,
  u32 partWidth,
  u32 partHeight)
{
    /* scratch buffer sized for 21*21 samples, used when the area to be read
     * is not completely inside the picture */
    u32 p1[21*21/4+1];
    u32 rowGroup, col, k;
    i32 r;
    i32 tap[9];
    i32 sum;
    const u8 *src;
    /* clipping table, offset by 512 so that the possibly negative filter
     * output can be used as index directly */
    const u8 *clp = h264bsdClip + 512;

    /* Code */

    ASSERT(ref);
    ASSERT(mb);

    /* the filter needs 5 rows in addition to the part height */
    if ((x0 < 0) || ((u32)x0+partWidth > width) ||
        (y0 < 0) || ((u32)y0+partHeight+5 > height))
    {
        h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
                partWidth, partHeight+5, partWidth);

        x0 = 0;
        y0 = 0;
        ref = (u8*)p1;
        width = partWidth;
    }

    /* top-left sample of the filter window */
    src = ref + ((u32)y0 * width + (u32)x0);

    /* Four output rows per outer pass. For one column the 9 source samples
     * covering those rows are fetched, then each output row r is filtered
     * from samples r..r+5 with taps (1, -5, 20, 20, -5, 1):
     *   out = clip((s0 - 5*s1 + 20*s2 + 20*s3 - 5*s4 + s5 + 16) >> 5)
     * Rows of the output are 16 bytes apart. */
    for (rowGroup = (partHeight >> 2); rowGroup; rowGroup--)
    {
        for (col = partWidth; col; col--)
        {
            for (k = 0; k < 9; k++)
            {
                tap[k] = src[k * width];
            }

            for (r = 3; r >= 0; r--)
            {
                sum = tap[r] + tap[r + 5]
                    - 5 * (tap[r + 1] + tap[r + 4])
                    + 20 * (tap[r + 2] + tap[r + 3])
                    + 16;
                mb[r * 16] = clp[sum >> 5];
            }

            src++;
            mb++;
        }
        /* step four rows down in source and output */
        src += 4*width - partWidth;
        mb += 4*16 - partWidth;
    }

}
596 
597 /*------------------------------------------------------------------------------
598 
599     Function: h264bsdInterpolateVerQuarter
600 
601         Functional description:
602           Function to perform vertical interpolation of pixel position 'd'
603           or 'n' for a block. Overfilling is done only if needed. Reference
604           image (ref) is read at correct position and the predicted part
605           is written to macroblock array (mb)
606 
607 ------------------------------------------------------------------------------*/
608 
h264bsdInterpolateVerQuarter(u8 * ref,u8 * mb,i32 x0,i32 y0,u32 width,u32 height,u32 partWidth,u32 partHeight,u32 verOffset)609 void h264bsdInterpolateVerQuarter(
610   u8 *ref,
611   u8 *mb,
612   i32 x0,
613   i32 y0,
614   u32 width,
615   u32 height,
616   u32 partWidth,
617   u32 partHeight,
618   u32 verOffset)    /* 0 for pixel d, 1 for pixel n */
619 {
620     u32 p1[21*21/4+1];
621     u32 i, j;
622     i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
623     u8 *ptrC, *ptrV, *ptrInt;
624     const u8 *clp = h264bsdClip + 512;
625 
626     /* Code */
627 
628     ASSERT(ref);
629     ASSERT(mb);
630 
631     if ((x0 < 0) || ((u32)x0+partWidth > width) ||
632         (y0 < 0) || ((u32)y0+partHeight+5 > height))
633     {
634         h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
635                 partWidth, partHeight+5, partWidth);
636 
637         x0 = 0;
638         y0 = 0;
639         ref = (u8*)p1;
640         width = partWidth;
641     }
642 
643     ref += (u32)y0 * width + (u32)x0;
644 
645     ptrC = ref + width;
646     ptrV = ptrC + 5*width;
647 
648     /* Pointer to integer sample position, either M or R */
649     ptrInt = ptrC + (2+verOffset)*width;
650 
651     /* 4 pixels per iteration
652      * interpolate using 5 vertical samples and average between
653      * interpolated value and integer sample value */
654     for (i = (partHeight >> 2); i; i--)
655     {
656         /* h1 = (16 + A + 16(G+M) + 4(G+M) - 4(C+R) - (C+R) + T) >> 5 */
657         for (j = partWidth; j; j--)
658         {
659             tmp4 = ptrV[-(i32)width*2];
660             tmp5 = ptrV[-(i32)width];
661             tmp1 = ptrV[width];
662             tmp2 = ptrV[width*2];
663             tmp6 = *ptrV++;
664 
665             tmp7 = tmp4 + tmp1;
666             tmp2 -= (tmp7 << 2);
667             tmp2 -= tmp7;
668             tmp2 += 16;
669             tmp7 = tmp5 + tmp6;
670             tmp3 = ptrC[width*2];
671             tmp2 += (tmp7 << 4);
672             tmp2 += (tmp7 << 2);
673             tmp2 += tmp3;
674             tmp2 = clp[tmp2>>5];
675             tmp7 = ptrInt[width*2];
676             tmp1 += 16;
677             tmp2++;
678             mb[48] = (u8)((tmp2 + tmp7) >> 1);
679 
680             tmp7 = tmp3 + tmp6;
681             tmp1 -= (tmp7 << 2);
682             tmp1 -= tmp7;
683             tmp7 = tmp4 + tmp5;
684             tmp2 = ptrC[width];
685             tmp1 += (tmp7 << 4);
686             tmp1 += (tmp7 << 2);
687             tmp1 += tmp2;
688             tmp1 = clp[tmp1>>5];
689             tmp7 = ptrInt[width];
690             tmp6 += 16;
691             tmp1++;
692             mb[32] = (u8)((tmp1 + tmp7) >> 1);
693 
694             tmp7 = tmp2 + tmp5;
695             tmp6 -= (tmp7 << 2);
696             tmp6 -= tmp7;
697             tmp7 = tmp4 + tmp3;
698             tmp1 = *ptrC;
699             tmp6 += (tmp7 << 4);
700             tmp6 += (tmp7 << 2);
701             tmp6 += tmp1;
702             tmp6 = clp[tmp6>>5];
703             tmp7 = *ptrInt;
704             tmp5 += 16;
705             tmp6++;
706             mb[16] = (u8)((tmp6 + tmp7) >> 1);
707 
708             tmp1 += tmp4;
709             tmp5 -= (tmp1 << 2);
710             tmp5 -= tmp1;
711             tmp3 += tmp2;
712             tmp6 = ptrC[-(i32)width];
713             tmp5 += (tmp3 << 4);
714             tmp5 += (tmp3 << 2);
715             tmp5 += tmp6;
716             tmp5 = clp[tmp5>>5];
717             tmp7 = ptrInt[-(i32)width];
718             tmp5++;
719             *mb++ = (u8)((tmp5 + tmp7) >> 1);
720             ptrC++;
721             ptrInt++;
722         }
723         ptrC += 4*width - partWidth;
724         ptrV += 4*width - partWidth;
725         ptrInt += 4*width - partWidth;
726         mb += 4*16 - partWidth;
727     }
728 
729 }
730 
731 /*------------------------------------------------------------------------------
732 
733     Function: h264bsdInterpolateHorHalf
734 
735         Functional description:
736           Function to perform horizontal interpolation of pixel position 'b'
737           for a block. Overfilling is done only if needed. Reference
738           image (ref) is read at correct position and the predicted part
739           is written to macroblock array (mb)
740 
741 ------------------------------------------------------------------------------*/
742 
h264bsdInterpolateHorHalf(u8 * ref,u8 * mb,i32 x0,i32 y0,u32 width,u32 height,u32 partWidth,u32 partHeight)743 void h264bsdInterpolateHorHalf(
744   u8 *ref,
745   u8 *mb,
746   i32 x0,
747   i32 y0,
748   u32 width,
749   u32 height,
750   u32 partWidth,
751   u32 partHeight)
752 {
753     u32 p1[21*21/4+1];
754     u8 *ptrJ;
755     u32 x, y;
756     i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
757     const u8 *clp = h264bsdClip + 512;
758 
759     /* Code */
760 
761     ASSERT(ref);
762     ASSERT(mb);
763     ASSERT((partWidth&0x3) == 0);
764     ASSERT((partHeight&0x3) == 0);
765 
766     if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||
767         (y0 < 0) || ((u32)y0+partHeight > height))
768     {
769         h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
770                 partWidth+5, partHeight, partWidth+5);
771 
772         x0 = 0;
773         y0 = 0;
774         ref = (u8*)p1;
775         width = partWidth + 5;
776     }
777 
778     ref += (u32)y0 * width + (u32)x0;
779 
780     ptrJ = ref + 5;
781 
782     for (y = partHeight; y; y--)
783     {
784         tmp6 = *(ptrJ - 5);
785         tmp5 = *(ptrJ - 4);
786         tmp4 = *(ptrJ - 3);
787         tmp3 = *(ptrJ - 2);
788         tmp2 = *(ptrJ - 1);
789 
790         /* calculate 4 pels per iteration */
791         for (x = (partWidth >> 2); x; x--)
792         {
793             /* First pixel */
794             tmp6 += 16;
795             tmp7 = tmp3 + tmp4;
796             tmp6 += (tmp7 << 4);
797             tmp6 += (tmp7 << 2);
798             tmp7 = tmp2 + tmp5;
799             tmp1 = *ptrJ++;
800             tmp6 -= (tmp7 << 2);
801             tmp6 -= tmp7;
802             tmp6 += tmp1;
803             tmp6 = clp[tmp6>>5];
804             /* Second pixel */
805             tmp5 += 16;
806             tmp7 = tmp2 + tmp3;
807             *mb++ = (u8)tmp6;
808             tmp5 += (tmp7 << 4);
809             tmp5 += (tmp7 << 2);
810             tmp7 = tmp1 + tmp4;
811             tmp6 = *ptrJ++;
812             tmp5 -= (tmp7 << 2);
813             tmp5 -= tmp7;
814             tmp5 += tmp6;
815             tmp5 = clp[tmp5>>5];
816             /* Third pixel */
817             tmp4 += 16;
818             tmp7 = tmp1 + tmp2;
819             *mb++ = (u8)tmp5;
820             tmp4 += (tmp7 << 4);
821             tmp4 += (tmp7 << 2);
822             tmp7 = tmp6 + tmp3;
823             tmp5 = *ptrJ++;
824             tmp4 -= (tmp7 << 2);
825             tmp4 -= tmp7;
826             tmp4 += tmp5;
827             tmp4 = clp[tmp4>>5];
828             /* Fourth pixel */
829             tmp3 += 16;
830             tmp7 = tmp6 + tmp1;
831             *mb++ = (u8)tmp4;
832             tmp3 += (tmp7 << 4);
833             tmp3 += (tmp7 << 2);
834             tmp7 = tmp5 + tmp2;
835             tmp4 = *ptrJ++;
836             tmp3 -= (tmp7 << 2);
837             tmp3 -= tmp7;
838             tmp3 += tmp4;
839             tmp3 = clp[tmp3>>5];
840             tmp7 = tmp4;
841             tmp4 = tmp6;
842             tmp6 = tmp2;
843             tmp2 = tmp7;
844             *mb++ = (u8)tmp3;
845             tmp3 = tmp5;
846             tmp5 = tmp1;
847         }
848         ptrJ += width - partWidth;
849         mb += 16 - partWidth;
850     }
851 
852 }
853 
854 /*------------------------------------------------------------------------------
855 
856     Function: h264bsdInterpolateHorQuarter
857 
858         Functional description:
859           Function to perform horizontal interpolation of pixel position 'a'
860           or 'c' for a block. Overfilling is done only if needed. Reference
861           image (ref) is read at correct position and the predicted part
862           is written to macroblock array (mb)
863 
864 ------------------------------------------------------------------------------*/
865 
h264bsdInterpolateHorQuarter(u8 * ref,u8 * mb,i32 x0,i32 y0,u32 width,u32 height,u32 partWidth,u32 partHeight,u32 horOffset)866 void h264bsdInterpolateHorQuarter(
867   u8 *ref,
868   u8 *mb,
869   i32 x0,
870   i32 y0,
871   u32 width,
872   u32 height,
873   u32 partWidth,
874   u32 partHeight,
875   u32 horOffset) /* 0 for pixel a, 1 for pixel c */
876 {
877     u32 p1[21*21/4+1];
878     u8 *ptrJ;
879     u32 x, y;
880     i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
881     const u8 *clp = h264bsdClip + 512;
882 
883     /* Code */
884 
885     ASSERT(ref);
886     ASSERT(mb);
887 
888     if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||
889         (y0 < 0) || ((u32)y0+partHeight > height))
890     {
891         h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
892                 partWidth+5, partHeight, partWidth+5);
893 
894         x0 = 0;
895         y0 = 0;
896         ref = (u8*)p1;
897         width = partWidth + 5;
898     }
899 
900     ref += (u32)y0 * width + (u32)x0;
901 
902     ptrJ = ref + 5;
903 
904     for (y = partHeight; y; y--)
905     {
906         tmp6 = *(ptrJ - 5);
907         tmp5 = *(ptrJ - 4);
908         tmp4 = *(ptrJ - 3);
909         tmp3 = *(ptrJ - 2);
910         tmp2 = *(ptrJ - 1);
911 
912         /* calculate 4 pels per iteration */
913         for (x = (partWidth >> 2); x; x--)
914         {
915             /* First pixel */
916             tmp6 += 16;
917             tmp7 = tmp3 + tmp4;
918             tmp6 += (tmp7 << 4);
919             tmp6 += (tmp7 << 2);
920             tmp7 = tmp2 + tmp5;
921             tmp1 = *ptrJ++;
922             tmp6 -= (tmp7 << 2);
923             tmp6 -= tmp7;
924             tmp6 += tmp1;
925             tmp6 = clp[tmp6>>5];
926             tmp5 += 16;
927             if (!horOffset)
928                 tmp6 += tmp4;
929             else
930                 tmp6 += tmp3;
931             *mb++ = (u8)((tmp6 + 1) >> 1);
932             /* Second pixel */
933             tmp7 = tmp2 + tmp3;
934             tmp5 += (tmp7 << 4);
935             tmp5 += (tmp7 << 2);
936             tmp7 = tmp1 + tmp4;
937             tmp6 = *ptrJ++;
938             tmp5 -= (tmp7 << 2);
939             tmp5 -= tmp7;
940             tmp5 += tmp6;
941             tmp5 = clp[tmp5>>5];
942             tmp4 += 16;
943             if (!horOffset)
944                 tmp5 += tmp3;
945             else
946                 tmp5 += tmp2;
947             *mb++ = (u8)((tmp5 + 1) >> 1);
948             /* Third pixel */
949             tmp7 = tmp1 + tmp2;
950             tmp4 += (tmp7 << 4);
951             tmp4 += (tmp7 << 2);
952             tmp7 = tmp6 + tmp3;
953             tmp5 = *ptrJ++;
954             tmp4 -= (tmp7 << 2);
955             tmp4 -= tmp7;
956             tmp4 += tmp5;
957             tmp4 = clp[tmp4>>5];
958             tmp3 += 16;
959             if (!horOffset)
960                 tmp4 += tmp2;
961             else
962                 tmp4 += tmp1;
963             *mb++ = (u8)((tmp4 + 1) >> 1);
964             /* Fourth pixel */
965             tmp7 = tmp6 + tmp1;
966             tmp3 += (tmp7 << 4);
967             tmp3 += (tmp7 << 2);
968             tmp7 = tmp5 + tmp2;
969             tmp4 = *ptrJ++;
970             tmp3 -= (tmp7 << 2);
971             tmp3 -= tmp7;
972             tmp3 += tmp4;
973             tmp3 = clp[tmp3>>5];
974             if (!horOffset)
975                 tmp3 += tmp1;
976             else
977                 tmp3 += tmp6;
978             *mb++ = (u8)((tmp3 + 1) >> 1);
979             tmp3 = tmp5;
980             tmp5 = tmp1;
981             tmp7 = tmp4;
982             tmp4 = tmp6;
983             tmp6 = tmp2;
984             tmp2 = tmp7;
985         }
986         ptrJ += width - partWidth;
987         mb += 16 - partWidth;
988     }
989 
990 }
991 
992 /*------------------------------------------------------------------------------
993 
994     Function: h264bsdInterpolateHorVerQuarter
995 
996         Functional description:
997           Function to perform horizontal and vertical interpolation of pixel
998           position 'e', 'g', 'p' or 'r' for a block. Overfilling is done only
999           if needed. Reference image (ref) is read at correct position and
1000           the predicted part is written to macroblock array (mb)
1001 
1002 ------------------------------------------------------------------------------*/
1003 
void h264bsdInterpolateHorVerQuarter(
  u8 *ref,
  u8 *mb,
  i32 x0,
  i32 y0,
  u32 width,
  u32 height,
  u32 partWidth,
  u32 partHeight,
  u32 horVerOffset) /* 0 for pixel e, 1 for pixel g,
                       2 for pixel p, 3 for pixel r */
{
    /*
     * ref     reference picture, row stride 'width' bytes
     * mb      output block, row stride 16 bytes
     * x0,y0   top-left corner of the (partWidth+5) x (partHeight+5) source
     *         window read by the filters
     *
     * Bit 0 of horVerOffset selects the column used by the vertical filter,
     * bit 1 selects the row used by the horizontal filter. The output is
     * the rounded average of the two half-sample results.
     *
     * Pixels are produced four at a time, so partWidth and partHeight are
     * expected to be multiples of four; the scratch buffer holds at most
     * 21x21 bytes, i.e. partitions up to 16x16.
     * NOTE(review): ref and mb are assumed not to overlap.
     */
    u32 fill[21*21/4+1];
    const u8 *clip = h264bsdClip + 512;
    const u8 *hSrc;
    const u8 *vSrc;
    const u8 *taps;
    u32 col, row, line;
    i32 acc;

    /* Code */

    ASSERT(ref);
    ASSERT(mb);

    /* Copy the source window into the scratch buffer (with overfill) when
     * it is not completely inside the picture */
    if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||
        (y0 < 0) || ((u32)y0+partHeight+5 > height))
    {
        h264bsdFillBlock(ref, (u8*)fill, x0, y0, width, height,
                partWidth+5, partHeight+5, partWidth+5);

        x0 = 0;
        y0 = 0;
        ref = (u8*)fill;
        width = partWidth+5;
    }

    /* ref points to G + (-2, -2) */
    ref += (u32)y0 * width + (u32)x0;

    /* Horizontal pass: leftmost filter tap of window row 2 or 3, depending
     * on the vertical offset bit */
    hSrc = ref + (((horVerOffset & 0x2) >> 1) + 2) * width;

    for (row = partHeight; row; row--)
    {
        /* whole groups of four pixels only */
        for (col = (partWidth >> 2) << 2; col; col--)
        {
            acc = hSrc[0] + hSrc[5] + 16;
            acc -= 5 * (hSrc[1] + hSrc[4]);
            acc += 20 * (hSrc[2] + hSrc[3]);
            *mb++ = clip[acc >> 5];
            hSrc++;
        }
        hSrc += width - partWidth;
        mb += 16 - partWidth;
    }

    /* Rewind the output pointer to the first row */
    mb -= 16*partHeight;

    /* Vertical pass: topmost filter tap of window column 2 or 3, depending
     * on the horizontal offset bit. Each result is averaged with the
     * horizontal result already stored in mb. */
    vSrc = ref + 2 + (horVerOffset & 0x1);

    for (row = (partHeight >> 2); row; row--)
    {
        for (col = partWidth; col; col--)
        {
            /* four output rows per column */
            for (line = 0; line < 4; line++)
            {
                taps = vSrc + line * width;
                acc = taps[0] + taps[5*width] + 16;
                acc -= 5 * (taps[width] + taps[4*width]);
                acc += 20 * (taps[2*width] + taps[3*width]);
                mb[16*line] = (u8)((mb[16*line] + clip[acc >> 5] + 1) >> 1);
            }
            vSrc++;
            mb++;
        }
        vSrc += 4*width - partWidth;
        mb += 4*16 - partWidth;
    }

}
1196 #endif
1197 
1198 /*------------------------------------------------------------------------------
1199 
1200     Function: h264bsdInterpolateMidHalf
1201 
1202         Functional description:
1203           Function to perform horizontal and vertical interpolation of pixel
1204           position 'j' for a block. Overfilling is done only if needed.
1205           Reference image (ref) is read at correct position and the predicted
1206           part is written to macroblock array (mb)
1207 
1208 ------------------------------------------------------------------------------*/
1209 
void h264bsdInterpolateMidHalf(
  u8 *ref,
  u8 *mb,
  i32 x0,
  i32 y0,
  u32 width,
  u32 height,
  u32 partWidth,
  u32 partHeight)
{
    /*
     * ref     reference picture, row stride 'width' bytes
     * mb      output block, row stride 16 bytes
     * x0,y0   top-left corner of the (partWidth+5) x (partHeight+5) source
     *         window read by the filters
     *
     * Two passes: a horizontal 6-tap filter (1,-5,20,20,-5,1) over every
     * window row stores unrounded sums in 'table'; a vertical 6-tap filter
     * over those sums, rounded with +512 and >>10, gives the output.
     *
     * Pixels are produced four at a time, so partWidth and partHeight are
     * expected to be multiples of four; the local buffers are sized for
     * partitions up to 16x16.
     */
    u32 p1[21*21/4+1];
    i32 table[21*16];
    /* Biased pointer into the clip table, presumably so that negative
     * filter results still index inside it */
    const u8 *clp = h264bsdClip + 512;
    const u8 *src;
    const i32 *mid;
    const i32 *taps;
    i32 *out;
    i32 sum;
    u32 x, y, line;

    /* Code */

    ASSERT(ref);
    ASSERT(mb);

    /* Copy the source window into the scratch buffer (with overfill) when
     * it is not completely inside the picture */
    if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||
        (y0 < 0) || ((u32)y0+partHeight+5 > height))
    {
        h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
                partWidth+5, partHeight+5, partWidth+5);

        x0 = 0;
        y0 = 0;
        ref = (u8*)p1;
        width = partWidth+5;
    }

    ref += (u32)y0 * width + (u32)x0;

    /* First step: calculate intermediate values for
     * horizontal interpolation */
    src = ref;
    out = table;
    for (y = partHeight + 5; y; y--)
    {
        /* whole groups of four pixels only */
        for (x = (partWidth >> 2) << 2; x; x--)
        {
            sum = src[0] + src[5];
            sum -= 5 * (src[1] + src[4]);
            sum += 20 * (src[2] + src[3]);
            *out++ = sum;
            src++;
        }
        src += width - partWidth;
    }

    /* Second step: calculate vertical interpolation.
     * The intermediate sums may be negative, so the taps are weighted with
     * multiplications: left-shifting a negative value is undefined
     * behaviour in C. The final >>10 still relies on an arithmetic right
     * shift for negative sums. */
    mid = table;
    for (y = (partHeight >> 2); y; y--)
    {
        for (x = partWidth; x; x--)
        {
            /* four output rows per column */
            for (line = 0; line < 4; line++)
            {
                taps = mid + line * partWidth;
                sum = taps[0] + taps[5*partWidth] + 512;
                sum -= 5 * (taps[partWidth] + taps[4*partWidth]);
                sum += 20 * (taps[2*partWidth] + taps[3*partWidth]);
                mb[16*line] = clp[sum >> 10];
            }
            mid++;
            mb++;
        }
        mb += 4*16 - partWidth;
        mid += 3*partWidth;
    }

}
1382 
1383 
1384 /*------------------------------------------------------------------------------
1385 
1386     Function: h264bsdInterpolateMidVerQuarter
1387 
1388         Functional description:
1389           Function to perform horizontal and vertical interpolation of pixel
1390           position 'f' or 'q' for a block. Overfilling is done only if needed.
1391           Reference image (ref) is read at correct position and the predicted
1392           part is written to macroblock array (mb)
1393 
1394 ------------------------------------------------------------------------------*/
1395 
void h264bsdInterpolateMidVerQuarter(
  u8 *ref,
  u8 *mb,
  i32 x0,
  i32 y0,
  u32 width,
  u32 height,
  u32 partWidth,
  u32 partHeight,
  u32 verOffset)    /* 0 for pixel f, 1 for pixel q */
{
    /*
     * ref     reference picture, row stride 'width' bytes
     * mb      output block, row stride 16 bytes
     * x0,y0   top-left corner of the (partWidth+5) x (partHeight+5) source
     *         window read by the filters
     *
     * A horizontal 6-tap filter (1,-5,20,20,-5,1) over every window row
     * stores unrounded sums in 'table'. Each output pixel is the rounded
     * average of
     *   - the vertical 6-tap filter over those sums (+512, >>10), and
     *   - the horizontal sum of window row 2+verOffset relative to the
     *     output row (+16, >>5).
     *
     * Pixels are produced four at a time, so partWidth and partHeight are
     * expected to be multiples of four; the local buffers are sized for
     * partitions up to 16x16.
     */
    u32 p1[21*21/4+1];
    i32 table[21*16];
    /* Biased pointer into the clip table, presumably so that negative
     * filter results still index inside it */
    const u8 *clp = h264bsdClip + 512;
    const u8 *src;
    const i32 *mid;
    const i32 *taps;
    i32 *out;
    i32 sum, half;
    u32 x, y, line;

    /* Code */

    ASSERT(ref);
    ASSERT(mb);

    /* Copy the source window into the scratch buffer (with overfill) when
     * it is not completely inside the picture */
    if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||
        (y0 < 0) || ((u32)y0+partHeight+5 > height))
    {
        h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
                partWidth+5, partHeight+5, partWidth+5);

        x0 = 0;
        y0 = 0;
        ref = (u8*)p1;
        width = partWidth+5;
    }

    ref += (u32)y0 * width + (u32)x0;

    /* First step: calculate intermediate values for
     * horizontal interpolation */
    src = ref;
    out = table;
    for (y = partHeight + 5; y; y--)
    {
        /* whole groups of four pixels only */
        for (x = (partWidth >> 2) << 2; x; x--)
        {
            sum = src[0] + src[5];
            sum -= 5 * (src[1] + src[4]);
            sum += 20 * (src[2] + src[3]);
            *out++ = sum;
            src++;
        }
        src += width - partWidth;
    }

    /* Second step: calculate vertical interpolation and average.
     * The intermediate sums may be negative, so the taps are weighted with
     * multiplications: left-shifting a negative value is undefined
     * behaviour in C. The final right shifts still rely on an arithmetic
     * shift for negative sums. */
    mid = table;
    for (y = (partHeight >> 2); y; y--)
    {
        for (x = partWidth; x; x--)
        {
            /* four output rows per column */
            for (line = 0; line < 4; line++)
            {
                taps = mid + line * partWidth;

                /* centre (half, half) sample */
                sum = taps[0] + taps[5*partWidth] + 512;
                sum -= 5 * (taps[partWidth] + taps[4*partWidth]);
                sum += 20 * (taps[2*partWidth] + taps[3*partWidth]);

                /* horizontal half sample on the selected row */
                half = taps[(2+verOffset)*partWidth] + 16;

                mb[16*line] =
                    (u8)((clp[sum >> 10] + clp[half >> 5] + 1) >> 1);
            }
            mid++;
            mb++;
        }
        mb += 4*16 - partWidth;
        mid += 3*partWidth;
    }

}
1586 
1587 
1588 /*------------------------------------------------------------------------------
1589 
1590     Function: h264bsdInterpolateMidHorQuarter
1591 
1592         Functional description:
1593           Function to perform horizontal and vertical interpolation of pixel
1594           position 'i' or 'k' for a block. Overfilling is done only if needed.
1595           Reference image (ref) is read at correct position and the predicted
1596           part is written to macroblock array (mb)
1597 
1598 ------------------------------------------------------------------------------*/
1599 
void h264bsdInterpolateMidHorQuarter(
  u8 *ref,
  u8 *mb,
  i32 x0,
  i32 y0,
  u32 width,
  u32 height,
  u32 partWidth,
  u32 partHeight,
  u32 horOffset)    /* 0 for pixel i, 1 for pixel k */
{
    /*
     * ref     reference picture, row stride 'width' bytes
     * mb      output block, row stride 16 bytes
     * x0,y0   top-left corner of the (partWidth+5) x (partHeight+5) source
     *         window read by the filters
     *
     * A vertical 6-tap filter (1,-5,20,20,-5,1) over every window column
     * stores unrounded sums in 'table' (partWidth+5 entries per row). Each
     * output pixel is the rounded average of
     *   - the horizontal 6-tap filter over those sums (+512, >>10), and
     *   - the vertical sum of window column 2+horOffset relative to the
     *     output column (+16, >>5).
     *
     * Pixels are produced four at a time, so partWidth and partHeight are
     * expected to be multiples of four; the local buffers are sized for
     * partitions up to 16x16.
     */
    u32 p1[21*21/4+1];
    i32 table[21*16];
    u32 tableWidth = partWidth+5;
    /* Biased pointer into the clip table, presumably so that negative
     * filter results still index inside it */
    const u8 *clp = h264bsdClip + 512;
    const u8 *src;
    const u8 *taps;
    const i32 *mid;
    i32 *out;
    i32 sum, half;
    u32 x, y, line;

    /* Code */

    ASSERT(ref);
    ASSERT(mb);

    /* Copy the source window into the scratch buffer (with overfill) when
     * it is not completely inside the picture */
    if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||
        (y0 < 0) || ((u32)y0+partHeight+5 > height))
    {
        h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
                partWidth+5, partHeight+5, partWidth+5);

        x0 = 0;
        y0 = 0;
        ref = (u8*)p1;
        width = partWidth+5;
    }

    ref += (u32)y0 * width + (u32)x0;

    /* First step: calculate intermediate values for
     * vertical interpolation */
    src = ref;
    out = table;
    for (y = (partHeight >> 2); y; y--)
    {
        for (x = tableWidth; x; x--)
        {
            /* four table rows per column */
            for (line = 0; line < 4; line++)
            {
                taps = src + line * width;
                sum = taps[0] + taps[5*width];
                sum -= 5 * (taps[width] + taps[4*width]);
                sum += 20 * (taps[2*width] + taps[3*width]);
                out[line * tableWidth] = sum;
            }
            out++;
            src++;
        }
        src += 4*width - tableWidth;
        out += 3*tableWidth;
    }

    /* Second step: calculate horizontal interpolation and average.
     * The intermediate sums may be negative, so the taps are weighted with
     * multiplications: left-shifting a negative value is undefined
     * behaviour in C. The final right shifts still rely on an arithmetic
     * shift for negative sums. */
    mid = table;
    for (y = partHeight; y; y--)
    {
        /* whole groups of four pixels only */
        for (x = (partWidth >> 2) << 2; x; x--)
        {
            /* centre (half, half) sample */
            sum = mid[0] + mid[5] + 512;
            sum -= 5 * (mid[1] + mid[4]);
            sum += 20 * (mid[2] + mid[3]);

            /* vertical half sample on the selected column */
            half = mid[2 + horOffset] + 16;

            *mb++ = (u8)((clp[sum >> 10] + clp[half >> 5] + 1) >> 1);
            mid++;
        }
        /* skip the five extra columns at the end of each table row */
        mid += 5;
        mb += 16 - partWidth;
    }

}
1792 
1793 
1794 /*------------------------------------------------------------------------------
1795 
1796     Function: h264bsdPredictSamples
1797 
1798         Functional description:
1799           This function reconstructs a prediction for a macroblock partition.
1800           The prediction is either copied or interpolated using the reference
1801           frame and the motion vector. Both luminance and chrominance parts are
1802           predicted. The prediction is stored in given macroblock array (data).
1803         Inputs:
1804           data          pointer to macroblock array (384 bytes) for output
1805           mv            pointer to motion vector used for prediction
1806           refPic        pointer to reference picture structure
1807           xA            x-coordinate for current macroblock
1808           yA            y-coordinate for current macroblock
1809           partX         x-offset for partition in macroblock
1810           partY         y-offset for partition in macroblock
1811           partWidth     width of partition
1812           partHeight    height of partition
1813         Outputs:
1814           data          macroblock array (16x16+8x8+8x8) where predicted
1815                         partition is stored at correct position
1816 
1817 ------------------------------------------------------------------------------*/
1818 
void h264bsdPredictSamples(
  u8 *data,
  mv_t *mv,
  image_t *refPic,
  u32 xA,
  u32 yA,
  u32 partX,
  u32 partY,
  u32 partWidth,
  u32 partHeight)

{

/* Variables */

    u8 *lumaOut;
    u32 picWidth, picHeight;
    u32 fracX, fracY;
    u32 pos;
    u32 sel;
    i32 col, row;

/* Code */

    ASSERT(data);
    ASSERT(mv);
    ASSERT(partWidth);
    ASSERT(partHeight);
    ASSERT(refPic);
    ASSERT(refPic->data);
    ASSERT(refPic->width);
    ASSERT(refPic->height);

    /* luma: destination of this partition inside the 16-wide luma block */
    lumaOut = data + 16*partY + partX;

    /* picture size in pixels (refPic stores it in units of 16) */
    picWidth = 16 * refPic->width;
    picHeight = 16 * refPic->height;

    /* split the quarter-pel motion vector into its fractional part ... */
    fracX = mv->hor & 0x3;
    fracY = mv->ver & 0x3;

    /* ... and the integer sample position it points to */
    col = (i32)xA + (i32)partX + (mv->hor >> 2);
    row = (i32)yA + (i32)partY + (mv->ver >> 2);

    ASSERT(lumaFracPos[xFrac = fracX, fracX][fracY] < 16);

    pos = lumaFracPos[fracX][fracY];

    /* Routines that filter horizontally start two columns to the left,
     * those that filter vertically two rows above the target sample */
    switch (pos)
    {
        case 0: /* G */
            h264bsdFillBlock(refPic->data, lumaOut, col, row,
                    picWidth, picHeight, partWidth, partHeight, 16);
            break;

        case 1: /* d */
        case 3: /* n */
            sel = (pos == 3) ? 1 : 0;
            h264bsdInterpolateVerQuarter(refPic->data, lumaOut, col, row-2,
                    picWidth, picHeight, partWidth, partHeight, sel);
            break;

        case 2: /* h */
            h264bsdInterpolateVerHalf(refPic->data, lumaOut, col, row-2,
                    picWidth, picHeight, partWidth, partHeight);
            break;

        case 4: /* a */
        case 12: /* c */
            sel = (pos == 12) ? 1 : 0;
            h264bsdInterpolateHorQuarter(refPic->data, lumaOut, col-2, row,
                    picWidth, picHeight, partWidth, partHeight, sel);
            break;

        case 8: /* b */
            h264bsdInterpolateHorHalf(refPic->data, lumaOut, col-2, row,
                    picWidth, picHeight, partWidth, partHeight);
            break;

        case 6: /* i */
        case 14: /* k */
            sel = (pos == 14) ? 1 : 0;
            h264bsdInterpolateMidHorQuarter(refPic->data, lumaOut,
                    col-2, row-2, picWidth, picHeight,
                    partWidth, partHeight, sel);
            break;

        case 9: /* f */
        case 11: /* q */
            sel = (pos == 11) ? 1 : 0;
            h264bsdInterpolateMidVerQuarter(refPic->data, lumaOut,
                    col-2, row-2, picWidth, picHeight,
                    partWidth, partHeight, sel);
            break;

        case 10: /* j */
            h264bsdInterpolateMidHalf(refPic->data, lumaOut,
                    col-2, row-2, picWidth, picHeight,
                    partWidth, partHeight);
            break;

        default: /* 5 (e), 13 (g), 7 (p); anything else is treated as r */
            if (pos == 5)
                sel = 0;
            else if (pos == 13)
                sel = 1;
            else if (pos == 7)
                sel = 2;
            else
                sel = 3;
            h264bsdInterpolateHorVerQuarter(refPic->data, lumaOut,
                    col-2, row-2, picWidth, picHeight,
                    partWidth, partHeight, sel);
            break;
    }

    /* chroma */
    PredictChroma(
      data + 16*16 + (partY>>1)*8 + (partX>>1),
      xA + partX,
      yA + partY,
      partWidth,
      partHeight,
      mv,
      refPic);

}
1942 
1943 #else /* H264DEC_OMXDL */
1944 /*------------------------------------------------------------------------------
1945 
1946     Function: h264bsdPredictSamples
1947 
1948         Functional description:
1949           This function reconstructs a prediction for a macroblock partition.
1950           The prediction is either copied or interpolated using the reference
1951           frame and the motion vector. Both luminance and chrominance parts are
1952           predicted. The prediction is stored in given macroblock array (data).
1953         Inputs:
1954           data          pointer to macroblock array (384 bytes) for output
1955           mv            pointer to motion vector used for prediction
1956           refPic        pointer to reference picture structure
1957           xA            x-coordinate for current macroblock
1958           yA            y-coordinate for current macroblock
1959           partX         x-offset for partition in macroblock
1960           partY         y-offset for partition in macroblock
1961           partWidth     width of partition
1962           partHeight    height of partition
1963         Outputs:
1964           data          macroblock array (16x16+8x8+8x8) where predicted
1965                         partition is stored at correct position
1966 
1967 ------------------------------------------------------------------------------*/
1968 
1969 /*lint -e{550} Symbol 'res' not accessed */
h264bsdPredictSamples(u8 * data,mv_t * mv,image_t * refPic,u32 colAndRow,u32 part,u8 * pFill)1970 void h264bsdPredictSamples(
1971   u8 *data,
1972   mv_t *mv,
1973   image_t *refPic,
1974   u32 colAndRow,
1975   u32 part,
1976   u8 *pFill)
1977 
1978 {
1979 
1980 /* Variables */
1981 
1982     u32 xFrac, yFrac;
1983     u32 width, height;
1984     i32 xInt, yInt, x0, y0;
1985     u8 *partData, *ref;
1986     OMXSize roi;
1987     u32 fillWidth;
1988     u32 fillHeight;
1989     OMXResult res;
1990     u32 xA, yA;
1991     u32 partX, partY;
1992     u32 partWidth, partHeight;
1993 
1994 /* Code */
1995 
1996     ASSERT(data);
1997     ASSERT(mv);
1998     ASSERT(refPic);
1999     ASSERT(refPic->data);
2000     ASSERT(refPic->width);
2001     ASSERT(refPic->height);
2002 
2003     xA = (colAndRow & 0xFFFF0000) >> 16;
2004     yA = (colAndRow & 0x0000FFFF);
2005 
2006     partX = (part & 0xFF000000) >> 24;
2007     partY = (part & 0x00FF0000) >> 16;
2008     partWidth = (part & 0x0000FF00) >> 8;
2009     partHeight = (part & 0x000000FF);
2010 
2011     ASSERT(partWidth);
2012     ASSERT(partHeight);
2013 
2014     /* luma */
2015     partData = data + 16*partY + partX;
2016 
2017     xFrac = mv->hor & 0x3;
2018     yFrac = mv->ver & 0x3;
2019 
2020     width = 16 * refPic->width;
2021     height = 16 * refPic->height;
2022 
2023     xInt = (i32)xA + (i32)partX + (mv->hor >> 2);
2024     yInt = (i32)yA + (i32)partY + (mv->ver >> 2);
2025 
2026     x0 = (xFrac) ? xInt-2 : xInt;
2027     y0 = (yFrac) ? yInt-2 : yInt;
2028 
2029     if (xFrac)
2030     {
2031         if (partWidth == 16)
2032             fillWidth = 32;
2033         else
2034             fillWidth = 16;
2035     }
2036     else
2037         fillWidth = (partWidth*2);
2038     if (yFrac)
2039         fillHeight = partHeight+5;
2040     else
2041         fillHeight = partHeight;
2042 
2043 
2044     if ((x0 < 0) || ((u32)x0+fillWidth > width) ||
2045         (y0 < 0) || ((u32)y0+fillHeight > height))
2046     {
2047         h264bsdFillBlock(refPic->data, (u8*)pFill, x0, y0, width, height,
2048                 fillWidth, fillHeight, fillWidth);
2049 
2050         x0 = 0;
2051         y0 = 0;
2052         ref = pFill;
2053         width = fillWidth;
2054         if (yFrac)
2055             ref += 2*width;
2056         if (xFrac)
2057             ref += 2;
2058     }
2059     else
2060     {
2061         /*lint --e(737) Loss of sign */
2062         ref = refPic->data + yInt*width + xInt;
2063     }
2064     /* Luma interpolation */
2065     roi.width = (i32)partWidth;
2066     roi.height = (i32)partHeight;
2067 
2068     res = omxVCM4P10_InterpolateLuma(ref, (i32)width, partData, 16,
2069                                         (i32)xFrac, (i32)yFrac, roi);
2070     ASSERT(res == 0);
2071 
2072     /* Chroma */
2073     width  = 8 * refPic->width;
2074     height = 8 * refPic->height;
2075 
2076     x0 = ((xA + partX) >> 1) + (mv->hor >> 3);
2077     y0 = ((yA + partY) >> 1) + (mv->ver >> 3);
2078     xFrac = mv->hor & 0x7;
2079     yFrac = mv->ver & 0x7;
2080 
2081     ref = refPic->data + 256 * refPic->width * refPic->height;
2082 
2083     roi.width = (i32)(partWidth >> 1);
2084     fillWidth = ((partWidth >> 1) + 8) & ~0x7;
2085     roi.height = (i32)(partHeight >> 1);
2086     fillHeight = (partHeight >> 1) + 1;
2087 
2088     if ((x0 < 0) || ((u32)x0+fillWidth > width) ||
2089         (y0 < 0) || ((u32)y0+fillHeight > height))
2090     {
2091         h264bsdFillBlock(ref, pFill, x0, y0, width, height,
2092             fillWidth, fillHeight, fillWidth);
2093         ref += width * height;
2094         h264bsdFillBlock(ref, pFill + fillWidth*fillHeight,
2095             x0, y0, width, height, fillWidth,
2096             fillHeight, fillWidth);
2097 
2098         ref = pFill;
2099         x0 = 0;
2100         y0 = 0;
2101         width = fillWidth;
2102         height = fillHeight;
2103     }
2104 
2105     partData = data + 16*16 + (partY>>1)*8 + (partX>>1);
2106 
2107     /* Chroma interpolation */
2108     /*lint --e(737) Loss of sign */
2109     ref += y0 * width + x0;
2110     res = armVCM4P10_Interpolate_Chroma(ref, width, partData, 8,
2111                             (u32)roi.width, (u32)roi.height, xFrac, yFrac);
2112     ASSERT(res == 0);
2113     partData += 8 * 8;
2114     ref += height * width;
2115     res = armVCM4P10_Interpolate_Chroma(ref, width, partData, 8,
2116                             (u32)roi.width, (u32)roi.height, xFrac, yFrac);
2117     ASSERT(res == 0);
2118 
2119 }
2120 
2121 #endif /* H264DEC_OMXDL */
2122 
2123 
2124 /*------------------------------------------------------------------------------
2125 
2126     Function: FillRow1
2127 
2128         Functional description:
2129           This function gets a row of reference pels in a 'normal' case when no
2130           overfilling is necessary.
2131 
2132 ------------------------------------------------------------------------------*/
2133 
FillRow1(u8 * ref,u8 * fill,i32 left,i32 center,i32 right)2134 static void FillRow1(
2135   u8 *ref,
2136   u8 *fill,
2137   i32 left,
2138   i32 center,
2139   i32 right)
2140 {
2141     UNUSED(left);
2142     UNUSED(right);
2143     ASSERT(ref);
2144     ASSERT(fill);
2145 
2146     H264SwDecMemcpy(fill, ref, (u32)center);
2147 
2148     /*lint -e(715) */
2149 }
2150 
2151 
2152 /*------------------------------------------------------------------------------
2153 
2154     Function: h264bsdFillRow7
2155 
2156         Functional description:
2157           This function gets a row of reference pels when horizontal coordinate
2158           is partly negative or partly greater than reference picture width
2159           (overfilling some pels on left and/or right edge).
2160         Inputs:
2161           ref       pointer to reference samples
2162           left      amount of pixels to overfill on left-edge
2163           center    amount of pixels to copy
2164           right     amount of pixels to overfill on right-edge
2165         Outputs:
2166           fill      pointer where samples are stored
2167 
2168 ------------------------------------------------------------------------------*/
2169 #ifndef H264DEC_NEON
h264bsdFillRow7(u8 * ref,u8 * fill,i32 left,i32 center,i32 right)2170 void h264bsdFillRow7(
2171   u8 *ref,
2172   u8 *fill,
2173   i32 left,
2174   i32 center,
2175   i32 right)
2176 {
2177     u8 tmp;
2178 
2179     ASSERT(ref);
2180     ASSERT(fill);
2181 
2182     if (left)
2183         tmp = *ref;
2184 
2185     for ( ; left; left--)
2186         /*lint -esym(644,tmp)  tmp is initialized if used */
2187         *fill++ = tmp;
2188 
2189     for ( ; center; center--)
2190         *fill++ = *ref++;
2191 
2192     if (right)
2193         tmp = ref[-1];
2194 
2195     for ( ; right; right--)
2196         /*lint -esym(644,tmp)  tmp is initialized if used */
2197         *fill++ = tmp;
2198 }
2199 #endif
2200 /*------------------------------------------------------------------------------
2201 
2202     Function: h264bsdFillBlock
2203 
2204         Functional description:
2205           This function gets a block of reference pels. It determines whether
2206           overfilling is needed or not and repeatedly calls an appropriate
2207           function (by using a function pointer) that fills one row the block.
2208         Inputs:
2209           ref               pointer to reference frame
2210           x0                x-coordinate for block
2211           y0                y-coordinate for block
2212           width             width of reference frame
2213           height            height of reference frame
2214           blockWidth        width of block
2215           blockHeight       height of block
2216           fillScanLength    length of a line in output array (pixels)
2217         Outputs:
2218           fill              pointer to array where output block is written
2219 
2220 ------------------------------------------------------------------------------*/
2221 
h264bsdFillBlock(u8 * ref,u8 * fill,i32 x0,i32 y0,u32 width,u32 height,u32 blockWidth,u32 blockHeight,u32 fillScanLength)2222 void h264bsdFillBlock(
2223   u8 *ref,
2224   u8 *fill,
2225   i32 x0,
2226   i32 y0,
2227   u32 width,
2228   u32 height,
2229   u32 blockWidth,
2230   u32 blockHeight,
2231   u32 fillScanLength)
2232 
2233 {
2234 
2235 /* Variables */
2236 
2237     i32 xstop, ystop;
2238     void (*fp)(u8*, u8*, i32, i32, i32);
2239     i32 left, x, right;
2240     i32 top, y, bottom;
2241 
2242 /* Code */
2243 
2244     ASSERT(ref);
2245     ASSERT(fill);
2246     ASSERT(width);
2247     ASSERT(height);
2248     ASSERT(fill);
2249     ASSERT(blockWidth);
2250     ASSERT(blockHeight);
2251 
2252     xstop = x0 + (i32)blockWidth;
2253     ystop = y0 + (i32)blockHeight;
2254 
2255     /* Choose correct function whether overfilling on left-edge or right-edge
2256      * is needed or not */
2257     if (x0 >= 0 && xstop <= (i32)width)
2258         fp = FillRow1;
2259     else
2260         fp = h264bsdFillRow7;
2261 
2262     if (ystop < 0)
2263         y0 = -(i32)blockHeight;
2264 
2265     if (xstop < 0)
2266         x0 = -(i32)blockWidth;
2267 
2268     if (y0 > (i32)height)
2269         y0 = (i32)height;
2270 
2271     if (x0 > (i32)width)
2272         x0 = (i32)width;
2273 
2274     xstop = x0 + (i32)blockWidth;
2275     ystop = y0 + (i32)blockHeight;
2276 
2277     if (x0 > 0)
2278         ref += x0;
2279 
2280     if (y0 > 0)
2281         ref += y0 * (i32)width;
2282 
2283     left = x0 < 0 ? -x0 : 0;
2284     right = xstop > (i32)width ? xstop - (i32)width : 0;
2285     x = (i32)blockWidth - left - right;
2286 
2287     top = y0 < 0 ? -y0 : 0;
2288     bottom = ystop > (i32)height ? ystop - (i32)height : 0;
2289     y = (i32)blockHeight - top - bottom;
2290 
2291     /* Top-overfilling */
2292     for ( ; top; top-- )
2293     {
2294         (*fp)(ref, fill, left, x, right);
2295         fill += fillScanLength;
2296     }
2297 
2298     /* Lines inside reference image */
2299     for ( ; y; y-- )
2300     {
2301         (*fp)(ref, fill, left, x, right);
2302         ref += width;
2303         fill += fillScanLength;
2304     }
2305 
2306     ref -= width;
2307 
2308     /* Bottom-overfilling */
2309     for ( ; bottom; bottom-- )
2310     {
2311         (*fp)(ref, fill, left, x, right);
2312         fill += fillScanLength;
2313     }
2314 }
2315 
2316 /*lint +e701 +e702 */
2317 
2318 
2319