1 /* ------------------------------------------------------------------
2  * Copyright (C) 1998-2009 PacketVideo
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
13  * express or implied.
14  * See the License for the specific language governing permissions
15  * and limitations under the License.
16  * -------------------------------------------------------------------
17  */
18 #include "avcenc_lib.h"
19 #include "avcenc_int.h"
20 
21 
22 #define CLIP_RESULT(x)      if((uint)x > 0xFF){ \
23                  x = 0xFF & (~(x>>31));}
24 
25 /* (blkwidth << 2) + (dy << 1) + dx */
26 static void (*const eChromaMC_SIMD[8])(uint8 *, int , int , int , uint8 *, int, int , int) =
27 {
28     &eChromaFullMC_SIMD,
29     &eChromaHorizontalMC_SIMD,
30     &eChromaVerticalMC_SIMD,
31     &eChromaDiagonalMC_SIMD,
32     &eChromaFullMC_SIMD,
33     &eChromaHorizontalMC2_SIMD,
34     &eChromaVerticalMC2_SIMD,
35     &eChromaDiagonalMC2_SIMD
36 };
37 /* Perform motion prediction and compensation with residue if exist. */
AVCMBMotionComp(AVCEncObject * encvid,AVCCommonObj * video)38 void AVCMBMotionComp(AVCEncObject *encvid, AVCCommonObj *video)
39 {
40     (void)(encvid);
41 
42     AVCMacroblock *currMB = video->currMB;
43     AVCPictureData *currPic = video->currPic;
44     int mbPartIdx, subMbPartIdx;
45     int ref_idx;
46     int offset_MbPart_indx = 0;
47     int16 *mv;
48     uint32 x_pos, y_pos;
49     uint8 *curL, *curCb, *curCr;
50     uint8 *ref_l, *ref_Cb, *ref_Cr;
51     uint8 *predBlock, *predCb, *predCr;
52     int block_x, block_y, offset_x, offset_y, offsetP, offset;
53     int x_position = (video->mb_x << 4);
54     int y_position = (video->mb_y << 4);
55     int MbHeight, MbWidth, mbPartIdx_X, mbPartIdx_Y, offset_indx;
56     int picWidth = currPic->width;
57     int picPitch = currPic->pitch;
58     int picHeight = currPic->height;
59     uint32 tmp_word;
60 
61     tmp_word = y_position * picPitch;
62     curL = currPic->Sl + tmp_word + x_position;
63     offset = (tmp_word >> 2) + (x_position >> 1);
64     curCb = currPic->Scb + offset;
65     curCr = currPic->Scr + offset;
66 
67     predBlock = curL;
68     predCb = curCb;
69     predCr = curCr;
70 
71     GetMotionVectorPredictor(video, 1);
72 
73     for (mbPartIdx = 0; mbPartIdx < currMB->NumMbPart; mbPartIdx++)
74     {
75         MbHeight = currMB->SubMbPartHeight[mbPartIdx];
76         MbWidth = currMB->SubMbPartWidth[mbPartIdx];
77         mbPartIdx_X = ((mbPartIdx + offset_MbPart_indx) & 1);
78         mbPartIdx_Y = (mbPartIdx + offset_MbPart_indx) >> 1;
79         ref_idx = currMB->ref_idx_L0[(mbPartIdx_Y << 1) + mbPartIdx_X];
80         offset_indx = 0;
81 
82         ref_l = video->RefPicList0[ref_idx]->Sl;
83         ref_Cb = video->RefPicList0[ref_idx]->Scb;
84         ref_Cr = video->RefPicList0[ref_idx]->Scr;
85 
86         for (subMbPartIdx = 0; subMbPartIdx < currMB->NumSubMbPart[mbPartIdx]; subMbPartIdx++)
87         {
88             block_x = (mbPartIdx_X << 1) + ((subMbPartIdx + offset_indx) & 1);
89             block_y = (mbPartIdx_Y << 1) + (((subMbPartIdx + offset_indx) >> 1) & 1);
90             mv = (int16*)(currMB->mvL0 + block_x + (block_y << 2));
91             offset_x = x_position + (block_x << 2);
92             offset_y = y_position + (block_y << 2);
93             x_pos = (offset_x << 2) + *mv++;   /*quarter pel */
94             y_pos = (offset_y << 2) + *mv;   /*quarter pel */
95 
96             //offset = offset_y * currPic->width;
97             //offsetC = (offset >> 2) + (offset_x >> 1);
98             offsetP = (block_y << 2) * picPitch + (block_x << 2);
99             eLumaMotionComp(ref_l, picPitch, picHeight, x_pos, y_pos,
100                             /*comp_Sl + offset + offset_x,*/
101                             predBlock + offsetP, picPitch, MbWidth, MbHeight);
102 
103             offsetP = (block_y * picWidth) + (block_x << 1);
104             eChromaMotionComp(ref_Cb, picWidth >> 1, picHeight >> 1, x_pos, y_pos,
105                               /*comp_Scb +  offsetC,*/
106                               predCb + offsetP, picPitch >> 1, MbWidth >> 1, MbHeight >> 1);
107             eChromaMotionComp(ref_Cr, picWidth >> 1, picHeight >> 1, x_pos, y_pos,
108                               /*comp_Scr +  offsetC,*/
109                               predCr + offsetP, picPitch >> 1, MbWidth >> 1, MbHeight >> 1);
110 
111             offset_indx = currMB->SubMbPartWidth[mbPartIdx] >> 3;
112         }
113         offset_MbPart_indx = currMB->MbPartWidth >> 4;
114     }
115 
116     return ;
117 }
118 
119 
120 /* preform the actual  motion comp here */
eLumaMotionComp(uint8 * ref,int picpitch,int picheight,int x_pos,int y_pos,uint8 * pred,int pred_pitch,int blkwidth,int blkheight)121 void eLumaMotionComp(uint8 *ref, int picpitch, int picheight,
122                      int x_pos, int y_pos,
123                      uint8 *pred, int pred_pitch,
124                      int blkwidth, int blkheight)
125 {
126     (void)(picheight);
127 
128     int dx, dy;
129     int temp2[21][21]; /* for intermediate results */
130     uint8 *ref2;
131 
132     dx = x_pos & 3;
133     dy = y_pos & 3;
134     x_pos = x_pos >> 2;  /* round it to full-pel resolution */
135     y_pos = y_pos >> 2;
136 
137     /* perform actual motion compensation */
138     if (dx == 0 && dy == 0)
139     {  /* fullpel position *//* G */
140 
141         ref += y_pos * picpitch + x_pos;
142 
143         eFullPelMC(ref, picpitch, pred, pred_pitch, blkwidth, blkheight);
144 
145     }   /* other positions */
146     else  if (dy == 0)
147     { /* no vertical interpolation *//* a,b,c*/
148 
149         ref += y_pos * picpitch + x_pos;
150 
151         eHorzInterp1MC(ref, picpitch, pred, pred_pitch, blkwidth, blkheight, dx);
152     }
153     else if (dx == 0)
154     { /*no horizontal interpolation *//* d,h,n */
155 
156         ref += y_pos * picpitch + x_pos;
157 
158         eVertInterp1MC(ref, picpitch, pred, pred_pitch, blkwidth, blkheight, dy);
159     }
160     else if (dy == 2)
161     {  /* horizontal cross *//* i, j, k */
162 
163         ref += y_pos * picpitch + x_pos - 2; /* move to the left 2 pixels */
164 
165         eVertInterp2MC(ref, picpitch, &temp2[0][0], 21, blkwidth + 5, blkheight);
166 
167         eHorzInterp2MC(&temp2[0][2], 21, pred, pred_pitch, blkwidth, blkheight, dx);
168     }
169     else if (dx == 2)
170     { /* vertical cross */ /* f,q */
171 
172         ref += (y_pos - 2) * picpitch + x_pos; /* move to up 2 lines */
173 
174         eHorzInterp3MC(ref, picpitch, &temp2[0][0], 21, blkwidth, blkheight + 5);
175         eVertInterp3MC(&temp2[2][0], 21, pred, pred_pitch, blkwidth, blkheight, dy);
176     }
177     else
178     { /* diagonal *//* e,g,p,r */
179 
180         ref2 = ref + (y_pos + (dy / 2)) * picpitch + x_pos;
181 
182         ref += (y_pos * picpitch) + x_pos + (dx / 2);
183 
184         eDiagonalInterpMC(ref2, ref, picpitch, pred, pred_pitch, blkwidth, blkheight);
185     }
186 
187     return ;
188 }
189 
eCreateAlign(uint8 * ref,int picpitch,int y_pos,uint8 * out,int blkwidth,int blkheight)190 void eCreateAlign(uint8 *ref, int picpitch, int y_pos,
191                   uint8 *out, int blkwidth, int blkheight)
192 {
193     int i, j;
194     int offset, out_offset;
195     uint32 prev_pix, result, pix1, pix2, pix4;
196 
197     ref += y_pos * picpitch;// + x_pos;
198     out_offset = 24 - blkwidth;
199 
200     //switch(x_pos&0x3){
201     switch (((intptr_t)ref)&0x3)
202     {
203         case 1:
204             offset =  picpitch - blkwidth - 3;
205             for (j = 0; j < blkheight; j++)
206             {
207                 pix1 = *ref++;
208                 pix2 = *((uint16*)ref);
209                 ref += 2;
210                 result = (pix2 << 8) | pix1;
211 
212                 for (i = 3; i < blkwidth; i += 4)
213                 {
214                     pix4 = *((uint32*)ref);
215                     ref += 4;
216                     prev_pix = (pix4 << 24) & 0xFF000000; /* mask out byte belong to previous word */
217                     result |= prev_pix;
218                     *((uint32*)out) = result;  /* write 4 bytes */
219                     out += 4;
220                     result = pix4 >> 8; /* for the next loop */
221                 }
222                 ref += offset;
223                 out += out_offset;
224             }
225             break;
226         case 2:
227             offset =  picpitch - blkwidth - 2;
228             for (j = 0; j < blkheight; j++)
229             {
230                 result = *((uint16*)ref);
231                 ref += 2;
232                 for (i = 2; i < blkwidth; i += 4)
233                 {
234                     pix4 = *((uint32*)ref);
235                     ref += 4;
236                     prev_pix = (pix4 << 16) & 0xFFFF0000; /* mask out byte belong to previous word */
237                     result |= prev_pix;
238                     *((uint32*)out) = result;  /* write 4 bytes */
239                     out += 4;
240                     result = pix4 >> 16; /* for the next loop */
241                 }
242                 ref += offset;
243                 out += out_offset;
244             }
245             break;
246         case 3:
247             offset =  picpitch - blkwidth - 1;
248             for (j = 0; j < blkheight; j++)
249             {
250                 result = *ref++;
251                 for (i = 1; i < blkwidth; i += 4)
252                 {
253                     pix4 = *((uint32*)ref);
254                     ref += 4;
255                     prev_pix = (pix4 << 8) & 0xFFFFFF00; /* mask out byte belong to previous word */
256                     result |= prev_pix;
257                     *((uint32*)out) = result;  /* write 4 bytes */
258                     out += 4;
259                     result = pix4 >> 24; /* for the next loop */
260                 }
261                 ref += offset;
262                 out += out_offset;
263             }
264             break;
265     }
266 }
267 
eHorzInterp1MC(uint8 * in,int inpitch,uint8 * out,int outpitch,int blkwidth,int blkheight,int dx)268 void eHorzInterp1MC(uint8 *in, int inpitch, uint8 *out, int outpitch,
269                     int blkwidth, int blkheight, int dx)
270 {
271     uint8 *p_ref, *tmp;
272     uint32 *p_cur;
273     uint32 pkres;
274     int result, curr_offset, ref_offset;
275     int j;
276     int32 r0, r1, r2, r3, r4, r5;
277     int32 r13, r6;
278 
279     p_cur = (uint32*)out; /* assume it's word aligned */
280     curr_offset = (outpitch - blkwidth) >> 2;
281     p_ref = in;
282     ref_offset = inpitch - blkwidth;
283 
284     if (dx&1)
285     {
286         dx = ((dx >> 1) ? -3 : -4); /* use in 3/4 pel */
287         p_ref -= 2;
288         r13 = 0;
289         for (j = blkheight; j > 0; j--)
290         {
291             tmp = p_ref + blkwidth;
292             r0 = p_ref[0];
293             r1 = p_ref[2];
294             r0 |= (r1 << 16);           /* 0,c,0,a */
295             r1 = p_ref[1];
296             r2 = p_ref[3];
297             r1 |= (r2 << 16);           /* 0,d,0,b */
298             while (p_ref < tmp)
299             {
300                 r2 = *(p_ref += 4); /* move pointer to e */
301                 r3 = p_ref[2];
302                 r2 |= (r3 << 16);           /* 0,g,0,e */
303                 r3 = p_ref[1];
304                 r4 = p_ref[3];
305                 r3 |= (r4 << 16);           /* 0,h,0,f */
306 
307                 r4 = r0 + r3;       /* c+h, a+f */
308                 r5 = r0 + r1;   /* c+d, a+b */
309                 r6 = r2 + r3;   /* g+h, e+f */
310                 r5 >>= 16;
311                 r5 |= (r6 << 16);   /* e+f, c+d */
312                 r4 += r5 * 20;      /* c+20*e+20*f+h, a+20*c+20*d+f */
313                 r4 += 0x100010; /* +16, +16 */
314                 r5 = r1 + r2;       /* d+g, b+e */
315                 r4 -= r5 * 5;       /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */
316                 r4 >>= 5;
317                 r13 |= r4;      /* check clipping */
318 
319                 r5 = p_ref[dx+2];
320                 r6 = p_ref[dx+4];
321                 r5 |= (r6 << 16);
322                 r4 += r5;
323                 r4 += 0x10001;
324                 r4 = (r4 >> 1) & 0xFF00FF;
325 
326                 r5 = p_ref[4];  /* i */
327                 r6 = (r5 << 16);
328                 r5 = r6 | (r2 >> 16);/* 0,i,0,g */
329                 r5 += r1;       /* d+i, b+g */ /* r5 not free */
330                 r1 >>= 16;
331                 r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */
332                 r1 += r2;       /* f+g, d+e */
333                 r5 += 20 * r1;  /* d+20f+20g+i, b+20d+20e+g */
334                 r0 >>= 16;
335                 r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */
336                 r0 += r3;       /* e+h, c+f */
337                 r5 += 0x100010; /* 16,16 */
338                 r5 -= r0 * 5;       /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */
339                 r5 >>= 5;
340                 r13 |= r5;      /* check clipping */
341 
342                 r0 = p_ref[dx+3];
343                 r1 = p_ref[dx+5];
344                 r0 |= (r1 << 16);
345                 r5 += r0;
346                 r5 += 0x10001;
347                 r5 = (r5 >> 1) & 0xFF00FF;
348 
349                 r4 |= (r5 << 8);    /* pack them together */
350                 *p_cur++ = r4;
351                 r1 = r3;
352                 r0 = r2;
353             }
354             p_cur += curr_offset; /* move to the next line */
355             p_ref += ref_offset;  /*    ref_offset = inpitch-blkwidth; */
356 
357             if (r13&0xFF000700) /* need clipping */
358             {
359                 /* move back to the beginning of the line */
360                 p_ref -= (ref_offset + blkwidth);   /* input */
361                 p_cur -= (outpitch >> 2);
362 
363                 tmp = p_ref + blkwidth;
364                 for (; p_ref < tmp;)
365                 {
366 
367                     r0 = *p_ref++;
368                     r1 = *p_ref++;
369                     r2 = *p_ref++;
370                     r3 = *p_ref++;
371                     r4 = *p_ref++;
372                     /* first pixel */
373                     r5 = *p_ref++;
374                     result = (r0 + r5);
375                     r0 = (r1 + r4);
376                     result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
377                     r0 = (r2 + r3);
378                     result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
379                     result = (result + 16) >> 5;
380                     CLIP_RESULT(result)
381                     /* 3/4 pel,  no need to clip */
382                     result = (result + p_ref[dx] + 1);
383                     pkres = (result >> 1) ;
384                     /* second pixel */
385                     r0 = *p_ref++;
386                     result = (r1 + r0);
387                     r1 = (r2 + r5);
388                     result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
389                     r1 = (r3 + r4);
390                     result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
391                     result = (result + 16) >> 5;
392                     CLIP_RESULT(result)
393                     /* 3/4 pel,  no need to clip */
394                     result = (result + p_ref[dx] + 1);
395                     result = (result >> 1);
396                     pkres  |= (result << 8);
397                     /* third pixel */
398                     r1 = *p_ref++;
399                     result = (r2 + r1);
400                     r2 = (r3 + r0);
401                     result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
402                     r2 = (r4 + r5);
403                     result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
404                     result = (result + 16) >> 5;
405                     CLIP_RESULT(result)
406                     /* 3/4 pel,  no need to clip */
407                     result = (result + p_ref[dx] + 1);
408                     result = (result >> 1);
409                     pkres  |= (result << 16);
410                     /* fourth pixel */
411                     r2 = *p_ref++;
412                     result = (r3 + r2);
413                     r3 = (r4 + r1);
414                     result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
415                     r3 = (r5 + r0);
416                     result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
417                     result = (result + 16) >> 5;
418                     CLIP_RESULT(result)
419                     /* 3/4 pel,  no need to clip */
420                     result = (result + p_ref[dx] + 1);
421                     result = (result >> 1);
422                     pkres  |= (result << 24);
423                     *p_cur++ = pkres; /* write 4 pixels */
424                     p_ref -= 5;  /* offset back to the middle of filter */
425                 }
426                 p_cur += curr_offset;  /* move to the next line */
427                 p_ref += ref_offset;    /* move to the next line */
428             }
429         }
430     }
431     else
432     {
433         p_ref -= 2;
434         r13 = 0;
435         for (j = blkheight; j > 0; j--)
436         {
437             tmp = p_ref + blkwidth;
438             r0 = p_ref[0];
439             r1 = p_ref[2];
440             r0 |= (r1 << 16);           /* 0,c,0,a */
441             r1 = p_ref[1];
442             r2 = p_ref[3];
443             r1 |= (r2 << 16);           /* 0,d,0,b */
444             while (p_ref < tmp)
445             {
446                 r2 = *(p_ref += 4); /* move pointer to e */
447                 r3 = p_ref[2];
448                 r2 |= (r3 << 16);           /* 0,g,0,e */
449                 r3 = p_ref[1];
450                 r4 = p_ref[3];
451                 r3 |= (r4 << 16);           /* 0,h,0,f */
452 
453                 r4 = r0 + r3;       /* c+h, a+f */
454                 r5 = r0 + r1;   /* c+d, a+b */
455                 r6 = r2 + r3;   /* g+h, e+f */
456                 r5 >>= 16;
457                 r5 |= (r6 << 16);   /* e+f, c+d */
458                 r4 += r5 * 20;      /* c+20*e+20*f+h, a+20*c+20*d+f */
459                 r4 += 0x100010; /* +16, +16 */
460                 r5 = r1 + r2;       /* d+g, b+e */
461                 r4 -= r5 * 5;       /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */
462                 r4 >>= 5;
463                 r13 |= r4;      /* check clipping */
464                 r4 &= 0xFF00FF; /* mask */
465 
466                 r5 = p_ref[4];  /* i */
467                 r6 = (r5 << 16);
468                 r5 = r6 | (r2 >> 16);/* 0,i,0,g */
469                 r5 += r1;       /* d+i, b+g */ /* r5 not free */
470                 r1 >>= 16;
471                 r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */
472                 r1 += r2;       /* f+g, d+e */
473                 r5 += 20 * r1;  /* d+20f+20g+i, b+20d+20e+g */
474                 r0 >>= 16;
475                 r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */
476                 r0 += r3;       /* e+h, c+f */
477                 r5 += 0x100010; /* 16,16 */
478                 r5 -= r0 * 5;       /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */
479                 r5 >>= 5;
480                 r13 |= r5;      /* check clipping */
481                 r5 &= 0xFF00FF; /* mask */
482 
483                 r4 |= (r5 << 8);    /* pack them together */
484                 *p_cur++ = r4;
485                 r1 = r3;
486                 r0 = r2;
487             }
488             p_cur += curr_offset; /* move to the next line */
489             p_ref += ref_offset;  /*    ref_offset = inpitch-blkwidth; */
490 
491             if (r13&0xFF000700) /* need clipping */
492             {
493                 /* move back to the beginning of the line */
494                 p_ref -= (ref_offset + blkwidth);   /* input */
495                 p_cur -= (outpitch >> 2);
496 
497                 tmp = p_ref + blkwidth;
498                 for (; p_ref < tmp;)
499                 {
500 
501                     r0 = *p_ref++;
502                     r1 = *p_ref++;
503                     r2 = *p_ref++;
504                     r3 = *p_ref++;
505                     r4 = *p_ref++;
506                     /* first pixel */
507                     r5 = *p_ref++;
508                     result = (r0 + r5);
509                     r0 = (r1 + r4);
510                     result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
511                     r0 = (r2 + r3);
512                     result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
513                     result = (result + 16) >> 5;
514                     CLIP_RESULT(result)
515                     pkres  = result;
516                     /* second pixel */
517                     r0 = *p_ref++;
518                     result = (r1 + r0);
519                     r1 = (r2 + r5);
520                     result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
521                     r1 = (r3 + r4);
522                     result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
523                     result = (result + 16) >> 5;
524                     CLIP_RESULT(result)
525                     pkres  |= (result << 8);
526                     /* third pixel */
527                     r1 = *p_ref++;
528                     result = (r2 + r1);
529                     r2 = (r3 + r0);
530                     result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
531                     r2 = (r4 + r5);
532                     result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
533                     result = (result + 16) >> 5;
534                     CLIP_RESULT(result)
535                     pkres  |= (result << 16);
536                     /* fourth pixel */
537                     r2 = *p_ref++;
538                     result = (r3 + r2);
539                     r3 = (r4 + r1);
540                     result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
541                     r3 = (r5 + r0);
542                     result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
543                     result = (result + 16) >> 5;
544                     CLIP_RESULT(result)
545                     pkres  |= (result << 24);
546                     *p_cur++ = pkres;   /* write 4 pixels */
547                     p_ref -= 5;
548                 }
549                 p_cur += curr_offset; /* move to the next line */
550                 p_ref += ref_offset;
551             }
552         }
553     }
554 
555     return ;
556 }
557 
eHorzInterp2MC(int * in,int inpitch,uint8 * out,int outpitch,int blkwidth,int blkheight,int dx)558 void eHorzInterp2MC(int *in, int inpitch, uint8 *out, int outpitch,
559                     int blkwidth, int blkheight, int dx)
560 {
561     int *p_ref, *tmp;
562     uint32 *p_cur;
563     uint32 pkres;
564     int result, result2, curr_offset, ref_offset;
565     int j, r0, r1, r2, r3, r4, r5;
566 
567     p_cur = (uint32*)out; /* assume it's word aligned */
568     curr_offset = (outpitch - blkwidth) >> 2;
569     p_ref = in;
570     ref_offset = inpitch - blkwidth;
571 
572     if (dx&1)
573     {
574         dx = ((dx >> 1) ? -3 : -4); /* use in 3/4 pel */
575 
576         for (j = blkheight; j > 0 ; j--)
577         {
578             tmp = p_ref + blkwidth;
579             for (; p_ref < tmp;)
580             {
581 
582                 r0 = p_ref[-2];
583                 r1 = p_ref[-1];
584                 r2 = *p_ref++;
585                 r3 = *p_ref++;
586                 r4 = *p_ref++;
587                 /* first pixel */
588                 r5 = *p_ref++;
589                 result = (r0 + r5);
590                 r0 = (r1 + r4);
591                 result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
592                 r0 = (r2 + r3);
593                 result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
594                 result = (result + 512) >> 10;
595                 CLIP_RESULT(result)
596                 result2 = ((p_ref[dx] + 16) >> 5);
597                 CLIP_RESULT(result2)
598                 /* 3/4 pel,  no need to clip */
599                 result = (result + result2 + 1);
600                 pkres = (result >> 1);
601                 /* second pixel */
602                 r0 = *p_ref++;
603                 result = (r1 + r0);
604                 r1 = (r2 + r5);
605                 result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
606                 r1 = (r3 + r4);
607                 result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
608                 result = (result + 512) >> 10;
609                 CLIP_RESULT(result)
610                 result2 = ((p_ref[dx] + 16) >> 5);
611                 CLIP_RESULT(result2)
612                 /* 3/4 pel,  no need to clip */
613                 result = (result + result2 + 1);
614                 result = (result >> 1);
615                 pkres  |= (result << 8);
616                 /* third pixel */
617                 r1 = *p_ref++;
618                 result = (r2 + r1);
619                 r2 = (r3 + r0);
620                 result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
621                 r2 = (r4 + r5);
622                 result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
623                 result = (result + 512) >> 10;
624                 CLIP_RESULT(result)
625                 result2 = ((p_ref[dx] + 16) >> 5);
626                 CLIP_RESULT(result2)
627                 /* 3/4 pel,  no need to clip */
628                 result = (result + result2 + 1);
629                 result = (result >> 1);
630                 pkres  |= (result << 16);
631                 /* fourth pixel */
632                 r2 = *p_ref++;
633                 result = (r3 + r2);
634                 r3 = (r4 + r1);
635                 result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
636                 r3 = (r5 + r0);
637                 result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
638                 result = (result + 512) >> 10;
639                 CLIP_RESULT(result)
640                 result2 = ((p_ref[dx] + 16) >> 5);
641                 CLIP_RESULT(result2)
642                 /* 3/4 pel,  no need to clip */
643                 result = (result + result2 + 1);
644                 result = (result >> 1);
645                 pkres  |= (result << 24);
646                 *p_cur++ = pkres; /* write 4 pixels */
647                 p_ref -= 3;  /* offset back to the middle of filter */
648             }
649             p_cur += curr_offset;  /* move to the next line */
650             p_ref += ref_offset;    /* move to the next line */
651         }
652     }
653     else
654     {
655         for (j = blkheight; j > 0 ; j--)
656         {
657             tmp = p_ref + blkwidth;
658             for (; p_ref < tmp;)
659             {
660 
661                 r0 = p_ref[-2];
662                 r1 = p_ref[-1];
663                 r2 = *p_ref++;
664                 r3 = *p_ref++;
665                 r4 = *p_ref++;
666                 /* first pixel */
667                 r5 = *p_ref++;
668                 result = (r0 + r5);
669                 r0 = (r1 + r4);
670                 result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
671                 r0 = (r2 + r3);
672                 result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
673                 result = (result + 512) >> 10;
674                 CLIP_RESULT(result)
675                 pkres  = result;
676                 /* second pixel */
677                 r0 = *p_ref++;
678                 result = (r1 + r0);
679                 r1 = (r2 + r5);
680                 result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
681                 r1 = (r3 + r4);
682                 result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
683                 result = (result + 512) >> 10;
684                 CLIP_RESULT(result)
685                 pkres  |= (result << 8);
686                 /* third pixel */
687                 r1 = *p_ref++;
688                 result = (r2 + r1);
689                 r2 = (r3 + r0);
690                 result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
691                 r2 = (r4 + r5);
692                 result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
693                 result = (result + 512) >> 10;
694                 CLIP_RESULT(result)
695                 pkres  |= (result << 16);
696                 /* fourth pixel */
697                 r2 = *p_ref++;
698                 result = (r3 + r2);
699                 r3 = (r4 + r1);
700                 result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
701                 r3 = (r5 + r0);
702                 result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
703                 result = (result + 512) >> 10;
704                 CLIP_RESULT(result)
705                 pkres  |= (result << 24);
706                 *p_cur++ = pkres; /* write 4 pixels */
707                 p_ref -= 3;  /* offset back to the middle of filter */
708             }
709             p_cur += curr_offset;  /* move to the next line */
710             p_ref += ref_offset;    /* move to the next line */
711         }
712     }
713 
714     return ;
715 }
716 
eHorzInterp3MC(uint8 * in,int inpitch,int * out,int outpitch,int blkwidth,int blkheight)717 void eHorzInterp3MC(uint8 *in, int inpitch, int *out, int outpitch,
718                     int blkwidth, int blkheight)
719 {
720     uint8 *p_ref, *tmp;
721     int   *p_cur;
722     int result, curr_offset, ref_offset;
723     int j, r0, r1, r2, r3, r4, r5;
724 
725     p_cur = out;
726     curr_offset = (outpitch - blkwidth);
727     p_ref = in;
728     ref_offset = inpitch - blkwidth;
729 
730     for (j = blkheight; j > 0 ; j--)
731     {
732         tmp = p_ref + blkwidth;
733         for (; p_ref < tmp;)
734         {
735 
736             r0 = p_ref[-2];
737             r1 = p_ref[-1];
738             r2 = *p_ref++;
739             r3 = *p_ref++;
740             r4 = *p_ref++;
741             /* first pixel */
742             r5 = *p_ref++;
743             result = (r0 + r5);
744             r0 = (r1 + r4);
745             result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
746             r0 = (r2 + r3);
747             result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
748             *p_cur++ = result;
749             /* second pixel */
750             r0 = *p_ref++;
751             result = (r1 + r0);
752             r1 = (r2 + r5);
753             result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
754             r1 = (r3 + r4);
755             result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
756             *p_cur++ = result;
757             /* third pixel */
758             r1 = *p_ref++;
759             result = (r2 + r1);
760             r2 = (r3 + r0);
761             result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
762             r2 = (r4 + r5);
763             result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
764             *p_cur++ = result;
765             /* fourth pixel */
766             r2 = *p_ref++;
767             result = (r3 + r2);
768             r3 = (r4 + r1);
769             result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
770             r3 = (r5 + r0);
771             result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
772             *p_cur++ = result;
773             p_ref -= 3; /* move back to the middle of the filter */
774         }
775         p_cur += curr_offset; /* move to the next line */
776         p_ref += ref_offset;
777     }
778 
779     return ;
780 }
eVertInterp1MC(uint8 * in,int inpitch,uint8 * out,int outpitch,int blkwidth,int blkheight,int dy)781 void eVertInterp1MC(uint8 *in, int inpitch, uint8 *out, int outpitch,
782                     int blkwidth, int blkheight, int dy)
783 {
784     uint8 *p_cur, *p_ref, *tmp;
785     int result, curr_offset, ref_offset;
786     int j, i;
787     int32 r0, r1, r2, r3, r4, r5, r6, r7, r8, r13;
788     uint8  tmp_in[24][24];
789 
790     /* not word-aligned */
791     if (((intptr_t)in)&0x3)
792     {
793         eCreateAlign(in, inpitch, -2, &tmp_in[0][0], blkwidth, blkheight + 5);
794         in = &tmp_in[2][0];
795         inpitch = 24;
796     }
797     p_cur = out;
798     curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */
799     ref_offset = blkheight * inpitch; /* for limit */
800 
801     curr_offset += 3;
802 
803     if (dy&1)
804     {
805         dy = (dy >> 1) ? 0 : -inpitch;
806 
807         for (j = 0; j < blkwidth; j += 4, in += 4)
808         {
809             r13 = 0;
810             p_ref = in;
811             p_cur -= outpitch;  /* compensate for the first offset */
812             tmp = p_ref + ref_offset; /* limit */
813             while (p_ref < tmp)  /* the loop un-rolled  */
814             {
815                 r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */
816                 p_ref += inpitch;
817                 r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */
818                 r0 &= 0xFF00FF;
819 
820                 r1 = *((uint32*)(p_ref + (inpitch << 1)));  /* r1, r7, ref[3] */
821                 r7 = (r1 >> 8) & 0xFF00FF;
822                 r1 &= 0xFF00FF;
823 
824                 r0 += r1;
825                 r6 += r7;
826 
827                 r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */
828                 r8 = (r2 >> 8) & 0xFF00FF;
829                 r2 &= 0xFF00FF;
830 
831                 r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */
832                 r7 = (r1 >> 8) & 0xFF00FF;
833                 r1 &= 0xFF00FF;
834                 r1 += r2;
835 
836                 r7 += r8;
837 
838                 r0 += 20 * r1;
839                 r6 += 20 * r7;
840                 r0 += 0x100010;
841                 r6 += 0x100010;
842 
843                 r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */
844                 r8 = (r2 >> 8) & 0xFF00FF;
845                 r2 &= 0xFF00FF;
846 
847                 r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */
848                 r7 = (r1 >> 8) & 0xFF00FF;
849                 r1 &= 0xFF00FF;
850                 r1 += r2;
851 
852                 r7 += r8;
853 
854                 r0 -= 5 * r1;
855                 r6 -= 5 * r7;
856 
857                 r0 >>= 5;
858                 r6 >>= 5;
859                 /* clip */
860                 r13 |= r6;
861                 r13 |= r0;
862                 //CLIPPACK(r6,result)
863 
864                 r1 = *((uint32*)(p_ref + dy));
865                 r2 = (r1 >> 8) & 0xFF00FF;
866                 r1 &= 0xFF00FF;
867                 r0 += r1;
868                 r6 += r2;
869                 r0 += 0x10001;
870                 r6 += 0x10001;
871                 r0 = (r0 >> 1) & 0xFF00FF;
872                 r6 = (r6 >> 1) & 0xFF00FF;
873 
874                 r0 |= (r6 << 8);  /* pack it back */
875                 *((uint32*)(p_cur += outpitch)) = r0;
876             }
877             p_cur += curr_offset; /* offset to the next pixel */
878             if (r13 & 0xFF000700) /* this column need clipping */
879             {
880                 p_cur -= 4;
881                 for (i = 0; i < 4; i++)
882                 {
883                     p_ref = in + i;
884                     p_cur -= outpitch;  /* compensate for the first offset */
885 
886                     tmp = p_ref + ref_offset; /* limit */
887                     while (p_ref < tmp)
888                     {                           /* loop un-rolled */
889                         r0 = *(p_ref - (inpitch << 1));
890                         r1 = *(p_ref - inpitch);
891                         r2 = *p_ref;
892                         r3 = *(p_ref += inpitch);  /* modify pointer before loading */
893                         r4 = *(p_ref += inpitch);
894                         /* first pixel */
895                         r5 = *(p_ref += inpitch);
896                         result = (r0 + r5);
897                         r0 = (r1 + r4);
898                         result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
899                         r0 = (r2 + r3);
900                         result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
901                         result = (result + 16) >> 5;
902                         CLIP_RESULT(result)
903                         /* 3/4 pel,  no need to clip */
904                         result = (result + p_ref[dy-(inpitch<<1)] + 1);
905                         result = (result >> 1);
906                         *(p_cur += outpitch) = result;
907                         /* second pixel */
908                         r0 = *(p_ref += inpitch);
909                         result = (r1 + r0);
910                         r1 = (r2 + r5);
911                         result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
912                         r1 = (r3 + r4);
913                         result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
914                         result = (result + 16) >> 5;
915                         CLIP_RESULT(result)
916                         /* 3/4 pel,  no need to clip */
917                         result = (result + p_ref[dy-(inpitch<<1)] + 1);
918                         result = (result >> 1);
919                         *(p_cur += outpitch) = result;
920                         /* third pixel */
921                         r1 = *(p_ref += inpitch);
922                         result = (r2 + r1);
923                         r2 = (r3 + r0);
924                         result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
925                         r2 = (r4 + r5);
926                         result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
927                         result = (result + 16) >> 5;
928                         CLIP_RESULT(result)
929                         /* 3/4 pel,  no need to clip */
930                         result = (result + p_ref[dy-(inpitch<<1)] + 1);
931                         result = (result >> 1);
932                         *(p_cur += outpitch) = result;
933                         /* fourth pixel */
934                         r2 = *(p_ref += inpitch);
935                         result = (r3 + r2);
936                         r3 = (r4 + r1);
937                         result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
938                         r3 = (r5 + r0);
939                         result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
940                         result = (result + 16) >> 5;
941                         CLIP_RESULT(result)
942                         /* 3/4 pel,  no need to clip */
943                         result = (result + p_ref[dy-(inpitch<<1)] + 1);
944                         result = (result >> 1);
945                         *(p_cur += outpitch) = result;
946                         p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
947                     }
948                     p_cur += (curr_offset - 3);
949                 }
950             }
951         }
952     }
953     else
954     {
955         for (j = 0; j < blkwidth; j += 4, in += 4)
956         {
957             r13 = 0;
958             p_ref = in;
959             p_cur -= outpitch;  /* compensate for the first offset */
960             tmp = p_ref + ref_offset; /* limit */
961             while (p_ref < tmp)  /* the loop un-rolled  */
962             {
963                 r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */
964                 p_ref += inpitch;
965                 r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */
966                 r0 &= 0xFF00FF;
967 
968                 r1 = *((uint32*)(p_ref + (inpitch << 1)));  /* r1, r7, ref[3] */
969                 r7 = (r1 >> 8) & 0xFF00FF;
970                 r1 &= 0xFF00FF;
971 
972                 r0 += r1;
973                 r6 += r7;
974 
975                 r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */
976                 r8 = (r2 >> 8) & 0xFF00FF;
977                 r2 &= 0xFF00FF;
978 
979                 r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */
980                 r7 = (r1 >> 8) & 0xFF00FF;
981                 r1 &= 0xFF00FF;
982                 r1 += r2;
983 
984                 r7 += r8;
985 
986                 r0 += 20 * r1;
987                 r6 += 20 * r7;
988                 r0 += 0x100010;
989                 r6 += 0x100010;
990 
991                 r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */
992                 r8 = (r2 >> 8) & 0xFF00FF;
993                 r2 &= 0xFF00FF;
994 
995                 r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */
996                 r7 = (r1 >> 8) & 0xFF00FF;
997                 r1 &= 0xFF00FF;
998                 r1 += r2;
999 
1000                 r7 += r8;
1001 
1002                 r0 -= 5 * r1;
1003                 r6 -= 5 * r7;
1004 
1005                 r0 >>= 5;
1006                 r6 >>= 5;
1007                 /* clip */
1008                 r13 |= r6;
1009                 r13 |= r0;
1010                 //CLIPPACK(r6,result)
1011                 r0 &= 0xFF00FF;
1012                 r6 &= 0xFF00FF;
1013                 r0 |= (r6 << 8);  /* pack it back */
1014                 *((uint32*)(p_cur += outpitch)) = r0;
1015             }
1016             p_cur += curr_offset; /* offset to the next pixel */
1017             if (r13 & 0xFF000700) /* this column need clipping */
1018             {
1019                 p_cur -= 4;
1020                 for (i = 0; i < 4; i++)
1021                 {
1022                     p_ref = in + i;
1023                     p_cur -= outpitch;  /* compensate for the first offset */
1024                     tmp = p_ref + ref_offset; /* limit */
1025                     while (p_ref < tmp)
1026                     {                           /* loop un-rolled */
1027                         r0 = *(p_ref - (inpitch << 1));
1028                         r1 = *(p_ref - inpitch);
1029                         r2 = *p_ref;
1030                         r3 = *(p_ref += inpitch);  /* modify pointer before loading */
1031                         r4 = *(p_ref += inpitch);
1032                         /* first pixel */
1033                         r5 = *(p_ref += inpitch);
1034                         result = (r0 + r5);
1035                         r0 = (r1 + r4);
1036                         result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
1037                         r0 = (r2 + r3);
1038                         result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
1039                         result = (result + 16) >> 5;
1040                         CLIP_RESULT(result)
1041                         *(p_cur += outpitch) = result;
1042                         /* second pixel */
1043                         r0 = *(p_ref += inpitch);
1044                         result = (r1 + r0);
1045                         r1 = (r2 + r5);
1046                         result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
1047                         r1 = (r3 + r4);
1048                         result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
1049                         result = (result + 16) >> 5;
1050                         CLIP_RESULT(result)
1051                         *(p_cur += outpitch) = result;
1052                         /* third pixel */
1053                         r1 = *(p_ref += inpitch);
1054                         result = (r2 + r1);
1055                         r2 = (r3 + r0);
1056                         result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
1057                         r2 = (r4 + r5);
1058                         result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
1059                         result = (result + 16) >> 5;
1060                         CLIP_RESULT(result)
1061                         *(p_cur += outpitch) = result;
1062                         /* fourth pixel */
1063                         r2 = *(p_ref += inpitch);
1064                         result = (r3 + r2);
1065                         r3 = (r4 + r1);
1066                         result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
1067                         r3 = (r5 + r0);
1068                         result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
1069                         result = (result + 16) >> 5;
1070                         CLIP_RESULT(result)
1071                         *(p_cur += outpitch) = result;
1072                         p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
1073                     }
1074                     p_cur += (curr_offset - 3);
1075                 }
1076             }
1077         }
1078     }
1079 
1080     return ;
1081 }
1082 
eVertInterp2MC(uint8 * in,int inpitch,int * out,int outpitch,int blkwidth,int blkheight)1083 void eVertInterp2MC(uint8 *in, int inpitch, int *out, int outpitch,
1084                     int blkwidth, int blkheight)
1085 {
1086     int *p_cur;
1087     uint8 *p_ref, *tmp;
1088     int result, curr_offset, ref_offset;
1089     int j, r0, r1, r2, r3, r4, r5;
1090 
1091     p_cur = out;
1092     curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */
1093     ref_offset = blkheight * inpitch; /* for limit */
1094 
1095     for (j = 0; j < blkwidth; j++)
1096     {
1097         p_cur -= outpitch; /* compensate for the first offset */
1098         p_ref = in++;
1099 
1100         tmp = p_ref + ref_offset; /* limit */
1101         while (p_ref < tmp)
1102         {                           /* loop un-rolled */
1103             r0 = *(p_ref - (inpitch << 1));
1104             r1 = *(p_ref - inpitch);
1105             r2 = *p_ref;
1106             r3 = *(p_ref += inpitch);  /* modify pointer before loading */
1107             r4 = *(p_ref += inpitch);
1108             /* first pixel */
1109             r5 = *(p_ref += inpitch);
1110             result = (r0 + r5);
1111             r0 = (r1 + r4);
1112             result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
1113             r0 = (r2 + r3);
1114             result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
1115             *(p_cur += outpitch) = result;
1116             /* second pixel */
1117             r0 = *(p_ref += inpitch);
1118             result = (r1 + r0);
1119             r1 = (r2 + r5);
1120             result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
1121             r1 = (r3 + r4);
1122             result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
1123             *(p_cur += outpitch) = result;
1124             /* third pixel */
1125             r1 = *(p_ref += inpitch);
1126             result = (r2 + r1);
1127             r2 = (r3 + r0);
1128             result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
1129             r2 = (r4 + r5);
1130             result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
1131             *(p_cur += outpitch) = result;
1132             /* fourth pixel */
1133             r2 = *(p_ref += inpitch);
1134             result = (r3 + r2);
1135             r3 = (r4 + r1);
1136             result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
1137             r3 = (r5 + r0);
1138             result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
1139             *(p_cur += outpitch) = result;
1140             p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
1141         }
1142         p_cur += curr_offset;
1143     }
1144 
1145     return ;
1146 }
1147 
eVertInterp3MC(int * in,int inpitch,uint8 * out,int outpitch,int blkwidth,int blkheight,int dy)1148 void eVertInterp3MC(int *in, int inpitch, uint8 *out, int outpitch,
1149                     int blkwidth, int blkheight, int dy)
1150 {
1151     uint8 *p_cur;
1152     int *p_ref, *tmp;
1153     int result, result2, curr_offset, ref_offset;
1154     int j, r0, r1, r2, r3, r4, r5;
1155 
1156     p_cur = out;
1157     curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */
1158     ref_offset = blkheight * inpitch; /* for limit */
1159 
1160     if (dy&1)
1161     {
1162         dy = (dy >> 1) ? -(inpitch << 1) : -(inpitch << 1) - inpitch;
1163 
1164         for (j = 0; j < blkwidth; j++)
1165         {
1166             p_cur -= outpitch; /* compensate for the first offset */
1167             p_ref = in++;
1168 
1169             tmp = p_ref + ref_offset; /* limit */
1170             while (p_ref < tmp)
1171             {                           /* loop un-rolled */
1172                 r0 = *(p_ref - (inpitch << 1));
1173                 r1 = *(p_ref - inpitch);
1174                 r2 = *p_ref;
1175                 r3 = *(p_ref += inpitch);  /* modify pointer before loading */
1176                 r4 = *(p_ref += inpitch);
1177                 /* first pixel */
1178                 r5 = *(p_ref += inpitch);
1179                 result = (r0 + r5);
1180                 r0 = (r1 + r4);
1181                 result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
1182                 r0 = (r2 + r3);
1183                 result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
1184                 result = (result + 512) >> 10;
1185                 CLIP_RESULT(result)
1186                 result2 = ((p_ref[dy] + 16) >> 5);
1187                 CLIP_RESULT(result2)
1188                 /* 3/4 pel,  no need to clip */
1189                 result = (result + result2 + 1);
1190                 result = (result >> 1);
1191                 *(p_cur += outpitch) = result;
1192                 /* second pixel */
1193                 r0 = *(p_ref += inpitch);
1194                 result = (r1 + r0);
1195                 r1 = (r2 + r5);
1196                 result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
1197                 r1 = (r3 + r4);
1198                 result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
1199                 result = (result + 512) >> 10;
1200                 CLIP_RESULT(result)
1201                 result2 = ((p_ref[dy] + 16) >> 5);
1202                 CLIP_RESULT(result2)
1203                 /* 3/4 pel,  no need to clip */
1204                 result = (result + result2 + 1);
1205                 result = (result >> 1);
1206                 *(p_cur += outpitch) = result;
1207                 /* third pixel */
1208                 r1 = *(p_ref += inpitch);
1209                 result = (r2 + r1);
1210                 r2 = (r3 + r0);
1211                 result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
1212                 r2 = (r4 + r5);
1213                 result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
1214                 result = (result + 512) >> 10;
1215                 CLIP_RESULT(result)
1216                 result2 = ((p_ref[dy] + 16) >> 5);
1217                 CLIP_RESULT(result2)
1218                 /* 3/4 pel,  no need to clip */
1219                 result = (result + result2 + 1);
1220                 result = (result >> 1);
1221                 *(p_cur += outpitch) = result;
1222                 /* fourth pixel */
1223                 r2 = *(p_ref += inpitch);
1224                 result = (r3 + r2);
1225                 r3 = (r4 + r1);
1226                 result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
1227                 r3 = (r5 + r0);
1228                 result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
1229                 result = (result + 512) >> 10;
1230                 CLIP_RESULT(result)
1231                 result2 = ((p_ref[dy] + 16) >> 5);
1232                 CLIP_RESULT(result2)
1233                 /* 3/4 pel,  no need to clip */
1234                 result = (result + result2 + 1);
1235                 result = (result >> 1);
1236                 *(p_cur += outpitch) = result;
1237                 p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
1238             }
1239             p_cur += curr_offset;
1240         }
1241     }
1242     else
1243     {
1244         for (j = 0; j < blkwidth; j++)
1245         {
1246             p_cur -= outpitch; /* compensate for the first offset */
1247             p_ref = in++;
1248 
1249             tmp = p_ref + ref_offset; /* limit */
1250             while (p_ref < tmp)
1251             {                           /* loop un-rolled */
1252                 r0 = *(p_ref - (inpitch << 1));
1253                 r1 = *(p_ref - inpitch);
1254                 r2 = *p_ref;
1255                 r3 = *(p_ref += inpitch);  /* modify pointer before loading */
1256                 r4 = *(p_ref += inpitch);
1257                 /* first pixel */
1258                 r5 = *(p_ref += inpitch);
1259                 result = (r0 + r5);
1260                 r0 = (r1 + r4);
1261                 result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
1262                 r0 = (r2 + r3);
1263                 result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
1264                 result = (result + 512) >> 10;
1265                 CLIP_RESULT(result)
1266                 *(p_cur += outpitch) = result;
1267                 /* second pixel */
1268                 r0 = *(p_ref += inpitch);
1269                 result = (r1 + r0);
1270                 r1 = (r2 + r5);
1271                 result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
1272                 r1 = (r3 + r4);
1273                 result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
1274                 result = (result + 512) >> 10;
1275                 CLIP_RESULT(result)
1276                 *(p_cur += outpitch) = result;
1277                 /* third pixel */
1278                 r1 = *(p_ref += inpitch);
1279                 result = (r2 + r1);
1280                 r2 = (r3 + r0);
1281                 result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
1282                 r2 = (r4 + r5);
1283                 result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
1284                 result = (result + 512) >> 10;
1285                 CLIP_RESULT(result)
1286                 *(p_cur += outpitch) = result;
1287                 /* fourth pixel */
1288                 r2 = *(p_ref += inpitch);
1289                 result = (r3 + r2);
1290                 r3 = (r4 + r1);
1291                 result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
1292                 r3 = (r5 + r0);
1293                 result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
1294                 result = (result + 512) >> 10;
1295                 CLIP_RESULT(result)
1296                 *(p_cur += outpitch) = result;
1297                 p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
1298             }
1299             p_cur += curr_offset;
1300         }
1301     }
1302 
1303     return ;
1304 }
1305 
eDiagonalInterpMC(uint8 * in1,uint8 * in2,int inpitch,uint8 * out,int outpitch,int blkwidth,int blkheight)1306 void eDiagonalInterpMC(uint8 *in1, uint8 *in2, int inpitch,
1307                        uint8 *out, int outpitch,
1308                        int blkwidth, int blkheight)
1309 {
1310     int j, i;
1311     int result;
1312     uint8 *p_cur, *p_ref, *p_tmp8, *tmp;
1313     int curr_offset, ref_offset;
1314     uint8 tmp_res[24][24], tmp_in[24][24];
1315     uint32 *p_tmp;
1316     uint32 pkres, tmp_result;
1317     int32 r0, r1, r2, r3, r4, r5;
1318     int32 r6, r7, r8, r9, r10, r13;
1319 
1320     ref_offset = inpitch - blkwidth;
1321     p_ref = in1 - 2;
1322     /* perform horizontal interpolation */
1323     /* not word-aligned */
1324     /* It is faster to read 1 byte at time to avoid calling CreateAlign */
1325     /*  if(((uint32)p_ref)&0x3)
1326         {
1327             CreateAlign(p_ref,inpitch,0,&tmp_in[0][0],blkwidth+8,blkheight);
1328             p_ref = &tmp_in[0][0];
1329             ref_offset = 24-blkwidth;
1330         }*/
1331 
1332     p_tmp = (uint32*) & (tmp_res[0][0]);
1333     for (j = blkheight; j > 0; j--)
1334     {
1335         r13 = 0;
1336         tmp = p_ref + blkwidth;
1337 
1338         //r0 = *((uint32*)p_ref);   /* d,c,b,a */
1339         //r1 = (r0>>8)&0xFF00FF;    /* 0,d,0,b */
1340         //r0 &= 0xFF00FF;           /* 0,c,0,a */
1341         /* It is faster to read 1 byte at a time */
1342         r0 = p_ref[0];
1343         r1 = p_ref[2];
1344         r0 |= (r1 << 16);           /* 0,c,0,a */
1345         r1 = p_ref[1];
1346         r2 = p_ref[3];
1347         r1 |= (r2 << 16);           /* 0,d,0,b */
1348 
1349         while (p_ref < tmp)
1350         {
1351             //r2 = *((uint32*)(p_ref+=4));/* h,g,f,e */
1352             //r3 = (r2>>8)&0xFF00FF;  /* 0,h,0,f */
1353             //r2 &= 0xFF00FF;           /* 0,g,0,e */
1354             /* It is faster to read 1 byte at a time */
1355             r2 = *(p_ref += 4);
1356             r3 = p_ref[2];
1357             r2 |= (r3 << 16);           /* 0,g,0,e */
1358             r3 = p_ref[1];
1359             r4 = p_ref[3];
1360             r3 |= (r4 << 16);           /* 0,h,0,f */
1361 
1362             r4 = r0 + r3;       /* c+h, a+f */
1363             r5 = r0 + r1;   /* c+d, a+b */
1364             r6 = r2 + r3;   /* g+h, e+f */
1365             r5 >>= 16;
1366             r5 |= (r6 << 16);   /* e+f, c+d */
1367             r4 += r5 * 20;      /* c+20*e+20*f+h, a+20*c+20*d+f */
1368             r4 += 0x100010; /* +16, +16 */
1369             r5 = r1 + r2;       /* d+g, b+e */
1370             r4 -= r5 * 5;       /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */
1371             r4 >>= 5;
1372             r13 |= r4;      /* check clipping */
1373             r4 &= 0xFF00FF; /* mask */
1374 
1375             r5 = p_ref[4];  /* i */
1376             r6 = (r5 << 16);
1377             r5 = r6 | (r2 >> 16);/* 0,i,0,g */
1378             r5 += r1;       /* d+i, b+g */ /* r5 not free */
1379             r1 >>= 16;
1380             r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */
1381             r1 += r2;       /* f+g, d+e */
1382             r5 += 20 * r1;  /* d+20f+20g+i, b+20d+20e+g */
1383             r0 >>= 16;
1384             r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */
1385             r0 += r3;       /* e+h, c+f */
1386             r5 += 0x100010; /* 16,16 */
1387             r5 -= r0 * 5;       /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */
1388             r5 >>= 5;
1389             r13 |= r5;      /* check clipping */
1390             r5 &= 0xFF00FF; /* mask */
1391 
1392             r4 |= (r5 << 8);    /* pack them together */
1393             *p_tmp++ = r4;
1394             r1 = r3;
1395             r0 = r2;
1396         }
1397         p_tmp += ((24 - blkwidth) >> 2); /* move to the next line */
1398         p_ref += ref_offset;  /*    ref_offset = inpitch-blkwidth; */
1399 
1400         if (r13&0xFF000700) /* need clipping */
1401         {
1402             /* move back to the beginning of the line */
1403             p_ref -= (ref_offset + blkwidth);   /* input */
1404             p_tmp -= 6; /* intermediate output */
1405             tmp = p_ref + blkwidth;
1406             while (p_ref < tmp)
1407             {
1408                 r0 = *p_ref++;
1409                 r1 = *p_ref++;
1410                 r2 = *p_ref++;
1411                 r3 = *p_ref++;
1412                 r4 = *p_ref++;
1413                 /* first pixel */
1414                 r5 = *p_ref++;
1415                 result = (r0 + r5);
1416                 r0 = (r1 + r4);
1417                 result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
1418                 r0 = (r2 + r3);
1419                 result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
1420                 result = (result + 16) >> 5;
1421                 CLIP_RESULT(result)
1422                 pkres = result;
1423                 /* second pixel */
1424                 r0 = *p_ref++;
1425                 result = (r1 + r0);
1426                 r1 = (r2 + r5);
1427                 result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
1428                 r1 = (r3 + r4);
1429                 result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
1430                 result = (result + 16) >> 5;
1431                 CLIP_RESULT(result)
1432                 pkres |= (result << 8);
1433                 /* third pixel */
1434                 r1 = *p_ref++;
1435                 result = (r2 + r1);
1436                 r2 = (r3 + r0);
1437                 result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
1438                 r2 = (r4 + r5);
1439                 result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
1440                 result = (result + 16) >> 5;
1441                 CLIP_RESULT(result)
1442                 pkres |= (result << 16);
1443                 /* fourth pixel */
1444                 r2 = *p_ref++;
1445                 result = (r3 + r2);
1446                 r3 = (r4 + r1);
1447                 result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
1448                 r3 = (r5 + r0);
1449                 result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
1450                 result = (result + 16) >> 5;
1451                 CLIP_RESULT(result)
1452                 pkres |= (result << 24);
1453 
1454                 *p_tmp++ = pkres; /* write 4 pixel */
1455                 p_ref -= 5;
1456             }
1457             p_tmp += ((24 - blkwidth) >> 2); /* move to the next line */
1458             p_ref += ref_offset;  /*    ref_offset = inpitch-blkwidth; */
1459         }
1460     }
1461 
1462     /*  perform vertical interpolation */
1463     /* not word-aligned */
1464     if (((intptr_t)in2)&0x3)
1465     {
1466         eCreateAlign(in2, inpitch, -2, &tmp_in[0][0], blkwidth, blkheight + 5);
1467         in2 = &tmp_in[2][0];
1468         inpitch = 24;
1469     }
1470 
1471     p_cur = out;
1472     curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically up and one pixel right */
1473     pkres = blkheight * inpitch; /* reuse it for limit */
1474 
1475     curr_offset += 3;
1476 
1477     for (j = 0; j < blkwidth; j += 4, in2 += 4)
1478     {
1479         r13 = 0;
1480         p_ref = in2;
1481         p_tmp8 = &(tmp_res[0][j]); /* intermediate result */
1482         p_tmp8 -= 24;  /* compensate for the first offset */
1483         p_cur -= outpitch;  /* compensate for the first offset */
1484         tmp = p_ref + pkres; /* limit */
1485         while (p_ref < tmp)  /* the loop un-rolled  */
1486         {
1487             /* Read 1 byte at a time is too slow, too many read and pack ops, need to call CreateAlign */
1488             /*p_ref8 = p_ref-(inpitch<<1);          r0 = p_ref8[0];         r1 = p_ref8[2];
1489             r0 |= (r1<<16);         r6 = p_ref8[1];         r1 = p_ref8[3];
1490             r6 |= (r1<<16);         p_ref+=inpitch; */
1491             r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */
1492             p_ref += inpitch;
1493             r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */
1494             r0 &= 0xFF00FF;
1495 
1496             /*p_ref8 = p_ref+(inpitch<<1);
1497             r1 = p_ref8[0];         r7 = p_ref8[2];         r1 |= (r7<<16);
1498             r7 = p_ref8[1];         r2 = p_ref8[3];         r7 |= (r2<<16);*/
1499             r1 = *((uint32*)(p_ref + (inpitch << 1)));  /* r1, r7, ref[3] */
1500             r7 = (r1 >> 8) & 0xFF00FF;
1501             r1 &= 0xFF00FF;
1502 
1503             r0 += r1;
1504             r6 += r7;
1505 
1506             /*r2 = p_ref[0];            r8 = p_ref[2];          r2 |= (r8<<16);
1507             r8 = p_ref[1];          r1 = p_ref[3];          r8 |= (r1<<16);*/
1508             r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */
1509             r8 = (r2 >> 8) & 0xFF00FF;
1510             r2 &= 0xFF00FF;
1511 
1512             /*p_ref8 = p_ref-inpitch;           r1 = p_ref8[0];         r7 = p_ref8[2];
1513             r1 |= (r7<<16);         r1 += r2;           r7 = p_ref8[1];
1514             r2 = p_ref8[3];         r7 |= (r2<<16);*/
1515             r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */
1516             r7 = (r1 >> 8) & 0xFF00FF;
1517             r1 &= 0xFF00FF;
1518             r1 += r2;
1519 
1520             r7 += r8;
1521 
1522             r0 += 20 * r1;
1523             r6 += 20 * r7;
1524             r0 += 0x100010;
1525             r6 += 0x100010;
1526 
1527             /*p_ref8 = p_ref-(inpitch<<1);          r2 = p_ref8[0];         r8 = p_ref8[2];
1528             r2 |= (r8<<16);         r8 = p_ref8[1];         r1 = p_ref8[3];         r8 |= (r1<<16);*/
1529             r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */
1530             r8 = (r2 >> 8) & 0xFF00FF;
1531             r2 &= 0xFF00FF;
1532 
1533             /*p_ref8 = p_ref+inpitch;           r1 = p_ref8[0];         r7 = p_ref8[2];
1534             r1 |= (r7<<16);         r1 += r2;           r7 = p_ref8[1];
1535             r2 = p_ref8[3];         r7 |= (r2<<16);*/
1536             r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */
1537             r7 = (r1 >> 8) & 0xFF00FF;
1538             r1 &= 0xFF00FF;
1539             r1 += r2;
1540 
1541             r7 += r8;
1542 
1543             r0 -= 5 * r1;
1544             r6 -= 5 * r7;
1545 
1546             r0 >>= 5;
1547             r6 >>= 5;
1548             /* clip */
1549             r13 |= r6;
1550             r13 |= r0;
1551             //CLIPPACK(r6,result)
1552             /* add with horizontal results */
1553             r10 = *((uint32*)(p_tmp8 += 24));
1554             r9 = (r10 >> 8) & 0xFF00FF;
1555             r10 &= 0xFF00FF;
1556 
1557             r0 += r10;
1558             r0 += 0x10001;
1559             r0 = (r0 >> 1) & 0xFF00FF;   /* mask to 8 bytes */
1560 
1561             r6 += r9;
1562             r6 += 0x10001;
1563             r6 = (r6 >> 1) & 0xFF00FF;   /* mask to 8 bytes */
1564 
1565             r0 |= (r6 << 8);  /* pack it back */
1566             *((uint32*)(p_cur += outpitch)) = r0;
1567         }
1568         p_cur += curr_offset; /* offset to the next pixel */
1569         if (r13 & 0xFF000700) /* this column need clipping */
1570         {
1571             p_cur -= 4;
1572             for (i = 0; i < 4; i++)
1573             {
1574                 p_ref = in2 + i;
1575                 p_tmp8 = &(tmp_res[0][j+i]); /* intermediate result */
1576                 p_tmp8 -= 24;  /* compensate for the first offset */
1577                 p_cur -= outpitch;  /* compensate for the first offset */
1578                 tmp = p_ref + pkres; /* limit */
1579                 while (p_ref < tmp)  /* the loop un-rolled  */
1580                 {
1581                     r0 = *(p_ref - (inpitch << 1));
1582                     r1 = *(p_ref - inpitch);
1583                     r2 = *p_ref;
1584                     r3 = *(p_ref += inpitch);  /* modify pointer before loading */
1585                     r4 = *(p_ref += inpitch);
1586                     /* first pixel */
1587                     r5 = *(p_ref += inpitch);
1588                     result = (r0 + r5);
1589                     r0 = (r1 + r4);
1590                     result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
1591                     r0 = (r2 + r3);
1592                     result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
1593                     result = (result + 16) >> 5;
1594                     CLIP_RESULT(result)
1595                     tmp_result = *(p_tmp8 += 24);  /* modify pointer before loading */
1596                     result = (result + tmp_result + 1);  /* no clip */
1597                     result = (result >> 1);
1598                     *(p_cur += outpitch) = result;
1599                     /* second pixel */
1600                     r0 = *(p_ref += inpitch);
1601                     result = (r1 + r0);
1602                     r1 = (r2 + r5);
1603                     result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
1604                     r1 = (r3 + r4);
1605                     result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
1606                     result = (result + 16) >> 5;
1607                     CLIP_RESULT(result)
1608                     tmp_result = *(p_tmp8 += 24);  /* intermediate result */
1609                     result = (result + tmp_result + 1);  /* no clip */
1610                     result = (result >> 1);
1611                     *(p_cur += outpitch) = result;
1612                     /* third pixel */
1613                     r1 = *(p_ref += inpitch);
1614                     result = (r2 + r1);
1615                     r2 = (r3 + r0);
1616                     result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
1617                     r2 = (r4 + r5);
1618                     result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
1619                     result = (result + 16) >> 5;
1620                     CLIP_RESULT(result)
1621                     tmp_result = *(p_tmp8 += 24);  /* intermediate result */
1622                     result = (result + tmp_result + 1);  /* no clip */
1623                     result = (result >> 1);
1624                     *(p_cur += outpitch) = result;
1625                     /* fourth pixel */
1626                     r2 = *(p_ref += inpitch);
1627                     result = (r3 + r2);
1628                     r3 = (r4 + r1);
1629                     result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
1630                     r3 = (r5 + r0);
1631                     result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
1632                     result = (result + 16) >> 5;
1633                     CLIP_RESULT(result)
1634                     tmp_result = *(p_tmp8 += 24);  /* intermediate result */
1635                     result = (result + tmp_result + 1);  /* no clip */
1636                     result = (result >> 1);
1637                     *(p_cur += outpitch) = result;
1638                     p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
1639                 }
1640                 p_cur += (curr_offset - 3);
1641             }
1642         }
1643     }
1644 
1645     return ;
1646 }
1647 
1648 /* position G */
eFullPelMC(uint8 * in,int inpitch,uint8 * out,int outpitch,int blkwidth,int blkheight)1649 void eFullPelMC(uint8 *in, int inpitch, uint8 *out, int outpitch,
1650                 int blkwidth, int blkheight)
1651 {
1652     int i, j;
1653     int offset_in = inpitch - blkwidth;
1654     int offset_out = outpitch - blkwidth;
1655     uint32 temp;
1656     uint8 byte;
1657 
1658     if (((intptr_t)in)&3)
1659     {
1660         for (j = blkheight; j > 0; j--)
1661         {
1662             for (i = blkwidth; i > 0; i -= 4)
1663             {
1664                 temp = *in++;
1665                 byte = *in++;
1666                 temp |= (byte << 8);
1667                 byte = *in++;
1668                 temp |= (byte << 16);
1669                 byte = *in++;
1670                 temp |= (byte << 24);
1671 
1672                 *((uint32*)out) = temp; /* write 4 bytes */
1673                 out += 4;
1674             }
1675             out += offset_out;
1676             in += offset_in;
1677         }
1678     }
1679     else
1680     {
1681         for (j = blkheight; j > 0; j--)
1682         {
1683             for (i = blkwidth; i > 0; i -= 4)
1684             {
1685                 temp = *((uint32*)in);
1686                 *((uint32*)out) = temp;
1687                 in += 4;
1688                 out += 4;
1689             }
1690             out += offset_out;
1691             in += offset_in;
1692         }
1693     }
1694     return ;
1695 }
1696 
ePadChroma(uint8 * ref,int picwidth,int picheight,int picpitch,int x_pos,int y_pos)1697 void ePadChroma(uint8 *ref, int picwidth, int picheight, int picpitch, int x_pos, int y_pos)
1698 {
1699     int pad_height;
1700     int pad_width;
1701     uint8 *start;
1702     uint32 word1, word2, word3;
1703     int offset, j;
1704 
1705 
1706     pad_height = 8 + ((y_pos & 7) ? 1 : 0);
1707     pad_width = 8 + ((x_pos & 7) ? 1 : 0);
1708 
1709     y_pos >>= 3;
1710     x_pos >>= 3;
1711     // pad vertical first
1712     if (y_pos < 0) // need to pad up
1713     {
1714         if (x_pos < -8) start = ref - 8;
1715         else if (x_pos + pad_width > picwidth + 7) start = ref + picwidth + 7 - pad_width;
1716         else start = ref + x_pos;
1717 
1718         /* word-align start */
1719         offset = (intptr_t)start & 0x3;
1720         if (offset) start -= offset;
1721 
1722         word1 = *((uint32*)start);
1723         word2 = *((uint32*)(start + 4));
1724         word3 = *((uint32*)(start + 8));
1725 
1726         /* pad up N rows */
1727         j = -y_pos;
1728         if (j > 8) j = 8;
1729         while (j--)
1730         {
1731             *((uint32*)(start -= picpitch)) = word1;
1732             *((uint32*)(start + 4)) = word2;
1733             *((uint32*)(start + 8)) = word3;
1734         }
1735 
1736     }
1737     else if (y_pos + pad_height >= picheight) /* pad down */
1738     {
1739         if (x_pos < -8) start = ref + picpitch * (picheight - 1) - 8;
1740         else if (x_pos + pad_width > picwidth + 7) start = ref + picpitch * (picheight - 1) +
1741                     picwidth + 7 - pad_width;
1742         else    start = ref + picpitch * (picheight - 1) + x_pos;
1743 
1744         /* word-align start */
1745         offset = (intptr_t)start & 0x3;
1746         if (offset) start -= offset;
1747 
1748         word1 = *((uint32*)start);
1749         word2 = *((uint32*)(start + 4));
1750         word3 = *((uint32*)(start + 8));
1751 
1752         /* pad down N rows */
1753         j = y_pos + pad_height - picheight;
1754         if (j > 8) j = 8;
1755         while (j--)
1756         {
1757             *((uint32*)(start += picpitch)) = word1;
1758             *((uint32*)(start + 4)) = word2;
1759             *((uint32*)(start + 8)) = word3;
1760         }
1761     }
1762 
1763     /* now pad horizontal */
1764     if (x_pos < 0) // pad left
1765     {
1766         if (y_pos < -8) start = ref - (picpitch << 3);
1767         else if (y_pos + pad_height > picheight + 7) start = ref + (picheight + 7 - pad_height) * picpitch;
1768         else start = ref + y_pos * picpitch;
1769 
1770         // now pad left 8 pixels for pad_height rows */
1771         j = pad_height;
1772         start -= picpitch;
1773         while (j--)
1774         {
1775             word1 = *(start += picpitch);
1776             word1 |= (word1 << 8);
1777             word1 |= (word1 << 16);
1778             *((uint32*)(start - 8)) = word1;
1779             *((uint32*)(start - 4)) = word1;
1780         }
1781     }
1782     else if (x_pos + pad_width >= picwidth) /* pad right */
1783     {
1784         if (y_pos < -8) start = ref - (picpitch << 3) + picwidth - 1;
1785         else if (y_pos + pad_height > picheight + 7) start = ref + (picheight + 7 - pad_height) * picpitch + picwidth - 1;
1786         else start = ref + y_pos * picpitch + picwidth - 1;
1787 
1788         // now pad right 8 pixels for pad_height rows */
1789         j = pad_height;
1790         start -= picpitch;
1791         while (j--)
1792         {
1793             word1 = *(start += picpitch);
1794             word1 |= (word1 << 8);
1795             word1 |= (word1 << 16);
1796             *((uint32*)(start + 1)) = word1;
1797             *((uint32*)(start + 5)) = word1;
1798         }
1799     }
1800 
1801     return ;
1802 }
1803 
1804 
eChromaMotionComp(uint8 * ref,int picwidth,int picheight,int x_pos,int y_pos,uint8 * pred,int picpitch,int blkwidth,int blkheight)1805 void eChromaMotionComp(uint8 *ref, int picwidth, int picheight,
1806                        int x_pos, int y_pos,
1807                        uint8 *pred, int picpitch,
1808                        int blkwidth, int blkheight)
1809 {
1810     int dx, dy;
1811     int offset_dx, offset_dy;
1812     int index;
1813 
1814     ePadChroma(ref, picwidth, picheight, picpitch, x_pos, y_pos);
1815 
1816     dx = x_pos & 7;
1817     dy = y_pos & 7;
1818     offset_dx = (dx + 7) >> 3;
1819     offset_dy = (dy + 7) >> 3;
1820     x_pos = x_pos >> 3;  /* round it to full-pel resolution */
1821     y_pos = y_pos >> 3;
1822 
1823     ref += y_pos * picpitch + x_pos;
1824 
1825     index = offset_dx + (offset_dy << 1) + ((blkwidth << 1) & 0x7);
1826 
1827     (*(eChromaMC_SIMD[index]))(ref, picpitch , dx, dy, pred, picpitch, blkwidth, blkheight);
1828     return ;
1829 }
1830 
1831 
1832 /* SIMD routines, unroll the loops in vertical direction, decreasing loops (things to be done) */
eChromaDiagonalMC_SIMD(uint8 * pRef,int srcPitch,int dx,int dy,uint8 * pOut,int predPitch,int blkwidth,int blkheight)1833 void eChromaDiagonalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
1834                             uint8 *pOut, int predPitch, int blkwidth, int blkheight)
1835 {
1836     int32 r0, r1, r2, r3, result0, result1;
1837     uint8 temp[288];
1838     uint8 *ref, *out;
1839     int i, j;
1840     int dx_8 = 8 - dx;
1841     int dy_8 = 8 - dy;
1842 
1843     /* horizontal first */
1844     out = temp;
1845     for (i = 0; i < blkheight + 1; i++)
1846     {
1847         ref = pRef;
1848         r0 = ref[0];
1849         for (j = 0; j < blkwidth; j += 4)
1850         {
1851             r0 |= (ref[2] << 16);
1852             result0 = dx_8 * r0;
1853 
1854             r1 = ref[1] | (ref[3] << 16);
1855             result0 += dx * r1;
1856             *(int32 *)out = result0;
1857 
1858             result0 = dx_8 * r1;
1859 
1860             r2 = ref[4];
1861             r0 = r0 >> 16;
1862             r1 = r0 | (r2 << 16);
1863             result0 += dx * r1;
1864             *(int32 *)(out + 16) = result0;
1865 
1866             ref += 4;
1867             out += 4;
1868             r0 = r2;
1869         }
1870         pRef += srcPitch;
1871         out += (32 - blkwidth);
1872     }
1873 
1874 //  pRef -= srcPitch*(blkheight+1);
1875     ref = temp;
1876 
1877     for (j = 0; j < blkwidth; j += 4)
1878     {
1879         r0 = *(int32 *)ref;
1880         r1 = *(int32 *)(ref + 16);
1881         ref += 32;
1882         out = pOut;
1883         for (i = 0; i < (blkheight >> 1); i++)
1884         {
1885             result0 = dy_8 * r0 + 0x00200020;
1886             r2 = *(int32 *)ref;
1887             result0 += dy * r2;
1888             result0 >>= 6;
1889             result0 &= 0x00FF00FF;
1890             r0 = r2;
1891 
1892             result1 = dy_8 * r1 + 0x00200020;
1893             r3 = *(int32 *)(ref + 16);
1894             result1 += dy * r3;
1895             result1 >>= 6;
1896             result1 &= 0x00FF00FF;
1897             r1 = r3;
1898             *(int32 *)out = result0 | (result1 << 8);
1899             out += predPitch;
1900             ref += 32;
1901 
1902             result0 = dy_8 * r0 + 0x00200020;
1903             r2 = *(int32 *)ref;
1904             result0 += dy * r2;
1905             result0 >>= 6;
1906             result0 &= 0x00FF00FF;
1907             r0 = r2;
1908 
1909             result1 = dy_8 * r1 + 0x00200020;
1910             r3 = *(int32 *)(ref + 16);
1911             result1 += dy * r3;
1912             result1 >>= 6;
1913             result1 &= 0x00FF00FF;
1914             r1 = r3;
1915             *(int32 *)out = result0 | (result1 << 8);
1916             out += predPitch;
1917             ref += 32;
1918         }
1919         pOut += 4;
1920         ref = temp + 4; /* since it can only iterate twice max */
1921     }
1922     return;
1923 }
1924 
eChromaHorizontalMC_SIMD(uint8 * pRef,int srcPitch,int dx,int dy,uint8 * pOut,int predPitch,int blkwidth,int blkheight)1925 void eChromaHorizontalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
1926                               uint8 *pOut, int predPitch, int blkwidth, int blkheight)
1927 {
1928     (void)(dy);
1929 
1930     int32 r0, r1, r2, result0, result1;
1931     uint8 *ref, *out;
1932     int i, j;
1933     int dx_8 = 8 - dx;
1934 
1935     /* horizontal first */
1936     for (i = 0; i < blkheight; i++)
1937     {
1938         ref = pRef;
1939         out = pOut;
1940 
1941         r0 = ref[0];
1942         for (j = 0; j < blkwidth; j += 4)
1943         {
1944             r0 |= (ref[2] << 16);
1945             result0 = dx_8 * r0 + 0x00040004;
1946 
1947             r1 = ref[1] | (ref[3] << 16);
1948             result0 += dx * r1;
1949             result0 >>= 3;
1950             result0 &= 0x00FF00FF;
1951 
1952             result1 = dx_8 * r1 + 0x00040004;
1953 
1954             r2 = ref[4];
1955             r0 = r0 >> 16;
1956             r1 = r0 | (r2 << 16);
1957             result1 += dx * r1;
1958             result1 >>= 3;
1959             result1 &= 0x00FF00FF;
1960 
1961             *(int32 *)out = result0 | (result1 << 8);
1962 
1963             ref += 4;
1964             out += 4;
1965             r0 = r2;
1966         }
1967 
1968         pRef += srcPitch;
1969         pOut += predPitch;
1970     }
1971     return;
1972 }
1973 
eChromaVerticalMC_SIMD(uint8 * pRef,int srcPitch,int dx,int dy,uint8 * pOut,int predPitch,int blkwidth,int blkheight)1974 void eChromaVerticalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
1975                             uint8 *pOut, int predPitch, int blkwidth, int blkheight)
1976 {
1977     (void)(dx);
1978 
1979     int32 r0, r1, r2, r3, result0, result1;
1980     int i, j;
1981     uint8 *ref, *out;
1982     int dy_8 = 8 - dy;
1983     /* vertical first */
1984     for (i = 0; i < blkwidth; i += 4)
1985     {
1986         ref = pRef;
1987         out = pOut;
1988 
1989         r0 = ref[0] | (ref[2] << 16);
1990         r1 = ref[1] | (ref[3] << 16);
1991         ref += srcPitch;
1992         for (j = 0; j < blkheight; j++)
1993         {
1994             result0 = dy_8 * r0 + 0x00040004;
1995             r2 = ref[0] | (ref[2] << 16);
1996             result0 += dy * r2;
1997             result0 >>= 3;
1998             result0 &= 0x00FF00FF;
1999             r0 = r2;
2000 
2001             result1 = dy_8 * r1 + 0x00040004;
2002             r3 = ref[1] | (ref[3] << 16);
2003             result1 += dy * r3;
2004             result1 >>= 3;
2005             result1 &= 0x00FF00FF;
2006             r1 = r3;
2007             *(int32 *)out = result0 | (result1 << 8);
2008             ref += srcPitch;
2009             out += predPitch;
2010         }
2011         pOut += 4;
2012         pRef += 4;
2013     }
2014     return;
2015 }
2016 
eChromaDiagonalMC2_SIMD(uint8 * pRef,int srcPitch,int dx,int dy,uint8 * pOut,int predPitch,int blkwidth,int blkheight)2017 void eChromaDiagonalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
2018                              uint8 *pOut,  int predPitch, int blkwidth, int blkheight)
2019 {
2020     (void)(blkwidth);
2021 
2022     int32 r0, r1, temp0, temp1, result;
2023     int32 temp[9];
2024     int32 *out;
2025     int i, r_temp;
2026     int dy_8 = 8 - dy;
2027 
2028     /* horizontal first */
2029     out = temp;
2030     for (i = 0; i < blkheight + 1; i++)
2031     {
2032         r_temp = pRef[1];
2033         temp0 = (pRef[0] << 3) + dx * (r_temp - pRef[0]);
2034         temp1 = (r_temp << 3) + dx * (pRef[2] - r_temp);
2035         r0 = temp0 | (temp1 << 16);
2036         *out++ = r0;
2037         pRef += srcPitch;
2038     }
2039 
2040     pRef -= srcPitch * (blkheight + 1);
2041 
2042     out = temp;
2043 
2044     r0 = *out++;
2045 
2046     for (i = 0; i < blkheight; i++)
2047     {
2048         result = dy_8 * r0 + 0x00200020;
2049         r1 = *out++;
2050         result += dy * r1;
2051         result >>= 6;
2052         result &= 0x00FF00FF;
2053         *(int16 *)pOut = (result >> 8) | (result & 0xFF);
2054         r0 = r1;
2055         pOut += predPitch;
2056     }
2057     return;
2058 }
2059 
eChromaHorizontalMC2_SIMD(uint8 * pRef,int srcPitch,int dx,int dy,uint8 * pOut,int predPitch,int blkwidth,int blkheight)2060 void eChromaHorizontalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
2061                                uint8 *pOut, int predPitch, int blkwidth, int blkheight)
2062 {
2063     (void)(dy);
2064     (void)(blkwidth);
2065 
2066     int i, temp, temp0, temp1;
2067 
2068     /* horizontal first */
2069     for (i = 0; i < blkheight; i++)
2070     {
2071         temp = pRef[1];
2072         temp0 = ((pRef[0] << 3) + dx * (temp - pRef[0]) + 4) >> 3;
2073         temp1 = ((temp << 3) + dx * (pRef[2] - temp) + 4) >> 3;
2074 
2075         *(int16 *)pOut = temp0 | (temp1 << 8);
2076         pRef += srcPitch;
2077         pOut += predPitch;
2078 
2079     }
2080     return;
2081 }
eChromaVerticalMC2_SIMD(uint8 * pRef,int srcPitch,int dx,int dy,uint8 * pOut,int predPitch,int blkwidth,int blkheight)2082 void eChromaVerticalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
2083                              uint8 *pOut, int predPitch, int blkwidth, int blkheight)
2084 {
2085     (void)(dx);
2086     (void)(blkwidth);
2087 
2088     int32 r0, r1, result;
2089     int i;
2090     int dy_8 = 8 - dy;
2091     r0 = pRef[0] | (pRef[1] << 16);
2092     pRef += srcPitch;
2093     for (i = 0; i < blkheight; i++)
2094     {
2095         result = dy_8 * r0 + 0x00040004;
2096         r1 = pRef[0] | (pRef[1] << 16);
2097         result += dy * r1;
2098         result >>= 3;
2099         result &= 0x00FF00FF;
2100         *(int16 *)pOut = (result >> 8) | (result & 0xFF);
2101         r0 = r1;
2102         pRef += srcPitch;
2103         pOut += predPitch;
2104     }
2105     return;
2106 }
2107 
eChromaFullMC_SIMD(uint8 * pRef,int srcPitch,int dx,int dy,uint8 * pOut,int predPitch,int blkwidth,int blkheight)2108 void eChromaFullMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
2109                         uint8 *pOut, int predPitch, int blkwidth, int blkheight)
2110 {
2111     (void)(dx);
2112     (void)(dy);
2113 
2114     int i, j;
2115     int offset_in = srcPitch - blkwidth;
2116     int offset_out = predPitch - blkwidth;
2117     uint16 temp;
2118     uint8 byte;
2119 
2120     if (((intptr_t)pRef)&1)
2121     {
2122         for (j = blkheight; j > 0; j--)
2123         {
2124             for (i = blkwidth; i > 0; i -= 2)
2125             {
2126                 temp = *pRef++;
2127                 byte = *pRef++;
2128                 temp |= (byte << 8);
2129                 *((uint16*)pOut) = temp; /* write 2 bytes */
2130                 pOut += 2;
2131             }
2132             pOut += offset_out;
2133             pRef += offset_in;
2134         }
2135     }
2136     else
2137     {
2138         for (j = blkheight; j > 0; j--)
2139         {
2140             for (i = blkwidth; i > 0; i -= 2)
2141             {
2142                 temp = *((uint16*)pRef);
2143                 *((uint16*)pOut) = temp;
2144                 pRef += 2;
2145                 pOut += 2;
2146             }
2147             pOut += offset_out;
2148             pRef += offset_in;
2149         }
2150     }
2151     return ;
2152 }
2153