1 /* ------------------------------------------------------------------
2 * Copyright (C) 1998-2009 PacketVideo
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
13 * express or implied.
14 * See the License for the specific language governing permissions
15 * and limitations under the License.
16 * -------------------------------------------------------------------
17 */
18 #include "avcenc_lib.h"
19 #include "avcenc_int.h"
20
21
22 #define CLIP_RESULT(x) if((uint)x > 0xFF){ \
23 x = 0xFF & (~(x>>31));}
24
25 /* (blkwidth << 2) + (dy << 1) + dx */
26 static void (*const eChromaMC_SIMD[8])(uint8 *, int , int , int , uint8 *, int, int , int) =
27 {
28 &eChromaFullMC_SIMD,
29 &eChromaHorizontalMC_SIMD,
30 &eChromaVerticalMC_SIMD,
31 &eChromaDiagonalMC_SIMD,
32 &eChromaFullMC_SIMD,
33 &eChromaHorizontalMC2_SIMD,
34 &eChromaVerticalMC2_SIMD,
35 &eChromaDiagonalMC2_SIMD
36 };
37 /* Perform motion prediction and compensation with residue if exist. */
AVCMBMotionComp(AVCEncObject * encvid,AVCCommonObj * video)38 void AVCMBMotionComp(AVCEncObject *encvid, AVCCommonObj *video)
39 {
40 (void)(encvid);
41
42 AVCMacroblock *currMB = video->currMB;
43 AVCPictureData *currPic = video->currPic;
44 int mbPartIdx, subMbPartIdx;
45 int ref_idx;
46 int offset_MbPart_indx = 0;
47 int16 *mv;
48 uint32 x_pos, y_pos;
49 uint8 *curL, *curCb, *curCr;
50 uint8 *ref_l, *ref_Cb, *ref_Cr;
51 uint8 *predBlock, *predCb, *predCr;
52 int block_x, block_y, offset_x, offset_y, offsetP, offset;
53 int x_position = (video->mb_x << 4);
54 int y_position = (video->mb_y << 4);
55 int MbHeight, MbWidth, mbPartIdx_X, mbPartIdx_Y, offset_indx;
56 int picWidth = currPic->width;
57 int picPitch = currPic->pitch;
58 int picHeight = currPic->height;
59 uint32 tmp_word;
60
61 tmp_word = y_position * picPitch;
62 curL = currPic->Sl + tmp_word + x_position;
63 offset = (tmp_word >> 2) + (x_position >> 1);
64 curCb = currPic->Scb + offset;
65 curCr = currPic->Scr + offset;
66
67 predBlock = curL;
68 predCb = curCb;
69 predCr = curCr;
70
71 GetMotionVectorPredictor(video, 1);
72
73 for (mbPartIdx = 0; mbPartIdx < currMB->NumMbPart; mbPartIdx++)
74 {
75 MbHeight = currMB->SubMbPartHeight[mbPartIdx];
76 MbWidth = currMB->SubMbPartWidth[mbPartIdx];
77 mbPartIdx_X = ((mbPartIdx + offset_MbPart_indx) & 1);
78 mbPartIdx_Y = (mbPartIdx + offset_MbPart_indx) >> 1;
79 ref_idx = currMB->ref_idx_L0[(mbPartIdx_Y << 1) + mbPartIdx_X];
80 offset_indx = 0;
81
82 ref_l = video->RefPicList0[ref_idx]->Sl;
83 ref_Cb = video->RefPicList0[ref_idx]->Scb;
84 ref_Cr = video->RefPicList0[ref_idx]->Scr;
85
86 for (subMbPartIdx = 0; subMbPartIdx < currMB->NumSubMbPart[mbPartIdx]; subMbPartIdx++)
87 {
88 block_x = (mbPartIdx_X << 1) + ((subMbPartIdx + offset_indx) & 1);
89 block_y = (mbPartIdx_Y << 1) + (((subMbPartIdx + offset_indx) >> 1) & 1);
90 mv = (int16*)(currMB->mvL0 + block_x + (block_y << 2));
91 offset_x = x_position + (block_x << 2);
92 offset_y = y_position + (block_y << 2);
93 x_pos = (offset_x << 2) + *mv++; /*quarter pel */
94 y_pos = (offset_y << 2) + *mv; /*quarter pel */
95
96 //offset = offset_y * currPic->width;
97 //offsetC = (offset >> 2) + (offset_x >> 1);
98 offsetP = (block_y << 2) * picPitch + (block_x << 2);
99 eLumaMotionComp(ref_l, picPitch, picHeight, x_pos, y_pos,
100 /*comp_Sl + offset + offset_x,*/
101 predBlock + offsetP, picPitch, MbWidth, MbHeight);
102
103 offsetP = (block_y * picWidth) + (block_x << 1);
104 eChromaMotionComp(ref_Cb, picWidth >> 1, picHeight >> 1, x_pos, y_pos,
105 /*comp_Scb + offsetC,*/
106 predCb + offsetP, picPitch >> 1, MbWidth >> 1, MbHeight >> 1);
107 eChromaMotionComp(ref_Cr, picWidth >> 1, picHeight >> 1, x_pos, y_pos,
108 /*comp_Scr + offsetC,*/
109 predCr + offsetP, picPitch >> 1, MbWidth >> 1, MbHeight >> 1);
110
111 offset_indx = currMB->SubMbPartWidth[mbPartIdx] >> 3;
112 }
113 offset_MbPart_indx = currMB->MbPartWidth >> 4;
114 }
115
116 return ;
117 }
118
119
120 /* preform the actual motion comp here */
eLumaMotionComp(uint8 * ref,int picpitch,int picheight,int x_pos,int y_pos,uint8 * pred,int pred_pitch,int blkwidth,int blkheight)121 void eLumaMotionComp(uint8 *ref, int picpitch, int picheight,
122 int x_pos, int y_pos,
123 uint8 *pred, int pred_pitch,
124 int blkwidth, int blkheight)
125 {
126 (void)(picheight);
127
128 int dx, dy;
129 int temp2[21][21]; /* for intermediate results */
130 uint8 *ref2;
131
132 dx = x_pos & 3;
133 dy = y_pos & 3;
134 x_pos = x_pos >> 2; /* round it to full-pel resolution */
135 y_pos = y_pos >> 2;
136
137 /* perform actual motion compensation */
138 if (dx == 0 && dy == 0)
139 { /* fullpel position *//* G */
140
141 ref += y_pos * picpitch + x_pos;
142
143 eFullPelMC(ref, picpitch, pred, pred_pitch, blkwidth, blkheight);
144
145 } /* other positions */
146 else if (dy == 0)
147 { /* no vertical interpolation *//* a,b,c*/
148
149 ref += y_pos * picpitch + x_pos;
150
151 eHorzInterp1MC(ref, picpitch, pred, pred_pitch, blkwidth, blkheight, dx);
152 }
153 else if (dx == 0)
154 { /*no horizontal interpolation *//* d,h,n */
155
156 ref += y_pos * picpitch + x_pos;
157
158 eVertInterp1MC(ref, picpitch, pred, pred_pitch, blkwidth, blkheight, dy);
159 }
160 else if (dy == 2)
161 { /* horizontal cross *//* i, j, k */
162
163 ref += y_pos * picpitch + x_pos - 2; /* move to the left 2 pixels */
164
165 eVertInterp2MC(ref, picpitch, &temp2[0][0], 21, blkwidth + 5, blkheight);
166
167 eHorzInterp2MC(&temp2[0][2], 21, pred, pred_pitch, blkwidth, blkheight, dx);
168 }
169 else if (dx == 2)
170 { /* vertical cross */ /* f,q */
171
172 ref += (y_pos - 2) * picpitch + x_pos; /* move to up 2 lines */
173
174 eHorzInterp3MC(ref, picpitch, &temp2[0][0], 21, blkwidth, blkheight + 5);
175 eVertInterp3MC(&temp2[2][0], 21, pred, pred_pitch, blkwidth, blkheight, dy);
176 }
177 else
178 { /* diagonal *//* e,g,p,r */
179
180 ref2 = ref + (y_pos + (dy / 2)) * picpitch + x_pos;
181
182 ref += (y_pos * picpitch) + x_pos + (dx / 2);
183
184 eDiagonalInterpMC(ref2, ref, picpitch, pred, pred_pitch, blkwidth, blkheight);
185 }
186
187 return ;
188 }
189
eCreateAlign(uint8 * ref,int picpitch,int y_pos,uint8 * out,int blkwidth,int blkheight)190 void eCreateAlign(uint8 *ref, int picpitch, int y_pos,
191 uint8 *out, int blkwidth, int blkheight)
192 {
193 int i, j;
194 int offset, out_offset;
195 uint32 prev_pix, result, pix1, pix2, pix4;
196
197 ref += y_pos * picpitch;// + x_pos;
198 out_offset = 24 - blkwidth;
199
200 //switch(x_pos&0x3){
201 switch (((intptr_t)ref)&0x3)
202 {
203 case 1:
204 offset = picpitch - blkwidth - 3;
205 for (j = 0; j < blkheight; j++)
206 {
207 pix1 = *ref++;
208 pix2 = *((uint16*)ref);
209 ref += 2;
210 result = (pix2 << 8) | pix1;
211
212 for (i = 3; i < blkwidth; i += 4)
213 {
214 pix4 = *((uint32*)ref);
215 ref += 4;
216 prev_pix = (pix4 << 24) & 0xFF000000; /* mask out byte belong to previous word */
217 result |= prev_pix;
218 *((uint32*)out) = result; /* write 4 bytes */
219 out += 4;
220 result = pix4 >> 8; /* for the next loop */
221 }
222 ref += offset;
223 out += out_offset;
224 }
225 break;
226 case 2:
227 offset = picpitch - blkwidth - 2;
228 for (j = 0; j < blkheight; j++)
229 {
230 result = *((uint16*)ref);
231 ref += 2;
232 for (i = 2; i < blkwidth; i += 4)
233 {
234 pix4 = *((uint32*)ref);
235 ref += 4;
236 prev_pix = (pix4 << 16) & 0xFFFF0000; /* mask out byte belong to previous word */
237 result |= prev_pix;
238 *((uint32*)out) = result; /* write 4 bytes */
239 out += 4;
240 result = pix4 >> 16; /* for the next loop */
241 }
242 ref += offset;
243 out += out_offset;
244 }
245 break;
246 case 3:
247 offset = picpitch - blkwidth - 1;
248 for (j = 0; j < blkheight; j++)
249 {
250 result = *ref++;
251 for (i = 1; i < blkwidth; i += 4)
252 {
253 pix4 = *((uint32*)ref);
254 ref += 4;
255 prev_pix = (pix4 << 8) & 0xFFFFFF00; /* mask out byte belong to previous word */
256 result |= prev_pix;
257 *((uint32*)out) = result; /* write 4 bytes */
258 out += 4;
259 result = pix4 >> 24; /* for the next loop */
260 }
261 ref += offset;
262 out += out_offset;
263 }
264 break;
265 }
266 }
267
eHorzInterp1MC(uint8 * in,int inpitch,uint8 * out,int outpitch,int blkwidth,int blkheight,int dx)268 void eHorzInterp1MC(uint8 *in, int inpitch, uint8 *out, int outpitch,
269 int blkwidth, int blkheight, int dx)
270 {
271 uint8 *p_ref, *tmp;
272 uint32 *p_cur;
273 uint32 pkres;
274 int result, curr_offset, ref_offset;
275 int j;
276 int32 r0, r1, r2, r3, r4, r5;
277 int32 r13, r6;
278
279 p_cur = (uint32*)out; /* assume it's word aligned */
280 curr_offset = (outpitch - blkwidth) >> 2;
281 p_ref = in;
282 ref_offset = inpitch - blkwidth;
283
284 if (dx&1)
285 {
286 dx = ((dx >> 1) ? -3 : -4); /* use in 3/4 pel */
287 p_ref -= 2;
288 r13 = 0;
289 for (j = blkheight; j > 0; j--)
290 {
291 tmp = p_ref + blkwidth;
292 r0 = p_ref[0];
293 r1 = p_ref[2];
294 r0 |= (r1 << 16); /* 0,c,0,a */
295 r1 = p_ref[1];
296 r2 = p_ref[3];
297 r1 |= (r2 << 16); /* 0,d,0,b */
298 while (p_ref < tmp)
299 {
300 r2 = *(p_ref += 4); /* move pointer to e */
301 r3 = p_ref[2];
302 r2 |= (r3 << 16); /* 0,g,0,e */
303 r3 = p_ref[1];
304 r4 = p_ref[3];
305 r3 |= (r4 << 16); /* 0,h,0,f */
306
307 r4 = r0 + r3; /* c+h, a+f */
308 r5 = r0 + r1; /* c+d, a+b */
309 r6 = r2 + r3; /* g+h, e+f */
310 r5 >>= 16;
311 r5 |= (r6 << 16); /* e+f, c+d */
312 r4 += r5 * 20; /* c+20*e+20*f+h, a+20*c+20*d+f */
313 r4 += 0x100010; /* +16, +16 */
314 r5 = r1 + r2; /* d+g, b+e */
315 r4 -= r5 * 5; /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */
316 r4 >>= 5;
317 r13 |= r4; /* check clipping */
318
319 r5 = p_ref[dx+2];
320 r6 = p_ref[dx+4];
321 r5 |= (r6 << 16);
322 r4 += r5;
323 r4 += 0x10001;
324 r4 = (r4 >> 1) & 0xFF00FF;
325
326 r5 = p_ref[4]; /* i */
327 r6 = (r5 << 16);
328 r5 = r6 | (r2 >> 16);/* 0,i,0,g */
329 r5 += r1; /* d+i, b+g */ /* r5 not free */
330 r1 >>= 16;
331 r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */
332 r1 += r2; /* f+g, d+e */
333 r5 += 20 * r1; /* d+20f+20g+i, b+20d+20e+g */
334 r0 >>= 16;
335 r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */
336 r0 += r3; /* e+h, c+f */
337 r5 += 0x100010; /* 16,16 */
338 r5 -= r0 * 5; /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */
339 r5 >>= 5;
340 r13 |= r5; /* check clipping */
341
342 r0 = p_ref[dx+3];
343 r1 = p_ref[dx+5];
344 r0 |= (r1 << 16);
345 r5 += r0;
346 r5 += 0x10001;
347 r5 = (r5 >> 1) & 0xFF00FF;
348
349 r4 |= (r5 << 8); /* pack them together */
350 *p_cur++ = r4;
351 r1 = r3;
352 r0 = r2;
353 }
354 p_cur += curr_offset; /* move to the next line */
355 p_ref += ref_offset; /* ref_offset = inpitch-blkwidth; */
356
357 if (r13&0xFF000700) /* need clipping */
358 {
359 /* move back to the beginning of the line */
360 p_ref -= (ref_offset + blkwidth); /* input */
361 p_cur -= (outpitch >> 2);
362
363 tmp = p_ref + blkwidth;
364 for (; p_ref < tmp;)
365 {
366
367 r0 = *p_ref++;
368 r1 = *p_ref++;
369 r2 = *p_ref++;
370 r3 = *p_ref++;
371 r4 = *p_ref++;
372 /* first pixel */
373 r5 = *p_ref++;
374 result = (r0 + r5);
375 r0 = (r1 + r4);
376 result -= (r0 * 5);//result -= r0; result -= (r0<<2);
377 r0 = (r2 + r3);
378 result += (r0 * 20);//result += (r0<<4); result += (r0<<2);
379 result = (result + 16) >> 5;
380 CLIP_RESULT(result)
381 /* 3/4 pel, no need to clip */
382 result = (result + p_ref[dx] + 1);
383 pkres = (result >> 1) ;
384 /* second pixel */
385 r0 = *p_ref++;
386 result = (r1 + r0);
387 r1 = (r2 + r5);
388 result -= (r1 * 5);//result -= r1; result -= (r1<<2);
389 r1 = (r3 + r4);
390 result += (r1 * 20);//result += (r1<<4); result += (r1<<2);
391 result = (result + 16) >> 5;
392 CLIP_RESULT(result)
393 /* 3/4 pel, no need to clip */
394 result = (result + p_ref[dx] + 1);
395 result = (result >> 1);
396 pkres |= (result << 8);
397 /* third pixel */
398 r1 = *p_ref++;
399 result = (r2 + r1);
400 r2 = (r3 + r0);
401 result -= (r2 * 5);//result -= r2; result -= (r2<<2);
402 r2 = (r4 + r5);
403 result += (r2 * 20);//result += (r2<<4); result += (r2<<2);
404 result = (result + 16) >> 5;
405 CLIP_RESULT(result)
406 /* 3/4 pel, no need to clip */
407 result = (result + p_ref[dx] + 1);
408 result = (result >> 1);
409 pkres |= (result << 16);
410 /* fourth pixel */
411 r2 = *p_ref++;
412 result = (r3 + r2);
413 r3 = (r4 + r1);
414 result -= (r3 * 5);//result -= r3; result -= (r3<<2);
415 r3 = (r5 + r0);
416 result += (r3 * 20);//result += (r3<<4); result += (r3<<2);
417 result = (result + 16) >> 5;
418 CLIP_RESULT(result)
419 /* 3/4 pel, no need to clip */
420 result = (result + p_ref[dx] + 1);
421 result = (result >> 1);
422 pkres |= (result << 24);
423 *p_cur++ = pkres; /* write 4 pixels */
424 p_ref -= 5; /* offset back to the middle of filter */
425 }
426 p_cur += curr_offset; /* move to the next line */
427 p_ref += ref_offset; /* move to the next line */
428 }
429 }
430 }
431 else
432 {
433 p_ref -= 2;
434 r13 = 0;
435 for (j = blkheight; j > 0; j--)
436 {
437 tmp = p_ref + blkwidth;
438 r0 = p_ref[0];
439 r1 = p_ref[2];
440 r0 |= (r1 << 16); /* 0,c,0,a */
441 r1 = p_ref[1];
442 r2 = p_ref[3];
443 r1 |= (r2 << 16); /* 0,d,0,b */
444 while (p_ref < tmp)
445 {
446 r2 = *(p_ref += 4); /* move pointer to e */
447 r3 = p_ref[2];
448 r2 |= (r3 << 16); /* 0,g,0,e */
449 r3 = p_ref[1];
450 r4 = p_ref[3];
451 r3 |= (r4 << 16); /* 0,h,0,f */
452
453 r4 = r0 + r3; /* c+h, a+f */
454 r5 = r0 + r1; /* c+d, a+b */
455 r6 = r2 + r3; /* g+h, e+f */
456 r5 >>= 16;
457 r5 |= (r6 << 16); /* e+f, c+d */
458 r4 += r5 * 20; /* c+20*e+20*f+h, a+20*c+20*d+f */
459 r4 += 0x100010; /* +16, +16 */
460 r5 = r1 + r2; /* d+g, b+e */
461 r4 -= r5 * 5; /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */
462 r4 >>= 5;
463 r13 |= r4; /* check clipping */
464 r4 &= 0xFF00FF; /* mask */
465
466 r5 = p_ref[4]; /* i */
467 r6 = (r5 << 16);
468 r5 = r6 | (r2 >> 16);/* 0,i,0,g */
469 r5 += r1; /* d+i, b+g */ /* r5 not free */
470 r1 >>= 16;
471 r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */
472 r1 += r2; /* f+g, d+e */
473 r5 += 20 * r1; /* d+20f+20g+i, b+20d+20e+g */
474 r0 >>= 16;
475 r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */
476 r0 += r3; /* e+h, c+f */
477 r5 += 0x100010; /* 16,16 */
478 r5 -= r0 * 5; /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */
479 r5 >>= 5;
480 r13 |= r5; /* check clipping */
481 r5 &= 0xFF00FF; /* mask */
482
483 r4 |= (r5 << 8); /* pack them together */
484 *p_cur++ = r4;
485 r1 = r3;
486 r0 = r2;
487 }
488 p_cur += curr_offset; /* move to the next line */
489 p_ref += ref_offset; /* ref_offset = inpitch-blkwidth; */
490
491 if (r13&0xFF000700) /* need clipping */
492 {
493 /* move back to the beginning of the line */
494 p_ref -= (ref_offset + blkwidth); /* input */
495 p_cur -= (outpitch >> 2);
496
497 tmp = p_ref + blkwidth;
498 for (; p_ref < tmp;)
499 {
500
501 r0 = *p_ref++;
502 r1 = *p_ref++;
503 r2 = *p_ref++;
504 r3 = *p_ref++;
505 r4 = *p_ref++;
506 /* first pixel */
507 r5 = *p_ref++;
508 result = (r0 + r5);
509 r0 = (r1 + r4);
510 result -= (r0 * 5);//result -= r0; result -= (r0<<2);
511 r0 = (r2 + r3);
512 result += (r0 * 20);//result += (r0<<4); result += (r0<<2);
513 result = (result + 16) >> 5;
514 CLIP_RESULT(result)
515 pkres = result;
516 /* second pixel */
517 r0 = *p_ref++;
518 result = (r1 + r0);
519 r1 = (r2 + r5);
520 result -= (r1 * 5);//result -= r1; result -= (r1<<2);
521 r1 = (r3 + r4);
522 result += (r1 * 20);//result += (r1<<4); result += (r1<<2);
523 result = (result + 16) >> 5;
524 CLIP_RESULT(result)
525 pkres |= (result << 8);
526 /* third pixel */
527 r1 = *p_ref++;
528 result = (r2 + r1);
529 r2 = (r3 + r0);
530 result -= (r2 * 5);//result -= r2; result -= (r2<<2);
531 r2 = (r4 + r5);
532 result += (r2 * 20);//result += (r2<<4); result += (r2<<2);
533 result = (result + 16) >> 5;
534 CLIP_RESULT(result)
535 pkres |= (result << 16);
536 /* fourth pixel */
537 r2 = *p_ref++;
538 result = (r3 + r2);
539 r3 = (r4 + r1);
540 result -= (r3 * 5);//result -= r3; result -= (r3<<2);
541 r3 = (r5 + r0);
542 result += (r3 * 20);//result += (r3<<4); result += (r3<<2);
543 result = (result + 16) >> 5;
544 CLIP_RESULT(result)
545 pkres |= (result << 24);
546 *p_cur++ = pkres; /* write 4 pixels */
547 p_ref -= 5;
548 }
549 p_cur += curr_offset; /* move to the next line */
550 p_ref += ref_offset;
551 }
552 }
553 }
554
555 return ;
556 }
557
eHorzInterp2MC(int * in,int inpitch,uint8 * out,int outpitch,int blkwidth,int blkheight,int dx)558 void eHorzInterp2MC(int *in, int inpitch, uint8 *out, int outpitch,
559 int blkwidth, int blkheight, int dx)
560 {
561 int *p_ref, *tmp;
562 uint32 *p_cur;
563 uint32 pkres;
564 int result, result2, curr_offset, ref_offset;
565 int j, r0, r1, r2, r3, r4, r5;
566
567 p_cur = (uint32*)out; /* assume it's word aligned */
568 curr_offset = (outpitch - blkwidth) >> 2;
569 p_ref = in;
570 ref_offset = inpitch - blkwidth;
571
572 if (dx&1)
573 {
574 dx = ((dx >> 1) ? -3 : -4); /* use in 3/4 pel */
575
576 for (j = blkheight; j > 0 ; j--)
577 {
578 tmp = p_ref + blkwidth;
579 for (; p_ref < tmp;)
580 {
581
582 r0 = p_ref[-2];
583 r1 = p_ref[-1];
584 r2 = *p_ref++;
585 r3 = *p_ref++;
586 r4 = *p_ref++;
587 /* first pixel */
588 r5 = *p_ref++;
589 result = (r0 + r5);
590 r0 = (r1 + r4);
591 result -= (r0 * 5);//result -= r0; result -= (r0<<2);
592 r0 = (r2 + r3);
593 result += (r0 * 20);//result += (r0<<4); result += (r0<<2);
594 result = (result + 512) >> 10;
595 CLIP_RESULT(result)
596 result2 = ((p_ref[dx] + 16) >> 5);
597 CLIP_RESULT(result2)
598 /* 3/4 pel, no need to clip */
599 result = (result + result2 + 1);
600 pkres = (result >> 1);
601 /* second pixel */
602 r0 = *p_ref++;
603 result = (r1 + r0);
604 r1 = (r2 + r5);
605 result -= (r1 * 5);//result -= r1; result -= (r1<<2);
606 r1 = (r3 + r4);
607 result += (r1 * 20);//result += (r1<<4); result += (r1<<2);
608 result = (result + 512) >> 10;
609 CLIP_RESULT(result)
610 result2 = ((p_ref[dx] + 16) >> 5);
611 CLIP_RESULT(result2)
612 /* 3/4 pel, no need to clip */
613 result = (result + result2 + 1);
614 result = (result >> 1);
615 pkres |= (result << 8);
616 /* third pixel */
617 r1 = *p_ref++;
618 result = (r2 + r1);
619 r2 = (r3 + r0);
620 result -= (r2 * 5);//result -= r2; result -= (r2<<2);
621 r2 = (r4 + r5);
622 result += (r2 * 20);//result += (r2<<4); result += (r2<<2);
623 result = (result + 512) >> 10;
624 CLIP_RESULT(result)
625 result2 = ((p_ref[dx] + 16) >> 5);
626 CLIP_RESULT(result2)
627 /* 3/4 pel, no need to clip */
628 result = (result + result2 + 1);
629 result = (result >> 1);
630 pkres |= (result << 16);
631 /* fourth pixel */
632 r2 = *p_ref++;
633 result = (r3 + r2);
634 r3 = (r4 + r1);
635 result -= (r3 * 5);//result -= r3; result -= (r3<<2);
636 r3 = (r5 + r0);
637 result += (r3 * 20);//result += (r3<<4); result += (r3<<2);
638 result = (result + 512) >> 10;
639 CLIP_RESULT(result)
640 result2 = ((p_ref[dx] + 16) >> 5);
641 CLIP_RESULT(result2)
642 /* 3/4 pel, no need to clip */
643 result = (result + result2 + 1);
644 result = (result >> 1);
645 pkres |= (result << 24);
646 *p_cur++ = pkres; /* write 4 pixels */
647 p_ref -= 3; /* offset back to the middle of filter */
648 }
649 p_cur += curr_offset; /* move to the next line */
650 p_ref += ref_offset; /* move to the next line */
651 }
652 }
653 else
654 {
655 for (j = blkheight; j > 0 ; j--)
656 {
657 tmp = p_ref + blkwidth;
658 for (; p_ref < tmp;)
659 {
660
661 r0 = p_ref[-2];
662 r1 = p_ref[-1];
663 r2 = *p_ref++;
664 r3 = *p_ref++;
665 r4 = *p_ref++;
666 /* first pixel */
667 r5 = *p_ref++;
668 result = (r0 + r5);
669 r0 = (r1 + r4);
670 result -= (r0 * 5);//result -= r0; result -= (r0<<2);
671 r0 = (r2 + r3);
672 result += (r0 * 20);//result += (r0<<4); result += (r0<<2);
673 result = (result + 512) >> 10;
674 CLIP_RESULT(result)
675 pkres = result;
676 /* second pixel */
677 r0 = *p_ref++;
678 result = (r1 + r0);
679 r1 = (r2 + r5);
680 result -= (r1 * 5);//result -= r1; result -= (r1<<2);
681 r1 = (r3 + r4);
682 result += (r1 * 20);//result += (r1<<4); result += (r1<<2);
683 result = (result + 512) >> 10;
684 CLIP_RESULT(result)
685 pkres |= (result << 8);
686 /* third pixel */
687 r1 = *p_ref++;
688 result = (r2 + r1);
689 r2 = (r3 + r0);
690 result -= (r2 * 5);//result -= r2; result -= (r2<<2);
691 r2 = (r4 + r5);
692 result += (r2 * 20);//result += (r2<<4); result += (r2<<2);
693 result = (result + 512) >> 10;
694 CLIP_RESULT(result)
695 pkres |= (result << 16);
696 /* fourth pixel */
697 r2 = *p_ref++;
698 result = (r3 + r2);
699 r3 = (r4 + r1);
700 result -= (r3 * 5);//result -= r3; result -= (r3<<2);
701 r3 = (r5 + r0);
702 result += (r3 * 20);//result += (r3<<4); result += (r3<<2);
703 result = (result + 512) >> 10;
704 CLIP_RESULT(result)
705 pkres |= (result << 24);
706 *p_cur++ = pkres; /* write 4 pixels */
707 p_ref -= 3; /* offset back to the middle of filter */
708 }
709 p_cur += curr_offset; /* move to the next line */
710 p_ref += ref_offset; /* move to the next line */
711 }
712 }
713
714 return ;
715 }
716
eHorzInterp3MC(uint8 * in,int inpitch,int * out,int outpitch,int blkwidth,int blkheight)717 void eHorzInterp3MC(uint8 *in, int inpitch, int *out, int outpitch,
718 int blkwidth, int blkheight)
719 {
720 uint8 *p_ref, *tmp;
721 int *p_cur;
722 int result, curr_offset, ref_offset;
723 int j, r0, r1, r2, r3, r4, r5;
724
725 p_cur = out;
726 curr_offset = (outpitch - blkwidth);
727 p_ref = in;
728 ref_offset = inpitch - blkwidth;
729
730 for (j = blkheight; j > 0 ; j--)
731 {
732 tmp = p_ref + blkwidth;
733 for (; p_ref < tmp;)
734 {
735
736 r0 = p_ref[-2];
737 r1 = p_ref[-1];
738 r2 = *p_ref++;
739 r3 = *p_ref++;
740 r4 = *p_ref++;
741 /* first pixel */
742 r5 = *p_ref++;
743 result = (r0 + r5);
744 r0 = (r1 + r4);
745 result -= (r0 * 5);//result -= r0; result -= (r0<<2);
746 r0 = (r2 + r3);
747 result += (r0 * 20);//result += (r0<<4); result += (r0<<2);
748 *p_cur++ = result;
749 /* second pixel */
750 r0 = *p_ref++;
751 result = (r1 + r0);
752 r1 = (r2 + r5);
753 result -= (r1 * 5);//result -= r1; result -= (r1<<2);
754 r1 = (r3 + r4);
755 result += (r1 * 20);//result += (r1<<4); result += (r1<<2);
756 *p_cur++ = result;
757 /* third pixel */
758 r1 = *p_ref++;
759 result = (r2 + r1);
760 r2 = (r3 + r0);
761 result -= (r2 * 5);//result -= r2; result -= (r2<<2);
762 r2 = (r4 + r5);
763 result += (r2 * 20);//result += (r2<<4); result += (r2<<2);
764 *p_cur++ = result;
765 /* fourth pixel */
766 r2 = *p_ref++;
767 result = (r3 + r2);
768 r3 = (r4 + r1);
769 result -= (r3 * 5);//result -= r3; result -= (r3<<2);
770 r3 = (r5 + r0);
771 result += (r3 * 20);//result += (r3<<4); result += (r3<<2);
772 *p_cur++ = result;
773 p_ref -= 3; /* move back to the middle of the filter */
774 }
775 p_cur += curr_offset; /* move to the next line */
776 p_ref += ref_offset;
777 }
778
779 return ;
780 }
eVertInterp1MC(uint8 * in,int inpitch,uint8 * out,int outpitch,int blkwidth,int blkheight,int dy)781 void eVertInterp1MC(uint8 *in, int inpitch, uint8 *out, int outpitch,
782 int blkwidth, int blkheight, int dy)
783 {
784 uint8 *p_cur, *p_ref, *tmp;
785 int result, curr_offset, ref_offset;
786 int j, i;
787 int32 r0, r1, r2, r3, r4, r5, r6, r7, r8, r13;
788 uint8 tmp_in[24][24];
789
790 /* not word-aligned */
791 if (((intptr_t)in)&0x3)
792 {
793 eCreateAlign(in, inpitch, -2, &tmp_in[0][0], blkwidth, blkheight + 5);
794 in = &tmp_in[2][0];
795 inpitch = 24;
796 }
797 p_cur = out;
798 curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */
799 ref_offset = blkheight * inpitch; /* for limit */
800
801 curr_offset += 3;
802
803 if (dy&1)
804 {
805 dy = (dy >> 1) ? 0 : -inpitch;
806
807 for (j = 0; j < blkwidth; j += 4, in += 4)
808 {
809 r13 = 0;
810 p_ref = in;
811 p_cur -= outpitch; /* compensate for the first offset */
812 tmp = p_ref + ref_offset; /* limit */
813 while (p_ref < tmp) /* the loop un-rolled */
814 {
815 r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */
816 p_ref += inpitch;
817 r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */
818 r0 &= 0xFF00FF;
819
820 r1 = *((uint32*)(p_ref + (inpitch << 1))); /* r1, r7, ref[3] */
821 r7 = (r1 >> 8) & 0xFF00FF;
822 r1 &= 0xFF00FF;
823
824 r0 += r1;
825 r6 += r7;
826
827 r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */
828 r8 = (r2 >> 8) & 0xFF00FF;
829 r2 &= 0xFF00FF;
830
831 r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */
832 r7 = (r1 >> 8) & 0xFF00FF;
833 r1 &= 0xFF00FF;
834 r1 += r2;
835
836 r7 += r8;
837
838 r0 += 20 * r1;
839 r6 += 20 * r7;
840 r0 += 0x100010;
841 r6 += 0x100010;
842
843 r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */
844 r8 = (r2 >> 8) & 0xFF00FF;
845 r2 &= 0xFF00FF;
846
847 r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */
848 r7 = (r1 >> 8) & 0xFF00FF;
849 r1 &= 0xFF00FF;
850 r1 += r2;
851
852 r7 += r8;
853
854 r0 -= 5 * r1;
855 r6 -= 5 * r7;
856
857 r0 >>= 5;
858 r6 >>= 5;
859 /* clip */
860 r13 |= r6;
861 r13 |= r0;
862 //CLIPPACK(r6,result)
863
864 r1 = *((uint32*)(p_ref + dy));
865 r2 = (r1 >> 8) & 0xFF00FF;
866 r1 &= 0xFF00FF;
867 r0 += r1;
868 r6 += r2;
869 r0 += 0x10001;
870 r6 += 0x10001;
871 r0 = (r0 >> 1) & 0xFF00FF;
872 r6 = (r6 >> 1) & 0xFF00FF;
873
874 r0 |= (r6 << 8); /* pack it back */
875 *((uint32*)(p_cur += outpitch)) = r0;
876 }
877 p_cur += curr_offset; /* offset to the next pixel */
878 if (r13 & 0xFF000700) /* this column need clipping */
879 {
880 p_cur -= 4;
881 for (i = 0; i < 4; i++)
882 {
883 p_ref = in + i;
884 p_cur -= outpitch; /* compensate for the first offset */
885
886 tmp = p_ref + ref_offset; /* limit */
887 while (p_ref < tmp)
888 { /* loop un-rolled */
889 r0 = *(p_ref - (inpitch << 1));
890 r1 = *(p_ref - inpitch);
891 r2 = *p_ref;
892 r3 = *(p_ref += inpitch); /* modify pointer before loading */
893 r4 = *(p_ref += inpitch);
894 /* first pixel */
895 r5 = *(p_ref += inpitch);
896 result = (r0 + r5);
897 r0 = (r1 + r4);
898 result -= (r0 * 5);//result -= r0; result -= (r0<<2);
899 r0 = (r2 + r3);
900 result += (r0 * 20);//result += (r0<<4); result += (r0<<2);
901 result = (result + 16) >> 5;
902 CLIP_RESULT(result)
903 /* 3/4 pel, no need to clip */
904 result = (result + p_ref[dy-(inpitch<<1)] + 1);
905 result = (result >> 1);
906 *(p_cur += outpitch) = result;
907 /* second pixel */
908 r0 = *(p_ref += inpitch);
909 result = (r1 + r0);
910 r1 = (r2 + r5);
911 result -= (r1 * 5);//result -= r1; result -= (r1<<2);
912 r1 = (r3 + r4);
913 result += (r1 * 20);//result += (r1<<4); result += (r1<<2);
914 result = (result + 16) >> 5;
915 CLIP_RESULT(result)
916 /* 3/4 pel, no need to clip */
917 result = (result + p_ref[dy-(inpitch<<1)] + 1);
918 result = (result >> 1);
919 *(p_cur += outpitch) = result;
920 /* third pixel */
921 r1 = *(p_ref += inpitch);
922 result = (r2 + r1);
923 r2 = (r3 + r0);
924 result -= (r2 * 5);//result -= r2; result -= (r2<<2);
925 r2 = (r4 + r5);
926 result += (r2 * 20);//result += (r2<<4); result += (r2<<2);
927 result = (result + 16) >> 5;
928 CLIP_RESULT(result)
929 /* 3/4 pel, no need to clip */
930 result = (result + p_ref[dy-(inpitch<<1)] + 1);
931 result = (result >> 1);
932 *(p_cur += outpitch) = result;
933 /* fourth pixel */
934 r2 = *(p_ref += inpitch);
935 result = (r3 + r2);
936 r3 = (r4 + r1);
937 result -= (r3 * 5);//result -= r3; result -= (r3<<2);
938 r3 = (r5 + r0);
939 result += (r3 * 20);//result += (r3<<4); result += (r3<<2);
940 result = (result + 16) >> 5;
941 CLIP_RESULT(result)
942 /* 3/4 pel, no need to clip */
943 result = (result + p_ref[dy-(inpitch<<1)] + 1);
944 result = (result >> 1);
945 *(p_cur += outpitch) = result;
946 p_ref -= (inpitch << 1); /* move back to center of the filter of the next one */
947 }
948 p_cur += (curr_offset - 3);
949 }
950 }
951 }
952 }
953 else
954 {
955 for (j = 0; j < blkwidth; j += 4, in += 4)
956 {
957 r13 = 0;
958 p_ref = in;
959 p_cur -= outpitch; /* compensate for the first offset */
960 tmp = p_ref + ref_offset; /* limit */
961 while (p_ref < tmp) /* the loop un-rolled */
962 {
963 r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */
964 p_ref += inpitch;
965 r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */
966 r0 &= 0xFF00FF;
967
968 r1 = *((uint32*)(p_ref + (inpitch << 1))); /* r1, r7, ref[3] */
969 r7 = (r1 >> 8) & 0xFF00FF;
970 r1 &= 0xFF00FF;
971
972 r0 += r1;
973 r6 += r7;
974
975 r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */
976 r8 = (r2 >> 8) & 0xFF00FF;
977 r2 &= 0xFF00FF;
978
979 r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */
980 r7 = (r1 >> 8) & 0xFF00FF;
981 r1 &= 0xFF00FF;
982 r1 += r2;
983
984 r7 += r8;
985
986 r0 += 20 * r1;
987 r6 += 20 * r7;
988 r0 += 0x100010;
989 r6 += 0x100010;
990
991 r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */
992 r8 = (r2 >> 8) & 0xFF00FF;
993 r2 &= 0xFF00FF;
994
995 r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */
996 r7 = (r1 >> 8) & 0xFF00FF;
997 r1 &= 0xFF00FF;
998 r1 += r2;
999
1000 r7 += r8;
1001
1002 r0 -= 5 * r1;
1003 r6 -= 5 * r7;
1004
1005 r0 >>= 5;
1006 r6 >>= 5;
1007 /* clip */
1008 r13 |= r6;
1009 r13 |= r0;
1010 //CLIPPACK(r6,result)
1011 r0 &= 0xFF00FF;
1012 r6 &= 0xFF00FF;
1013 r0 |= (r6 << 8); /* pack it back */
1014 *((uint32*)(p_cur += outpitch)) = r0;
1015 }
1016 p_cur += curr_offset; /* offset to the next pixel */
1017 if (r13 & 0xFF000700) /* this column need clipping */
1018 {
1019 p_cur -= 4;
1020 for (i = 0; i < 4; i++)
1021 {
1022 p_ref = in + i;
1023 p_cur -= outpitch; /* compensate for the first offset */
1024 tmp = p_ref + ref_offset; /* limit */
1025 while (p_ref < tmp)
1026 { /* loop un-rolled */
1027 r0 = *(p_ref - (inpitch << 1));
1028 r1 = *(p_ref - inpitch);
1029 r2 = *p_ref;
1030 r3 = *(p_ref += inpitch); /* modify pointer before loading */
1031 r4 = *(p_ref += inpitch);
1032 /* first pixel */
1033 r5 = *(p_ref += inpitch);
1034 result = (r0 + r5);
1035 r0 = (r1 + r4);
1036 result -= (r0 * 5);//result -= r0; result -= (r0<<2);
1037 r0 = (r2 + r3);
1038 result += (r0 * 20);//result += (r0<<4); result += (r0<<2);
1039 result = (result + 16) >> 5;
1040 CLIP_RESULT(result)
1041 *(p_cur += outpitch) = result;
1042 /* second pixel */
1043 r0 = *(p_ref += inpitch);
1044 result = (r1 + r0);
1045 r1 = (r2 + r5);
1046 result -= (r1 * 5);//result -= r1; result -= (r1<<2);
1047 r1 = (r3 + r4);
1048 result += (r1 * 20);//result += (r1<<4); result += (r1<<2);
1049 result = (result + 16) >> 5;
1050 CLIP_RESULT(result)
1051 *(p_cur += outpitch) = result;
1052 /* third pixel */
1053 r1 = *(p_ref += inpitch);
1054 result = (r2 + r1);
1055 r2 = (r3 + r0);
1056 result -= (r2 * 5);//result -= r2; result -= (r2<<2);
1057 r2 = (r4 + r5);
1058 result += (r2 * 20);//result += (r2<<4); result += (r2<<2);
1059 result = (result + 16) >> 5;
1060 CLIP_RESULT(result)
1061 *(p_cur += outpitch) = result;
1062 /* fourth pixel */
1063 r2 = *(p_ref += inpitch);
1064 result = (r3 + r2);
1065 r3 = (r4 + r1);
1066 result -= (r3 * 5);//result -= r3; result -= (r3<<2);
1067 r3 = (r5 + r0);
1068 result += (r3 * 20);//result += (r3<<4); result += (r3<<2);
1069 result = (result + 16) >> 5;
1070 CLIP_RESULT(result)
1071 *(p_cur += outpitch) = result;
1072 p_ref -= (inpitch << 1); /* move back to center of the filter of the next one */
1073 }
1074 p_cur += (curr_offset - 3);
1075 }
1076 }
1077 }
1078 }
1079
1080 return ;
1081 }
1082
eVertInterp2MC(uint8 * in,int inpitch,int * out,int outpitch,int blkwidth,int blkheight)1083 void eVertInterp2MC(uint8 *in, int inpitch, int *out, int outpitch,
1084 int blkwidth, int blkheight)
1085 {
1086 int *p_cur;
1087 uint8 *p_ref, *tmp;
1088 int result, curr_offset, ref_offset;
1089 int j, r0, r1, r2, r3, r4, r5;
1090
1091 p_cur = out;
1092 curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */
1093 ref_offset = blkheight * inpitch; /* for limit */
1094
1095 for (j = 0; j < blkwidth; j++)
1096 {
1097 p_cur -= outpitch; /* compensate for the first offset */
1098 p_ref = in++;
1099
1100 tmp = p_ref + ref_offset; /* limit */
1101 while (p_ref < tmp)
1102 { /* loop un-rolled */
1103 r0 = *(p_ref - (inpitch << 1));
1104 r1 = *(p_ref - inpitch);
1105 r2 = *p_ref;
1106 r3 = *(p_ref += inpitch); /* modify pointer before loading */
1107 r4 = *(p_ref += inpitch);
1108 /* first pixel */
1109 r5 = *(p_ref += inpitch);
1110 result = (r0 + r5);
1111 r0 = (r1 + r4);
1112 result -= (r0 * 5);//result -= r0; result -= (r0<<2);
1113 r0 = (r2 + r3);
1114 result += (r0 * 20);//result += (r0<<4); result += (r0<<2);
1115 *(p_cur += outpitch) = result;
1116 /* second pixel */
1117 r0 = *(p_ref += inpitch);
1118 result = (r1 + r0);
1119 r1 = (r2 + r5);
1120 result -= (r1 * 5);//result -= r1; result -= (r1<<2);
1121 r1 = (r3 + r4);
1122 result += (r1 * 20);//result += (r1<<4); result += (r1<<2);
1123 *(p_cur += outpitch) = result;
1124 /* third pixel */
1125 r1 = *(p_ref += inpitch);
1126 result = (r2 + r1);
1127 r2 = (r3 + r0);
1128 result -= (r2 * 5);//result -= r2; result -= (r2<<2);
1129 r2 = (r4 + r5);
1130 result += (r2 * 20);//result += (r2<<4); result += (r2<<2);
1131 *(p_cur += outpitch) = result;
1132 /* fourth pixel */
1133 r2 = *(p_ref += inpitch);
1134 result = (r3 + r2);
1135 r3 = (r4 + r1);
1136 result -= (r3 * 5);//result -= r3; result -= (r3<<2);
1137 r3 = (r5 + r0);
1138 result += (r3 * 20);//result += (r3<<4); result += (r3<<2);
1139 *(p_cur += outpitch) = result;
1140 p_ref -= (inpitch << 1); /* move back to center of the filter of the next one */
1141 }
1142 p_cur += curr_offset;
1143 }
1144
1145 return ;
1146 }
1147
eVertInterp3MC(int * in,int inpitch,uint8 * out,int outpitch,int blkwidth,int blkheight,int dy)1148 void eVertInterp3MC(int *in, int inpitch, uint8 *out, int outpitch,
1149 int blkwidth, int blkheight, int dy)
1150 {
1151 uint8 *p_cur;
1152 int *p_ref, *tmp;
1153 int result, result2, curr_offset, ref_offset;
1154 int j, r0, r1, r2, r3, r4, r5;
1155
1156 p_cur = out;
1157 curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */
1158 ref_offset = blkheight * inpitch; /* for limit */
1159
1160 if (dy&1)
1161 {
1162 dy = (dy >> 1) ? -(inpitch << 1) : -(inpitch << 1) - inpitch;
1163
1164 for (j = 0; j < blkwidth; j++)
1165 {
1166 p_cur -= outpitch; /* compensate for the first offset */
1167 p_ref = in++;
1168
1169 tmp = p_ref + ref_offset; /* limit */
1170 while (p_ref < tmp)
1171 { /* loop un-rolled */
1172 r0 = *(p_ref - (inpitch << 1));
1173 r1 = *(p_ref - inpitch);
1174 r2 = *p_ref;
1175 r3 = *(p_ref += inpitch); /* modify pointer before loading */
1176 r4 = *(p_ref += inpitch);
1177 /* first pixel */
1178 r5 = *(p_ref += inpitch);
1179 result = (r0 + r5);
1180 r0 = (r1 + r4);
1181 result -= (r0 * 5);//result -= r0; result -= (r0<<2);
1182 r0 = (r2 + r3);
1183 result += (r0 * 20);//result += (r0<<4); result += (r0<<2);
1184 result = (result + 512) >> 10;
1185 CLIP_RESULT(result)
1186 result2 = ((p_ref[dy] + 16) >> 5);
1187 CLIP_RESULT(result2)
1188 /* 3/4 pel, no need to clip */
1189 result = (result + result2 + 1);
1190 result = (result >> 1);
1191 *(p_cur += outpitch) = result;
1192 /* second pixel */
1193 r0 = *(p_ref += inpitch);
1194 result = (r1 + r0);
1195 r1 = (r2 + r5);
1196 result -= (r1 * 5);//result -= r1; result -= (r1<<2);
1197 r1 = (r3 + r4);
1198 result += (r1 * 20);//result += (r1<<4); result += (r1<<2);
1199 result = (result + 512) >> 10;
1200 CLIP_RESULT(result)
1201 result2 = ((p_ref[dy] + 16) >> 5);
1202 CLIP_RESULT(result2)
1203 /* 3/4 pel, no need to clip */
1204 result = (result + result2 + 1);
1205 result = (result >> 1);
1206 *(p_cur += outpitch) = result;
1207 /* third pixel */
1208 r1 = *(p_ref += inpitch);
1209 result = (r2 + r1);
1210 r2 = (r3 + r0);
1211 result -= (r2 * 5);//result -= r2; result -= (r2<<2);
1212 r2 = (r4 + r5);
1213 result += (r2 * 20);//result += (r2<<4); result += (r2<<2);
1214 result = (result + 512) >> 10;
1215 CLIP_RESULT(result)
1216 result2 = ((p_ref[dy] + 16) >> 5);
1217 CLIP_RESULT(result2)
1218 /* 3/4 pel, no need to clip */
1219 result = (result + result2 + 1);
1220 result = (result >> 1);
1221 *(p_cur += outpitch) = result;
1222 /* fourth pixel */
1223 r2 = *(p_ref += inpitch);
1224 result = (r3 + r2);
1225 r3 = (r4 + r1);
1226 result -= (r3 * 5);//result -= r3; result -= (r3<<2);
1227 r3 = (r5 + r0);
1228 result += (r3 * 20);//result += (r3<<4); result += (r3<<2);
1229 result = (result + 512) >> 10;
1230 CLIP_RESULT(result)
1231 result2 = ((p_ref[dy] + 16) >> 5);
1232 CLIP_RESULT(result2)
1233 /* 3/4 pel, no need to clip */
1234 result = (result + result2 + 1);
1235 result = (result >> 1);
1236 *(p_cur += outpitch) = result;
1237 p_ref -= (inpitch << 1); /* move back to center of the filter of the next one */
1238 }
1239 p_cur += curr_offset;
1240 }
1241 }
1242 else
1243 {
1244 for (j = 0; j < blkwidth; j++)
1245 {
1246 p_cur -= outpitch; /* compensate for the first offset */
1247 p_ref = in++;
1248
1249 tmp = p_ref + ref_offset; /* limit */
1250 while (p_ref < tmp)
1251 { /* loop un-rolled */
1252 r0 = *(p_ref - (inpitch << 1));
1253 r1 = *(p_ref - inpitch);
1254 r2 = *p_ref;
1255 r3 = *(p_ref += inpitch); /* modify pointer before loading */
1256 r4 = *(p_ref += inpitch);
1257 /* first pixel */
1258 r5 = *(p_ref += inpitch);
1259 result = (r0 + r5);
1260 r0 = (r1 + r4);
1261 result -= (r0 * 5);//result -= r0; result -= (r0<<2);
1262 r0 = (r2 + r3);
1263 result += (r0 * 20);//result += (r0<<4); result += (r0<<2);
1264 result = (result + 512) >> 10;
1265 CLIP_RESULT(result)
1266 *(p_cur += outpitch) = result;
1267 /* second pixel */
1268 r0 = *(p_ref += inpitch);
1269 result = (r1 + r0);
1270 r1 = (r2 + r5);
1271 result -= (r1 * 5);//result -= r1; result -= (r1<<2);
1272 r1 = (r3 + r4);
1273 result += (r1 * 20);//result += (r1<<4); result += (r1<<2);
1274 result = (result + 512) >> 10;
1275 CLIP_RESULT(result)
1276 *(p_cur += outpitch) = result;
1277 /* third pixel */
1278 r1 = *(p_ref += inpitch);
1279 result = (r2 + r1);
1280 r2 = (r3 + r0);
1281 result -= (r2 * 5);//result -= r2; result -= (r2<<2);
1282 r2 = (r4 + r5);
1283 result += (r2 * 20);//result += (r2<<4); result += (r2<<2);
1284 result = (result + 512) >> 10;
1285 CLIP_RESULT(result)
1286 *(p_cur += outpitch) = result;
1287 /* fourth pixel */
1288 r2 = *(p_ref += inpitch);
1289 result = (r3 + r2);
1290 r3 = (r4 + r1);
1291 result -= (r3 * 5);//result -= r3; result -= (r3<<2);
1292 r3 = (r5 + r0);
1293 result += (r3 * 20);//result += (r3<<4); result += (r3<<2);
1294 result = (result + 512) >> 10;
1295 CLIP_RESULT(result)
1296 *(p_cur += outpitch) = result;
1297 p_ref -= (inpitch << 1); /* move back to center of the filter of the next one */
1298 }
1299 p_cur += curr_offset;
1300 }
1301 }
1302
1303 return ;
1304 }
1305
eDiagonalInterpMC(uint8 * in1,uint8 * in2,int inpitch,uint8 * out,int outpitch,int blkwidth,int blkheight)1306 void eDiagonalInterpMC(uint8 *in1, uint8 *in2, int inpitch,
1307 uint8 *out, int outpitch,
1308 int blkwidth, int blkheight)
1309 {
1310 int j, i;
1311 int result;
1312 uint8 *p_cur, *p_ref, *p_tmp8, *tmp;
1313 int curr_offset, ref_offset;
1314 uint8 tmp_res[24][24], tmp_in[24][24];
1315 uint32 *p_tmp;
1316 uint32 pkres, tmp_result;
1317 int32 r0, r1, r2, r3, r4, r5;
1318 int32 r6, r7, r8, r9, r10, r13;
1319
1320 ref_offset = inpitch - blkwidth;
1321 p_ref = in1 - 2;
1322 /* perform horizontal interpolation */
1323 /* not word-aligned */
1324 /* It is faster to read 1 byte at time to avoid calling CreateAlign */
1325 /* if(((uint32)p_ref)&0x3)
1326 {
1327 CreateAlign(p_ref,inpitch,0,&tmp_in[0][0],blkwidth+8,blkheight);
1328 p_ref = &tmp_in[0][0];
1329 ref_offset = 24-blkwidth;
1330 }*/
1331
1332 p_tmp = (uint32*) & (tmp_res[0][0]);
1333 for (j = blkheight; j > 0; j--)
1334 {
1335 r13 = 0;
1336 tmp = p_ref + blkwidth;
1337
1338 //r0 = *((uint32*)p_ref); /* d,c,b,a */
1339 //r1 = (r0>>8)&0xFF00FF; /* 0,d,0,b */
1340 //r0 &= 0xFF00FF; /* 0,c,0,a */
1341 /* It is faster to read 1 byte at a time */
1342 r0 = p_ref[0];
1343 r1 = p_ref[2];
1344 r0 |= (r1 << 16); /* 0,c,0,a */
1345 r1 = p_ref[1];
1346 r2 = p_ref[3];
1347 r1 |= (r2 << 16); /* 0,d,0,b */
1348
1349 while (p_ref < tmp)
1350 {
1351 //r2 = *((uint32*)(p_ref+=4));/* h,g,f,e */
1352 //r3 = (r2>>8)&0xFF00FF; /* 0,h,0,f */
1353 //r2 &= 0xFF00FF; /* 0,g,0,e */
1354 /* It is faster to read 1 byte at a time */
1355 r2 = *(p_ref += 4);
1356 r3 = p_ref[2];
1357 r2 |= (r3 << 16); /* 0,g,0,e */
1358 r3 = p_ref[1];
1359 r4 = p_ref[3];
1360 r3 |= (r4 << 16); /* 0,h,0,f */
1361
1362 r4 = r0 + r3; /* c+h, a+f */
1363 r5 = r0 + r1; /* c+d, a+b */
1364 r6 = r2 + r3; /* g+h, e+f */
1365 r5 >>= 16;
1366 r5 |= (r6 << 16); /* e+f, c+d */
1367 r4 += r5 * 20; /* c+20*e+20*f+h, a+20*c+20*d+f */
1368 r4 += 0x100010; /* +16, +16 */
1369 r5 = r1 + r2; /* d+g, b+e */
1370 r4 -= r5 * 5; /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */
1371 r4 >>= 5;
1372 r13 |= r4; /* check clipping */
1373 r4 &= 0xFF00FF; /* mask */
1374
1375 r5 = p_ref[4]; /* i */
1376 r6 = (r5 << 16);
1377 r5 = r6 | (r2 >> 16);/* 0,i,0,g */
1378 r5 += r1; /* d+i, b+g */ /* r5 not free */
1379 r1 >>= 16;
1380 r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */
1381 r1 += r2; /* f+g, d+e */
1382 r5 += 20 * r1; /* d+20f+20g+i, b+20d+20e+g */
1383 r0 >>= 16;
1384 r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */
1385 r0 += r3; /* e+h, c+f */
1386 r5 += 0x100010; /* 16,16 */
1387 r5 -= r0 * 5; /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */
1388 r5 >>= 5;
1389 r13 |= r5; /* check clipping */
1390 r5 &= 0xFF00FF; /* mask */
1391
1392 r4 |= (r5 << 8); /* pack them together */
1393 *p_tmp++ = r4;
1394 r1 = r3;
1395 r0 = r2;
1396 }
1397 p_tmp += ((24 - blkwidth) >> 2); /* move to the next line */
1398 p_ref += ref_offset; /* ref_offset = inpitch-blkwidth; */
1399
1400 if (r13&0xFF000700) /* need clipping */
1401 {
1402 /* move back to the beginning of the line */
1403 p_ref -= (ref_offset + blkwidth); /* input */
1404 p_tmp -= 6; /* intermediate output */
1405 tmp = p_ref + blkwidth;
1406 while (p_ref < tmp)
1407 {
1408 r0 = *p_ref++;
1409 r1 = *p_ref++;
1410 r2 = *p_ref++;
1411 r3 = *p_ref++;
1412 r4 = *p_ref++;
1413 /* first pixel */
1414 r5 = *p_ref++;
1415 result = (r0 + r5);
1416 r0 = (r1 + r4);
1417 result -= (r0 * 5);//result -= r0; result -= (r0<<2);
1418 r0 = (r2 + r3);
1419 result += (r0 * 20);//result += (r0<<4); result += (r0<<2);
1420 result = (result + 16) >> 5;
1421 CLIP_RESULT(result)
1422 pkres = result;
1423 /* second pixel */
1424 r0 = *p_ref++;
1425 result = (r1 + r0);
1426 r1 = (r2 + r5);
1427 result -= (r1 * 5);//result -= r1; result -= (r1<<2);
1428 r1 = (r3 + r4);
1429 result += (r1 * 20);//result += (r1<<4); result += (r1<<2);
1430 result = (result + 16) >> 5;
1431 CLIP_RESULT(result)
1432 pkres |= (result << 8);
1433 /* third pixel */
1434 r1 = *p_ref++;
1435 result = (r2 + r1);
1436 r2 = (r3 + r0);
1437 result -= (r2 * 5);//result -= r2; result -= (r2<<2);
1438 r2 = (r4 + r5);
1439 result += (r2 * 20);//result += (r2<<4); result += (r2<<2);
1440 result = (result + 16) >> 5;
1441 CLIP_RESULT(result)
1442 pkres |= (result << 16);
1443 /* fourth pixel */
1444 r2 = *p_ref++;
1445 result = (r3 + r2);
1446 r3 = (r4 + r1);
1447 result -= (r3 * 5);//result -= r3; result -= (r3<<2);
1448 r3 = (r5 + r0);
1449 result += (r3 * 20);//result += (r3<<4); result += (r3<<2);
1450 result = (result + 16) >> 5;
1451 CLIP_RESULT(result)
1452 pkres |= (result << 24);
1453
1454 *p_tmp++ = pkres; /* write 4 pixel */
1455 p_ref -= 5;
1456 }
1457 p_tmp += ((24 - blkwidth) >> 2); /* move to the next line */
1458 p_ref += ref_offset; /* ref_offset = inpitch-blkwidth; */
1459 }
1460 }
1461
1462 /* perform vertical interpolation */
1463 /* not word-aligned */
1464 if (((intptr_t)in2)&0x3)
1465 {
1466 eCreateAlign(in2, inpitch, -2, &tmp_in[0][0], blkwidth, blkheight + 5);
1467 in2 = &tmp_in[2][0];
1468 inpitch = 24;
1469 }
1470
1471 p_cur = out;
1472 curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically up and one pixel right */
1473 pkres = blkheight * inpitch; /* reuse it for limit */
1474
1475 curr_offset += 3;
1476
1477 for (j = 0; j < blkwidth; j += 4, in2 += 4)
1478 {
1479 r13 = 0;
1480 p_ref = in2;
1481 p_tmp8 = &(tmp_res[0][j]); /* intermediate result */
1482 p_tmp8 -= 24; /* compensate for the first offset */
1483 p_cur -= outpitch; /* compensate for the first offset */
1484 tmp = p_ref + pkres; /* limit */
1485 while (p_ref < tmp) /* the loop un-rolled */
1486 {
1487 /* Read 1 byte at a time is too slow, too many read and pack ops, need to call CreateAlign */
1488 /*p_ref8 = p_ref-(inpitch<<1); r0 = p_ref8[0]; r1 = p_ref8[2];
1489 r0 |= (r1<<16); r6 = p_ref8[1]; r1 = p_ref8[3];
1490 r6 |= (r1<<16); p_ref+=inpitch; */
1491 r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */
1492 p_ref += inpitch;
1493 r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */
1494 r0 &= 0xFF00FF;
1495
1496 /*p_ref8 = p_ref+(inpitch<<1);
1497 r1 = p_ref8[0]; r7 = p_ref8[2]; r1 |= (r7<<16);
1498 r7 = p_ref8[1]; r2 = p_ref8[3]; r7 |= (r2<<16);*/
1499 r1 = *((uint32*)(p_ref + (inpitch << 1))); /* r1, r7, ref[3] */
1500 r7 = (r1 >> 8) & 0xFF00FF;
1501 r1 &= 0xFF00FF;
1502
1503 r0 += r1;
1504 r6 += r7;
1505
1506 /*r2 = p_ref[0]; r8 = p_ref[2]; r2 |= (r8<<16);
1507 r8 = p_ref[1]; r1 = p_ref[3]; r8 |= (r1<<16);*/
1508 r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */
1509 r8 = (r2 >> 8) & 0xFF00FF;
1510 r2 &= 0xFF00FF;
1511
1512 /*p_ref8 = p_ref-inpitch; r1 = p_ref8[0]; r7 = p_ref8[2];
1513 r1 |= (r7<<16); r1 += r2; r7 = p_ref8[1];
1514 r2 = p_ref8[3]; r7 |= (r2<<16);*/
1515 r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */
1516 r7 = (r1 >> 8) & 0xFF00FF;
1517 r1 &= 0xFF00FF;
1518 r1 += r2;
1519
1520 r7 += r8;
1521
1522 r0 += 20 * r1;
1523 r6 += 20 * r7;
1524 r0 += 0x100010;
1525 r6 += 0x100010;
1526
1527 /*p_ref8 = p_ref-(inpitch<<1); r2 = p_ref8[0]; r8 = p_ref8[2];
1528 r2 |= (r8<<16); r8 = p_ref8[1]; r1 = p_ref8[3]; r8 |= (r1<<16);*/
1529 r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */
1530 r8 = (r2 >> 8) & 0xFF00FF;
1531 r2 &= 0xFF00FF;
1532
1533 /*p_ref8 = p_ref+inpitch; r1 = p_ref8[0]; r7 = p_ref8[2];
1534 r1 |= (r7<<16); r1 += r2; r7 = p_ref8[1];
1535 r2 = p_ref8[3]; r7 |= (r2<<16);*/
1536 r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */
1537 r7 = (r1 >> 8) & 0xFF00FF;
1538 r1 &= 0xFF00FF;
1539 r1 += r2;
1540
1541 r7 += r8;
1542
1543 r0 -= 5 * r1;
1544 r6 -= 5 * r7;
1545
1546 r0 >>= 5;
1547 r6 >>= 5;
1548 /* clip */
1549 r13 |= r6;
1550 r13 |= r0;
1551 //CLIPPACK(r6,result)
1552 /* add with horizontal results */
1553 r10 = *((uint32*)(p_tmp8 += 24));
1554 r9 = (r10 >> 8) & 0xFF00FF;
1555 r10 &= 0xFF00FF;
1556
1557 r0 += r10;
1558 r0 += 0x10001;
1559 r0 = (r0 >> 1) & 0xFF00FF; /* mask to 8 bytes */
1560
1561 r6 += r9;
1562 r6 += 0x10001;
1563 r6 = (r6 >> 1) & 0xFF00FF; /* mask to 8 bytes */
1564
1565 r0 |= (r6 << 8); /* pack it back */
1566 *((uint32*)(p_cur += outpitch)) = r0;
1567 }
1568 p_cur += curr_offset; /* offset to the next pixel */
1569 if (r13 & 0xFF000700) /* this column need clipping */
1570 {
1571 p_cur -= 4;
1572 for (i = 0; i < 4; i++)
1573 {
1574 p_ref = in2 + i;
1575 p_tmp8 = &(tmp_res[0][j+i]); /* intermediate result */
1576 p_tmp8 -= 24; /* compensate for the first offset */
1577 p_cur -= outpitch; /* compensate for the first offset */
1578 tmp = p_ref + pkres; /* limit */
1579 while (p_ref < tmp) /* the loop un-rolled */
1580 {
1581 r0 = *(p_ref - (inpitch << 1));
1582 r1 = *(p_ref - inpitch);
1583 r2 = *p_ref;
1584 r3 = *(p_ref += inpitch); /* modify pointer before loading */
1585 r4 = *(p_ref += inpitch);
1586 /* first pixel */
1587 r5 = *(p_ref += inpitch);
1588 result = (r0 + r5);
1589 r0 = (r1 + r4);
1590 result -= (r0 * 5);//result -= r0; result -= (r0<<2);
1591 r0 = (r2 + r3);
1592 result += (r0 * 20);//result += (r0<<4); result += (r0<<2);
1593 result = (result + 16) >> 5;
1594 CLIP_RESULT(result)
1595 tmp_result = *(p_tmp8 += 24); /* modify pointer before loading */
1596 result = (result + tmp_result + 1); /* no clip */
1597 result = (result >> 1);
1598 *(p_cur += outpitch) = result;
1599 /* second pixel */
1600 r0 = *(p_ref += inpitch);
1601 result = (r1 + r0);
1602 r1 = (r2 + r5);
1603 result -= (r1 * 5);//result -= r1; result -= (r1<<2);
1604 r1 = (r3 + r4);
1605 result += (r1 * 20);//result += (r1<<4); result += (r1<<2);
1606 result = (result + 16) >> 5;
1607 CLIP_RESULT(result)
1608 tmp_result = *(p_tmp8 += 24); /* intermediate result */
1609 result = (result + tmp_result + 1); /* no clip */
1610 result = (result >> 1);
1611 *(p_cur += outpitch) = result;
1612 /* third pixel */
1613 r1 = *(p_ref += inpitch);
1614 result = (r2 + r1);
1615 r2 = (r3 + r0);
1616 result -= (r2 * 5);//result -= r2; result -= (r2<<2);
1617 r2 = (r4 + r5);
1618 result += (r2 * 20);//result += (r2<<4); result += (r2<<2);
1619 result = (result + 16) >> 5;
1620 CLIP_RESULT(result)
1621 tmp_result = *(p_tmp8 += 24); /* intermediate result */
1622 result = (result + tmp_result + 1); /* no clip */
1623 result = (result >> 1);
1624 *(p_cur += outpitch) = result;
1625 /* fourth pixel */
1626 r2 = *(p_ref += inpitch);
1627 result = (r3 + r2);
1628 r3 = (r4 + r1);
1629 result -= (r3 * 5);//result -= r3; result -= (r3<<2);
1630 r3 = (r5 + r0);
1631 result += (r3 * 20);//result += (r3<<4); result += (r3<<2);
1632 result = (result + 16) >> 5;
1633 CLIP_RESULT(result)
1634 tmp_result = *(p_tmp8 += 24); /* intermediate result */
1635 result = (result + tmp_result + 1); /* no clip */
1636 result = (result >> 1);
1637 *(p_cur += outpitch) = result;
1638 p_ref -= (inpitch << 1); /* move back to center of the filter of the next one */
1639 }
1640 p_cur += (curr_offset - 3);
1641 }
1642 }
1643 }
1644
1645 return ;
1646 }
1647
1648 /* position G */
eFullPelMC(uint8 * in,int inpitch,uint8 * out,int outpitch,int blkwidth,int blkheight)1649 void eFullPelMC(uint8 *in, int inpitch, uint8 *out, int outpitch,
1650 int blkwidth, int blkheight)
1651 {
1652 int i, j;
1653 int offset_in = inpitch - blkwidth;
1654 int offset_out = outpitch - blkwidth;
1655 uint32 temp;
1656 uint8 byte;
1657
1658 if (((intptr_t)in)&3)
1659 {
1660 for (j = blkheight; j > 0; j--)
1661 {
1662 for (i = blkwidth; i > 0; i -= 4)
1663 {
1664 temp = *in++;
1665 byte = *in++;
1666 temp |= (byte << 8);
1667 byte = *in++;
1668 temp |= (byte << 16);
1669 byte = *in++;
1670 temp |= (byte << 24);
1671
1672 *((uint32*)out) = temp; /* write 4 bytes */
1673 out += 4;
1674 }
1675 out += offset_out;
1676 in += offset_in;
1677 }
1678 }
1679 else
1680 {
1681 for (j = blkheight; j > 0; j--)
1682 {
1683 for (i = blkwidth; i > 0; i -= 4)
1684 {
1685 temp = *((uint32*)in);
1686 *((uint32*)out) = temp;
1687 in += 4;
1688 out += 4;
1689 }
1690 out += offset_out;
1691 in += offset_in;
1692 }
1693 }
1694 return ;
1695 }
1696
ePadChroma(uint8 * ref,int picwidth,int picheight,int picpitch,int x_pos,int y_pos)1697 void ePadChroma(uint8 *ref, int picwidth, int picheight, int picpitch, int x_pos, int y_pos)
1698 {
1699 int pad_height;
1700 int pad_width;
1701 uint8 *start;
1702 uint32 word1, word2, word3;
1703 int offset, j;
1704
1705
1706 pad_height = 8 + ((y_pos & 7) ? 1 : 0);
1707 pad_width = 8 + ((x_pos & 7) ? 1 : 0);
1708
1709 y_pos >>= 3;
1710 x_pos >>= 3;
1711 // pad vertical first
1712 if (y_pos < 0) // need to pad up
1713 {
1714 if (x_pos < -8) start = ref - 8;
1715 else if (x_pos + pad_width > picwidth + 7) start = ref + picwidth + 7 - pad_width;
1716 else start = ref + x_pos;
1717
1718 /* word-align start */
1719 offset = (intptr_t)start & 0x3;
1720 if (offset) start -= offset;
1721
1722 word1 = *((uint32*)start);
1723 word2 = *((uint32*)(start + 4));
1724 word3 = *((uint32*)(start + 8));
1725
1726 /* pad up N rows */
1727 j = -y_pos;
1728 if (j > 8) j = 8;
1729 while (j--)
1730 {
1731 *((uint32*)(start -= picpitch)) = word1;
1732 *((uint32*)(start + 4)) = word2;
1733 *((uint32*)(start + 8)) = word3;
1734 }
1735
1736 }
1737 else if (y_pos + pad_height >= picheight) /* pad down */
1738 {
1739 if (x_pos < -8) start = ref + picpitch * (picheight - 1) - 8;
1740 else if (x_pos + pad_width > picwidth + 7) start = ref + picpitch * (picheight - 1) +
1741 picwidth + 7 - pad_width;
1742 else start = ref + picpitch * (picheight - 1) + x_pos;
1743
1744 /* word-align start */
1745 offset = (intptr_t)start & 0x3;
1746 if (offset) start -= offset;
1747
1748 word1 = *((uint32*)start);
1749 word2 = *((uint32*)(start + 4));
1750 word3 = *((uint32*)(start + 8));
1751
1752 /* pad down N rows */
1753 j = y_pos + pad_height - picheight;
1754 if (j > 8) j = 8;
1755 while (j--)
1756 {
1757 *((uint32*)(start += picpitch)) = word1;
1758 *((uint32*)(start + 4)) = word2;
1759 *((uint32*)(start + 8)) = word3;
1760 }
1761 }
1762
1763 /* now pad horizontal */
1764 if (x_pos < 0) // pad left
1765 {
1766 if (y_pos < -8) start = ref - (picpitch << 3);
1767 else if (y_pos + pad_height > picheight + 7) start = ref + (picheight + 7 - pad_height) * picpitch;
1768 else start = ref + y_pos * picpitch;
1769
1770 // now pad left 8 pixels for pad_height rows */
1771 j = pad_height;
1772 start -= picpitch;
1773 while (j--)
1774 {
1775 word1 = *(start += picpitch);
1776 word1 |= (word1 << 8);
1777 word1 |= (word1 << 16);
1778 *((uint32*)(start - 8)) = word1;
1779 *((uint32*)(start - 4)) = word1;
1780 }
1781 }
1782 else if (x_pos + pad_width >= picwidth) /* pad right */
1783 {
1784 if (y_pos < -8) start = ref - (picpitch << 3) + picwidth - 1;
1785 else if (y_pos + pad_height > picheight + 7) start = ref + (picheight + 7 - pad_height) * picpitch + picwidth - 1;
1786 else start = ref + y_pos * picpitch + picwidth - 1;
1787
1788 // now pad right 8 pixels for pad_height rows */
1789 j = pad_height;
1790 start -= picpitch;
1791 while (j--)
1792 {
1793 word1 = *(start += picpitch);
1794 word1 |= (word1 << 8);
1795 word1 |= (word1 << 16);
1796 *((uint32*)(start + 1)) = word1;
1797 *((uint32*)(start + 5)) = word1;
1798 }
1799 }
1800
1801 return ;
1802 }
1803
1804
eChromaMotionComp(uint8 * ref,int picwidth,int picheight,int x_pos,int y_pos,uint8 * pred,int picpitch,int blkwidth,int blkheight)1805 void eChromaMotionComp(uint8 *ref, int picwidth, int picheight,
1806 int x_pos, int y_pos,
1807 uint8 *pred, int picpitch,
1808 int blkwidth, int blkheight)
1809 {
1810 int dx, dy;
1811 int offset_dx, offset_dy;
1812 int index;
1813
1814 ePadChroma(ref, picwidth, picheight, picpitch, x_pos, y_pos);
1815
1816 dx = x_pos & 7;
1817 dy = y_pos & 7;
1818 offset_dx = (dx + 7) >> 3;
1819 offset_dy = (dy + 7) >> 3;
1820 x_pos = x_pos >> 3; /* round it to full-pel resolution */
1821 y_pos = y_pos >> 3;
1822
1823 ref += y_pos * picpitch + x_pos;
1824
1825 index = offset_dx + (offset_dy << 1) + ((blkwidth << 1) & 0x7);
1826
1827 (*(eChromaMC_SIMD[index]))(ref, picpitch , dx, dy, pred, picpitch, blkwidth, blkheight);
1828 return ;
1829 }
1830
1831
1832 /* SIMD routines, unroll the loops in vertical direction, decreasing loops (things to be done) */
eChromaDiagonalMC_SIMD(uint8 * pRef,int srcPitch,int dx,int dy,uint8 * pOut,int predPitch,int blkwidth,int blkheight)1833 void eChromaDiagonalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
1834 uint8 *pOut, int predPitch, int blkwidth, int blkheight)
1835 {
1836 int32 r0, r1, r2, r3, result0, result1;
1837 uint8 temp[288];
1838 uint8 *ref, *out;
1839 int i, j;
1840 int dx_8 = 8 - dx;
1841 int dy_8 = 8 - dy;
1842
1843 /* horizontal first */
1844 out = temp;
1845 for (i = 0; i < blkheight + 1; i++)
1846 {
1847 ref = pRef;
1848 r0 = ref[0];
1849 for (j = 0; j < blkwidth; j += 4)
1850 {
1851 r0 |= (ref[2] << 16);
1852 result0 = dx_8 * r0;
1853
1854 r1 = ref[1] | (ref[3] << 16);
1855 result0 += dx * r1;
1856 *(int32 *)out = result0;
1857
1858 result0 = dx_8 * r1;
1859
1860 r2 = ref[4];
1861 r0 = r0 >> 16;
1862 r1 = r0 | (r2 << 16);
1863 result0 += dx * r1;
1864 *(int32 *)(out + 16) = result0;
1865
1866 ref += 4;
1867 out += 4;
1868 r0 = r2;
1869 }
1870 pRef += srcPitch;
1871 out += (32 - blkwidth);
1872 }
1873
1874 // pRef -= srcPitch*(blkheight+1);
1875 ref = temp;
1876
1877 for (j = 0; j < blkwidth; j += 4)
1878 {
1879 r0 = *(int32 *)ref;
1880 r1 = *(int32 *)(ref + 16);
1881 ref += 32;
1882 out = pOut;
1883 for (i = 0; i < (blkheight >> 1); i++)
1884 {
1885 result0 = dy_8 * r0 + 0x00200020;
1886 r2 = *(int32 *)ref;
1887 result0 += dy * r2;
1888 result0 >>= 6;
1889 result0 &= 0x00FF00FF;
1890 r0 = r2;
1891
1892 result1 = dy_8 * r1 + 0x00200020;
1893 r3 = *(int32 *)(ref + 16);
1894 result1 += dy * r3;
1895 result1 >>= 6;
1896 result1 &= 0x00FF00FF;
1897 r1 = r3;
1898 *(int32 *)out = result0 | (result1 << 8);
1899 out += predPitch;
1900 ref += 32;
1901
1902 result0 = dy_8 * r0 + 0x00200020;
1903 r2 = *(int32 *)ref;
1904 result0 += dy * r2;
1905 result0 >>= 6;
1906 result0 &= 0x00FF00FF;
1907 r0 = r2;
1908
1909 result1 = dy_8 * r1 + 0x00200020;
1910 r3 = *(int32 *)(ref + 16);
1911 result1 += dy * r3;
1912 result1 >>= 6;
1913 result1 &= 0x00FF00FF;
1914 r1 = r3;
1915 *(int32 *)out = result0 | (result1 << 8);
1916 out += predPitch;
1917 ref += 32;
1918 }
1919 pOut += 4;
1920 ref = temp + 4; /* since it can only iterate twice max */
1921 }
1922 return;
1923 }
1924
eChromaHorizontalMC_SIMD(uint8 * pRef,int srcPitch,int dx,int dy,uint8 * pOut,int predPitch,int blkwidth,int blkheight)1925 void eChromaHorizontalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
1926 uint8 *pOut, int predPitch, int blkwidth, int blkheight)
1927 {
1928 (void)(dy);
1929
1930 int32 r0, r1, r2, result0, result1;
1931 uint8 *ref, *out;
1932 int i, j;
1933 int dx_8 = 8 - dx;
1934
1935 /* horizontal first */
1936 for (i = 0; i < blkheight; i++)
1937 {
1938 ref = pRef;
1939 out = pOut;
1940
1941 r0 = ref[0];
1942 for (j = 0; j < blkwidth; j += 4)
1943 {
1944 r0 |= (ref[2] << 16);
1945 result0 = dx_8 * r0 + 0x00040004;
1946
1947 r1 = ref[1] | (ref[3] << 16);
1948 result0 += dx * r1;
1949 result0 >>= 3;
1950 result0 &= 0x00FF00FF;
1951
1952 result1 = dx_8 * r1 + 0x00040004;
1953
1954 r2 = ref[4];
1955 r0 = r0 >> 16;
1956 r1 = r0 | (r2 << 16);
1957 result1 += dx * r1;
1958 result1 >>= 3;
1959 result1 &= 0x00FF00FF;
1960
1961 *(int32 *)out = result0 | (result1 << 8);
1962
1963 ref += 4;
1964 out += 4;
1965 r0 = r2;
1966 }
1967
1968 pRef += srcPitch;
1969 pOut += predPitch;
1970 }
1971 return;
1972 }
1973
eChromaVerticalMC_SIMD(uint8 * pRef,int srcPitch,int dx,int dy,uint8 * pOut,int predPitch,int blkwidth,int blkheight)1974 void eChromaVerticalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
1975 uint8 *pOut, int predPitch, int blkwidth, int blkheight)
1976 {
1977 (void)(dx);
1978
1979 int32 r0, r1, r2, r3, result0, result1;
1980 int i, j;
1981 uint8 *ref, *out;
1982 int dy_8 = 8 - dy;
1983 /* vertical first */
1984 for (i = 0; i < blkwidth; i += 4)
1985 {
1986 ref = pRef;
1987 out = pOut;
1988
1989 r0 = ref[0] | (ref[2] << 16);
1990 r1 = ref[1] | (ref[3] << 16);
1991 ref += srcPitch;
1992 for (j = 0; j < blkheight; j++)
1993 {
1994 result0 = dy_8 * r0 + 0x00040004;
1995 r2 = ref[0] | (ref[2] << 16);
1996 result0 += dy * r2;
1997 result0 >>= 3;
1998 result0 &= 0x00FF00FF;
1999 r0 = r2;
2000
2001 result1 = dy_8 * r1 + 0x00040004;
2002 r3 = ref[1] | (ref[3] << 16);
2003 result1 += dy * r3;
2004 result1 >>= 3;
2005 result1 &= 0x00FF00FF;
2006 r1 = r3;
2007 *(int32 *)out = result0 | (result1 << 8);
2008 ref += srcPitch;
2009 out += predPitch;
2010 }
2011 pOut += 4;
2012 pRef += 4;
2013 }
2014 return;
2015 }
2016
eChromaDiagonalMC2_SIMD(uint8 * pRef,int srcPitch,int dx,int dy,uint8 * pOut,int predPitch,int blkwidth,int blkheight)2017 void eChromaDiagonalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
2018 uint8 *pOut, int predPitch, int blkwidth, int blkheight)
2019 {
2020 (void)(blkwidth);
2021
2022 int32 r0, r1, temp0, temp1, result;
2023 int32 temp[9];
2024 int32 *out;
2025 int i, r_temp;
2026 int dy_8 = 8 - dy;
2027
2028 /* horizontal first */
2029 out = temp;
2030 for (i = 0; i < blkheight + 1; i++)
2031 {
2032 r_temp = pRef[1];
2033 temp0 = (pRef[0] << 3) + dx * (r_temp - pRef[0]);
2034 temp1 = (r_temp << 3) + dx * (pRef[2] - r_temp);
2035 r0 = temp0 | (temp1 << 16);
2036 *out++ = r0;
2037 pRef += srcPitch;
2038 }
2039
2040 pRef -= srcPitch * (blkheight + 1);
2041
2042 out = temp;
2043
2044 r0 = *out++;
2045
2046 for (i = 0; i < blkheight; i++)
2047 {
2048 result = dy_8 * r0 + 0x00200020;
2049 r1 = *out++;
2050 result += dy * r1;
2051 result >>= 6;
2052 result &= 0x00FF00FF;
2053 *(int16 *)pOut = (result >> 8) | (result & 0xFF);
2054 r0 = r1;
2055 pOut += predPitch;
2056 }
2057 return;
2058 }
2059
eChromaHorizontalMC2_SIMD(uint8 * pRef,int srcPitch,int dx,int dy,uint8 * pOut,int predPitch,int blkwidth,int blkheight)2060 void eChromaHorizontalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
2061 uint8 *pOut, int predPitch, int blkwidth, int blkheight)
2062 {
2063 (void)(dy);
2064 (void)(blkwidth);
2065
2066 int i, temp, temp0, temp1;
2067
2068 /* horizontal first */
2069 for (i = 0; i < blkheight; i++)
2070 {
2071 temp = pRef[1];
2072 temp0 = ((pRef[0] << 3) + dx * (temp - pRef[0]) + 4) >> 3;
2073 temp1 = ((temp << 3) + dx * (pRef[2] - temp) + 4) >> 3;
2074
2075 *(int16 *)pOut = temp0 | (temp1 << 8);
2076 pRef += srcPitch;
2077 pOut += predPitch;
2078
2079 }
2080 return;
2081 }
eChromaVerticalMC2_SIMD(uint8 * pRef,int srcPitch,int dx,int dy,uint8 * pOut,int predPitch,int blkwidth,int blkheight)2082 void eChromaVerticalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
2083 uint8 *pOut, int predPitch, int blkwidth, int blkheight)
2084 {
2085 (void)(dx);
2086 (void)(blkwidth);
2087
2088 int32 r0, r1, result;
2089 int i;
2090 int dy_8 = 8 - dy;
2091 r0 = pRef[0] | (pRef[1] << 16);
2092 pRef += srcPitch;
2093 for (i = 0; i < blkheight; i++)
2094 {
2095 result = dy_8 * r0 + 0x00040004;
2096 r1 = pRef[0] | (pRef[1] << 16);
2097 result += dy * r1;
2098 result >>= 3;
2099 result &= 0x00FF00FF;
2100 *(int16 *)pOut = (result >> 8) | (result & 0xFF);
2101 r0 = r1;
2102 pRef += srcPitch;
2103 pOut += predPitch;
2104 }
2105 return;
2106 }
2107
eChromaFullMC_SIMD(uint8 * pRef,int srcPitch,int dx,int dy,uint8 * pOut,int predPitch,int blkwidth,int blkheight)2108 void eChromaFullMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
2109 uint8 *pOut, int predPitch, int blkwidth, int blkheight)
2110 {
2111 (void)(dx);
2112 (void)(dy);
2113
2114 int i, j;
2115 int offset_in = srcPitch - blkwidth;
2116 int offset_out = predPitch - blkwidth;
2117 uint16 temp;
2118 uint8 byte;
2119
2120 if (((intptr_t)pRef)&1)
2121 {
2122 for (j = blkheight; j > 0; j--)
2123 {
2124 for (i = blkwidth; i > 0; i -= 2)
2125 {
2126 temp = *pRef++;
2127 byte = *pRef++;
2128 temp |= (byte << 8);
2129 *((uint16*)pOut) = temp; /* write 2 bytes */
2130 pOut += 2;
2131 }
2132 pOut += offset_out;
2133 pRef += offset_in;
2134 }
2135 }
2136 else
2137 {
2138 for (j = blkheight; j > 0; j--)
2139 {
2140 for (i = blkwidth; i > 0; i -= 2)
2141 {
2142 temp = *((uint16*)pRef);
2143 *((uint16*)pOut) = temp;
2144 pRef += 2;
2145 pOut += 2;
2146 }
2147 pOut += offset_out;
2148 pRef += offset_in;
2149 }
2150 }
2151 return ;
2152 }
2153