/* ------------------------------------------------------------------
 * Copyright (C) 1998-2009 PacketVideo
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 * -------------------------------------------------------------------
 */
#include "avcenc_lib.h"
/* 3/29/01 fast half-pel search based on neighboring guess */
/* value ranging from 0 to 4, high complexity (more accurate) to
   low complexity (less accurate) */
#define HP_DISTANCE_TH      5 // 2  /* half-pel distance threshold */

#define PREF_16_VEC 129     /* 1MV bias versus 4MVs */
#define CLIP_RESULT(x)      if((uint)x > 0xFF){ \
                 x = 0xFF & (~(x>>31));}

#define CLIP_UPPER16(x)     if((uint)x >= 0x20000000){ \
        x = 0xFF0000 & (~(x>>31));} \
        else { \
        x = (x>>5)&0xFF0000; \
        }
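
/* CLIP_RESULT clamps x to [0,255] without a data-dependent branch on the sign:
   for negative x, (x>>31) is all ones, so ~(x>>31) is 0 and x becomes 0;
   for x > 255, (x>>31) is 0 and x becomes 0xFF.
   CLIP_UPPER16 does the same for a pixel carried in bits [16,23] of a packed
   word: it finishes the (x>>5) rounding shift and clamps the lane to 0 or
   0xFF0000 when the intermediate under- or overflows. */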

/*=====================================================================
    Function:   AVCFindHalfPelMB
    Date:       10/31/2007
    Purpose:    Find half-pel resolution MV surrounding the full-pel MV
=====================================================================*/

int AVCFindHalfPelMB(AVCEncObject *encvid, uint8 *cur, AVCMV *mot, uint8 *ncand,
                     int xpos, int ypos, int hp_guess, int cmvx, int cmvy)
{
    AVCPictureData *currPic = encvid->common->currPic;
    int lx = currPic->pitch;
    int d, dmin, satd_min;
    uint8* cand;
    int lambda_motion = encvid->lambda_motion;
    uint8 *mvbits = encvid->mvbits;
    int mvcost;
    /* list of candidates to go through for the half-pel search */
    uint8 *subpel_pred = (uint8*) encvid->subpel_pred; // all 16 sub-pel positions
    uint8 **hpel_cand = (uint8**) encvid->hpel_cand; /* half-pel positions */

    int xh[9] = {0, 0, 2, 2, 2, 0, -2, -2, -2};
    int yh[9] = {0, -2, -2, 0, 2, 2, 2, 0, -2};
    int xq[8] = {0, 1, 1, 1, 0, -1, -1, -1};
    int yq[8] = { -1, -1, 0, 1, 1, 1, 0, -1};
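    /* The xh/yh arrays above hold the eight half-pel offsets around the
       center (index 0) in quarter-pel units, clockwise from straight up;
       xq/yq hold the eight quarter-pel offsets later searched around the
       best half-pel position. */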
    int h, hmin, q, qmin;

    OSCL_UNUSED_ARG(xpos);
    OSCL_UNUSED_ARG(ypos);
    OSCL_UNUSED_ARG(hp_guess);

    GenerateHalfPelPred(subpel_pred, ncand, lx);

    cur = encvid->currYMB; // pre-load current original MB

    cand = hpel_cand[0];

    // find cost for the current full-pel position
    dmin = SATD_MB(cand, cur, 65535); // get Hadamard transform SAD; 65535 disables early termination
    mvcost = MV_COST_S(lambda_motion, mot->x, mot->y, cmvx, cmvy);
    satd_min = dmin;
    dmin += mvcost;
    hmin = 0;

    /* find half-pel */
    for (h = 1; h < 9; h++)
    {
        d = SATD_MB(hpel_cand[h], cur, dmin);
        mvcost = MV_COST_S(lambda_motion, mot->x + xh[h], mot->y + yh[h], cmvx, cmvy);
        d += mvcost;

        if (d < dmin)
        {
            dmin = d;
            hmin = h;
            satd_min = d - mvcost;
        }
    }

    mot->sad = dmin;
    mot->x += xh[hmin];
    mot->y += yh[hmin];
    encvid->best_hpel_pos = hmin;

    /*** search for quarter-pel ****/
    GenerateQuartPelPred(encvid->bilin_base[hmin], &(encvid->qpel_cand[0][0]), hmin);

    encvid->best_qpel_pos = qmin = -1;

    for (q = 0; q < 8; q++)
    {
        d = SATD_MB(encvid->qpel_cand[q], cur, dmin);
        mvcost = MV_COST_S(lambda_motion, mot->x + xq[q], mot->y + yq[q], cmvx, cmvy);
        d += mvcost;
        if (d < dmin)
        {
            dmin = d;
            qmin = q;
            satd_min = d - mvcost;
        }
    }

    if (qmin != -1)
    {
        mot->sad = dmin;
        mot->x += xq[qmin];
        mot->y += yq[qmin];
        encvid->best_qpel_pos = qmin;
    }

    return satd_min;
}



/** This function generates sub-pel prediction around the full-pel candidate.
Each sub-pel position array is 20 pixels wide (for word alignment) and 17 pixels tall. */
/** The sub-pel positions are labeled in a spiral manner from the center. */

void GenerateHalfPelPred(uint8* subpel_pred, uint8 *ncand, int lx)
{
    /* let's do the straightforward way first */
    uint8 *ref;
    uint8 *dst;
    uint8 tmp8;
    int32 tmp32;
    int16 tmp_horz[18*22], *dst_16, *src_16;
    int a = 0, b = 0, c = 0, d = 0, e = 0, f = 0; // temp
    int i, j;

    /* first copy full-pel to the first array */
    /* to be optimized later based on byte-offset load */
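    /* The copy below reads a 24x22 full-pel window starting at (-3,-3):
       the 16x16 MB plus the margins the 6-tap filter needs, with the width
       padded to 24 so each row can be moved with 32-bit loads. */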
    ref = ncand - 3 - lx - (lx << 1); /* move back (-3,-3) */
    dst = subpel_pred;

    dst -= 4; /* offset */
    for (j = 0; j < 22; j++) /* 24x22 */
    {
        i = 6;
        while (i > 0)
        {
            tmp32 = *ref++;
            tmp8 = *ref++;
            tmp32 |= (tmp8 << 8);
            tmp8 = *ref++;
            tmp32 |= (tmp8 << 16);
            tmp8 = *ref++;
            tmp32 |= (tmp8 << 24);
            *((uint32*)(dst += 4)) = tmp32;
            i--;
        }
        ref += (lx - 24);
    }

    /* from the first array, we do horizontal interp */
    ref = subpel_pred + 2;
    dst_16 = tmp_horz; /* 17 x 22 */
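
    /* Horizontal 6-tap filter with taps (1, -5, 20, 20, -5, 1), the H.264
       half-sample filter. This pass keeps the unrounded 16-bit sums in
       tmp_horz so the later vertical pass can round once with
       (x + 512) >> 10; rounding here would lose precision. */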
    for (j = 4; j > 0; j--)
    {
        for (i = 16; i > 0; i -= 4)
        {
            a = ref[-2];
            b = ref[-1];
            c = ref[0];
            d = ref[1];
            e = ref[2];
            f = ref[3];
            *dst_16++ = a + f - 5 * (b + e) + 20 * (c + d);
            a = ref[4];
            *dst_16++ = b + a - 5 * (c + f) + 20 * (d + e);
            b = ref[5];
            *dst_16++ = c + b - 5 * (d + a) + 20 * (e + f);
            c = ref[6];
            *dst_16++ = d + c - 5 * (e + b) + 20 * (f + a);

            ref += 4;
        }
        /* do the 17th column here */
        d = ref[3];
        *dst_16 =  e + d - 5 * (f + c) + 20 * (a + b);
        dst_16 += 2; /* stride for tmp_horz is 18 */
        ref += 8;  /* stride for ref is 24 */
        if (j == 3)  // move 18 lines down
        {
            dst_16 += 324; // 18*18
            ref += 432; // 18*24
        }
    }

    ref -= 480; // 20*24
    dst_16 -= 360; // 20*18
    dst = subpel_pred + V0Q_H2Q * SUBPEL_PRED_BLK_SIZE; /* go to the 14th array, 17x18 */

    for (j = 18; j > 0; j--)
    {
        for (i = 16; i > 0; i -= 4)
        {
            a = ref[-2];
            b = ref[-1];
            c = ref[0];
            d = ref[1];
            e = ref[2];
            f = ref[3];
            tmp32 = a + f - 5 * (b + e) + 20 * (c + d);
            *dst_16++ = tmp32;
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *dst++ = tmp32;

            a = ref[4];
            tmp32 = b + a - 5 * (c + f) + 20 * (d + e);
            *dst_16++ = tmp32;
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *dst++ = tmp32;

            b = ref[5];
            tmp32 = c + b - 5 * (d + a) + 20 * (e + f);
            *dst_16++ = tmp32;
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *dst++ = tmp32;

            c = ref[6];
            tmp32 = d + c - 5 * (e + b) + 20 * (f + a);
            *dst_16++ = tmp32;
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *dst++ = tmp32;

            ref += 4;
        }
        /* do the 17th column here */
        d = ref[3];
        tmp32 =  e + d - 5 * (f + c) + 20 * (a + b);
        *dst_16 = tmp32;
        tmp32 = (tmp32 + 16) >> 5;
        CLIP_RESULT(tmp32)
        *dst = tmp32;

        dst += 8;  /* stride for dst is 24 */
        dst_16 += 2; /* stride for tmp_horz is 18 */
        ref += 8;  /* stride for ref is 24 */
    }


    /* Do middle point filtering */
    src_16 = tmp_horz; /* 17 x 22 */
    dst = subpel_pred + V2Q_H2Q * SUBPEL_PRED_BLK_SIZE; /* 12th array, 17x17 */
    dst -= 24; // offset
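    /* Middle point (half-pel in both directions): apply the same 6-tap
       filter vertically to the 16-bit horizontal sums; the two passes give
       a combined gain of 32*32, hence the single rounding (x + 512) >> 10. */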
    for (i = 0; i < 17; i++)
    {
        for (j = 16; j > 0; j -= 4)
        {
            a = *src_16;
            b = *(src_16 += 18);
            c = *(src_16 += 18);
            d = *(src_16 += 18);
            e = *(src_16 += 18);
            f = *(src_16 += 18);

            tmp32 = a + f - 5 * (b + e) + 20 * (c + d);
            tmp32 = (tmp32 + 512) >> 10;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;

            a = *(src_16 += 18);
            tmp32 = b + a - 5 * (c + f) + 20 * (d + e);
            tmp32 = (tmp32 + 512) >> 10;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;

            b = *(src_16 += 18);
            tmp32 = c + b - 5 * (d + a) + 20 * (e + f);
            tmp32 = (tmp32 + 512) >> 10;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;

            c = *(src_16 += 18);
            tmp32 = d + c - 5 * (e + b) + 20 * (f + a);
            tmp32 = (tmp32 + 512) >> 10;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;

            src_16 -= (18 << 2);
        }

        d = src_16[90]; // 18*5
        tmp32 = e + d - 5 * (f + c) + 20 * (a + b);
        tmp32 = (tmp32 + 512) >> 10;
        CLIP_RESULT(tmp32)
        dst[24] = tmp32;

        src_16 -= ((18 << 4) - 1);
        dst -= ((24 << 4) - 1);
    }

    /* do vertical interpolation */
    ref = subpel_pred + 2;
    dst = subpel_pred + V2Q_H0Q * SUBPEL_PRED_BLK_SIZE; /* 10th array, 18x17 */
    dst -= 24; // offset
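
    /* Vertical half-pel: the 6-tap filter applied straight down the
       full-pel bytes, rounded with (x + 16) >> 5 since only a single 32x
       gain is involved. */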
    for (i = 2; i > 0; i--)
    {
        for (j = 16; j > 0; j -= 4)
        {
            a = *ref;
            b = *(ref += 24);
            c = *(ref += 24);
            d = *(ref += 24);
            e = *(ref += 24);
            f = *(ref += 24);

            tmp32 = a + f - 5 * (b + e) + 20 * (c + d);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;  // 10th

            a = *(ref += 24);
            tmp32 = b + a - 5 * (c + f) + 20 * (d + e);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;  // 10th

            b = *(ref += 24);
            tmp32 = c + b - 5 * (d + a) + 20 * (e + f);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;  // 10th

            c = *(ref += 24);
            tmp32 = d + c - 5 * (e + b) + 20 * (f + a);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;  // 10th

            ref -= (24 << 2);
        }

        d = ref[120]; // 24*5
        tmp32 = e + d - 5 * (f + c) + 20 * (a + b);
        tmp32 = (tmp32 + 16) >> 5;
        CLIP_RESULT(tmp32)
        dst[24] = tmp32;  // 10th

        dst -= ((24 << 4) - 1);
        ref -= ((24 << 4) - 1);
    }

    // Note that using SIMD-style code here doesn't help much; the cycle count stays almost the same.
    // One could simply reuse the loop above, changing "for (i = 2" to "for (i = 18".
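    /* The loop below processes two pixels per 32-bit word: the even bytes
       stay in bits 0-7 and 16-23 (mask 0xFF00FF) of a, the odd bytes in b,
       and the 6-tap sums for both pixels accumulate in the two 16-bit lanes
       at once. Adding 0x100010 puts the +16 rounding term in each lane, and
       CLIP_UPPER16 shifts and clamps each lane before packing the results
       back. A negative low lane can borrow into its neighbor before the
       clamp, which is what the disabled msk/VertInterpWClip fallback below
       was meant to re-check. */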
    for (i = 16; i > 0; i -= 4)
    {
        for (j = 17; j > 0; j--)
        {
            a = *((uint32*)ref); /* load 4 bytes */
            b = (a >> 8) & 0xFF00FF; /* second and fourth bytes */
            a &= 0xFF00FF; /* first and third bytes */

            c = *((uint32*)(ref + 120));
            d = (c >> 8) & 0xFF00FF;
            c &= 0xFF00FF;

            a += c;
            b += d;

            e = *((uint32*)(ref + 72)); /* e, f */
            f = (e >> 8) & 0xFF00FF;
            e &= 0xFF00FF;

            c = *((uint32*)(ref + 48)); /* c, d */
            d = (c >> 8) & 0xFF00FF;
            c &= 0xFF00FF;

            c += e;
            d += f;

            a += 20 * c;
            b += 20 * d;
            a += 0x100010; /* +16 rounding in each 16-bit lane */
            b += 0x100010;

            e = *((uint32*)(ref += 24)); /* e, f */
            f = (e >> 8) & 0xFF00FF;
            e &= 0xFF00FF;

            c = *((uint32*)(ref + 72)); /* c, d */
            d = (c >> 8) & 0xFF00FF;
            c &= 0xFF00FF;

            c += e;
            d += f;

            a -= 5 * c;
            b -= 5 * d;

            c = a << 16;
            d = b << 16;
            CLIP_UPPER16(a)
            CLIP_UPPER16(c)
            CLIP_UPPER16(b)
            CLIP_UPPER16(d)

            a |= (c >> 16);
            b |= (d >> 16);
            //  a>>=5;
            //  b>>=5;
            /* clip */
            //  msk |= b;  msk|=a;
            //  a &= 0xFF00FF;
            //  b &= 0xFF00FF;
            a |= (b << 8);  /* pack it back */

            *((uint16*)(dst += 24)) = a & 0xFFFF; // dst is not word-aligned
            *((uint16*)(dst + 2)) = a >> 16;

        }
        dst -= 404; // 24*17-4
        ref -= 404;
        /*      if(msk & 0xFF00FF00) // need clipping
                {
                    VertInterpWClip(dst,ref); // re-do 4 columns with clip
                }*/
    }

    return ;
}

void VertInterpWClip(uint8 *dst, uint8 *ref)
{
    int i, j;
    int a, b, c, d, e, f;
    int32 tmp32;

    dst -= 4;
    ref -= 4;

    for (i = 4; i > 0; i--)
    {
        for (j = 16; j > 0; j -= 4)
        {
            a = *ref;
            b = *(ref += 24);
            c = *(ref += 24);
            d = *(ref += 24);
            e = *(ref += 24);
            f = *(ref += 24);

            tmp32 = a + f - 5 * (b + e) + 20 * (c + d);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;  // 10th

            a = *(ref += 24);
            tmp32 = b + a - 5 * (c + f) + 20 * (d + e);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;  // 10th

            b = *(ref += 24);
            tmp32 = c + b - 5 * (d + a) + 20 * (e + f);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;  // 10th

            c = *(ref += 24);
            tmp32 = d + c - 5 * (e + b) + 20 * (f + a);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;  // 10th

            ref -= (24 << 2);
        }

        d = ref[120]; // 24*5
        tmp32 = e + d - 5 * (f + c) + 20 * (a + b);
        tmp32 = (tmp32 + 16) >> 5;
        CLIP_RESULT(tmp32)
        dst[24] = tmp32;  // 10th

        dst -= ((24 << 4) - 1);
        ref -= ((24 << 4) - 1);
    }

    return ;
}


void GenerateQuartPelPred(uint8 **bilin_base, uint8 *qpel_cand, int hpel_pos)
{
    // for even values of hpel_pos use the diamond pattern; for odd values, the star pattern
    int i, j;

    uint8 *c1 = qpel_cand;
    uint8 *tl = bilin_base[0];
    uint8 *tr = bilin_base[1];
    uint8 *bl = bilin_base[2];
    uint8 *br = bilin_base[3];
    int a, b, c, d;
    int offset = 1 - (384 * 7);
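    /* Each of the eight candidate planes is 384 bytes (24-byte pitch x 16
       rows); after writing one pixel to all eight planes (seven += 384
       steps), offset = 1 - 384*7 returns c1 to plane 0, advanced by one
       pixel. */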

    if (!(hpel_pos&1)) // diamond pattern
    {
        j = 16;
        while (j--)
        {
            i = 16;
            while (i--)
            {
                d = tr[24];
                a = *tr++;
                b = bl[1];
                c = *br++;

                *c1 = (c + a + 1) >> 1;
                *(c1 += 384) = (b + a + 1) >> 1; /* c2 */
                *(c1 += 384) = (b + c + 1) >> 1; /* c3 */
                *(c1 += 384) = (b + d + 1) >> 1; /* c4 */

                b = *bl++;

                *(c1 += 384) = (c + d + 1) >> 1;  /* c5 */
                *(c1 += 384) = (b + d + 1) >> 1;  /* c6 */
                *(c1 += 384) = (b + c + 1) >> 1;  /* c7 */
                *(c1 += 384) = (b + a + 1) >> 1;  /* c8 */

                c1 += offset;
            }
            // advance to the next line, pitch is 24
            tl += 8;
            tr += 8;
            bl += 8;
            br += 8;
            c1 += 8;
        }
    }
    else // star pattern
    {
        j = 16;
        while (j--)
        {
            i = 16;
            while (i--)
            {
                a = *br++;
                b = *tr++;
                c = tl[1];
                *c1 = (a + b + 1) >> 1;
                b = bl[1];
                *(c1 += 384) = (a + c + 1) >> 1; /* c2 */
                c = tl[25];
                *(c1 += 384) = (a + b + 1) >> 1; /* c3 */
                b = tr[23];
                *(c1 += 384) = (a + c + 1) >> 1; /* c4 */
                c = tl[24];
                *(c1 += 384) = (a + b + 1) >> 1; /* c5 */
                b = *bl++;
                *(c1 += 384) = (a + c + 1) >> 1; /* c6 */
                c = *tl++;
                *(c1 += 384) = (a + b + 1) >> 1; /* c7 */
                *(c1 += 384) = (a + c + 1) >> 1; /* c8 */

                c1 += offset;
            }
            // advance to the next line, pitch is 24
            tl += 8;
            tr += 8;
            bl += 8;
            br += 8;
            c1 += 8;
        }
    }

    return ;
}

/* assuming cand always has a pitch of 24 */
int SATD_MB(uint8 *cand, uint8 *cur, int dmin)
{
    int cost;

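    /* Pack the early-termination threshold into the upper 16 bits and the
       candidate pitch (24) into the lower 16, the combined dmin_lx argument
       that AVCSAD_Macroblock_C expects. */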
    dmin = (dmin << 16) | 24;
    cost = AVCSAD_Macroblock_C(cand, cur, dmin, NULL);

    return cost;
}