/* ------------------------------------------------------------------
 * Copyright (C) 1998-2009 PacketVideo
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 * -------------------------------------------------------------------
 */
#include "avcenc_lib.h"
/* 3/29/01 fast half-pel search based on neighboring guess */
/* value ranging from 0 to 4, high complexity (more accurate) to
   low complexity (less accurate) */
#define HP_DISTANCE_TH      5 // 2  /* half-pel distance threshold */

#define PREF_16_VEC 129     /* 1MV bias versus 4MVs */

#define CLIP_RESULT(x)      if ((uint)x > 0xFF) { \
                                x = 0xFF & (~(x >> 31)); }

#define CLIP_UPPER16(x)     if ((uint)x >= 0x20000000) { \
                                x = 0xFF0000 & (~(x >> 31)); } \
                            else { \
                                x = (x >> 5) & 0xFF0000; \
                            }
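/* CLIP_RESULT clamps x to [0,255] without a second branch: for negative x,
   ~(x >> 31) is 0 so x becomes 0; for x > 255, ~(x >> 31) is all ones so x
   becomes 0xFF. CLIP_UPPER16 performs the same saturation for a pixel carried
   in bits 16-23 of a packed word that is still scaled by 32 (hence the >> 5);
   it is used by the packed vertical-filter loop further below. */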

/*=====================================================================
    Function:   AVCFindHalfPelMB
    Date:       10/31/2007
    Purpose:    Find half pel resolution MV surrounding the full-pel MV
=====================================================================*/

int AVCFindHalfPelMB(AVCEncObject *encvid, uint8 *cur, AVCMV *mot, uint8 *ncand,
                     int xpos, int ypos, int hp_guess, int cmvx, int cmvy)
{
    AVCPictureData *currPic = encvid->common->currPic;
    int lx = currPic->pitch;
    int d, dmin, satd_min;
    uint8 *cand;
    int lambda_motion = encvid->lambda_motion;
    uint8 *mvbits = encvid->mvbits;
    int mvcost;
    /* list of candidates to go through for half-pel search */
    uint8 *subpel_pred = (uint8*) encvid->subpel_pred; // all 16 sub-pel positions
    uint8 **hpel_cand = (uint8**) encvid->hpel_cand; /* half-pel position */

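    /* search offsets, listed in spiral order from the center; MVs are kept
       in quarter-pel units, so the half-pel offsets below step by 2 and the
       quarter-pel offsets by 1 */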
    int xh[9] = {0, 0, 2, 2, 2, 0, -2, -2, -2};
    int yh[9] = {0, -2, -2, 0, 2, 2, 2, 0, -2};
    int xq[8] = {0, 1, 1, 1, 0, -1, -1, -1};
    int yq[8] = {-1, -1, 0, 1, 1, 1, 0, -1};
    int h, hmin, q, qmin;

    OSCL_UNUSED_ARG(xpos);
    OSCL_UNUSED_ARG(ypos);
    OSCL_UNUSED_ARG(hp_guess);

    GenerateHalfPelPred(subpel_pred, ncand, lx);

    cur = encvid->currYMB; // pre-load current original MB

    cand = hpel_cand[0];

    // find cost for the current full-pel position
    dmin = SATD_MB(cand, cur, 65535); // get Hadamard transform SAD
    mvcost = MV_COST_S(lambda_motion, mot->x, mot->y, cmvx, cmvy);
    satd_min = dmin;
    dmin += mvcost;
    hmin = 0;
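    /* each candidate is scored with a rate-constrained cost: SATD plus the
       lambda-weighted MV bit cost; the running best cost is passed into
       SATD_MB, which presumably lets the SAD routine bail out early once
       the cost is exceeded */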

    /* find half-pel */
    for (h = 1; h < 9; h++)
    {
        d = SATD_MB(hpel_cand[h], cur, dmin);
        mvcost = MV_COST_S(lambda_motion, mot->x + xh[h], mot->y + yh[h], cmvx, cmvy);
        d += mvcost;

        if (d < dmin)
        {
            dmin = d;
            hmin = h;
            satd_min = d - mvcost;
        }
    }

    mot->sad = dmin;
    mot->x += xh[hmin];
    mot->y += yh[hmin];
    encvid->best_hpel_pos = hmin;

    /*** search for quarter-pel ****/
    GenerateQuartPelPred(encvid->bilin_base[hmin], &(encvid->qpel_cand[0][0]), hmin);

    encvid->best_qpel_pos = qmin = -1;

    for (q = 0; q < 8; q++)
    {
        d = SATD_MB(encvid->qpel_cand[q], cur, dmin);
        mvcost = MV_COST_S(lambda_motion, mot->x + xq[q], mot->y + yq[q], cmvx, cmvy);
        d += mvcost;
        if (d < dmin)
        {
            dmin = d;
            qmin = q;
            satd_min = d - mvcost;
        }
    }

    if (qmin != -1)
    {
        mot->sad = dmin;
        mot->x += xq[qmin];
        mot->y += yq[qmin];
        encvid->best_qpel_pos = qmin;
    }

    return satd_min;
}


/** This function generates sub-pel prediction around the full-pel candidate.
    Each sub-pel position array is 20 pixels wide (for word-alignment) and 17 pixels tall. */
/** The sub-pel positions are labeled in a spiral manner from the center. */

void GenerateHalfPelPred(uint8 *subpel_pred, uint8 *ncand, int lx)
{
    /* let's do the straightforward way first */
    uint8 *ref;
    uint8 *dst;
    uint8 tmp8;
    int32 tmp32;
    int16 tmp_horz[18*22], *dst_16, *src_16;
    int a = 0, b = 0, c = 0, d = 0, e = 0, f = 0; // temp
    int i, j;

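    /* half-pel samples come from the standard H.264 6-tap filter
       (1, -5, 20, 20, -5, 1), applied in stages: copy the 24x22 full-pel
       region, filter horizontally (keeping 16-bit intermediates in tmp_horz),
       filter those intermediates vertically for the middle points, and
       filter the full-pel samples vertically */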
    /* first copy full-pel to the first array */
    /* to be optimized later based on byte-offset load */
    ref = ncand - 3 - lx - (lx << 1); /* move back (-3,-3) */
    dst = subpel_pred;

    dst -= 4; /* offset */
    for (j = 0; j < 22; j++) /* 24x22 */
    {
        i = 6;
        while (i > 0)
        {
            tmp32 = *ref++;
            tmp8 = *ref++;
            tmp32 |= (tmp8 << 8);
            tmp8 = *ref++;
            tmp32 |= (tmp8 << 16);
            tmp8 = *ref++;
            tmp32 |= (tmp8 << 24);
            *((uint32*)(dst += 4)) = tmp32;
            i--;
        }
        ref += (lx - 24);
    }

    /* from the first array, we do horizontal interp */
    ref = subpel_pred + 2;
    dst_16 = tmp_horz; /* 17 x 22 */

    for (j = 4; j > 0; j--)
    {
        for (i = 16; i > 0; i -= 4)
        {
            a = ref[-2];
            b = ref[-1];
            c = ref[0];
            d = ref[1];
            e = ref[2];
            f = ref[3];
            *dst_16++ = a + f - 5 * (b + e) + 20 * (c + d);
            a = ref[4];
            *dst_16++ = b + a - 5 * (c + f) + 20 * (d + e);
            b = ref[5];
            *dst_16++ = c + b - 5 * (d + a) + 20 * (e + f);
            c = ref[6];
            *dst_16++ = d + c - 5 * (e + b) + 20 * (f + a);

            ref += 4;
        }
        /* do the 17th column here */
        d = ref[3];
        *dst_16 = e + d - 5 * (f + c) + 20 * (a + b);
        dst_16 += 2; /* stride for tmp_horz is 18 */
        ref += 8; /* stride for ref is 24 */
        if (j == 3) // move 18 lines down
        {
            dst_16 += 324; // 18*18
            ref += 432; // 18*24
        }
    }

    ref -= 480; // 20*24
    dst_16 -= 360; // 20*18
    dst = subpel_pred + V0Q_H2Q * SUBPEL_PRED_BLK_SIZE; /* go to the 14th array 17x18 */

    for (j = 18; j > 0; j--)
    {
        for (i = 16; i > 0; i -= 4)
        {
            a = ref[-2];
            b = ref[-1];
            c = ref[0];
            d = ref[1];
            e = ref[2];
            f = ref[3];
            tmp32 = a + f - 5 * (b + e) + 20 * (c + d);
            *dst_16++ = tmp32;
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *dst++ = tmp32;

            a = ref[4];
            tmp32 = b + a - 5 * (c + f) + 20 * (d + e);
            *dst_16++ = tmp32;
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *dst++ = tmp32;

            b = ref[5];
            tmp32 = c + b - 5 * (d + a) + 20 * (e + f);
            *dst_16++ = tmp32;
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *dst++ = tmp32;

            c = ref[6];
            tmp32 = d + c - 5 * (e + b) + 20 * (f + a);
            *dst_16++ = tmp32;
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *dst++ = tmp32;

            ref += 4;
        }
        /* do the 17th column here */
        d = ref[3];
        tmp32 = e + d - 5 * (f + c) + 20 * (a + b);
        *dst_16 = tmp32;
        tmp32 = (tmp32 + 16) >> 5;
        CLIP_RESULT(tmp32)
        *dst = tmp32;

        dst += 8; /* stride for dst is 24 */
        dst_16 += 2; /* stride for tmp_horz is 18 */
        ref += 8; /* stride for ref is 24 */
    }


    /* Do middle point filtering */
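    /* the middle (half-pel in both x and y) samples are filtered vertically
       from the 16-bit horizontal intermediates; two 6-tap passes scale the
       result by 32*32, hence the (x + 512) >> 10 rounding below */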
    src_16 = tmp_horz; /* 17 x 22 */
    dst = subpel_pred + V2Q_H2Q * SUBPEL_PRED_BLK_SIZE; /* 12th array 17x17 */
    dst -= 24; // offset
    for (i = 0; i < 17; i++)
    {
        for (j = 16; j > 0; j -= 4)
        {
            a = *src_16;
            b = *(src_16 += 18);
            c = *(src_16 += 18);
            d = *(src_16 += 18);
            e = *(src_16 += 18);
            f = *(src_16 += 18);

            tmp32 = a + f - 5 * (b + e) + 20 * (c + d);
            tmp32 = (tmp32 + 512) >> 10;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;

            a = *(src_16 += 18);
            tmp32 = b + a - 5 * (c + f) + 20 * (d + e);
            tmp32 = (tmp32 + 512) >> 10;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;

            b = *(src_16 += 18);
            tmp32 = c + b - 5 * (d + a) + 20 * (e + f);
            tmp32 = (tmp32 + 512) >> 10;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;

            c = *(src_16 += 18);
            tmp32 = d + c - 5 * (e + b) + 20 * (f + a);
            tmp32 = (tmp32 + 512) >> 10;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;

            src_16 -= (18 << 2);
        }

        d = src_16[90]; // 18*5
        tmp32 = e + d - 5 * (f + c) + 20 * (a + b);
        tmp32 = (tmp32 + 512) >> 10;
        CLIP_RESULT(tmp32)
        dst[24] = tmp32;

        src_16 -= ((18 << 4) - 1);
        dst -= ((24 << 4) - 1);
    }

    /* do vertical interpolation */
    ref = subpel_pred + 2;
    dst = subpel_pred + V2Q_H0Q * SUBPEL_PRED_BLK_SIZE; /* 10th array 18x17 */
    dst -= 24; // offset

    for (i = 2; i > 0; i--)
    {
        for (j = 16; j > 0; j -= 4)
        {
            a = *ref;
            b = *(ref += 24);
            c = *(ref += 24);
            d = *(ref += 24);
            e = *(ref += 24);
            f = *(ref += 24);

            tmp32 = a + f - 5 * (b + e) + 20 * (c + d);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32; // 10th

            a = *(ref += 24);
            tmp32 = b + a - 5 * (c + f) + 20 * (d + e);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32; // 10th

            b = *(ref += 24);
            tmp32 = c + b - 5 * (d + a) + 20 * (e + f);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32; // 10th

            c = *(ref += 24);
            tmp32 = d + c - 5 * (e + b) + 20 * (f + a);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32; // 10th

            ref -= (24 << 2);
        }

        d = ref[120]; // 24*5
        tmp32 = e + d - 5 * (f + c) + 20 * (a + b);
        tmp32 = (tmp32 + 16) >> 5;
        CLIP_RESULT(tmp32)
        dst[24] = tmp32; // 10th

        dst -= ((24 << 4) - 1);
        ref -= ((24 << 4) - 1);
    }
    // note that using SIMD here doesn't help much; the cycle count stays almost the same
    // one can just use the above code and change for (i = 2 ...) to for (i = 18 ...)
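    /* packed version: each 32-bit load carries four pixels; the even bytes
       are filtered in a/c/e and the odd bytes in b/d/f, keeping two 16-bit
       accumulators per register, so four output pixels are produced per
       inner-loop iteration */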
    for (i = 16; i > 0; i -= 4)
    {
        for (j = 17; j > 0; j--)
        {
            a = *((uint32*)ref); /* load 4 bytes */
            b = (a >> 8) & 0xFF00FF; /* second and fourth byte */
            a &= 0xFF00FF;

            c = *((uint32*)(ref + 120));
            d = (c >> 8) & 0xFF00FF;
            c &= 0xFF00FF;

            a += c;
            b += d;

            e = *((uint32*)(ref + 72)); /* e, f */
            f = (e >> 8) & 0xFF00FF;
            e &= 0xFF00FF;

            c = *((uint32*)(ref + 48)); /* c, d */
            d = (c >> 8) & 0xFF00FF;
            c &= 0xFF00FF;

            c += e;
            d += f;

            a += 20 * c;
            b += 20 * d;
            a += 0x100010;
            b += 0x100010;

            e = *((uint32*)(ref += 24)); /* e, f */
            f = (e >> 8) & 0xFF00FF;
            e &= 0xFF00FF;

            c = *((uint32*)(ref + 72)); /* c, d */
            d = (c >> 8) & 0xFF00FF;
            c &= 0xFF00FF;

            c += e;
            d += f;

            a -= 5 * c;
            b -= 5 * d;

            c = a << 16;
            d = b << 16;
            CLIP_UPPER16(a)
            CLIP_UPPER16(c)
            CLIP_UPPER16(b)
            CLIP_UPPER16(d)

            a |= (c >> 16);
            b |= (d >> 16);
            // a >>= 5;
            // b >>= 5;
            /* clip */
            // msk |= b; msk |= a;
            // a &= 0xFF00FF;
            // b &= 0xFF00FF;
            a |= (b << 8); /* pack it back */

            *((uint16*)(dst += 24)) = a & 0xFFFF; // dst is not word-aligned
            *((uint16*)(dst + 2)) = a >> 16;

        }
        dst -= 404; // 24*17-4
        ref -= 404;
        /* if (msk & 0xFF00FF00) // need clipping
        {
            VertInterpWClip(dst, ref); // re-do 4 columns with clip
        } */
    }

    return ;
}

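/* scalar fallback that re-filters four columns with full clipping; kept for
   the (commented-out) overflow check at the end of the packed loop above */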
void VertInterpWClip(uint8 *dst, uint8 *ref)
{
    int i, j;
    int a, b, c, d, e, f;
    int32 tmp32;

    dst -= 4;
    ref -= 4;

    for (i = 4; i > 0; i--)
    {
        for (j = 16; j > 0; j -= 4)
        {
            a = *ref;
            b = *(ref += 24);
            c = *(ref += 24);
            d = *(ref += 24);
            e = *(ref += 24);
            f = *(ref += 24);

            tmp32 = a + f - 5 * (b + e) + 20 * (c + d);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32; // 10th

            a = *(ref += 24);
            tmp32 = b + a - 5 * (c + f) + 20 * (d + e);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32; // 10th

            b = *(ref += 24);
            tmp32 = c + b - 5 * (d + a) + 20 * (e + f);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32; // 10th

            c = *(ref += 24);
            tmp32 = d + c - 5 * (e + b) + 20 * (f + a);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32; // 10th

            ref -= (24 << 2);
        }

        d = ref[120]; // 24*5
        tmp32 = e + d - 5 * (f + c) + 20 * (a + b);
        tmp32 = (tmp32 + 16) >> 5;
        CLIP_RESULT(tmp32)
        dst[24] = tmp32; // 10th

        dst -= ((24 << 4) - 1);
        ref -= ((24 << 4) - 1);
    }

    return ;
}


void GenerateQuartPelPred(uint8 **bilin_base, uint8 *qpel_cand, int hpel_pos)
{
    // for even values of hpel_pos, start with pattern 1; otherwise, start with pattern 2
    int i, j;

    uint8 *c1 = qpel_cand;
    uint8 *tl = bilin_base[0];
    uint8 *tr = bilin_base[1];
    uint8 *bl = bilin_base[2];
    uint8 *br = bilin_base[3];
    int a, b, c, d;
    int offset = 1 - (384 * 7);
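    /* the 8 quarter-pel candidates are bilinear averages of the surrounding
       half/full-pel planes; each candidate array is 384 (24x16) bytes apart,
       so after writing the 8th candidate, offset steps back 7 arrays and
       advances one pixel */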

    if (!(hpel_pos & 1)) // diamond pattern
    {
        j = 16;
        while (j--)
        {
            i = 16;
            while (i--)
            {
                d = tr[24];
                a = *tr++;
                b = bl[1];
                c = *br++;

                *c1 = (c + a + 1) >> 1;
                *(c1 += 384) = (b + a + 1) >> 1; /* c2 */
                *(c1 += 384) = (b + c + 1) >> 1; /* c3 */
                *(c1 += 384) = (b + d + 1) >> 1; /* c4 */

                b = *bl++;

                *(c1 += 384) = (c + d + 1) >> 1; /* c5 */
                *(c1 += 384) = (b + d + 1) >> 1; /* c6 */
                *(c1 += 384) = (b + c + 1) >> 1; /* c7 */
                *(c1 += 384) = (b + a + 1) >> 1; /* c8 */

                c1 += offset;
            }
            // advance to the next line, pitch is 24
            tl += 8;
            tr += 8;
            bl += 8;
            br += 8;
            c1 += 8;
        }
    }
    else // star pattern
    {
        j = 16;
        while (j--)
        {
            i = 16;
            while (i--)
            {
                a = *br++;
                b = *tr++;
                c = tl[1];
                *c1 = (a + b + 1) >> 1;
                b = bl[1];
                *(c1 += 384) = (a + c + 1) >> 1; /* c2 */
                c = tl[25];
                *(c1 += 384) = (a + b + 1) >> 1; /* c3 */
                b = tr[23];
                *(c1 += 384) = (a + c + 1) >> 1; /* c4 */
                c = tl[24];
                *(c1 += 384) = (a + b + 1) >> 1; /* c5 */
                b = *bl++;
                *(c1 += 384) = (a + c + 1) >> 1; /* c6 */
                c = *tl++;
                *(c1 += 384) = (a + b + 1) >> 1; /* c7 */
                *(c1 += 384) = (a + c + 1) >> 1; /* c8 */

                c1 += offset;
            }
            // advance to the next line, pitch is 24
            tl += 8;
            tr += 8;
            bl += 8;
            br += 8;
            c1 += 8;
        }
    }

    return ;
}


/* assuming cand always has a pitch of 24 */
int SATD_MB(uint8 *cand, uint8 *cur, int dmin)
{
    int cost;

    dmin = (dmin << 16) | 24;
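    /* the early-termination threshold is packed into the upper half-word and
       the pitch (24) into the lower; AVCSAD_Macroblock_C appears to expect
       this combined value as a single argument */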
    cost = AVCSAD_Macroblock_C(cand, cur, dmin, NULL);

    return cost;
}