1 /******************************************************************************
2  *
3  * Copyright (C) 2015 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 
21 /**
22 *******************************************************************************
23 * @file
24 *  ih264e_intra_modes_eval.c
25 *
26 * @brief
27 *  This file contains definitions of routines that perform rate distortion
28 *  analysis on a macroblock if they are to be coded as intra.
29 *
30 * @author
31 *  ittiam
32 *
33 * @par List of Functions:
*  - ih264e_derive_nghbr_avbl_of_mbs()
35 *  - ih264e_derive_ngbr_avbl_of_mb_partitions()
36 *  - ih264e_evaluate_intra16x16_modes_for_least_cost_rdoptoff()
37 *  - ih264e_evaluate_intra8x8_modes_for_least_cost_rdoptoff()
38 *  - ih264e_evaluate_intra4x4_modes_for_least_cost_rdoptoff()
39 *  - ih264e_evaluate_intra4x4_modes_for_least_cost_rdopton()
40 *  - ih264e_evaluate_chroma_intra8x8_modes_for_least_cost_rdoptoff()
41 *  - ih264e_evaluate_intra16x16_modes()
42 *  - ih264e_evaluate_intra4x4_modes()
43 *  - ih264e_evaluate_intra_chroma_modes()
44 *
45 * @remarks
46 *  None
47 *
48 *******************************************************************************
49 */
50 
51 /*****************************************************************************/
52 /* File Includes                                                             */
53 /*****************************************************************************/
54 
55 /* System include files */
56 #include <stdio.h>
57 #include <string.h>
58 #include <limits.h>
59 #include <assert.h>
60 
61 /* User include files */
62 #include "ih264e_config.h"
63 #include "ih264_typedefs.h"
64 #include "ih264e_defs.h"
65 #include "iv2.h"
66 #include "ive2.h"
67 #include "ih264_debug.h"
68 #include "ih264_defs.h"
69 #include "ih264_macros.h"
70 #include "ih264_intra_pred_filters.h"
71 #include "ih264_structs.h"
72 #include "ih264_common_tables.h"
73 #include "ih264_trans_quant_itrans_iquant.h"
74 #include "ih264_inter_pred_filters.h"
75 #include "ih264_mem_fns.h"
76 #include "ih264_padding.h"
77 #include "ih264_deblk_edge_filters.h"
78 #include "ih264_cabac_tables.h"
79 #include "ime_distortion_metrics.h"
80 #include "ih264e_error.h"
81 #include "ih264e_bitstream.h"
82 #include "ime_defs.h"
83 #include "ime_structs.h"
84 #include "irc_cntrl_param.h"
85 #include "irc_frame_info_collector.h"
86 #include "ih264e_rate_control.h"
87 #include "ih264e_cabac_structs.h"
88 #include "ih264e_structs.h"
89 #include "ih264e_intra_modes_eval.h"
90 #include "ih264e_globals.h"
91 #include "ime_platform_macros.h"
92 
93 
94 /*****************************************************************************/
95 /* Function Definitions                                                      */
96 /*****************************************************************************/
97 
98 /**
99 ******************************************************************************
100 *
101 * @brief
102 *  derivation process for macroblock availability
103 *
104 * @par   Description
105 *  Calculates the availability of the left, top, topright and topleft macroblocks.
106 *
107 * @param[in] ps_proc_ctxt
108 *  pointer to proc context (handle)
109 *
110 * @remarks Based on section 6.4.5 in H264 spec
111 *
112 * @return  none
113 *
114 ******************************************************************************
115 */
ih264e_derive_nghbr_avbl_of_mbs(process_ctxt_t * ps_proc)116 void ih264e_derive_nghbr_avbl_of_mbs(process_ctxt_t *ps_proc)
117 {
118     UWORD8 *pu1_slice_idx_curr = ps_proc->pu1_slice_idx;
119     UWORD8 *pu1_slice_idx_b;
120     UWORD8 *pu1_slice_idx_a;
121     UWORD8 *pu1_slice_idx_c;
122     UWORD8 *pu1_slice_idx_d;
123     block_neighbors_t *ps_ngbr_avbl;
124     WORD32 i4_mb_x, i4_mb_y;
125     WORD32 i4_wd_mbs;
126 
127     i4_mb_x = ps_proc->i4_mb_x;
128     i4_mb_y = ps_proc->i4_mb_y;
129 
130     i4_wd_mbs = ps_proc->i4_wd_mbs;
131 
132     pu1_slice_idx_curr += (i4_mb_y * i4_wd_mbs) + i4_mb_x;
133     pu1_slice_idx_a = pu1_slice_idx_curr - 1;
134     pu1_slice_idx_b = pu1_slice_idx_curr - i4_wd_mbs;
135     pu1_slice_idx_c = pu1_slice_idx_b + 1;
136     pu1_slice_idx_d = pu1_slice_idx_b - 1;
137     ps_ngbr_avbl = ps_proc->ps_ngbr_avbl;
138 
139     /**********************************************************************/
140     /* The macroblock is marked as available, unless one of the following */
141     /* conditions is true in which case the macroblock shall be marked as */
142     /* not available.                                                     */
143     /* 1. mbAddr < 0                                                      */
144     /* 2  mbAddr > CurrMbAddr                                             */
145     /* 3. the macroblock with address mbAddr belongs to a different slice */
146     /* than the macroblock with address CurrMbAddr                        */
147     /**********************************************************************/
148 
149     /* left macroblock availability */
150     if (i4_mb_x == 0)
151     { /* macroblocks along first column */
152         ps_ngbr_avbl->u1_mb_a = 0;
153     }
154     else
155     { /* macroblocks belong to same slice? */
156         if (*pu1_slice_idx_a != *pu1_slice_idx_curr)
157             ps_ngbr_avbl->u1_mb_a = 0;
158         else
159             ps_ngbr_avbl->u1_mb_a = 1;
160     }
161 
162     /* top macroblock availability */
163     if (i4_mb_y == 0)
164     { /* macroblocks along first row */
165         ps_ngbr_avbl->u1_mb_b = 0;
166     }
167     else
168     { /* macroblocks belong to same slice? */
169         if (*pu1_slice_idx_b != *pu1_slice_idx_curr)
170             ps_ngbr_avbl->u1_mb_b = 0;
171         else
172             ps_ngbr_avbl->u1_mb_b = 1;
173     }
174 
175     /* top right macroblock availability */
176     if (i4_mb_x == i4_wd_mbs-1 || i4_mb_y == 0)
177     { /* macroblocks along last column */
178         ps_ngbr_avbl->u1_mb_c = 0;
179     }
180     else
181     { /* macroblocks belong to same slice? */
182         if (*pu1_slice_idx_c != *pu1_slice_idx_curr)
183             ps_ngbr_avbl->u1_mb_c = 0;
184         else
185             ps_ngbr_avbl->u1_mb_c = 1;
186     }
187 
188     /* top left macroblock availability */
189     if (i4_mb_x == 0 || i4_mb_y == 0)
190     { /* macroblocks along first column */
191         ps_ngbr_avbl->u1_mb_d = 0;
192     }
193     else
194     { /* macroblocks belong to same slice? */
195         if (*pu1_slice_idx_d != *pu1_slice_idx_curr)
196             ps_ngbr_avbl->u1_mb_d = 0;
197         else
198             ps_ngbr_avbl->u1_mb_d = 1;
199     }
200 }
201 
202 /**
203 ******************************************************************************
204 *
205 * @brief
206 *  derivation process for subblock/partition availability
207 *
208 * @par   Description
209 *  Calculates the availability of the left, top, topright and topleft subblock
210 *  or partitions.
211 *
212 * @param[in]    ps_proc_ctxt
213 *  pointer to macroblock context (handle)
214 *
215 * @param[in]    i1_pel_pos_x
216 *  column position of the pel wrt the current block
217 *
218 * @param[in]    i1_pel_pos_y
219 *  row position of the pel in wrt current block
220 *
221 * @remarks     Assumptions: before calling this function it is assumed that
222 *   the neighbor availability of the current macroblock is already derived.
223 *   Based on table 6-3 of H264 specification
224 *
225 * @return      availability status (yes or no)
226 *
227 ******************************************************************************
228 */
ih264e_derive_ngbr_avbl_of_mb_partitions(block_neighbors_t * ps_ngbr_avbl,WORD8 i1_pel_pos_x,WORD8 i1_pel_pos_y)229 UWORD8 ih264e_derive_ngbr_avbl_of_mb_partitions(block_neighbors_t *ps_ngbr_avbl,
230                                                 WORD8 i1_pel_pos_x,
231                                                 WORD8 i1_pel_pos_y)
232 {
233     UWORD8 u1_neighbor_avail=0;
234 
235     /**********************************************************************/
236     /* values of i1_pel_pos_x in the range 0-15 inclusive correspond to   */
237     /* various columns of a macroblock                                    */
238     /*                                                                    */
239     /* values of i1_pel_pos_y in the range 0-15 inclusive correspond to   */
240     /* various rows of a macroblock                                       */
241     /*                                                                    */
242     /* other values of i1_pel_pos_x & i1_pel_pos_y represents elements    */
243     /* outside the bound of an mb ie., represents its neighbors.          */
244     /**********************************************************************/
245     if (i1_pel_pos_x < 0)
246     { /* column(-1) */
247         if (i1_pel_pos_y < 0)
248         { /* row(-1) */
249             u1_neighbor_avail = ps_ngbr_avbl->u1_mb_d; /* current mb topleft availability */
250         }
251         else if (i1_pel_pos_y >= 0 && i1_pel_pos_y < 16)
252         { /* all rows of a macroblock */
253             u1_neighbor_avail = ps_ngbr_avbl->u1_mb_a; /* current mb left availability */
254         }
255         else /* if (i1_pel_pos_y >= 16) */
256         { /* rows(+16) */
257             u1_neighbor_avail = 0;  /* current mb bottom left availability */
258         }
259     }
260     else if (i1_pel_pos_x >= 0 && i1_pel_pos_x < 16)
261     { /* all columns of a macroblock */
262         if (i1_pel_pos_y < 0)
263         { /* row(-1) */
264             u1_neighbor_avail = ps_ngbr_avbl->u1_mb_b; /* current mb top availability */
265         }
266         else if (i1_pel_pos_y >= 0 && i1_pel_pos_y < 16)
267         { /* all rows of a macroblock */
268             u1_neighbor_avail = 1; /* current mb availability */
269             /* availability of the partition is dependent on the position of the partition inside the mb */
270             /* although the availability is declared as 1 in all cases these needs to be corrected somewhere else and this is not done in here */
271         }
272         else /* if (i1_pel_pos_y >= 16) */
273         { /* rows(+16) */
274             u1_neighbor_avail = 0;  /* current mb bottom availability */
275         }
276     }
277     else if (i1_pel_pos_x >= 16)
278     { /* column(+16) */
279         if (i1_pel_pos_y < 0)
280         { /* row(-1) */
281             u1_neighbor_avail = ps_ngbr_avbl->u1_mb_c; /* current mb top right availability */
282         }
283         else /* if (i1_pel_pos_y >= 0) */
284         { /* all other rows */
285             u1_neighbor_avail = 0;  /* current mb right & bottom right availability */
286         }
287     }
288 
289     return u1_neighbor_avail;
290 }
291 
292 /**
293 ******************************************************************************
294 *
295 * @brief
296 *  evaluate best intra 16x16 mode (rate distortion opt off)
297 *
298 * @par Description
299 *  This function evaluates all the possible intra 16x16 modes and finds the mode
300 *  that best represents the macro-block (least distortion) and occupies fewer
301 *  bits in the bit-stream.
302 *
303 * @param[in]   ps_proc_ctxt
304 *  pointer to process context (handle)
305 *
306 * @remarks
307 *  Ideally the cost of encoding a macroblock is calculated as
308 *  (distortion + lambda*rate). Where distortion is SAD/SATD,... between the
309 *  input block and the reconstructed block and rate is the number of bits taken
310 *  to place the macroblock in the bit-stream. In this routine the rate does not
311 *  exactly point to the total number of bits it takes, rather it points to header
312 *  bits necessary for encoding the macroblock. Assuming the deltaQP, cbp bits
313 *  and residual bits fall in to texture bits the number of bits taken to encoding
314 *  mbtype is considered as rate, we compute cost. Further we will approximate
315 *  the distortion as the deviation b/w input and the predicted block as opposed
316 *  to input and reconstructed block.
317 *
318 *  NOTE: As per the Document JVT-O079, for intra 16x16 macroblock,
319 *  the SAD and cost are one and the same.
320 *
321 * @return     none
322 *
323 ******************************************************************************
324 */
325 
ih264e_evaluate_intra16x16_modes_for_least_cost_rdoptoff(process_ctxt_t * ps_proc)326 void ih264e_evaluate_intra16x16_modes_for_least_cost_rdoptoff(process_ctxt_t *ps_proc)
327 {
328     /* Codec Context */
329     codec_t *ps_codec = ps_proc->ps_codec;
330 
331     /* SAD(distortion metric) of an 8x8 block */
332     WORD32 i4_mb_distortion = INT_MAX, i4_mb_distortion_least = INT_MAX;
333 
334     /* lambda */
335     UWORD32 u4_lambda = ps_proc->u4_lambda;
336 
337     /* cost = distortion + lambda*rate */
338     WORD32 i4_mb_cost= INT_MAX, i4_mb_cost_least = INT_MAX;
339 
340     /* intra mode */
341     UWORD32 u4_intra_mode, u4_best_intra_16x16_mode = DC_I16x16;
342 
343     /* neighbor pels for intra prediction */
344     UWORD8 *pu1_ngbr_pels_i16 = ps_proc->au1_ngbr_pels;
345 
346     /* neighbor availability */
347     WORD32 i4_ngbr_avbl;
348 
349     /* pointer to src macro block */
350     UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_luma;
351     UWORD8 *pu1_ref_mb = ps_proc->pu1_rec_buf_luma;
352 
353     /* pointer to prediction macro block */
354     UWORD8 *pu1_pred_mb_intra_16x16 = ps_proc->pu1_pred_mb_intra_16x16;
355     UWORD8 *pu1_pred_mb_intra_16x16_plane = ps_proc->pu1_pred_mb_intra_16x16_plane;
356 
357     /* strides */
358     WORD32 i4_src_strd = ps_proc->i4_src_strd;
359     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
360     WORD32 i4_rec_strd = ps_proc->i4_rec_strd;
361 
362     /* pointer to neighbors left, top, topleft */
363     UWORD8 *pu1_mb_a = pu1_ref_mb - 1;
364     UWORD8 *pu1_mb_b = pu1_ref_mb - i4_rec_strd;
365     UWORD8 *pu1_mb_d = pu1_mb_b - 1;
366     UWORD8 u1_mb_a, u1_mb_b, u1_mb_d;
367     /* valid intra modes map */
368     UWORD32 u4_valid_intra_modes;
369 
370     /* lut for valid intra modes */
371     const UWORD8 u1_valid_intra_modes[8] = {4, 6, 4, 6, 5, 7, 5, 15};
372 
373     /* temp var */
374     UWORD32 i, u4_enable_fast_sad = 0, offset = 0;
375     mb_info_t *ps_top_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
376     UWORD32 u4_constrained_intra_pred = ps_proc->ps_codec->s_cfg.u4_constrained_intra_pred;
377 
378     /* init temp var */
379     if (ps_proc->i4_slice_type != ISLICE)
380     {
381         /* Offset for MBtype */
382         offset = (ps_proc->i4_slice_type == PSLICE) ? 5 : 23;
383         u4_enable_fast_sad = ps_proc->s_me_ctxt.u4_enable_fast_sad;
384     }
385 
386     /* locating neighbors that are available for prediction */
387 
388     /* gather prediction pels from the neighbors, if particular set is not available
389      * it is set to zero*/
390     /* left pels */
391     u1_mb_a = ((ps_proc->ps_ngbr_avbl->u1_mb_a)
392                     && (u4_constrained_intra_pred ? ps_proc->s_left_mb_syntax_ele.u2_is_intra : 1));
393     if (u1_mb_a)
394     {
395         for(i = 0; i < 16; i++)
396             pu1_ngbr_pels_i16[16-1-i] = pu1_mb_a[i * i4_rec_strd];
397     }
398     else
399     {
400         ps_codec->pf_mem_set_mul8(pu1_ngbr_pels_i16,0,MB_SIZE);
401     }
402     /* top pels */
403     u1_mb_b = ((ps_proc->ps_ngbr_avbl->u1_mb_b)
404                     && (u4_constrained_intra_pred ? ps_top_mb_syn_ele->u2_is_intra : 1));
405     if (u1_mb_b)
406     {
407         ps_codec->pf_mem_cpy_mul8(pu1_ngbr_pels_i16+16+1,pu1_mb_b,16);
408     }
409     else
410     {
411         ps_codec->pf_mem_set_mul8(pu1_ngbr_pels_i16+16+1,0,MB_SIZE);
412     }
413     /* topleft pels */
414     u1_mb_d = ((ps_proc->ps_ngbr_avbl->u1_mb_d)
415                     && (u4_constrained_intra_pred ? ps_proc->s_top_left_mb_syntax_ele.u2_is_intra : 1));
416     if (u1_mb_d)
417     {
418         pu1_ngbr_pels_i16[16] = *pu1_mb_d;
419     }
420     else
421     {
422         pu1_ngbr_pels_i16[16] = 0;
423     }
424 
425     i4_ngbr_avbl = (u1_mb_a) + (u1_mb_b << 2) + (u1_mb_d << 1);
426     ps_proc->i4_ngbr_avbl_16x16_mb = i4_ngbr_avbl;
427 
428     /* set valid intra modes for evaluation */
429     u4_valid_intra_modes = u1_valid_intra_modes[i4_ngbr_avbl];
430 
431     if (ps_codec->s_cfg.u4_enc_speed_preset == IVE_FAST ||
432                     ps_codec->s_cfg.u4_enc_speed_preset == IVE_FASTEST)
433         u4_valid_intra_modes &= ~(1 << PLANE_I16x16);
434 
435     /* evaluate b/w HORZ_I16x16, VERT_I16x16 & DC_I16x16 */
436     ps_codec->pf_ih264e_evaluate_intra16x16_modes(pu1_curr_mb, pu1_ngbr_pels_i16, pu1_pred_mb_intra_16x16,
437                                                   i4_src_strd, i4_pred_strd,
438                                                   i4_ngbr_avbl, &u4_intra_mode, &i4_mb_distortion_least,
439                                                   u4_valid_intra_modes);
440 
441     /* cost = distortion + lambda*rate */
442     i4_mb_cost_least = i4_mb_distortion_least;
443 
444     if (((u4_valid_intra_modes >> 3) & 1) != 0)
445     {
446         /* intra prediction for PLANE mode*/
447         (ps_codec->apf_intra_pred_16_l)[PLANE_I16x16](pu1_ngbr_pels_i16, pu1_pred_mb_intra_16x16_plane, 0, i4_pred_strd, i4_ngbr_avbl);
448 
449         /* evaluate distortion between the actual blk and the estimated blk for the given mode */
450         ps_codec->apf_compute_sad_16x16[u4_enable_fast_sad](pu1_curr_mb, pu1_pred_mb_intra_16x16_plane, i4_src_strd, i4_pred_strd, i4_mb_cost_least, &i4_mb_distortion);
451 
452         /* cost = distortion + lambda*rate */
453         i4_mb_cost = i4_mb_distortion;
454 
455         /* update the least cost information if necessary */
456         if(i4_mb_cost < i4_mb_distortion_least)
457         {
458             u4_intra_mode = PLANE_I16x16;
459 
460             i4_mb_cost_least = i4_mb_cost;
461             i4_mb_distortion_least = i4_mb_distortion;
462         }
463     }
464 
465     u4_best_intra_16x16_mode = u4_intra_mode;
466 
467     DEBUG("%d partition cost, %d intra mode\n", i4_mb_cost_least * 32, u4_best_intra_16x16_mode);
468 
469     ps_proc->u1_l_i16_mode = u4_best_intra_16x16_mode;
470 
471     /* cost = distortion + lambda*rate */
472     i4_mb_cost_least    = i4_mb_distortion_least + u4_lambda*u1_uev_codelength[offset + u4_best_intra_16x16_mode];
473 
474 
475     /* update the type of the mb if necessary */
476     if (i4_mb_cost_least < ps_proc->i4_mb_cost)
477     {
478         ps_proc->i4_mb_cost = i4_mb_cost_least;
479         ps_proc->i4_mb_distortion = i4_mb_distortion_least;
480         ps_proc->u4_mb_type = I16x16;
481     }
482 
483     return ;
484 }
485 
486 
487 /**
488 ******************************************************************************
489 *
490 * @brief
491 *  evaluate best intra 8x8 mode (rate distortion opt on)
492 *
493 * @par Description
494 *  This function evaluates all the possible intra 8x8 modes and finds the mode
495 *  that best represents the macro-block (least distortion) and occupies fewer
496 *  bits in the bit-stream.
497 *
498 * @param[in]    ps_proc_ctxt
499 *  pointer to proc ctxt
500 *
501 * @remarks Ideally the cost of encoding a macroblock is calculated as
502 *  (distortion + lambda*rate). Where distortion is SAD/SATD,... between the
503 *  input block and the reconstructed block and rate is the number of bits taken
504 *  to place the macroblock in the bit-stream. In this routine the rate does not
505 *  exactly point to the total number of bits it takes, rather it points to header
506 *  bits necessary for encoding the macroblock. Assuming the deltaQP, cbp bits
507 *  and residual bits fall in to texture bits the number of bits taken to encoding
508 *  mbtype is considered as rate, we compute cost. Further we will approximate
509 *  the distortion as the deviation b/w input and the predicted block as opposed
510 *  to input and reconstructed block.
511 *
512 *  NOTE: TODO: This function needs to be tested
513 *
514 *  @return      none
515 *
516 ******************************************************************************
517 */
ih264e_evaluate_intra8x8_modes_for_least_cost_rdoptoff(process_ctxt_t * ps_proc)518 void ih264e_evaluate_intra8x8_modes_for_least_cost_rdoptoff(process_ctxt_t *ps_proc)
519 {
520     /* Codec Context */
521     codec_t *ps_codec = ps_proc->ps_codec;
522 
523     /* SAD(distortion metric) of an 4x4 block */
524     WORD32 i4_partition_distortion, i4_partition_distortion_least = INT_MAX, i4_total_distortion = 0;
525 
526     /* lambda */
527     UWORD32 u4_lambda = ps_proc->u4_lambda;
528 
529     /* cost = distortion + lambda*rate */
530     WORD32 i4_partition_cost, i4_partition_cost_least, i4_total_cost = u4_lambda;
531 
532     /* cost due to mbtype */
533     UWORD32 u4_cost_one_bit = u4_lambda, u4_cost_four_bits = 4 * u4_lambda;
534 
535     /* intra mode */
536     UWORD32 u4_intra_mode, u4_best_intra_8x8_mode = DC_I8x8, u4_estimated_intra_8x8_mode;
537 
538     /* neighbor pels for intra prediction */
539     UWORD8 *pu1_ngbr_pels_i8 = ps_proc->au1_ngbr_pels;
540 
541     /* pointer to curr partition */
542     UWORD8 *pu1_mb_curr;
543 
544     /* pointer to prediction macro block */
545     UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb;
546 
547     /* strides */
548     WORD32 i4_src_strd = ps_proc->i4_src_strd;
549     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
550 
551     /* neighbors left, top, top right, top left */
552     UWORD8 *pu1_mb_a;
553     UWORD8 *pu1_mb_b;
554     UWORD8 *pu1_mb_d;
555 
556     /* neighbor availability */
557     WORD32 i4_ngbr_avbl;
558     block_neighbors_t s_ngbr_avbl;
559 
560     /* temp vars */
561     UWORD32  b8, u4_pix_x, u4_pix_y;
562     UWORD32 u4_constrained_intra_pred = ps_proc->ps_codec->s_cfg.u4_constrained_intra_pred;
563     block_neighbors_t s_ngbr_avbl_MB;
564 
565     /* ngbr mb syntax information */
566     UWORD8 *pu1_top_mb_intra_modes = ps_proc->pu1_top_mb_intra_modes + (ps_proc->i4_mb_x << 4);
567     mb_info_t *ps_top_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
568     mb_info_t *ps_top_right_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
569     /* valid intra modes map */
570     UWORD32 u4_valid_intra_modes;
571 
572     if (ps_proc->ps_ngbr_avbl->u1_mb_c)
573     {
574         ps_top_right_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + (ps_proc->i4_mb_x + 1);
575     }
576     /* left pels */
577     s_ngbr_avbl_MB.u1_mb_a = ((ps_proc->ps_ngbr_avbl->u1_mb_a)
578                                   && (u4_constrained_intra_pred ? ps_proc->s_left_mb_syntax_ele.u2_is_intra : 1));
579 
580     /* top pels */
581     s_ngbr_avbl_MB.u1_mb_b = ((ps_proc->ps_ngbr_avbl->u1_mb_b)
582                                   && (u4_constrained_intra_pred ? ps_top_mb_syn_ele->u2_is_intra : 1));
583 
584     /* topleft pels */
585     s_ngbr_avbl_MB.u1_mb_d = ((ps_proc->ps_ngbr_avbl->u1_mb_d)
586                                   && (u4_constrained_intra_pred ? ps_proc->s_top_left_mb_syntax_ele.u2_is_intra : 1));
587 
588     /* top right */
589     s_ngbr_avbl_MB.u1_mb_c = ((ps_proc->ps_ngbr_avbl->u1_mb_c)
590                                   && (u4_constrained_intra_pred ? ps_top_right_mb_syn_ele->u2_is_intra : 1));
591 
592 
593     for(b8 = 0; b8 < 4; b8++)
594     {
595         u4_pix_x = (b8 & 0x01) << 3;
596         u4_pix_y = (b8 >> 1) << 3;
597 
598         pu1_mb_curr = ps_proc->pu1_src_buf_luma + u4_pix_x + (u4_pix_y * i4_src_strd);
599         /* when rdopt is off, we use the input as reference for constructing prediction buffer */
600         /* as opposed to using the recon pels. (open loop intra prediction) */
601         pu1_mb_a = pu1_mb_curr - 1; /* pointer to left macro block */
602         pu1_mb_b = pu1_mb_curr - i4_src_strd; /* pointer to top macro block */
603         pu1_mb_d = pu1_mb_b - 1; /* pointer to top left macro block */
604 
605         /* locating neighbors that are available for prediction */
606         /* TODO : update the neighbor availability information basing on constrained intra pred information */
607         /* TODO : i4_ngbr_avbl is only being used in DC mode. Can the DC mode be split in to distinct routines */
608         /* basing on neighbors available and hence evade the computation of neighbor availability totally. */
609         s_ngbr_avbl.u1_mb_a = ih264e_derive_ngbr_avbl_of_mb_partitions(&s_ngbr_avbl_MB, u4_pix_x - 1, u4_pix_y); /* xD = -1, yD = 0 */
610         s_ngbr_avbl.u1_mb_b = ih264e_derive_ngbr_avbl_of_mb_partitions(&s_ngbr_avbl_MB, u4_pix_x, u4_pix_y - 1); /* xD = 0, yD = -1 */
611         s_ngbr_avbl.u1_mb_c = ih264e_derive_ngbr_avbl_of_mb_partitions(&s_ngbr_avbl_MB, u4_pix_x + 8, u4_pix_y - 1); /* xD = BLK_8x8_SIZE, yD = -1 */
612         s_ngbr_avbl.u1_mb_d = ih264e_derive_ngbr_avbl_of_mb_partitions(&s_ngbr_avbl_MB, u4_pix_x - 1, u4_pix_y - 1); /* xD = -1, yD = -1 */
613 
614         /* i4_ngbr_avbl = blk_a * LEFT_MB_AVAILABLE_MASK + blk_b * TOP_MB_AVAILABLE_MASK + blk_c * TOP_RIGHT_MB_AVAILABLE_MASK + blk_d * TOP_LEFT_MB_AVAILABLE_MASK */
615         i4_ngbr_avbl = (s_ngbr_avbl.u1_mb_a) + (s_ngbr_avbl.u1_mb_d << 1) + (s_ngbr_avbl.u1_mb_b << 2) +  (s_ngbr_avbl.u1_mb_c << 3) +
616                         (s_ngbr_avbl.u1_mb_a << 4);
617         /* if top partition is available and top right is not available for intra prediction, then */
618         /* padd top right samples using top sample and make top right also available */
619         /* i4_ngbr_avbl = (s_ngbr_avbl.u1_mb_a) + (s_ngbr_avbl.u1_mb_d << 1) + (s_ngbr_avbl.u1_mb_b << 2) +  ((s_ngbr_avbl.u1_mb_b | s_ngbr_avbl.u1_mb_c) << 3); */
620         ps_proc->ai4_neighbor_avail_8x8_subblks[b8] = i4_ngbr_avbl;
621 
622 
623         ih264_intra_pred_luma_8x8_mode_ref_filtering(pu1_mb_a, pu1_mb_b, pu1_mb_d, pu1_ngbr_pels_i8,
624                                                      i4_src_strd, i4_ngbr_avbl);
625 
626         i4_partition_cost_least = INT_MAX;
627         /* set valid intra modes for evaluation */
628         u4_valid_intra_modes = 0x1ff;
629 
630         if (!s_ngbr_avbl.u1_mb_b)
631         {
632             u4_valid_intra_modes &= ~(1 << VERT_I4x4);
633             u4_valid_intra_modes &= ~(1 << DIAG_DL_I4x4);
634             u4_valid_intra_modes &= ~(1 << VERT_L_I4x4);
635         }
636         if (!s_ngbr_avbl.u1_mb_a)
637         {
638             u4_valid_intra_modes &= ~(1 << HORZ_I4x4);
639             u4_valid_intra_modes &= ~(1 << HORZ_U_I4x4);
640         }
641         if (!s_ngbr_avbl.u1_mb_a || !s_ngbr_avbl.u1_mb_b || !s_ngbr_avbl.u1_mb_d)
642         {
643             u4_valid_intra_modes &= ~(1 << DIAG_DR_I4x4);
644             u4_valid_intra_modes &= ~(1 << VERT_R_I4x4);
645             u4_valid_intra_modes &= ~(1 << HORZ_D_I4x4);
646         }
647 
648         /* estimate the intra 8x8 mode for the current partition (for evaluating cost) */
649         if (!s_ngbr_avbl.u1_mb_a || !s_ngbr_avbl.u1_mb_b)
650         {
651             u4_estimated_intra_8x8_mode = DC_I8x8;
652         }
653         else
654         {
655             UWORD32 u4_left_intra_8x8_mode = DC_I8x8;
656             UWORD32 u4_top_intra_8x8_mode = DC_I8x8;
657 
658             if (u4_pix_x == 0)
659             {
660                 if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I8x8)
661                 {
662                     u4_left_intra_8x8_mode = ps_proc->au1_left_mb_intra_modes[b8+1];
663                 }
664                 else if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I4x4)
665                 {
666                     u4_left_intra_8x8_mode = ps_proc->au1_left_mb_intra_modes[(b8+1)*4+2];
667                 }
668             }
669             else
670             {
671                 u4_left_intra_8x8_mode = ps_proc->au1_intra_luma_mb_8x8_modes[b8-1];
672             }
673 
674             if (u4_pix_y == 0)
675             {
676                 if (ps_top_mb_syn_ele->u2_mb_type == I8x8)
677                 {
678                     u4_top_intra_8x8_mode = pu1_top_mb_intra_modes[b8+2];
679                 }
680                 else if (ps_top_mb_syn_ele->u2_mb_type == I4x4)
681                 {
682                     u4_top_intra_8x8_mode = pu1_top_mb_intra_modes[(b8+2)*4+2];
683                 }
684             }
685             else
686             {
687                 u4_top_intra_8x8_mode = ps_proc->au1_intra_luma_mb_8x8_modes[b8-2];
688             }
689 
690             u4_estimated_intra_8x8_mode = MIN(u4_left_intra_8x8_mode, u4_top_intra_8x8_mode);
691         }
692 
693         /* perform intra mode 8x8 evaluation */
694         for (u4_intra_mode = VERT_I8x8; u4_valid_intra_modes != 0; u4_intra_mode++, u4_valid_intra_modes >>= 1)
695         {
696             if ( (u4_valid_intra_modes & 1) == 0)
697                 continue;
698 
699             /* intra prediction */
700             (ps_codec->apf_intra_pred_8_l)[u4_intra_mode](pu1_ngbr_pels_i8, pu1_pred_mb, 0, i4_pred_strd, i4_ngbr_avbl);
701 
702             /* evaluate distortion between the actual blk and the estimated blk for the given mode */
703             ime_compute_sad_8x8(pu1_mb_curr, pu1_pred_mb, i4_src_strd, i4_pred_strd, i4_partition_cost_least, &i4_partition_distortion);
704 
705             i4_partition_cost = i4_partition_distortion + ((u4_estimated_intra_8x8_mode == u4_intra_mode)?u4_cost_one_bit:u4_cost_four_bits);
706 
707             /* update the least cost information if necessary */
708             if (i4_partition_cost < i4_partition_cost_least)
709             {
710                 i4_partition_cost_least = i4_partition_cost;
711                 i4_partition_distortion_least = i4_partition_distortion;
712                 u4_best_intra_8x8_mode = u4_intra_mode;
713             }
714         }
715         /* macroblock distortion */
716         i4_total_cost += i4_partition_cost_least;
717         i4_total_distortion += i4_partition_distortion_least;
718         /* mb partition mode */
719         ps_proc->au1_intra_luma_mb_8x8_modes[b8] = u4_best_intra_8x8_mode;
720 
721     }
722 
723     /* update the type of the mb if necessary */
724     if (i4_total_cost < ps_proc->i4_mb_cost)
725     {
726         ps_proc->i4_mb_cost = i4_total_cost;
727         ps_proc->i4_mb_distortion = i4_total_distortion;
728         ps_proc->u4_mb_type = I8x8;
729     }
730 
731     return ;
732 }
733 
734 
735 /**
736 ******************************************************************************
737 *
738 * @brief
739 *  evaluate best intra 4x4 mode (rate distortion opt off)
740 *
741 * @par Description
742 *  This function evaluates all the possible intra 4x4 modes and finds the mode
743 *  that best represents the macro-block (least distortion) and occupies fewer
744 *  bits in the bit-stream.
745 *
746 * @param[in]    ps_proc_ctxt
747 *  pointer to proc ctxt
748 *
749 * @remarks
750 *  Ideally the cost of encoding a macroblock is calculated as
751 *  (distortion + lambda*rate). Where distortion is SAD/SATD,... between the
752 *  input block and the reconstructed block and rate is the number of bits taken
753 *  to place the macroblock in the bit-stream. In this routine the rate does not
754 *  exactly point to the total number of bits it takes, rather it points to header
755 *  bits necessary for encoding the macroblock. Assuming the deltaQP, cbp bits
756 *  and residual bits fall in to texture bits the number of bits taken to encoding
757 *  mbtype is considered as rate, we compute cost. Further we will approximate
758 *  the distortion as the deviation b/w input and the predicted block as opposed
759 *  to input and reconstructed block.
760 *
761 *  NOTE: As per the Document JVT-O079, for the whole intra 4x4 macroblock,
762 *  24*lambda is added to the SAD before comparison with the best SAD for
763 *  inter prediction. This is an empirical value to prevent using too many intra
764 *  blocks.
765 *
766 * @return      none
767 *
768 ******************************************************************************
769 */
ih264e_evaluate_intra4x4_modes_for_least_cost_rdoptoff(process_ctxt_t * ps_proc)770 void ih264e_evaluate_intra4x4_modes_for_least_cost_rdoptoff(process_ctxt_t *ps_proc)
771 {
772     /* Codec Context */
773     codec_t *ps_codec = ps_proc->ps_codec;
774 
775     /* SAD(distortion metric) of an 4x4 block */
776     WORD32 i4_partition_distortion_least = INT_MAX, i4_total_distortion = 0;
777 
778     /* lambda */
779     UWORD32 u4_lambda = ps_proc->u4_lambda;
780 
781     /* cost = distortion + lambda*rate */
782     WORD32 i4_partition_cost_least, i4_total_cost = (24 + 1) * u4_lambda;
783 
784     /* cost due to mbtype */
785     UWORD32 u4_cost_one_bit = u4_lambda, u4_cost_four_bits = 4 * u4_lambda;
786 
787     /* intra mode */
788     UWORD32 u4_best_intra_4x4_mode = DC_I4x4, u4_estimated_intra_4x4_mode;
789 
790     /* neighbor pels for intra prediction */
791     UWORD8 *pu1_ngbr_pels_i4 = ps_proc->au1_ngbr_pels;
792 
793     /* pointer to curr partition */
794     UWORD8 *pu1_mb_curr;
795 
796     /* pointer to prediction macro block */
797     UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb;
798 
799     /* strides */
800     WORD32 i4_src_strd = ps_proc->i4_src_strd;
801     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
802 
803     /* neighbors left, top, top right, top left */
804     UWORD8 *pu1_mb_a;
805     UWORD8 *pu1_mb_b;
806     UWORD8 *pu1_mb_c;
807     UWORD8 *pu1_mb_d;
808 
809     /* neighbor availability */
810     WORD32 i4_ngbr_avbl;
811     block_neighbors_t s_ngbr_avbl;
812 
813     /* temp vars */
814     UWORD32 i, b8, b4, u4_blk_x, u4_blk_y, u4_pix_x, u4_pix_y;
815 
816     /* scan order inside 4x4 block */
817     const UWORD8 u1_scan_order[16] = {0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15};
818 
819     /* ngbr sub mb modes */
820     UWORD8 *pu1_top_mb_intra_modes = ps_proc->pu1_top_mb_intra_modes + (ps_proc->i4_mb_x << 4);
821     mb_info_t *ps_top_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
822     mb_info_t *ps_top_right_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
823 
824     /* valid intra modes map */
825     UWORD32 u4_valid_intra_modes;
826     UWORD16 u2_valid_modes[8] = {4, 262, 4, 262, 141, 399, 141, 511};
827 
828     UWORD32 u4_constrained_intra_pred = ps_proc->ps_codec->s_cfg.u4_constrained_intra_pred;
829     UWORD8 u1_mb_a, u1_mb_b, u1_mb_c, u1_mb_d;
830     if (ps_proc->ps_ngbr_avbl->u1_mb_c)
831     {
832         ps_top_right_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x + 1;
833     }
834     /* left pels */
835     u1_mb_a = ((ps_proc->ps_ngbr_avbl->u1_mb_a)
836                     && (u4_constrained_intra_pred ? ps_proc->s_left_mb_syntax_ele.u2_is_intra : 1));
837 
838     /* top pels */
839     u1_mb_b = ((ps_proc->ps_ngbr_avbl->u1_mb_b)
840                     && (u4_constrained_intra_pred ? ps_top_mb_syn_ele->u2_is_intra : 1));
841 
842     /* topleft pels */
843     u1_mb_d = ((ps_proc->ps_ngbr_avbl->u1_mb_d)
844                     && (u4_constrained_intra_pred ? ps_proc->s_top_left_mb_syntax_ele.u2_is_intra : 1));
845 
846     /* top right */
847     u1_mb_c = ((ps_proc->ps_ngbr_avbl->u1_mb_c)
848                     && (u4_constrained_intra_pred ? ps_top_right_mb_syn_ele->u2_is_intra : 1));
849 
850     i4_ngbr_avbl = (u1_mb_a) + (u1_mb_d << 1) + (u1_mb_b << 2) + (u1_mb_c << 3);
851     memcpy(ps_proc->au1_ngbr_avbl_4x4_subblks, gau1_ih264_4x4_ngbr_avbl[i4_ngbr_avbl], 16);
852 
853     for (b8 = 0; b8 < 4; b8++)
854     {
855         u4_blk_x = (b8 & 0x01) << 3;
856         u4_blk_y = (b8 >> 1) << 3;
857         for (b4 = 0; b4 < 4; b4++)
858         {
859             u4_pix_x = u4_blk_x + ((b4 & 0x01) << 2);
860             u4_pix_y = u4_blk_y + ((b4 >> 1) << 2);
861 
862             pu1_mb_curr = ps_proc->pu1_src_buf_luma + u4_pix_x + (u4_pix_y * i4_src_strd);
863             /* when rdopt is off, we use the input as reference for constructing prediction buffer */
864             /* as opposed to using the recon pels. (open loop intra prediction) */
865             pu1_mb_a = pu1_mb_curr - 1; /* pointer to left macro block */
866             pu1_mb_b = pu1_mb_curr - i4_src_strd; /* pointer to top macro block */
867             pu1_mb_c = pu1_mb_b + 4; /* pointer to top macro block */
868             pu1_mb_d = pu1_mb_b - 1; /* pointer to top left macro block */
869 
870             /* locating neighbors that are available for prediction */
871             /* TODO : update the neighbor availability information basing on constrained intra pred information */
872             /* TODO : i4_ngbr_avbl is only being used in DC mode. Can the DC mode be split in to distinct routines */
873             /* basing on neighbors available and hence evade the computation of neighbor availability totally. */
874 
875             i4_ngbr_avbl = ps_proc->au1_ngbr_avbl_4x4_subblks[(b8 << 2) + b4];
876             s_ngbr_avbl.u1_mb_a = (i4_ngbr_avbl & 0x1);
877             s_ngbr_avbl.u1_mb_d = (i4_ngbr_avbl & 0x2) >> 1;
878             s_ngbr_avbl.u1_mb_b = (i4_ngbr_avbl & 0x4) >> 2;
879             s_ngbr_avbl.u1_mb_c = (i4_ngbr_avbl & 0x8) >> 3;
880             /* set valid intra modes for evaluation */
881             u4_valid_intra_modes = u2_valid_modes[i4_ngbr_avbl & 0x7];
882 
883             /* if top partition is available and top right is not available for intra prediction, then */
884             /* padd top right samples using top sample and make top right also available */
885             /* i4_ngbr_avbl = (s_ngbr_avbl.u1_mb_a) + (s_ngbr_avbl.u1_mb_d << 1) + (s_ngbr_avbl.u1_mb_b << 2) + ((s_ngbr_avbl.u1_mb_b | s_ngbr_avbl.u1_mb_c) << 3); */
886 
887             /* gather prediction pels from the neighbors */
888             if (s_ngbr_avbl.u1_mb_a)
889             {
890                 for(i = 0; i < 4; i++)
891                     pu1_ngbr_pels_i4[4 - 1 -i] = pu1_mb_a[i * i4_src_strd];
892             }
893             else
894             {
895                 memset(pu1_ngbr_pels_i4, 0, 4);
896             }
897 
898             if (s_ngbr_avbl.u1_mb_b)
899             {
900                 memcpy(pu1_ngbr_pels_i4 + 4 + 1, pu1_mb_b, 4);
901             }
902             else
903             {
904                 memset(pu1_ngbr_pels_i4 + 5, 0, 4);
905             }
906 
907             if (s_ngbr_avbl.u1_mb_d)
908                 pu1_ngbr_pels_i4[4] = *pu1_mb_d;
909             else
910                 pu1_ngbr_pels_i4[4] = 0;
911 
912             if (s_ngbr_avbl.u1_mb_c)
913             {
914                 memcpy(pu1_ngbr_pels_i4 + 8 + 1, pu1_mb_c, 4);
915             }
916             else if (s_ngbr_avbl.u1_mb_b)
917             {
918                 memset(pu1_ngbr_pels_i4 + 8 + 1, pu1_ngbr_pels_i4[8], 4);
919                 s_ngbr_avbl.u1_mb_c = s_ngbr_avbl.u1_mb_b;
920             }
921 
922             i4_partition_cost_least = INT_MAX;
923 
924             /* predict the intra 4x4 mode for the current partition (for evaluating cost) */
925             if (!s_ngbr_avbl.u1_mb_a || !s_ngbr_avbl.u1_mb_b)
926             {
927                 u4_estimated_intra_4x4_mode = DC_I4x4;
928             }
929             else
930             {
931                 UWORD32 u4_left_intra_4x4_mode = DC_I4x4;
932                 UWORD32 u4_top_intra_4x4_mode = DC_I4x4;
933 
934                 if (u4_pix_x == 0)
935                 {
936                     if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I4x4)
937                     {
938                         u4_left_intra_4x4_mode = ps_proc->au1_left_mb_intra_modes[u1_scan_order[3 + u4_pix_y]];
939                     }
940                     else if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I8x8)
941                     {
942                         u4_left_intra_4x4_mode = ps_proc->au1_left_mb_intra_modes[b8 + 1];
943                     }
944                 }
945                 else
946                 {
947                     u4_left_intra_4x4_mode = ps_proc->au1_intra_luma_mb_4x4_modes[u1_scan_order[(u4_pix_x >> 2) + u4_pix_y - 1]];
948                 }
949 
950                 if (u4_pix_y == 0)
951                 {
952                     if (ps_top_mb_syn_ele->u2_mb_type == I4x4)
953                     {
954                         u4_top_intra_4x4_mode = pu1_top_mb_intra_modes[u1_scan_order[12 + (u4_pix_x >> 2)]];
955                     }
956                     else if (ps_top_mb_syn_ele->u2_mb_type == I8x8)
957                     {
958                         u4_top_intra_4x4_mode = pu1_top_mb_intra_modes[b8 + 2];
959                     }
960                 }
961                 else
962                 {
963                     u4_top_intra_4x4_mode = ps_proc->au1_intra_luma_mb_4x4_modes[u1_scan_order[(u4_pix_x >> 2) + u4_pix_y - 4]];
964                 }
965 
966                 u4_estimated_intra_4x4_mode = MIN(u4_left_intra_4x4_mode, u4_top_intra_4x4_mode);
967             }
968 
969             ps_proc->au1_predicted_intra_luma_mb_4x4_modes[(b8 << 2) + b4] = u4_estimated_intra_4x4_mode;
970 
971             /* mode evaluation and prediction */
972             ps_codec->pf_ih264e_evaluate_intra_4x4_modes(pu1_mb_curr,
973                                                          pu1_ngbr_pels_i4,
974                                                          pu1_pred_mb, i4_src_strd,
975                                                          i4_pred_strd, i4_ngbr_avbl,
976                                                          &u4_best_intra_4x4_mode,
977                                                          &i4_partition_cost_least,
978                                                          u4_valid_intra_modes,
979                                                          u4_lambda,
980                                                          u4_estimated_intra_4x4_mode);
981 
982 
983             i4_partition_distortion_least = i4_partition_cost_least - ((u4_estimated_intra_4x4_mode == u4_best_intra_4x4_mode) ? u4_cost_one_bit : u4_cost_four_bits);
984 
985             DEBUG("%d partition cost, %d intra mode\n", i4_partition_cost_least, u4_best_intra_4x4_mode);
986             /* macroblock distortion */
987             i4_total_distortion += i4_partition_distortion_least;
988             i4_total_cost += i4_partition_cost_least;
989             /* mb partition mode */
990             ps_proc->au1_intra_luma_mb_4x4_modes[(b8 << 2) + b4] = u4_best_intra_4x4_mode;
991         }
992     }
993 
994     /* update the type of the mb if necessary */
995     if (i4_total_cost < ps_proc->i4_mb_cost)
996     {
997         ps_proc->i4_mb_cost = i4_total_cost;
998         ps_proc->i4_mb_distortion = i4_total_distortion;
999         ps_proc->u4_mb_type = I4x4;
1000     }
1001 
1002     return ;
1003 }
1004 
1005 /**
1006 ******************************************************************************
1007 *
1008 * @brief evaluate best intra 4x4 mode (rate distortion opt on)
1009 *
1010 * @par Description
1011 *  This function evaluates all the possible intra 4x4 modes and finds the mode
1012 *  that best represents the macro-block (least distortion) and occupies fewer
1013 *  bits in the bit-stream.
1014 *
1015 * @param[in]    ps_proc_ctxt
1016 *  pointer to proc ctxt
1017 *
1018 * @remarks
1019 *  Ideally the cost of encoding a macroblock is calculated as
1020 *  (distortion + lambda*rate). Where distortion is SAD/SATD,... between the
1021 *  input block and the reconstructed block and rate is the number of bits taken
1022 *  to place the macroblock in the bit-stream. In this routine the rate does not
1023 *  exactly point to the total number of bits it takes, rather it points to header
1024 *  bits necessary for encoding the macroblock. Assuming the deltaQP, cbp bits
1025 *  and residual bits fall in to texture bits the number of bits taken to encoding
1026 *  mbtype is considered as rate, we compute cost. Further we will approximate
1027 *  the distortion as the deviation b/w input and the predicted block as opposed
1028 *  to input and reconstructed block.
1029 *
1030 *  NOTE: As per the Document JVT-O079, for the whole intra 4x4 macroblock,
1031 *  24*lambda is added to the SAD before comparison with the best SAD for
1032 *  inter prediction. This is an empirical value to prevent using too many intra
1033 *  blocks.
1034 *
1035 * @return      none
1036 *
1037 ******************************************************************************
1038 */
ih264e_evaluate_intra4x4_modes_for_least_cost_rdopton(process_ctxt_t * ps_proc)1039 void ih264e_evaluate_intra4x4_modes_for_least_cost_rdopton(process_ctxt_t *ps_proc)
1040 {
1041     /* Codec Context */
1042     codec_t *ps_codec = ps_proc->ps_codec;
1043 
1044     /* SAD(distortion metric) of an 4x4 block */
1045     WORD32 i4_partition_distortion_least = INT_MAX, i4_total_distortion = 0;
1046 
1047     /* lambda */
1048     UWORD32 u4_lambda = ps_proc->u4_lambda;
1049 
1050     /* cost = distortion + lambda*rate */
1051     WORD32 i4_partition_cost_least, i4_total_cost = (24 + 1) * u4_lambda;
1052 
1053     /* cost due to mbtype */
1054     UWORD32 u4_cost_one_bit = u4_lambda, u4_cost_four_bits = 4 * u4_lambda;
1055 
1056     /* intra mode */
1057     UWORD32 u4_best_intra_4x4_mode = DC_I4x4, u4_estimated_intra_4x4_mode;
1058 
1059     /* neighbor pels for intra prediction */
1060     UWORD8 *pu1_ngbr_pels_i4 = ps_proc->au1_ngbr_pels;
1061 
1062     /* pointer to curr partition */
1063     UWORD8 *pu1_mb_curr;
1064     UWORD8 *pu1_mb_ref_left, *pu1_mb_ref_top;
1065     UWORD8 *pu1_ref_mb_intra_4x4;
1066 
1067     /* pointer to residual macro block */
1068     WORD16 *pi2_res_mb = ps_proc->pi2_res_buf_intra_4x4;
1069 
1070     /* pointer to prediction macro block */
1071     UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb;
1072 
1073     /* strides */
1074     WORD32 i4_src_strd = ps_proc->i4_src_strd;
1075     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
1076     WORD32 i4_ref_strd_left, i4_ref_strd_top;
1077 
1078     /* neighbors left, top, top right, top left */
1079     UWORD8 *pu1_mb_a;
1080     UWORD8 *pu1_mb_b;
1081     UWORD8 *pu1_mb_c;
1082     UWORD8 *pu1_mb_d;
1083 
1084     /* number of non zero coeffs*/
1085     UWORD8  *pu1_nnz = (UWORD8 *)ps_proc->au4_nnz_intra_4x4;
1086 
1087     /* quantization parameters */
1088     quant_params_t *ps_qp_params = ps_proc->ps_qp_params[0];
1089 
1090     /* neighbor availability */
1091     WORD32 i4_ngbr_avbl;
1092     block_neighbors_t s_ngbr_avbl;
1093 
1094     /* temp vars */
1095     UWORD32 i, b8, b4, u4_blk_x, u4_blk_y, u4_pix_x, u4_pix_y;
1096 
1097     /* scan order inside 4x4 block */
1098     const UWORD8 u1_scan_order[16] = {0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15};
1099 
1100     /* ngbr sub mb modes */
1101     UWORD8 *pu1_top_mb_intra_modes = ps_proc->pu1_top_mb_intra_modes + (ps_proc->i4_mb_x << 4);
1102     mb_info_t *ps_top_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
1103     mb_info_t *ps_top_right_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
1104 
1105     /* valid intra modes map */
1106     UWORD32 u4_valid_intra_modes;
1107     UWORD16 u2_valid_modes[8] = {4, 262, 4, 262, 141, 399, 141, 511};
1108 
1109     /* Dummy variable for 4x4 trans function */
1110     WORD16 i2_dc_dummy;
1111     UWORD8 u1_mb_a, u1_mb_b, u1_mb_c, u1_mb_d;
1112     UWORD32 u4_constrained_intra_pred = ps_proc->ps_codec->s_cfg.u4_constrained_intra_pred;
1113 
1114     /* compute ngbr availability for sub blks */
1115     if (ps_proc->ps_ngbr_avbl->u1_mb_c)
1116     {
1117         ps_top_right_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + (ps_proc->i4_mb_x + 1);
1118     }
1119 
1120     /* left pels */
1121     u1_mb_a = ((ps_proc->ps_ngbr_avbl->u1_mb_a)
1122                     && (u4_constrained_intra_pred ? ps_proc->s_left_mb_syntax_ele.u2_is_intra : 1));
1123 
1124        /* top pels */
1125     u1_mb_b = ((ps_proc->ps_ngbr_avbl->u1_mb_b)
1126                     && (u4_constrained_intra_pred ? ps_top_mb_syn_ele->u2_is_intra : 1));
1127 
1128        /* topleft pels */
1129     u1_mb_d = ((ps_proc->ps_ngbr_avbl->u1_mb_d)
1130                     && (u4_constrained_intra_pred ? ps_proc->s_top_left_mb_syntax_ele.u2_is_intra : 1));
1131 
1132        /* top right pels */
1133     u1_mb_c = ((ps_proc->ps_ngbr_avbl->u1_mb_c)
1134                     && (u4_constrained_intra_pred ? ps_top_right_mb_syn_ele->u2_is_intra : 1));
1135 
1136     i4_ngbr_avbl = (u1_mb_a) + (u1_mb_d << 1) + (u1_mb_b << 2) + (u1_mb_c << 3);
1137     memcpy(ps_proc->au1_ngbr_avbl_4x4_subblks, gau1_ih264_4x4_ngbr_avbl[i4_ngbr_avbl], 16);
1138 
1139     for(b8 = 0; b8 < 4; b8++)
1140     {
1141         u4_blk_x = (b8 & 0x01) << 3;
1142         u4_blk_y = (b8 >> 1) << 3;
1143         for(b4 = 0; b4 < 4; b4++, pu1_nnz++, pi2_res_mb += MB_SIZE)
1144         {
1145             u4_pix_x = u4_blk_x + ((b4 & 0x01) << 2);
1146             u4_pix_y = u4_blk_y + ((b4 >> 1) << 2);
1147 
1148             pu1_ref_mb_intra_4x4 = ps_proc->pu1_ref_mb_intra_4x4 + u4_pix_x + (u4_pix_y * i4_pred_strd);
1149             pu1_mb_curr = ps_proc->pu1_src_buf_luma + u4_pix_x + (u4_pix_y * i4_src_strd);
1150             if (u4_pix_x == 0)
1151             {
1152                 i4_ref_strd_left = ps_proc->i4_rec_strd;
1153                 pu1_mb_ref_left = ps_proc->pu1_rec_buf_luma + u4_pix_x + (u4_pix_y * i4_ref_strd_left);
1154             }
1155             else
1156             {
1157                 i4_ref_strd_left = i4_pred_strd;
1158                 pu1_mb_ref_left = pu1_ref_mb_intra_4x4;
1159             }
1160             if (u4_pix_y == 0)
1161             {
1162                 i4_ref_strd_top = ps_proc->i4_rec_strd;
1163                 pu1_mb_ref_top = ps_proc->pu1_rec_buf_luma + u4_pix_x + (u4_pix_y * i4_ref_strd_top);
1164             }
1165             else
1166             {
1167                 i4_ref_strd_top = i4_pred_strd;
1168                 pu1_mb_ref_top = pu1_ref_mb_intra_4x4;
1169             }
1170 
1171             pu1_mb_a = pu1_mb_ref_left - 1; /* pointer to left macro block */
1172             pu1_mb_b = pu1_mb_ref_top - i4_ref_strd_top; /* pointer to top macro block */
1173             pu1_mb_c = pu1_mb_b + 4; /* pointer to top right macro block */
1174             if (u4_pix_y == 0)
1175                 pu1_mb_d = pu1_mb_b - 1;
1176             else
1177                 pu1_mb_d = pu1_mb_a - i4_ref_strd_left; /* pointer to top left macro block */
1178 
1179             /* locating neighbors that are available for prediction */
1180             /* TODO : update the neighbor availability information basing on constrained intra pred information */
1181             /* TODO : i4_ngbr_avbl is only being used in DC mode. Can the DC mode be split in to distinct routines */
1182             /* basing on neighbors available and hence evade the computation of neighbor availability totally. */
1183 
1184             i4_ngbr_avbl = ps_proc->au1_ngbr_avbl_4x4_subblks[(b8 << 2) + b4];
1185             s_ngbr_avbl.u1_mb_a = (i4_ngbr_avbl & 0x1);
1186             s_ngbr_avbl.u1_mb_d = (i4_ngbr_avbl & 0x2) >> 1;
1187             s_ngbr_avbl.u1_mb_b = (i4_ngbr_avbl & 0x4) >> 2;
1188             s_ngbr_avbl.u1_mb_c = (i4_ngbr_avbl & 0x8) >> 3;
1189             /* set valid intra modes for evaluation */
1190             u4_valid_intra_modes = u2_valid_modes[i4_ngbr_avbl & 0x7];
1191 
1192             /* if top partition is available and top right is not available for intra prediction, then */
1193             /* padd top right samples using top sample and make top right also available */
1194             /* i4_ngbr_avbl = (s_ngbr_avbl.u1_mb_a) + (s_ngbr_avbl.u1_mb_d << 1) + (s_ngbr_avbl.u1_mb_b << 2) + ((s_ngbr_avbl.u1_mb_b | s_ngbr_avbl.u1_mb_c) << 3); */
1195 
1196             /* gather prediction pels from the neighbors */
1197             if (s_ngbr_avbl.u1_mb_a)
1198             {
1199                 for(i = 0; i < 4; i++)
1200                     pu1_ngbr_pels_i4[4 - 1 -i] = pu1_mb_a[i * i4_ref_strd_left];
1201             }
1202             else
1203             {
1204                 memset(pu1_ngbr_pels_i4,0,4);
1205             }
1206             if(s_ngbr_avbl.u1_mb_b)
1207             {
1208                 memcpy(pu1_ngbr_pels_i4 + 4 + 1, pu1_mb_b, 4);
1209             }
1210             else
1211             {
1212                 memset(pu1_ngbr_pels_i4 + 4 + 1, 0, 4);
1213             }
1214             if (s_ngbr_avbl.u1_mb_d)
1215                 pu1_ngbr_pels_i4[4] = *pu1_mb_d;
1216             else
1217                 pu1_ngbr_pels_i4[4] = 0;
1218             if (s_ngbr_avbl.u1_mb_c)
1219             {
1220                 memcpy(pu1_ngbr_pels_i4 + 8 + 1, pu1_mb_c, 4);
1221             }
1222             else if (s_ngbr_avbl.u1_mb_b)
1223             {
1224                 memset(pu1_ngbr_pels_i4 + 8 + 1, pu1_ngbr_pels_i4[8], 4);
1225                 s_ngbr_avbl.u1_mb_c = s_ngbr_avbl.u1_mb_b;
1226             }
1227 
1228             i4_partition_cost_least = INT_MAX;
1229 
1230             /* predict the intra 4x4 mode for the current partition (for evaluating cost) */
1231             if (!s_ngbr_avbl.u1_mb_a || !s_ngbr_avbl.u1_mb_b)
1232             {
1233                 u4_estimated_intra_4x4_mode = DC_I4x4;
1234             }
1235             else
1236             {
1237                 UWORD32 u4_left_intra_4x4_mode = DC_I4x4;
1238                 UWORD32 u4_top_intra_4x4_mode = DC_I4x4;
1239 
1240                 if (u4_pix_x == 0)
1241                 {
1242                     if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I4x4)
1243                     {
1244                         u4_left_intra_4x4_mode = ps_proc->au1_left_mb_intra_modes[u1_scan_order[3 + u4_pix_y]];
1245                     }
1246                     else if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I8x8)
1247                     {
1248                         u4_left_intra_4x4_mode = ps_proc->au1_left_mb_intra_modes[b8 + 1];
1249                     }
1250                 }
1251                 else
1252                 {
1253                     u4_left_intra_4x4_mode = ps_proc->au1_intra_luma_mb_4x4_modes[u1_scan_order[(u4_pix_x >> 2) + u4_pix_y - 1]];
1254                 }
1255 
1256                 if (u4_pix_y == 0)
1257                 {
1258                     if (ps_top_mb_syn_ele->u2_mb_type == I4x4)
1259                     {
1260                         u4_top_intra_4x4_mode = pu1_top_mb_intra_modes[u1_scan_order[12 + (u4_pix_x >> 2)]];
1261                     }
1262                     else if (ps_top_mb_syn_ele->u2_mb_type == I8x8)
1263                     {
1264                         u4_top_intra_4x4_mode = pu1_top_mb_intra_modes[b8 + 2];
1265                     }
1266                 }
1267                 else
1268                 {
1269                     u4_top_intra_4x4_mode = ps_proc->au1_intra_luma_mb_4x4_modes[u1_scan_order[(u4_pix_x >> 2) + u4_pix_y - 4]];
1270                 }
1271 
1272                 u4_estimated_intra_4x4_mode = MIN(u4_left_intra_4x4_mode, u4_top_intra_4x4_mode);
1273             }
1274 
1275             ps_proc->au1_predicted_intra_luma_mb_4x4_modes[(b8 << 2) + b4] = u4_estimated_intra_4x4_mode;
1276 
1277             /*mode evaluation and prediction*/
1278             ps_codec->pf_ih264e_evaluate_intra_4x4_modes(pu1_mb_curr,
1279                                                          pu1_ngbr_pels_i4,
1280                                                          pu1_pred_mb, i4_src_strd,
1281                                                          i4_pred_strd, i4_ngbr_avbl,
1282                                                          &u4_best_intra_4x4_mode,
1283                                                          &i4_partition_cost_least,
1284                                                          u4_valid_intra_modes,
1285                                                          u4_lambda,
1286                                                          u4_estimated_intra_4x4_mode);
1287 
1288 
1289             i4_partition_distortion_least = i4_partition_cost_least - ((u4_estimated_intra_4x4_mode == u4_best_intra_4x4_mode)?u4_cost_one_bit:u4_cost_four_bits);
1290 
1291             DEBUG("%d partition cost, %d intra mode\n", i4_partition_cost_least, u4_best_intra_4x4_mode);
1292 
1293             /* macroblock distortion */
1294             i4_total_distortion += i4_partition_distortion_least;
1295             i4_total_cost += i4_partition_cost_least;
1296 
1297             /* mb partition mode */
1298             ps_proc->au1_intra_luma_mb_4x4_modes[(b8 << 2) + b4] = u4_best_intra_4x4_mode;
1299 
1300 
1301             /********************************************************/
1302             /*  error estimation,                                   */
1303             /*  transform                                           */
1304             /*  quantization                                        */
1305             /********************************************************/
1306             ps_codec->pf_resi_trans_quant_4x4(pu1_mb_curr, pu1_pred_mb,
1307                                               pi2_res_mb, i4_src_strd,
1308                                               i4_pred_strd,
1309                                               /* No op stride, this implies a buff of lenght 1x16 */
1310                                               ps_qp_params->pu2_scale_mat,
1311                                               ps_qp_params->pu2_thres_mat,
1312                                               ps_qp_params->u1_qbits,
1313                                               ps_qp_params->u4_dead_zone,
1314                                               pu1_nnz, &i2_dc_dummy);
1315 
1316             /********************************************************/
1317             /*  ierror estimation,                                  */
1318             /*  itransform                                          */
1319             /*  iquantization                                       */
1320             /********************************************************/
1321             ps_codec->pf_iquant_itrans_recon_4x4(pi2_res_mb, pu1_pred_mb,
1322                                                  pu1_ref_mb_intra_4x4,
1323                                                  i4_pred_strd, i4_pred_strd,
1324                                                  ps_qp_params->pu2_iscale_mat,
1325                                                  ps_qp_params->pu2_weigh_mat,
1326                                                  ps_qp_params->u1_qp_div,
1327                                                  ps_proc->pv_scratch_buff, 0,
1328                                                  NULL);
1329         }
1330     }
1331 
1332     /* update the type of the mb if necessary */
1333     if (i4_total_cost < ps_proc->i4_mb_cost)
1334     {
1335         ps_proc->i4_mb_cost = i4_total_cost;
1336         ps_proc->i4_mb_distortion = i4_total_distortion;
1337         ps_proc->u4_mb_type = I4x4;
1338     }
1339 
1340     return ;
1341 }
1342 
1343 /**
1344 ******************************************************************************
1345 *
1346 * @brief
1347 *  evaluate best chroma intra 8x8 mode (rate distortion opt off)
1348 *
1349 * @par Description
1350 *  This function evaluates all the possible chroma intra 8x8 modes and finds
1351 *  the mode that best represents the macroblock (least distortion) and occupies
1352 *  fewer bits in the bitstream.
1353 *
1354 * @param[in] ps_proc_ctxt
1355 *  pointer to macroblock context (handle)
1356 *
1357 * @remarks
1358 *  For chroma best intra pred mode is calculated based only on SAD
1359 *
1360 * @returns none
1361 *
1362 ******************************************************************************
1363 */
1364 
ih264e_evaluate_chroma_intra8x8_modes_for_least_cost_rdoptoff(process_ctxt_t * ps_proc)1365 void ih264e_evaluate_chroma_intra8x8_modes_for_least_cost_rdoptoff(process_ctxt_t *ps_proc)
1366 {
1367     /* Codec Context */
1368     codec_t *ps_codec = ps_proc->ps_codec;
1369 
1370     /* SAD(distortion metric) of an 8x8 block */
1371     WORD32 i4_mb_distortion, i4_chroma_mb_distortion;
1372 
1373     /* intra mode */
1374     UWORD32  u4_best_chroma_intra_8x8_mode = DC_CH_I8x8;
1375 
1376     /* neighbor pels for intra prediction */
1377     UWORD8 *pu1_ngbr_pels_c_i8x8 = ps_proc->au1_ngbr_pels;
1378 
1379     /* pointer to curr macro block */
1380     UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_chroma;
1381     UWORD8 *pu1_ref_mb = ps_proc->pu1_rec_buf_chroma;
1382 
1383     /* pointer to prediction macro block */
1384     UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb_intra_chroma;
1385     UWORD8 *pu1_pred_mb_plane = ps_proc->pu1_pred_mb_intra_chroma_plane;
1386 
1387     /* strides */
1388     WORD32 i4_src_strd_c = ps_proc->i4_src_chroma_strd;
1389     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
1390     WORD32 i4_rec_strd_c = ps_proc->i4_rec_strd;
1391 
1392     /* neighbors left, top, top left */
1393     UWORD8 *pu1_mb_a = pu1_ref_mb - 2;
1394     UWORD8 *pu1_mb_b = pu1_ref_mb - i4_rec_strd_c;
1395     UWORD8 *pu1_mb_d = pu1_mb_b - 2;
1396 
1397     /* neighbor availability */
1398     const UWORD8  u1_valid_intra_modes[8] = {1, 3, 1, 3, 5, 7, 5, 15};
1399     WORD32 i4_ngbr_avbl;
1400 
1401     /* valid intra modes map */
1402     UWORD32 u4_valid_intra_modes;
1403     mb_info_t *ps_top_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
1404 
1405     /* temp var */
1406     UWORD8 i;
1407     UWORD32 u4_constrained_intra_pred = ps_proc->ps_codec->s_cfg.u4_constrained_intra_pred;
1408     UWORD8 u1_mb_a, u1_mb_b, u1_mb_d;
1409     /* locating neighbors that are available for prediction */
1410 
1411     /* gather prediction pels from the neighbors */
1412     /* left pels */
1413     u1_mb_a = ((ps_proc->ps_ngbr_avbl->u1_mb_a)
1414                     && (u4_constrained_intra_pred ?  ps_proc->s_left_mb_syntax_ele.u2_is_intra : 1));
1415     if (u1_mb_a)
1416     {
1417         for (i = 0; i < 16; i += 2)
1418         {
1419             pu1_ngbr_pels_c_i8x8[16 - 2 - i] = pu1_mb_a[(i / 2) * i4_rec_strd_c];
1420             pu1_ngbr_pels_c_i8x8[16 - 1 - i] = pu1_mb_a[(i / 2) * i4_rec_strd_c + 1];
1421         }
1422     }
1423     else
1424     {
1425         ps_codec->pf_mem_set_mul8(pu1_ngbr_pels_c_i8x8, 0, MB_SIZE);
1426     }
1427 
1428     /* top pels */
1429     u1_mb_b = ((ps_proc->ps_ngbr_avbl->u1_mb_b)
1430                     && (u4_constrained_intra_pred ? ps_top_mb_syn_ele->u2_is_intra : 1));
1431     if (u1_mb_b)
1432     {
1433         ps_codec->pf_mem_cpy_mul8(&pu1_ngbr_pels_c_i8x8[18], pu1_mb_b, 16);
1434     }
1435     else
1436     {
1437         ps_codec->pf_mem_set_mul8((pu1_ngbr_pels_c_i8x8 + 18), 0, MB_SIZE);
1438     }
1439 
1440     /* top left pels */
1441     u1_mb_d = ((ps_proc->ps_ngbr_avbl->u1_mb_d)
1442                     && (u4_constrained_intra_pred ? ps_proc->s_top_left_mb_syntax_ele.u2_is_intra : 1));
1443     if (u1_mb_d)
1444     {
1445         pu1_ngbr_pels_c_i8x8[16] = *pu1_mb_d;
1446         pu1_ngbr_pels_c_i8x8[17] = *(pu1_mb_d + 1);
1447     }
1448     i4_ngbr_avbl = (u1_mb_a) + (u1_mb_b << 2) + (u1_mb_d << 1);
1449     ps_proc->i4_chroma_neighbor_avail_8x8_mb = i4_ngbr_avbl;
1450 
1451     u4_valid_intra_modes = u1_valid_intra_modes[i4_ngbr_avbl];
1452 
1453     if (ps_codec->s_cfg.u4_enc_speed_preset == IVE_FAST ||
1454                     ps_codec->s_cfg.u4_enc_speed_preset == IVE_FASTEST)
1455         u4_valid_intra_modes &= ~(1 << PLANE_CH_I8x8);
1456 
1457     i4_chroma_mb_distortion = INT_MAX;
1458 
1459     /* perform intra mode chroma  8x8 evaluation */
1460     /* intra prediction */
1461     ps_codec->pf_ih264e_evaluate_intra_chroma_modes(pu1_curr_mb,
1462                                                     pu1_ngbr_pels_c_i8x8,
1463                                                     pu1_pred_mb,
1464                                                     i4_src_strd_c,
1465                                                     i4_pred_strd,
1466                                                     i4_ngbr_avbl,
1467                                                     &u4_best_chroma_intra_8x8_mode,
1468                                                     &i4_chroma_mb_distortion,
1469                                                     u4_valid_intra_modes);
1470 
1471     if (u4_valid_intra_modes & 8)/* if Chroma PLANE is valid*/
1472     {
1473         (ps_codec->apf_intra_pred_c)[PLANE_CH_I8x8](pu1_ngbr_pels_c_i8x8, pu1_pred_mb_plane, 0, i4_pred_strd, i4_ngbr_avbl);
1474 
1475         /* evaluate distortion(sad) */
1476         ps_codec->pf_compute_sad_16x8(pu1_curr_mb, pu1_pred_mb_plane, i4_src_strd_c, i4_pred_strd, i4_chroma_mb_distortion, &i4_mb_distortion);
1477 
1478         /* update the least distortion information if necessary */
1479         if(i4_mb_distortion < i4_chroma_mb_distortion)
1480         {
1481             i4_chroma_mb_distortion = i4_mb_distortion;
1482             u4_best_chroma_intra_8x8_mode = PLANE_CH_I8x8;
1483         }
1484     }
1485 
1486     DEBUG("%d partition cost, %d intra mode\n", i4_chroma_mb_distortion, u4_best_chroma_intra_8x8_mode);
1487 
1488     ps_proc->u1_c_i8_mode = u4_best_chroma_intra_8x8_mode;
1489 
1490     return ;
1491 }
1492 
1493 
1494 /**
1495 ******************************************************************************
1496 *
1497 * @brief
1498 *  Evaluate best intra 16x16 mode (among VERT, HORZ and DC) and do the
1499 *  prediction.
1500 *
1501 * @par Description
1502 *  This function evaluates first three 16x16 modes and compute corresponding sad
1503 *  and return the buffer predicted with best mode.
1504 *
1505 * @param[in] pu1_src
1506 *  UWORD8 pointer to the source
1507 *
1508 * @param[in] pu1_ngbr_pels_i16
1509 *  UWORD8 pointer to neighbouring pels
1510 *
1511 * @param[out] pu1_dst
1512 *  UWORD8 pointer to the destination
1513 *
1514 * @param[in] src_strd
1515 *  integer source stride
1516 *
1517 * @param[in] dst_strd
1518 *  integer destination stride
1519 *
1520 * @param[in] u4_n_avblty
1521 *  availability of neighbouring pixels
1522 *
* @param[out] u4_intra_mode
*  Pointer to the variable in which best mode is returned
*
* @param[in,out] pu4_sadmin
*  Pointer to the SAD to beat; it is overwritten with the minimum sad (and
*  the best mode and prediction are written) only when a lower sad is found
1528 *
1529 * @param[in] u4_valid_intra_modes
1530 *  Says what all modes are valid
1531 *
1532 * @returns      none
1533 *
1534 ******************************************************************************
1535 */
void ih264e_evaluate_intra16x16_modes(UWORD8 *pu1_src,
                                      UWORD8 *pu1_ngbr_pels_i16,
                                      UWORD8 *pu1_dst,
                                      UWORD32 src_strd,
                                      UWORD32 dst_strd,
                                      WORD32 u4_n_avblty,
                                      UWORD32 *u4_intra_mode,
                                      WORD32 *pu4_sadmin,
                                      UWORD32 u4_valid_intra_modes)
{
    /* neighbour buffer as it is read here: the left pel of row r sits at
     * index (15 - r), the top pel of column c at index (17 + c) */
    UWORD8 *pu1_top_pels = pu1_ngbr_pels_i16 + 17;

    /* walking pointers over the source rows and the destination rows */
    UWORD8 *pu1_row;
    UWORD8 *pu1_out = pu1_dst;

    /* neighbour availability; both flags also feed the DC shift below */
    UWORD8 u1_left_avbl = (u4_n_avblty & LEFT_MB_AVAILABLE_MASK);
    UWORD8 u1_top_avbl = (u4_n_avblty & TOP_MB_AVAILABLE_MASK) >> 2;

    /* DC predictor and per-mode distortions */
    WORD32 i4_dc_pred = 0;
    WORD32 i4_horz_sad = INT_MAX;
    WORD32 i4_vert_sad = INT_MAX;
    WORD32 i4_dc_sad = 0;
    WORD32 i4_best_sad;

    WORD32 row, col;

    /* HORZ: every row is predicted from its left pel. The same pass
     * gathers the left contribution to the DC sum */
    if (u1_left_avbl)
    {
        i4_horz_sad = 0;
        pu1_row = pu1_src;

        for (row = 0; row < 16; row++, pu1_row += src_strd)
        {
            UWORD8 u1_left_pel = pu1_ngbr_pels_i16[15 - row];

            i4_dc_pred += u1_left_pel;

            for (col = 0; col < 16; col++)
            {
                i4_horz_sad += ABS(u1_left_pel - pu1_row[col]);
            }
        }

        /* rounding term for the left half of the DC sum */
        i4_dc_pred += 8;
    }

    /* VERT: every row is predicted from the top row. The outer index
     * doubles as the top pel index for the DC sum */
    if (u1_top_avbl)
    {
        i4_vert_sad = 0;
        pu1_row = pu1_src;

        for (row = 0; row < 16; row++, pu1_row += src_strd)
        {
            i4_dc_pred += pu1_top_pels[row];

            for (col = 0; col < 16; col++)
            {
                i4_vert_sad += ABS(pu1_top_pels[col] - pu1_row[col]);
            }
        }

        /* rounding term for the top half of the DC sum */
        i4_dc_pred += 8;
    }

    /* normalise the DC sum; with no neighbour at all the sum is still zero
     * and the predictor becomes 128 */
    i4_dc_pred >>= (3 + u1_left_avbl + u1_top_avbl);
    if ((0 == u1_left_avbl) && (0 == u1_top_avbl))
    {
        i4_dc_pred += 128;
    }

    /* DC: the distortion is always measured, it is masked out afterwards */
    pu1_row = pu1_src;
    for (row = 0; row < 16; row++, pu1_row += src_strd)
    {
        for (col = 0; col < 16; col++)
        {
            i4_dc_sad += ABS(i4_dc_pred - pu1_row[col]);
        }
    }

    /* drop the modes the caller did not enable (bit 0 VERT, bit 1 HORZ,
     * bit 2 DC) */
    if (!(u4_valid_intra_modes & 0x1))
    {
        i4_vert_sad = INT_MAX;
    }
    if (!(u4_valid_intra_modes & 0x2))
    {
        i4_horz_sad = INT_MAX;
    }
    if (!(u4_valid_intra_modes & 0x4))
    {
        i4_dc_sad = INT_MAX;
    }

    i4_best_sad = MIN3(i4_horz_sad, i4_dc_sad, i4_vert_sad);

    /* nothing is reported or predicted unless the caller's sad is beaten */
    if (i4_best_sad >= *pu4_sadmin)
    {
        return;
    }

    *pu4_sadmin = i4_best_sad;

    /* on equal sad the preference order is VERT, then HORZ, then DC */
    if (i4_best_sad == i4_vert_sad)
    {
        *u4_intra_mode = VERT_I16x16;

        for (row = 0; row < 16; row++, pu1_out += dst_strd)
        {
            memcpy(pu1_out, pu1_top_pels, MB_SIZE);
        }
    }
    else if (i4_best_sad == i4_horz_sad)
    {
        *u4_intra_mode = HORZ_I16x16;

        for (row = 0; row < 16; row++, pu1_out += dst_strd)
        {
            memset(pu1_out, pu1_ngbr_pels_i16[15 - row], MB_SIZE);
        }
    }
    else
    {
        *u4_intra_mode = DC_I16x16;

        for (row = 0; row < 16; row++, pu1_out += dst_strd)
        {
            memset(pu1_out, i4_dc_pred, MB_SIZE);
        }
    }

    return;
}
1664 
1665 /**
1666 ******************************************************************************
1667 *
1668 * @brief
1669 *  Evaluate best intra 4x4 mode and perform prediction.
1670 *
1671 * @par Description
1672 *  This function evaluates  4x4 modes and compute corresponding sad
1673 *  and return the buffer predicted with best mode.
1674 *
1675 * @param[in] pu1_src
1676 *  UWORD8 pointer to the source
1677 *
1678 * @param[in] pu1_ngbr_pels
1679 *  UWORD8 pointer to neighbouring pels
1680 *
1681 * @param[out] pu1_dst
1682 *  UWORD8 pointer to the destination
1683 *
1684 * @param[in] src_strd
1685 *  integer source stride
1686 *
1687 * @param[in] dst_strd
1688 *  integer destination stride
1689 *
1690 * @param[in] u4_n_avblty
1691 *  availability of neighbouring pixels
1692 *
* @param[out] u4_intra_mode
*  Pointer to the variable in which best mode is returned
*
* @param[out] pu4_sadmin
*  Pointer to the variable in which minimum cost is returned
1698 *
1699 * @param[in] u4_valid_intra_modes
1700 *  Says what all modes are valid
1701 *
* @param[in] u4_lambda
*  Lambda value for computing cost from SAD
1704 *
1705 * @param[in] u4_predictd_mode
1706 *  Predicted mode for cost computation
1707 *
1708 * @returns      none
1709 *
1710 ******************************************************************************
1711 */
void ih264e_evaluate_intra_4x4_modes(UWORD8 *pu1_src,
                                     UWORD8 *pu1_ngbr_pels,
                                     UWORD8 *pu1_dst,
                                     UWORD32 src_strd,
                                     UWORD32 dst_strd,
                                     WORD32 u4_n_avblty,
                                     UWORD32 *u4_intra_mode,
                                     WORD32 *pu4_sadmin,
                                     UWORD32 u4_valid_intra_modes,
                                     UWORD32  u4_lambda,
                                     UWORD32 u4_predictd_mode)
{
    /*
     * The function works in three steps:
     *   1. for every enabled mode, record where the 4 pels predicting each
     *      of the 4 rows can be read from,
     *   2. price every enabled mode with one common SAD + lambda loop,
     *   3. report the cheapest mode and copy its 4 rows to the destination.
     *
     * NOTE(review): mode value k, bit k of u4_valid_intra_modes and entry k
     * of the cost table are treated as the same thing, i.e. VERT_I4x4 ..
     * HORZ_U_I4x4 are taken to be 0 .. 8 and MAX_I4x4 to be 9 - confirm
     * against the enum definition.
     *
     * Neighbour buffer as it is read here: the left pel of row r sits at
     * index (3 - r), the top pels at indices 5..8, and indices up to 12 feed
     * the filters (presumably top-left at 4 and top-right at 9..12).
     */

    /* start of the 4 predicted pels of each row, for each mode */
    UWORD8 *apu1_pred_row[MAX_I4x4][4];

    /* scratch rows for HORZ and DC */
    UWORD8 au1_horz[4][4];
    UWORD8 au1_dc[4] = {0};

    /* FILT121 and FILT11 filtered neighbours */
    UWORD8 au1_filt121[15] = {0};
    UWORD8 au1_filt11[15] = {0};

    /* scratch rows for the modes that interleave both filter outputs */
    UWORD8 au1_vert_r[8] = {0};
    UWORD8 au1_horz_d[10] = {0};
    UWORD8 au1_horz_u[10] = {0};

    /* neighbour availability; both flags also feed the DC shift below */
    UWORD8 u1_left_avbl = (u4_n_avblty & LEFT_MB_AVAILABLE_MASK);
    UWORD8 u1_top_avbl = (u4_n_avblty & TOP_MB_AVAILABLE_MASK) >> 2;

    WORD32 i4_dc_pred = 0;

    /* a mode that is not evaluated keeps the maximum cost */
    WORD32 i4_cost[MAX_I4x4] = {INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX,
                                INT_MAX, INT_MAX, INT_MAX, INT_MAX};

    UWORD8 *pu1_out = pu1_dst;
    WORD32 i4_best_mode;
    WORD32 mode, row, i;

    /*************************************************************************/
    /* step 1 : locate / build the row predictors                            */
    /*************************************************************************/

    /* VERT: all rows repeat the top pels. These rows are set even when VERT
     * is disabled because mode 0 is what gets reported when no mode at all
     * has been evaluated */
    for (row = 0; row < 4; row++)
    {
        apu1_pred_row[VERT_I4x4][row] = pu1_ngbr_pels + 5;
    }

    /* HORZ: each row repeats its left pel */
    if (u4_valid_intra_modes & (1 << HORZ_I4x4))
    {
        for (row = 0; row < 4; row++)
        {
            memset(au1_horz[row], pu1_ngbr_pels[3 - row], 4);
            apu1_pred_row[HORZ_I4x4][row] = au1_horz[row];
        }
    }

    /* DC: rounded mean of the available left / top pels, 128 if none */
    if (u4_valid_intra_modes & (1 << DC_I4x4))
    {
        if (u1_left_avbl)
        {
            i4_dc_pred = pu1_ngbr_pels[0] + pu1_ngbr_pels[1]
                            + pu1_ngbr_pels[2] + pu1_ngbr_pels[3] + 2;
        }
        if (u1_top_avbl)
        {
            i4_dc_pred += pu1_ngbr_pels[5] + pu1_ngbr_pels[6]
                            + pu1_ngbr_pels[7] + pu1_ngbr_pels[8] + 2;
        }

        i4_dc_pred = (i4_dc_pred) ?
                        (i4_dc_pred >> (1 + u1_left_avbl + u1_top_avbl)) : 128;

        memset(au1_dc, i4_dc_pred, 4);

        for (row = 0; row < 4; row++)
        {
            apu1_pred_row[DC_I4x4][row] = au1_dc;
        }
    }

    /* anything beyond VERT, HORZ and DC needs the filtered neighbours */
    if (u4_valid_intra_modes > 7)
    {
        /* pad the caller's neighbour buffer so that the filters can run
         * past index 12 */
        pu1_ngbr_pels[13] = pu1_ngbr_pels[14] = pu1_ngbr_pels[12];

        for (i = 0; i < 13; i++)
        {
            au1_filt121[i] = FILT121(pu1_ngbr_pels[i], pu1_ngbr_pels[i + 1],
                                     pu1_ngbr_pels[i + 2]);
            au1_filt11[i] = FILT11(pu1_ngbr_pels[i], pu1_ngbr_pels[i + 1]);
        }

        /* DIAG_DL: window over the 121 filtered pels moving up by one pel
         * per row */
        if (u4_valid_intra_modes & (1 << DIAG_DL_I4x4))
        {
            for (row = 0; row < 4; row++)
            {
                apu1_pred_row[DIAG_DL_I4x4][row] = au1_filt121 + 5 + row;
            }
        }

        /* DIAG_DR: window over the 121 filtered pels moving down by one pel
         * per row */
        if (u4_valid_intra_modes & (1 << DIAG_DR_I4x4))
        {
            for (row = 0; row < 4; row++)
            {
                apu1_pred_row[DIAG_DR_I4x4][row] = au1_filt121 + 3 - row;
            }
        }

        /* VERT_R: rows 0 and 1 are read in place, rows 2 and 3 are the same
         * rows shifted right with one extra 121 filtered pel in front */
        if (u4_valid_intra_modes & (1 << VERT_R_I4x4))
        {
            au1_vert_r[0] = au1_filt121[2];
            memcpy(au1_vert_r + 1, au1_filt11 + 4, 3);
            au1_vert_r[4] = au1_filt121[1];
            memcpy(au1_vert_r + 5, au1_filt121 + 3, 3);

            apu1_pred_row[VERT_R_I4x4][0] = au1_filt11 + 4;
            apu1_pred_row[VERT_R_I4x4][1] = au1_filt121 + 3;
            apu1_pred_row[VERT_R_I4x4][2] = au1_vert_r;
            apu1_pred_row[VERT_R_I4x4][3] = au1_vert_r + 4;
        }

        /* HORZ_D: 11 and 121 filtered pels interleaved, window moving down
         * by two pels per row */
        if (u4_valid_intra_modes & (1 << HORZ_D_I4x4))
        {
            for (i = 0; i < 3; i++)
            {
                au1_horz_d[2 * i] = au1_filt11[i];
                au1_horz_d[2 * i + 1] = au1_filt121[i];
            }
            au1_horz_d[6] = au1_filt11[3];
            memcpy(au1_horz_d + 7, au1_filt121 + 3, 3);

            for (row = 0; row < 4; row++)
            {
                apu1_pred_row[HORZ_D_I4x4][row] = au1_horz_d + 6 - 2 * row;
            }
        }

        /* VERT_L: rows alternate between 11 and 121 filtered pels */
        if (u4_valid_intra_modes & (1 << VERT_L_I4x4))
        {
            apu1_pred_row[VERT_L_I4x4][0] = au1_filt11 + 5;
            apu1_pred_row[VERT_L_I4x4][1] = au1_filt121 + 5;
            apu1_pred_row[VERT_L_I4x4][2] = au1_filt11 + 6;
            apu1_pred_row[VERT_L_I4x4][3] = au1_filt121 + 6;
        }

        /* HORZ_U: interleaved filtered left pels, with neighbour pel 0
         * filling the tail; window moving up by two pels per row */
        if (u4_valid_intra_modes & (1 << HORZ_U_I4x4))
        {
            au1_horz_u[0] = au1_filt11[2];
            au1_horz_u[1] = au1_filt121[1];
            au1_horz_u[2] = au1_filt11[1];
            au1_horz_u[3] = au1_filt121[0];
            au1_horz_u[4] = au1_filt11[0];
            au1_horz_u[5] = FILT121(pu1_ngbr_pels[0], pu1_ngbr_pels[0],
                                    pu1_ngbr_pels[1]);
            memset(au1_horz_u + 6, pu1_ngbr_pels[0], 4);

            for (row = 0; row < 4; row++)
            {
                apu1_pred_row[HORZ_U_I4x4][row] = au1_horz_u + 2 * row;
            }
        }
    }

    /*************************************************************************/
    /* step 2 : cost = SAD + lambda (predicted mode) or 4 * lambda (others)  */
    /*************************************************************************/
    for (mode = 0; mode < MAX_I4x4; mode++)
    {
        UWORD8 *pu1_src_row = pu1_src;
        WORD32 i4_sad = 0;

        /* only the rows of enabled modes have been set up in step 1 */
        if (0 == (u4_valid_intra_modes & (1 << mode)))
        {
            continue;
        }

        for (row = 0; row < 4; row++)
        {
            UWORD8 *pu1_est = apu1_pred_row[mode][row];

            USADA8(pu1_src_row, pu1_est, i4_sad);
            pu1_src_row += src_strd;
        }

        i4_cost[mode] = i4_sad + ((u4_predictd_mode == (UWORD32)mode) ?
                                        u4_lambda : 4 * u4_lambda);
    }

    /*************************************************************************/
    /* step 3 : pick the cheapest mode and emit its prediction               */
    /*************************************************************************/

    /* strict comparison: on equal cost the lowest mode number wins */
    i4_best_mode = 0;
    for (mode = 1; mode < MAX_I4x4; mode++)
    {
        if (i4_cost[mode] < i4_cost[i4_best_mode])
        {
            i4_best_mode = mode;
        }
    }

    *pu4_sadmin = i4_cost[i4_best_mode];
    *u4_intra_mode = i4_best_mode;

    for (row = 0; row < 4; row++)
    {
        memcpy(pu1_out, apu1_pred_row[i4_best_mode][row], 4);
        pu1_out += dst_strd;
    }

    return;
}
2125 
2126 /**
2127 ******************************************************************************
2128 *
2129 * @brief:
*  Evaluate best intra chroma mode (among VERT, HORZ and DC) and do the prediction.
2131 *
2132 * @par Description
2133 *  This function evaluates  first three intra chroma modes and compute corresponding sad
2134 *  and return the buffer predicted with best mode.
2135 *
2136 * @param[in] pu1_src
2137 *  UWORD8 pointer to the source
2138 *
2139 * @param[in] pu1_ngbr_pels
2140 *  UWORD8 pointer to neighbouring pels
2141 *
2142 * @param[out] pu1_dst
2143 *  UWORD8 pointer to the destination
2144 *
2145 * @param[in] src_strd
2146 *  integer source stride
2147 *
2148 * @param[in] dst_strd
2149 *  integer destination stride
2150 *
2151 * @param[in] u4_n_avblty
2152 *  availability of neighbouring pixels
2153 *
2154 * @param[in] u4_intra_mode
2155 *  Pointer to the variable in which best mode is returned
2156 *
2157 * @param[in] pu4_sadmin
2158 *  Pointer to the variable in which minimum sad is returned
2159 *
2160 * @param[in] u4_valid_intra_modes
2161 *  Says what all modes are valid
2162 *
2163 * @return      none
2164 *
2165 ******************************************************************************
2166 */
ih264e_evaluate_intra_chroma_modes(UWORD8 * pu1_src,UWORD8 * pu1_ngbr_pels,UWORD8 * pu1_dst,UWORD32 src_strd,UWORD32 dst_strd,WORD32 u4_n_avblty,UWORD32 * u4_intra_mode,WORD32 * pu4_sadmin,UWORD32 u4_valid_intra_modes)2167 void ih264e_evaluate_intra_chroma_modes(UWORD8 *pu1_src,
2168                                         UWORD8 *pu1_ngbr_pels,
2169                                         UWORD8 *pu1_dst,
2170                                         UWORD32 src_strd,
2171                                         UWORD32 dst_strd,
2172                                         WORD32 u4_n_avblty,
2173                                         UWORD32 *u4_intra_mode,
2174                                         WORD32 *pu4_sadmin,
2175                                         UWORD32 u4_valid_intra_modes)
2176 {
2177     UWORD8 *pu1_neighbour;
2178     UWORD8 *pu1_src_temp = pu1_src;
2179     UWORD8 left = 0, top = 0;
2180     WORD32 u4_dcval_u_l[2] = { 0, 0 }, /*sum left neighbours for 'U' ,two separate sets - sum of first four from top,and sum of four values from bottom */
2181            u4_dcval_u_t[2] = { 0, 0 };  /*sum top neighbours for 'U'*/
2182 
2183     WORD32 u4_dcval_v_l[2] = { 0, 0 }, /*sum left neighbours for 'V'*/
2184            u4_dcval_v_t[2] = { 0, 0 }; /*sum top neighbours for 'V'*/
2185 
2186     WORD32 i, j, row, col, i4_sad_vert = INT_MAX, i4_sad_horz = INT_MAX,
2187                     i4_sad_dc = INT_MAX, i4_min_sad = INT_MAX;
2188     UWORD8 val_u, val_v;
2189 
2190     WORD32 u4_dc_val[2][2][2];/*  -----------
2191                                   |    |    |  Chroma can have four
2192                                   | 00 | 01 |  separate dc value...
2193                                   -----------  u4_dc_val corresponds to this dc values
2194                                   |    |    |  with u4_dc_val[2][2][U] and u4_dc_val[2][2][V]
2195                                   | 10 | 11 |
2196                                   -----------                */
2197     left = (u4_n_avblty & LEFT_MB_AVAILABLE_MASK);
2198     top = (u4_n_avblty & TOP_MB_AVAILABLE_MASK) >> 2;
2199 
2200     /*Evaluating HORZ*/
2201     if (left)/* Ifleft available*/
2202     {
2203         i4_sad_horz = 0;
2204 
2205         for (i = 0; i < 8; i++)
2206         {
2207             val_v = pu1_ngbr_pels[15 - 2 * i];
2208             val_u = pu1_ngbr_pels[15 - 2 * i - 1];
2209             row = i / 4;
2210             u4_dcval_u_l[row] += val_u;
2211             u4_dcval_v_l[row] += val_v;
2212             for (j = 0; j < 8; j++)
2213             {
2214                 i4_sad_horz += ABS(val_u - pu1_src_temp[2 * j]);/* Finding SAD for HORZ mode*/
2215                 i4_sad_horz += ABS(val_v - pu1_src_temp[2 * j + 1]);
2216             }
2217 
2218             pu1_src_temp += src_strd;
2219         }
2220         u4_dcval_u_l[0] += 2;
2221         u4_dcval_u_l[1] += 2;
2222         u4_dcval_v_l[0] += 2;
2223         u4_dcval_v_l[1] += 2;
2224     }
2225 
2226     /*Evaluating VERT**/
2227     pu1_src_temp = pu1_src;
2228     if (top) /* top available*/
2229     {
2230         i4_sad_vert = 0;
2231 
2232         for (i = 0; i < 8; i++)
2233         {
2234             col = i / 4;
2235 
2236             val_u = pu1_ngbr_pels[18 + i * 2];
2237             val_v = pu1_ngbr_pels[18 + i * 2 + 1];
2238             u4_dcval_u_t[col] += val_u;
2239             u4_dcval_v_t[col] += val_v;
2240 
2241             for (j = 0; j < 16; j++)
2242             {
2243                 i4_sad_vert += ABS(pu1_ngbr_pels[18 + j] - pu1_src_temp[j]);/* Finding SAD for VERT mode*/
2244             }
2245             pu1_src_temp += src_strd;
2246 
2247         }
2248         u4_dcval_u_t[0] += 2;
2249         u4_dcval_u_t[1] += 2;
2250         u4_dcval_v_t[0] += 2;
2251         u4_dcval_v_t[1] += 2;
2252     }
2253 
2254     /* computing DC value*/
2255     /* Equation  8-128 in spec*/
2256     u4_dc_val[0][0][0] = (u4_dcval_u_l[0] + u4_dcval_u_t[0]) >> (1 + left + top);
2257     u4_dc_val[0][0][1] = (u4_dcval_v_l[0] + u4_dcval_v_t[0]) >> (1 + left + top);
2258     u4_dc_val[1][1][0] = (u4_dcval_u_l[1] + u4_dcval_u_t[1]) >> (1 + left + top);
2259     u4_dc_val[1][1][1] = (u4_dcval_v_l[1] + u4_dcval_v_t[1]) >> (1 + left + top);
2260 
2261     if (top)
2262     {
2263         /* Equation  8-132 in spec*/
2264         u4_dc_val[0][1][0] = (u4_dcval_u_t[1]) >> (1 + top);
2265         u4_dc_val[0][1][1] = (u4_dcval_v_t[1]) >> (1 + top);
2266     }
2267     else
2268     {
2269         u4_dc_val[0][1][0] = (u4_dcval_u_l[0]) >> (1 + left);
2270         u4_dc_val[0][1][1] = (u4_dcval_v_l[0]) >> (1 + left);
2271     }
2272 
2273     if (left)
2274     {
2275         u4_dc_val[1][0][0] = (u4_dcval_u_l[1]) >> (1 + left);
2276         u4_dc_val[1][0][1] = (u4_dcval_v_l[1]) >> (1 + left);
2277     }
2278     else
2279     {
2280         u4_dc_val[1][0][0] = (u4_dcval_u_t[0]) >> (1 + top);
2281         u4_dc_val[1][0][1] = (u4_dcval_v_t[0]) >> (1 + top);
2282     }
2283 
2284     if (!(left || top))
2285     {
2286         /*none available*/
2287         u4_dc_val[0][0][0] = u4_dc_val[0][0][1] =
2288         u4_dc_val[0][1][0] = u4_dc_val[0][1][1] =
2289         u4_dc_val[1][0][0] = u4_dc_val[1][0][1] =
2290         u4_dc_val[1][1][0] = u4_dc_val[1][1][1] = 128;
2291     }
2292 
2293     /* Evaluating DC */
2294     pu1_src_temp = pu1_src;
2295     i4_sad_dc = 0;
2296     for (i = 0; i < 8; i++)
2297     {
2298         for (j = 0; j < 8; j++)
2299         {
2300             col = j / 4;
2301             row = i / 4;
2302             val_u = u4_dc_val[row][col][0];
2303             val_v = u4_dc_val[row][col][1];
2304 
2305             i4_sad_dc += ABS(val_u - pu1_src_temp[2 * j]);/* Finding SAD for DC mode*/
2306             i4_sad_dc += ABS(val_v - pu1_src_temp[2 * j + 1]);
2307         }
2308         pu1_src_temp += src_strd;
2309     }
2310 
2311     if ((u4_valid_intra_modes & 01) == 0)/* If DC is disabled*/
2312         i4_sad_dc = INT_MAX;
2313     if ((u4_valid_intra_modes & 02) == 0)/* If HORZ is disabled*/
2314         i4_sad_horz = INT_MAX;
2315     if ((u4_valid_intra_modes & 04) == 0)/* If VERT is disabled*/
2316         i4_sad_vert = INT_MAX;
2317 
2318     i4_min_sad = MIN3(i4_sad_horz, i4_sad_dc, i4_sad_vert);
2319 
2320     /* Finding Minimum sad and doing corresponding prediction*/
2321     if (i4_min_sad < *pu4_sadmin)
2322     {
2323         *pu4_sadmin = i4_min_sad;
2324 
2325         if (i4_min_sad == i4_sad_dc)
2326         {
2327             *u4_intra_mode = DC_CH_I8x8;
2328             for (i = 0; i < 8; i++)
2329             {
2330                 for (j = 0; j < 8; j++)
2331                 {
2332                     col = j / 4;
2333                     row = i / 4;
2334 
2335                     pu1_dst[2 * j] = u4_dc_val[row][col][0];
2336                     pu1_dst[2 * j + 1] = u4_dc_val[row][col][1];
2337                 }
2338                 pu1_dst += dst_strd;
2339             }
2340         }
2341         else if (i4_min_sad == i4_sad_horz)
2342         {
2343             *u4_intra_mode = HORZ_CH_I8x8;
2344             for (j = 0; j < 8; j++)
2345             {
2346                 val_v = pu1_ngbr_pels[15 - 2 * j];
2347                 val_u = pu1_ngbr_pels[15 - 2 * j - 1];
2348 
2349                 for (i = 0; i < 8; i++)
2350                 {
2351                     pu1_dst[2 * i] = val_u;
2352                     pu1_dst[2 * i + 1] = val_v;
2353 
2354                 }
2355                 pu1_dst += dst_strd;
2356             }
2357         }
2358         else
2359         {
2360             *u4_intra_mode = VERT_CH_I8x8;
2361             pu1_neighbour = pu1_ngbr_pels + 18;
2362             for (j = 0; j < 8; j++)
2363             {
2364                 memcpy(pu1_dst, pu1_neighbour, MB_SIZE);
2365                 pu1_dst += dst_strd;
2366             }
2367         }
2368     }
2369 
2370     return;
2371 }
2372