1 /******************************************************************************
2  *
3  * Copyright (C) 2015 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 
/**
*******************************************************************************
* @file
*  ih264e_intra_modes_eval.c
*
* @brief
*  This file contains definitions of routines that perform rate distortion
*  analysis on a macroblock if they are to be coded as intra.
*
* @author
*  ittiam
*
* @par List of Functions:
*  - ih264e_derive_nghbr_avbl_of_mbs()
*  - ih264e_derive_ngbr_avbl_of_mb_partitions()
*  - ih264e_evaluate_intra16x16_modes_for_least_cost_rdoptoff()
*  - ih264e_evaluate_intra8x8_modes_for_least_cost_rdoptoff()
*  - ih264e_evaluate_intra4x4_modes_for_least_cost_rdoptoff()
*  - ih264e_evaluate_intra4x4_modes_for_least_cost_rdopton()
*  - ih264e_evaluate_chroma_intra8x8_modes_for_least_cost_rdoptoff()
*  - ih264e_evaluate_intra16x16_modes()
*  - ih264e_evaluate_intra4x4_modes()
*  - ih264e_evaluate_intra_chroma_modes()
*
* @remarks
*  None
*
*******************************************************************************
*/
50 
51 /*****************************************************************************/
52 /* File Includes                                                             */
53 /*****************************************************************************/
54 
55 /* System include files */
56 #include <stdio.h>
57 #include <string.h>
58 #include <limits.h>
59 #include <assert.h>
60 
61 /* User include files */
62 #include "ih264e_config.h"
63 #include "ih264_typedefs.h"
64 #include "ih264e_defs.h"
65 #include "iv2.h"
66 #include "ive2.h"
67 #include "ih264_debug.h"
68 #include "ih264_defs.h"
69 #include "ih264_macros.h"
70 #include "ih264_intra_pred_filters.h"
71 #include "ih264_structs.h"
72 #include "ih264_common_tables.h"
73 #include "ih264_trans_quant_itrans_iquant.h"
74 #include "ih264_inter_pred_filters.h"
75 #include "ih264_mem_fns.h"
76 #include "ih264_padding.h"
77 #include "ih264_deblk_edge_filters.h"
78 #include "ih264_cabac_tables.h"
79 #include "ime_distortion_metrics.h"
80 #include "ih264e_error.h"
81 #include "ih264e_bitstream.h"
82 #include "ime_defs.h"
83 #include "ime_structs.h"
84 #include "irc_cntrl_param.h"
85 #include "irc_frame_info_collector.h"
86 #include "ih264e_rate_control.h"
87 #include "ih264e_cabac_structs.h"
88 #include "ih264e_structs.h"
89 #include "ih264e_intra_modes_eval.h"
90 #include "ih264e_globals.h"
91 #include "ime_platform_macros.h"
92 
93 
94 /*****************************************************************************/
95 /* Function Definitions                                                      */
96 /*****************************************************************************/
97 
98 /**
99 ******************************************************************************
100 *
101 * @brief
102 *  derivation process for macroblock availability
103 *
104 * @par   Description
105 *  Calculates the availability of the left, top, topright and topleft macroblocks.
106 *
107 * @param[in] ps_proc_ctxt
108 *  pointer to proc context (handle)
109 *
110 * @remarks Based on section 6.4.5 in H264 spec
111 *
112 * @return  none
113 *
114 ******************************************************************************
115 */
ih264e_derive_nghbr_avbl_of_mbs(process_ctxt_t * ps_proc)116 void ih264e_derive_nghbr_avbl_of_mbs(process_ctxt_t *ps_proc)
117 {
118     UWORD8 *pu1_slice_idx_curr = ps_proc->pu1_slice_idx;
119     UWORD8 *pu1_slice_idx_b;
120     UWORD8 *pu1_slice_idx_a;
121     UWORD8 *pu1_slice_idx_c;
122     UWORD8 *pu1_slice_idx_d;
123     block_neighbors_t *ps_ngbr_avbl;
124     WORD32 i4_mb_x, i4_mb_y;
125     WORD32 i4_wd_mbs;
126 
127     i4_mb_x = ps_proc->i4_mb_x;
128     i4_mb_y = ps_proc->i4_mb_y;
129 
130     i4_wd_mbs = ps_proc->i4_wd_mbs;
131 
132     pu1_slice_idx_curr += (i4_mb_y * i4_wd_mbs) + i4_mb_x;
133     pu1_slice_idx_a = pu1_slice_idx_curr - 1;
134     pu1_slice_idx_b = pu1_slice_idx_curr - i4_wd_mbs;
135     pu1_slice_idx_c = pu1_slice_idx_b + 1;
136     pu1_slice_idx_d = pu1_slice_idx_b - 1;
137     ps_ngbr_avbl = ps_proc->ps_ngbr_avbl;
138 
139     /**********************************************************************/
140     /* The macroblock is marked as available, unless one of the following */
141     /* conditions is true in which case the macroblock shall be marked as */
142     /* not available.                                                     */
143     /* 1. mbAddr < 0                                                      */
144     /* 2  mbAddr > CurrMbAddr                                             */
145     /* 3. the macroblock with address mbAddr belongs to a different slice */
146     /* than the macroblock with address CurrMbAddr                        */
147     /**********************************************************************/
148 
149     /* left macroblock availability */
150     if (i4_mb_x == 0)
151     { /* macroblocks along first column */
152         ps_ngbr_avbl->u1_mb_a = 0;
153     }
154     else
155     { /* macroblocks belong to same slice? */
156         if (*pu1_slice_idx_a != *pu1_slice_idx_curr)
157             ps_ngbr_avbl->u1_mb_a = 0;
158         else
159             ps_ngbr_avbl->u1_mb_a = 1;
160     }
161 
162     /* top macroblock availability */
163     if (i4_mb_y == 0)
164     { /* macroblocks along first row */
165         ps_ngbr_avbl->u1_mb_b = 0;
166     }
167     else
168     { /* macroblocks belong to same slice? */
169         if (*pu1_slice_idx_b != *pu1_slice_idx_curr)
170             ps_ngbr_avbl->u1_mb_b = 0;
171         else
172             ps_ngbr_avbl->u1_mb_b = 1;
173     }
174 
175     /* top right macroblock availability */
176     if (i4_mb_x == i4_wd_mbs-1 || i4_mb_y == 0)
177     { /* macroblocks along last column */
178         ps_ngbr_avbl->u1_mb_c = 0;
179     }
180     else
181     { /* macroblocks belong to same slice? */
182         if (*pu1_slice_idx_c != *pu1_slice_idx_curr)
183             ps_ngbr_avbl->u1_mb_c = 0;
184         else
185             ps_ngbr_avbl->u1_mb_c = 1;
186     }
187 
188     /* top left macroblock availability */
189     if (i4_mb_x == 0 || i4_mb_y == 0)
190     { /* macroblocks along first column */
191         ps_ngbr_avbl->u1_mb_d = 0;
192     }
193     else
194     { /* macroblocks belong to same slice? */
195         if (*pu1_slice_idx_d != *pu1_slice_idx_curr)
196             ps_ngbr_avbl->u1_mb_d = 0;
197         else
198             ps_ngbr_avbl->u1_mb_d = 1;
199     }
200 }
201 
202 /**
203 ******************************************************************************
204 *
205 * @brief
206 *  derivation process for subblock/partition availability
207 *
208 * @par   Description
209 *  Calculates the availability of the left, top, topright and topleft subblock
210 *  or partitions.
211 *
212 * @param[in]    ps_proc_ctxt
213 *  pointer to macroblock context (handle)
214 *
215 * @param[in]    i1_pel_pos_x
216 *  column position of the pel wrt the current block
217 *
218 * @param[in]    i1_pel_pos_y
219 *  row position of the pel in wrt current block
220 *
221 * @remarks     Assumptions: before calling this function it is assumed that
222 *   the neighbor availability of the current macroblock is already derived.
223 *   Based on table 6-3 of H264 specification
224 *
225 * @return      availability status (yes or no)
226 *
227 ******************************************************************************
228 */
ih264e_derive_ngbr_avbl_of_mb_partitions(block_neighbors_t * ps_ngbr_avbl,WORD8 i1_pel_pos_x,WORD8 i1_pel_pos_y)229 UWORD8 ih264e_derive_ngbr_avbl_of_mb_partitions(block_neighbors_t *ps_ngbr_avbl,
230                                                 WORD8 i1_pel_pos_x,
231                                                 WORD8 i1_pel_pos_y)
232 {
233     UWORD8 u1_neighbor_avail=0;
234 
235     /**********************************************************************/
236     /* values of i1_pel_pos_x in the range 0-15 inclusive correspond to   */
237     /* various columns of a macroblock                                    */
238     /*                                                                    */
239     /* values of i1_pel_pos_y in the range 0-15 inclusive correspond to   */
240     /* various rows of a macroblock                                       */
241     /*                                                                    */
242     /* other values of i1_pel_pos_x & i1_pel_pos_y represents elements    */
243     /* outside the bound of an mb ie., represents its neighbors.          */
244     /**********************************************************************/
245     if (i1_pel_pos_x < 0)
246     { /* column(-1) */
247         if (i1_pel_pos_y < 0)
248         { /* row(-1) */
249             u1_neighbor_avail = ps_ngbr_avbl->u1_mb_d; /* current mb topleft availability */
250         }
251         else if (i1_pel_pos_y >= 0 && i1_pel_pos_y < 16)
252         { /* all rows of a macroblock */
253             u1_neighbor_avail = ps_ngbr_avbl->u1_mb_a; /* current mb left availability */
254         }
255         else /* if (i1_pel_pos_y >= 16) */
256         { /* rows(+16) */
257             u1_neighbor_avail = 0;  /* current mb bottom left availability */
258         }
259     }
260     else if (i1_pel_pos_x >= 0 && i1_pel_pos_x < 16)
261     { /* all columns of a macroblock */
262         if (i1_pel_pos_y < 0)
263         { /* row(-1) */
264             u1_neighbor_avail = ps_ngbr_avbl->u1_mb_b; /* current mb top availability */
265         }
266         else if (i1_pel_pos_y >= 0 && i1_pel_pos_y < 16)
267         { /* all rows of a macroblock */
268             u1_neighbor_avail = 1; /* current mb availability */
269             /* availability of the partition is dependent on the position of the partition inside the mb */
270             /* although the availability is declared as 1 in all cases these needs to be corrected somewhere else and this is not done in here */
271         }
272         else /* if (i1_pel_pos_y >= 16) */
273         { /* rows(+16) */
274             u1_neighbor_avail = 0;  /* current mb bottom availability */
275         }
276     }
277     else if (i1_pel_pos_x >= 16)
278     { /* column(+16) */
279         if (i1_pel_pos_y < 0)
280         { /* row(-1) */
281             u1_neighbor_avail = ps_ngbr_avbl->u1_mb_c; /* current mb top right availability */
282         }
283         else /* if (i1_pel_pos_y >= 0) */
284         { /* all other rows */
285             u1_neighbor_avail = 0;  /* current mb right & bottom right availability */
286         }
287     }
288 
289     return u1_neighbor_avail;
290 }
291 
292 /**
293 ******************************************************************************
294 *
295 * @brief
296 *  evaluate best intra 16x16 mode (rate distortion opt off)
297 *
298 * @par Description
299 *  This function evaluates all the possible intra 16x16 modes and finds the mode
300 *  that best represents the macro-block (least distortion) and occupies fewer
301 *  bits in the bit-stream.
302 *
303 * @param[in]   ps_proc_ctxt
304 *  pointer to process context (handle)
305 *
306 * @remarks
307 *  Ideally the cost of encoding a macroblock is calculated as
308 *  (distortion + lambda*rate). Where distortion is SAD/SATD,... between the
309 *  input block and the reconstructed block and rate is the number of bits taken
310 *  to place the macroblock in the bit-stream. In this routine the rate does not
311 *  exactly point to the total number of bits it takes, rather it points to header
312 *  bits necessary for encoding the macroblock. Assuming the deltaQP, cbp bits
313 *  and residual bits fall in to texture bits the number of bits taken to encoding
314 *  mbtype is considered as rate, we compute cost. Further we will approximate
315 *  the distortion as the deviation b/w input and the predicted block as opposed
316 *  to input and reconstructed block.
317 *
318 *  NOTE: As per the Document JVT-O079, for intra 16x16 macroblock,
319 *  the SAD and cost are one and the same.
320 *
321 * @return     none
322 *
323 ******************************************************************************
324 */
325 
ih264e_evaluate_intra16x16_modes_for_least_cost_rdoptoff(process_ctxt_t * ps_proc)326 void ih264e_evaluate_intra16x16_modes_for_least_cost_rdoptoff(process_ctxt_t *ps_proc)
327 {
328     /* Codec Context */
329     codec_t *ps_codec = ps_proc->ps_codec;
330 
331     /* SAD(distortion metric) of an 8x8 block */
332     WORD32 i4_mb_distortion = INT_MAX, i4_mb_distortion_least = INT_MAX;
333 
334     /* lambda */
335     UWORD32 u4_lambda = ps_proc->u4_lambda;
336 
337     /* cost = distortion + lambda*rate */
338     WORD32 i4_mb_cost= INT_MAX, i4_mb_cost_least = INT_MAX;
339 
340     /* intra mode */
341     UWORD32 u4_intra_mode, u4_best_intra_16x16_mode = DC_I16x16;
342 
343     /* neighbor pels for intra prediction */
344     UWORD8 *pu1_ngbr_pels_i16 = ps_proc->au1_ngbr_pels;
345 
346     /* neighbor availability */
347     WORD32 i4_ngbr_avbl;
348 
349     /* pointer to src macro block */
350     UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_luma;
351     UWORD8 *pu1_ref_mb = ps_proc->pu1_rec_buf_luma;
352 
353     /* pointer to prediction macro block */
354     UWORD8 *pu1_pred_mb_intra_16x16 = ps_proc->pu1_pred_mb_intra_16x16;
355     UWORD8 *pu1_pred_mb_intra_16x16_plane = ps_proc->pu1_pred_mb_intra_16x16_plane;
356 
357     /* strides */
358     WORD32 i4_src_strd = ps_proc->i4_src_strd;
359     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
360     WORD32 i4_rec_strd = ps_proc->i4_rec_strd;
361 
362     /* pointer to neighbors left, top, topleft */
363     UWORD8 *pu1_mb_a = pu1_ref_mb - 1;
364     UWORD8 *pu1_mb_b = pu1_ref_mb - i4_rec_strd;
365     UWORD8 *pu1_mb_d = pu1_mb_b - 1;
366     UWORD8 u1_mb_a, u1_mb_b, u1_mb_d;
367     /* valid intra modes map */
368     UWORD32 u4_valid_intra_modes;
369 
370     /* lut for valid intra modes */
371     const UWORD8 u1_valid_intra_modes[8] = {4, 6, 4, 6, 5, 7, 5, 15};
372 
373     /* temp var */
374     UWORD32 i, u4_enable_fast_sad = 0, offset = 0;
375     mb_info_t *ps_top_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
376     UWORD32 u4_constrained_intra_pred = ps_proc->ps_codec->s_cfg.u4_constrained_intra_pred;
377 
378     /* init temp var */
379     if (ps_proc->i4_slice_type != ISLICE)
380     {
381         /* Offset for MBtype */
382         offset = (ps_proc->i4_slice_type == PSLICE) ? 5 : 23;
383         u4_enable_fast_sad = ps_proc->s_me_ctxt.u4_enable_fast_sad;
384     }
385 
386     /* locating neighbors that are available for prediction */
387 
388     /* gather prediction pels from the neighbors, if particular set is not available
389      * it is set to zero*/
390     /* left pels */
391     u1_mb_a = ((ps_proc->ps_ngbr_avbl->u1_mb_a)
392                     && (u4_constrained_intra_pred ? ps_proc->s_left_mb_syntax_ele.u2_is_intra : 1));
393     if (u1_mb_a)
394     {
395         for(i = 0; i < 16; i++)
396             pu1_ngbr_pels_i16[16-1-i] = pu1_mb_a[i * i4_rec_strd];
397     }
398     else
399     {
400         ps_codec->pf_mem_set_mul8(pu1_ngbr_pels_i16,0,MB_SIZE);
401     }
402     /* top pels */
403     u1_mb_b = ((ps_proc->ps_ngbr_avbl->u1_mb_b)
404                     && (u4_constrained_intra_pred ? ps_top_mb_syn_ele->u2_is_intra : 1));
405     if (u1_mb_b)
406     {
407         ps_codec->pf_mem_cpy_mul8(pu1_ngbr_pels_i16+16+1,pu1_mb_b,16);
408     }
409     else
410     {
411         ps_codec->pf_mem_set_mul8(pu1_ngbr_pels_i16+16+1,0,MB_SIZE);
412     }
413     /* topleft pels */
414     u1_mb_d = ((ps_proc->ps_ngbr_avbl->u1_mb_d)
415                     && (u4_constrained_intra_pred ? ps_proc->s_top_left_mb_syntax_ele.u2_is_intra : 1));
416     if (u1_mb_d)
417     {
418         pu1_ngbr_pels_i16[16] = *pu1_mb_d;
419     }
420     else
421     {
422         pu1_ngbr_pels_i16[16] = 0;
423     }
424 
425     i4_ngbr_avbl = (u1_mb_a) + (u1_mb_b << 2) + (u1_mb_d << 1);
426     ps_proc->i4_ngbr_avbl_16x16_mb = i4_ngbr_avbl;
427 
428     /* set valid intra modes for evaluation */
429     u4_valid_intra_modes = u1_valid_intra_modes[i4_ngbr_avbl];
430 
431     if (ps_codec->s_cfg.u4_enc_speed_preset == IVE_FAST)
432         u4_valid_intra_modes &= ~(1 << PLANE_I16x16);
433 
434     /* evaluate b/w HORZ_I16x16, VERT_I16x16 & DC_I16x16 */
435     ps_codec->pf_ih264e_evaluate_intra16x16_modes(pu1_curr_mb, pu1_ngbr_pels_i16, pu1_pred_mb_intra_16x16,
436                                                   i4_src_strd, i4_pred_strd,
437                                                   i4_ngbr_avbl, &u4_intra_mode, &i4_mb_distortion_least,
438                                                   u4_valid_intra_modes);
439 
440     /* cost = distortion + lambda*rate */
441     i4_mb_cost_least = i4_mb_distortion_least;
442 
443     if ((( (u4_valid_intra_modes >> 3) & 1) != 0) && (ps_codec->s_cfg.u4_enc_speed_preset != IVE_FASTEST ||
444                     ps_proc->i4_slice_type == ISLICE))
445     {
446         /* intra prediction for PLANE mode*/
447         (ps_codec->apf_intra_pred_16_l)[PLANE_I16x16](pu1_ngbr_pels_i16, pu1_pred_mb_intra_16x16_plane, 0, i4_pred_strd, i4_ngbr_avbl);
448 
449         /* evaluate distortion between the actual blk and the estimated blk for the given mode */
450         ps_codec->apf_compute_sad_16x16[u4_enable_fast_sad](pu1_curr_mb, pu1_pred_mb_intra_16x16_plane, i4_src_strd, i4_pred_strd, i4_mb_cost_least, &i4_mb_distortion);
451 
452         /* cost = distortion + lambda*rate */
453         i4_mb_cost = i4_mb_distortion;
454 
455         /* update the least cost information if necessary */
456         if(i4_mb_cost < i4_mb_distortion_least)
457         {
458             u4_intra_mode = PLANE_I16x16;
459 
460             i4_mb_cost_least = i4_mb_cost;
461             i4_mb_distortion_least = i4_mb_distortion;
462         }
463     }
464 
465     u4_best_intra_16x16_mode = u4_intra_mode;
466 
467     DEBUG("%d partition cost, %d intra mode\n", i4_mb_cost_least * 32, u4_best_intra_16x16_mode);
468 
469     ps_proc->u1_l_i16_mode = u4_best_intra_16x16_mode;
470 
471     /* cost = distortion + lambda*rate */
472     i4_mb_cost_least    = i4_mb_distortion_least + u4_lambda*u1_uev_codelength[offset + u4_best_intra_16x16_mode];
473 
474 
475     /* update the type of the mb if necessary */
476     if (i4_mb_cost_least < ps_proc->i4_mb_cost)
477     {
478         ps_proc->i4_mb_cost = i4_mb_cost_least;
479         ps_proc->i4_mb_distortion = i4_mb_distortion_least;
480         ps_proc->u4_mb_type = I16x16;
481     }
482 
483     return ;
484 }
485 
486 
487 /**
488 ******************************************************************************
489 *
490 * @brief
491 *  evaluate best intra 8x8 mode (rate distortion opt on)
492 *
493 * @par Description
494 *  This function evaluates all the possible intra 8x8 modes and finds the mode
495 *  that best represents the macro-block (least distortion) and occupies fewer
496 *  bits in the bit-stream.
497 *
498 * @param[in]    ps_proc_ctxt
499 *  pointer to proc ctxt
500 *
501 * @remarks Ideally the cost of encoding a macroblock is calculated as
502 *  (distortion + lambda*rate). Where distortion is SAD/SATD,... between the
503 *  input block and the reconstructed block and rate is the number of bits taken
504 *  to place the macroblock in the bit-stream. In this routine the rate does not
505 *  exactly point to the total number of bits it takes, rather it points to header
506 *  bits necessary for encoding the macroblock. Assuming the deltaQP, cbp bits
507 *  and residual bits fall in to texture bits the number of bits taken to encoding
508 *  mbtype is considered as rate, we compute cost. Further we will approximate
509 *  the distortion as the deviation b/w input and the predicted block as opposed
510 *  to input and reconstructed block.
511 *
512 *  NOTE: TODO: This function needs to be tested
513 *
514 *  @return      none
515 *
516 ******************************************************************************
517 */
ih264e_evaluate_intra8x8_modes_for_least_cost_rdoptoff(process_ctxt_t * ps_proc)518 void ih264e_evaluate_intra8x8_modes_for_least_cost_rdoptoff(process_ctxt_t *ps_proc)
519 {
520     /* Codec Context */
521     codec_t *ps_codec = ps_proc->ps_codec;
522 
523     /* SAD(distortion metric) of an 4x4 block */
524     WORD32 i4_partition_distortion, i4_partition_distortion_least = INT_MAX, i4_total_distortion = 0;
525 
526     /* lambda */
527     UWORD32 u4_lambda = ps_proc->u4_lambda;
528 
529     /* cost = distortion + lambda*rate */
530     WORD32 i4_partition_cost, i4_partition_cost_least, i4_total_cost = u4_lambda;
531 
532     /* cost due to mbtype */
533     UWORD32 u4_cost_one_bit = u4_lambda, u4_cost_four_bits = 4 * u4_lambda;
534 
535     /* intra mode */
536     UWORD32 u4_intra_mode, u4_best_intra_8x8_mode = DC_I8x8, u4_estimated_intra_8x8_mode;
537 
538     /* neighbor pels for intra prediction */
539     UWORD8 *pu1_ngbr_pels_i8 = ps_proc->au1_ngbr_pels;
540 
541     /* pointer to curr partition */
542     UWORD8 *pu1_mb_curr;
543 
544     /* pointer to prediction macro block */
545     UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb;
546 
547     /* strides */
548     WORD32 i4_src_strd = ps_proc->i4_src_strd;
549     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
550 
551     /* neighbors left, top, top right, top left */
552     UWORD8 *pu1_mb_a;
553     UWORD8 *pu1_mb_b;
554     UWORD8 *pu1_mb_d;
555 
556     /* neighbor availability */
557     WORD32 i4_ngbr_avbl;
558     block_neighbors_t s_ngbr_avbl;
559 
560     /* temp vars */
561     UWORD32  b8, u4_pix_x, u4_pix_y;
562     UWORD32 u4_constrained_intra_pred = ps_proc->ps_codec->s_cfg.u4_constrained_intra_pred;
563     block_neighbors_t s_ngbr_avbl_MB;
564 
565     /* ngbr mb syntax information */
566     UWORD8 *pu1_top_mb_intra_modes = ps_proc->pu1_top_mb_intra_modes + (ps_proc->i4_mb_x << 4);
567     mb_info_t *ps_top_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
568     mb_info_t *ps_top_right_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
569     /* valid intra modes map */
570     UWORD32 u4_valid_intra_modes;
571 
572     if (ps_proc->ps_ngbr_avbl->u1_mb_c)
573     {
574         ps_top_right_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + (ps_proc->i4_mb_x + 1);
575     }
576     /* left pels */
577     s_ngbr_avbl_MB.u1_mb_a = ((ps_proc->ps_ngbr_avbl->u1_mb_a)
578                                   && (u4_constrained_intra_pred ? ps_proc->s_left_mb_syntax_ele.u2_is_intra : 1));
579 
580     /* top pels */
581     s_ngbr_avbl_MB.u1_mb_b = ((ps_proc->ps_ngbr_avbl->u1_mb_b)
582                                   && (u4_constrained_intra_pred ? ps_top_mb_syn_ele->u2_is_intra : 1));
583 
584     /* topleft pels */
585     s_ngbr_avbl_MB.u1_mb_d = ((ps_proc->ps_ngbr_avbl->u1_mb_d)
586                                   && (u4_constrained_intra_pred ? ps_proc->s_top_left_mb_syntax_ele.u2_is_intra : 1));
587 
588     /* top right */
589     s_ngbr_avbl_MB.u1_mb_c = ((ps_proc->ps_ngbr_avbl->u1_mb_c)
590                                   && (u4_constrained_intra_pred ? ps_top_right_mb_syn_ele->u2_is_intra : 1));
591 
592 
593     for(b8 = 0; b8 < 4; b8++)
594     {
595         u4_pix_x = (b8 & 0x01) << 3;
596         u4_pix_y = (b8 >> 1) << 3;
597 
598         pu1_mb_curr = ps_proc->pu1_src_buf_luma + u4_pix_x + (u4_pix_y * i4_src_strd);
599         /* when rdopt is off, we use the input as reference for constructing prediction buffer */
600         /* as opposed to using the recon pels. (open loop intra prediction) */
601         pu1_mb_a = pu1_mb_curr - 1; /* pointer to left macro block */
602         pu1_mb_b = pu1_mb_curr - i4_src_strd; /* pointer to top macro block */
603         pu1_mb_d = pu1_mb_b - 1; /* pointer to top left macro block */
604 
605         /* locating neighbors that are available for prediction */
606         /* TODO : update the neighbor availability information basing on constrained intra pred information */
607         /* TODO : i4_ngbr_avbl is only being used in DC mode. Can the DC mode be split in to distinct routines */
608         /* basing on neighbors available and hence evade the computation of neighbor availability totally. */
609         s_ngbr_avbl.u1_mb_a = ih264e_derive_ngbr_avbl_of_mb_partitions(&s_ngbr_avbl_MB, u4_pix_x - 1, u4_pix_y); /* xD = -1, yD = 0 */
610         s_ngbr_avbl.u1_mb_b = ih264e_derive_ngbr_avbl_of_mb_partitions(&s_ngbr_avbl_MB, u4_pix_x, u4_pix_y - 1); /* xD = 0, yD = -1 */
611         s_ngbr_avbl.u1_mb_c = ih264e_derive_ngbr_avbl_of_mb_partitions(&s_ngbr_avbl_MB, u4_pix_x + 8, u4_pix_y - 1); /* xD = BLK_8x8_SIZE, yD = -1 */
612         s_ngbr_avbl.u1_mb_d = ih264e_derive_ngbr_avbl_of_mb_partitions(&s_ngbr_avbl_MB, u4_pix_x - 1, u4_pix_y - 1); /* xD = -1, yD = -1 */
613 
614         /* i4_ngbr_avbl = blk_a * LEFT_MB_AVAILABLE_MASK + blk_b * TOP_MB_AVAILABLE_MASK + blk_c * TOP_RIGHT_MB_AVAILABLE_MASK + blk_d * TOP_LEFT_MB_AVAILABLE_MASK */
615         i4_ngbr_avbl = (s_ngbr_avbl.u1_mb_a) + (s_ngbr_avbl.u1_mb_d << 1) + (s_ngbr_avbl.u1_mb_b << 2) +  (s_ngbr_avbl.u1_mb_c << 3) +
616                         (s_ngbr_avbl.u1_mb_a << 4);
617         /* if top partition is available and top right is not available for intra prediction, then */
618         /* padd top right samples using top sample and make top right also available */
619         /* i4_ngbr_avbl = (s_ngbr_avbl.u1_mb_a) + (s_ngbr_avbl.u1_mb_d << 1) + (s_ngbr_avbl.u1_mb_b << 2) +  ((s_ngbr_avbl.u1_mb_b | s_ngbr_avbl.u1_mb_c) << 3); */
620         ps_proc->ai4_neighbor_avail_8x8_subblks[b8] = i4_ngbr_avbl;
621 
622 
623         ih264_intra_pred_luma_8x8_mode_ref_filtering(pu1_mb_a, pu1_mb_b, pu1_mb_d, pu1_ngbr_pels_i8,
624                                                      i4_src_strd, i4_ngbr_avbl);
625 
626         i4_partition_cost_least = INT_MAX;
627         /* set valid intra modes for evaluation */
628         u4_valid_intra_modes = 0x1ff;
629 
630         if (!s_ngbr_avbl.u1_mb_b)
631         {
632             u4_valid_intra_modes &= ~(1 << VERT_I4x4);
633             u4_valid_intra_modes &= ~(1 << DIAG_DL_I4x4);
634             u4_valid_intra_modes &= ~(1 << VERT_L_I4x4);
635         }
636         if (!s_ngbr_avbl.u1_mb_a)
637         {
638             u4_valid_intra_modes &= ~(1 << HORZ_I4x4);
639             u4_valid_intra_modes &= ~(1 << HORZ_U_I4x4);
640         }
641         if (!s_ngbr_avbl.u1_mb_a || !s_ngbr_avbl.u1_mb_b || !s_ngbr_avbl.u1_mb_d)
642         {
643             u4_valid_intra_modes &= ~(1 << DIAG_DR_I4x4);
644             u4_valid_intra_modes &= ~(1 << VERT_R_I4x4);
645             u4_valid_intra_modes &= ~(1 << HORZ_D_I4x4);
646         }
647 
648         /* estimate the intra 8x8 mode for the current partition (for evaluating cost) */
649         if (!s_ngbr_avbl.u1_mb_a || !s_ngbr_avbl.u1_mb_b)
650         {
651             u4_estimated_intra_8x8_mode = DC_I8x8;
652         }
653         else
654         {
655             UWORD32 u4_left_intra_8x8_mode = DC_I8x8;
656             UWORD32 u4_top_intra_8x8_mode = DC_I8x8;
657 
658             if (u4_pix_x == 0)
659             {
660                 if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I8x8)
661                 {
662                     u4_left_intra_8x8_mode = ps_proc->au1_left_mb_intra_modes[b8+1];
663                 }
664                 else if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I4x4)
665                 {
666                     u4_left_intra_8x8_mode = ps_proc->au1_left_mb_intra_modes[(b8+1)*4+2];
667                 }
668             }
669             else
670             {
671                 u4_left_intra_8x8_mode = ps_proc->au1_intra_luma_mb_8x8_modes[b8-1];
672             }
673 
674             if (u4_pix_y == 0)
675             {
676                 if (ps_top_mb_syn_ele->u2_mb_type == I8x8)
677                 {
678                     u4_top_intra_8x8_mode = pu1_top_mb_intra_modes[b8+2];
679                 }
680                 else if (ps_top_mb_syn_ele->u2_mb_type == I4x4)
681                 {
682                     u4_top_intra_8x8_mode = pu1_top_mb_intra_modes[(b8+2)*4+2];
683                 }
684             }
685             else
686             {
687                 u4_top_intra_8x8_mode = ps_proc->au1_intra_luma_mb_8x8_modes[b8-2];
688             }
689 
690             u4_estimated_intra_8x8_mode = MIN(u4_left_intra_8x8_mode, u4_top_intra_8x8_mode);
691         }
692 
693         /* perform intra mode 8x8 evaluation */
694         for (u4_intra_mode = VERT_I8x8; u4_valid_intra_modes != 0; u4_intra_mode++, u4_valid_intra_modes >>= 1)
695         {
696             if ( (u4_valid_intra_modes & 1) == 0)
697                 continue;
698 
699             /* intra prediction */
700             (ps_codec->apf_intra_pred_8_l)[u4_intra_mode](pu1_ngbr_pels_i8, pu1_pred_mb, 0, i4_pred_strd, i4_ngbr_avbl);
701 
702             /* evaluate distortion between the actual blk and the estimated blk for the given mode */
703             ime_compute_sad_8x8(pu1_mb_curr, pu1_pred_mb, i4_src_strd, i4_pred_strd, i4_partition_cost_least, &i4_partition_distortion);
704 
705             i4_partition_cost = i4_partition_distortion + ((u4_estimated_intra_8x8_mode == u4_intra_mode)?u4_cost_one_bit:u4_cost_four_bits);
706 
707             /* update the least cost information if necessary */
708             if (i4_partition_cost < i4_partition_cost_least)
709             {
710                 i4_partition_cost_least = i4_partition_cost;
711                 i4_partition_distortion_least = i4_partition_distortion;
712                 u4_best_intra_8x8_mode = u4_intra_mode;
713             }
714         }
715         /* macroblock distortion */
716         i4_total_cost += i4_partition_cost_least;
717         i4_total_distortion += i4_partition_distortion_least;
718         /* mb partition mode */
719         ps_proc->au1_intra_luma_mb_8x8_modes[b8] = u4_best_intra_8x8_mode;
720 
721     }
722 
723     /* update the type of the mb if necessary */
724     if (i4_total_cost < ps_proc->i4_mb_cost)
725     {
726         ps_proc->i4_mb_cost = i4_total_cost;
727         ps_proc->i4_mb_distortion = i4_total_distortion;
728         ps_proc->u4_mb_type = I8x8;
729     }
730 
731     return ;
732 }
733 
734 
735 /**
736 ******************************************************************************
737 *
738 * @brief
739 *  evaluate best intra 4x4 mode (rate distortion opt off)
740 *
741 * @par Description
742 *  This function evaluates all the possible intra 4x4 modes and finds the mode
743 *  that best represents the macro-block (least distortion) and occupies fewer
744 *  bits in the bit-stream.
745 *
746 * @param[in]    ps_proc_ctxt
747 *  pointer to proc ctxt
748 *
749 * @remarks
750 *  Ideally the cost of encoding a macroblock is calculated as
751 *  (distortion + lambda*rate). Where distortion is SAD/SATD,... between the
752 *  input block and the reconstructed block and rate is the number of bits taken
753 *  to place the macroblock in the bit-stream. In this routine the rate does not
754 *  exactly point to the total number of bits it takes, rather it points to header
755 *  bits necessary for encoding the macroblock. Assuming the deltaQP, cbp bits
756 *  and residual bits fall in to texture bits the number of bits taken to encoding
757 *  mbtype is considered as rate, we compute cost. Further we will approximate
758 *  the distortion as the deviation b/w input and the predicted block as opposed
759 *  to input and reconstructed block.
760 *
761 *  NOTE: As per the Document JVT-O079, for the whole intra 4x4 macroblock,
762 *  24*lambda is added to the SAD before comparison with the best SAD for
763 *  inter prediction. This is an empirical value to prevent using too many intra
764 *  blocks.
765 *
766 * @return      none
767 *
768 ******************************************************************************
769 */
ih264e_evaluate_intra4x4_modes_for_least_cost_rdoptoff(process_ctxt_t * ps_proc)770 void ih264e_evaluate_intra4x4_modes_for_least_cost_rdoptoff(process_ctxt_t *ps_proc)
771 {
772     /* Codec Context */
773     codec_t *ps_codec = ps_proc->ps_codec;
774 
775     /* SAD(distortion metric) of an 4x4 block */
776     WORD32 i4_partition_distortion_least = INT_MAX, i4_total_distortion = 0;
777 
778     /* lambda */
779     UWORD32 u4_lambda = ps_proc->u4_lambda;
780 
781     /* cost = distortion + lambda*rate */
782     WORD32 i4_partition_cost_least, i4_total_cost = (24 + 1) * u4_lambda;
783 
784     /* cost due to mbtype */
785     UWORD32 u4_cost_one_bit = u4_lambda, u4_cost_four_bits = 4 * u4_lambda;
786 
787     /* intra mode */
788     UWORD32 u4_best_intra_4x4_mode = DC_I4x4, u4_estimated_intra_4x4_mode;
789 
790     /* neighbor pels for intra prediction */
791     UWORD8 *pu1_ngbr_pels_i4 = ps_proc->au1_ngbr_pels;
792 
793     /* pointer to curr partition */
794     UWORD8 *pu1_mb_curr;
795 
796     /* pointer to prediction macro block */
797     UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb;
798 
799     /* strides */
800     WORD32 i4_src_strd = ps_proc->i4_src_strd;
801     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
802 
803     /* neighbors left, top, top right, top left */
804     UWORD8 *pu1_mb_a;
805     UWORD8 *pu1_mb_b;
806     UWORD8 *pu1_mb_c;
807     UWORD8 *pu1_mb_d;
808 
809     /* neighbor availability */
810     WORD32 i4_ngbr_avbl;
811     block_neighbors_t s_ngbr_avbl;
812 
813     /* temp vars */
814     UWORD32 i, b8, b4, u4_blk_x, u4_blk_y, u4_pix_x, u4_pix_y;
815 
816     /* scan order inside 4x4 block */
817     const UWORD8 u1_scan_order[16] = {0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15};
818 
819     /* ngbr sub mb modes */
820     UWORD8 *pu1_top_mb_intra_modes = ps_proc->pu1_top_mb_intra_modes + (ps_proc->i4_mb_x << 4);
821     mb_info_t *ps_top_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
822     mb_info_t *ps_top_right_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
823 
824     /* valid intra modes map */
825     UWORD32 u4_valid_intra_modes;
826     UWORD16 u2_valid_modes[8] = {4, 262, 4, 262, 141, 399, 141, 511};
827 
828     UWORD32 u4_constrained_intra_pred = ps_proc->ps_codec->s_cfg.u4_constrained_intra_pred;
829     UWORD8 u1_mb_a, u1_mb_b, u1_mb_c, u1_mb_d;
830     if (ps_proc->ps_ngbr_avbl->u1_mb_c)
831     {
832         ps_top_right_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x + 1;
833     }
834     /* left pels */
835     u1_mb_a = ((ps_proc->ps_ngbr_avbl->u1_mb_a)
836                     && (u4_constrained_intra_pred ? ps_proc->s_left_mb_syntax_ele.u2_is_intra : 1));
837 
838     /* top pels */
839     u1_mb_b = ((ps_proc->ps_ngbr_avbl->u1_mb_b)
840                     && (u4_constrained_intra_pred ? ps_top_mb_syn_ele->u2_is_intra : 1));
841 
842     /* topleft pels */
843     u1_mb_d = ((ps_proc->ps_ngbr_avbl->u1_mb_d)
844                     && (u4_constrained_intra_pred ? ps_proc->s_top_left_mb_syntax_ele.u2_is_intra : 1));
845 
846     /* top right */
847     u1_mb_c = ((ps_proc->ps_ngbr_avbl->u1_mb_c)
848                     && (u4_constrained_intra_pred ? ps_top_right_mb_syn_ele->u2_is_intra : 1));
849 
850     i4_ngbr_avbl = (u1_mb_a) + (u1_mb_d << 1) + (u1_mb_b << 2) + (u1_mb_c << 3);
851     memcpy(ps_proc->au1_ngbr_avbl_4x4_subblks, gau1_ih264_4x4_ngbr_avbl[i4_ngbr_avbl], 16);
852 
853     for (b8 = 0; b8 < 4; b8++)
854     {
855         u4_blk_x = (b8 & 0x01) << 3;
856         u4_blk_y = (b8 >> 1) << 3;
857         for (b4 = 0; b4 < 4; b4++)
858         {
859             u4_pix_x = u4_blk_x + ((b4 & 0x01) << 2);
860             u4_pix_y = u4_blk_y + ((b4 >> 1) << 2);
861 
862             pu1_mb_curr = ps_proc->pu1_src_buf_luma + u4_pix_x + (u4_pix_y * i4_src_strd);
863             /* when rdopt is off, we use the input as reference for constructing prediction buffer */
864             /* as opposed to using the recon pels. (open loop intra prediction) */
865             pu1_mb_a = pu1_mb_curr - 1; /* pointer to left macro block */
866             pu1_mb_b = pu1_mb_curr - i4_src_strd; /* pointer to top macro block */
867             pu1_mb_c = pu1_mb_b + 4; /* pointer to top macro block */
868             pu1_mb_d = pu1_mb_b - 1; /* pointer to top left macro block */
869 
870             /* locating neighbors that are available for prediction */
871             /* TODO : update the neighbor availability information basing on constrained intra pred information */
872             /* TODO : i4_ngbr_avbl is only being used in DC mode. Can the DC mode be split in to distinct routines */
873             /* basing on neighbors available and hence evade the computation of neighbor availability totally. */
874 
875             i4_ngbr_avbl = ps_proc->au1_ngbr_avbl_4x4_subblks[(b8 << 2) + b4];
876             s_ngbr_avbl.u1_mb_a = (i4_ngbr_avbl & 0x1);
877             s_ngbr_avbl.u1_mb_d = (i4_ngbr_avbl & 0x2) >> 1;
878             s_ngbr_avbl.u1_mb_b = (i4_ngbr_avbl & 0x4) >> 2;
879             s_ngbr_avbl.u1_mb_c = (i4_ngbr_avbl & 0x8) >> 3;
880             /* set valid intra modes for evaluation */
881             u4_valid_intra_modes = u2_valid_modes[i4_ngbr_avbl & 0x7];
882 
883             /* if top partition is available and top right is not available for intra prediction, then */
884             /* padd top right samples using top sample and make top right also available */
885             /* i4_ngbr_avbl = (s_ngbr_avbl.u1_mb_a) + (s_ngbr_avbl.u1_mb_d << 1) + (s_ngbr_avbl.u1_mb_b << 2) + ((s_ngbr_avbl.u1_mb_b | s_ngbr_avbl.u1_mb_c) << 3); */
886 
887             /* gather prediction pels from the neighbors */
888             if (s_ngbr_avbl.u1_mb_a)
889             {
890                 for(i = 0; i < 4; i++)
891                     pu1_ngbr_pels_i4[4 - 1 -i] = pu1_mb_a[i * i4_src_strd];
892             }
893             else
894             {
895                 memset(pu1_ngbr_pels_i4, 0, 4);
896             }
897 
898             if (s_ngbr_avbl.u1_mb_b)
899             {
900                 memcpy(pu1_ngbr_pels_i4 + 4 + 1, pu1_mb_b, 4);
901             }
902             else
903             {
904                 memset(pu1_ngbr_pels_i4 + 5, 0, 4);
905             }
906 
907             if (s_ngbr_avbl.u1_mb_d)
908                 pu1_ngbr_pels_i4[4] = *pu1_mb_d;
909             else
910                 pu1_ngbr_pels_i4[4] = 0;
911 
912             if (s_ngbr_avbl.u1_mb_c)
913             {
914                 memcpy(pu1_ngbr_pels_i4 + 8 + 1, pu1_mb_c, 4);
915             }
916             else if (s_ngbr_avbl.u1_mb_b)
917             {
918                 memset(pu1_ngbr_pels_i4 + 8 + 1, pu1_ngbr_pels_i4[8], 4);
919                 s_ngbr_avbl.u1_mb_c = s_ngbr_avbl.u1_mb_b;
920             }
921 
922             i4_partition_cost_least = INT_MAX;
923 
924             /* predict the intra 4x4 mode for the current partition (for evaluating cost) */
925             if (!s_ngbr_avbl.u1_mb_a || !s_ngbr_avbl.u1_mb_b)
926             {
927                 u4_estimated_intra_4x4_mode = DC_I4x4;
928             }
929             else
930             {
931                 UWORD32 u4_left_intra_4x4_mode = DC_I4x4;
932                 UWORD32 u4_top_intra_4x4_mode = DC_I4x4;
933 
934                 if (u4_pix_x == 0)
935                 {
936                     if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I4x4)
937                     {
938                         u4_left_intra_4x4_mode = ps_proc->au1_left_mb_intra_modes[u1_scan_order[3 + u4_pix_y]];
939                     }
940                     else if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I8x8)
941                     {
942                         u4_left_intra_4x4_mode = ps_proc->au1_left_mb_intra_modes[b8 + 1];
943                     }
944                 }
945                 else
946                 {
947                     u4_left_intra_4x4_mode = ps_proc->au1_intra_luma_mb_4x4_modes[u1_scan_order[(u4_pix_x >> 2) + u4_pix_y - 1]];
948                 }
949 
950                 if (u4_pix_y == 0)
951                 {
952                     if (ps_top_mb_syn_ele->u2_mb_type == I4x4)
953                     {
954                         u4_top_intra_4x4_mode = pu1_top_mb_intra_modes[u1_scan_order[12 + (u4_pix_x >> 2)]];
955                     }
956                     else if (ps_top_mb_syn_ele->u2_mb_type == I8x8)
957                     {
958                         u4_top_intra_4x4_mode = pu1_top_mb_intra_modes[b8 + 2];
959                     }
960                 }
961                 else
962                 {
963                     u4_top_intra_4x4_mode = ps_proc->au1_intra_luma_mb_4x4_modes[u1_scan_order[(u4_pix_x >> 2) + u4_pix_y - 4]];
964                 }
965 
966                 u4_estimated_intra_4x4_mode = MIN(u4_left_intra_4x4_mode, u4_top_intra_4x4_mode);
967             }
968 
969             ps_proc->au1_predicted_intra_luma_mb_4x4_modes[(b8 << 2) + b4] = u4_estimated_intra_4x4_mode;
970 
971             /* mode evaluation and prediction */
972             ps_codec->pf_ih264e_evaluate_intra_4x4_modes(pu1_mb_curr,
973                                                          pu1_ngbr_pels_i4,
974                                                          pu1_pred_mb, i4_src_strd,
975                                                          i4_pred_strd, i4_ngbr_avbl,
976                                                          &u4_best_intra_4x4_mode,
977                                                          &i4_partition_cost_least,
978                                                          u4_valid_intra_modes,
979                                                          u4_lambda,
980                                                          u4_estimated_intra_4x4_mode);
981 
982 
983             i4_partition_distortion_least = i4_partition_cost_least - ((u4_estimated_intra_4x4_mode == u4_best_intra_4x4_mode) ? u4_cost_one_bit : u4_cost_four_bits);
984 
985             DEBUG("%d partition cost, %d intra mode\n", i4_partition_cost_least, u4_best_intra_4x4_mode);
986             /* macroblock distortion */
987             i4_total_distortion += i4_partition_distortion_least;
988             i4_total_cost += i4_partition_cost_least;
989             /* mb partition mode */
990             ps_proc->au1_intra_luma_mb_4x4_modes[(b8 << 2) + b4] = u4_best_intra_4x4_mode;
991         }
992     }
993 
994     /* update the type of the mb if necessary */
995     if (i4_total_cost < ps_proc->i4_mb_cost)
996     {
997         ps_proc->i4_mb_cost = i4_total_cost;
998         ps_proc->i4_mb_distortion = i4_total_distortion;
999         ps_proc->u4_mb_type = I4x4;
1000     }
1001 
1002     return ;
1003 }
1004 
1005 /**
1006 ******************************************************************************
1007 *
1008 * @brief evaluate best intra 4x4 mode (rate distortion opt on)
1009 *
1010 * @par Description
1011 *  This function evaluates all the possible intra 4x4 modes and finds the mode
1012 *  that best represents the macro-block (least distortion) and occupies fewer
1013 *  bits in the bit-stream.
1014 *
1015 * @param[in]    ps_proc_ctxt
1016 *  pointer to proc ctxt
1017 *
1018 * @remarks
1019 *  Ideally the cost of encoding a macroblock is calculated as
1020 *  (distortion + lambda*rate). Where distortion is SAD/SATD,... between the
1021 *  input block and the reconstructed block and rate is the number of bits taken
1022 *  to place the macroblock in the bit-stream. In this routine the rate does not
1023 *  exactly point to the total number of bits it takes, rather it points to header
1024 *  bits necessary for encoding the macroblock. Assuming the deltaQP, cbp bits
1025 *  and residual bits fall in to texture bits the number of bits taken to encoding
1026 *  mbtype is considered as rate, we compute cost. Further we will approximate
1027 *  the distortion as the deviation b/w input and the predicted block as opposed
1028 *  to input and reconstructed block.
1029 *
1030 *  NOTE: As per the Document JVT-O079, for the whole intra 4x4 macroblock,
1031 *  24*lambda is added to the SAD before comparison with the best SAD for
1032 *  inter prediction. This is an empirical value to prevent using too many intra
1033 *  blocks.
1034 *
1035 * @return      none
1036 *
1037 ******************************************************************************
1038 */
ih264e_evaluate_intra4x4_modes_for_least_cost_rdopton(process_ctxt_t * ps_proc)1039 void ih264e_evaluate_intra4x4_modes_for_least_cost_rdopton(process_ctxt_t *ps_proc)
1040 {
1041     /* Codec Context */
1042     codec_t *ps_codec = ps_proc->ps_codec;
1043 
1044     /* SAD(distortion metric) of an 4x4 block */
1045     WORD32 i4_partition_distortion_least = INT_MAX, i4_total_distortion = 0;
1046 
1047     /* lambda */
1048     UWORD32 u4_lambda = ps_proc->u4_lambda;
1049 
1050     /* cost = distortion + lambda*rate */
1051     WORD32 i4_partition_cost_least, i4_total_cost = (24 + 1) * u4_lambda;
1052 
1053     /* cost due to mbtype */
1054     UWORD32 u4_cost_one_bit = u4_lambda, u4_cost_four_bits = 4 * u4_lambda;
1055 
1056     /* intra mode */
1057     UWORD32 u4_best_intra_4x4_mode = DC_I4x4, u4_estimated_intra_4x4_mode;
1058 
1059     /* neighbor pels for intra prediction */
1060     UWORD8 *pu1_ngbr_pels_i4 = ps_proc->au1_ngbr_pels;
1061 
1062     /* pointer to curr partition */
1063     UWORD8 *pu1_mb_curr;
1064     UWORD8 *pu1_mb_ref_left, *pu1_mb_ref_top;
1065     UWORD8 *pu1_ref_mb_intra_4x4;
1066 
1067     /* pointer to residual macro block */
1068     WORD16 *pi2_res_mb = ps_proc->pi2_res_buf_intra_4x4;
1069 
1070     /* pointer to prediction macro block */
1071     UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb;
1072 
1073     /* strides */
1074     WORD32 i4_src_strd = ps_proc->i4_src_strd;
1075     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
1076     WORD32 i4_ref_strd_left, i4_ref_strd_top;
1077 
1078     /* neighbors left, top, top right, top left */
1079     UWORD8 *pu1_mb_a;
1080     UWORD8 *pu1_mb_b;
1081     UWORD8 *pu1_mb_c;
1082     UWORD8 *pu1_mb_d;
1083 
1084     /* number of non zero coeffs*/
1085     UWORD8  *pu1_nnz = (UWORD8 *)ps_proc->au4_nnz_intra_4x4;
1086 
1087     /* quantization parameters */
1088     quant_params_t *ps_qp_params = ps_proc->ps_qp_params[0];
1089 
1090     /* neighbor availability */
1091     WORD32 i4_ngbr_avbl;
1092     block_neighbors_t s_ngbr_avbl;
1093 
1094     /* temp vars */
1095     UWORD32 i, b8, b4, u4_blk_x, u4_blk_y, u4_pix_x, u4_pix_y;
1096 
1097     /* scan order inside 4x4 block */
1098     const UWORD8 u1_scan_order[16] = {0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15};
1099 
1100     /* ngbr sub mb modes */
1101     UWORD8 *pu1_top_mb_intra_modes = ps_proc->pu1_top_mb_intra_modes + (ps_proc->i4_mb_x << 4);
1102     mb_info_t *ps_top_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
1103     mb_info_t *ps_top_right_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
1104 
1105     /* valid intra modes map */
1106     UWORD32 u4_valid_intra_modes;
1107     UWORD16 u2_valid_modes[8] = {4, 262, 4, 262, 141, 399, 141, 511};
1108 
1109     /* Dummy variable for 4x4 trans function */
1110     WORD16 i2_dc_dummy;
1111     UWORD8 u1_mb_a, u1_mb_b, u1_mb_c, u1_mb_d;
1112     UWORD32 u4_constrained_intra_pred = ps_proc->ps_codec->s_cfg.u4_constrained_intra_pred;
1113 
1114     /* compute ngbr availability for sub blks */
1115     if (ps_proc->ps_ngbr_avbl->u1_mb_c)
1116     {
1117         ps_top_right_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + (ps_proc->i4_mb_x + 1);
1118     }
1119 
1120     /* left pels */
1121     u1_mb_a = ((ps_proc->ps_ngbr_avbl->u1_mb_a)
1122                     && (u4_constrained_intra_pred ? ps_proc->s_left_mb_syntax_ele.u2_is_intra : 1));
1123 
1124        /* top pels */
1125     u1_mb_b = ((ps_proc->ps_ngbr_avbl->u1_mb_b)
1126                     && (u4_constrained_intra_pred ? ps_top_mb_syn_ele->u2_is_intra : 1));
1127 
1128        /* topleft pels */
1129     u1_mb_d = ((ps_proc->ps_ngbr_avbl->u1_mb_d)
1130                     && (u4_constrained_intra_pred ? ps_proc->s_top_left_mb_syntax_ele.u2_is_intra : 1));
1131 
1132        /* top right pels */
1133     u1_mb_c = ((ps_proc->ps_ngbr_avbl->u1_mb_c)
1134                     && (u4_constrained_intra_pred ? ps_top_right_mb_syn_ele->u2_is_intra : 1));
1135 
1136     i4_ngbr_avbl = (u1_mb_a) + (u1_mb_d << 1) + (u1_mb_b << 2) + (u1_mb_c << 3);
1137     memcpy(ps_proc->au1_ngbr_avbl_4x4_subblks, gau1_ih264_4x4_ngbr_avbl[i4_ngbr_avbl], 16);
1138 
1139     for(b8 = 0; b8 < 4; b8++)
1140     {
1141         u4_blk_x = (b8 & 0x01) << 3;
1142         u4_blk_y = (b8 >> 1) << 3;
1143         for(b4 = 0; b4 < 4; b4++, pu1_nnz++, pi2_res_mb += MB_SIZE)
1144         {
1145             u4_pix_x = u4_blk_x + ((b4 & 0x01) << 2);
1146             u4_pix_y = u4_blk_y + ((b4 >> 1) << 2);
1147 
1148             pu1_ref_mb_intra_4x4 = ps_proc->pu1_ref_mb_intra_4x4 + u4_pix_x + (u4_pix_y * i4_pred_strd);
1149             pu1_mb_curr = ps_proc->pu1_src_buf_luma + u4_pix_x + (u4_pix_y * i4_src_strd);
1150             if (u4_pix_x == 0)
1151             {
1152                 i4_ref_strd_left = ps_proc->i4_rec_strd;
1153                 pu1_mb_ref_left = ps_proc->pu1_rec_buf_luma + u4_pix_x + (u4_pix_y * i4_ref_strd_left);
1154             }
1155             else
1156             {
1157                 i4_ref_strd_left = i4_pred_strd;
1158                 pu1_mb_ref_left = pu1_ref_mb_intra_4x4;
1159             }
1160             if (u4_pix_y == 0)
1161             {
1162                 i4_ref_strd_top = ps_proc->i4_rec_strd;
1163                 pu1_mb_ref_top = ps_proc->pu1_rec_buf_luma + u4_pix_x + (u4_pix_y * i4_ref_strd_top);
1164             }
1165             else
1166             {
1167                 i4_ref_strd_top = i4_pred_strd;
1168                 pu1_mb_ref_top = pu1_ref_mb_intra_4x4;
1169             }
1170 
1171             pu1_mb_a = pu1_mb_ref_left - 1; /* pointer to left macro block */
1172             pu1_mb_b = pu1_mb_ref_top - i4_ref_strd_top; /* pointer to top macro block */
1173             pu1_mb_c = pu1_mb_b + 4; /* pointer to top right macro block */
1174             if (u4_pix_y == 0)
1175                 pu1_mb_d = pu1_mb_b - 1;
1176             else
1177                 pu1_mb_d = pu1_mb_a - i4_ref_strd_left; /* pointer to top left macro block */
1178 
1179             /* locating neighbors that are available for prediction */
1180             /* TODO : update the neighbor availability information basing on constrained intra pred information */
1181             /* TODO : i4_ngbr_avbl is only being used in DC mode. Can the DC mode be split in to distinct routines */
1182             /* basing on neighbors available and hence evade the computation of neighbor availability totally. */
1183 
1184             i4_ngbr_avbl = ps_proc->au1_ngbr_avbl_4x4_subblks[(b8 << 2) + b4];
1185             s_ngbr_avbl.u1_mb_a = (i4_ngbr_avbl & 0x1);
1186             s_ngbr_avbl.u1_mb_d = (i4_ngbr_avbl & 0x2) >> 1;
1187             s_ngbr_avbl.u1_mb_b = (i4_ngbr_avbl & 0x4) >> 2;
1188             s_ngbr_avbl.u1_mb_c = (i4_ngbr_avbl & 0x8) >> 3;
1189             /* set valid intra modes for evaluation */
1190             u4_valid_intra_modes = u2_valid_modes[i4_ngbr_avbl & 0x7];
1191 
1192             /* if top partition is available and top right is not available for intra prediction, then */
1193             /* padd top right samples using top sample and make top right also available */
1194             /* i4_ngbr_avbl = (s_ngbr_avbl.u1_mb_a) + (s_ngbr_avbl.u1_mb_d << 1) + (s_ngbr_avbl.u1_mb_b << 2) + ((s_ngbr_avbl.u1_mb_b | s_ngbr_avbl.u1_mb_c) << 3); */
1195 
1196             /* gather prediction pels from the neighbors */
1197             if (s_ngbr_avbl.u1_mb_a)
1198             {
1199                 for(i = 0; i < 4; i++)
1200                     pu1_ngbr_pels_i4[4 - 1 -i] = pu1_mb_a[i * i4_ref_strd_left];
1201             }
1202             else
1203             {
1204                 memset(pu1_ngbr_pels_i4,0,4);
1205             }
1206             if(s_ngbr_avbl.u1_mb_b)
1207             {
1208                 memcpy(pu1_ngbr_pels_i4 + 4 + 1, pu1_mb_b, 4);
1209             }
1210             else
1211             {
1212                 memset(pu1_ngbr_pels_i4 + 4 + 1, 0, 4);
1213             }
1214             if (s_ngbr_avbl.u1_mb_d)
1215                 pu1_ngbr_pels_i4[4] = *pu1_mb_d;
1216             else
1217                 pu1_ngbr_pels_i4[4] = 0;
1218             if (s_ngbr_avbl.u1_mb_c)
1219             {
1220                 memcpy(pu1_ngbr_pels_i4 + 8 + 1, pu1_mb_c, 4);
1221             }
1222             else if (s_ngbr_avbl.u1_mb_b)
1223             {
1224                 memset(pu1_ngbr_pels_i4 + 8 + 1, pu1_ngbr_pels_i4[8], 4);
1225                 s_ngbr_avbl.u1_mb_c = s_ngbr_avbl.u1_mb_b;
1226             }
1227 
1228             i4_partition_cost_least = INT_MAX;
1229 
1230             /* predict the intra 4x4 mode for the current partition (for evaluating cost) */
1231             if (!s_ngbr_avbl.u1_mb_a || !s_ngbr_avbl.u1_mb_b)
1232             {
1233                 u4_estimated_intra_4x4_mode = DC_I4x4;
1234             }
1235             else
1236             {
1237                 UWORD32 u4_left_intra_4x4_mode = DC_I4x4;
1238                 UWORD32 u4_top_intra_4x4_mode = DC_I4x4;
1239 
1240                 if (u4_pix_x == 0)
1241                 {
1242                     if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I4x4)
1243                     {
1244                         u4_left_intra_4x4_mode = ps_proc->au1_left_mb_intra_modes[u1_scan_order[3 + u4_pix_y]];
1245                     }
1246                     else if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I8x8)
1247                     {
1248                         u4_left_intra_4x4_mode = ps_proc->au1_left_mb_intra_modes[b8 + 1];
1249                     }
1250                 }
1251                 else
1252                 {
1253                     u4_left_intra_4x4_mode = ps_proc->au1_intra_luma_mb_4x4_modes[u1_scan_order[(u4_pix_x >> 2) + u4_pix_y - 1]];
1254                 }
1255 
1256                 if (u4_pix_y == 0)
1257                 {
1258                     if (ps_top_mb_syn_ele->u2_mb_type == I4x4)
1259                     {
1260                         u4_top_intra_4x4_mode = pu1_top_mb_intra_modes[u1_scan_order[12 + (u4_pix_x >> 2)]];
1261                     }
1262                     else if (ps_top_mb_syn_ele->u2_mb_type == I8x8)
1263                     {
1264                         u4_top_intra_4x4_mode = pu1_top_mb_intra_modes[b8 + 2];
1265                     }
1266                 }
1267                 else
1268                 {
1269                     u4_top_intra_4x4_mode = ps_proc->au1_intra_luma_mb_4x4_modes[u1_scan_order[(u4_pix_x >> 2) + u4_pix_y - 4]];
1270                 }
1271 
1272                 u4_estimated_intra_4x4_mode = MIN(u4_left_intra_4x4_mode, u4_top_intra_4x4_mode);
1273             }
1274 
1275             ps_proc->au1_predicted_intra_luma_mb_4x4_modes[(b8 << 2) + b4] = u4_estimated_intra_4x4_mode;
1276 
1277             /*mode evaluation and prediction*/
1278             ps_codec->pf_ih264e_evaluate_intra_4x4_modes(pu1_mb_curr,
1279                                                          pu1_ngbr_pels_i4,
1280                                                          pu1_pred_mb, i4_src_strd,
1281                                                          i4_pred_strd, i4_ngbr_avbl,
1282                                                          &u4_best_intra_4x4_mode,
1283                                                          &i4_partition_cost_least,
1284                                                          u4_valid_intra_modes,
1285                                                          u4_lambda,
1286                                                          u4_estimated_intra_4x4_mode);
1287 
1288 
1289             i4_partition_distortion_least = i4_partition_cost_least - ((u4_estimated_intra_4x4_mode == u4_best_intra_4x4_mode)?u4_cost_one_bit:u4_cost_four_bits);
1290 
1291             DEBUG("%d partition cost, %d intra mode\n", i4_partition_cost_least, u4_best_intra_4x4_mode);
1292 
1293             /* macroblock distortion */
1294             i4_total_distortion += i4_partition_distortion_least;
1295             i4_total_cost += i4_partition_cost_least;
1296 
1297             /* mb partition mode */
1298             ps_proc->au1_intra_luma_mb_4x4_modes[(b8 << 2) + b4] = u4_best_intra_4x4_mode;
1299 
1300 
1301             /********************************************************/
1302             /*  error estimation,                                   */
1303             /*  transform                                           */
1304             /*  quantization                                        */
1305             /********************************************************/
1306             ps_codec->pf_resi_trans_quant_4x4(pu1_mb_curr, pu1_pred_mb,
1307                                               pi2_res_mb, i4_src_strd,
1308                                               i4_pred_strd,
1309                                               /* No op stride, this implies a buff of lenght 1x16 */
1310                                               ps_qp_params->pu2_scale_mat,
1311                                               ps_qp_params->pu2_thres_mat,
1312                                               ps_qp_params->u1_qbits,
1313                                               ps_qp_params->u4_dead_zone,
1314                                               pu1_nnz, &i2_dc_dummy);
1315 
1316             /********************************************************/
1317             /*  ierror estimation,                                  */
1318             /*  itransform                                          */
1319             /*  iquantization                                       */
1320             /********************************************************/
1321             ps_codec->pf_iquant_itrans_recon_4x4(pi2_res_mb, pu1_pred_mb,
1322                                                  pu1_ref_mb_intra_4x4,
1323                                                  i4_pred_strd, i4_pred_strd,
1324                                                  ps_qp_params->pu2_iscale_mat,
1325                                                  ps_qp_params->pu2_weigh_mat,
1326                                                  ps_qp_params->u1_qp_div,
1327                                                  ps_proc->pv_scratch_buff, 0,
1328                                                  NULL);
1329         }
1330     }
1331 
1332     /* update the type of the mb if necessary */
1333     if (i4_total_cost < ps_proc->i4_mb_cost)
1334     {
1335         ps_proc->i4_mb_cost = i4_total_cost;
1336         ps_proc->i4_mb_distortion = i4_total_distortion;
1337         ps_proc->u4_mb_type = I4x4;
1338     }
1339 
1340     return ;
1341 }
1342 
1343 /**
1344 ******************************************************************************
1345 *
1346 * @brief
1347 *  evaluate best chroma intra 8x8 mode (rate distortion opt off)
1348 *
1349 * @par Description
1350 *  This function evaluates all the possible chroma intra 8x8 modes and finds
1351 *  the mode that best represents the macroblock (least distortion) and occupies
1352 *  fewer bits in the bitstream.
1353 *
1354 * @param[in] ps_proc_ctxt
1355 *  pointer to macroblock context (handle)
1356 *
1357 * @remarks
1358 *  For chroma best intra pred mode is calculated based only on SAD
1359 *
1360 * @returns none
1361 *
1362 ******************************************************************************
1363 */
1364 
ih264e_evaluate_chroma_intra8x8_modes_for_least_cost_rdoptoff(process_ctxt_t * ps_proc)1365 void ih264e_evaluate_chroma_intra8x8_modes_for_least_cost_rdoptoff(process_ctxt_t *ps_proc)
1366 {
1367     /* Codec Context */
1368     codec_t *ps_codec = ps_proc->ps_codec;
1369 
1370     /* SAD(distortion metric) of an 8x8 block */
1371     WORD32 i4_mb_distortion, i4_chroma_mb_distortion;
1372 
1373     /* intra mode */
1374     UWORD32  u4_best_chroma_intra_8x8_mode = DC_CH_I8x8;
1375 
1376     /* neighbor pels for intra prediction */
1377     UWORD8 *pu1_ngbr_pels_c_i8x8 = ps_proc->au1_ngbr_pels;
1378 
1379     /* pointer to curr macro block */
1380     UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_chroma;
1381     UWORD8 *pu1_ref_mb = ps_proc->pu1_rec_buf_chroma;
1382 
1383     /* pointer to prediction macro block */
1384     UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb_intra_chroma;
1385     UWORD8 *pu1_pred_mb_plane = ps_proc->pu1_pred_mb_intra_chroma_plane;
1386 
1387     /* strides */
1388     WORD32 i4_src_strd_c = ps_proc->i4_src_chroma_strd;
1389     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
1390     WORD32 i4_rec_strd_c = ps_proc->i4_rec_strd;
1391 
1392     /* neighbors left, top, top left */
1393     UWORD8 *pu1_mb_a = pu1_ref_mb - 2;
1394     UWORD8 *pu1_mb_b = pu1_ref_mb - i4_rec_strd_c;
1395     UWORD8 *pu1_mb_d = pu1_mb_b - 2;
1396 
1397     /* neighbor availability */
1398     const UWORD8  u1_valid_intra_modes[8] = {1, 3, 1, 3, 5, 7, 5, 15};
1399     WORD32 i4_ngbr_avbl;
1400 
1401     /* valid intra modes map */
1402     UWORD32 u4_valid_intra_modes;
1403     mb_info_t *ps_top_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
1404 
1405     /* temp var */
1406     UWORD8 i;
1407     UWORD32 u4_constrained_intra_pred = ps_proc->ps_codec->s_cfg.u4_constrained_intra_pred;
1408     UWORD8 u1_mb_a, u1_mb_b, u1_mb_d;
1409     /* locating neighbors that are available for prediction */
1410 
1411     /* gather prediction pels from the neighbors */
1412     /* left pels */
1413     u1_mb_a = ((ps_proc->ps_ngbr_avbl->u1_mb_a)
1414                     && (u4_constrained_intra_pred ?  ps_proc->s_left_mb_syntax_ele.u2_is_intra : 1));
1415     if (u1_mb_a)
1416     {
1417         for (i = 0; i < 16; i += 2)
1418         {
1419             pu1_ngbr_pels_c_i8x8[16 - 2 - i] = pu1_mb_a[(i / 2) * i4_rec_strd_c];
1420             pu1_ngbr_pels_c_i8x8[16 - 1 - i] = pu1_mb_a[(i / 2) * i4_rec_strd_c + 1];
1421         }
1422     }
1423     else
1424     {
1425         ps_codec->pf_mem_set_mul8(pu1_ngbr_pels_c_i8x8, 0, MB_SIZE);
1426     }
1427 
1428     /* top pels */
1429     u1_mb_b = ((ps_proc->ps_ngbr_avbl->u1_mb_b)
1430                     && (u4_constrained_intra_pred ? ps_top_mb_syn_ele->u2_is_intra : 1));
1431     if (u1_mb_b)
1432     {
1433         ps_codec->pf_mem_cpy_mul8(&pu1_ngbr_pels_c_i8x8[18], pu1_mb_b, 16);
1434     }
1435     else
1436     {
1437         ps_codec->pf_mem_set_mul8((pu1_ngbr_pels_c_i8x8 + 18), 0, MB_SIZE);
1438     }
1439 
1440     /* top left pels */
1441     u1_mb_d = ((ps_proc->ps_ngbr_avbl->u1_mb_d)
1442                     && (u4_constrained_intra_pred ? ps_proc->s_top_left_mb_syntax_ele.u2_is_intra : 1));
1443     if (u1_mb_d)
1444     {
1445         pu1_ngbr_pels_c_i8x8[16] = *pu1_mb_d;
1446         pu1_ngbr_pels_c_i8x8[17] = *(pu1_mb_d + 1);
1447     }
1448     i4_ngbr_avbl = (u1_mb_a) + (u1_mb_b << 2) + (u1_mb_d << 1);
1449     ps_proc->i4_chroma_neighbor_avail_8x8_mb = i4_ngbr_avbl;
1450 
1451     u4_valid_intra_modes = u1_valid_intra_modes[i4_ngbr_avbl];
1452 
1453     if (ps_codec->s_cfg.u4_enc_speed_preset == IVE_FAST)
1454         u4_valid_intra_modes &= ~(1 << PLANE_CH_I8x8);
1455 
1456     i4_chroma_mb_distortion = INT_MAX;
1457 
1458     /* perform intra mode chroma  8x8 evaluation */
1459     /* intra prediction */
1460     ps_codec->pf_ih264e_evaluate_intra_chroma_modes(pu1_curr_mb,
1461                                                     pu1_ngbr_pels_c_i8x8,
1462                                                     pu1_pred_mb,
1463                                                     i4_src_strd_c,
1464                                                     i4_pred_strd,
1465                                                     i4_ngbr_avbl,
1466                                                     &u4_best_chroma_intra_8x8_mode,
1467                                                     &i4_chroma_mb_distortion,
1468                                                     u4_valid_intra_modes);
1469 
1470     if (u4_valid_intra_modes & 8)/* if Chroma PLANE is valid*/
1471     {
1472         (ps_codec->apf_intra_pred_c)[PLANE_CH_I8x8](pu1_ngbr_pels_c_i8x8, pu1_pred_mb_plane, 0, i4_pred_strd, i4_ngbr_avbl);
1473 
1474         /* evaluate distortion(sad) */
1475         ps_codec->pf_compute_sad_16x8(pu1_curr_mb, pu1_pred_mb_plane, i4_src_strd_c, i4_pred_strd, i4_chroma_mb_distortion, &i4_mb_distortion);
1476 
1477         /* update the least distortion information if necessary */
1478         if(i4_mb_distortion < i4_chroma_mb_distortion)
1479         {
1480             i4_chroma_mb_distortion = i4_mb_distortion;
1481             u4_best_chroma_intra_8x8_mode = PLANE_CH_I8x8;
1482         }
1483     }
1484 
1485     DEBUG("%d partition cost, %d intra mode\n", i4_chroma_mb_distortion, u4_best_chroma_intra_8x8_mode);
1486 
1487     ps_proc->u1_c_i8_mode = u4_best_chroma_intra_8x8_mode;
1488 
1489     return ;
1490 }
1491 
1492 
1493 /**
1494 ******************************************************************************
1495 *
1496 * @brief
1497 *  Evaluate best intra 16x16 mode (among VERT, HORZ and DC) and do the
1498 *  prediction.
1499 *
1500 * @par Description
1501 *  This function evaluates first three 16x16 modes and compute corresponding sad
1502 *  and return the buffer predicted with best mode.
1503 *
1504 * @param[in] pu1_src
1505 *  UWORD8 pointer to the source
1506 *
1507 * @param[in] pu1_ngbr_pels_i16
1508 *  UWORD8 pointer to neighbouring pels
1509 *
1510 * @param[out] pu1_dst
1511 *  UWORD8 pointer to the destination
1512 *
1513 * @param[in] src_strd
1514 *  integer source stride
1515 *
1516 * @param[in] dst_strd
1517 *  integer destination stride
1518 *
1519 * @param[in] u4_n_avblty
1520 *  availability of neighbouring pixels
1521 *
1522 * @param[in] u4_intra_mode
1523 *  Pointer to the variable in which best mode is returned
1524 *
1525 * @param[in] pu4_sadmin
1526 *  Pointer to the variable in which minimum sad is returned
1527 *
1528 * @param[in] u4_valid_intra_modes
1529 *  Says what all modes are valid
1530 *
1531 * @returns      none
1532 *
1533 ******************************************************************************
1534 */
ih264e_evaluate_intra16x16_modes(UWORD8 * pu1_src,UWORD8 * pu1_ngbr_pels_i16,UWORD8 * pu1_dst,UWORD32 src_strd,UWORD32 dst_strd,WORD32 u4_n_avblty,UWORD32 * u4_intra_mode,WORD32 * pu4_sadmin,UWORD32 u4_valid_intra_modes)1535 void ih264e_evaluate_intra16x16_modes(UWORD8 *pu1_src,
1536                                       UWORD8 *pu1_ngbr_pels_i16,
1537                                       UWORD8 *pu1_dst,
1538                                       UWORD32 src_strd,
1539                                       UWORD32 dst_strd,
1540                                       WORD32 u4_n_avblty,
1541                                       UWORD32 *u4_intra_mode,
1542                                       WORD32 *pu4_sadmin,
1543                                       UWORD32 u4_valid_intra_modes)
1544 {
1545     UWORD8 *pu1_neighbour;
1546     UWORD8 *pu1_src_temp = pu1_src;
1547     UWORD8 left = 0, top = 0;
1548     WORD32 u4_dcval = 0;
1549     WORD32 i, j;
1550     WORD32 i4_sad_vert = INT_MAX, i4_sad_horz = INT_MAX, i4_sad_dc = INT_MAX,
1551                     i4_min_sad = INT_MAX;
1552     UWORD8 val;
1553 
1554     left = (u4_n_avblty & LEFT_MB_AVAILABLE_MASK);
1555     top = (u4_n_avblty & TOP_MB_AVAILABLE_MASK) >> 2;
1556 
1557     /* left available */
1558     if (left)
1559     {
1560         i4_sad_horz = 0;
1561 
1562         for (i = 0; i < 16; i++)
1563         {
1564             val = pu1_ngbr_pels_i16[15 - i];
1565 
1566             u4_dcval += val;
1567 
1568             for (j = 0; j < 16; j++)
1569             {
1570                 i4_sad_horz += ABS(val - pu1_src_temp[j]);
1571             }
1572 
1573             pu1_src_temp += src_strd;
1574         }
1575         u4_dcval += 8;
1576     }
1577 
1578     pu1_src_temp = pu1_src;
1579     /* top available */
1580     if (top)
1581     {
1582         i4_sad_vert = 0;
1583 
1584         for (i = 0; i < 16; i++)
1585         {
1586             u4_dcval += pu1_ngbr_pels_i16[17 + i];
1587 
1588             for (j = 0; j < 16; j++)
1589             {
1590                 i4_sad_vert += ABS(pu1_ngbr_pels_i16[17 + j] - pu1_src_temp[j]);
1591             }
1592             pu1_src_temp += src_strd;
1593 
1594         }
1595         u4_dcval += 8;
1596     }
1597 
1598     u4_dcval = (u4_dcval) >> (3 + left + top);
1599 
1600     pu1_src_temp = pu1_src;
1601 
1602     /* none available */
1603     u4_dcval += (left == 0) * (top == 0) * 128;
1604 
1605     i4_sad_dc = 0;
1606 
1607     for (i = 0; i < 16; i++)
1608     {
1609         for (j = 0; j < 16; j++)
1610         {
1611             i4_sad_dc += ABS(u4_dcval - pu1_src_temp[j]);
1612         }
1613         pu1_src_temp += src_strd;
1614     }
1615 
1616     if ((u4_valid_intra_modes & 04) == 0)/* If DC is disabled */
1617         i4_sad_dc = INT_MAX;
1618 
1619     if ((u4_valid_intra_modes & 01) == 0)/* If VERT is disabled */
1620         i4_sad_vert = INT_MAX;
1621 
1622     if ((u4_valid_intra_modes & 02) == 0)/* If HORZ is disabled */
1623         i4_sad_horz = INT_MAX;
1624 
1625     i4_min_sad = MIN3(i4_sad_horz, i4_sad_dc, i4_sad_vert);
1626 
1627     /* Finding Minimum sad and doing corresponding prediction */
1628     if (i4_min_sad < *pu4_sadmin)
1629     {
1630         *pu4_sadmin = i4_min_sad;
1631         if (i4_min_sad == i4_sad_vert)
1632         {
1633             *u4_intra_mode = VERT_I16x16;
1634             pu1_neighbour = pu1_ngbr_pels_i16 + 17;
1635             for (j = 0; j < 16; j++)
1636             {
1637                 memcpy(pu1_dst, pu1_neighbour, MB_SIZE);
1638                 pu1_dst += dst_strd;
1639             }
1640         }
1641         else if (i4_min_sad == i4_sad_horz)
1642         {
1643             *u4_intra_mode = HORZ_I16x16;
1644             for (j = 0; j < 16; j++)
1645             {
1646                 val = pu1_ngbr_pels_i16[15 - j];
1647                 memset(pu1_dst, val, MB_SIZE);
1648                 pu1_dst += dst_strd;
1649             }
1650         }
1651         else
1652         {
1653             *u4_intra_mode = DC_I16x16;
1654             for (j = 0; j < 16; j++)
1655             {
1656                 memset(pu1_dst, u4_dcval, MB_SIZE);
1657                 pu1_dst += dst_strd;
1658             }
1659         }
1660     }
1661     return;
1662 }
1663 
1664 /**
1665 ******************************************************************************
1666 *
1667 * @brief
1668 *  Evaluate best intra 4x4 mode and perform prediction.
1669 *
1670 * @par Description
1671 *  This function evaluates  4x4 modes and compute corresponding sad
1672 *  and return the buffer predicted with best mode.
1673 *
1674 * @param[in] pu1_src
1675 *  UWORD8 pointer to the source
1676 *
1677 * @param[in] pu1_ngbr_pels
1678 *  UWORD8 pointer to neighbouring pels
1679 *
1680 * @param[out] pu1_dst
1681 *  UWORD8 pointer to the destination
1682 *
1683 * @param[in] src_strd
1684 *  integer source stride
1685 *
1686 * @param[in] dst_strd
1687 *  integer destination stride
1688 *
1689 * @param[in] u4_n_avblty
1690 *  availability of neighbouring pixels
1691 *
1692 * @param[in] u4_intra_mode
1693 *  Pointer to the variable in which best mode is returned
1694 *
1695 * @param[in] pu4_sadmin
1696 *  Pointer to the variable in which minimum cost is returned
1697 *
1698 * @param[in] u4_valid_intra_modes
1699 *  Says what all modes are valid
1700 *
1701 * @param[in] u4_lambda
1702 *  Lamda value for computing cost from SAD
1703 *
1704 * @param[in] u4_predictd_mode
1705 *  Predicted mode for cost computation
1706 *
1707 * @returns      none
1708 *
1709 ******************************************************************************
1710 */
ih264e_evaluate_intra_4x4_modes(UWORD8 * pu1_src,UWORD8 * pu1_ngbr_pels,UWORD8 * pu1_dst,UWORD32 src_strd,UWORD32 dst_strd,WORD32 u4_n_avblty,UWORD32 * u4_intra_mode,WORD32 * pu4_sadmin,UWORD32 u4_valid_intra_modes,UWORD32 u4_lambda,UWORD32 u4_predictd_mode)1711 void ih264e_evaluate_intra_4x4_modes(UWORD8 *pu1_src,
1712                                      UWORD8 *pu1_ngbr_pels,
1713                                      UWORD8 *pu1_dst,
1714                                      UWORD32 src_strd,
1715                                      UWORD32 dst_strd,
1716                                      WORD32 u4_n_avblty,
1717                                      UWORD32 *u4_intra_mode,
1718                                      WORD32 *pu4_sadmin,
1719                                      UWORD32 u4_valid_intra_modes,
1720                                      UWORD32  u4_lambda,
1721                                      UWORD32 u4_predictd_mode)
1722 {
1723     UWORD8 *pu1_src_temp = pu1_src;
1724     UWORD8 *pu1_pred = pu1_ngbr_pels;
1725     UWORD8 left = 0, top = 0;
1726     UWORD8 u1_pred_val = 0;
1727     UWORD8 u1_pred_vals[4] = {0};
1728     UWORD8 *pu1_pred_val = NULL;
1729     /* To store FILT121 operated values*/
1730     UWORD8 u1_pred_vals_diag_121[15] = {0};
1731     /* To store FILT11 operated values*/
1732     UWORD8 u1_pred_vals_diag_11[15] = {0};
1733     UWORD8 u1_pred_vals_vert_r[8] = {0};
1734     UWORD8 u1_pred_vals_horz_d[10] = {0};
1735     UWORD8 u1_pred_vals_horz_u[10] = {0};
1736     WORD32 u4_dcval = 0;
1737     WORD32 i4_sad[MAX_I4x4] = {INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX,
1738                                INT_MAX, INT_MAX, INT_MAX, INT_MAX};
1739 
1740     WORD32 i4_cost[MAX_I4x4] = {INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX,
1741                                 INT_MAX, INT_MAX, INT_MAX, INT_MAX};
1742     WORD32 i, i4_min_cost = INT_MAX;
1743 
1744     left = (u4_n_avblty & LEFT_MB_AVAILABLE_MASK);
1745     top = (u4_n_avblty & TOP_MB_AVAILABLE_MASK) >> 2;
1746 
1747     /* Computing SAD */
1748 
1749     /* VERT mode valid */
1750     if (u4_valid_intra_modes & 1)
1751     {
1752         pu1_pred = pu1_ngbr_pels + 5;
1753         i4_sad[VERT_I4x4] = 0;
1754         i4_cost[VERT_I4x4] = 0;
1755 
1756         USADA8(pu1_src_temp, pu1_pred, i4_sad[VERT_I4x4]);
1757         pu1_src_temp += src_strd;
1758         USADA8(pu1_src_temp, pu1_pred, i4_sad[VERT_I4x4]);
1759         pu1_src_temp += src_strd;
1760         USADA8(pu1_src_temp, pu1_pred, i4_sad[VERT_I4x4]);
1761         pu1_src_temp += src_strd;
1762         USADA8(pu1_src_temp, pu1_pred, i4_sad[VERT_I4x4]);
1763 
1764         i4_cost[VERT_I4x4] = i4_sad[VERT_I4x4] + ((u4_predictd_mode == VERT_I4x4) ?
1765                                         u4_lambda : 4 * u4_lambda);
1766     }
1767 
1768     /* HORZ mode valid */
1769     if (u4_valid_intra_modes & 2)
1770     {
1771         i4_sad[HORZ_I4x4] = 0;
1772         i4_cost[HORZ_I4x4] =0;
1773         pu1_src_temp = pu1_src;
1774 
1775         u1_pred_val = pu1_ngbr_pels[3];
1776 
1777         i4_sad[HORZ_I4x4] += ABS(pu1_src_temp[0] - u1_pred_val)
1778                         + ABS(pu1_src_temp[1] - u1_pred_val)
1779                         + ABS(pu1_src_temp[2] - u1_pred_val)
1780                         + ABS(pu1_src_temp[3] - u1_pred_val);
1781         pu1_src_temp += src_strd;
1782 
1783         u1_pred_val = pu1_ngbr_pels[2];
1784 
1785         i4_sad[HORZ_I4x4] += ABS(pu1_src_temp[0] - u1_pred_val)
1786                         + ABS(pu1_src_temp[1] - u1_pred_val)
1787                         + ABS(pu1_src_temp[2] - u1_pred_val)
1788                         + ABS(pu1_src_temp[3] - u1_pred_val);
1789         pu1_src_temp += src_strd;
1790 
1791         u1_pred_val = pu1_ngbr_pels[1];
1792 
1793         i4_sad[HORZ_I4x4] += ABS(pu1_src_temp[0] - u1_pred_val)
1794                         + ABS(pu1_src_temp[1] - u1_pred_val)
1795                         + ABS(pu1_src_temp[2] - u1_pred_val)
1796                         + ABS(pu1_src_temp[3] - u1_pred_val);
1797         pu1_src_temp += src_strd;
1798 
1799         u1_pred_val = pu1_ngbr_pels[0];
1800 
1801         i4_sad[HORZ_I4x4] += ABS(pu1_src_temp[0] - u1_pred_val)
1802                         + ABS(pu1_src_temp[1] - u1_pred_val)
1803                         + ABS(pu1_src_temp[2] - u1_pred_val)
1804                         + ABS(pu1_src_temp[3] - u1_pred_val);
1805 
1806         i4_cost[HORZ_I4x4] = i4_sad[HORZ_I4x4] + ((u4_predictd_mode == HORZ_I4x4) ?
1807                                         u4_lambda : 4 * u4_lambda);
1808     }
1809 
1810     /* DC mode valid */
1811     if (u4_valid_intra_modes & 4)
1812     {
1813         i4_sad[DC_I4x4] = 0;
1814         i4_cost[DC_I4x4] = 0;
1815         pu1_src_temp = pu1_src;
1816 
1817         if (left)
1818             u4_dcval = pu1_ngbr_pels[0] + pu1_ngbr_pels[1] + pu1_ngbr_pels[2]
1819                             + pu1_ngbr_pels[3] + 2;
1820         if (top)
1821             u4_dcval += pu1_ngbr_pels[5] + pu1_ngbr_pels[6] + pu1_ngbr_pels[7]
1822                             + pu1_ngbr_pels[8] + 2;
1823 
1824         u4_dcval = (u4_dcval) ? (u4_dcval >> (1 + left + top)) : 128;
1825 
1826         /* none available */
1827         memset(u1_pred_vals, u4_dcval, 4);
1828         USADA8(pu1_src_temp, u1_pred_vals, i4_sad[DC_I4x4]);
1829         pu1_src_temp += src_strd;
1830         USADA8(pu1_src_temp, u1_pred_vals, i4_sad[DC_I4x4]);
1831         pu1_src_temp += src_strd;
1832         USADA8(pu1_src_temp, u1_pred_vals, i4_sad[DC_I4x4]);
1833         pu1_src_temp += src_strd;
1834         USADA8(pu1_src_temp, u1_pred_vals, i4_sad[DC_I4x4]);
1835         pu1_src_temp += src_strd;
1836 
1837         i4_cost[DC_I4x4] = i4_sad[DC_I4x4] + ((u4_predictd_mode == DC_I4x4) ?
1838                                         u4_lambda : 4 * u4_lambda);
1839     }
1840 
1841     /* if modes other than VERT, HORZ and DC are  valid */
1842     if (u4_valid_intra_modes > 7)
1843     {
1844         pu1_pred = pu1_ngbr_pels;
1845         pu1_pred[13] = pu1_pred[14] = pu1_pred[12];
1846 
1847         /* Performing FILT121 and FILT11 operation for all neighbour values*/
1848         for (i = 0; i < 13; i++)
1849         {
1850             u1_pred_vals_diag_121[i] = FILT121(pu1_pred[0], pu1_pred[1], pu1_pred[2]);
1851             u1_pred_vals_diag_11[i] = FILT11(pu1_pred[0], pu1_pred[1]);
1852 
1853             pu1_pred++;
1854         }
1855 
1856         if (u4_valid_intra_modes & 8)/* DIAG_DL */
1857         {
1858             i4_sad[DIAG_DL_I4x4] = 0;
1859             i4_cost[DIAG_DL_I4x4] = 0;
1860             pu1_src_temp = pu1_src;
1861             pu1_pred_val = u1_pred_vals_diag_121 + 5;
1862 
1863             USADA8(pu1_src_temp, pu1_pred_val, i4_sad[DIAG_DL_I4x4]);
1864             pu1_src_temp += src_strd;
1865             USADA8(pu1_src_temp, (pu1_pred_val + 1), i4_sad[DIAG_DL_I4x4]);
1866             pu1_src_temp += src_strd;
1867             USADA8(pu1_src_temp, (pu1_pred_val + 2), i4_sad[DIAG_DL_I4x4]);
1868             pu1_src_temp += src_strd;
1869             USADA8(pu1_src_temp, (pu1_pred_val + 3), i4_sad[DIAG_DL_I4x4]);
1870             pu1_src_temp += src_strd;
1871             i4_cost[DIAG_DL_I4x4] = i4_sad[DIAG_DL_I4x4] + ((u4_predictd_mode == DIAG_DL_I4x4) ?
1872                                             u4_lambda : 4 * u4_lambda);
1873         }
1874 
1875         if (u4_valid_intra_modes & 16)/* DIAG_DR */
1876         {
1877             i4_sad[DIAG_DR_I4x4] = 0;
1878             i4_cost[DIAG_DR_I4x4] = 0;
1879             pu1_src_temp = pu1_src;
1880             pu1_pred_val = u1_pred_vals_diag_121 + 3;
1881 
1882             USADA8(pu1_src_temp, pu1_pred_val, i4_sad[DIAG_DR_I4x4]);
1883             pu1_src_temp += src_strd;
1884             USADA8(pu1_src_temp, (pu1_pred_val - 1), i4_sad[DIAG_DR_I4x4]);
1885             pu1_src_temp += src_strd;
1886             USADA8(pu1_src_temp, (pu1_pred_val - 2), i4_sad[DIAG_DR_I4x4]);
1887             pu1_src_temp += src_strd;
1888             USADA8(pu1_src_temp, (pu1_pred_val - 3), i4_sad[DIAG_DR_I4x4]);
1889             pu1_src_temp += src_strd;
1890             i4_cost[DIAG_DR_I4x4] = i4_sad[DIAG_DR_I4x4] + ((u4_predictd_mode == DIAG_DR_I4x4) ?
1891                                             u4_lambda : 4 * u4_lambda);
1892 
1893         }
1894 
1895         if (u4_valid_intra_modes & 32)/* VERT_R mode valid ????*/
1896         {
1897             i4_sad[VERT_R_I4x4] = 0;
1898 
1899             pu1_src_temp = pu1_src;
1900             u1_pred_vals_vert_r[0] = u1_pred_vals_diag_121[2];
1901             memcpy((u1_pred_vals_vert_r + 1), (u1_pred_vals_diag_11 + 4), 3);
1902             u1_pred_vals_vert_r[4] = u1_pred_vals_diag_121[1];
1903             memcpy((u1_pred_vals_vert_r + 5), (u1_pred_vals_diag_121 + 3), 3);
1904 
1905             pu1_pred_val = u1_pred_vals_diag_11 + 4;
1906             USADA8(pu1_src_temp, pu1_pred_val, i4_sad[VERT_R_I4x4]);
1907             pu1_pred_val = u1_pred_vals_diag_121 + 3;
1908             pu1_src_temp += src_strd;
1909             USADA8(pu1_src_temp, pu1_pred_val, i4_sad[VERT_R_I4x4]);
1910             pu1_src_temp += src_strd;
1911             USADA8(pu1_src_temp, (u1_pred_vals_vert_r), i4_sad[VERT_R_I4x4]);
1912             pu1_src_temp += src_strd;
1913             USADA8(pu1_src_temp, (u1_pred_vals_vert_r + 4),
1914                    i4_sad[VERT_R_I4x4]);
1915 
1916             i4_cost[VERT_R_I4x4] = i4_sad[VERT_R_I4x4] + ((u4_predictd_mode == VERT_R_I4x4) ?
1917                                             u4_lambda : 4 * u4_lambda);
1918         }
1919 
1920         if (u4_valid_intra_modes & 64)/* HORZ_D mode valid ????*/
1921         {
1922             i4_sad[HORZ_D_I4x4] = 0;
1923 
1924             pu1_src_temp = pu1_src;
1925             u1_pred_vals_horz_d[6] = u1_pred_vals_diag_11[3];
1926             memcpy((u1_pred_vals_horz_d + 7), (u1_pred_vals_diag_121 + 3), 3);
1927             u1_pred_vals_horz_d[0] = u1_pred_vals_diag_11[0];
1928             u1_pred_vals_horz_d[1] = u1_pred_vals_diag_121[0];
1929             u1_pred_vals_horz_d[2] = u1_pred_vals_diag_11[1];
1930             u1_pred_vals_horz_d[3] = u1_pred_vals_diag_121[1];
1931             u1_pred_vals_horz_d[4] = u1_pred_vals_diag_11[2];
1932             u1_pred_vals_horz_d[5] = u1_pred_vals_diag_121[2];
1933 
1934             pu1_pred_val = u1_pred_vals_horz_d;
1935             USADA8(pu1_src_temp, (pu1_pred_val + 6), i4_sad[HORZ_D_I4x4]);
1936             pu1_src_temp += src_strd;
1937             USADA8(pu1_src_temp, (pu1_pred_val + 4), i4_sad[HORZ_D_I4x4]);
1938             pu1_src_temp += src_strd;
1939             USADA8(pu1_src_temp, (pu1_pred_val + 2), i4_sad[HORZ_D_I4x4]);
1940             pu1_src_temp += src_strd;
1941             USADA8(pu1_src_temp, (pu1_pred_val), i4_sad[HORZ_D_I4x4]);
1942 
1943             i4_cost[HORZ_D_I4x4] = i4_sad[HORZ_D_I4x4] + ((u4_predictd_mode == HORZ_D_I4x4) ?
1944                                             u4_lambda : 4 * u4_lambda);
1945         }
1946 
1947         if (u4_valid_intra_modes & 128)/* VERT_L mode valid ????*/
1948         {
1949             i4_sad[VERT_L_I4x4] = 0;
1950             pu1_src_temp = pu1_src;
1951             pu1_pred_val = u1_pred_vals_diag_11 + 5;
1952             USADA8(pu1_src_temp, (pu1_pred_val), i4_sad[VERT_L_I4x4]);
1953             pu1_src_temp += src_strd;
1954             pu1_pred_val = u1_pred_vals_diag_121 + 5;
1955             USADA8(pu1_src_temp, (pu1_pred_val), i4_sad[VERT_L_I4x4]);
1956             pu1_src_temp += src_strd;
1957             pu1_pred_val = u1_pred_vals_diag_11 + 6;
1958             USADA8(pu1_src_temp, (pu1_pred_val), i4_sad[VERT_L_I4x4]);
1959             pu1_src_temp += src_strd;
1960             pu1_pred_val = u1_pred_vals_diag_121 + 6;
1961             USADA8(pu1_src_temp, (pu1_pred_val), i4_sad[VERT_L_I4x4]);
1962 
1963             i4_cost[VERT_L_I4x4] = i4_sad[VERT_L_I4x4] + ((u4_predictd_mode == VERT_L_I4x4) ?
1964                                             u4_lambda : 4 * u4_lambda);
1965         }
1966 
1967         if (u4_valid_intra_modes & 256)/* HORZ_U mode valid ????*/
1968         {
1969             i4_sad[HORZ_U_I4x4] = 0;
1970             pu1_src_temp = pu1_src;
1971             u1_pred_vals_horz_u[0] = u1_pred_vals_diag_11[2];
1972             u1_pred_vals_horz_u[1] = u1_pred_vals_diag_121[1];
1973             u1_pred_vals_horz_u[2] = u1_pred_vals_diag_11[1];
1974             u1_pred_vals_horz_u[3] = u1_pred_vals_diag_121[0];
1975             u1_pred_vals_horz_u[4] = u1_pred_vals_diag_11[0];
1976             u1_pred_vals_horz_u[5] = FILT121(pu1_ngbr_pels[0], pu1_ngbr_pels[0], pu1_ngbr_pels[1]);
1977 
1978             memset((u1_pred_vals_horz_u + 6), pu1_ngbr_pels[0], 4);
1979 
1980             pu1_pred_val = u1_pred_vals_horz_u;
1981             USADA8(pu1_src_temp, (pu1_pred_val), i4_sad[HORZ_U_I4x4]);
1982             pu1_src_temp += src_strd;
1983             USADA8(pu1_src_temp, (pu1_pred_val + 2), i4_sad[HORZ_U_I4x4]);
1984             pu1_src_temp += src_strd;
1985             USADA8(pu1_src_temp, (pu1_pred_val + 4), i4_sad[HORZ_U_I4x4]);
1986             pu1_src_temp += src_strd;
1987             USADA8(pu1_src_temp, (pu1_pred_val + 6), i4_sad[HORZ_U_I4x4]);
1988 
1989             i4_cost[HORZ_U_I4x4] = i4_sad[HORZ_U_I4x4] + ((u4_predictd_mode == HORZ_U_I4x4) ?
1990                                             u4_lambda : 4 * u4_lambda);
1991         }
1992 
1993         i4_min_cost = MIN3(MIN3(i4_cost[0], i4_cost[1], i4_cost[2]),
1994                         MIN3(i4_cost[3], i4_cost[4], i4_cost[5]),
1995                         MIN3(i4_cost[6], i4_cost[7], i4_cost[8]));
1996 
1997     }
1998     else
1999     {
2000         /* Only first three modes valid */
2001         i4_min_cost = MIN3(i4_cost[0], i4_cost[1], i4_cost[2]);
2002     }
2003 
2004     *pu4_sadmin = i4_min_cost;
2005 
2006     if (i4_min_cost == i4_cost[0])
2007     {
2008         *u4_intra_mode = VERT_I4x4;
2009         pu1_pred_val = pu1_ngbr_pels + 5;
2010         memcpy(pu1_dst, (pu1_pred_val), 4);
2011         pu1_dst += dst_strd;
2012         memcpy(pu1_dst, (pu1_pred_val), 4);
2013         pu1_dst += dst_strd;
2014         memcpy(pu1_dst, (pu1_pred_val), 4);
2015         pu1_dst += dst_strd;
2016         memcpy(pu1_dst, (pu1_pred_val), 4);
2017     }
2018     else if (i4_min_cost == i4_cost[1])
2019     {
2020         *u4_intra_mode = HORZ_I4x4;
2021         memset(pu1_dst, pu1_ngbr_pels[3], 4);
2022         pu1_dst += dst_strd;
2023         memset(pu1_dst, pu1_ngbr_pels[2], 4);
2024         pu1_dst += dst_strd;
2025         memset(pu1_dst, pu1_ngbr_pels[1], 4);
2026         pu1_dst += dst_strd;
2027         memset(pu1_dst, pu1_ngbr_pels[0], 4);
2028     }
2029     else if (i4_min_cost == i4_cost[2])
2030     {
2031         *u4_intra_mode = DC_I4x4;
2032         memset(pu1_dst, u4_dcval, 4);
2033         pu1_dst += dst_strd;
2034         memset(pu1_dst, u4_dcval, 4);
2035         pu1_dst += dst_strd;
2036         memset(pu1_dst, u4_dcval, 4);
2037         pu1_dst += dst_strd;
2038         memset(pu1_dst, u4_dcval, 4);
2039     }
2040 
2041     else if (i4_min_cost == i4_cost[3])
2042     {
2043         *u4_intra_mode = DIAG_DL_I4x4;
2044         pu1_pred_val = u1_pred_vals_diag_121 + 5;
2045         memcpy(pu1_dst, (pu1_pred_val), 4);
2046         pu1_dst += dst_strd;
2047         memcpy(pu1_dst, (pu1_pred_val + 1), 4);
2048         pu1_dst += dst_strd;
2049         memcpy(pu1_dst, (pu1_pred_val + 2), 4);
2050         pu1_dst += dst_strd;
2051         memcpy(pu1_dst, (pu1_pred_val + 3), 4);
2052     }
2053     else if (i4_min_cost == i4_cost[4])
2054     {
2055         *u4_intra_mode = DIAG_DR_I4x4;
2056         pu1_pred_val = u1_pred_vals_diag_121 + 3;
2057 
2058         memcpy(pu1_dst, (pu1_pred_val), 4);
2059         pu1_dst += dst_strd;
2060         memcpy(pu1_dst, (pu1_pred_val - 1), 4);
2061         pu1_dst += dst_strd;
2062         memcpy(pu1_dst, (pu1_pred_val - 2), 4);
2063         pu1_dst += dst_strd;
2064         memcpy(pu1_dst, (pu1_pred_val - 3), 4);
2065     }
2066 
2067     else if (i4_min_cost == i4_cost[5])
2068     {
2069         *u4_intra_mode = VERT_R_I4x4;
2070         pu1_pred_val = u1_pred_vals_diag_11 + 4;
2071         memcpy(pu1_dst, (pu1_pred_val), 4);
2072         pu1_dst += dst_strd;
2073         pu1_pred_val = u1_pred_vals_diag_121 + 3;
2074         memcpy(pu1_dst, (pu1_pred_val), 4);
2075         pu1_dst += dst_strd;
2076         memcpy(pu1_dst, (u1_pred_vals_vert_r), 4);
2077         pu1_dst += dst_strd;
2078         memcpy(pu1_dst, (u1_pred_vals_vert_r + 4), 4);
2079     }
2080     else if (i4_min_cost == i4_cost[6])
2081     {
2082         *u4_intra_mode = HORZ_D_I4x4;
2083         pu1_pred_val = u1_pred_vals_horz_d;
2084         memcpy(pu1_dst, (pu1_pred_val + 6), 4);
2085         pu1_dst += dst_strd;
2086         memcpy(pu1_dst, (pu1_pred_val + 4), 4);
2087         pu1_dst += dst_strd;
2088         memcpy(pu1_dst, (pu1_pred_val + 2), 4);
2089         pu1_dst += dst_strd;
2090         memcpy(pu1_dst, (pu1_pred_val), 4);
2091         pu1_dst += dst_strd;
2092     }
2093     else if (i4_min_cost == i4_cost[7])
2094     {
2095         *u4_intra_mode = VERT_L_I4x4;
2096         pu1_pred_val = u1_pred_vals_diag_11 + 5;
2097         memcpy(pu1_dst, (pu1_pred_val), 4);
2098         pu1_dst += dst_strd;
2099         pu1_pred_val = u1_pred_vals_diag_121 + 5;
2100         memcpy(pu1_dst, (pu1_pred_val), 4);
2101         pu1_dst += dst_strd;
2102         pu1_pred_val = u1_pred_vals_diag_11 + 6;
2103         memcpy(pu1_dst, (pu1_pred_val), 4);
2104         pu1_dst += dst_strd;
2105         pu1_pred_val = u1_pred_vals_diag_121 + 6;
2106         memcpy(pu1_dst, (pu1_pred_val), 4);
2107     }
2108     else if (i4_min_cost == i4_cost[8])
2109     {
2110         *u4_intra_mode = HORZ_U_I4x4;
2111         pu1_pred_val = u1_pred_vals_horz_u;
2112         memcpy(pu1_dst, (pu1_pred_val), 4);
2113         pu1_dst += dst_strd;
2114         memcpy(pu1_dst, (pu1_pred_val + 2), 4);
2115         pu1_dst += dst_strd;
2116         memcpy(pu1_dst, (pu1_pred_val + 4), 4);
2117         pu1_dst += dst_strd;
2118         memcpy(pu1_dst, (pu1_pred_val + 6), 4);
2119         pu1_dst += dst_strd;
2120     }
2121 
2122     return;
2123 }
2124 
2125 /**
2126 ******************************************************************************
2127 *
2128 * @brief:
*  Evaluate best intra chroma mode (among VERT, HORZ and DC) and do the prediction.
*
* @par Description
*  This function evaluates the first three intra chroma modes, computes the
*  corresponding SAD and returns the buffer predicted with the best mode.
2134 *
2135 * @param[in] pu1_src
2136 *  UWORD8 pointer to the source
2137 *
2138 * @param[in] pu1_ngbr_pels
2139 *  UWORD8 pointer to neighbouring pels
2140 *
2141 * @param[out] pu1_dst
2142 *  UWORD8 pointer to the destination
2143 *
2144 * @param[in] src_strd
2145 *  integer source stride
2146 *
2147 * @param[in] dst_strd
2148 *  integer destination stride
2149 *
2150 * @param[in] u4_n_avblty
2151 *  availability of neighbouring pixels
2152 *
2153 * @param[in] u4_intra_mode
2154 *  Pointer to the variable in which best mode is returned
2155 *
2156 * @param[in] pu4_sadmin
2157 *  Pointer to the variable in which minimum sad is returned
2158 *
2159 * @param[in] u4_valid_intra_modes
2160 *  Says what all modes are valid
2161 *
2162 * @return      none
2163 *
2164 ******************************************************************************
2165 */
ih264e_evaluate_intra_chroma_modes(UWORD8 * pu1_src,UWORD8 * pu1_ngbr_pels,UWORD8 * pu1_dst,UWORD32 src_strd,UWORD32 dst_strd,WORD32 u4_n_avblty,UWORD32 * u4_intra_mode,WORD32 * pu4_sadmin,UWORD32 u4_valid_intra_modes)2166 void ih264e_evaluate_intra_chroma_modes(UWORD8 *pu1_src,
2167                                         UWORD8 *pu1_ngbr_pels,
2168                                         UWORD8 *pu1_dst,
2169                                         UWORD32 src_strd,
2170                                         UWORD32 dst_strd,
2171                                         WORD32 u4_n_avblty,
2172                                         UWORD32 *u4_intra_mode,
2173                                         WORD32 *pu4_sadmin,
2174                                         UWORD32 u4_valid_intra_modes)
2175 {
2176     UWORD8 *pu1_neighbour;
2177     UWORD8 *pu1_src_temp = pu1_src;
2178     UWORD8 left = 0, top = 0;
2179     WORD32 u4_dcval_u_l[2] = { 0, 0 }, /*sum left neighbours for 'U' ,two separate sets - sum of first four from top,and sum of four values from bottom */
2180            u4_dcval_u_t[2] = { 0, 0 };  /*sum top neighbours for 'U'*/
2181 
2182     WORD32 u4_dcval_v_l[2] = { 0, 0 }, /*sum left neighbours for 'V'*/
2183            u4_dcval_v_t[2] = { 0, 0 }; /*sum top neighbours for 'V'*/
2184 
2185     WORD32 i, j, row, col, i4_sad_vert = INT_MAX, i4_sad_horz = INT_MAX,
2186                     i4_sad_dc = INT_MAX, i4_min_sad = INT_MAX;
2187     UWORD8 val_u, val_v;
2188 
2189     WORD32 u4_dc_val[2][2][2];/*  -----------
2190                                   |    |    |  Chroma can have four
2191                                   | 00 | 01 |  separate dc value...
2192                                   -----------  u4_dc_val corresponds to this dc values
2193                                   |    |    |  with u4_dc_val[2][2][U] and u4_dc_val[2][2][V]
2194                                   | 10 | 11 |
2195                                   -----------                */
2196     left = (u4_n_avblty & LEFT_MB_AVAILABLE_MASK);
2197     top = (u4_n_avblty & TOP_MB_AVAILABLE_MASK) >> 2;
2198 
2199     /*Evaluating HORZ*/
2200     if (left)/* Ifleft available*/
2201     {
2202         i4_sad_horz = 0;
2203 
2204         for (i = 0; i < 8; i++)
2205         {
2206             val_v = pu1_ngbr_pels[15 - 2 * i];
2207             val_u = pu1_ngbr_pels[15 - 2 * i - 1];
2208             row = i / 4;
2209             u4_dcval_u_l[row] += val_u;
2210             u4_dcval_v_l[row] += val_v;
2211             for (j = 0; j < 8; j++)
2212             {
2213                 i4_sad_horz += ABS(val_u - pu1_src_temp[2 * j]);/* Finding SAD for HORZ mode*/
2214                 i4_sad_horz += ABS(val_v - pu1_src_temp[2 * j + 1]);
2215             }
2216 
2217             pu1_src_temp += src_strd;
2218         }
2219         u4_dcval_u_l[0] += 2;
2220         u4_dcval_u_l[1] += 2;
2221         u4_dcval_v_l[0] += 2;
2222         u4_dcval_v_l[1] += 2;
2223     }
2224 
2225     /*Evaluating VERT**/
2226     pu1_src_temp = pu1_src;
2227     if (top) /* top available*/
2228     {
2229         i4_sad_vert = 0;
2230 
2231         for (i = 0; i < 8; i++)
2232         {
2233             col = i / 4;
2234 
2235             val_u = pu1_ngbr_pels[18 + i * 2];
2236             val_v = pu1_ngbr_pels[18 + i * 2 + 1];
2237             u4_dcval_u_t[col] += val_u;
2238             u4_dcval_v_t[col] += val_v;
2239 
2240             for (j = 0; j < 16; j++)
2241             {
2242                 i4_sad_vert += ABS(pu1_ngbr_pels[18 + j] - pu1_src_temp[j]);/* Finding SAD for VERT mode*/
2243             }
2244             pu1_src_temp += src_strd;
2245 
2246         }
2247         u4_dcval_u_t[0] += 2;
2248         u4_dcval_u_t[1] += 2;
2249         u4_dcval_v_t[0] += 2;
2250         u4_dcval_v_t[1] += 2;
2251     }
2252 
2253     /* computing DC value*/
2254     /* Equation  8-128 in spec*/
2255     u4_dc_val[0][0][0] = (u4_dcval_u_l[0] + u4_dcval_u_t[0]) >> (1 + left + top);
2256     u4_dc_val[0][0][1] = (u4_dcval_v_l[0] + u4_dcval_v_t[0]) >> (1 + left + top);
2257     u4_dc_val[1][1][0] = (u4_dcval_u_l[1] + u4_dcval_u_t[1]) >> (1 + left + top);
2258     u4_dc_val[1][1][1] = (u4_dcval_v_l[1] + u4_dcval_v_t[1]) >> (1 + left + top);
2259 
2260     if (top)
2261     {
2262         /* Equation  8-132 in spec*/
2263         u4_dc_val[0][1][0] = (u4_dcval_u_t[1]) >> (1 + top);
2264         u4_dc_val[0][1][1] = (u4_dcval_v_t[1]) >> (1 + top);
2265     }
2266     else
2267     {
2268         u4_dc_val[0][1][0] = (u4_dcval_u_l[0]) >> (1 + left);
2269         u4_dc_val[0][1][1] = (u4_dcval_v_l[0]) >> (1 + left);
2270     }
2271 
2272     if (left)
2273     {
2274         u4_dc_val[1][0][0] = (u4_dcval_u_l[1]) >> (1 + left);
2275         u4_dc_val[1][0][1] = (u4_dcval_v_l[1]) >> (1 + left);
2276     }
2277     else
2278     {
2279         u4_dc_val[1][0][0] = (u4_dcval_u_t[0]) >> (1 + top);
2280         u4_dc_val[1][0][1] = (u4_dcval_v_t[0]) >> (1 + top);
2281     }
2282 
2283     if (!(left || top))
2284     {
2285         /*none available*/
2286         u4_dc_val[0][0][0] = u4_dc_val[0][0][1] =
2287         u4_dc_val[0][1][0] = u4_dc_val[0][1][1] =
2288         u4_dc_val[1][0][0] = u4_dc_val[1][0][1] =
2289         u4_dc_val[1][1][0] = u4_dc_val[1][1][1] = 128;
2290     }
2291 
2292     /* Evaluating DC */
2293     pu1_src_temp = pu1_src;
2294     i4_sad_dc = 0;
2295     for (i = 0; i < 8; i++)
2296     {
2297         for (j = 0; j < 8; j++)
2298         {
2299             col = j / 4;
2300             row = i / 4;
2301             val_u = u4_dc_val[row][col][0];
2302             val_v = u4_dc_val[row][col][1];
2303 
2304             i4_sad_dc += ABS(val_u - pu1_src_temp[2 * j]);/* Finding SAD for DC mode*/
2305             i4_sad_dc += ABS(val_v - pu1_src_temp[2 * j + 1]);
2306         }
2307         pu1_src_temp += src_strd;
2308     }
2309 
2310     if ((u4_valid_intra_modes & 01) == 0)/* If DC is disabled*/
2311         i4_sad_dc = INT_MAX;
2312     if ((u4_valid_intra_modes & 02) == 0)/* If HORZ is disabled*/
2313         i4_sad_horz = INT_MAX;
2314     if ((u4_valid_intra_modes & 04) == 0)/* If VERT is disabled*/
2315         i4_sad_vert = INT_MAX;
2316 
2317     i4_min_sad = MIN3(i4_sad_horz, i4_sad_dc, i4_sad_vert);
2318 
2319     /* Finding Minimum sad and doing corresponding prediction*/
2320     if (i4_min_sad < *pu4_sadmin)
2321     {
2322         *pu4_sadmin = i4_min_sad;
2323 
2324         if (i4_min_sad == i4_sad_dc)
2325         {
2326             *u4_intra_mode = DC_CH_I8x8;
2327             for (i = 0; i < 8; i++)
2328             {
2329                 for (j = 0; j < 8; j++)
2330                 {
2331                     col = j / 4;
2332                     row = i / 4;
2333 
2334                     pu1_dst[2 * j] = u4_dc_val[row][col][0];
2335                     pu1_dst[2 * j + 1] = u4_dc_val[row][col][1];
2336                 }
2337                 pu1_dst += dst_strd;
2338             }
2339         }
2340         else if (i4_min_sad == i4_sad_horz)
2341         {
2342             *u4_intra_mode = HORZ_CH_I8x8;
2343             for (j = 0; j < 8; j++)
2344             {
2345                 val_v = pu1_ngbr_pels[15 - 2 * j];
2346                 val_u = pu1_ngbr_pels[15 - 2 * j - 1];
2347 
2348                 for (i = 0; i < 8; i++)
2349                 {
2350                     pu1_dst[2 * i] = val_u;
2351                     pu1_dst[2 * i + 1] = val_v;
2352 
2353                 }
2354                 pu1_dst += dst_strd;
2355             }
2356         }
2357         else
2358         {
2359             *u4_intra_mode = VERT_CH_I8x8;
2360             pu1_neighbour = pu1_ngbr_pels + 18;
2361             for (j = 0; j < 8; j++)
2362             {
2363                 memcpy(pu1_dst, pu1_neighbour, MB_SIZE);
2364                 pu1_dst += dst_strd;
2365             }
2366         }
2367     }
2368 
2369     return;
2370 }
2371