1 /******************************************************************************
2  *
3  * Copyright (C) 2018 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 
21 /*!
22 ******************************************************************************
23 * \file ihevce_enc_loop_utils.c
24 *
25 * \brief
26 *    This file contains utility functions of Encode loop
27 *
28 * \date
29 *    18/09/2012
30 *
31 * \author
32 *    Ittiam
33 *
34 *
35 * List of Functions
36 *
37 *
38 ******************************************************************************
39 */
40 
41 /*****************************************************************************/
42 /* File Includes                                                             */
43 /*****************************************************************************/
44 /* System include files */
45 #include <stdio.h>
46 #include <string.h>
47 #include <stdlib.h>
48 #include <assert.h>
49 #include <stdarg.h>
50 #include <math.h>
51 #include <limits.h>
52 
53 /* User include files */
54 #include "ihevc_typedefs.h"
55 #include "itt_video_api.h"
56 #include "ihevce_api.h"
57 
58 #include "rc_cntrl_param.h"
59 #include "rc_frame_info_collector.h"
60 #include "rc_look_ahead_params.h"
61 
62 #include "ihevc_defs.h"
63 #include "ihevc_macros.h"
64 #include "ihevc_debug.h"
65 #include "ihevc_structs.h"
66 #include "ihevc_platform_macros.h"
67 #include "ihevc_deblk.h"
68 #include "ihevc_itrans_recon.h"
69 #include "ihevc_chroma_itrans_recon.h"
70 #include "ihevc_chroma_intra_pred.h"
71 #include "ihevc_intra_pred.h"
72 #include "ihevc_inter_pred.h"
73 #include "ihevc_mem_fns.h"
74 #include "ihevc_padding.h"
75 #include "ihevc_weighted_pred.h"
76 #include "ihevc_sao.h"
77 #include "ihevc_resi_trans.h"
78 #include "ihevc_quant_iquant_ssd.h"
79 #include "ihevc_cabac_tables.h"
80 #include "ihevc_common_tables.h"
81 
82 #include "ihevce_defs.h"
83 #include "ihevce_hle_interface.h"
84 #include "ihevce_lap_enc_structs.h"
85 #include "ihevce_multi_thrd_structs.h"
86 #include "ihevce_multi_thrd_funcs.h"
87 #include "ihevce_me_common_defs.h"
88 #include "ihevce_had_satd.h"
89 #include "ihevce_error_codes.h"
90 #include "ihevce_bitstream.h"
91 #include "ihevce_cabac.h"
92 #include "ihevce_rdoq_macros.h"
93 #include "ihevce_function_selector.h"
94 #include "ihevce_enc_structs.h"
95 #include "ihevce_entropy_structs.h"
96 #include "ihevce_cmn_utils_instr_set_router.h"
97 #include "ihevce_ipe_instr_set_router.h"
98 #include "ihevce_decomp_pre_intra_structs.h"
99 #include "ihevce_decomp_pre_intra_pass.h"
100 #include "ihevce_enc_loop_structs.h"
101 #include "ihevce_nbr_avail.h"
102 #include "ihevce_enc_loop_utils.h"
103 #include "ihevce_sub_pic_rc.h"
104 #include "ihevce_global_tables.h"
105 #include "ihevce_bs_compute_ctb.h"
106 #include "ihevce_cabac_rdo.h"
107 #include "ihevce_deblk.h"
108 #include "ihevce_frame_process.h"
109 #include "ihevce_rc_enc_structs.h"
110 #include "hme_datatype.h"
111 #include "hme_interface.h"
112 #include "hme_common_defs.h"
113 #include "hme_defs.h"
114 #include "hme_common_utils.h"
115 #include "ihevce_me_instr_set_router.h"
116 #include "ihevce_enc_subpel_gen.h"
117 #include "ihevce_inter_pred.h"
118 #include "ihevce_mv_pred.h"
119 #include "ihevce_mv_pred_merge.h"
120 #include "ihevce_enc_loop_inter_mode_sifter.h"
121 #include "ihevce_enc_cu_recursion.h"
122 #include "ihevce_enc_loop_pass.h"
123 #include "ihevce_common_utils.h"
124 #include "ihevce_dep_mngr_interface.h"
125 #include "ihevce_sao.h"
126 #include "ihevce_tile_interface.h"
127 #include "ihevce_profile.h"
128 #include "ihevce_stasino_helpers.h"
129 #include "ihevce_tu_tree_selector.h"
130 
131 /*****************************************************************************/
132 /* Globals                                                                   */
133 /*****************************************************************************/
134 
135 extern UWORD16 gau2_ihevce_cabac_bin_to_bits[64 * 2];
136 extern const UWORD8 gu1_hevce_scan4x4[3][16];
137 extern const UWORD8 gu1_hevce_sigcoeff_ctxtinc[4][16];
138 extern const UWORD8 gu1_hevce_sigcoeff_ctxtinc_tr4[16];
139 extern const UWORD8 gu1_hevce_sigcoeff_ctxtinc_00[16];
140 
141 /*****************************************************************************/
142 /* Constant Macros                                                           */
143 /*****************************************************************************/
144 #define ENABLE_ZERO_CBF 1
145 #define DISABLE_RDOQ_INTRA 0
146 
147 /*****************************************************************************/
148 /* Function Definitions                                                      */
149 /*****************************************************************************/
/*!
******************************************************************************
* \if Function name : ihevce_tu_tree_update \endif
*
* \brief
*    Recursively walks a packed TU split tree and writes one tu_prms_t entry
*    (size, x/y offset, early CBF) per resulting TU
*
* \param[in,out] ps_tu_prms : entry to start writing at; its u1_tu_size on
*    entry is read as the size of the depth-0 TU
* \param[in,out] pnum_tu_in_cu : running TU count, incremented per TU written
* \param[in] depth : depth of the current node (size is tu_size >> depth)
* \param[in] tu_split_flag : packed split flags; bit 0 is the split flag of
*    the current node, higher bits carry the flags of its children
* \param[in] tu_early_cbf : packed early CBF flags, same layout as split flags
* \param[in] i4_x_off : x offset of the current node
* \param[in] i4_y_off : y offset of the current node
*
* \return
*    Entry following the last one written (returned as void *), or the last
*    written entry itself once the TU count has reached MAX_TU_IN_CTB
*
******************************************************************************
*/
void *ihevce_tu_tree_update(
    tu_prms_t *ps_tu_prms,
    WORD32 *pnum_tu_in_cu,
    WORD32 depth,
    WORD32 tu_split_flag,
    WORD32 tu_early_cbf,
    WORD32 i4_x_off,
    WORD32 i4_y_off)
{
    /* The depth-0 size travels in the entry that is about to be filled */
    WORD32 root_size = ps_tu_prms->u1_tu_size;
    WORD32 node_size = root_size >> depth;
    WORD32 child_size = root_size >> (depth + 1);
    WORD32 is_split = tu_split_flag & 0x1;
    WORD32 child;

    if(is_split && (node_size >= 16))
    {
        /* A 32x32 node packs 5 flag bits per child, any other node 1 bit. */
        /* Child 0 sits in the most significant group, child 3 just above  */
        /* bit 0 (which is the node's own flag).                           */
        WORD32 bits_per_child = (32 == node_size) ? 5 : 1;
        WORD32 child_mask = (32 == node_size) ? 0x1F : 0x1;

        /* Children are visited in TL, TR, BL, BR order */
        for(child = 0; child < 4; child++)
        {
            WORD32 shift = 1 + (bits_per_child * (3 - child));
            WORD32 x_step = (child & 1) ? child_size : 0;
            WORD32 y_step = (child >> 1) ? child_size : 0;

            ps_tu_prms = (tu_prms_t *)ihevce_tu_tree_update(
                ps_tu_prms,
                pnum_tu_in_cu,
                depth + 1,
                (tu_split_flag >> shift) & child_mask,
                (tu_early_cbf >> shift) & child_mask,
                i4_x_off + x_step,
                i4_y_off + y_step);
        }

        return ps_tu_prms;
    }

    if(is_split)
    {
        /* Split node smaller than 16 (the 8x8 case): emit its four */
        /* children directly, in TL, TR, BL, BR order               */
        (*pnum_tu_in_cu) += 4;

        for(child = 0; child < 4; child++)
        {
            tu_prms_t *ps_child = ps_tu_prms + child;

            ps_child->u1_tu_size = child_size;
            ps_child->u1_x_off = i4_x_off + ((child & 1) ? child_size : 0);
            ps_child->u1_y_off = i4_y_off + ((child >> 1) ? child_size : 0);

            /* Early CBF is not done for 4x4 transforms */
            ps_child->i4_early_cbf = 1;
        }

        /* Leave the pointer on the last child that was written */
        ps_tu_prms += 3;
    }
    else
    {
        /* Unsplit node: a single TU of the node size */
        ps_tu_prms->u1_tu_size = node_size;
        ps_tu_prms->u1_x_off = i4_x_off;
        ps_tu_prms->u1_y_off = i4_y_off;
        ps_tu_prms->i4_early_cbf = tu_early_cbf & 0x1;

        (*pnum_tu_in_cu)++;
    }

    /* Step to the next entry and seed it with the depth-0 size so that the */
    /* next call can read it back; skipped once the TU count has reached    */
    /* MAX_TU_IN_CTB                                                        */
    if((*pnum_tu_in_cu) < MAX_TU_IN_CTB)
    {
        ps_tu_prms++;

        ps_tu_prms->u1_tu_size = root_size;
    }

    return ps_tu_prms;
}
311 
312 /*!
313 ******************************************************************************
314 * \if Function name : ihevce_compute_quant_rel_param \endif
315 *
316 * \brief
317 *    This function updates quantization related parameters like qp_mod_6 etc in
318 *       context according to new qp
319 *
320 * \date
321 *    08/01/2013
322 *
323 * \author
324 *    Ittiam
325 *
326 * \return
327 *
328 * List of Functions
329 *
330 *
331 ******************************************************************************
332 */
void ihevce_compute_quant_rel_param(ihevce_enc_loop_ctxt_t *ps_ctxt, WORD8 i1_cu_qp)
{
    /* QP term contributed by the bit depth: 6 per bit above 8 */
    const WORD32 i4_qp_bdoffset = 6 * (ps_ctxt->u1_bit_depth - 8);

    /* Luma QP with the chroma QP offset applied, before chroma mapping */
    const WORD32 i4_chroma_qp_in = i1_cu_qp + ps_ctxt->i4_chroma_qp_offset;

    /* Chroma QP: capped at 51 when chroma array type is 2, otherwise */
    /* looked up in the chroma QP scale table                         */
    if(2 == ps_ctxt->u1_chroma_array_type)
    {
        ps_ctxt->i4_chrm_cu_qp = MIN(i4_chroma_qp_in, 51);
    }
    else
    {
        ps_ctxt->i4_chrm_cu_qp = gai1_ihevc_chroma_qp_scale[i4_chroma_qp_in + MAX_QP_BD_OFFSET];
    }

    /* div6 / mod6 of the bit depth adjusted luma and chroma QPs */
    ps_ctxt->i4_cu_qp_div6 = (i1_cu_qp + i4_qp_bdoffset) / 6;
    ps_ctxt->i4_cu_qp_mod6 = (i1_cu_qp + i4_qp_bdoffset) % 6;
    ps_ctxt->i4_chrm_cu_qp_div6 = (ps_ctxt->i4_chrm_cu_qp + i4_qp_bdoffset) / 6;
    ps_ctxt->i4_chrm_cu_qp_mod6 = (ps_ctxt->i4_chrm_cu_qp + i4_qp_bdoffset) % 6;

#define INTER_RND_QP_BY_6
#ifdef INTER_RND_QP_BY_6
    /* quant factor without RDOQ is 1/6th of shift for inter : like in H264 */
    ps_ctxt->i4_quant_rnd_factor[PRED_MODE_INTER] =
        (WORD32)(((1 << QUANT_ROUND_FACTOR_Q) / (float)6) + 0.5f);
#else
    /* quant factor without RDOQ is 1/3rd of shift for inter */
    ps_ctxt->i4_quant_rnd_factor[PRED_MODE_INTER] = (1 << QUANT_ROUND_FACTOR_Q) / 3;
#endif

    if(ISLICE == ps_ctxt->i1_slice_type)
    {
        /* quant factor without RDOQ is 1/3rd of shift for intra : like in H264 */
        ps_ctxt->i4_quant_rnd_factor[PRED_MODE_INTRA] =
            (WORD32)(((1 << QUANT_ROUND_FACTOR_Q) / (float)3) + 0.5f);
    }
    else
    {
        /* intra in a non-I slice uses the same rounding factor as inter */
        ps_ctxt->i4_quant_rnd_factor[PRED_MODE_INTRA] =
            ps_ctxt->i4_quant_rnd_factor[PRED_MODE_INTER];
    }
}
383 
384 /*!
385 ******************************************************************************
386 * \if Function name : ihevce_populate_cl_cu_lambda_prms \endif
387 *
388 * \brief
389 *    Function which populates the per-QP Lambda tables for the current picture
390 *
391 * \param[in,out] ps_ctxt : encode loop ctxt pointer
392 * \param[in] ps_frm_lamda : frame level lambda modifiers
393 * \param[in] i4_slice_type : current slice type
394 * \param[in] i4_temporal_lyr_id : Current picture layer id
395 * \param[in] i4_lambda_type : lambda computation type (0, 1 or 2)
396 *
397 * \return
398 *    None
399 *
400 * \author
401 *  Ittiam
402 *
403 *****************************************************************************
404 */
void ihevce_populate_cl_cu_lambda_prms(
    ihevce_enc_loop_ctxt_t *ps_ctxt,
    frm_lambda_ctxt_t *ps_frm_lamda,
    WORD32 i4_slice_type,
    WORD32 i4_temporal_lyr_id,
    WORD32 i4_lambda_type)
{
    WORD32 i4_curr_cu_qp;

    /* Bit depth term added to the QP in the primary lambda exponent. */
    /* Lambda type 0 ignores the bit depth; types 1 and 2 include it. */
    const WORD32 i4_qp_bdoffset = (0 == i4_lambda_type) ? 0 : (6 * (ps_ctxt->u1_bit_depth - 8));

    /*Populate lamda modifier */
    ps_ctxt->i4_lamda_modifier = ps_frm_lamda->lambda_modifier;
    ps_ctxt->i4_uv_lamda_modifier = ps_frm_lamda->lambda_uv_modifier;
    ps_ctxt->i4_temporal_layer_id = i4_temporal_lyr_id;

    /* One table entry per QP in the rate control QP range */
    for(i4_curr_cu_qp = ps_ctxt->ps_rc_quant_ctxt->i2_min_qp;
        i4_curr_cu_qp <= ps_ctxt->ps_rc_quant_ctxt->i2_max_qp;
        i4_curr_cu_qp++)
    {
        /* Table index: QP biased by the rate control QP offset */
        WORD32 i4_idx = i4_curr_cu_qp + ps_ctxt->ps_rc_quant_ctxt->i1_qp_offset;
        WORD32 chroma_qp;
        double lambda_modifier;
        double lambda_uv_modifier;
        double lambda;
        double lambda_uv;

        if((i4_lambda_type < 0) || (i4_lambda_type > 2))
        {
            /* Unsupported lambda type: flag it and populate nothing */
            ASSERT(0);
            continue;
        }

        chroma_qp = (ps_ctxt->i4_chroma_format == IV_YUV_422SP_UV)
                        ? MIN(i4_curr_cu_qp, 51)
                        : gai1_ihevc_chroma_qp_scale[i4_curr_cu_qp + MAX_QP_BD_OFFSET];

        /* Select the lambda modifiers; the constant modifier mode takes */
        /* precedence over the frame level / B-slice modifiers           */
        if(ps_ctxt->i4_use_const_lamda_modifier)
        {
            if(ISLICE == ps_ctxt->i1_slice_type)
            {
                lambda_modifier = ps_ctxt->f_i_pic_lamda_modifier;
                lambda_uv_modifier = ps_ctxt->f_i_pic_lamda_modifier;
            }
            else
            {
                lambda_modifier = CONST_LAMDA_MOD_VAL;
                lambda_uv_modifier = CONST_LAMDA_MOD_VAL;
            }
        }
        else if((BSLICE == i4_slice_type) && (i4_temporal_lyr_id))
        {
            /* B slices in non-zero temporal layers scale the frame */
            /* modifier by (qp - 12) / 6 clipped to [2, 4]          */
            lambda_modifier = ps_frm_lamda->lambda_modifier *
                              CLIP3((((double)(i4_curr_cu_qp - 12)) / 6.0), 2.00, 4.00);
            lambda_uv_modifier = ps_frm_lamda->lambda_uv_modifier *
                                 CLIP3((((double)(chroma_qp - 12)) / 6.0), 2.00, 4.00);
        }
        else
        {
            lambda_modifier = ps_frm_lamda->lambda_modifier;
            lambda_uv_modifier = ps_frm_lamda->lambda_uv_modifier;
        }

        /* ---------------- primary lambda tables ---------------- */
        lambda = pow(2.0, (((double)(i4_curr_cu_qp + i4_qp_bdoffset - 12)) / 3.0));
        lambda_uv = pow(2.0, (((double)(chroma_qp + i4_qp_bdoffset - 12)) / 3.0));

        lambda *= lambda_modifier;
        lambda_uv *= lambda_uv_modifier;

        /* For lambda type 2 this entry is overwritten further below */
        ps_ctxt->au4_chroma_cost_weighing_factor_array[i4_idx] =
            (UWORD32)((lambda / lambda_uv) * (1 << CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT));

        ps_ctxt->i8_cl_ssd_lambda_qf_array[i4_idx] = (LWORD64)(lambda * (1 << LAMBDA_Q_SHIFT));

        ps_ctxt->i8_cl_ssd_lambda_chroma_qf_array[i4_idx] =
            (LWORD64)(lambda_uv * (1 << LAMBDA_Q_SHIFT));

        /* SATD lambda carries an extra 1.9 factor unless the constant */
        /* lambda modifier mode is enabled                             */
        if(ps_ctxt->i4_use_const_lamda_modifier)
        {
            ps_ctxt->i4_satd_lamda_array[i4_idx] = (WORD32)(sqrt(lambda) * (1 << LAMBDA_Q_SHIFT));
        }
        else
        {
            ps_ctxt->i4_satd_lamda_array[i4_idx] =
                (WORD32)(sqrt(lambda * 1.9) * (1 << LAMBDA_Q_SHIFT));
        }

        ps_ctxt->i4_sad_lamda_array[i4_idx] = (WORD32)(sqrt(lambda) * (1 << LAMBDA_Q_SHIFT));

        /* ---------------- type2 lambda tables ---------------- */
        if(2 == i4_lambda_type)
        {
            /* lambda corresponding to 8- bit, for metrics based on 8- bit ( Example 8bit SAD in encloop)*/
            lambda = pow(2.0, (((double)(i4_curr_cu_qp - 12)) / 3.0));
            lambda_uv = pow(2.0, (((double)(chroma_qp - 12)) / 3.0));

            lambda *= lambda_modifier;
            lambda_uv *= lambda_uv_modifier;

            ps_ctxt->au4_chroma_cost_weighing_factor_array[i4_idx] =
                (UWORD32)((lambda / lambda_uv) * (1 << CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT));

            ps_ctxt->i8_cl_ssd_type2_lambda_qf_array[i4_idx] =
                (LWORD64)(lambda * (1 << LAMBDA_Q_SHIFT));

            ps_ctxt->i8_cl_ssd_type2_lambda_chroma_qf_array[i4_idx] =
                (LWORD64)(lambda_uv * (1 << LAMBDA_Q_SHIFT));

            if(ps_ctxt->i4_use_const_lamda_modifier)
            {
                ps_ctxt->i4_satd_type2_lamda_array[i4_idx] =
                    (WORD32)(sqrt(lambda) * (1 << LAMBDA_Q_SHIFT));
            }
            else
            {
                ps_ctxt->i4_satd_type2_lamda_array[i4_idx] =
                    (WORD32)(sqrt(lambda * 1.9) * (1 << LAMBDA_Q_SHIFT));
            }

            ps_ctxt->i4_sad_type2_lamda_array[i4_idx] =
                (WORD32)(sqrt(lambda) * (1 << LAMBDA_Q_SHIFT));
        }
        else
        {
            /* Types 0 and 1: type2 tables mirror the primary tables */
            ps_ctxt->i8_cl_ssd_type2_lambda_qf_array[i4_idx] =
                ps_ctxt->i8_cl_ssd_lambda_qf_array[i4_idx];

            ps_ctxt->i8_cl_ssd_type2_lambda_chroma_qf_array[i4_idx] =
                ps_ctxt->i8_cl_ssd_lambda_chroma_qf_array[i4_idx];

            ps_ctxt->i4_satd_type2_lamda_array[i4_idx] = ps_ctxt->i4_satd_lamda_array[i4_idx];

            ps_ctxt->i4_sad_type2_lamda_array[i4_idx] = ps_ctxt->i4_sad_lamda_array[i4_idx];
        }
    }
}
623 
624 /*!
625 ******************************************************************************
626 * \if Function name : ihevce_get_cl_cu_lambda_prms \endif
627 *
628 * \brief
629 *    Function which loads the Lambda params for the given CU QP from the
630 *    per-QP Lambda tables held in the encode loop ctxt
631 * \param[in,out] ps_ctxt : encode loop ctxt pointer; supplies the per-QP
632 *    lambda tables and receives the selected lambda values
633 * \param[in] i4_cur_cu_qp : QP of the current CU, used to index the luma
634 *    tables; the chroma table index is derived from it together with
635 *    the chroma QP offset
636 *
637 * \return
638 *    None
639 *
640 * \author
641 *  Ittiam
642 *
643 *****************************************************************************
644 */
void ihevce_get_cl_cu_lambda_prms(ihevce_enc_loop_ctxt_t *ps_ctxt, WORD32 i4_cur_cu_qp)
{
    /* Bias applied to a QP to obtain its index into the lambda tables */
    WORD32 i4_tbl_bias = ps_ctxt->ps_rc_quant_ctxt->i1_qp_offset;
    WORD32 i4_luma_idx = i4_cur_cu_qp + i4_tbl_bias;
    WORD32 i4_chroma_qp_in = i4_cur_cu_qp + ps_ctxt->i4_chroma_qp_offset;
    WORD32 i4_chroma_idx;
    WORD32 chroma_qp;

    /* Chroma QP: capped at 51 when chroma array type is 2, otherwise */
    /* looked up in the chroma QP scale table                         */
    if(2 == ps_ctxt->u1_chroma_array_type)
    {
        chroma_qp = MIN(i4_chroma_qp_in, 51);
    }
    else
    {
        chroma_qp = gai1_ihevc_chroma_qp_scale[i4_chroma_qp_in + MAX_QP_BD_OFFSET];
    }

    i4_chroma_idx = chroma_qp + i4_tbl_bias;

    /* closed loop ssd lambda is same as final lambda */
    ps_ctxt->i8_cl_ssd_lambda_qf = ps_ctxt->i8_cl_ssd_lambda_qf_array[i4_luma_idx];

    /* chroma lambda and chroma cost weight are indexed by the chroma QP */
    ps_ctxt->i8_cl_ssd_lambda_chroma_qf = ps_ctxt->i8_cl_ssd_lambda_chroma_qf_array[i4_chroma_idx];
    ps_ctxt->u4_chroma_cost_weighing_factor =
        ps_ctxt->au4_chroma_cost_weighing_factor_array[i4_chroma_idx];

    /* SATD lambda comes from the primary table, SAD lambda from the */
    /* type2 table                                                   */
    ps_ctxt->i4_satd_lamda = ps_ctxt->i4_satd_lamda_array[i4_luma_idx];
    ps_ctxt->i4_sad_lamda = ps_ctxt->i4_sad_type2_lamda_array[i4_luma_idx];
}
669 
670 /*!
671 ******************************************************************************
672 * \if Function name : ihevce_update_pred_qp \endif
673 *
674 * \brief
675 *    Computes pred qp for the given CU
676 *
677 * \param[in]
678 *
679 * \return
680 *
681 *
682 * \author
683 *  Ittiam
684 *
685 *****************************************************************************
686 */
void ihevce_update_pred_qp(ihevce_enc_loop_ctxt_t *ps_ctxt, WORD32 cu_pos_x, WORD32 cu_pos_y)
{
    WORD32 i4_above_qp;
    WORD32 i4_left_qp;

    /* CTB start: nothing above or to the left, reuse the previous QP */
    if((0 == cu_pos_x) && (0 == cu_pos_y))
    {
        ps_ctxt->i4_pred_qp = ps_ctxt->i4_prev_QP;
        return;
    }

    /* A neighbour on the CTB boundary falls back to the previous QP;   */
    /* one inside the CTB is read from the QP map (8 entries per row)   */
    i4_above_qp = (0 == cu_pos_y) ? ps_ctxt->i4_prev_QP
                                  : ps_ctxt->ai4_qp_qg[((cu_pos_y - 1) * 8) + cu_pos_x];

    i4_left_qp = (0 == cu_pos_x) ? ps_ctxt->i4_prev_QP
                                 : ps_ctxt->ai4_qp_qg[(cu_pos_y * 8) + (cu_pos_x - 1)];

    /* Predicted QP is the rounded average of the left and top QPs */
    ps_ctxt->i4_pred_qp = (i4_left_qp + i4_above_qp + 1) >> 1;
}
718 /*!
719 ******************************************************************************
720 * \if Function name : ihevce_compute_cu_level_QP \endif
721 *
722 * \brief
723 *    Computes cu level QP with Traqo,Spatial Mod and In-frame RC
724 *
725 * \param[in]
726 *
727 * \return
728 *
729 *
730 * \author
731 *  Ittiam
732 *
733 *****************************************************************************
734 */
void ihevce_compute_cu_level_QP(
    ihevce_enc_loop_ctxt_t *ps_ctxt,
    WORD32 i4_activity_for_qp,
    WORD32 i4_activity_for_lamda,
    WORD32 i4_reduce_qp)
{
    WORD32 i4_frm_qp = ps_ctxt->i4_frame_mod_qp;
    WORD32 i4_qp_ceil;
    WORD32 i4_qp_floor;
    WORD32 i4_qscale;
    WORD32 i4_qp;

    /* Allowed QP window: around the predicted QP when sub-pic level RC */
    /* is on, otherwise around the frame level QP                       */
    if(ps_ctxt->i4_sub_pic_level_rc)
    {
        WORD32 i4_ref_qp = ps_ctxt->i4_pred_qp;

        i4_qp_ceil = (i4_ref_qp + (25 + (ps_ctxt->ps_rc_quant_ctxt->i1_qp_offset / 2)));
        i4_qp_floor = (i4_ref_qp - (26 + (ps_ctxt->ps_rc_quant_ctxt->i1_qp_offset / 2)));
    }
    else
    {
        i4_qp_ceil = (i4_frm_qp + (7 + (ps_ctxt->ps_rc_quant_ctxt->i1_qp_offset / 4)));
        i4_qp_floor = (i4_frm_qp - (18 + (ps_ctxt->ps_rc_quant_ctxt->i1_qp_offset / 4)));
    }

    /* Nothing is updated for B slices at quality preset P6 */
    if((BSLICE == ps_ctxt->i1_slice_type) && (IHEVCE_QUALITY_P6 == ps_ctxt->i4_quality_preset))
    {
        return;
    }

#if LAMDA_BASED_ON_QUANT
    i4_activity_for_lamda = i4_activity_for_qp;
#endif

    /* ------------------------- CU QP ------------------------- */
    /* An activity of -1 means the QP is left untouched */
    if(-1 != i4_activity_for_qp)
    {
        /* Frame QP -> qscale */
        i4_qscale = ps_ctxt->ps_rc_quant_ctxt
                        ->pi4_qp_to_qscale[i4_frm_qp + ps_ctxt->ps_rc_quant_ctxt->i1_qp_offset];

        if(ps_ctxt->i4_qp_mod)
        {
            /* Scale the qscale by the activity factor, with rounding */
            ASSERT(i4_activity_for_qp > 0);
            i4_qscale =
                ((i4_qscale * i4_activity_for_qp) + (1 << (QP_LEVEL_MOD_ACT_FACTOR - 1))) >>
                QP_LEVEL_MOD_ACT_FACTOR;
        }

        /* To avoid access of uninitialised Qscale to qp conversion table */
        i4_qscale = (i4_qscale > ps_ctxt->ps_rc_quant_ctxt->i2_max_qscale)
                        ? ps_ctxt->ps_rc_quant_ctxt->i2_max_qscale
                        : ((i4_qscale < ps_ctxt->ps_rc_quant_ctxt->i2_min_qscale)
                               ? ps_ctxt->ps_rc_quant_ctxt->i2_min_qscale
                               : i4_qscale);

        /* qscale -> QP */
        i4_qp = ps_ctxt->ps_rc_quant_ctxt->pi4_qscale_to_qp[i4_qscale];

        /* Optional one step QP reduction, never taking the QP below 1 */
        if((1 == i4_reduce_qp) && (i4_qp > 1))
        {
            i4_qp--;
        }

        /* Keep the QP inside the allowed window computed above */
        i4_qp = (i4_qp > i4_qp_ceil) ? i4_qp_ceil : ((i4_qp < i4_qp_floor) ? i4_qp_floor : i4_qp);

        /* Keep the QP inside the rate control min / max QP range */
        i4_qp = (i4_qp > ps_ctxt->ps_rc_quant_ctxt->i2_max_qp)
                    ? ps_ctxt->ps_rc_quant_ctxt->i2_max_qp
                    : ((i4_qp < ps_ctxt->ps_rc_quant_ctxt->i2_min_qp)
                           ? ps_ctxt->ps_rc_quant_ctxt->i2_min_qp
                           : i4_qp);

        /* Publish the CU QP and refresh the quant params that depend on it */
        ps_ctxt->i4_cu_qp = i4_qp;
        ihevce_compute_quant_rel_param(ps_ctxt, i4_qp);
    }

    /* ------------------------- CU lambda ------------------------- */
    /* Derived independently of the QP above; -1 skips the update */
    if(-1 != i4_activity_for_lamda)
    {
        i4_qscale = ps_ctxt->ps_rc_quant_ctxt
                        ->pi4_qp_to_qscale[i4_frm_qp + ps_ctxt->ps_rc_quant_ctxt->i1_qp_offset];

        if(ps_ctxt->i4_qp_mod)
        {
#if MODULATE_LAMDA_WHEN_SPATIAL_MOD_ON
            /* Scale the qscale by the lambda activity factor, with rounding */
            ASSERT(i4_activity_for_lamda > 0);
            i4_qscale =
                ((i4_qscale * i4_activity_for_lamda) + (1 << (QP_LEVEL_MOD_ACT_FACTOR - 1))) >>
                QP_LEVEL_MOD_ACT_FACTOR;
#endif
        }

        i4_qscale = (i4_qscale > ps_ctxt->ps_rc_quant_ctxt->i2_max_qscale)
                        ? ps_ctxt->ps_rc_quant_ctxt->i2_max_qscale
                        : ((i4_qscale < ps_ctxt->ps_rc_quant_ctxt->i2_min_qscale)
                               ? ps_ctxt->ps_rc_quant_ctxt->i2_min_qscale
                               : i4_qscale);

        i4_qp = ps_ctxt->ps_rc_quant_ctxt->pi4_qscale_to_qp[i4_qscale];

        /* Same two clamps as for the CU QP */
        i4_qp = (i4_qp > i4_qp_ceil) ? i4_qp_ceil : ((i4_qp < i4_qp_floor) ? i4_qp_floor : i4_qp);

        i4_qp = (i4_qp > ps_ctxt->ps_rc_quant_ctxt->i2_max_qp)
                    ? ps_ctxt->ps_rc_quant_ctxt->i2_max_qp
                    : ((i4_qp < ps_ctxt->ps_rc_quant_ctxt->i2_min_qp)
                           ? ps_ctxt->ps_rc_quant_ctxt->i2_min_qp
                           : i4_qp);

        /* Load the lambda params: for the modulated QP when lambda */
        /* modulation is compiled in, else for the frame QP         */
        ihevce_get_cl_cu_lambda_prms(
            ps_ctxt, MODULATE_LAMDA_WHEN_SPATIAL_MOD_ON ? i4_qp : ps_ctxt->i4_frame_qp);
    }
}
847 
/**
*******************************************************************************
* \if Function name : ihevce_update_cu_level_qp_lamda \endif
*
* \brief
*    Re-derives the CU level QP (and optionally lambda) for the given
*    CU size / transform size combination
*
* \par   Description
*    Selects which entries of ps_cu_analyse->i4_act_factor[][] are to be used
*    as the activity factor for QP and for lambda, hands them to
*    ihevce_compute_cu_level_QP() and finally copies the resulting
*    ps_ctxt->i4_cu_qp into ps_cu_analyse->i1_cu_qp.
*
*    Activity index used for QP:
*      CU 64 / CU 32 : 0 for TU 32, 1 for TU 16, 2 for TU 8 and TU 4
*      CU 16         : 0 for TU 16, 1 for TU 8 and TU 4
*      CU 8          : 1
*    Activity index used for lambda: 3 for CU 64, 0 otherwise.
*
*    When ps_ctxt->i4_use_ctb_level_lamda is set, -1 is passed in place of
*    the lambda activity factor.
*
* \param[inout] ps_ctxt       Encode loop context (i4_cu_qp is read back)
* \param[inout] ps_cu_analyse CU analyse structure (i1_cu_qp is updated)
* \param[in]    trans_size    Transform size (32, 16, 8 or 4)
* \param[in]    is_intra      Second index into i4_act_factor[][]
*
* \returns none
*
*******************************************************************************
*/
void ihevce_update_cu_level_qp_lamda(
    ihevce_enc_loop_ctxt_t *ps_ctxt, cu_analyse_t *ps_cu_analyse, WORD32 trans_size, WORD32 is_intra)
{
    /* rows of i4_act_factor[][] used for QP and for lambda modulation */
    WORD32 qp_act_idx = 0;
    WORD32 lamda_act_idx = 0;
    /* -1 is handed over when the CTB level lambda is to be retained */
    WORD32 i4_act_for_lamda = -1;

    switch(ps_cu_analyse->u1_cu_size)
    {
    case 64:
        ASSERT((trans_size == 32) || (trans_size == 16) || (trans_size == 8) || (trans_size == 4));
        if(trans_size == 16)
        {
            qp_act_idx = 1;
        }
        else if((trans_size == 8) || (trans_size == 4))
        {
            qp_act_idx = 2;
        }
        lamda_act_idx = 3;
        break;

    case 32:
        ASSERT((trans_size == 32) || (trans_size == 16) || (trans_size == 8) || (trans_size == 4));
        if(trans_size == 16)
        {
            qp_act_idx = 1;
        }
        else if((trans_size == 8) || (trans_size == 4))
        {
            qp_act_idx = 2;
        }
        break;

    case 16:
        ASSERT((trans_size == 16) || (trans_size == 8) || (trans_size == 4));
        if((trans_size == 8) || (trans_size == 4))
        {
            qp_act_idx = 1;
        }
        break;

    case 8:
        ASSERT((trans_size == 8) || (trans_size == 4));
        qp_act_idx = 1;
        break;

    default:
        /* unsupported CU size */
        ASSERT(0);
        break;
    }

    if(!ps_ctxt->i4_use_ctb_level_lamda)
    {
        i4_act_for_lamda = ps_cu_analyse->i4_act_factor[lamda_act_idx][is_intra];
    }

    ihevce_compute_cu_level_QP(
        ps_ctxt, ps_cu_analyse->i4_act_factor[qp_act_idx][is_intra], i4_act_for_lamda, 0);

    /* cu qp computed above must be reflected in the cu_analyse_t struct */
    ps_cu_analyse->i1_cu_qp = ps_ctxt->i4_cu_qp;
}
898 
899 /**
900 *******************************************************************************
901 * \if Function name : ihevce_scan_coeffs \endif
902 *
903 * @brief * Computes the coeff buffer for a coded TU for entropy coding
904 *
905 * @par   Description
906 * Computes the coeff buffer for a coded TU for entropy coding
907 *
908 * \param[in] pi2_quan_coeffs Quantized coefficient context
909 *
910 * \param[in] scan_idx Scan index specifying the scan order
911 *
912 * \param[in] trans_size Transform unit size
913 *
914 * \param[inout] pu1_out_data output coeff buffer for a coded TU for entropy coding
915 *
916 * \param[in] pu1_csbf_buf csb flag buffer
917 *
918 * @returns num_bytes
919 * Number of bytes written to pu1_out_data
920 *
921 * @remarks
922 *
923 * \author
924 *  Ittiam
925 *
926 *******************************************************************************
927 */
928 
ihevce_scan_coeffs(WORD16 * pi2_quant_coeffs,WORD32 * pi4_subBlock2csbfId_map,WORD32 scan_idx,WORD32 trans_size,UWORD8 * pu1_out_data,UWORD8 * pu1_csbf_buf,WORD32 i4_csbf_stride)929 WORD32 ihevce_scan_coeffs(
930     WORD16 *pi2_quant_coeffs,
931     WORD32 *pi4_subBlock2csbfId_map,
932     WORD32 scan_idx,
933     WORD32 trans_size,
934     UWORD8 *pu1_out_data,
935     UWORD8 *pu1_csbf_buf,
936     WORD32 i4_csbf_stride)
937 {
938     WORD32 i, trans_unit_idx, num_gt1_flag;
939     UWORD16 u2_csbf0flags;
940     WORD32 num_bytes = 0;
941     UWORD8 *pu1_trans_table;
942     UWORD8 *pu1_csb_table;
943     WORD32 shift_value, mask_value;
944     UWORD16 u2_sig_coeff_abs_gt0_flags = 0, u2_sig_coeff_abs_gt1_flags = 0;
945     UWORD16 u2_sign_flags;
946     UWORD16 u2_abs_coeff_remaining[16];
947     WORD32 blk_row, blk_col;
948 
949     UWORD8 *pu1_out_data_header;
950     UWORD16 *pu2_out_data_coeff;
951 
952     WORD32 x_pos, y_pos;
953     WORD32 quant_coeff;
954 
955     WORD32 num_gt0_flag;
956     (void)i4_csbf_stride;
957     pu1_out_data_header = pu1_out_data;
958     /* Need only last 3 bits, rest are reserved for debugging and making */
959     /* WORD alignment */
960     u2_csbf0flags = 0xBAD0;
961 
962     /* Select proper order for your transform unit and csb based on scan_idx*/
963     /* and the trans_size */
964 
965     /* scan order inside a csb */
966     pu1_csb_table = (UWORD8 *)&(g_u1_scan_table_4x4[scan_idx][0]);
967     /* GETRANGE will give the log_2 of trans_size to shift_value */
968     GETRANGE(shift_value, trans_size);
969     shift_value = shift_value - 3; /* for finding. row no. from scan index */
970     mask_value = (trans_size / 4) - 1; /*for finding the col. no. from scan index*/
971     switch(trans_size)
972     {
973     case 32:
974         pu1_trans_table = (UWORD8 *)&(g_u1_scan_table_8x8[scan_idx][0]);
975         break;
976     case 16:
977         pu1_trans_table = (UWORD8 *)&(g_u1_scan_table_4x4[scan_idx][0]);
978         break;
979     case 8:
980         pu1_trans_table = (UWORD8 *)&(g_u1_scan_table_2x2[scan_idx][0]);
981         break;
982     case 4:
983         pu1_trans_table = (UWORD8 *)&(g_u1_scan_table_1x1[0]);
984         break;
985     default:
986         DBG_PRINTF("Invalid Trans Size\n");
987         return -1;
988         break;
989     }
990 
991     /*go through each csb in the scan order for first non-zero coded sub-block*/
992     for(trans_unit_idx = (trans_size * trans_size / 16) - 1; trans_unit_idx >= 0; trans_unit_idx--)
993     {
994         /* check for the first csb flag in our scan order */
995         if(pu1_csbf_buf[pi4_subBlock2csbfId_map[pu1_trans_table[trans_unit_idx]]])
996         {
997             UWORD8 u1_last_x, u1_last_y;
998             /* row of csb */
999             blk_row = pu1_trans_table[trans_unit_idx] >> shift_value;
1000             /* col of csb */
1001             blk_col = pu1_trans_table[trans_unit_idx] & mask_value;
1002 
1003             /*check for the 1st non-0 values inside the csb in our scan order*/
1004             for(i = 15; i >= 0; i--)
1005             {
1006                 x_pos = (pu1_csb_table[i] & 0x3) + blk_col * 4;
1007                 y_pos = (pu1_csb_table[i] >> 2) + blk_row * 4;
1008 
1009                 quant_coeff = pi2_quant_coeffs[x_pos + (y_pos * trans_size)];
1010 
1011                 if(quant_coeff != 0)
1012                     break;
1013             }
1014 
1015             ASSERT(i >= 0);
1016 
1017             u1_last_x = x_pos;
1018             u1_last_y = y_pos;
1019 
1020             /* storing last_x and last_y */
1021             *pu1_out_data_header = u1_last_x;
1022             pu1_out_data_header++;
1023             num_bytes++;
1024             *pu1_out_data_header = u1_last_y;
1025             pu1_out_data_header++;
1026             num_bytes++;
1027 
1028             /* storing the scan order */
1029             *pu1_out_data_header = scan_idx;
1030             pu1_out_data_header++;
1031             num_bytes++;
1032             /* storing last_sub_block pos. in scan order count */
1033             *pu1_out_data_header = trans_unit_idx;
1034             pu1_out_data_header++;
1035             num_bytes++;
1036 
1037             /*stored the first 4 bytes, now all are word16. So word16 pointer*/
1038             pu2_out_data_coeff = (UWORD16 *)pu1_out_data_header;
1039 
1040             /* u2_csbf0flags word */
1041             u2_csbf0flags = 0xBAD0 | 1; /*since right&bottom csbf is 0*/
1042             /* storing u2_csbf0flags word */
1043             *pu2_out_data_coeff = u2_csbf0flags;
1044             pu2_out_data_coeff++;
1045             num_bytes += 2;
1046 
1047             num_gt0_flag = 1;
1048             num_gt1_flag = 0;
1049             u2_sign_flags = 0;
1050 
1051             /* set the i th bit of u2_sig_coeff_abs_gt0_flags */
1052             u2_sig_coeff_abs_gt0_flags = u2_sig_coeff_abs_gt0_flags | (1 << i);
1053             if(abs(quant_coeff) > 1)
1054             {
1055                 /* set the i th bit of u2_sig_coeff_abs_gt1_flags */
1056                 u2_sig_coeff_abs_gt1_flags = u2_sig_coeff_abs_gt1_flags | (1 << i);
1057                 /* update u2_abs_coeff_remaining */
1058                 u2_abs_coeff_remaining[num_gt1_flag] = (UWORD16)abs(quant_coeff) - 1;
1059 
1060                 num_gt1_flag++;
1061             }
1062 
1063             if(quant_coeff < 0)
1064             {
1065                 /* set the i th bit of u2_sign_flags */
1066                 u2_sign_flags = u2_sign_flags | (1 << i);
1067             }
1068 
1069             /* Test remaining elements in our scan order */
1070             /* Can optimize further by CLZ macro */
1071             for(i = i - 1; i >= 0; i--)
1072             {
1073                 x_pos = (pu1_csb_table[i] & 0x3) + blk_col * 4;
1074                 y_pos = (pu1_csb_table[i] >> 2) + blk_row * 4;
1075 
1076                 quant_coeff = pi2_quant_coeffs[x_pos + (y_pos * trans_size)];
1077 
1078                 if(quant_coeff != 0)
1079                 {
1080                     /* set the i th bit of u2_sig_coeff_abs_gt0_flags */
1081                     u2_sig_coeff_abs_gt0_flags |= (1 << i);
1082 
1083                     if((abs(quant_coeff) > 1) || (num_gt0_flag >= MAX_GT_ONE))
1084                     {
1085                         /* set the i th bit of u2_sig_coeff_abs_gt1_flags */
1086                         u2_sig_coeff_abs_gt1_flags |= (1 << i);
1087 
1088                         /* update u2_abs_coeff_remaining */
1089                         u2_abs_coeff_remaining[num_gt1_flag] = (UWORD16)abs(quant_coeff) - 1;
1090 
1091                         num_gt1_flag++; /*n0. of Ones in sig_coeff_abs_gt1_flag*/
1092                     }
1093 
1094                     if(quant_coeff < 0)
1095                     {
1096                         /* set the i th bit of u2_sign_flags */
1097                         u2_sign_flags |= (1 << i);
1098                     }
1099 
1100                     num_gt0_flag++;
1101                 }
1102             }
1103 
1104             /* storing u2_sig_coeff_abs_gt0_flags 2 bytes */
1105             *pu2_out_data_coeff = u2_sig_coeff_abs_gt0_flags;
1106             pu2_out_data_coeff++;
1107             num_bytes += 2;
1108             /* storing u2_sig_coeff_abs_gt1_flags 2 bytes */
1109             *pu2_out_data_coeff = u2_sig_coeff_abs_gt1_flags;
1110             pu2_out_data_coeff++;
1111             num_bytes += 2;
1112             /* storing u2_sign_flags 2 bytes */
1113             *pu2_out_data_coeff = u2_sign_flags;
1114             pu2_out_data_coeff++;
1115             num_bytes += 2;
1116 
1117             /* Store the u2_abs_coeff_remaining[] */
1118             for(i = 0; i < num_gt1_flag; i++)
1119             {
1120                 /* storing u2_abs_coeff_remaining[i] 2 bytes */
1121                 *pu2_out_data_coeff = u2_abs_coeff_remaining[i];
1122                 pu2_out_data_coeff++;
1123                 num_bytes += 2;
1124             }
1125 
1126             break; /*We just need this loop for finding 1st non-zero csb only*/
1127         }
1128     }
1129 
1130     /* go through remaining csb in the scan order */
1131     for(trans_unit_idx = trans_unit_idx - 1; trans_unit_idx >= 0; trans_unit_idx--)
1132     {
1133         blk_row = pu1_trans_table[trans_unit_idx] >> shift_value; /*row of csb*/
1134         blk_col = pu1_trans_table[trans_unit_idx] & mask_value; /*col of csb*/
1135 
1136         /* u2_csbf0flags word */
1137         u2_csbf0flags = 0xBAD0 | /* assuming csbf_buf has only 0 or 1 values */
1138                         (pu1_csbf_buf[pi4_subBlock2csbfId_map[pu1_trans_table[trans_unit_idx]]]);
1139 
1140         /********************************************************************/
1141         /* Minor hack: As per HEVC spec csbf in not signalled in stream for */
1142         /* block0, instead sig coeff map is directly signalled. This is     */
1143         /* taken care by forcing csbf for block0 to be 1 even if it is 0    */
1144         /********************************************************************/
1145         if(0 == trans_unit_idx)
1146         {
1147             u2_csbf0flags |= 1;
1148         }
1149 
1150         if((blk_col + 1 < trans_size / 4)) /* checking right boundary */
1151         {
1152             if(pu1_csbf_buf[pi4_subBlock2csbfId_map[blk_row * trans_size / 4 + blk_col + 1]])
1153             {
1154                 /* set the 2nd bit of u2_csbf0flags for right csbf */
1155                 u2_csbf0flags = u2_csbf0flags | (1 << 1);
1156             }
1157         }
1158         if((blk_row + 1 < trans_size / 4)) /* checking bottom oundary */
1159         {
1160             if(pu1_csbf_buf[pi4_subBlock2csbfId_map[(blk_row + 1) * trans_size / 4 + blk_col]])
1161             {
1162                 /* set the 3rd bit of u2_csbf0flags  for bottom csbf */
1163                 u2_csbf0flags = u2_csbf0flags | (1 << 2);
1164             }
1165         }
1166 
1167         /* storing u2_csbf0flags word */
1168         *pu2_out_data_coeff = u2_csbf0flags;
1169         pu2_out_data_coeff++;
1170         num_bytes += 2;
1171 
1172         /* check for the csb flag in our scan order */
1173         if(u2_csbf0flags & 0x1)
1174         {
1175             u2_sig_coeff_abs_gt0_flags = 0;
1176             u2_sig_coeff_abs_gt1_flags = 0;
1177             u2_sign_flags = 0;
1178 
1179             num_gt0_flag = 0;
1180             num_gt1_flag = 0;
1181             /* check for the non-0 values inside the csb in our scan order */
1182             /* Can optimize further by CLZ macro */
1183             for(i = 15; i >= 0; i--)
1184             {
1185                 x_pos = (pu1_csb_table[i] & 0x3) + blk_col * 4;
1186                 y_pos = (pu1_csb_table[i] >> 2) + blk_row * 4;
1187 
1188                 quant_coeff = pi2_quant_coeffs[x_pos + (y_pos * trans_size)];
1189 
1190                 if(quant_coeff != 0)
1191                 {
1192                     /* set the i th bit of u2_sig_coeff_abs_gt0_flags */
1193                     u2_sig_coeff_abs_gt0_flags |= (1 << i);
1194 
1195                     if((abs(quant_coeff) > 1) || (num_gt0_flag >= MAX_GT_ONE))
1196                     {
1197                         /* set the i th bit of u2_sig_coeff_abs_gt1_flags */
1198                         u2_sig_coeff_abs_gt1_flags |= (1 << i);
1199 
1200                         /* update u2_abs_coeff_remaining */
1201                         u2_abs_coeff_remaining[num_gt1_flag] = (UWORD16)abs(quant_coeff) - 1;
1202 
1203                         num_gt1_flag++;
1204                     }
1205 
1206                     if(quant_coeff < 0)
1207                     {
1208                         /* set the i th bit of u2_sign_flags */
1209                         u2_sign_flags = u2_sign_flags | (1 << i);
1210                     }
1211 
1212                     num_gt0_flag++;
1213                 }
1214             }
1215 
1216             /* storing u2_sig_coeff_abs_gt0_flags 2 bytes */
1217             *pu2_out_data_coeff = u2_sig_coeff_abs_gt0_flags;
1218             pu2_out_data_coeff++;
1219             num_bytes += 2;
1220 
1221             /* storing u2_sig_coeff_abs_gt1_flags 2 bytes */
1222             *pu2_out_data_coeff = u2_sig_coeff_abs_gt1_flags;
1223             pu2_out_data_coeff++;
1224             num_bytes += 2;
1225 
1226             /* storing u2_sign_flags 2 bytes */
1227             *pu2_out_data_coeff = u2_sign_flags;
1228             pu2_out_data_coeff++;
1229             num_bytes += 2;
1230 
1231             /* Store the u2_abs_coeff_remaining[] */
1232             for(i = 0; i < num_gt1_flag; i++)
1233             {
1234                 /* storing u2_abs_coeff_remaining[i] 2 bytes */
1235                 *pu2_out_data_coeff = u2_abs_coeff_remaining[i];
1236                 pu2_out_data_coeff++;
1237                 num_bytes += 2;
1238             }
1239         }
1240     }
1241 
1242     return num_bytes; /* Return the number of bytes written to out_data */
1243 }
1244 
1245 /**
1246 *******************************************************************************
1247 * \if Function name : ihevce_populate_intra_pred_mode \endif
1248 *
1249 * \brief * populates intra pred modes,b2_mpm_idx,b1_prev_intra_luma_pred_flag &
1250 * b5_rem_intra_pred_mode for a CU based on nieghbouring CUs,
1251 *
1252 * \par   Description
1253 * Computes the b1_prev_intra_luma_pred_flag, b2_mpm_idx & b5_rem_intra_pred_mode
1254 * for a CU
1255 *
1256 * \param[in] top_intra_mode Top intra mode
1257 * \param[in] left_intra_mode Left intra mode
1258 * \param[in] available_top Top availability flag
1259 * \param[in] available_left Left availability flag
1260 * \param[in] cu_pos_y CU 'y' position
1261 * \param[in] ps_cand_mode_list pointer to populate candidate list
1262 *
1263 * \returns none
1264 *
1265 * \author
1266 *  Ittiam
1267 *
1268 *******************************************************************************
1269 */
1270 
ihevce_populate_intra_pred_mode(WORD32 top_intra_mode,WORD32 left_intra_mode,WORD32 available_top,WORD32 available_left,WORD32 cu_pos_y,WORD32 * ps_cand_mode_list)1271 void ihevce_populate_intra_pred_mode(
1272     WORD32 top_intra_mode,
1273     WORD32 left_intra_mode,
1274     WORD32 available_top,
1275     WORD32 available_left,
1276     WORD32 cu_pos_y,
1277     WORD32 *ps_cand_mode_list)
1278 {
1279     /* local variables */
1280     WORD32 cand_intra_pred_mode_left, cand_intra_pred_mode_top;
1281 
1282     /* Calculate cand_intra_pred_mode_N as per sec. 8.4.2 in JCTVC-J1003_d7 */
1283     /* N = top */
1284     if(0 == available_top)
1285     {
1286         cand_intra_pred_mode_top = INTRA_DC;
1287     }
1288     /* for neighbour != INTRA, setting DC is done outside */
1289     else if(0 == cu_pos_y) /* It's on the CTB boundary */
1290     {
1291         cand_intra_pred_mode_top = INTRA_DC;
1292     }
1293     else
1294     {
1295         cand_intra_pred_mode_top = top_intra_mode;
1296     }
1297 
1298     /* N = left */
1299     if(0 == available_left)
1300     {
1301         cand_intra_pred_mode_left = INTRA_DC;
1302     }
1303     /* for neighbour != INTRA, setting DC is done outside */
1304     else
1305     {
1306         cand_intra_pred_mode_left = left_intra_mode;
1307     }
1308 
1309     /* Calculate cand_mode_list as per sec. 8.4.2 in JCTVC-J1003_d7 */
1310     if(cand_intra_pred_mode_left == cand_intra_pred_mode_top)
1311     {
1312         if(cand_intra_pred_mode_left < 2)
1313         {
1314             ps_cand_mode_list[0] = INTRA_PLANAR;
1315             ps_cand_mode_list[1] = INTRA_DC;
1316             ps_cand_mode_list[2] = INTRA_ANGULAR(26); /* angular 26 = Vertical */
1317         }
1318         else
1319         {
1320             ps_cand_mode_list[0] = cand_intra_pred_mode_left;
1321             ps_cand_mode_list[1] = 2 + ((cand_intra_pred_mode_left + 29) % 32);
1322             ps_cand_mode_list[2] = 2 + ((cand_intra_pred_mode_left - 2 + 1) % 32);
1323         }
1324     }
1325     else
1326     {
1327         ps_cand_mode_list[0] = cand_intra_pred_mode_left;
1328         ps_cand_mode_list[1] = cand_intra_pred_mode_top;
1329 
1330         if((cand_intra_pred_mode_left != INTRA_PLANAR) &&
1331            (cand_intra_pred_mode_top != INTRA_PLANAR))
1332         {
1333             ps_cand_mode_list[2] = INTRA_PLANAR;
1334         }
1335         else if((cand_intra_pred_mode_left != INTRA_DC) && (cand_intra_pred_mode_top != INTRA_DC))
1336         {
1337             ps_cand_mode_list[2] = INTRA_DC;
1338         }
1339         else
1340         {
1341             ps_cand_mode_list[2] = INTRA_ANGULAR(26);
1342         }
1343     }
1344 }
1345 /**
1346 *******************************************************************************
1347 * \if Function name : ihevce_intra_pred_mode_signaling \endif
1348 *
1349 * \brief * Computes the b1_prev_intra_luma_pred_flag, b2_mpm_idx &
1350 * b5_rem_intra_pred_mode for a CU
1351 *
1352 * \par   Description
1353 * Computes the b1_prev_intra_luma_pred_flag, b2_mpm_idx & b5_rem_intra_pred_mode
1354 * for a CU
1355 *
1356 * \param[in] ps_nbr_top Top neighbour context
1357 * \param[in] ps_nbr_left Left neighbour context
1358 * \param[in] available_top Top availability flag
1359 * \param[in] available_left Left availability flag
1360 * \param[in] cu_pos_y CU 'y' position
1361 * \param[in] luma_intra_pred_mode_current the intra_pred_mode of current block
1362 * \param[inout] ps_intra_pred_mode_current
1363 * Pointer to structure having b1_prev_intra_luma_pred_flag, b2_mpm_idx and
1364 * b5_rem_intra_pred_mode
1365 *
1366 * \returns none
1367 *
1368 * \author
1369 *  Ittiam
1370 *
1371 *******************************************************************************
1372 */
1373 
ihevce_intra_pred_mode_signaling(WORD32 top_intra_mode,WORD32 left_intra_mode,WORD32 available_top,WORD32 available_left,WORD32 cu_pos_y,WORD32 luma_intra_pred_mode_current,intra_prev_rem_flags_t * ps_intra_pred_mode_current)1374 void ihevce_intra_pred_mode_signaling(
1375     WORD32 top_intra_mode,
1376     WORD32 left_intra_mode,
1377     WORD32 available_top,
1378     WORD32 available_left,
1379     WORD32 cu_pos_y,
1380     WORD32 luma_intra_pred_mode_current,
1381     intra_prev_rem_flags_t *ps_intra_pred_mode_current)
1382 {
1383     /* local variables */
1384     WORD32 cand_intra_pred_mode_left, cand_intra_pred_mode_top;
1385     WORD32 cand_mode_list[3];
1386 
1387     ps_intra_pred_mode_current->b1_prev_intra_luma_pred_flag = 0;
1388     ps_intra_pred_mode_current->b2_mpm_idx = 0;  // for safety purpose
1389     ps_intra_pred_mode_current->b5_rem_intra_pred_mode = 0;
1390 
1391     /* Calculate cand_intra_pred_mode_N as per sec. 8.4.2 in JCTVC-J1003_d7 */
1392     /* N = top */
1393     if(0 == available_top)
1394     {
1395         cand_intra_pred_mode_top = INTRA_DC;
1396     }
1397     /* for neighbour != INTRA, setting DC is done outside */
1398     else if(0 == cu_pos_y) /* It's on the CTB boundary */
1399     {
1400         cand_intra_pred_mode_top = INTRA_DC;
1401     }
1402     else
1403     {
1404         cand_intra_pred_mode_top = top_intra_mode;
1405     }
1406 
1407     /* N = left */
1408     if(0 == available_left)
1409     {
1410         cand_intra_pred_mode_left = INTRA_DC;
1411     }
1412     /* for neighbour != INTRA, setting DC is done outside */
1413     else
1414     {
1415         cand_intra_pred_mode_left = left_intra_mode;
1416     }
1417 
1418     /* Calculate cand_mode_list as per sec. 8.4.2 in JCTVC-J1003_d7 */
1419     if(cand_intra_pred_mode_left == cand_intra_pred_mode_top)
1420     {
1421         if(cand_intra_pred_mode_left < 2)
1422         {
1423             cand_mode_list[0] = INTRA_PLANAR;
1424             cand_mode_list[1] = INTRA_DC;
1425             cand_mode_list[2] = INTRA_ANGULAR(26); /* angular 26 = Vertical */
1426         }
1427         else
1428         {
1429             cand_mode_list[0] = cand_intra_pred_mode_left;
1430             cand_mode_list[1] = 2 + ((cand_intra_pred_mode_left + 29) % 32);
1431             cand_mode_list[2] = 2 + ((cand_intra_pred_mode_left - 2 + 1) % 32);
1432         }
1433     }
1434     else
1435     {
1436         cand_mode_list[0] = cand_intra_pred_mode_left;
1437         cand_mode_list[1] = cand_intra_pred_mode_top;
1438 
1439         if((cand_intra_pred_mode_left != INTRA_PLANAR) &&
1440            (cand_intra_pred_mode_top != INTRA_PLANAR))
1441         {
1442             cand_mode_list[2] = INTRA_PLANAR;
1443         }
1444         else if((cand_intra_pred_mode_left != INTRA_DC) && (cand_intra_pred_mode_top != INTRA_DC))
1445         {
1446             cand_mode_list[2] = INTRA_DC;
1447         }
1448         else
1449         {
1450             cand_mode_list[2] = INTRA_ANGULAR(26);
1451         }
1452     }
1453 
1454     /* Signal Generation */
1455 
1456     /* Flag & mpm_index generation */
1457     if(cand_mode_list[0] == luma_intra_pred_mode_current)
1458     {
1459         ps_intra_pred_mode_current->b1_prev_intra_luma_pred_flag = 1;
1460         ps_intra_pred_mode_current->b2_mpm_idx = 0;
1461     }
1462     else if(cand_mode_list[1] == luma_intra_pred_mode_current)
1463     {
1464         ps_intra_pred_mode_current->b1_prev_intra_luma_pred_flag = 1;
1465         ps_intra_pred_mode_current->b2_mpm_idx = 1;
1466     }
1467     else if(cand_mode_list[2] == luma_intra_pred_mode_current)
1468     {
1469         ps_intra_pred_mode_current->b1_prev_intra_luma_pred_flag = 1;
1470         ps_intra_pred_mode_current->b2_mpm_idx = 2;
1471     }
1472     /* Flag & b5_rem_intra_pred_mode generation */
1473     else
1474     {
1475         WORD32 rem_mode;
1476 
1477         ps_intra_pred_mode_current->b1_prev_intra_luma_pred_flag = 0;
1478 
1479         /* sorting cand_mode_list */
1480         if(cand_mode_list[0] > cand_mode_list[1])
1481         {
1482             SWAP(cand_mode_list[0], cand_mode_list[1]);
1483         }
1484         if(cand_mode_list[0] > cand_mode_list[2])
1485         {
1486             SWAP(cand_mode_list[0], cand_mode_list[2]);
1487         }
1488         if(cand_mode_list[1] > cand_mode_list[2])
1489         {
1490             SWAP(cand_mode_list[1], cand_mode_list[2]);
1491         }
1492 
1493         rem_mode = luma_intra_pred_mode_current;
1494 
1495         if((rem_mode) >= cand_mode_list[2])
1496         {
1497             (rem_mode)--;
1498         }
1499         if((rem_mode) >= cand_mode_list[1])
1500         {
1501             (rem_mode)--;
1502         }
1503         if((rem_mode) >= cand_mode_list[0])
1504         {
1505             (rem_mode)--;
1506         }
1507         ps_intra_pred_mode_current->b5_rem_intra_pred_mode = rem_mode;
1508     }
1509 }
1510 
ihevce_quant_rounding_factor_gen(WORD32 i4_trans_size,WORD32 is_luma,rdopt_entropy_ctxt_t * ps_rdopt_entropy_ctxt,WORD32 * pi4_quant_round_0_1,WORD32 * pi4_quant_round_1_2,double i4_lamda_modifier,UWORD8 i4_is_tu_level_quant_rounding)1511 void ihevce_quant_rounding_factor_gen(
1512     WORD32 i4_trans_size,
1513     WORD32 is_luma,
1514     rdopt_entropy_ctxt_t *ps_rdopt_entropy_ctxt,
1515     WORD32 *pi4_quant_round_0_1,
1516     WORD32 *pi4_quant_round_1_2,
1517     double i4_lamda_modifier,
1518     UWORD8 i4_is_tu_level_quant_rounding)
1519 {
1520     //WORD32 i4_scan_idx = ps_ctxt->i4_scan_idx;
1521     UWORD8 *pu1_ctxt_model;
1522     WORD32 scan_pos;
1523     WORD32 sig_coeff_base_ctxt; /* cabac context for sig coeff flag    */
1524     WORD32 abs_gt1_base_ctxt;
1525     WORD32 log2_tr_size, i;
1526     UWORD16 u4_bits_estimated_r0, u4_bits_estimated_r1, u4_bits_estimated_r2;
1527     UWORD16 u4_bits_estimated_r1_temp;
1528     WORD32 j = 0;
1529     WORD32 k = 0;
1530     WORD32 temp2;
1531 
1532     double i4_lamda_mod = i4_lamda_modifier * pow(2.0, (-8.0 / 3.0));
1533     LWORD64 lamda_mod = (LWORD64)(i4_lamda_mod * (1 << LAMDA_Q_SHIFT_FACT));
1534     /* transform size to log2transform size */
1535     GETRANGE(log2_tr_size, i4_trans_size);
1536     log2_tr_size -= 1;
1537 
1538     if(1 == i4_is_tu_level_quant_rounding)
1539     {
1540         entropy_context_t *ps_cur_tu_entropy;
1541         cab_ctxt_t *ps_cabac;
1542         WORD32 curr_buf_idx = ps_rdopt_entropy_ctxt->i4_curr_buf_idx;
1543         ps_cur_tu_entropy = &ps_rdopt_entropy_ctxt->as_cu_entropy_ctxt[curr_buf_idx];
1544 
1545         ps_cabac = &ps_cur_tu_entropy->s_cabac_ctxt;
1546 
1547         pu1_ctxt_model = &ps_cabac->au1_ctxt_models[0];
1548     }
1549     else
1550     {
1551         pu1_ctxt_model = &ps_rdopt_entropy_ctxt->au1_init_cabac_ctxt_states[0];
1552     }
1553     /*If transform size is 4x4, then only one sub-block*/
1554     if(is_luma)
1555     {
1556         sig_coeff_base_ctxt = IHEVC_CAB_COEFF_FLAG;
1557         abs_gt1_base_ctxt = IHEVC_CAB_COEFABS_GRTR1_FLAG;
1558 
1559         if(3 == log2_tr_size)
1560         {
1561             /* 8x8 transform size */
1562             /* Assuming diagnol scan idx for now */
1563             sig_coeff_base_ctxt += 9;
1564         }
1565         else if(3 < log2_tr_size)
1566         {
1567             /* larger transform sizes */
1568             sig_coeff_base_ctxt += 21;
1569         }
1570     }
1571     else
1572     {
1573         /* chroma context initializations */
1574         sig_coeff_base_ctxt = IHEVC_CAB_COEFF_FLAG + 27;
1575         abs_gt1_base_ctxt = IHEVC_CAB_COEFABS_GRTR1_FLAG + 16;
1576 
1577         if(3 == log2_tr_size)
1578         {
1579             /* 8x8 transform size */
1580             sig_coeff_base_ctxt += 9;
1581         }
1582         else if(3 < log2_tr_size)
1583         {
1584             /* larger transform sizes */
1585             sig_coeff_base_ctxt += 12;
1586         }
1587     }
1588 
1589     /*Transform size of 4x4 will have only a single CSB */
1590     /* derive the context inc as per section 9.3.3.1.4 */
1591 
1592     if(2 == log2_tr_size)
1593     {
1594         UWORD8 sig_ctxinc;
1595         WORD32 state_mps;
1596         WORD32 gt1_ctxt = 0;
1597         WORD32 ctxt_set = 0;
1598         WORD32 ctxt_idx = 0;
1599 
1600         /* context set based on luma subblock pos */
1601 
1602         /* Encodet the abs level gt1 bins */
1603         /* Currently calculating trade off between mps(2) and mps(1)*/
1604         /* The estimation has to be further done for mps(11) and mps(111)*/
1605         /*ctxt_set = 0 as transform 4x4 has only one csb with DC */
1606         /* gt1_ctxt = 0 for the co-ef value to be 2 */
1607 
1608         ctxt_set = gt1_ctxt = 0;
1609         ctxt_idx = (ctxt_set * 4) + abs_gt1_base_ctxt + gt1_ctxt;
1610 
1611         state_mps = pu1_ctxt_model[ctxt_idx];
1612 
1613         u4_bits_estimated_r2 = gau2_ihevce_cabac_bin_to_bits[state_mps ^ 1];
1614 
1615         u4_bits_estimated_r1_temp = gau2_ihevce_cabac_bin_to_bits[state_mps ^ 0];
1616 
1617         QUANT_ROUND_FACTOR(temp2, u4_bits_estimated_r2, u4_bits_estimated_r1_temp, lamda_mod);
1618         for(scan_pos = 0; scan_pos < 16; scan_pos++)
1619         {
1620             *(pi4_quant_round_1_2 + scan_pos) = temp2;
1621         }
1622 
1623         for(scan_pos = 0; scan_pos < 16; scan_pos++)
1624         {
1625             //UWORD8 nbr_csbf = 1;
1626             /* derive the x,y pos */
1627             UWORD8 y_pos_x_pos = scan_pos;  //gu1_hevce_scan4x4[i4_scan_idx][scan_pos];
1628 
1629             /* 4x4 transform size increment uses lookup */
1630             sig_ctxinc = gu1_hevce_sigcoeff_ctxtinc_tr4[y_pos_x_pos];
1631 
1632             /*Get the mps state based on ctxt modes */
1633             state_mps = pu1_ctxt_model[sig_ctxinc + sig_coeff_base_ctxt];
1634 
1635             /* Bits taken to encode sig co-ef flag as 0 */
1636             u4_bits_estimated_r0 = gau2_ihevce_cabac_bin_to_bits[state_mps ^ 0];
1637 
1638             /* Bits taken to encode sig co-ef flag as 1, also account for sign bit worst case */
1639             //
1640             u4_bits_estimated_r1 =
1641                 (gau2_ihevce_cabac_bin_to_bits[state_mps ^ 1] + ROUND_Q12(1.000000000));
1642 
1643             /* Bits taken to encode sig co-ef flag as 1, also account for sign bit worst case */
1644             u4_bits_estimated_r1 += u4_bits_estimated_r1_temp;
1645 
1646             QUANT_ROUND_FACTOR(temp2, u4_bits_estimated_r1, u4_bits_estimated_r0, lamda_mod);
1647             *(pi4_quant_round_0_1 + scan_pos) = temp2;
1648         }
1649     }
1650     else
1651     {
1652         UWORD8 *pu1_hevce_sigcoeff_ctxtinc;
1653         WORD32 is_nbr_csb_state_mps;
1654 
1655         WORD32 state_mps;
1656         WORD32 gt1_ctxt = 0;
1657         WORD32 ctxt_set = 0;
1658         WORD32 ctxt_idx;
1659         /*1to2 rounding factor is same for all sub blocks except for sub-block = 0*/
1660         /*Hence will write all the sub-block with i >=1 coeff, and then overwrite for i = 0*/
1661 
1662         /*ctxt_set = 0 DC subblock, the previous state did not have 2
1663         ctxt_set = 1 DC subblock, the previous state did have >= 2
1664         ctxt_set = 2 AC subblock, the previous state did not have 2
1665         ctxt_set = 3 AC subblock, the previous state did have >= 2*/
1666         i = 1;
1667         ctxt_set = (i && is_luma) ? 2 : 0;
1668 
1669         ctxt_set++;
1670 
1671         /*0th position indicates the probability of 2 */
1672         /*1th position indicates the probability of 1 */
1673         /*2th position indicates the probability of 11 */
1674         /*3th position indicates the probability of 111 */
1675 
1676         gt1_ctxt = 0;
1677         ctxt_idx = (ctxt_set * 4) + abs_gt1_base_ctxt + gt1_ctxt;
1678 
1679         state_mps = pu1_ctxt_model[ctxt_idx];
1680 
1681         u4_bits_estimated_r2 = gau2_ihevce_cabac_bin_to_bits[state_mps ^ 1];
1682 
1683         u4_bits_estimated_r1 = gau2_ihevce_cabac_bin_to_bits[state_mps ^ 0];
1684         QUANT_ROUND_FACTOR(temp2, u4_bits_estimated_r2, u4_bits_estimated_r1, lamda_mod);
1685 
1686         for(scan_pos = 0; scan_pos < (16 * (i4_trans_size * i4_trans_size >> 4)); scan_pos++)
1687         {
1688             *(pi4_quant_round_1_2 + scan_pos) = temp2;
1689         }
1690 
1691         i = 0;
1692         ctxt_set = (i && is_luma) ? 2 : 0;
1693         ctxt_set++;
1694 
1695         /*0th position indicates the probability of 2 */
1696         /*1th position indicates the probability of 1 */
1697         /*2th position indicates the probability of 11 */
1698         /*3th position indicates the probability of 111 */
1699 
1700         gt1_ctxt = 0;
1701         ctxt_idx = (ctxt_set * 4) + abs_gt1_base_ctxt + gt1_ctxt;
1702 
1703         state_mps = pu1_ctxt_model[ctxt_idx];
1704 
1705         u4_bits_estimated_r2 = gau2_ihevce_cabac_bin_to_bits[state_mps ^ 1];
1706 
1707         u4_bits_estimated_r1 = gau2_ihevce_cabac_bin_to_bits[state_mps ^ 0];
1708         QUANT_ROUND_FACTOR(temp2, u4_bits_estimated_r2, u4_bits_estimated_r1, lamda_mod);
1709 
1710         for(scan_pos = 0; scan_pos < 16; scan_pos++)
1711         {
1712             *(pi4_quant_round_1_2 + ((scan_pos % 4) + ((scan_pos >> 2) * i4_trans_size))) = temp2;
1713         }
1714 
1715         {
1716             WORD32 ctxt_idx;
1717 
1718             WORD32 nbr_csbf_0, nbr_csbf_1;
1719             WORD32 state_mps_0, state_mps_1;
1720             ctxt_idx = IHEVC_CAB_CODED_SUBLK_IDX;
1721             ctxt_idx += is_luma ? 0 : 2;
1722 
1723             /* ctxt based on right / bottom avail csbf, section 9.3.3.1.3 */
1724             /* if neibhor not available, ctxt idx = 0*/
1725             nbr_csbf_0 = 0;
1726             ctxt_idx += nbr_csbf_0 ? 1 : 0;
1727             state_mps_0 = pu1_ctxt_model[ctxt_idx];
1728 
1729             nbr_csbf_1 = 1;
1730             ctxt_idx += nbr_csbf_1 ? 1 : 0;
1731             state_mps_1 = pu1_ctxt_model[ctxt_idx];
1732 
1733             is_nbr_csb_state_mps = ((state_mps_0 % 2) == 1) && ((state_mps_1 % 2) == 1);
1734         }
1735 
1736         if(1 == is_nbr_csb_state_mps)
1737         {
1738             for(i = 0; i < (i4_trans_size * i4_trans_size >> 4); i++)
1739             {
1740                 UWORD8 sig_ctxinc;
1741                 WORD32 state_mps;
1742                 WORD32 gt1_ctxt = 0;
1743                 WORD32 ctxt_set = 0;
1744 
1745                 WORD32 ctxt_idx;
1746 
1747                 /*Check if the cabac states had previous nbr available */
1748 
1749                 if(i == 0)
1750                     pu1_hevce_sigcoeff_ctxtinc = (UWORD8 *)&gu1_hevce_sigcoeff_ctxtinc[3][0];
1751                 else if(i < (i4_trans_size >> 2))
1752                     pu1_hevce_sigcoeff_ctxtinc = (UWORD8 *)&gu1_hevce_sigcoeff_ctxtinc[1][0];
1753                 else if((i % (i4_trans_size >> 2)) == 0)
1754                     pu1_hevce_sigcoeff_ctxtinc = (UWORD8 *)&gu1_hevce_sigcoeff_ctxtinc[2][0];
1755                 else
1756                     pu1_hevce_sigcoeff_ctxtinc = (UWORD8 *)&gu1_hevce_sigcoeff_ctxtinc[0][0];
1757 
1758                 if(((i % (i4_trans_size >> 2)) == 0) && (i != 0))
1759                     k++;
1760 
1761                 j = ((i4_trans_size * 4) * k) + ((i % (i4_trans_size >> 2)) * 4);
1762                 /*ctxt_set = 0 DC subblock, the previous state did not have 2
1763                 ctxt_set = 1 DC subblock, the previous state did have >= 2
1764                 ctxt_set = 2 AC subblock, the previous state did not have 2
1765                 ctxt_set = 3 AC subblock, the previous state did have >= 2*/
1766 
1767                 ctxt_set = (i && is_luma) ? 2 : 0;
1768 
1769                 /* gt1_ctxt = 1 for the co-ef value to be 1 */
1770                 gt1_ctxt = 0;
1771                 ctxt_idx = (ctxt_set * 4) + abs_gt1_base_ctxt + gt1_ctxt;
1772 
1773                 state_mps = pu1_ctxt_model[ctxt_idx];
1774 
1775                 /* Bits taken to encode sig co-ef flag as 1, also account for sign bit worst case */
1776                 u4_bits_estimated_r1_temp = gau2_ihevce_cabac_bin_to_bits[state_mps ^ 0];
1777 
1778                 for(scan_pos = 0; scan_pos < 16; scan_pos++)
1779                 {
1780                     UWORD8 y_pos_x_pos;
1781 
1782                     if(scan_pos || i)
1783                     {
1784                         y_pos_x_pos = scan_pos;  // gu1_hevce_scan4x4[i4_scan_idx][scan_pos];
1785                         /* ctxt for AC coeff depends on curpos and neigbour csbf */
1786                         sig_ctxinc = pu1_hevce_sigcoeff_ctxtinc[y_pos_x_pos];
1787 
1788                         /* based on luma subblock pos */
1789                         sig_ctxinc += (i && is_luma) ? 3 : 0;
1790 
1791                         sig_ctxinc += sig_coeff_base_ctxt;
1792                     }
1793                     else
1794                     {
1795                         /*MAM : both scan pos and i 0 impies the DC coef of 1st block only */
1796                         /* DC coeff has fixed context for luma and chroma */
1797                         sig_ctxinc = is_luma ? IHEVC_CAB_COEFF_FLAG : IHEVC_CAB_COEFF_FLAG + 27;
1798                     }
1799 
1800                     /*Get the mps state based on ctxt modes */
1801                     state_mps = pu1_ctxt_model[sig_ctxinc];
1802 
1803                     /* Bits taken to encode sig co-ef flag as 0 */
1804                     u4_bits_estimated_r0 = gau2_ihevce_cabac_bin_to_bits[state_mps ^ 0];
1805 
1806                     u4_bits_estimated_r1 =
1807                         (gau2_ihevce_cabac_bin_to_bits[state_mps ^ 1] + ROUND_Q12(1.000000000));
1808 
1809                     /* Bits taken to encode sig co-ef flag as 1, also account for sign bit worst case */
1810                     u4_bits_estimated_r1 += u4_bits_estimated_r1_temp;
1811                     {
1812                         QUANT_ROUND_FACTOR(
1813                             temp2, u4_bits_estimated_r1, u4_bits_estimated_r0, lamda_mod);
1814                         *(pi4_quant_round_0_1 +
1815                           ((scan_pos % 4) + ((scan_pos >> 2) * i4_trans_size)) + j) = temp2;
1816                     }
1817                 }
1818             }
1819         }
1820         else
1821         {
1822             /*If Both nbr csbfs are 0, then all the coef in sub-blocks will have same value except for 1st subblock,
1823             Hence will write the same value to all sub block, and overwrite for the 1st one */
1824             i = 1;
1825             {
1826                 UWORD8 sig_ctxinc;
1827                 UWORD8 y_pos_x_pos;
1828                 WORD32 quant_rounding_0_1;
1829 
1830                 pu1_hevce_sigcoeff_ctxtinc = (UWORD8 *)&gu1_hevce_sigcoeff_ctxtinc_00[0];
1831 
1832                 scan_pos = 0;
1833                 y_pos_x_pos = scan_pos;  // gu1_hevce_scan4x4[i4_scan_idx][scan_pos];
1834                 /* ctxt for AC coeff depends on curpos and neigbour csbf */
1835                 sig_ctxinc = pu1_hevce_sigcoeff_ctxtinc[y_pos_x_pos];
1836 
1837                 /* based on luma subblock pos */
1838                 sig_ctxinc += (is_luma) ? 3 : 0;
1839 
1840                 sig_ctxinc += sig_coeff_base_ctxt;
1841 
1842                 /*Get the mps state based on ctxt modes */
1843                 state_mps = pu1_ctxt_model[sig_ctxinc];
1844 
1845                 /* Bits taken to encode sig co-ef flag as 0 */
1846                 u4_bits_estimated_r0 = gau2_ihevce_cabac_bin_to_bits[state_mps ^ 0];
1847 
1848                 u4_bits_estimated_r1 =
1849                     (gau2_ihevce_cabac_bin_to_bits[state_mps ^ 1] + ROUND_Q12(1.000000000));
1850 
1851                 /*ctxt_set = 0 DC subblock, the previous state did not have 2
1852                 ctxt_set = 1 DC subblock, the previous state did have >= 2
1853                 ctxt_set = 2 AC subblock, the previous state did not have 2
1854                 ctxt_set = 3 AC subblock, the previous state did have >= 2*/
1855 
1856                 ctxt_set = (i && is_luma) ? 2 : 0;
1857 
1858                 /* gt1_ctxt = 1 for the co-ef value to be 1 */
1859                 gt1_ctxt = 0;
1860                 ctxt_idx = (ctxt_set * 4) + abs_gt1_base_ctxt + gt1_ctxt;
1861 
1862                 state_mps = pu1_ctxt_model[ctxt_idx];
1863 
1864                 /* Bits taken to encode sig co-ef flag as 1, also account for sign bit worst case */
1865                 u4_bits_estimated_r1 += gau2_ihevce_cabac_bin_to_bits[state_mps ^ 0];
1866 
1867                 QUANT_ROUND_FACTOR(
1868                     quant_rounding_0_1, u4_bits_estimated_r1, u4_bits_estimated_r0, lamda_mod);
1869 
1870                 for(scan_pos = 0; scan_pos < (16 * (i4_trans_size * i4_trans_size >> 4));
1871                     scan_pos++)
1872                 {
1873                     *(pi4_quant_round_0_1 + scan_pos) = quant_rounding_0_1;
1874                 }
1875             }
1876 
1877             /*First Subblock*/
1878             i = 0;
1879 
1880             {
1881                 UWORD8 sig_ctxinc;
1882                 WORD32 state_mps;
1883                 WORD32 gt1_ctxt = 0;
1884                 WORD32 ctxt_set = 0;
1885 
1886                 WORD32 ctxt_idx;
1887 
1888                 /*Check if the cabac states had previous nbr available */
1889 
1890                 {
1891                     pu1_hevce_sigcoeff_ctxtinc = (UWORD8 *)&gu1_hevce_sigcoeff_ctxtinc[0][0];
1892 
1893                     /*ctxt_set = 0 DC subblock, the previous state did not have 2
1894                     ctxt_set = 1 DC subblock, the previous state did have >= 2
1895                     ctxt_set = 2 AC subblock, the previous state did not have 2
1896                     ctxt_set = 3 AC subblock, the previous state did have >= 2*/
1897                     ctxt_set = (i && is_luma) ? 2 : 0;
1898 
1899                     /* gt1_ctxt = 1 for the co-ef value to be 1 */
1900                     gt1_ctxt = 0;
1901                     ctxt_idx = (ctxt_set * 4) + abs_gt1_base_ctxt + gt1_ctxt;
1902 
1903                     state_mps = pu1_ctxt_model[ctxt_idx];
1904 
1905                     /* Bits taken to encode sig co-ef flag as 1, also account for sign bit worst case */
1906                     u4_bits_estimated_r1_temp = gau2_ihevce_cabac_bin_to_bits[state_mps ^ 0];
1907 
1908                     for(scan_pos = 0; scan_pos < 16; scan_pos++)
1909                     {
1910                         UWORD8 y_pos_x_pos;
1911 
1912                         if(scan_pos)
1913                         {
1914                             y_pos_x_pos = scan_pos;  // gu1_hevce_scan4x4[i4_scan_idx][scan_pos];
1915                             /* ctxt for AC coeff depends on curpos and neigbour csbf */
1916                             sig_ctxinc = pu1_hevce_sigcoeff_ctxtinc[y_pos_x_pos];
1917 
1918                             /* based on luma subblock pos */
1919                             sig_ctxinc += (i && is_luma) ? 3 : 0;
1920 
1921                             sig_ctxinc += sig_coeff_base_ctxt;
1922                         }
1923                         else
1924                         {
1925                             /*MAM : both scan pos and i 0 impies the DC coef of 1st block only */
1926                             /* DC coeff has fixed context for luma and chroma */
1927                             sig_ctxinc = is_luma ? IHEVC_CAB_COEFF_FLAG : IHEVC_CAB_COEFF_FLAG + 27;
1928                         }
1929 
1930                         /*Get the mps state based on ctxt modes */
1931                         state_mps = pu1_ctxt_model[sig_ctxinc];
1932 
1933                         /* Bits taken to encode sig co-ef flag as 0 */
1934                         u4_bits_estimated_r0 = gau2_ihevce_cabac_bin_to_bits[state_mps ^ 0];
1935 
1936                         u4_bits_estimated_r1 =
1937                             (gau2_ihevce_cabac_bin_to_bits[state_mps ^ 1] + ROUND_Q12(1.000000000));
1938 
1939                         /* Bits taken to encode sig co-ef flag as 1, also account for sign bit worst case */
1940                         u4_bits_estimated_r1 += u4_bits_estimated_r1_temp;
1941                         {
1942                             QUANT_ROUND_FACTOR(
1943                                 temp2, u4_bits_estimated_r1, u4_bits_estimated_r0, lamda_mod);
1944                             *(pi4_quant_round_0_1 +
1945                               ((scan_pos % 4) + ((scan_pos >> 2) * i4_trans_size))) = temp2;
1946                         }
1947                     }
1948                 }
1949             }
1950         }
1951     }
1952     return;
1953 }
1954 
1955 /*!
1956 ******************************************************************************
1957 * \if Function name : ihevce_t_q_iq_ssd_scan_fxn \endif
1958 *
1959 * \brief
1960 *    Transform unit level (Luma) enc_loop function
1961 *
1962 * \param[in] ps_ctxt    enc_loop module ctxt pointer
1963 * \param[in] pu1_pred   pointer to predicted data buffer
1964 * \param[in] pred_strd  predicted buffer stride
1965 * \param[in] pu1_src    pointer to source data buffer
1966 * \param[in] src_strd   source buffer stride
1967 * \param[in] pi2_deq_data   pointer to store iq data
1968 * \param[in] deq_data_strd  iq data buffer stride
1969 * \param[out] pu1_ecd_data pointer to coeff output buffer (input to entropy coding)
1970 * \param[out] pu1_csbf_buf  pointer to store the csbf for all 4x4 in a current
1971 *                           block
1972 * \param[out] csbf_strd  csbf buffer stride
1973 * \param[in] trans_size transform size (4, 8, 16, 32)
1974 * \param[in] packed_pred_mode   0:Inter 1:Intra 2:Skip
1975 * \param[out] pi8_cost      pointer to store the cost
1976 * \param[out] pi4_coeff_off pointer to store the number of bytes produced in
1977 *                           coeff buffer
1978 * \param[out] pi4_tu_bits   pointer to store the best TU bits required to encode
1979 *                           the current TU in RDopt Mode
1980 * \param[out] pu4_blk_sad   pointer to store the block sad for RC
1981 * \param[out] pi4_zero_col  pointer to store the zero_col info for the TU
1982 * \param[out] pi4_zero_row  pointer to store the zero_row info for the TU
1983 * \param[in]  i4_perform_rdoq Indicates if RDOQ should be performed or not
1984 * \param[in]  i4_perform_sbh Indicates if SBH should be performed or not
1985 *
1986 * \return
1987 *    CBF of the current block
1988 *
1989 * \author
1990 *  Ittiam
1991 *
1992 *****************************************************************************
1993 */
1994 
1995 WORD32 ihevce_t_q_iq_ssd_scan_fxn(
1996     ihevce_enc_loop_ctxt_t *ps_ctxt,
1997     UWORD8 *pu1_pred,
1998     WORD32 pred_strd,
1999     UWORD8 *pu1_src,
2000     WORD32 src_strd,
2001     WORD16 *pi2_deq_data,
2002     WORD32 deq_data_strd,
2003     UWORD8 *pu1_recon,
2004     WORD32 i4_recon_stride,
2005     UWORD8 *pu1_ecd_data,
2006     UWORD8 *pu1_csbf_buf,
2007     WORD32 csbf_strd,
2008     WORD32 trans_size,
2009     WORD32 packed_pred_mode,
2010     LWORD64 *pi8_cost,
2011     WORD32 *pi4_coeff_off,
2012     WORD32 *pi4_tu_bits,
2013     UWORD32 *pu4_blk_sad,
2014     WORD32 *pi4_zero_col,
2015     WORD32 *pi4_zero_row,
2016     UWORD8 *pu1_is_recon_available,
2017     WORD32 i4_perform_rdoq,
2018     WORD32 i4_perform_sbh,
2019 #if USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
2020     WORD32 i4_alpha_stim_multiplier,
2021     UWORD8 u1_is_cu_noisy,
2022 #endif
2023     SSD_TYPE_T e_ssd_type,
2024     WORD32 early_cbf)
2025 {
2026     WORD32 cbf = 0;
2027     WORD32 trans_idx;
2028     WORD32 quant_scale_mat_offset;
2029     WORD32 *pi4_trans_scratch;
2030     WORD16 *pi2_trans_values;
2031     WORD16 *pi2_quant_coeffs;
2032     WORD32 *pi4_subBlock2csbfId_map = NULL;
2033 
2034 #if PROHIBIT_INTRA_QUANT_ROUNDING_FACTOR_TO_DROP_BELOW_1BY3
2035     WORD32 ai4_quant_rounding_factors[3][MAX_TU_SIZE * MAX_TU_SIZE], i;
2036 #endif
2037 
2038     rdoq_sbh_ctxt_t *ps_rdoq_sbh_ctxt = &ps_ctxt->s_rdoq_sbh_ctxt;
2039 
2040     WORD32 i4_perform_zcbf = (ENABLE_INTER_ZCU_COST && (PRED_MODE_INTRA != packed_pred_mode)) ||
2041                              (ps_ctxt->i4_zcbf_rdo_level == ZCBF_ENABLE);
2042     WORD32 i4_perform_coeff_level_rdoq = (ps_ctxt->i4_quant_rounding_level != FIXED_QUANT_ROUNDING);
2043     WORD8 intra_flag = 0;
2044     ASSERT(csbf_strd == MAX_TU_IN_CTB_ROW);
2045 
2046     *pi4_tu_bits = 0;
2047     *pi4_coeff_off = 0;
2048     pu1_is_recon_available[0] = 0;
2049 
2050     if((PRED_MODE_SKIP == packed_pred_mode) || (0 == early_cbf))
2051     {
2052         if(e_ssd_type != NULL_TYPE)
2053         {
2054             /* SSD cost is stored to the pointer */
2055             pi8_cost[0] =
2056 
2057                 ps_ctxt->s_cmn_opt_func.pf_ssd_and_sad_calculator(
2058                     pu1_pred, pred_strd, pu1_src, src_strd, trans_size, pu4_blk_sad);
2059 
2060 #if USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
2061             if(u1_is_cu_noisy && i4_alpha_stim_multiplier)
2062             {
2063                 pi8_cost[0] = ihevce_inject_stim_into_distortion(
2064                     pu1_src,
2065                     src_strd,
2066                     pu1_pred,
2067                     pred_strd,
2068                     pi8_cost[0],
2069                     !ps_ctxt->u1_is_refPic ? ALPHA_FOR_ZERO_CODING_DECISIONS
2070                                            : ((100 - ALPHA_DISCOUNT_IN_REF_PICS_IN_RDOPT) *
2071                                               (double)ALPHA_FOR_ZERO_CODING_DECISIONS) /
2072                                                  100.0,
2073                     trans_size,
2074                     0,
2075                     ps_ctxt->u1_enable_psyRDOPT,
2076                     NULL_PLANE);
2077             }
2078 #endif
2079 
2080             /* copy pred to recon for skip mode */
2081             if(SPATIAL_DOMAIN_SSD == e_ssd_type)
2082             {
2083                 ps_ctxt->s_cmn_opt_func.pf_copy_2d(
2084                     pu1_recon, i4_recon_stride, pu1_pred, pred_strd, trans_size, trans_size);
2085                 pu1_is_recon_available[0] = 1;
2086             }
2087             else
2088             {
2089                 pu1_is_recon_available[0] = 0;
2090             }
2091 
2092 #if ENABLE_INTER_ZCU_COST
2093             ps_ctxt->i8_cu_not_coded_cost += pi8_cost[0];
2094 #endif
2095         }
2096         else
2097         {
2098             pi8_cost[0] = UINT_MAX;
2099         }
2100 
2101         /* cbf is returned as 0 */
2102         return (0);
2103     }
2104 
2105     /* derive context variables */
2106     pi4_trans_scratch = (WORD32 *)&ps_ctxt->ai2_scratch[0];
2107     pi2_quant_coeffs = &ps_ctxt->ai2_scratch[0];
2108     pi2_trans_values = &ps_ctxt->ai2_scratch[0] + (MAX_TRANS_SIZE * 2);
2109 
2110     /* translate the transform size to index for 4x4 and 8x8 */
2111     trans_idx = trans_size >> 2;
2112 
2113     if(PRED_MODE_INTRA == packed_pred_mode)
2114     {
2115         quant_scale_mat_offset = 0;
2116         intra_flag = 1;
2117 #if PROHIBIT_INTRA_QUANT_ROUNDING_FACTOR_TO_DROP_BELOW_1BY3
2118         ai4_quant_rounding_factors[0][0] =
2119             MAX(ps_ctxt->i4_quant_rnd_factor[intra_flag], (1 << QUANT_ROUND_FACTOR_Q) / 3);
2120 
2121         for(i = 0; i < trans_size * trans_size; i++)
2122         {
2123             ai4_quant_rounding_factors[1][i] =
2124                 MAX(ps_ctxt->pi4_quant_round_factor_tu_0_1[trans_size >> 3][i],
2125                     (1 << QUANT_ROUND_FACTOR_Q) / 3);
2126             ai4_quant_rounding_factors[2][i] =
2127                 MAX(ps_ctxt->pi4_quant_round_factor_tu_1_2[trans_size >> 3][i],
2128                     (1 << QUANT_ROUND_FACTOR_Q) / 3);
2129         }
2130 #endif
2131     }
2132     else
2133     {
2134         quant_scale_mat_offset = NUM_TRANS_TYPES;
2135     }
2136     /* for intra 4x4 DST transform should be used */
2137     if((1 == trans_idx) && (1 == intra_flag))
2138     {
2139         trans_idx = 0;
2140     }
2141     /* for 16x16 cases */
2142     else if(16 == trans_size)
2143     {
2144         trans_idx = 3;
2145     }
2146     /* for 32x32 cases */
2147     else if(32 == trans_size)
2148     {
2149         trans_idx = 4;
2150     }
2151 
2152     switch(trans_size)
2153     {
2154     case 4:
2155     {
2156         pi4_subBlock2csbfId_map = gai4_subBlock2csbfId_map4x4TU;
2157 
2158         break;
2159     }
2160     case 8:
2161     {
2162         pi4_subBlock2csbfId_map = gai4_subBlock2csbfId_map8x8TU;
2163 
2164         break;
2165     }
2166     case 16:
2167     {
2168         pi4_subBlock2csbfId_map = gai4_subBlock2csbfId_map16x16TU;
2169 
2170         break;
2171     }
2172     case 32:
2173     {
2174         pi4_subBlock2csbfId_map = gai4_subBlock2csbfId_map32x32TU;
2175 
2176         break;
2177     }
2178     }
2179 
2180     /* Do not call the FT and Quant functions if early_cbf is 0 */
2181     if(1 == early_cbf)
2182     {
2183         /* ---------- call residue and transform block ------- */
2184         *pu4_blk_sad = ps_ctxt->apf_resd_trns[trans_idx](
2185             pu1_src,
2186             pu1_pred,
2187             pi4_trans_scratch,
2188             pi2_trans_values,
2189             src_strd,
2190             pred_strd,
2191             trans_size,
2192             NULL_PLANE);
2193 
2194         cbf = ps_ctxt->apf_quant_iquant_ssd
2195                   [i4_perform_coeff_level_rdoq + (e_ssd_type != FREQUENCY_DOMAIN_SSD) * 2](
2196                       pi2_trans_values,
2197                       ps_ctxt->api2_rescal_mat[trans_idx + quant_scale_mat_offset],
2198                       pi2_quant_coeffs,
2199                       pi2_deq_data,
2200                       trans_size,
2201                       ps_ctxt->i4_cu_qp_div6,
2202                       ps_ctxt->i4_cu_qp_mod6,
2203 #if !PROHIBIT_INTRA_QUANT_ROUNDING_FACTOR_TO_DROP_BELOW_1BY3
2204                       ps_ctxt->i4_quant_rnd_factor[intra_flag],
2205                       ps_ctxt->pi4_quant_round_factor_tu_0_1[trans_size >> 3],
2206                       ps_ctxt->pi4_quant_round_factor_tu_1_2[trans_size >> 3],
2207 #else
2208                       intra_flag ? ai4_quant_rounding_factors[0][0]
2209                                  : ps_ctxt->i4_quant_rnd_factor[intra_flag],
2210                       intra_flag ? ai4_quant_rounding_factors[1]
2211                                  : ps_ctxt->pi4_quant_round_factor_tu_0_1[trans_size >> 3],
2212                       intra_flag ? ai4_quant_rounding_factors[2]
2213                                  : ps_ctxt->pi4_quant_round_factor_tu_1_2[trans_size >> 3],
2214 #endif
2215                       trans_size,
2216                       trans_size,
2217                       deq_data_strd,
2218                       pu1_csbf_buf,
2219                       csbf_strd,
2220                       pi4_zero_col,
2221                       pi4_zero_row,
2222                       ps_ctxt->api2_scal_mat[trans_idx + quant_scale_mat_offset],
2223                       pi8_cost);
2224 
2225         if(e_ssd_type != FREQUENCY_DOMAIN_SSD)
2226         {
2227             pi8_cost[0] = UINT_MAX;
2228         }
2229     }
2230 
2231     if(0 != cbf)
2232     {
2233         if(i4_perform_sbh || i4_perform_rdoq)
2234         {
2235             ps_rdoq_sbh_ctxt->i4_iq_data_strd = deq_data_strd;
2236             ps_rdoq_sbh_ctxt->i4_q_data_strd = trans_size;
2237             ps_rdoq_sbh_ctxt->pi4_subBlock2csbfId_map = pi4_subBlock2csbfId_map;
2238 
2239             ps_rdoq_sbh_ctxt->i4_qp_div = ps_ctxt->i4_cu_qp_div6;
2240             ps_rdoq_sbh_ctxt->i2_qp_rem = ps_ctxt->i4_cu_qp_mod6;
2241             ps_rdoq_sbh_ctxt->i4_scan_idx = ps_ctxt->i4_scan_idx;
2242             ps_rdoq_sbh_ctxt->i8_ssd_cost = *pi8_cost;
2243             ps_rdoq_sbh_ctxt->i4_trans_size = trans_size;
2244 
2245             ps_rdoq_sbh_ctxt->pi2_dequant_coeff =
2246                 ps_ctxt->api2_scal_mat[trans_idx + quant_scale_mat_offset];
2247             ps_rdoq_sbh_ctxt->pi2_iquant_coeffs = pi2_deq_data;
2248             ps_rdoq_sbh_ctxt->pi2_quant_coeffs = pi2_quant_coeffs;
2249             ps_rdoq_sbh_ctxt->pi2_trans_values = pi2_trans_values;
2250             ps_rdoq_sbh_ctxt->pu1_csbf_buf = pu1_csbf_buf;
2251 
2252             /* ------- call coeffs scan function ------- */
2253             if((!i4_perform_rdoq))
2254             {
2255                 ihevce_sign_data_hiding(ps_rdoq_sbh_ctxt);
2256 
2257                 pi8_cost[0] = ps_rdoq_sbh_ctxt->i8_ssd_cost;
2258             }
2259         }
2260 
2261         *pi4_coeff_off = ps_ctxt->s_cmn_opt_func.pf_scan_coeffs(
2262             pi2_quant_coeffs,
2263             pi4_subBlock2csbfId_map,
2264             ps_ctxt->i4_scan_idx,
2265             trans_size,
2266             pu1_ecd_data,
2267             pu1_csbf_buf,
2268             csbf_strd);
2269     }
2270     *pi8_cost >>= ga_trans_shift[trans_idx];
2271 
2272 #if RDOPT_ZERO_CBF_ENABLE
2273     /* compare null cbf cost with encode tu rd-cost */
2274     if(cbf != 0)
2275     {
2276         WORD32 tu_bits;
2277         LWORD64 tu_rd_cost;
2278 
2279         LWORD64 zero_cbf_cost = 0;
2280 
2281         /*Populating the feilds of rdoq_ctxt structure*/
2282         if(i4_perform_rdoq)
2283         {
2284             /* transform size to log2transform size */
2285             GETRANGE(ps_rdoq_sbh_ctxt->i4_log2_trans_size, trans_size);
2286             ps_rdoq_sbh_ctxt->i4_log2_trans_size -= 1;
2287             ps_rdoq_sbh_ctxt->i8_cl_ssd_lambda_qf = ps_ctxt->i8_cl_ssd_lambda_qf;
2288             ps_rdoq_sbh_ctxt->i4_is_luma = 1;
2289             ps_rdoq_sbh_ctxt->i4_shift_val_ssd_in_td = ga_trans_shift[trans_idx];
2290             ps_rdoq_sbh_ctxt->i4_round_val_ssd_in_td =
2291                 (1 << ps_rdoq_sbh_ctxt->i4_shift_val_ssd_in_td) / 2;
2292             ps_rdoq_sbh_ctxt->i1_tu_is_coded = 0;
2293             ps_rdoq_sbh_ctxt->pi4_zero_col = pi4_zero_col;
2294             ps_rdoq_sbh_ctxt->pi4_zero_row = pi4_zero_row;
2295         }
2296         else if(i4_perform_zcbf)
2297         {
2298             zero_cbf_cost =
2299 
2300                 ps_ctxt->s_cmn_opt_func.pf_ssd_calculator(
2301                     pu1_src, pu1_pred, src_strd, pred_strd, trans_size, trans_size, NULL_PLANE);
2302         }
2303 
2304         /************************************************************************/
2305         /* call the entropy rdo encode to get the bit estimate for current tu   */
2306         /* note that tu includes only residual coding bits and does not include */
2307         /* tu split, cbf and qp delta encoding bits for a TU                    */
2308         /************************************************************************/
2309         if(i4_perform_rdoq)
2310         {
2311             tu_bits = ihevce_entropy_rdo_encode_tu_rdoq(
2312                 &ps_ctxt->s_rdopt_entropy_ctxt,
2313                 (pu1_ecd_data),
2314                 trans_size,
2315                 1,
2316                 ps_rdoq_sbh_ctxt,
2317                 pi8_cost,
2318                 &zero_cbf_cost,
2319                 0);
2320 
2321             if(ps_rdoq_sbh_ctxt->i1_tu_is_coded == 0)
2322             {
2323                 cbf = 0;
2324                 *pi4_coeff_off = 0;
2325             }
2326 
2327             if((i4_perform_sbh) && (0 != cbf))
2328             {
2329                 ps_rdoq_sbh_ctxt->i8_ssd_cost = *pi8_cost;
2330                 ihevce_sign_data_hiding(ps_rdoq_sbh_ctxt);
2331                 *pi8_cost = ps_rdoq_sbh_ctxt->i8_ssd_cost;
2332             }
2333 
2334             /*Add round value before normalizing*/
2335             *pi8_cost += ps_rdoq_sbh_ctxt->i4_round_val_ssd_in_td;
2336             *pi8_cost >>= ga_trans_shift[trans_idx];
2337 
2338             if(ps_rdoq_sbh_ctxt->i1_tu_is_coded == 1)
2339             {
2340                 pi2_quant_coeffs = &ps_ctxt->ai2_scratch[0];
2341                 *pi4_coeff_off = ps_ctxt->s_cmn_opt_func.pf_scan_coeffs(
2342                     pi2_quant_coeffs,
2343                     pi4_subBlock2csbfId_map,
2344                     ps_ctxt->i4_scan_idx,
2345                     trans_size,
2346                     pu1_ecd_data,
2347                     pu1_csbf_buf,
2348                     csbf_strd);
2349             }
2350         }
2351         else
2352         {
2353             tu_bits = ihevce_entropy_rdo_encode_tu(
2354                 &ps_ctxt->s_rdopt_entropy_ctxt, pu1_ecd_data, trans_size, 1, i4_perform_sbh);
2355         }
2356 
2357         *pi4_tu_bits = tu_bits;
2358 
2359         if(e_ssd_type == SPATIAL_DOMAIN_SSD)
2360         {
2361             *pi8_cost = ihevce_it_recon_ssd(
2362                 ps_ctxt,
2363                 pu1_src,
2364                 src_strd,
2365                 pu1_pred,
2366                 pred_strd,
2367                 pi2_deq_data,
2368                 deq_data_strd,
2369                 pu1_recon,
2370                 i4_recon_stride,
2371                 pu1_ecd_data,
2372                 trans_size,
2373                 packed_pred_mode,
2374                 cbf,
2375                 *pi4_zero_col,
2376                 *pi4_zero_row,
2377                 NULL_PLANE);
2378 
2379             pu1_is_recon_available[0] = 1;
2380         }
2381 
2382 #if USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
2383         if(u1_is_cu_noisy && (e_ssd_type == SPATIAL_DOMAIN_SSD) && i4_alpha_stim_multiplier)
2384         {
2385             pi8_cost[0] = ihevce_inject_stim_into_distortion(
2386                 pu1_src,
2387                 src_strd,
2388                 pu1_recon,
2389                 i4_recon_stride,
2390                 pi8_cost[0],
2391                 i4_alpha_stim_multiplier,
2392                 trans_size,
2393                 0,
2394                 ps_ctxt->u1_enable_psyRDOPT,
2395                 NULL_PLANE);
2396         }
2397         else if(u1_is_cu_noisy && (e_ssd_type == FREQUENCY_DOMAIN_SSD) && i4_alpha_stim_multiplier)
2398         {
2399             pi8_cost[0] = ihevce_inject_stim_into_distortion(
2400                 pu1_src,
2401                 src_strd,
2402                 pu1_pred,
2403                 pred_strd,
2404                 pi8_cost[0],
2405                 i4_alpha_stim_multiplier,
2406                 trans_size,
2407                 0,
2408                 ps_ctxt->u1_enable_psyRDOPT,
2409                 NULL_PLANE);
2410         }
2411 #endif
2412 
2413         /* add the SSD cost to bits estimate given by ECD */
2414         tu_rd_cost = *pi8_cost + COMPUTE_RATE_COST_CLIP30(
2415                                      tu_bits, ps_ctxt->i8_cl_ssd_lambda_qf, LAMBDA_Q_SHIFT);
2416 
2417         if(i4_perform_zcbf)
2418         {
2419 #if USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
2420             if(u1_is_cu_noisy && i4_alpha_stim_multiplier)
2421             {
2422                 zero_cbf_cost = ihevce_inject_stim_into_distortion(
2423                     pu1_src,
2424                     src_strd,
2425                     pu1_pred,
2426                     pred_strd,
2427                     zero_cbf_cost,
2428                     !ps_ctxt->u1_is_refPic ? ALPHA_FOR_ZERO_CODING_DECISIONS
2429                                            : ((100 - ALPHA_DISCOUNT_IN_REF_PICS_IN_RDOPT) *
2430                                               (double)ALPHA_FOR_ZERO_CODING_DECISIONS) /
2431                                                  100.0,
2432                     trans_size,
2433                     0,
2434                     ps_ctxt->u1_enable_psyRDOPT,
2435                     NULL_PLANE);
2436             }
2437 #endif
2438 
2439             /* force the tu as zero cbf if zero_cbf_cost is lower */
2440             if(zero_cbf_cost < tu_rd_cost)
2441             {
2442                 /* num bytes is set to 0 */
2443                 *pi4_coeff_off = 0;
2444 
2445                 /* cbf is returned as 0 */
2446                 cbf = 0;
2447 
2448                 /* cost is returned as 0 cbf cost */
2449                 *pi8_cost = zero_cbf_cost;
2450 
2451                 /* TU bits is set to 0 */
2452                 *pi4_tu_bits = 0;
2453                 pu1_is_recon_available[0] = 0;
2454 
2455                 if(SPATIAL_DOMAIN_SSD == e_ssd_type)
2456                 {
2457                     /* copy pred to recon for zcbf mode */
2458 
2459                     ps_ctxt->s_cmn_opt_func.pf_copy_2d(
2460                         pu1_recon, i4_recon_stride, pu1_pred, pred_strd, trans_size, trans_size);
2461 
2462                     pu1_is_recon_available[0] = 1;
2463                 }
2464             }
2465             /* accumulate cu not coded cost with zcbf cost */
2466 #if ENABLE_INTER_ZCU_COST
2467             ps_ctxt->i8_cu_not_coded_cost += zero_cbf_cost;
2468 #endif
2469         }
2470     }
2471     else
2472     {
2473         /* cbf = 0, accumulate cu not coded cost */
2474         if(e_ssd_type == SPATIAL_DOMAIN_SSD)
2475         {
2476             *pi8_cost = ihevce_it_recon_ssd(
2477                 ps_ctxt,
2478                 pu1_src,
2479                 src_strd,
2480                 pu1_pred,
2481                 pred_strd,
2482                 pi2_deq_data,
2483                 deq_data_strd,
2484                 pu1_recon,
2485                 i4_recon_stride,
2486                 pu1_ecd_data,
2487                 trans_size,
2488                 packed_pred_mode,
2489                 cbf,
2490                 *pi4_zero_col,
2491                 *pi4_zero_row,
2492                 NULL_PLANE);
2493 
2494             pu1_is_recon_available[0] = 1;
2495         }
2496 
2497 #if ENABLE_INTER_ZCU_COST
2498         {
2499 #if USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
2500             if(u1_is_cu_noisy && (e_ssd_type == SPATIAL_DOMAIN_SSD) && i4_alpha_stim_multiplier)
2501             {
2502                 pi8_cost[0] = ihevce_inject_stim_into_distortion(
2503                     pu1_src,
2504                     src_strd,
2505                     pu1_recon,
2506                     i4_recon_stride,
2507                     pi8_cost[0],
2508                     !ps_ctxt->u1_is_refPic ? ALPHA_FOR_ZERO_CODING_DECISIONS
2509                                            : ((100 - ALPHA_DISCOUNT_IN_REF_PICS_IN_RDOPT) *
2510                                               (double)ALPHA_FOR_ZERO_CODING_DECISIONS) /
2511                                                  100.0,
2512                     trans_size,
2513                     0,
2514                     ps_ctxt->u1_enable_psyRDOPT,
2515                     NULL_PLANE);
2516             }
2517             else if(u1_is_cu_noisy && (e_ssd_type == FREQUENCY_DOMAIN_SSD) && i4_alpha_stim_multiplier)
2518             {
2519                 pi8_cost[0] = ihevce_inject_stim_into_distortion(
2520                     pu1_src,
2521                     src_strd,
2522                     pu1_pred,
2523                     pred_strd,
2524                     pi8_cost[0],
2525                     !ps_ctxt->u1_is_refPic ? ALPHA_FOR_ZERO_CODING_DECISIONS
2526                                            : ((100 - ALPHA_DISCOUNT_IN_REF_PICS_IN_RDOPT) *
2527                                               (double)ALPHA_FOR_ZERO_CODING_DECISIONS) /
2528                                                  100.0,
2529                     trans_size,
2530                     0,
2531                     ps_ctxt->u1_enable_psyRDOPT,
2532                     NULL_PLANE);
2533             }
2534 #endif
2535 
2536             ps_ctxt->i8_cu_not_coded_cost += *pi8_cost;
2537         }
2538 #endif /* ENABLE_INTER_ZCU_COST */
2539     }
2540 #endif
2541 
2542     return (cbf);
2543 }
2544 
2545 /*!
2546 ******************************************************************************
2547 * \if Function name : ihevce_it_recon_fxn \endif
2548 *
2549 * \brief
2550 *    Transform unit level (Luma) IT Recon function
2551 *
2552 * \param[in] ps_ctxt        enc_loop module ctxt pointer
2553 * \param[in] pi2_deq_data   pointer to iq data
* \param[in] deq_dat_strd   iq data buffer stride
2555 * \param[in] pu1_pred       pointer to predicted data buffer
2556 * \param[in] pred_strd      predicted buffer stride
2557 * \param[in] pu1_recon      pointer to recon buffer
2558 * \param[in] recon_strd     recon buffer stride
2559 * \param[out] pu1_ecd_data  pointer coeff output buffer (input to ent cod)
2560 * \param[in] trans_size     transform size (4, 8, 16,32)
2561 * \param[in] packed_pred_mode   0:Inter 1:Intra 2:Skip
2562 * \param[in] cbf            CBF of the current block
2563 * \param[in] zero_cols      zero_cols of the current block
2564 * \param[in] zero_rows      zero_rows of the current block
2565 *
2566 * \return
2567 *
2568 * \author
2569 *  Ittiam
2570 *
2571 *****************************************************************************
2572 */
2573 
void ihevce_it_recon_fxn(
    ihevce_enc_loop_ctxt_t *ps_ctxt,
    WORD16 *pi2_deq_data,
    WORD32 deq_dat_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    UWORD8 *pu1_recon,
    WORD32 recon_strd,
    UWORD8 *pu1_ecd_data,
    WORD32 trans_size,
    WORD32 packed_pred_mode,
    WORD32 cbf,
    WORD32 zero_cols,
    WORD32 zero_rows)
{
    WORD32 trans_idx;

    /* Nothing to add on top of the prediction in two cases:               */
    /*  - SKIP mode is being evaluated (pred is copied to recon)           */
    /*  - the TU has cbf == 0                                              */
    /* Both are handled up front so that pu1_ecd_data is only inspected    */
    /* for a TU that actually carries coded data.                          */
    if((PRED_MODE_SKIP == packed_pred_mode) || (0 == cbf))
    {
        /* 2D copy of data */
        ps_ctxt->s_cmn_opt_func.pf_2d_square_copy(
            pu1_recon, recon_strd, pu1_pred, pred_strd, trans_size, sizeof(UWORD8));

        return;
    }

    /* translate the transform size to index for 4x4 and 8x8 */
    trans_idx = trans_size >> 2;

    /* for intra 4x4 DST transform should be used */
    if((1 == trans_idx) && (PRED_MODE_INTRA == packed_pred_mode))
    {
        trans_idx = 0;
    }
    /* for 16x16 cases */
    else if(16 == trans_size)
    {
        trans_idx = 3;
    }
    /* for 32x32 cases */
    else if(32 == trans_size)
    {
        trans_idx = 4;
    }

    /* (lastx == 0 && lasty == 0), ie only 1 coefficient: a dc add is      */
    /* enough. Index 0 (intra 4x4) always takes the full IT + recon path.  */
    if((0 != trans_idx) && (0 == pu1_ecd_data[0]) && (0 == pu1_ecd_data[1]))
    {
        /* dc add */
        ps_ctxt->s_cmn_opt_func.pf_itrans_recon_dc(
            pu1_pred,
            pred_strd,
            pu1_recon,
            recon_strd,
            trans_size,
            pi2_deq_data[0],
            NULL_PLANE /* luma */
        );
    }
    else
    {
        /* full inverse transform + recon, selected by trans_idx */
        ps_ctxt->apf_it_recon[trans_idx](
            pi2_deq_data,
            &ps_ctxt->ai2_scratch[0],
            pu1_pred,
            pu1_recon,
            deq_dat_strd,
            pred_strd,
            recon_strd,
            zero_cols,
            zero_rows);
    }
}
2666 
2667 /*!
2668 ******************************************************************************
2669 * \if Function name : ihevce_chroma_it_recon_fxn \endif
2670 *
2671 * \brief
2672 *    Transform unit level (Chroma) IT Recon function
2673 *
* \param[in] ps_ctxt        enc_loop module ctxt pointer
* \param[in] pi2_deq_data   pointer to iq data
* \param[in] deq_dat_strd   iq data buffer stride
* \param[in] pu1_pred       pointer to predicted data buffer
* \param[in] pred_strd      predicted buffer stride
* \param[in] pu1_recon      pointer to recon buffer
* \param[in] recon_strd     recon buffer stride
* \param[out] pu1_ecd_data  pointer coeff output buffer (input to ent cod)
* \param[in] trans_size     transform size (4, 8, 16)
* \param[in] cbf            CBF of the current block
* \param[in] zero_cols      zero_cols of the current block
* \param[in] zero_rows      zero_rows of the current block
* \param[in] e_chroma_plane chroma plane to reconstruct (U_PLANE or V_PLANE)
2686 *
2687 * \return
2688 *
2689 * \author
2690 *  Ittiam
2691 *
2692 *****************************************************************************
2693 */
2694 
void ihevce_chroma_it_recon_fxn(
    ihevce_enc_loop_ctxt_t *ps_ctxt,
    WORD16 *pi2_deq_data,
    WORD32 deq_dat_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    UWORD8 *pu1_recon,
    WORD32 recon_strd,
    UWORD8 *pu1_ecd_data,
    WORD32 trans_size,
    WORD32 cbf,
    WORD32 zero_cols,
    WORD32 zero_rows,
    CHROMA_PLANE_ID_T e_chroma_plane)
{
    ASSERT((e_chroma_plane == U_PLANE) || (e_chroma_plane == V_PLANE));

    /* 2x2 transform is not allowed for chroma, so it is processed as 4x4 */
    trans_size = (2 == trans_size) ? 4 : trans_size;

    /* cbf == 0 : recon is a plain copy of the prediction for this plane */
    if(0 == cbf)
    {
        ps_ctxt->s_cmn_opt_func.pf_chroma_interleave_2d_copy(
            pu1_pred, pred_strd, pu1_recon, recon_strd, trans_size, trans_size, e_chroma_plane);

        return;
    }

    /* first two bytes of the coded data both zero : dc add is sufficient */
    if(!((0 != pu1_ecd_data[0]) || (0 != pu1_ecd_data[1])))
    {
        ps_ctxt->s_cmn_opt_func.pf_itrans_recon_dc(
            pu1_pred,
            pred_strd,
            pu1_recon,
            recon_strd,
            trans_size,
            pi2_deq_data[0],
            e_chroma_plane /* chroma plane */
        );

        return;
    }

    /* general case : full inverse transform + recon */
    {
        /* size to index : (trans_size >> 2), except 16x16 which maps to 3; */
        /* the chroma function table is offset by one w.r.t. that index     */
        const WORD32 i4_size_idx = (16 == trans_size) ? 3 : (trans_size >> 2);

        ps_ctxt->apf_chrm_it_recon[i4_size_idx - 1](
            pi2_deq_data,
            &ps_ctxt->ai2_scratch[0],
            pu1_pred + (WORD32)e_chroma_plane,
            pu1_recon + (WORD32)e_chroma_plane,
            deq_dat_strd,
            pred_strd,
            recon_strd,
            zero_cols,
            zero_rows);
    }
}
2762 
2763 /**
2764 *******************************************************************************
2765 * \if Function name : ihevce_mpm_idx_based_filter_RDOPT_cand \endif
2766 *
* \brief Filters the RDOPT candidates based on mpm_idx
*
* \par   Description
* Builds the MPM candidate mode list from the left and top neighbours and
* clears the evaluation mark of every luma mode candidate, other than the
* first one, that is not present in that list
*
* \param[in] ps_ctxt : ptr to enc loop context
* \param[in] ps_cu_analyse : ptr to CU analyse structure
* \param[in] ps_left_nbr_4x4 left 4x4 neighbour pointer
* \param[in] ps_top_nbr_4x4 top 4x4 neighbour pointer
* \param[in] pu1_luma_mode luma mode candidate list, terminated by 255
* \param[in,out] pu1_eval_mark per candidate evaluation marks; set to 0 for
*                candidates that are filtered out
2778 *
2779 * \returns none
2780 *
2781 * \author
2782 *  Ittiam
2783 *
2784 *******************************************************************************
2785 */
2786 
void ihevce_mpm_idx_based_filter_RDOPT_cand(
    ihevce_enc_loop_ctxt_t *ps_ctxt,
    cu_analyse_t *ps_cu_analyse,
    nbr_4x4_t *ps_left_nbr_4x4,
    nbr_4x4_t *ps_top_nbr_4x4,
    UWORD8 *pu1_luma_mode,
    UWORD8 *pu1_eval_mark)
{
    nbr_avail_flags_t s_nbr_avail;
    WORD32 ai4_mpm_modes[3];
    WORD32 i4_cand;

    /* CU position and size as read from the analyse structure; the position */
    /* is doubled before being handed to the neighbour helpers               */
    const WORD32 i4_pos_x = ps_cu_analyse->b3_cu_pos_x << 1;
    const WORD32 i4_pos_y = ps_cu_analyse->b3_cu_pos_y << 1;
    const WORD32 i4_cu_size = ps_cu_analyse->u1_cu_size;

    /* Only the availability structure is needed; the returned flags are unused */
    (void)ihevce_get_nbr_intra(
        &s_nbr_avail,
        ps_ctxt->pu1_ctb_nbr_map,
        ps_ctxt->i4_nbr_map_strd,
        i4_pos_x,
        i4_pos_y,
        i4_cu_size >> 2);

    /* Populate the luma intra pred mode list for TU=CU. The same list is */
    /* used for TU=CU/2 since the modes are the same in both cases.       */
    ihevce_populate_intra_pred_mode(
        ps_top_nbr_4x4->b6_luma_intra_mode,
        ps_left_nbr_4x4->b6_luma_intra_mode,
        s_nbr_avail.u1_top_avail,
        s_nbr_avail.u1_left_avail,
        i4_pos_y,
        &ai4_mpm_modes[0]);

    /* Candidate list is terminated by 255; nothing to do for an empty list */
    if(255 == pu1_luma_mode[0])
    {
        return;
    }

    /* The 1st candidate must go through the RDOPT stage irrespective of   */
    /* whether it is present in the mpm list, hence the scan starts at 1.  */
    /* Every later candidate missing from the list has its eval mark       */
    /* cleared; marks of candidates found in the list are left untouched.  */
    for(i4_cand = 1; 255 != pu1_luma_mode[i4_cand]; i4_cand++)
    {
        const WORD32 i4_mode = pu1_luma_mode[i4_cand];

        if((i4_mode != ai4_mpm_modes[0]) && (i4_mode != ai4_mpm_modes[1]) &&
           (i4_mode != ai4_mpm_modes[2]))
        {
            pu1_eval_mark[i4_cand] = 0;
        }
    }
}
2867 
2868 /*!
2869 ******************************************************************************
2870 * \if Function name : ihevce_intra_rdopt_cu_ntu \endif
2871 *
2872 * \brief
*    Intra Coding unit function for RD opt mode
2874 *
2875 * \param[in] ps_ctxt    enc_loop module ctxt pointer
2876 * \param[in] ps_chrm_cu_buf_prms pointer to chroma buffer pointers structure
2877 * \param[in] pu1_luma_mode : pointer to luma mode
2878 * \param[in] ps_cu_analyse  pointer to cu analyse pointer
2879 * \param[in] pu1_src    pointer to source data buffer
2880 * \param[in] src_strd   source buffer stride
2881 * \param[in] pu1_cu_left pointer to left recon data buffer
2882 * \param[in] pu1_cu_top  pointer to top recon data buffer
2883 * \param[in] pu1_cu_top_left pointer to top left recon data buffer
2884 * \param[in] ps_left_nbr_4x4 : left 4x4 neighbour pointer
2885 * \param[in] ps_top_nbr_4x4 : top 4x4 neighbour pointer
2886 * \param[in] nbr_4x4_left_strd left nbr4x4 stride
2887 * \param[in] cu_left_stride left recon buffer stride
2888 * \param[in] curr_buf_idx RD opt buffer index for current usage
* \param[in] func_proc_mode : function processing mode @sa TU_SIZE_WRT_CU_T
2890 *
2891 * \return
2892 *    RDopt cost
2893 *
2894 * \author
2895 *  Ittiam
2896 *
2897 *****************************************************************************
2898 */
ihevce_intra_rdopt_cu_ntu(ihevce_enc_loop_ctxt_t * ps_ctxt,enc_loop_cu_prms_t * ps_cu_prms,void * pv_pred_org,WORD32 pred_strd_org,enc_loop_chrm_cu_buf_prms_t * ps_chrm_cu_buf_prms,UWORD8 * pu1_luma_mode,cu_analyse_t * ps_cu_analyse,void * pv_curr_src,void * pv_cu_left,void * pv_cu_top,void * pv_cu_top_left,nbr_4x4_t * ps_left_nbr_4x4,nbr_4x4_t * ps_top_nbr_4x4,WORD32 nbr_4x4_left_strd,WORD32 cu_left_stride,WORD32 curr_buf_idx,WORD32 func_proc_mode,WORD32 i4_alpha_stim_multiplier)2899 LWORD64 ihevce_intra_rdopt_cu_ntu(
2900     ihevce_enc_loop_ctxt_t *ps_ctxt,
2901     enc_loop_cu_prms_t *ps_cu_prms,
2902     void *pv_pred_org,
2903     WORD32 pred_strd_org,
2904     enc_loop_chrm_cu_buf_prms_t *ps_chrm_cu_buf_prms,
2905     UWORD8 *pu1_luma_mode,
2906     cu_analyse_t *ps_cu_analyse,
2907     void *pv_curr_src,
2908     void *pv_cu_left,
2909     void *pv_cu_top,
2910     void *pv_cu_top_left,
2911     nbr_4x4_t *ps_left_nbr_4x4,
2912     nbr_4x4_t *ps_top_nbr_4x4,
2913     WORD32 nbr_4x4_left_strd,
2914     WORD32 cu_left_stride,
2915     WORD32 curr_buf_idx,
2916     WORD32 func_proc_mode,
2917     WORD32 i4_alpha_stim_multiplier)
2918 {
2919     enc_loop_cu_final_prms_t *ps_final_prms;
2920     nbr_avail_flags_t s_nbr;
2921     nbr_4x4_t *ps_nbr_4x4;
2922     nbr_4x4_t *ps_tmp_lt_4x4;
2923     recon_datastore_t *ps_recon_datastore;
2924 
2925     ihevc_intra_pred_luma_ref_substitution_ft *ihevc_intra_pred_luma_ref_substitution_fptr;
2926 
2927     UWORD32 *pu4_nbr_flags;
2928     UWORD8 *pu1_intra_pred_mode;
2929     WORD32 cu_pos_x;
2930     WORD32 cu_pos_y;
2931     WORD32 trans_size = 0;
2932     UWORD8 *pu1_left;
2933     UWORD8 *pu1_top;
2934     UWORD8 *pu1_top_left;
2935     UWORD8 *pu1_recon;
2936     UWORD8 *pu1_csbf_buf;
2937     UWORD8 *pu1_ecd_data;
2938     WORD16 *pi2_deq_data;
2939     WORD32 deq_data_strd;
2940     LWORD64 total_rdopt_cost;
2941     WORD32 ctr;
2942     WORD32 left_strd;
2943     WORD32 i4_recon_stride;
2944     WORD32 csbf_strd;
2945     WORD32 ecd_data_bytes_cons;
2946     WORD32 num_4x4_in_tu;
2947     WORD32 num_4x4_in_cu;
2948     WORD32 chrm_present_flag;
2949     WORD32 tx_size;
2950     WORD32 cu_bits;
2951     WORD32 num_cu_parts = 0;
2952     WORD32 num_cands = 0;
2953     WORD32 cu_pos_x_8pelunits;
2954     WORD32 cu_pos_y_8pelunits;
2955     WORD32 i4_perform_rdoq;
2956     WORD32 i4_perform_sbh;
2957     UWORD8 u1_compute_spatial_ssd;
2958     UWORD8 u1_compute_recon;
2959     UWORD8 au1_intra_nxn_rdopt_ctxt_models[2][IHEVC_CAB_CTXT_END];
2960 
2961     UWORD16 u2_num_tus_in_cu = 0;
2962     WORD32 is_sub_pu_in_hq = 0;
2963     /* Get the RDOPT cost of the best CU mode for early_exit */
2964     LWORD64 prev_best_rdopt_cost = ps_ctxt->as_cu_prms[!curr_buf_idx].i8_best_rdopt_cost;
2965     /* cabac context of prev intra luma pred flag */
2966     UWORD8 u1_prev_flag_cabac_ctxt =
2967         ps_ctxt->au1_rdopt_init_ctxt_models[IHEVC_CAB_INTRA_LUMA_PRED_FLAG];
2968     WORD32 src_strd = ps_cu_prms->i4_luma_src_stride;
2969 
2970     UWORD8 u1_is_cu_noisy = ps_cu_prms->u1_is_cu_noisy && !DISABLE_INTRA_WHEN_NOISY;
2971 
2972     total_rdopt_cost = 0;
2973     ps_final_prms = &ps_ctxt->as_cu_prms[curr_buf_idx];
2974     ps_recon_datastore = &ps_final_prms->s_recon_datastore;
2975     i4_recon_stride = ps_final_prms->s_recon_datastore.i4_lumaRecon_stride;
2976     csbf_strd = ps_ctxt->i4_cu_csbf_strd;
2977     pu1_csbf_buf = &ps_ctxt->au1_cu_csbf[0];
2978     pu1_ecd_data = &ps_final_prms->pu1_cu_coeffs[0];
2979     pi2_deq_data = &ps_final_prms->pi2_cu_deq_coeffs[0];
2980     deq_data_strd = ps_cu_analyse->u1_cu_size; /* deq_data stride is cu size */
2981     ps_nbr_4x4 = &ps_ctxt->as_cu_nbr[curr_buf_idx][0];
2982     ps_tmp_lt_4x4 = ps_left_nbr_4x4;
2983     pu4_nbr_flags = &ps_final_prms->au4_nbr_flags[0];
2984     pu1_intra_pred_mode = &ps_final_prms->au1_intra_pred_mode[0];
2985     cu_pos_x = ps_cu_analyse->b3_cu_pos_x;
2986     cu_pos_y = ps_cu_analyse->b3_cu_pos_y;
2987     cu_pos_x_8pelunits = cu_pos_x;
2988     cu_pos_y_8pelunits = cu_pos_y;
2989 
2990     /* reset cu not coded cost */
2991     ps_ctxt->i8_cu_not_coded_cost = 0;
2992 
2993     /* based on the Processng mode */
2994     if(TU_EQ_CU == func_proc_mode)
2995     {
2996         ps_final_prms->u1_part_mode = SIZE_2Nx2N;
2997         trans_size = ps_cu_analyse->u1_cu_size;
2998         num_cu_parts = 1;
2999         num_cands = 1;
3000         u2_num_tus_in_cu = 1;
3001     }
3002     else if(TU_EQ_CU_DIV2 == func_proc_mode)
3003     {
3004         ps_final_prms->u1_part_mode = SIZE_2Nx2N;
3005         trans_size = ps_cu_analyse->u1_cu_size >> 1;
3006         num_cu_parts = 4;
3007         num_cands = 1;
3008         u2_num_tus_in_cu = 4;
3009     }
3010     else if(TU_EQ_SUBCU == func_proc_mode)
3011     {
3012         ps_final_prms->u1_part_mode = SIZE_NxN;
3013         trans_size = ps_cu_analyse->u1_cu_size >> 1;
3014         num_cu_parts = 4;
3015         /*In HQ for TU = SUBPU, all 35 modes used for RDOPT instead of 3 modes */
3016         if(IHEVCE_QUALITY_P3 > ps_ctxt->i4_quality_preset)
3017         {
3018             if(ps_ctxt->i1_slice_type != BSLICE)
3019             {
3020                 num_cands = (4 * MAX_INTRA_CU_CANDIDATES) + 2;
3021             }
3022             else
3023             {
3024                 num_cands = (2 * MAX_INTRA_CU_CANDIDATES);
3025             }
3026         }
3027         else
3028         {
3029             num_cands = MAX_INTRA_CU_CANDIDATES;
3030         }
3031         u2_num_tus_in_cu = 4;
3032     }
3033     else
3034     {
3035         /* should not enter here */
3036         ASSERT(0);
3037     }
3038 
3039     if(ps_ctxt->i1_cu_qp_delta_enable)
3040     {
3041         ihevce_update_cu_level_qp_lamda(ps_ctxt, ps_cu_analyse, trans_size, 1);
3042     }
3043 
3044     if(u1_is_cu_noisy && !ps_ctxt->u1_enable_psyRDOPT)
3045     {
3046         ps_ctxt->i8_cl_ssd_lambda_qf =
3047             ((float)ps_ctxt->i8_cl_ssd_lambda_qf * (100.0f - RDOPT_LAMBDA_DISCOUNT_WHEN_NOISY) /
3048              100.0f);
3049         ps_ctxt->i8_cl_ssd_lambda_chroma_qf =
3050             ((float)ps_ctxt->i8_cl_ssd_lambda_chroma_qf *
3051              (100.0f - RDOPT_LAMBDA_DISCOUNT_WHEN_NOISY) / 100.0f);
3052     }
3053 
3054     u1_compute_spatial_ssd = (ps_ctxt->i4_cu_qp <= MAX_QP_WHERE_SPATIAL_SSD_ENABLED) &&
3055                              (ps_ctxt->i4_quality_preset < IHEVCE_QUALITY_P3) &&
3056                              CONVERT_SSDS_TO_SPATIAL_DOMAIN;
3057 
3058     if(u1_is_cu_noisy || ps_ctxt->u1_enable_psyRDOPT)
3059     {
3060         u1_compute_spatial_ssd = (ps_ctxt->i4_cu_qp <= MAX_HEVC_QP) &&
3061                                  CONVERT_SSDS_TO_SPATIAL_DOMAIN;
3062     }
3063 
3064     /* populate the neigbours */
3065     pu1_left = (UWORD8 *)pv_cu_left;
3066     pu1_top = (UWORD8 *)pv_cu_top;
3067     pu1_top_left = (UWORD8 *)pv_cu_top_left;
3068     left_strd = cu_left_stride;
3069     num_4x4_in_tu = (trans_size >> 2);
3070     num_4x4_in_cu = (ps_cu_analyse->u1_cu_size >> 2);
3071     chrm_present_flag = 1;
3072     ecd_data_bytes_cons = 0;
3073     cu_bits = 0;
3074 
3075     /* get the 4x4 level postion of current cu */
3076     cu_pos_x = cu_pos_x << 1;
3077     cu_pos_y = cu_pos_y << 1;
3078 
3079     /* pouplate cu level params knowing that current is intra */
3080     ps_final_prms->u1_skip_flag = 0;
3081     ps_final_prms->u1_intra_flag = PRED_MODE_INTRA;
3082     ps_final_prms->u2_num_pus_in_cu = 1;
3083     /*init the is_cu_coded flag*/
3084     ps_final_prms->u1_is_cu_coded = 0;
3085     ps_final_prms->u4_cu_sad = 0;
3086 
3087     ps_final_prms->as_pu_enc_loop[0].b1_intra_flag = PRED_MODE_INTRA;
3088     ps_final_prms->as_pu_enc_loop[0].b4_wd = (trans_size >> 1) - 1;
3089     ps_final_prms->as_pu_enc_loop[0].b4_ht = (trans_size >> 1) - 1;
3090     ps_final_prms->as_pu_enc_loop[0].b4_pos_x = cu_pos_x;
3091     ps_final_prms->as_pu_enc_loop[0].b4_pos_y = cu_pos_y;
3092     ps_final_prms->as_pu_enc_loop[0].b1_merge_flag = 0;
3093 
3094     ps_final_prms->as_col_pu_enc_loop[0].b1_intra_flag = 1;
3095 
3096     /*copy qp directly as intra cant be skip*/
3097     ps_nbr_4x4->b8_qp = ps_ctxt->i4_cu_qp;
3098     ps_nbr_4x4->mv.s_l0_mv.i2_mvx = 0;
3099     ps_nbr_4x4->mv.s_l0_mv.i2_mvy = 0;
3100     ps_nbr_4x4->mv.s_l1_mv.i2_mvx = 0;
3101     ps_nbr_4x4->mv.s_l1_mv.i2_mvy = 0;
3102     ps_nbr_4x4->mv.i1_l0_ref_pic_buf_id = -1;
3103     ps_nbr_4x4->mv.i1_l1_ref_pic_buf_id = -1;
3104     ps_nbr_4x4->mv.i1_l0_ref_idx = -1;
3105     ps_nbr_4x4->mv.i1_l1_ref_idx = -1;
3106 
3107     /* RDOPT copy States :  TU init (best until prev TU) to current */
3108     memcpy(
3109         &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[curr_buf_idx]
3110              .s_cabac_ctxt.au1_ctxt_models[0],
3111         &ps_ctxt->au1_rdopt_init_ctxt_models[0],
3112         IHEVC_CAB_COEFFX_PREFIX);
3113 
3114     /* RDOPT copy States :update to init state if 0 cbf */
3115     memcpy(
3116         &au1_intra_nxn_rdopt_ctxt_models[0][0],
3117         &ps_ctxt->au1_rdopt_init_ctxt_models[0],
3118         IHEVC_CAB_COEFFX_PREFIX);
3119     memcpy(
3120         &au1_intra_nxn_rdopt_ctxt_models[1][0],
3121         &ps_ctxt->au1_rdopt_init_ctxt_models[0],
3122         IHEVC_CAB_COEFFX_PREFIX);
3123 
3124     /* loop for all partitions in CU  blocks */
3125     for(ctr = 0; ctr < num_cu_parts; ctr++)
3126     {
3127         UWORD8 *pu1_curr_mode;
3128         WORD32 cand_ctr;
3129         WORD32 nbr_flags;
3130 
3131         /* for NxN case to track the best mode       */
3132         /* for other cases zeroth index will be used */
3133         intra_prev_rem_flags_t as_intra_prev_rem[2];
3134         LWORD64 ai8_cand_rdopt_cost[2];
3135         UWORD32 au4_tu_sad[2];
3136         WORD32 ai4_tu_bits[2];
3137         WORD32 ai4_cbf[2];
3138         WORD32 ai4_curr_bytes[2];
3139         WORD32 ai4_zero_col[2];
3140         WORD32 ai4_zero_row[2];
3141         /* To store the pred, coeff and dequant for TU_EQ_SUBCU case (since mul.
3142         cand. are there) ping-pong buffer to store the best and current */
3143         UWORD8 au1_cur_pred_data[2][MIN_TU_SIZE * MIN_TU_SIZE];
3144         UWORD8 au1_intra_coeffs[2][MAX_SCAN_COEFFS_BYTES_4x4];
3145         WORD16 ai2_intra_deq_coeffs[2][MIN_TU_SIZE * MIN_TU_SIZE];
3146         /* Context models stored for RDopt store and restore purpose */
3147 
3148         UWORD8 au1_recon_availability[2];
3149 
3150         WORD32 best_cand_idx = 0;
3151         LWORD64 best_cand_cost = MAX_COST_64;
3152         /* counters to toggle b/w best and current */
3153         WORD32 best_intra_buf_idx = 1;
3154         WORD32 curr_intra_buf_idx = 0;
3155 
3156         /* copy the mode pointer to be used in inner loop */
3157         pu1_curr_mode = pu1_luma_mode;
3158 
3159         /* get the neighbour availability flags */
3160         nbr_flags = ihevce_get_nbr_intra(
3161             &s_nbr,
3162             ps_ctxt->pu1_ctb_nbr_map,
3163             ps_ctxt->i4_nbr_map_strd,
3164             cu_pos_x,
3165             cu_pos_y,
3166             num_4x4_in_tu);
3167 
3168         /* copy the nbr flags for chroma reuse */
3169         if(4 != trans_size)
3170         {
3171             *pu4_nbr_flags = nbr_flags;
3172         }
3173         else if(1 == chrm_present_flag)
3174         {
3175             /* compute the avail flags assuming luma trans is 8x8 */
3176             /* get the neighbour availability flags */
3177             *pu4_nbr_flags = ihevce_get_nbr_intra_mxn_tu(
3178                 ps_ctxt->pu1_ctb_nbr_map,
3179                 ps_ctxt->i4_nbr_map_strd,
3180                 cu_pos_x,
3181                 cu_pos_y,
3182                 (num_4x4_in_tu << 1),
3183                 (num_4x4_in_tu << 1));
3184         }
3185 
3186         u1_compute_recon = !u1_compute_spatial_ssd && ((num_cu_parts > 1) && (ctr < 3));
3187 
3188         if(!ctr && (u1_compute_spatial_ssd || u1_compute_recon))
3189         {
3190             ps_recon_datastore->u1_is_lumaRecon_available = 1;
3191         }
3192         else if(!ctr)
3193         {
3194             ps_recon_datastore->u1_is_lumaRecon_available = 0;
3195         }
3196 
3197         ihevc_intra_pred_luma_ref_substitution_fptr =
3198             ps_ctxt->ps_func_selector->ihevc_intra_pred_luma_ref_substitution_fptr;
3199 
3200         /* call reference array substitution */
3201         ihevc_intra_pred_luma_ref_substitution_fptr(
3202             pu1_top_left,
3203             pu1_top,
3204             pu1_left,
3205             left_strd,
3206             trans_size,
3207             nbr_flags,
3208             (UWORD8 *)ps_ctxt->pv_ref_sub_out,
3209             1);
3210 
3211         /* Intra Mode gating based on MPM cand list and encoder quality preset */
3212         if((ps_ctxt->i1_slice_type != ISLICE) && (TU_EQ_SUBCU == func_proc_mode) &&
3213            (ps_ctxt->i4_quality_preset >= IHEVCE_QUALITY_P3))
3214         {
3215             ihevce_mpm_idx_based_filter_RDOPT_cand(
3216                 ps_ctxt,
3217                 ps_cu_analyse,
3218                 ps_left_nbr_4x4,
3219                 ps_top_nbr_4x4,
3220                 pu1_luma_mode,
3221                 &ps_cu_analyse->s_cu_intra_cand.au1_nxn_eval_mark[ctr][0]);
3222         }
3223 
3224         if((TU_EQ_SUBCU == func_proc_mode) && (ps_ctxt->i4_quality_preset < IHEVCE_QUALITY_P3) &&
3225            (ps_cu_analyse->s_cu_intra_cand.au1_num_modes_added[ctr] >= MAX_INTRA_CU_CANDIDATES))
3226         {
3227             WORD32 ai4_mpm_mode_list[3];
3228             WORD32 i;
3229 
3230             WORD32 i4_curr_index = ps_cu_analyse->s_cu_intra_cand.au1_num_modes_added[ctr];
3231 
3232             ihevce_populate_intra_pred_mode(
3233                 ps_top_nbr_4x4->b6_luma_intra_mode,
3234                 ps_tmp_lt_4x4->b6_luma_intra_mode,
3235                 s_nbr.u1_top_avail,
3236                 s_nbr.u1_left_avail,
3237                 cu_pos_y,
3238                 &ai4_mpm_mode_list[0]);
3239 
3240             for(i = 0; i < 3; i++)
3241             {
3242                 if(ps_cu_analyse->s_cu_intra_cand
3243                        .au1_intra_luma_mode_nxn_hash[ctr][ai4_mpm_mode_list[i]] == 0)
3244                 {
3245                     ASSERT(ai4_mpm_mode_list[i] < 35);
3246 
3247                     ps_cu_analyse->s_cu_intra_cand
3248                         .au1_intra_luma_mode_nxn_hash[ctr][ai4_mpm_mode_list[i]] = 1;
3249                     pu1_luma_mode[i4_curr_index] = ai4_mpm_mode_list[i];
3250                     ps_cu_analyse->s_cu_intra_cand.au1_num_modes_added[ctr]++;
3251                     i4_curr_index++;
3252                 }
3253             }
3254 
3255             pu1_luma_mode[i4_curr_index] = 255;
3256         }
3257 
3258         /* loop over candidates for each partition */
3259         for(cand_ctr = 0; cand_ctr < num_cands; cand_ctr++)
3260         {
3261             WORD32 curr_pred_mode;
3262             WORD32 bits = 0;
3263             LWORD64 curr_cost;
3264             WORD32 luma_pred_func_idx;
3265             UWORD8 *pu1_curr_ecd_data;
3266             WORD16 *pi2_curr_deq_data;
3267             WORD32 curr_deq_data_strd;
3268             WORD32 pred_strd;
3269             UWORD8 *pu1_pred;
3270 
3271             /* if NXN case the recon and ecd data is stored in temp buffers */
3272             if(TU_EQ_SUBCU == func_proc_mode)
3273             {
3274                 pu1_pred = &au1_cur_pred_data[curr_intra_buf_idx][0];
3275                 pred_strd = trans_size;
3276                 pu1_curr_ecd_data = &au1_intra_coeffs[curr_intra_buf_idx][0];
3277                 pi2_curr_deq_data = &ai2_intra_deq_coeffs[curr_intra_buf_idx][0];
3278                 curr_deq_data_strd = trans_size;
3279 
3280                 ASSERT(trans_size == MIN_TU_SIZE);
3281             }
3282             else
3283             {
3284                 pu1_pred = (UWORD8 *)pv_pred_org;
3285                 pred_strd = pred_strd_org;
3286                 pu1_curr_ecd_data = pu1_ecd_data;
3287                 pi2_curr_deq_data = pi2_deq_data;
3288                 curr_deq_data_strd = deq_data_strd;
3289             }
3290 
3291             pu1_recon = ((UWORD8 *)ps_recon_datastore->apv_luma_recon_bufs[curr_intra_buf_idx]) +
3292                         (ctr & 1) * trans_size + (ctr > 1) * trans_size * i4_recon_stride;
3293 
3294             if(is_sub_pu_in_hq == 1)
3295             {
3296                 curr_pred_mode = cand_ctr;
3297             }
3298             else
3299             {
3300                 curr_pred_mode = pu1_curr_mode[cand_ctr];
3301             }
3302 
3303             /* If the candidate mode is 255, then break */
3304             if(255 == curr_pred_mode)
3305             {
3306                 break;
3307             }
3308             else if(250 == curr_pred_mode)
3309             {
3310                 continue;
3311             }
3312 
3313             /* check if this mode needs to be evaluated or not. For 2nx2n cases, this   */
3314             /* function will be called once per candidate, so this check has been done  */
3315             /* outside this function call. For NxN case, this function will be called   */
3316             /* only once, and all the candidates will be evaluated here.                */
3317             if(ps_ctxt->i4_quality_preset >= IHEVCE_QUALITY_P3)
3318             {
3319                 if((TU_EQ_SUBCU == func_proc_mode) &&
3320                    (0 == ps_cu_analyse->s_cu_intra_cand.au1_nxn_eval_mark[ctr][cand_ctr]))
3321                 {
3322                     continue;
3323                 }
3324             }
3325 
3326             /* call reference filtering */
3327             ps_ctxt->ps_func_selector->ihevc_intra_pred_ref_filtering_fptr(
3328                 (UWORD8 *)ps_ctxt->pv_ref_sub_out,
3329                 trans_size,
3330                 (UWORD8 *)ps_ctxt->pv_ref_filt_out,
3331                 curr_pred_mode,
3332                 ps_ctxt->i1_strong_intra_smoothing_enable_flag);
3333 
3334             /* use the look up to get the function idx */
3335             luma_pred_func_idx = g_i4_ip_funcs[curr_pred_mode];
3336 
3337             /* call the intra prediction function */
3338             ps_ctxt->apf_lum_ip[luma_pred_func_idx](
3339                 (UWORD8 *)ps_ctxt->pv_ref_filt_out,
3340                 1,
3341                 pu1_pred,
3342                 pred_strd,
3343                 trans_size,
3344                 curr_pred_mode);
3345 
3346             /* populate the coeffs scan idx */
3347             ps_ctxt->i4_scan_idx = SCAN_DIAG_UPRIGHT;
3348 
3349             /* for luma 4x4 and 8x8 transforms the scan is chosen based on intra pred mode */
3350             if(trans_size < 16)
3351             {
3352                 /* for modes from 22 upto 30 horizontal scan is used */
3353                 if((curr_pred_mode > 21) && (curr_pred_mode < 31))
3354                 {
3355                     ps_ctxt->i4_scan_idx = SCAN_HORZ;
3356                 }
3357                 /* for modes from 6 upto 14 vertical scan is used */
3358                 else if((curr_pred_mode > 5) && (curr_pred_mode < 15))
3359                 {
3360                     ps_ctxt->i4_scan_idx = SCAN_VERT;
3361                 }
3362             }
3363 
3364             /* RDOPT copy States :  TU init (best until prev TU) to current */
3365             COPY_CABAC_STATES_FRM_CAB_COEFFX_PREFIX(
3366                 &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[curr_buf_idx]
3367                         .s_cabac_ctxt.au1_ctxt_models[0] +
3368                     IHEVC_CAB_COEFFX_PREFIX,
3369                 &ps_ctxt->au1_rdopt_init_ctxt_models[0] + IHEVC_CAB_COEFFX_PREFIX,
3370                 IHEVC_CAB_CTXT_END - IHEVC_CAB_COEFFX_PREFIX);
3371 
3372             i4_perform_rdoq = ps_ctxt->s_rdoq_sbh_ctxt.i4_perform_all_cand_rdoq;
3373             i4_perform_sbh = ps_ctxt->s_rdoq_sbh_ctxt.i4_perform_all_cand_sbh;
3374 
3375 #if DISABLE_RDOQ_INTRA
3376             i4_perform_rdoq = 0;
3377 #endif
3378 
3379             /* 2 multi-dimensional arrays, based on trans size, of rounding factors to be added here */
3380             /* arrays are for rounding factor corr. to 0-1 decision and 1-2 decision */
3381             /* Currently the complete array will contain only single value*/
3382             /*The rounding factor is calculated with the formula
3383             Deadzone val = (((R1 - R0) * (2^(-8/3)) * lamMod) + 1)/2
3384             rounding factor = (1 - DeadZone Val)
3385 
3386             Assumption: Cabac states of All the sub-blocks in the TU are considered independent
3387             */
3388             if((ps_ctxt->i4_quant_rounding_level != FIXED_QUANT_ROUNDING))
3389             {
3390                 if((ps_ctxt->i4_quant_rounding_level == TU_LEVEL_QUANT_ROUNDING) && (ctr != 0))
3391                 {
3392                     double i4_lamda_modifier;
3393 
3394                     if((BSLICE == ps_ctxt->i1_slice_type) && (ps_ctxt->i4_temporal_layer_id))
3395                     {
3396                         i4_lamda_modifier =
3397                             ps_ctxt->i4_lamda_modifier *
3398                             CLIP3((((double)(ps_ctxt->i4_cu_qp - 12)) / 6.0), 2.00, 4.00);
3399                     }
3400                     else
3401                     {
3402                         i4_lamda_modifier = ps_ctxt->i4_lamda_modifier;
3403                     }
3404                     if(ps_ctxt->i4_use_const_lamda_modifier)
3405                     {
3406                         if(ISLICE == ps_ctxt->i1_slice_type)
3407                         {
3408                             i4_lamda_modifier = ps_ctxt->f_i_pic_lamda_modifier;
3409                         }
3410                         else
3411                         {
3412                             i4_lamda_modifier = CONST_LAMDA_MOD_VAL;
3413                         }
3414                     }
3415 
3416                     ps_ctxt->pi4_quant_round_factor_tu_0_1[trans_size >> 3] =
3417                         &ps_ctxt->i4_quant_round_tu[0][0];
3418                     ps_ctxt->pi4_quant_round_factor_tu_1_2[trans_size >> 3] =
3419                         &ps_ctxt->i4_quant_round_tu[1][0];
3420 
3421                     memset(
3422                         ps_ctxt->pi4_quant_round_factor_tu_0_1[trans_size >> 3],
3423                         0,
3424                         trans_size * trans_size * sizeof(WORD32));
3425                     memset(
3426                         ps_ctxt->pi4_quant_round_factor_tu_1_2[trans_size >> 3],
3427                         0,
3428                         trans_size * trans_size * sizeof(WORD32));
3429 
3430                     ihevce_quant_rounding_factor_gen(
3431                         trans_size,
3432                         1,
3433                         &ps_ctxt->s_rdopt_entropy_ctxt,
3434                         ps_ctxt->pi4_quant_round_factor_tu_0_1[trans_size >> 3],
3435                         ps_ctxt->pi4_quant_round_factor_tu_1_2[trans_size >> 3],
3436                         i4_lamda_modifier,
3437                         1);
3438                 }
3439                 else
3440                 {
3441                     ps_ctxt->pi4_quant_round_factor_tu_0_1[trans_size >> 3] =
3442                         ps_ctxt->pi4_quant_round_factor_cu_ctb_0_1[trans_size >> 3];
3443                     ps_ctxt->pi4_quant_round_factor_tu_1_2[trans_size >> 3] =
3444                         ps_ctxt->pi4_quant_round_factor_cu_ctb_1_2[trans_size >> 3];
3445                 }
3446             }
3447 
3448             /* call T Q IT IQ and recon function */
3449             ai4_cbf[curr_intra_buf_idx] = ihevce_t_q_iq_ssd_scan_fxn(
3450                 ps_ctxt,
3451                 pu1_pred,
3452                 pred_strd,
3453                 (UWORD8 *)pv_curr_src,
3454                 src_strd,
3455                 pi2_curr_deq_data,
3456                 curr_deq_data_strd,
3457                 pu1_recon,
3458                 i4_recon_stride,
3459                 pu1_curr_ecd_data,
3460                 pu1_csbf_buf,
3461                 csbf_strd,
3462                 trans_size,
3463                 PRED_MODE_INTRA,
3464                 &ai8_cand_rdopt_cost[curr_intra_buf_idx],
3465                 &ai4_curr_bytes[curr_intra_buf_idx],
3466                 &ai4_tu_bits[curr_intra_buf_idx],
3467                 &au4_tu_sad[curr_intra_buf_idx],
3468                 &ai4_zero_col[curr_intra_buf_idx],
3469                 &ai4_zero_row[curr_intra_buf_idx],
3470                 &au1_recon_availability[curr_intra_buf_idx],
3471                 i4_perform_rdoq,
3472                 i4_perform_sbh,
3473 #if USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
3474                 i4_alpha_stim_multiplier,
3475                 u1_is_cu_noisy,
3476 #endif
3477                 u1_compute_spatial_ssd ? SPATIAL_DOMAIN_SSD : FREQUENCY_DOMAIN_SSD,
3478                 1 /*early_cbf */
3479             );
3480 
3481 #if COMPUTE_NOISE_TERM_AT_THE_TU_LEVEL && !USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
3482             if(u1_is_cu_noisy && i4_alpha_stim_multiplier)
3483             {
3484 #if !USE_RECON_TO_EVALUATE_STIM_IN_RDOPT
3485                 ai8_cand_rdopt_cost[curr_intra_buf_idx] = ihevce_inject_stim_into_distortion(
3486                     pv_curr_src,
3487                     src_strd,
3488                     pu1_pred,
3489                     pred_strd,
3490                     ai8_cand_rdopt_cost[curr_intra_buf_idx],
3491                     i4_alpha_stim_multiplier,
3492                     trans_size,
3493                     0,
3494                     ps_ctxt->u1_enable_psyRDOPT,
3495                     NULL_PLANE);
3496 #else
3497                 if(u1_compute_spatial_ssd && au1_recon_availability[curr_intra_buf_idx])
3498                 {
3499                     ai8_cand_rdopt_cost[curr_intra_buf_idx] = ihevce_inject_stim_into_distortion(
3500                         pv_curr_src,
3501                         src_strd,
3502                         pu1_recon,
3503                         i4_recon_stride,
3504                         ai8_cand_rdopt_cost[curr_intra_buf_idx],
3505                         i4_alpha_stim_multiplier,
3506                         trans_size,
3507                         0,
3508                         ps_ctxt->u1_enable_psyRDOPT,
3509                         NULL_PLANE);
3510                 }
3511                 else
3512                 {
3513                     ai8_cand_rdopt_cost[curr_intra_buf_idx] = ihevce_inject_stim_into_distortion(
3514                         pv_curr_src,
3515                         src_strd,
3516                         pu1_pred,
3517                         pred_strd,
3518                         ai8_cand_rdopt_cost[curr_intra_buf_idx],
3519                         i4_alpha_stim_multiplier,
3520                         trans_size,
3521                         0,
3522                         ps_ctxt->u1_enable_psyRDOPT,
3523                         NULL_PLANE);
3524                 }
3525 #endif
3526             }
3527 #endif
3528 
3529             if(TU_EQ_SUBCU == func_proc_mode)
3530             {
3531                 ASSERT(ai4_curr_bytes[curr_intra_buf_idx] < MAX_SCAN_COEFFS_BYTES_4x4);
3532             }
3533 
3534             /* based on CBF/No CBF copy the corresponding state */
3535             if(0 == ai4_cbf[curr_intra_buf_idx])
3536             {
3537                 /* RDOPT copy States :update to init state if 0 cbf */
3538                 COPY_CABAC_STATES_FRM_CAB_COEFFX_PREFIX(
3539                     &au1_intra_nxn_rdopt_ctxt_models[curr_intra_buf_idx][0] +
3540                         IHEVC_CAB_COEFFX_PREFIX,
3541                     &ps_ctxt->au1_rdopt_init_ctxt_models[0] + IHEVC_CAB_COEFFX_PREFIX,
3542                     IHEVC_CAB_CTXT_END - IHEVC_CAB_COEFFX_PREFIX);
3543             }
3544             else
3545             {
3546                 /* RDOPT copy States :update to new state only if CBF is non zero */
3547                 COPY_CABAC_STATES_FRM_CAB_COEFFX_PREFIX(
3548                     &au1_intra_nxn_rdopt_ctxt_models[curr_intra_buf_idx][0] +
3549                         IHEVC_CAB_COEFFX_PREFIX,
3550                     &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[curr_buf_idx]
3551                             .s_cabac_ctxt.au1_ctxt_models[0] +
3552                         IHEVC_CAB_COEFFX_PREFIX,
3553                     IHEVC_CAB_CTXT_END - IHEVC_CAB_COEFFX_PREFIX);
3554             }
3555 
3556             /* call the function which perform intra mode prediction */
3557             ihevce_intra_pred_mode_signaling(
3558                 ps_top_nbr_4x4->b6_luma_intra_mode,
3559                 ps_tmp_lt_4x4->b6_luma_intra_mode,
3560                 s_nbr.u1_top_avail,
3561                 s_nbr.u1_left_avail,
3562                 cu_pos_y,
3563                 curr_pred_mode,
3564                 &as_intra_prev_rem[curr_intra_buf_idx]);
3565             /******************************************************************/
3566             /* PREV INTRA LUMA FLAG, MPM MODE and REM INTRA MODE bits for I_NxN
3567             The bits for these are evaluated for every RDO mode of current subcu
3568             as they can significantly contribute to RDO cost.  Note that these
3569             bits are not accounted for here (ai8_cand_rdopt_cost) as they
3570             are accounted for in encode_cu call later */
3571 
3572             /******************************************************************/
3573             /* PREV INTRA LUMA FLAG, MPM MODE and REM INTRA MODE bits for I_NxN
3574             The bits for these are evaluated for every RDO mode of current subcu
3575             as they can significantly contribute to RDO cost.  Note that these
3576             bits are not accounted for here (ai8_cand_rdopt_cost) as they
3577             are accounted for in encode_cu call later */
3578 
3579             /* Estimate bits to encode prev rem flag  for NXN mode */
3580             {
3581                 WORD32 bits_frac = gau2_ihevce_cabac_bin_to_bits
3582                     [u1_prev_flag_cabac_ctxt ^
3583                      as_intra_prev_rem[curr_intra_buf_idx].b1_prev_intra_luma_pred_flag];
3584 
3585                 /* rounding the fractional bits to nearest integer */
3586                 bits = ((bits_frac + (1 << (CABAC_FRAC_BITS_Q - 1))) >> CABAC_FRAC_BITS_Q);
3587             }
3588 
3589             /* based on prev flag all the mpmidx bits and rem bits */
3590             if(1 == as_intra_prev_rem[curr_intra_buf_idx].b1_prev_intra_luma_pred_flag)
3591             {
3592                 /* mpm_idx */
3593                 bits += as_intra_prev_rem[curr_intra_buf_idx].b2_mpm_idx ? 2 : 1;
3594             }
3595             else
3596             {
3597                 /* rem intra mode */
3598                 bits += 5;
3599             }
3600 
3601             bits += ai4_tu_bits[curr_intra_buf_idx];
3602 
3603             /* compute the total cost for current candidate */
3604             curr_cost = ai8_cand_rdopt_cost[curr_intra_buf_idx];
3605 
3606             /* get the final ssd cost */
3607             curr_cost +=
3608                 COMPUTE_RATE_COST_CLIP30(bits, ps_ctxt->i8_cl_ssd_lambda_qf, LAMBDA_Q_SHIFT);
3609 
3610             /* check for the best candidate cost */
3611             if(curr_cost < best_cand_cost)
3612             {
3613                 best_cand_cost = curr_cost;
3614                 best_cand_idx = cand_ctr;
3615                 best_intra_buf_idx = curr_intra_buf_idx;
3616                 curr_intra_buf_idx = !curr_intra_buf_idx;
3617             }
3618         }
3619 
3620         /***************    For TU_EQ_SUBCU case    *****************/
3621         /* Copy the pred for best cand. to the final pred array     */
3622         /* Copy the iq-coeff for best cand. to the final array      */
3623         /* copy the best coeffs data to final buffer                */
3624         if(TU_EQ_SUBCU == func_proc_mode)
3625         {
3626             /* Copy the pred for best cand. to the final pred array */
3627 
3628             ps_ctxt->s_cmn_opt_func.pf_copy_2d(
3629                 (UWORD8 *)pv_pred_org,
3630                 pred_strd_org,
3631                 &au1_cur_pred_data[best_intra_buf_idx][0],
3632                 trans_size,
3633                 trans_size,
3634                 trans_size);
3635 
3636             /* Copy the deq-coeff for best cand. to the final array */
3637 
3638             ps_ctxt->s_cmn_opt_func.pf_copy_2d(
3639                 (UWORD8 *)pi2_deq_data,
3640                 deq_data_strd << 1,
3641                 (UWORD8 *)&ai2_intra_deq_coeffs[best_intra_buf_idx][0],
3642                 trans_size << 1,
3643                 trans_size << 1,
3644                 trans_size);
3645             /* copy the coeffs to final cu ecd bytes buffer */
3646             memcpy(
3647                 pu1_ecd_data,
3648                 &au1_intra_coeffs[best_intra_buf_idx][0],
3649                 ai4_curr_bytes[best_intra_buf_idx]);
3650 
3651             pu1_recon = ((UWORD8 *)ps_recon_datastore->apv_luma_recon_bufs[best_intra_buf_idx]) +
3652                         (ctr & 1) * trans_size + (ctr > 1) * trans_size * i4_recon_stride;
3653         }
3654 
3655         /*----------   Calculate Recon for the best INTRA mode     ---------*/
3656         /* TU_EQ_CU case : No need for recon, otherwise recon is required   */
3657         /* Compute recon only for the best mode for TU_EQ_SUBCU case        */
3658         if(u1_compute_recon)
3659         {
3660             ihevce_it_recon_fxn(
3661                 ps_ctxt,
3662                 pi2_deq_data,
3663                 deq_data_strd,
3664                 (UWORD8 *)pv_pred_org,
3665                 pred_strd_org,
3666                 pu1_recon,
3667                 i4_recon_stride,
3668                 pu1_ecd_data,
3669                 trans_size,
3670                 PRED_MODE_INTRA,
3671                 ai4_cbf[best_intra_buf_idx],
3672                 ai4_zero_col[best_intra_buf_idx],
3673                 ai4_zero_row[best_intra_buf_idx]);
3674 
3675             ps_recon_datastore->au1_bufId_with_winning_LumaRecon[ctr] = best_intra_buf_idx;
3676         }
3677         else if(u1_compute_spatial_ssd && au1_recon_availability[best_intra_buf_idx])
3678         {
3679             ps_recon_datastore->au1_bufId_with_winning_LumaRecon[ctr] = best_intra_buf_idx;
3680         }
3681         else
3682         {
3683             ps_recon_datastore->au1_bufId_with_winning_LumaRecon[ctr] = UCHAR_MAX;
3684         }
3685 
3686         /* RDOPT copy States :update to best modes state */
3687         COPY_CABAC_STATES_FRM_CAB_COEFFX_PREFIX(
3688             &ps_ctxt->au1_rdopt_init_ctxt_models[0] + IHEVC_CAB_COEFFX_PREFIX,
3689             &au1_intra_nxn_rdopt_ctxt_models[best_intra_buf_idx][0] + IHEVC_CAB_COEFFX_PREFIX,
3690             IHEVC_CAB_CTXT_END - IHEVC_CAB_COEFFX_PREFIX);
3691 
3692         /* copy the prev,mpm_idx and rem modes from best cand */
3693         ps_final_prms->as_intra_prev_rem[ctr] = as_intra_prev_rem[best_intra_buf_idx];
3694 
3695         /* update the cabac context of prev intra pred mode flag */
3696         u1_prev_flag_cabac_ctxt = gau1_ihevc_next_state
3697             [(u1_prev_flag_cabac_ctxt << 1) |
3698              as_intra_prev_rem[best_intra_buf_idx].b1_prev_intra_luma_pred_flag];
3699 
3700         /* accumulate the TU bits into cu bits */
3701         cu_bits += ai4_tu_bits[best_intra_buf_idx];
3702 
3703         /* copy the intra pred mode for chroma reuse */
3704         if(is_sub_pu_in_hq == 0)
3705         {
3706             *pu1_intra_pred_mode = pu1_curr_mode[best_cand_idx];
3707         }
3708         else
3709         {
3710             *pu1_intra_pred_mode = best_cand_idx;
3711         }
3712 
3713         /* Store luma mode as chroma mode. If chroma prcs happens, and
3714         if a diff. mode wins, it should update this!! */
3715         if(1 == chrm_present_flag)
3716         {
3717             if(is_sub_pu_in_hq == 0)
3718             {
3719                 ps_final_prms->u1_chroma_intra_pred_actual_mode =
3720                     ((ps_ctxt->u1_chroma_array_type == 2)
3721                          ? gau1_chroma422_intra_angle_mapping[pu1_curr_mode[best_cand_idx]]
3722                          : pu1_curr_mode[best_cand_idx]);
3723             }
3724             else
3725             {
3726                 ps_final_prms->u1_chroma_intra_pred_actual_mode =
3727                     ((ps_ctxt->u1_chroma_array_type == 2)
3728                          ? gau1_chroma422_intra_angle_mapping[best_cand_idx]
3729                          : best_cand_idx);
3730             }
3731 
3732             ps_final_prms->u1_chroma_intra_pred_mode = 4;
3733         }
3734 
3735         /*remember the cbf flag to replicate qp for 4x4 neighbour*/
3736         ps_final_prms->u1_is_cu_coded |= ai4_cbf[best_intra_buf_idx];
3737 
3738         /*accumulate ssd over all TU of intra CU*/
3739         ps_final_prms->u4_cu_sad += au4_tu_sad[best_intra_buf_idx];
3740 
3741         /* update the bytes */
3742         ps_final_prms->as_tu_enc_loop[ctr].i4_luma_coeff_offset = ecd_data_bytes_cons;
3743         ps_final_prms->as_tu_enc_loop_temp_prms[ctr].i2_luma_bytes_consumed =
3744             ai4_curr_bytes[best_intra_buf_idx];
3745         /* update the zero_row and col info for the final mode */
3746         ps_final_prms->as_tu_enc_loop_temp_prms[ctr].u4_luma_zero_col =
3747             ai4_zero_col[best_intra_buf_idx];
3748         ps_final_prms->as_tu_enc_loop_temp_prms[ctr].u4_luma_zero_row =
3749             ai4_zero_row[best_intra_buf_idx];
3750 
3751         ps_final_prms->as_tu_enc_loop[ctr].i4_luma_coeff_offset = ecd_data_bytes_cons;
3752 
3753         /* update the total bytes cons */
3754         ecd_data_bytes_cons += ai4_curr_bytes[best_intra_buf_idx];
3755         pu1_ecd_data += ai4_curr_bytes[best_intra_buf_idx];
3756 
3757         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_y_cbf = ai4_cbf[best_intra_buf_idx];
3758         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_cb_cbf = 0;
3759         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_cr_cbf = 0;
3760         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_cb_cbf_subtu1 = 0;
3761         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_cr_cbf_subtu1 = 0;
3762         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b3_chroma_intra_mode_idx = chrm_present_flag;
3763         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b7_qp = ps_ctxt->i4_cu_qp;
3764         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_first_tu_in_cu = 0;
3765         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_transquant_bypass = 0;
3766         GETRANGE(tx_size, trans_size);
3767         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b3_size = tx_size - 3;
3768         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b4_pos_x = cu_pos_x;
3769         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b4_pos_y = cu_pos_y;
3770 
3771         /* replicate the nbr 4x4 structure for all 4x4 blocks of the current TU */
3772         ps_nbr_4x4->b1_skip_flag = 0;
3773         ps_nbr_4x4->b1_intra_flag = 1;
3774         ps_nbr_4x4->b1_pred_l0_flag = 0;
3775         ps_nbr_4x4->b1_pred_l1_flag = 0;
3776 
3777         if(is_sub_pu_in_hq == 0)
3778         {
3779             ps_nbr_4x4->b6_luma_intra_mode = pu1_curr_mode[best_cand_idx];
3780         }
3781         else
3782         {
3783             ps_nbr_4x4->b6_luma_intra_mode = best_cand_idx;
3784         }
3785 
3786         ps_nbr_4x4->b1_y_cbf = ai4_cbf[best_intra_buf_idx];
3787 
3788         /* since tu size can be less than cusize, replication is done with strd */
3789         {
3790             WORD32 i, j;
3791             nbr_4x4_t *ps_tmp_4x4;
3792 
3793             ps_tmp_4x4 = ps_nbr_4x4;
3794 
3795             for(i = 0; i < num_4x4_in_tu; i++)
3796             {
3797                 for(j = 0; j < num_4x4_in_tu; j++)
3798                 {
3799                     ps_tmp_4x4[j] = *ps_nbr_4x4;
3800                 }
3801                 /* row level update*/
3802                 ps_tmp_4x4 += num_4x4_in_cu;
3803             }
3804         }
3805 
3806         if(TU_EQ_SUBCU == func_proc_mode)
3807         {
3808             pu1_luma_mode += ((MAX_INTRA_CU_CANDIDATES * 4) + 2 + 1);
3809         }
3810 
3811         if((num_cu_parts > 1) && (ctr < 3))
3812         {
3813             /* set the neighbour map to 1 */
3814             ihevce_set_nbr_map(
3815                 ps_ctxt->pu1_ctb_nbr_map,
3816                 ps_ctxt->i4_nbr_map_strd,
3817                 cu_pos_x,
3818                 cu_pos_y,
3819                 trans_size >> 2,
3820                 1);
3821 
3822             /* block level updates block number (1 & 3 )*/
3823             pv_curr_src = (UWORD8 *)pv_curr_src + trans_size;
3824             pv_pred_org = (UWORD8 *)pv_pred_org + trans_size;
3825             pi2_deq_data += trans_size;
3826 
3827             switch(ctr)
3828             {
3829             case 0:
3830             {
3831                 pu1_left = pu1_recon + trans_size - 1;
3832                 pu1_top += trans_size;
3833                 pu1_top_left = pu1_top - 1;
3834                 left_strd = i4_recon_stride;
3835 
3836                 break;
3837             }
3838             case 1:
3839             {
3840                 ASSERT(
3841                     (ps_recon_datastore->au1_bufId_with_winning_LumaRecon[0] == 0) ||
3842                     (ps_recon_datastore->au1_bufId_with_winning_LumaRecon[0] == 1));
3843 
3844                 /* Since the 'lumaRefSubstitution' function expects both Top and */
3845                 /* TopRight recon pixels to be present in the same buffer */
3846                 if(ps_recon_datastore->au1_bufId_with_winning_LumaRecon[0] !=
3847                    ps_recon_datastore->au1_bufId_with_winning_LumaRecon[1])
3848                 {
3849                     UWORD8 *pu1_src =
3850                         ((UWORD8 *)ps_recon_datastore->apv_luma_recon_bufs
3851                              [ps_recon_datastore->au1_bufId_with_winning_LumaRecon[1]]) +
3852                         trans_size;
3853                     UWORD8 *pu1_dst =
3854                         ((UWORD8 *)ps_recon_datastore->apv_luma_recon_bufs
3855                              [ps_recon_datastore->au1_bufId_with_winning_LumaRecon[0]]) +
3856                         trans_size;
3857 
3858                     ps_ctxt->s_cmn_opt_func.pf_copy_2d(
3859                         pu1_dst, i4_recon_stride, pu1_src, i4_recon_stride, trans_size, trans_size);
3860 
3861                     ps_recon_datastore->au1_bufId_with_winning_LumaRecon[1] =
3862                         ps_recon_datastore->au1_bufId_with_winning_LumaRecon[0];
3863                 }
3864 
3865                 pu1_left = (UWORD8 *)pv_cu_left + trans_size * cu_left_stride;
3866                 pu1_top = ((UWORD8 *)ps_recon_datastore->apv_luma_recon_bufs
3867                                [ps_recon_datastore->au1_bufId_with_winning_LumaRecon[0]]) +
3868                           (trans_size - 1) * i4_recon_stride;
3869                 pu1_top_left = pu1_left - cu_left_stride;
3870                 left_strd = cu_left_stride;
3871 
3872                 break;
3873             }
3874             case 2:
3875             {
3876                 ASSERT(
3877                     (ps_recon_datastore->au1_bufId_with_winning_LumaRecon[1] == 0) ||
3878                     (ps_recon_datastore->au1_bufId_with_winning_LumaRecon[1] == 1));
3879 
3880                 pu1_left = pu1_recon + trans_size - 1;
3881                 pu1_top = ((UWORD8 *)ps_recon_datastore->apv_luma_recon_bufs
3882                                [ps_recon_datastore->au1_bufId_with_winning_LumaRecon[1]]) +
3883                           (trans_size - 1) * i4_recon_stride + trans_size;
3884                 pu1_top_left = pu1_top - 1;
3885                 left_strd = i4_recon_stride;
3886 
3887                 break;
3888             }
3889             }
3890 
3891             pu1_csbf_buf += num_4x4_in_tu;
3892             cu_pos_x += num_4x4_in_tu;
3893             ps_nbr_4x4 += num_4x4_in_tu;
3894             ps_top_nbr_4x4 += num_4x4_in_tu;
3895             ps_tmp_lt_4x4 = ps_nbr_4x4 - 1;
3896 
3897             pu1_intra_pred_mode++;
3898 
3899             /* after 2 blocks increment the pointers to bottom blocks */
3900             if(1 == ctr)
3901             {
3902                 pv_curr_src = (UWORD8 *)pv_curr_src - (trans_size << 1);
3903                 pv_curr_src = (UWORD8 *)pv_curr_src + (trans_size * src_strd);
3904 
3905                 pv_pred_org = (UWORD8 *)pv_pred_org - (trans_size << 1);
3906                 pv_pred_org = (UWORD8 *)pv_pred_org + (trans_size * pred_strd_org);
3907                 pi2_deq_data -= (trans_size << 1);
3908                 pi2_deq_data += (trans_size * deq_data_strd);
3909 
3910                 pu1_csbf_buf -= (num_4x4_in_tu << 1);
3911                 pu1_csbf_buf += (num_4x4_in_tu * csbf_strd);
3912 
3913                 ps_nbr_4x4 -= (num_4x4_in_tu << 1);
3914                 ps_nbr_4x4 += (num_4x4_in_tu * num_4x4_in_cu);
3915                 ps_top_nbr_4x4 = ps_nbr_4x4 - num_4x4_in_cu;
3916                 ps_tmp_lt_4x4 = ps_left_nbr_4x4 + (num_4x4_in_tu * nbr_4x4_left_strd);
3917 
3918                 /* decrement pos x to start */
3919                 cu_pos_x -= (num_4x4_in_tu << 1);
3920                 cu_pos_y += num_4x4_in_tu;
3921             }
3922         }
3923 
3924 #if RDOPT_ENABLE
3925         /* compute the RDOPT cost for the current TU */
3926         ai8_cand_rdopt_cost[best_intra_buf_idx] += COMPUTE_RATE_COST_CLIP30(
3927             ai4_tu_bits[best_intra_buf_idx], ps_ctxt->i8_cl_ssd_lambda_qf, LAMBDA_Q_SHIFT);
3928 #endif
3929 
3930         /* accumulate the costs */
3931         total_rdopt_cost += ai8_cand_rdopt_cost[best_intra_buf_idx];
3932 
3933         if(ps_ctxt->i4_bitrate_instance_num || ps_ctxt->i4_num_bitrates == 1)
3934         {
3935             /* Early exit : If the current running cost exceeds
3936             the prev. best mode cost, break */
3937             if(total_rdopt_cost > prev_best_rdopt_cost)
3938             {
3939                 return (total_rdopt_cost);
3940             }
3941         }
3942 
3943         /* if transfrom size is 4x4 then only first luma 4x4 will have chroma*/
3944         chrm_present_flag = (4 != trans_size) ? 1 : INTRA_PRED_CHROMA_IDX_NONE;
3945 
3946         pu4_nbr_flags++;
3947     }
3948     /* Modify the cost function for this CU. */
3949     /* loop in for 8x8 blocks */
3950     if(ps_ctxt->u1_enable_psyRDOPT)
3951     {
3952         UWORD8 *pu1_recon_cu;
3953         WORD32 recon_stride;
3954         WORD32 curr_pos_x;
3955         WORD32 curr_pos_y;
3956         WORD32 start_index;
3957         WORD32 num_horz_cu_in_ctb;
3958         WORD32 cu_size;
3959         WORD32 had_block_size;
3960 
3961         /* tODO: sreenivasa ctb size has to be used appropriately */
3962         had_block_size = 8;
3963         cu_size = ps_cu_analyse->u1_cu_size; /* todo */
3964         num_horz_cu_in_ctb = 64 / had_block_size;
3965 
3966         curr_pos_x = ps_cu_analyse->b3_cu_pos_x << 3; /* pel units */
3967         curr_pos_y = ps_cu_analyse->b3_cu_pos_y << 3; /* pel units */
3968         recon_stride = ps_final_prms->s_recon_datastore.i4_lumaRecon_stride;
3969         pu1_recon_cu =
3970             ((UWORD8 *)ps_final_prms->s_recon_datastore
3971                  .apv_luma_recon_bufs[ps_recon_datastore->au1_bufId_with_winning_LumaRecon[0]]);
3972         /* + \  curr_pos_x + curr_pos_y * recon_stride; */
3973 
3974         /* start index to index the source satd of curr cu int he current ctb*/
3975         start_index =
3976             (curr_pos_x / had_block_size) + (curr_pos_y / had_block_size) * num_horz_cu_in_ctb;
3977 
3978         {
3979             total_rdopt_cost += ihevce_psy_rd_cost(
3980                 ps_ctxt->ai4_source_satd_8x8,
3981                 pu1_recon_cu,
3982                 recon_stride,
3983                 1,  //
3984                 cu_size,
3985                 0,  // pic type
3986                 0,  //layer id
3987                 ps_ctxt->i4_satd_lamda,  // lambda
3988                 start_index,
3989                 ps_ctxt->u1_is_input_data_hbd,
3990                 ps_ctxt->u4_psy_strength,
3991                 &ps_ctxt->s_cmn_opt_func
3992 
3993             );  // 8 bit
3994         }
3995     }
3996 
3997 #if !FORCE_INTRA_TU_DEPTH_TO_0  //RATIONALISE_NUM_RDO_MODES_IN_PQ_AND_HQ
3998     if(TU_EQ_SUBCU == func_proc_mode)
3999     {
4000         UWORD8 au1_tu_eq_cu_div2_modes[4];
4001         UWORD8 au1_freq_of_mode[4];
4002 
4003         WORD32 i4_num_clusters = ihevce_find_num_clusters_of_identical_points_1D(
4004             ps_final_prms->au1_intra_pred_mode, au1_tu_eq_cu_div2_modes, au1_freq_of_mode, 4);
4005 
4006         if(1 == i4_num_clusters)
4007         {
4008             ps_final_prms->u2_num_pus_in_cu = 1;
4009             ps_final_prms->u1_part_mode = SIZE_2Nx2N;
4010         }
4011     }
4012 #endif
4013 
4014     /* store the num TUs*/
4015     ps_final_prms->u2_num_tus_in_cu = u2_num_tus_in_cu;
4016 
4017     /* update the bytes consumed */
4018     ps_final_prms->i4_num_bytes_ecd_data = ecd_data_bytes_cons;
4019 
4020     /* store the current cu size to final prms */
4021     ps_final_prms->u1_cu_size = ps_cu_analyse->u1_cu_size;
4022 
4023     /* cu bits will be having luma residual bits till this point    */
4024     /* if zero_cbf eval is disabled then cu bits will be zero       */
4025     ps_final_prms->u4_cu_luma_res_bits = cu_bits;
4026 
4027     /* ------------- Chroma processing -------------- */
4028     /* Chroma rdopt eval for each luma candidate only for HIGH QUALITY/MEDIUM SPEDD preset*/
4029     if(1 == ps_ctxt->s_chroma_rdopt_ctxt.u1_eval_chrm_rdopt)
4030     {
4031         LWORD64 chrm_rdopt_cost;
4032         WORD32 chrm_rdopt_tu_bits;
4033 
4034         /* Store the current RDOPT cost to enable early exit in chrom_prcs */
4035         ps_ctxt->as_cu_prms[curr_buf_idx].i8_curr_rdopt_cost = total_rdopt_cost;
4036 
4037         chrm_rdopt_cost = ihevce_chroma_cu_prcs_rdopt(
4038             ps_ctxt,
4039             curr_buf_idx,
4040             func_proc_mode,
4041             ps_chrm_cu_buf_prms->pu1_curr_src,
4042             ps_chrm_cu_buf_prms->i4_chrm_src_stride,
4043             ps_chrm_cu_buf_prms->pu1_cu_left,
4044             ps_chrm_cu_buf_prms->pu1_cu_top,
4045             ps_chrm_cu_buf_prms->pu1_cu_top_left,
4046             ps_chrm_cu_buf_prms->i4_cu_left_stride,
4047             cu_pos_x_8pelunits,
4048             cu_pos_y_8pelunits,
4049             &chrm_rdopt_tu_bits,
4050             i4_alpha_stim_multiplier,
4051             u1_is_cu_noisy);
4052 
4053 #if WEIGH_CHROMA_COST
4054         chrm_rdopt_cost = (LWORD64)(
4055             (chrm_rdopt_cost * ps_ctxt->u4_chroma_cost_weighing_factor +
4056              (1 << (CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT - 1))) >>
4057             CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT);
4058 #endif
4059 
4060 #if CHROMA_RDOPT_ENABLE
4061         total_rdopt_cost += chrm_rdopt_cost;
4062 #endif
4063         cu_bits += chrm_rdopt_tu_bits;
4064 
4065         /* cu bits for chroma residual if chroma rdopt is on       */
4066         /* if zero_cbf eval is disabled then cu bits will be zero  */
4067         ps_final_prms->u4_cu_chroma_res_bits = chrm_rdopt_tu_bits;
4068 
4069         if(ps_ctxt->i4_bitrate_instance_num || ps_ctxt->i4_num_bitrates == 1)
4070         {
4071             /* Early exit : If the current running cost exceeds
4072             the prev. best mode cost, break */
4073             if(total_rdopt_cost > prev_best_rdopt_cost)
4074             {
4075                 return (total_rdopt_cost);
4076             }
4077         }
4078     }
4079     else
4080     {}
4081 
4082     /* RDOPT copy States :  Best after all luma TUs to current */
4083     COPY_CABAC_STATES_FRM_CAB_COEFFX_PREFIX(
4084         &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[curr_buf_idx]
4085                 .s_cabac_ctxt.au1_ctxt_models[0] +
4086             IHEVC_CAB_COEFFX_PREFIX,
4087         &ps_ctxt->au1_rdopt_init_ctxt_models[0] + IHEVC_CAB_COEFFX_PREFIX,
4088         IHEVC_CAB_CTXT_END - IHEVC_CAB_COEFFX_PREFIX);
4089 
4090     /* get the neighbour availability flags for current cu  */
4091     ihevce_get_only_nbr_flag(
4092         &s_nbr,
4093         ps_ctxt->pu1_ctb_nbr_map,
4094         ps_ctxt->i4_nbr_map_strd,
4095         (cu_pos_x_8pelunits << 1),
4096         (cu_pos_y_8pelunits << 1),
4097         (trans_size << 1),
4098         (trans_size << 1));
4099 
4100     /* call the entropy rdo encode to get the bit estimate for current cu */
4101     /*if ZERO_CBF eval is enabled then this function will return only CU header bits */
4102     {
4103         /*cbf_bits will account for both texture and cbf bits when zero cbf eval flag is 0*/
4104         WORD32 cbf_bits, header_bits;
4105 
4106         header_bits = ihevce_entropy_rdo_encode_cu(
4107             &ps_ctxt->s_rdopt_entropy_ctxt,
4108             ps_final_prms,
4109             cu_pos_x_8pelunits,
4110             cu_pos_y_8pelunits,
4111             ps_cu_analyse->u1_cu_size,
4112             s_nbr.u1_top_avail,
4113             s_nbr.u1_left_avail,
4114             &ps_final_prms->pu1_cu_coeffs[0],
4115             &cbf_bits);
4116 
4117         cu_bits += header_bits;
4118 
4119         /* cbf bits are excluded from header bits, instead considered as texture bits */
4120         /* incase if zero cbf eval is disabled then texture bits gets added here */
4121         ps_final_prms->u4_cu_hdr_bits = (header_bits - cbf_bits);
4122         ps_final_prms->u4_cu_cbf_bits = cbf_bits;
4123 
4124 #if RDOPT_ENABLE
4125         /* add the cost of coding the cu bits */
4126         total_rdopt_cost +=
4127             COMPUTE_RATE_COST_CLIP30(header_bits, ps_ctxt->i8_cl_ssd_lambda_qf, LAMBDA_Q_SHIFT);
4128 #endif
4129     }
4130     return (total_rdopt_cost);
4131 }
4132 /*!
4133 ******************************************************************************
4134 * \if Function name : ihevce_inter_rdopt_cu_ntu \endif
4135 *
4136 * \brief
4137 *    Inter coding unit function which performs the TQ, IT, IQ and recon for luma
4138 *
4139 * \param[in] ps_ctxt       enc_loop module ctxt pointer
4140 * \param[in] ps_inter_cand pointer to inter candidate structure
4141 * \param[in] pv_src        pointer to source data buffer
4142 * \param[in] cu_size       Current CU size
4143 * \param[in] cu_pos_x      cu position x w.r.t to ctb
4144 * \param[in] cu_pos_y      cu position y w.r.t to ctb
4145 * \param[in] ps_cu_prms    pointer to CU params; supplies the luma source stride (i4_luma_src_stride) and u1_is_cu_noisy
4146 * \param[in] curr_buf_idx  buffer index for current output storage
4147 * \param[in] ps_chrm_cu_buf_prms pointer to chroma buffer pointers structure
4148 *
4149 * \return
4150 *    Rdopt cost
4151 *
4152 * \author
4153 *  Ittiam
4154 *
4155 *****************************************************************************
4156 */
ihevce_inter_rdopt_cu_ntu(ihevce_enc_loop_ctxt_t * ps_ctxt,enc_loop_cu_prms_t * ps_cu_prms,void * pv_src,WORD32 cu_size,WORD32 cu_pos_x,WORD32 cu_pos_y,WORD32 curr_buf_idx,enc_loop_chrm_cu_buf_prms_t * ps_chrm_cu_buf_prms,cu_inter_cand_t * ps_inter_cand,cu_analyse_t * ps_cu_analyse,WORD32 i4_alpha_stim_multiplier)4157 LWORD64 ihevce_inter_rdopt_cu_ntu(
4158     ihevce_enc_loop_ctxt_t *ps_ctxt,
4159     enc_loop_cu_prms_t *ps_cu_prms,
4160     void *pv_src,
4161     WORD32 cu_size,
4162     WORD32 cu_pos_x,
4163     WORD32 cu_pos_y,
4164     WORD32 curr_buf_idx,
4165     enc_loop_chrm_cu_buf_prms_t *ps_chrm_cu_buf_prms,
4166     cu_inter_cand_t *ps_inter_cand,
4167     cu_analyse_t *ps_cu_analyse,
4168     WORD32 i4_alpha_stim_multiplier)
4169 {
4170     enc_loop_cu_final_prms_t *ps_final_prms;
4171     nbr_4x4_t *ps_nbr_4x4;
4172     tu_prms_t s_tu_prms[64 * 4];
4173     tu_prms_t *ps_tu_prms;
4174 
4175     WORD32 i4_perform_rdoq;
4176     WORD32 i4_perform_sbh;
4177     WORD32 ai4_tu_split_flags[4];
4178     WORD32 ai4_tu_early_cbf[4];
4179     WORD32 num_split_flags = 1;
4180     WORD32 i;
4181     UWORD8 u1_tu_size;
4182     UWORD8 *pu1_pred;
4183     UWORD8 *pu1_ecd_data;
4184     WORD16 *pi2_deq_data;
4185     UWORD8 *pu1_csbf_buf;
4186     UWORD8 *pu1_tu_sz_sft;
4187     UWORD8 *pu1_tu_posx;
4188     UWORD8 *pu1_tu_posy;
4189     LWORD64 total_rdopt_cost;
4190     WORD32 ctr;
4191     WORD32 chrm_ctr;
4192     WORD32 num_tu_in_cu = 0;
4193     WORD32 pred_stride;
4194     WORD32 recon_stride;
4195     WORD32 trans_size = ps_cu_analyse->u1_cu_size;
4196     WORD32 csbf_strd;
4197     WORD32 chrm_present_flag;
4198     WORD32 ecd_data_bytes_cons;
4199     WORD32 num_4x4_in_cu;
4200     WORD32 num_4x4_in_tu;
4201     WORD32 recon_func_mode;
4202     WORD32 cu_bits;
4203     UWORD8 u1_compute_spatial_ssd;
4204 
4205     /* min_trans_size is initialized to some huge number than usual TU sizes */
4206     WORD32 i4_min_trans_size = 256;
4207     /* Get the RDOPT cost of the best CU mode for early_exit */
4208     LWORD64 prev_best_rdopt_cost = ps_ctxt->as_cu_prms[!curr_buf_idx].i8_best_rdopt_cost;
4209     WORD32 src_strd = ps_cu_prms->i4_luma_src_stride;
4210 
4211     /* model for no residue syntax qt root cbf flag */
4212     UWORD8 u1_qtroot_cbf_cabac_model = ps_ctxt->au1_rdopt_init_ctxt_models[IHEVC_CAB_NORES_IDX];
4213 
4214     /* backup copy of cabac states for restoration if zero cu reside rdo wins later */
4215     UWORD8 au1_rdopt_init_ctxt_models[IHEVC_CAB_CTXT_END];
4216 
4217     /* for skip cases tables are not reqquired */
4218     UWORD8 u1_skip_tu_sz_sft = 0;
4219     UWORD8 u1_skip_tu_posx = 0;
4220     UWORD8 u1_skip_tu_posy = 0;
4221     UWORD8 u1_is_cu_noisy = ps_cu_prms->u1_is_cu_noisy;
4222 
4223     /* get the pointers based on curbuf idx */
4224     ps_final_prms = &ps_ctxt->as_cu_prms[curr_buf_idx];
4225     ps_nbr_4x4 = &ps_ctxt->as_cu_nbr[curr_buf_idx][0];
4226     pu1_ecd_data = &ps_final_prms->pu1_cu_coeffs[0];
4227     pi2_deq_data = &ps_final_prms->pi2_cu_deq_coeffs[0];
4228     csbf_strd = ps_ctxt->i4_cu_csbf_strd;
4229     pu1_csbf_buf = &ps_ctxt->au1_cu_csbf[0];
4230 
4231     pred_stride = ps_inter_cand->i4_pred_data_stride;
4232     recon_stride = cu_size;
4233     pu1_pred = ps_inter_cand->pu1_pred_data;
4234     chrm_ctr = 0;
4235     ecd_data_bytes_cons = 0;
4236     total_rdopt_cost = 0;
4237     num_4x4_in_cu = cu_size >> 2;
4238     recon_func_mode = PRED_MODE_INTER;
4239     cu_bits = 0;
4240 
4241     /* get the 4x4 level postion of current cu */
4242     cu_pos_x = cu_pos_x << 1;
4243     cu_pos_y = cu_pos_y << 1;
4244 
4245     /* default value for cu coded flag */
4246     ps_final_prms->u1_is_cu_coded = 0;
4247 
4248     /*init of ssd of CU accuumulated over all TU*/
4249     ps_final_prms->u4_cu_sad = 0;
4250 
4251     /* populate the coeffs scan idx */
4252     ps_ctxt->i4_scan_idx = SCAN_DIAG_UPRIGHT;
4253 
4254 #if ENABLE_INTER_ZCU_COST
4255     /* reset cu not coded cost */
4256     ps_ctxt->i8_cu_not_coded_cost = 0;
4257 
4258     /* backup copy of cabac states for restoration if zero cu reside rdo wins later */
4259     memcpy(au1_rdopt_init_ctxt_models, &ps_ctxt->au1_rdopt_init_ctxt_models[0], IHEVC_CAB_CTXT_END);
4260 #endif
4261 
4262     if(ps_cu_analyse->u1_cu_size == 64)
4263     {
4264         num_split_flags = 4;
4265         u1_tu_size = 32;
4266     }
4267     else
4268     {
4269         num_split_flags = 1;
4270         u1_tu_size = ps_cu_analyse->u1_cu_size;
4271     }
4272 
4273     /* ckeck for skip mode */
4274     if(1 == ps_final_prms->u1_skip_flag)
4275     {
4276         if(64 == cu_size)
4277         {
4278             /* TU = CU/2 is set but no trnaform is evaluated  */
4279             num_tu_in_cu = 4;
4280             pu1_tu_sz_sft = &gau1_inter_tu_shft_amt[0];
4281             pu1_tu_posx = &gau1_inter_tu_posx_scl_amt[0];
4282             pu1_tu_posy = &gau1_inter_tu_posy_scl_amt[0];
4283         }
4284         else
4285         {
4286             /* TU = CU is set but no trnaform is evaluated  */
4287             num_tu_in_cu = 1;
4288             pu1_tu_sz_sft = &u1_skip_tu_sz_sft;
4289             pu1_tu_posx = &u1_skip_tu_posx;
4290             pu1_tu_posy = &u1_skip_tu_posy;
4291         }
4292 
4293         recon_func_mode = PRED_MODE_SKIP;
4294     }
4295     /* check for PU part mode being AMP or No AMP */
4296     else if(ps_final_prms->u1_part_mode < SIZE_2NxnU)
4297     {
4298         if((SIZE_2Nx2N == ps_final_prms->u1_part_mode) && (cu_size < 64))
4299         {
4300             /* TU= CU is evaluated 2Nx2N inter case */
4301             num_tu_in_cu = 1;
4302             pu1_tu_sz_sft = &u1_skip_tu_sz_sft;
4303             pu1_tu_posx = &u1_skip_tu_posx;
4304             pu1_tu_posy = &u1_skip_tu_posy;
4305         }
4306         else
4307         {
4308             /* currently TU= CU/2 is evaluated for all inter case */
4309             num_tu_in_cu = 4;
4310             pu1_tu_sz_sft = &gau1_inter_tu_shft_amt[0];
4311             pu1_tu_posx = &gau1_inter_tu_posx_scl_amt[0];
4312             pu1_tu_posy = &gau1_inter_tu_posy_scl_amt[0];
4313         }
4314     }
4315     else
4316     {
4317         /* for AMP cases one level of TU recurssion is done */
4318         /* based on oreintation of the partitions           */
4319         num_tu_in_cu = 10;
4320         pu1_tu_sz_sft = &gau1_inter_tu_shft_amt_amp[ps_final_prms->u1_part_mode - 4][0];
4321         pu1_tu_posx = &gau1_inter_tu_posx_scl_amt_amp[ps_final_prms->u1_part_mode - 4][0];
4322         pu1_tu_posy = &gau1_inter_tu_posy_scl_amt_amp[ps_final_prms->u1_part_mode - 4][0];
4323     }
4324 
4325     ps_tu_prms = &s_tu_prms[0];
4326     num_tu_in_cu = 0;
4327 
4328     for(i = 0; i < num_split_flags; i++)
4329     {
4330         WORD32 i4_x_off = 0, i4_y_off = 0;
4331 
4332         if(i == 1 || i == 3)
4333         {
4334             i4_x_off = 32;
4335         }
4336 
4337         if(i == 2 || i == 3)
4338         {
4339             i4_y_off = 32;
4340         }
4341 
4342         if(1 == ps_final_prms->u1_skip_flag)
4343         {
4344             ai4_tu_split_flags[0] = 0;
4345             ps_inter_cand->ai4_tu_split_flag[i] = 0;
4346 
4347             ai4_tu_early_cbf[0] = 0;
4348         }
4349         else
4350         {
4351             ai4_tu_split_flags[0] = ps_inter_cand->ai4_tu_split_flag[i];
4352             ai4_tu_early_cbf[0] = ps_inter_cand->ai4_tu_early_cbf[i];
4353         }
4354 
4355         ps_tu_prms->u1_tu_size = u1_tu_size;
4356 
4357         ps_tu_prms = (tu_prms_t *)ihevce_tu_tree_update(
4358             ps_tu_prms,
4359             &num_tu_in_cu,
4360             0,
4361             ai4_tu_split_flags[0],
4362             ai4_tu_early_cbf[0],
4363             i4_x_off,
4364             i4_y_off);
4365     }
4366 
4367     /* loop for all tu blocks in current cu */
4368     ps_tu_prms = &s_tu_prms[0];
4369     for(ctr = 0; ctr < num_tu_in_cu; ctr++)
4370     {
4371         trans_size = ps_tu_prms->u1_tu_size;
4372 
4373         if(i4_min_trans_size > trans_size)
4374         {
4375             i4_min_trans_size = trans_size;
4376         }
4377         ps_tu_prms++;
4378     }
4379 
4380     if(ps_ctxt->i1_cu_qp_delta_enable)
4381     {
4382         ihevce_update_cu_level_qp_lamda(ps_ctxt, ps_cu_analyse, i4_min_trans_size, 0);
4383     }
4384 
4385     if(u1_is_cu_noisy && !ps_ctxt->u1_enable_psyRDOPT)
4386     {
4387         ps_ctxt->i8_cl_ssd_lambda_qf =
4388             ((float)ps_ctxt->i8_cl_ssd_lambda_qf * (100.0f - RDOPT_LAMBDA_DISCOUNT_WHEN_NOISY) /
4389              100.0f);
4390         ps_ctxt->i8_cl_ssd_lambda_chroma_qf =
4391             ((float)ps_ctxt->i8_cl_ssd_lambda_chroma_qf *
4392              (100.0f - RDOPT_LAMBDA_DISCOUNT_WHEN_NOISY) / 100.0f);
4393     }
4394 
4395     u1_compute_spatial_ssd = (ps_ctxt->i4_cu_qp <= MAX_QP_WHERE_SPATIAL_SSD_ENABLED) &&
4396                              (ps_ctxt->i4_quality_preset < IHEVCE_QUALITY_P3) &&
4397                              CONVERT_SSDS_TO_SPATIAL_DOMAIN;
4398 
4399     if(u1_is_cu_noisy || ps_ctxt->u1_enable_psyRDOPT)
4400     {
4401         u1_compute_spatial_ssd = (ps_ctxt->i4_cu_qp <= MAX_HEVC_QP) &&
4402                                  CONVERT_SSDS_TO_SPATIAL_DOMAIN;
4403     }
4404 
4405     if(!u1_compute_spatial_ssd)
4406     {
4407         ps_final_prms->s_recon_datastore.u1_is_lumaRecon_available = 0;
4408         ps_final_prms->s_recon_datastore.au1_is_chromaRecon_available[0] = 0;
4409     }
4410     else
4411     {
4412         ps_final_prms->s_recon_datastore.u1_is_lumaRecon_available = 1;
4413     }
4414 
4415     ps_tu_prms = &s_tu_prms[0];
4416 
4417     ASSERT(num_tu_in_cu <= 256);
4418 
4419     /* RDOPT copy States :  TU init (best until prev TU) to current */
4420     memcpy(
4421         &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[curr_buf_idx]
4422              .s_cabac_ctxt.au1_ctxt_models[0],
4423         &ps_ctxt->au1_rdopt_init_ctxt_models[0],
4424         IHEVC_CAB_COEFFX_PREFIX);
4425 
4426     for(ctr = 0; ctr < num_tu_in_cu; ctr++)
4427     {
4428         WORD32 curr_bytes;
4429         WORD32 tx_size;
4430         WORD32 cbf, zero_col, zero_row;
4431         LWORD64 rdopt_cost;
4432         UWORD8 u1_is_recon_available;
4433 
4434         WORD32 curr_pos_x;
4435         WORD32 curr_pos_y;
4436         nbr_4x4_t *ps_cur_nbr_4x4;
4437         UWORD8 *pu1_cur_pred;
4438         UWORD8 *pu1_cur_src;
4439         UWORD8 *pu1_cur_recon;
4440         WORD16 *pi2_cur_deq_data;
4441         UWORD32 u4_tu_sad;
4442         WORD32 tu_bits;
4443 
4444         WORD32 i4_recon_stride = ps_final_prms->s_recon_datastore.i4_lumaRecon_stride;
4445 
4446         trans_size = ps_tu_prms->u1_tu_size;
4447         /* get the current pos x and pos y in pixels */
4448         curr_pos_x = ps_tu_prms->u1_x_off;  //((cu_size >> 2) * pu1_tu_posx[ctr]);
4449         curr_pos_y = ps_tu_prms->u1_y_off;  //((cu_size >> 2) * pu1_tu_posy[ctr]);
4450 
4451         num_4x4_in_tu = trans_size >> 2;
4452 
4453 #if FORCE_8x8_TFR
4454         if(cu_size == 64)
4455         {
4456             curr_pos_x = ((cu_size >> 3) * pu1_tu_posx[ctr]);
4457             curr_pos_y = ((cu_size >> 3) * pu1_tu_posy[ctr]);
4458         }
4459 #endif
4460 
4461         /* increment the pointers to start of current TU  */
4462         pu1_cur_src = ((UWORD8 *)pv_src + curr_pos_x);
4463         pu1_cur_src += (curr_pos_y * src_strd);
4464         pu1_cur_pred = (pu1_pred + curr_pos_x);
4465         pu1_cur_pred += (curr_pos_y * pred_stride);
4466         pi2_cur_deq_data = pi2_deq_data + curr_pos_x;
4467         pi2_cur_deq_data += (curr_pos_y * cu_size);
4468         pu1_cur_recon = ((UWORD8 *)ps_final_prms->s_recon_datastore.apv_luma_recon_bufs[0]) +
4469                         curr_pos_x + curr_pos_y * i4_recon_stride;
4470 
4471         ps_cur_nbr_4x4 = (ps_nbr_4x4 + (curr_pos_x >> 2));
4472         ps_cur_nbr_4x4 += ((curr_pos_y >> 2) * num_4x4_in_cu);
4473 
4474         /* RDOPT copy States :  TU init (best until prev TU) to current */
4475         COPY_CABAC_STATES_FRM_CAB_COEFFX_PREFIX(
4476             &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[curr_buf_idx]
4477                     .s_cabac_ctxt.au1_ctxt_models[0] +
4478                 IHEVC_CAB_COEFFX_PREFIX,
4479             &ps_ctxt->au1_rdopt_init_ctxt_models[0] + IHEVC_CAB_COEFFX_PREFIX,
4480             IHEVC_CAB_CTXT_END - IHEVC_CAB_COEFFX_PREFIX);
4481 
4482         i4_perform_rdoq = ps_ctxt->s_rdoq_sbh_ctxt.i4_perform_all_cand_rdoq;
4483         i4_perform_sbh = ps_ctxt->s_rdoq_sbh_ctxt.i4_perform_all_cand_sbh;
4484 
4485         /*2 Multi- dimensinal array based on trans size  of rounding factor to be added here */
4486         /* arrays are for rounding factor corr. to 0-1 decision and 1-2 decision */
4487         /* Currently the complete array will contain only single value*/
4488         /*The rounding factor is calculated with the formula
4489         Deadzone val = (((R1 - R0) * (2^(-8/3)) * lamMod) + 1)/2
4490         rounding factor = (1 - DeadZone Val)
4491 
4492         Assumption: Cabac states of All the sub-blocks in the TU are considered independent
4493         */
4494         if((ps_ctxt->i4_quant_rounding_level == TU_LEVEL_QUANT_ROUNDING) && (ctr != 0))
4495         {
4496             double i4_lamda_modifier;
4497 
4498             if((BSLICE == ps_ctxt->i1_slice_type) && (ps_ctxt->i4_temporal_layer_id))
4499             {
4500                 i4_lamda_modifier = ps_ctxt->i4_lamda_modifier *
4501                                     CLIP3((((double)(ps_ctxt->i4_cu_qp - 12)) / 6.0), 2.00, 4.00);
4502             }
4503             else
4504             {
4505                 i4_lamda_modifier = ps_ctxt->i4_lamda_modifier;
4506             }
4507             if(ps_ctxt->i4_use_const_lamda_modifier)
4508             {
4509                 if(ISLICE == ps_ctxt->i1_slice_type)
4510                 {
4511                     i4_lamda_modifier = ps_ctxt->f_i_pic_lamda_modifier;
4512                 }
4513                 else
4514                 {
4515                     i4_lamda_modifier = CONST_LAMDA_MOD_VAL;
4516                 }
4517             }
4518             ps_ctxt->pi4_quant_round_factor_tu_0_1[trans_size >> 3] =
4519                 &ps_ctxt->i4_quant_round_tu[0][0];
4520             ps_ctxt->pi4_quant_round_factor_tu_1_2[trans_size >> 3] =
4521                 &ps_ctxt->i4_quant_round_tu[1][0];
4522 
4523             memset(
4524                 ps_ctxt->pi4_quant_round_factor_tu_0_1[trans_size >> 3],
4525                 0,
4526                 trans_size * trans_size * sizeof(WORD32));
4527             memset(
4528                 ps_ctxt->pi4_quant_round_factor_tu_1_2[trans_size >> 3],
4529                 0,
4530                 trans_size * trans_size * sizeof(WORD32));
4531 
4532             ihevce_quant_rounding_factor_gen(
4533                 trans_size,
4534                 1,
4535                 &ps_ctxt->s_rdopt_entropy_ctxt,
4536                 ps_ctxt->pi4_quant_round_factor_tu_0_1[trans_size >> 3],
4537                 ps_ctxt->pi4_quant_round_factor_tu_1_2[trans_size >> 3],
4538                 i4_lamda_modifier,
4539                 1);
4540         }
4541         else
4542         {
4543             ps_ctxt->pi4_quant_round_factor_tu_0_1[trans_size >> 3] =
4544                 ps_ctxt->pi4_quant_round_factor_cu_ctb_0_1[trans_size >> 3];
4545             ps_ctxt->pi4_quant_round_factor_tu_1_2[trans_size >> 3] =
4546                 ps_ctxt->pi4_quant_round_factor_cu_ctb_1_2[trans_size >> 3];
4547         }
4548 
4549         /* call T Q IT IQ and recon function */
4550         cbf = ihevce_t_q_iq_ssd_scan_fxn(
4551             ps_ctxt,
4552             pu1_cur_pred,
4553             pred_stride,
4554             pu1_cur_src,
4555             src_strd,
4556             pi2_cur_deq_data,
4557             cu_size,
4558             pu1_cur_recon,
4559             i4_recon_stride,
4560             pu1_ecd_data,
4561             pu1_csbf_buf,
4562             csbf_strd,
4563             trans_size,
4564             recon_func_mode,
4565             &rdopt_cost,
4566             &curr_bytes,
4567             &tu_bits,
4568             &u4_tu_sad,
4569             &zero_col,
4570             &zero_row,
4571             &u1_is_recon_available,
4572             i4_perform_rdoq,
4573             i4_perform_sbh,
4574 #if USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
4575             i4_alpha_stim_multiplier,
4576             u1_is_cu_noisy,
4577 #endif
4578             u1_compute_spatial_ssd ? SPATIAL_DOMAIN_SSD : FREQUENCY_DOMAIN_SSD,
4579             ps_ctxt->u1_use_early_cbf_data ? ps_tu_prms->i4_early_cbf : 1);
4580 
4581 #if COMPUTE_NOISE_TERM_AT_THE_TU_LEVEL && !USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
4582         if(u1_is_cu_noisy && i4_alpha_stim_multiplier)
4583         {
4584 #if !USE_RECON_TO_EVALUATE_STIM_IN_RDOPT
4585             rdopt_cost = ihevce_inject_stim_into_distortion(
4586                 pu1_cur_src,
4587                 src_strd,
4588                 pu1_cur_pred,
4589                 pred_stride,
4590                 rdopt_cost,
4591                 i4_alpha_stim_multiplier,
4592                 trans_size,
4593                 0,
4594                 ps_ctxt->u1_enable_psyRDOPT,
4595                 NULL_PLANE);
4596 #else
4597             if(u1_compute_spatial_ssd && u1_is_recon_available)
4598             {
4599                 rdopt_cost = ihevce_inject_stim_into_distortion(
4600                     pu1_cur_src,
4601                     src_strd,
4602                     pu1_cur_recon,
4603                     i4_recon_stride,
4604                     rdopt_cost,
4605                     i4_alpha_stim_multiplier,
4606                     trans_size,
4607                     0,
4608                     NULL_PLANE);
4609             }
4610             else
4611             {
4612                 rdopt_cost = ihevce_inject_stim_into_distortion(
4613                     pu1_cur_src,
4614                     src_strd,
4615                     pu1_cur_pred,
4616                     pred_stride,
4617                     rdopt_cost,
4618                     i4_alpha_stim_multiplier,
4619                     trans_size,
4620                     0,
4621                     ps_ctxt->u1_enable_psyRDOPT,
4622                     NULL_PLANE);
4623             }
4624 #endif
4625         }
4626 #endif
4627 
4628         if(u1_compute_spatial_ssd && u1_is_recon_available)
4629         {
4630             ps_final_prms->s_recon_datastore.au1_bufId_with_winning_LumaRecon[ctr] = 0;
4631         }
4632         else
4633         {
4634             ps_final_prms->s_recon_datastore.au1_bufId_with_winning_LumaRecon[ctr] = UCHAR_MAX;
4635         }
4636 
4637         /* accumulate the TU sad into cu sad */
4638         ps_final_prms->u4_cu_sad += u4_tu_sad;
4639 
4640         /* accumulate the TU bits into cu bits */
4641         cu_bits += tu_bits;
4642 
4643         /* inter cu is coded if any of the tu is coded in it */
4644         ps_final_prms->u1_is_cu_coded |= cbf;
4645 
4646         /* call the entropy function to get the bits */
4647         /* add that to rd opt cost(SSD)              */
4648 
4649         /* update the bytes */
4650         ps_final_prms->as_tu_enc_loop[ctr].i4_luma_coeff_offset = ecd_data_bytes_cons;
4651         ps_final_prms->as_tu_enc_loop_temp_prms[ctr].i2_luma_bytes_consumed = curr_bytes;
4652         /* update the zero_row and col info for the final mode */
4653         ps_final_prms->as_tu_enc_loop_temp_prms[ctr].u4_luma_zero_col = zero_col;
4654         ps_final_prms->as_tu_enc_loop_temp_prms[ctr].u4_luma_zero_row = zero_row;
4655 
4656         /* update the bytes */
4657         ps_final_prms->as_tu_enc_loop[ctr].i4_luma_coeff_offset = ecd_data_bytes_cons;
4658 
4659         /* update the total bytes cons */
4660         ecd_data_bytes_cons += curr_bytes;
4661         pu1_ecd_data += curr_bytes;
4662 
4663         /* RDOPT copy States :  New updated after curr TU to TU init */
4664         if(0 != cbf)
4665         {
4666             /* update to new state only if CBF is non zero */
4667             COPY_CABAC_STATES_FRM_CAB_COEFFX_PREFIX(
4668                 &ps_ctxt->au1_rdopt_init_ctxt_models[0] + IHEVC_CAB_COEFFX_PREFIX,
4669                 &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[curr_buf_idx]
4670                         .s_cabac_ctxt.au1_ctxt_models[0] +
4671                     IHEVC_CAB_COEFFX_PREFIX,
4672                 IHEVC_CAB_CTXT_END - IHEVC_CAB_COEFFX_PREFIX);
4673         }
4674 
4675         /* by default chroma present is set to 1*/
4676         chrm_present_flag = 1;
4677         if(4 == trans_size)
4678         {
4679             /* if tusize is 4x4 then only first luma 4x4 will have chroma*/
4680             if(0 != chrm_ctr)
4681             {
4682                 chrm_present_flag = INTRA_PRED_CHROMA_IDX_NONE;
4683             }
4684 
4685             /* increment the chrm ctr unconditionally */
4686             chrm_ctr++;
4687 
4688             /* after ctr reached 4 reset it */
4689             if(4 == chrm_ctr)
4690             {
4691                 chrm_ctr = 0;
4692             }
4693         }
4694 
4695         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_y_cbf = cbf;
4696         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_cb_cbf = 0;
4697         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_cr_cbf = 0;
4698         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_cb_cbf_subtu1 = 0;
4699         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_cr_cbf_subtu1 = 0;
4700         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b3_chroma_intra_mode_idx = chrm_present_flag;
4701         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b7_qp = ps_ctxt->i4_cu_qp;
4702         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_first_tu_in_cu = 0;
4703         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_transquant_bypass = 0;
4704         GETRANGE(tx_size, trans_size);
4705         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b3_size = tx_size - 3;
4706         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b4_pos_x = cu_pos_x + (curr_pos_x >> 2);
4707         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b4_pos_y = cu_pos_y + (curr_pos_y >> 2);
4708 
4709         /* replicate the nbr 4x4 structure for all 4x4 blocks of the current TU */
4710         ps_cur_nbr_4x4->b1_y_cbf = cbf;
4711         /*copy the cu qp. This will be overwritten by qp calculated based on skip flag at final stage of cu mode decide*/
4712         ps_cur_nbr_4x4->b8_qp = ps_ctxt->i4_cu_qp;
4713 
4714         /* Qp and cbf are stored for the all 4x4 in TU */
4715         {
4716             WORD32 i, j;
4717             nbr_4x4_t *ps_tmp_4x4;
4718             ps_tmp_4x4 = ps_cur_nbr_4x4;
4719 
4720             for(i = 0; i < num_4x4_in_tu; i++)
4721             {
4722                 for(j = 0; j < num_4x4_in_tu; j++)
4723                 {
4724                     ps_tmp_4x4[j].b8_qp = ps_ctxt->i4_cu_qp;
4725                     ps_tmp_4x4[j].b1_y_cbf = cbf;
4726                 }
4727                 /* row level update*/
4728                 ps_tmp_4x4 += num_4x4_in_cu;
4729             }
4730         }
4731 
4732 #if RDOPT_ENABLE
4733         /* compute the rdopt cost */
4734         rdopt_cost +=
4735             COMPUTE_RATE_COST_CLIP30(tu_bits, ps_ctxt->i8_cl_ssd_lambda_qf, LAMBDA_Q_SHIFT);
4736 #endif
4737         /* accumulate the costs */
4738         total_rdopt_cost += rdopt_cost;
4739 
4740         ps_tu_prms++;
4741 
4742         if(ps_ctxt->i4_bitrate_instance_num || ps_ctxt->i4_num_bitrates == 1)
4743         {
4744             /* Early exit : If the current running cost exceeds
4745             the prev. best mode cost, break */
4746             if(total_rdopt_cost > prev_best_rdopt_cost)
4747             {
4748                 return (total_rdopt_cost);
4749             }
4750         }
4751     }
4752 
4753     /* Modify the cost function for this CU. */
4754     /* loop in for 8x8 blocks */
4755     if(ps_ctxt->u1_enable_psyRDOPT)
4756     {
4757         UWORD8 *pu1_recon_cu;
4758         WORD32 recon_stride;
4759         WORD32 curr_pos_x;
4760         WORD32 curr_pos_y;
4761         WORD32 start_index;
4762         WORD32 num_horz_cu_in_ctb;
4763         WORD32 had_block_size;
4764 
4765         /* tODO: sreenivasa ctb size has to be used appropriately */
4766         had_block_size = 8;
4767         num_horz_cu_in_ctb = 64 / had_block_size;
4768 
4769         curr_pos_x = cu_pos_x << 2; /* pel units */
4770         curr_pos_y = cu_pos_y << 2; /* pel units */
4771         recon_stride = ps_final_prms->s_recon_datastore.i4_lumaRecon_stride;
4772         pu1_recon_cu = ((UWORD8 *)ps_final_prms->s_recon_datastore
4773                             .apv_luma_recon_bufs[0]);  // already pointing to the current CU recon
4774         //+ \curr_pos_x + curr_pos_y * recon_stride;
4775 
4776         /* start index to index the source satd of curr cu in the current ctb */
4777         start_index =
4778             (curr_pos_x / had_block_size) + (curr_pos_y / had_block_size) * num_horz_cu_in_ctb;
4779 
4780         {
4781             total_rdopt_cost += ihevce_psy_rd_cost(
4782                 ps_ctxt->ai4_source_satd_8x8,
4783                 pu1_recon_cu,
4784                 recon_stride,
4785                 1,  //howz stride
4786                 cu_size,
4787                 0,  // pic type
4788                 0,  //layer id
4789                 ps_ctxt->i4_satd_lamda,  // lambda
4790                 start_index,
4791                 ps_ctxt->u1_is_input_data_hbd,
4792                 ps_ctxt->u4_psy_strength,
4793                 &ps_ctxt->s_cmn_opt_func);  // 8 bit
4794         }
4795     }
4796 
4797     /* store the num TUs*/
4798     ps_final_prms->u2_num_tus_in_cu = num_tu_in_cu;
4799 
4800     /* update the bytes consumed */
4801     ps_final_prms->i4_num_bytes_ecd_data = ecd_data_bytes_cons;
4802 
4803     /* store the current cu size to final prms */
4804     ps_final_prms->u1_cu_size = cu_size;
4805 
4806     /* cu bits will be having luma residual bits till this point    */
4807     /* if zero_cbf eval is disabled then cu bits will be zero       */
4808     ps_final_prms->u4_cu_luma_res_bits = cu_bits;
4809 
4810     /* ------------- Chroma processing -------------- */
4811     /* Chroma rdopt eval for each luma candidate only for HIGH QUALITY/MEDIUM SPEED preset */
4812     if(1 == ps_ctxt->s_chroma_rdopt_ctxt.u1_eval_chrm_rdopt)
4813     {
4814         LWORD64 chrm_rdopt_cost;
4815         WORD32 chrm_rdopt_tu_bits;
4816 
4817         /* Store the current RDOPT cost to enable early exit in chrom_prcs */
4818         ps_ctxt->as_cu_prms[curr_buf_idx].i8_curr_rdopt_cost = total_rdopt_cost;
4819 
4820         chrm_rdopt_cost = ihevce_chroma_cu_prcs_rdopt(
4821             ps_ctxt,
4822             curr_buf_idx,
4823             0, /* TU mode : Don't care in Inter patrh */
4824             ps_chrm_cu_buf_prms->pu1_curr_src,
4825             ps_chrm_cu_buf_prms->i4_chrm_src_stride,
4826             ps_chrm_cu_buf_prms->pu1_cu_left,
4827             ps_chrm_cu_buf_prms->pu1_cu_top,
4828             ps_chrm_cu_buf_prms->pu1_cu_top_left,
4829             ps_chrm_cu_buf_prms->i4_cu_left_stride,
4830             (cu_pos_x >> 1),
4831             (cu_pos_y >> 1),
4832             &chrm_rdopt_tu_bits,
4833             i4_alpha_stim_multiplier,
4834             u1_is_cu_noisy);
4835 
4836 #if WEIGH_CHROMA_COST
4837         chrm_rdopt_cost = (LWORD64)(
4838             (chrm_rdopt_cost * ps_ctxt->u4_chroma_cost_weighing_factor +
4839              (1 << (CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT - 1))) >>
4840             CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT);
4841 #endif
4842 
4843 #if CHROMA_RDOPT_ENABLE
4844         total_rdopt_cost += chrm_rdopt_cost;
4845 #endif
4846         cu_bits += chrm_rdopt_tu_bits;
4847 
4848         /* during chroma evaluation if skip decision was over written     */
4849         /* then the current skip candidate is set to a non skip candidate */
4850         ps_inter_cand->b1_skip_flag = ps_final_prms->u1_skip_flag;
4851 
4852         /* cu bits for chroma residual if chroma rdopt is on       */
4853         /* if zero_cbf eval is disabled then cu bits will be zero  */
4854         ps_final_prms->u4_cu_chroma_res_bits = chrm_rdopt_tu_bits;
4855 
4856         if(ps_ctxt->i4_bitrate_instance_num || ps_ctxt->i4_num_bitrates == 1)
4857         {
4858             /* Early exit : If the current running cost exceeds
4859             the prev. best mode cost, break */
4860             if(total_rdopt_cost > prev_best_rdopt_cost)
4861             {
4862                 return (total_rdopt_cost);
4863             }
4864         }
4865     }
4866     else
4867     {}
4868 
4869 #if SHRINK_INTER_TUTREE
4870     /* ------------- Quadtree TU split  optimization ------------  */
4871     if(ps_final_prms->u1_is_cu_coded)
4872     {
4873         ps_final_prms->u2_num_tus_in_cu = ihevce_shrink_inter_tu_tree(
4874             &ps_final_prms->as_tu_enc_loop[0],
4875             &ps_final_prms->as_tu_enc_loop_temp_prms[0],
4876             &ps_final_prms->s_recon_datastore,
4877             num_tu_in_cu,
4878             (ps_ctxt->u1_chroma_array_type == 2));
4879     }
4880 #endif
4881 
4882     /* RDOPT copy States :  Best after all luma TUs (and chroma,if enabled)to current */
4883     COPY_CABAC_STATES_FRM_CAB_COEFFX_PREFIX(
4884         &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[curr_buf_idx]
4885                 .s_cabac_ctxt.au1_ctxt_models[0] +
4886             IHEVC_CAB_COEFFX_PREFIX,
4887         &ps_ctxt->au1_rdopt_init_ctxt_models[0] + IHEVC_CAB_COEFFX_PREFIX,
4888         IHEVC_CAB_CTXT_END - IHEVC_CAB_COEFFX_PREFIX);
4889 
4890     /* -------- Bit estimate for RD opt -------------- */
4891     {
4892         nbr_avail_flags_t s_nbr;
4893         /*cbf_bits will account for both texture and cbf bits when zero cbf eval flag is 0*/
4894         WORD32 cbf_bits, header_bits;
4895 
4896         /* get the neighbour availability flags for current cu  */
4897         ihevce_get_only_nbr_flag(
4898             &s_nbr,
4899             ps_ctxt->pu1_ctb_nbr_map,
4900             ps_ctxt->i4_nbr_map_strd,
4901             cu_pos_x,
4902             cu_pos_y,
4903             (cu_size >> 2),
4904             (cu_size >> 2));
4905 
4906         /* call the entropy rdo encode to get the bit estimate for current cu */
4907         header_bits = ihevce_entropy_rdo_encode_cu(
4908             &ps_ctxt->s_rdopt_entropy_ctxt,
4909             ps_final_prms,
4910             (cu_pos_x >> 1), /*  back to 8x8 pel units   */
4911             (cu_pos_y >> 1), /*  back to 8x8 pel units   */
4912             cu_size,
4913             ps_ctxt->u1_disable_intra_eval ? !DISABLE_TOP_SYNC && s_nbr.u1_top_avail
4914                                            : s_nbr.u1_top_avail,
4915             s_nbr.u1_left_avail,
4916             &ps_final_prms->pu1_cu_coeffs[0],
4917             &cbf_bits);
4918 
4919         cu_bits += header_bits;
4920 
4921         /* cbf bits are excluded from header bits, instead considered as texture bits */
4922         /* incase if zero cbf eval is disabled then texture bits gets added here */
4923         ps_final_prms->u4_cu_hdr_bits = (header_bits - cbf_bits);
4924         ps_final_prms->u4_cu_cbf_bits = cbf_bits;
4925 
4926 #if RDOPT_ENABLE
4927         /* add the cost of coding the header bits */
4928         total_rdopt_cost +=
4929             COMPUTE_RATE_COST_CLIP30(header_bits, ps_ctxt->i8_cl_ssd_lambda_qf, LAMBDA_Q_SHIFT);
4930 
4931 #if ENABLE_INTER_ZCU_COST
4932         /* If cu is coded, Evaluate not coded cost and check if it improves over coded cost */
4933         if(ps_final_prms->u1_is_cu_coded && (ZCBF_ENABLE == ps_ctxt->i4_zcbf_rdo_level))
4934         {
4935             LWORD64 i8_cu_not_coded_cost = ps_ctxt->i8_cu_not_coded_cost;
4936 
4937             WORD32 is_2nx2n_mergecu = (SIZE_2Nx2N == ps_final_prms->u1_part_mode) &&
4938                                       (1 == ps_final_prms->as_pu_enc_loop[0].b1_merge_flag);
4939 
4940             cab_ctxt_t *ps_cab_ctxt =
4941                 &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[curr_buf_idx].s_cabac_ctxt;
4942 
4943             /* Read header bits generated after ihevce_entropy_rdo_encode_cu() call  */
4944             UWORD32 u4_cu_hdr_bits_q12 = ps_cab_ctxt->u4_header_bits_estimated_q12;
4945 
4946             /* account for coding qt_root_cbf = 0 */
4947             /* First subtract cost for coding as 1 (part of header bits) and then add cost for coding as 0 */
4948             u4_cu_hdr_bits_q12 += gau2_ihevce_cabac_bin_to_bits[u1_qtroot_cbf_cabac_model ^ 0];
4949             if(u4_cu_hdr_bits_q12 < gau2_ihevce_cabac_bin_to_bits[u1_qtroot_cbf_cabac_model ^ 1])
4950                 u4_cu_hdr_bits_q12 = 0;
4951             else
4952                 u4_cu_hdr_bits_q12 -= gau2_ihevce_cabac_bin_to_bits[u1_qtroot_cbf_cabac_model ^ 1];
4953 
4954             /* add the cost of coding the header bits */
4955             i8_cu_not_coded_cost += COMPUTE_RATE_COST_CLIP30(
4956                 u4_cu_hdr_bits_q12 /* ps_final_prms->u4_cu_hdr_bits */,
4957                 ps_ctxt->i8_cl_ssd_lambda_qf,
4958                 (LAMBDA_Q_SHIFT + CABAC_FRAC_BITS_Q));
4959 
4960             if(ps_ctxt->u1_enable_psyRDOPT)
4961             {
4962                 i8_cu_not_coded_cost = total_rdopt_cost + 1;
4963             }
4964 
4965             /* Evaluate qtroot cbf rdo; exclude 2Nx2N Merge as skip cu is explicitly evaluated */
4966             if((i8_cu_not_coded_cost <= total_rdopt_cost) && (!is_2nx2n_mergecu))
4967             {
4968                 WORD32 tx_size;
4969 
4970                 /* force cu as not coded and update the cost */
4971                 ps_final_prms->u1_is_cu_coded = 0;
4972                 ps_final_prms->s_recon_datastore.au1_is_chromaRecon_available[0] = 0;
4973                 ps_final_prms->s_recon_datastore.u1_is_lumaRecon_available = 0;
4974 
4975                 total_rdopt_cost = i8_cu_not_coded_cost;
4976 
4977                 /* reset num TUs to 1 unless cu size is 64 (then 4 TUs of 32x32) */
4978                 ps_final_prms->u2_num_tus_in_cu = (64 == cu_size) ? 4 : 1;
4979                 trans_size = (64 == cu_size) ? 32 : cu_size;
4980                 GETRANGE(tx_size, trans_size);
4981 
4982                 /* reset the bytes consumed */
4983                 ps_final_prms->i4_num_bytes_ecd_data = 0;
4984 
4985                 /* reset texture related bits and roll back header bits*/
4986                 ps_final_prms->u4_cu_cbf_bits = 0;
4987                 ps_final_prms->u4_cu_luma_res_bits = 0;
4988                 ps_final_prms->u4_cu_chroma_res_bits = 0;
4989                 ps_final_prms->u4_cu_hdr_bits =
4990                     (u4_cu_hdr_bits_q12 + (1 << (CABAC_FRAC_BITS_Q - 1))) >> CABAC_FRAC_BITS_Q;
4991 
4992                 /* update cabac model with qtroot cbf = 0 decision */
4993                 ps_cab_ctxt->au1_ctxt_models[IHEVC_CAB_NORES_IDX] =
4994                     gau1_ihevc_next_state[u1_qtroot_cbf_cabac_model << 1];
4995 
4996                 /* restore untouched cabac models for, tusplit, cbfs, texture etc */
4997                 memcpy(
4998                     &ps_cab_ctxt->au1_ctxt_models[IHEVC_CAB_SPLIT_TFM],
4999                     &au1_rdopt_init_ctxt_models[IHEVC_CAB_SPLIT_TFM],
5000                     (IHEVC_CAB_CTXT_END - IHEVC_CAB_SPLIT_TFM));
5001 
5002                 /* mark all tus as not coded for final eval */
5003                 for(ctr = 0; ctr < ps_final_prms->u2_num_tus_in_cu; ctr++)
5004                 {
5005                     WORD32 curr_pos_x = (ctr & 0x1) ? (trans_size >> 2) : 0;
5006                     WORD32 curr_pos_y = (ctr & 0x2) ? (trans_size >> 2) : 0;
5007 
5008                     nbr_4x4_t *ps_cur_nbr_4x4 =
5009                         ps_nbr_4x4 + curr_pos_x + (curr_pos_y * num_4x4_in_cu);
5010 
5011                     num_4x4_in_tu = trans_size >> 2;
5012 
5013                     ps_final_prms->as_tu_enc_loop_temp_prms[ctr].i2_luma_bytes_consumed = 0;
5014                     ps_final_prms->as_tu_enc_loop_temp_prms[ctr].ai2_cb_bytes_consumed[0] = 0;
5015                     ps_final_prms->as_tu_enc_loop_temp_prms[ctr].ai2_cr_bytes_consumed[0] = 0;
5016 
5017                     ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_y_cbf = 0;
5018                     ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_cr_cbf = 0;
5019                     ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_cb_cbf = 0;
5020 
5021                     ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_cr_cbf_subtu1 = 0;
5022                     ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_cb_cbf_subtu1 = 0;
5023 
5024                     ps_final_prms->as_tu_enc_loop[ctr].s_tu.b3_size = tx_size - 3;
5025                     ps_final_prms->as_tu_enc_loop[ctr].s_tu.b4_pos_x = cu_pos_x + curr_pos_x;
5026                     ps_final_prms->as_tu_enc_loop[ctr].s_tu.b4_pos_y = cu_pos_y + curr_pos_y;
5027 
5028                     /* reset cbf for the all 4x4 in TU */
5029                     {
5030                         WORD32 i, j;
5031                         nbr_4x4_t *ps_tmp_4x4;
5032                         ps_tmp_4x4 = ps_cur_nbr_4x4;
5033 
5034                         for(i = 0; i < num_4x4_in_tu; i++)
5035                         {
5036                             for(j = 0; j < num_4x4_in_tu; j++)
5037                             {
5038                                 ps_tmp_4x4[j].b1_y_cbf = 0;
5039                             }
5040                             /* row level update*/
5041                             ps_tmp_4x4 += num_4x4_in_cu;
5042                         }
5043                     }
5044                 }
5045             }
5046         }
5047 #endif /* ENABLE_INTER_ZCU_COST */
5048 
5049 #endif /* RDOPT_ENABLE */
5050     }
5051 
5052     return (total_rdopt_cost);
5053 }
5054 
5055 #if ENABLE_RDO_BASED_TU_RECURSION
5056 LWORD64 ihevce_inter_tu_tree_selector_and_rdopt_cost_computer(
5057     ihevce_enc_loop_ctxt_t *ps_ctxt,
5058     enc_loop_cu_prms_t *ps_cu_prms,
5059     void *pv_src,
5060     WORD32 cu_size,
5061     WORD32 cu_pos_x,
5062     WORD32 cu_pos_y,
5063     WORD32 curr_buf_idx,
5064     enc_loop_chrm_cu_buf_prms_t *ps_chrm_cu_buf_prms,
5065     cu_inter_cand_t *ps_inter_cand,
5066     cu_analyse_t *ps_cu_analyse,
5067     WORD32 i4_alpha_stim_multiplier)
5068 {
5069     tu_tree_node_t as_tu_nodes[256 + 64 + 16 + 4 + 1];
5070     buffer_data_for_tu_t s_buffer_data_for_tu;
5071     enc_loop_cu_final_prms_t *ps_final_prms;
5072     nbr_4x4_t *ps_nbr_4x4;
5073 
5074     WORD32 num_split_flags = 1;
5075     UWORD8 u1_tu_size;
5076     UWORD8 *pu1_pred;
5077     UWORD8 *pu1_ecd_data;
5078     WORD16 *pi2_deq_data;
5079     UWORD8 *pu1_csbf_buf;
5080     UWORD8 *pu1_tu_sz_sft;
5081     UWORD8 *pu1_tu_posx;
5082     UWORD8 *pu1_tu_posy;
5083     LWORD64 total_rdopt_cost;
5084     WORD32 ctr;
5085     WORD32 chrm_ctr;
5086     WORD32 pred_stride;
5087     WORD32 recon_stride;
5088     WORD32 trans_size = ps_cu_analyse->u1_cu_size;
5089     WORD32 csbf_strd;
5090     WORD32 ecd_data_bytes_cons;
5091     WORD32 num_4x4_in_cu;
5092     WORD32 num_4x4_in_tu;
5093     WORD32 recon_func_mode;
5094     WORD32 cu_bits;
5095     UWORD8 u1_compute_spatial_ssd;
5096     /* backup copy of cabac states for restoration if zero cu residue rdo wins later */
5097     UWORD8 au1_rdopt_init_ctxt_models[IHEVC_CAB_CTXT_END];
5098 
5099     WORD32 i4_min_trans_size = 256;
5100     LWORD64 prev_best_rdopt_cost = ps_ctxt->as_cu_prms[!curr_buf_idx].i8_best_rdopt_cost;
5101     WORD32 src_strd = ps_cu_prms->i4_luma_src_stride;
5102     /* model for no residue syntax qt root cbf flag */
5103     UWORD8 u1_qtroot_cbf_cabac_model = ps_ctxt->au1_rdopt_init_ctxt_models[IHEVC_CAB_NORES_IDX];
5104     UWORD8 u1_skip_tu_sz_sft = 0;
5105     UWORD8 u1_skip_tu_posx = 0;
5106     UWORD8 u1_skip_tu_posy = 0;
5107     UWORD8 u1_is_cu_noisy = ps_cu_prms->u1_is_cu_noisy;
5108 
5109     ps_final_prms = &ps_ctxt->as_cu_prms[curr_buf_idx];
5110     ps_nbr_4x4 = &ps_ctxt->as_cu_nbr[curr_buf_idx][0];
5111     pu1_ecd_data = &ps_final_prms->pu1_cu_coeffs[0];
5112     pi2_deq_data = &ps_final_prms->pi2_cu_deq_coeffs[0];
5113     csbf_strd = ps_ctxt->i4_cu_csbf_strd;
5114     pu1_csbf_buf = &ps_ctxt->au1_cu_csbf[0];
5115     pred_stride = ps_inter_cand->i4_pred_data_stride;
5116     recon_stride = cu_size;
5117     pu1_pred = ps_inter_cand->pu1_pred_data;
5118     chrm_ctr = 0;
5119     ecd_data_bytes_cons = 0;
5120     total_rdopt_cost = 0;
5121     num_4x4_in_cu = cu_size >> 2;
5122     recon_func_mode = PRED_MODE_INTER;
5123     cu_bits = 0;
5124 
5125     /* get the 4x4 level position of current cu */
5126     cu_pos_x = cu_pos_x << 1;
5127     cu_pos_y = cu_pos_y << 1;
5128 
5129     ps_final_prms->u1_is_cu_coded = 0;
5130     ps_final_prms->u4_cu_sad = 0;
5131 
5132     /* populate the coeffs scan idx */
5133     ps_ctxt->i4_scan_idx = SCAN_DIAG_UPRIGHT;
5134 
5135 #if ENABLE_INTER_ZCU_COST
5136     /* reset cu not coded cost */
5137     ps_ctxt->i8_cu_not_coded_cost = 0;
5138 
5139     /* backup copy of cabac states for restoration if zero cu residue rdo wins later */
5140     memcpy(au1_rdopt_init_ctxt_models, &ps_ctxt->au1_rdopt_init_ctxt_models[0], IHEVC_CAB_CTXT_END);
5141 #endif
5142 
5143     if(ps_cu_analyse->u1_cu_size == 64)
5144     {
5145         num_split_flags = 4;
5146         u1_tu_size = 32;
5147     }
5148     else
5149     {
5150         num_split_flags = 1;
5151         u1_tu_size = ps_cu_analyse->u1_cu_size;
5152     }
5153 
5154     if(1 == ps_final_prms->u1_skip_flag)
5155     {
5156         if(64 == cu_size)
5157         {
5158             /* TU = CU/2 is set but no transform is evaluated  */
5159             pu1_tu_sz_sft = &gau1_inter_tu_shft_amt[0];
5160             pu1_tu_posx = &gau1_inter_tu_posx_scl_amt[0];
5161             pu1_tu_posy = &gau1_inter_tu_posy_scl_amt[0];
5162         }
5163         else
5164         {
5165             /* TU = CU is set but no transform is evaluated  */
5166             pu1_tu_sz_sft = &u1_skip_tu_sz_sft;
5167             pu1_tu_posx = &u1_skip_tu_posx;
5168             pu1_tu_posy = &u1_skip_tu_posy;
5169         }
5170 
5171         recon_func_mode = PRED_MODE_SKIP;
5172     }
5173     /* check for PU part mode being AMP or No AMP */
5174     else if(ps_final_prms->u1_part_mode < SIZE_2NxnU)
5175     {
5176         if((SIZE_2Nx2N == ps_final_prms->u1_part_mode) && (cu_size < 64))
5177         {
5178             /* TU= CU is evaluated 2Nx2N inter case */
5179             pu1_tu_sz_sft = &u1_skip_tu_sz_sft;
5180             pu1_tu_posx = &u1_skip_tu_posx;
5181             pu1_tu_posy = &u1_skip_tu_posy;
5182         }
5183         else
5184         {
5185             /* currently TU= CU/2 is evaluated for all inter case */
5186             pu1_tu_sz_sft = &gau1_inter_tu_shft_amt[0];
5187             pu1_tu_posx = &gau1_inter_tu_posx_scl_amt[0];
5188             pu1_tu_posy = &gau1_inter_tu_posy_scl_amt[0];
5189         }
5190     }
5191     else
5192     {
5193         /* for AMP cases one level of TU recursion is done  */
5194         /* based on orientation of the partitions           */
5195         pu1_tu_sz_sft = &gau1_inter_tu_shft_amt_amp[ps_final_prms->u1_part_mode - 4][0];
5196         pu1_tu_posx = &gau1_inter_tu_posx_scl_amt_amp[ps_final_prms->u1_part_mode - 4][0];
5197         pu1_tu_posy = &gau1_inter_tu_posy_scl_amt_amp[ps_final_prms->u1_part_mode - 4][0];
5198     }
5199 
5200     i4_min_trans_size = 4;
5201 
5202     if(ps_ctxt->i1_cu_qp_delta_enable)
5203     {
5204         ihevce_update_cu_level_qp_lamda(ps_ctxt, ps_cu_analyse, i4_min_trans_size, 0);
5205     }
5206 
5207     if(u1_is_cu_noisy && !ps_ctxt->u1_enable_psyRDOPT)
5208     {
5209         ps_ctxt->i8_cl_ssd_lambda_qf =
5210             ((float)ps_ctxt->i8_cl_ssd_lambda_qf * (100.0f - RDOPT_LAMBDA_DISCOUNT_WHEN_NOISY) /
5211              100.0f);
5212         ps_ctxt->i8_cl_ssd_lambda_chroma_qf =
5213             ((float)ps_ctxt->i8_cl_ssd_lambda_chroma_qf *
5214              (100.0f - RDOPT_LAMBDA_DISCOUNT_WHEN_NOISY) / 100.0f);
5215     }
5216 
5217     u1_compute_spatial_ssd = (ps_ctxt->i4_cu_qp <= MAX_QP_WHERE_SPATIAL_SSD_ENABLED) &&
5218                              (ps_ctxt->i4_quality_preset < IHEVCE_QUALITY_P3) &&
5219                              CONVERT_SSDS_TO_SPATIAL_DOMAIN;
5220 
5221     if(u1_is_cu_noisy || ps_ctxt->u1_enable_psyRDOPT)
5222     {
5223         u1_compute_spatial_ssd = (ps_ctxt->i4_cu_qp <= MAX_HEVC_QP) &&
5224                                  CONVERT_SSDS_TO_SPATIAL_DOMAIN;
5225     }
5226 
5227     if(!u1_compute_spatial_ssd)
5228     {
5229         ps_final_prms->s_recon_datastore.u1_is_lumaRecon_available = 0;
5230         ps_final_prms->s_recon_datastore.au1_is_chromaRecon_available[0] = 0;
5231     }
5232     else
5233     {
5234         ps_final_prms->s_recon_datastore.u1_is_lumaRecon_available = 1;
5235 
5236         if(INCLUDE_CHROMA_DURING_TU_RECURSION && (ps_ctxt->i4_quality_preset <= IHEVCE_QUALITY_P0))
5237         {
5238             ps_final_prms->s_recon_datastore.au1_is_chromaRecon_available[0] = 1;
5239         }
5240     }
5241 
5242     /* RDOPT copy States :  TU init (best until prev TU) to current */
5243     memcpy(
5244         &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[curr_buf_idx]
5245              .s_cabac_ctxt.au1_ctxt_models[0],
5246         &ps_ctxt->au1_rdopt_init_ctxt_models[0],
5247         IHEVC_CAB_COEFFX_PREFIX);
5248 
5249     ihevce_tu_tree_init(
5250         as_tu_nodes,
5251         cu_size,
5252         (cu_size == 64) ? !ps_inter_cand->b1_skip_flag : 0,
5253         ps_inter_cand->b1_skip_flag ? 0 : ps_ctxt->u1_max_inter_tr_depth,
5254         INCLUDE_CHROMA_DURING_TU_RECURSION && (ps_ctxt->i4_quality_preset <= IHEVCE_QUALITY_P0),
5255         ps_ctxt->u1_chroma_array_type == 2);
5256 
5257     if(!ps_inter_cand->b1_skip_flag && (ps_ctxt->i4_quality_preset >= IHEVCE_QUALITY_P3))
5258     {
5259         ihevce_tuSplitArray_to_tuTree_mapper(
5260             as_tu_nodes,
5261             ps_inter_cand->ai4_tu_split_flag,
5262             cu_size,
5263             cu_size,
5264             MAX(MIN_TU_SIZE, (cu_size >> ps_ctxt->u1_max_inter_tr_depth)),
5265             MIN(MAX_TU_SIZE, cu_size),
5266             ps_inter_cand->b1_skip_flag);
5267     }
5268 
5269     ASSERT(ihevce_tu_tree_coverage_in_cu(as_tu_nodes) == cu_size * cu_size);
5270 
5271 #if ENABLE_INTER_ZCU_COST
5272     ps_ctxt->i8_cu_not_coded_cost = 0;
5273 #endif
5274 
5275     s_buffer_data_for_tu.s_src_pred_rec_buf_luma.pv_src = pv_src;
5276     s_buffer_data_for_tu.s_src_pred_rec_buf_luma.pv_pred = pu1_pred;
5277     s_buffer_data_for_tu.s_src_pred_rec_buf_luma.pv_recon =
5278         ps_final_prms->s_recon_datastore.apv_luma_recon_bufs[0];
5279     s_buffer_data_for_tu.s_src_pred_rec_buf_luma.i4_src_stride = src_strd;
5280     s_buffer_data_for_tu.s_src_pred_rec_buf_luma.i4_pred_stride = pred_stride;
5281     s_buffer_data_for_tu.s_src_pred_rec_buf_luma.i4_recon_stride =
5282         ps_final_prms->s_recon_datastore.i4_lumaRecon_stride;
5283     s_buffer_data_for_tu.s_src_pred_rec_buf_chroma.pv_src = ps_chrm_cu_buf_prms->pu1_curr_src;
5284     s_buffer_data_for_tu.s_src_pred_rec_buf_chroma.pv_pred =
5285         ps_ctxt->s_cu_me_intra_pred_prms.pu1_pred_data[CU_ME_INTRA_PRED_CHROMA_IDX] +
5286         curr_buf_idx * ((MAX_CTB_SIZE * MAX_CTB_SIZE >> 1) + ((ps_ctxt->u1_chroma_array_type == 2) *
5287                                                               (MAX_CTB_SIZE * MAX_CTB_SIZE >> 1)));
5288     s_buffer_data_for_tu.s_src_pred_rec_buf_chroma.pv_recon =
5289         ps_final_prms->s_recon_datastore.apv_chroma_recon_bufs[0];
5290     s_buffer_data_for_tu.s_src_pred_rec_buf_chroma.i4_src_stride =
5291         ps_chrm_cu_buf_prms->i4_chrm_src_stride;
5292     s_buffer_data_for_tu.s_src_pred_rec_buf_chroma.i4_pred_stride =
5293         ps_ctxt->s_cu_me_intra_pred_prms.ai4_pred_data_stride[CU_ME_INTRA_PRED_CHROMA_IDX];
5294     s_buffer_data_for_tu.s_src_pred_rec_buf_chroma.i4_recon_stride =
5295         ps_final_prms->s_recon_datastore.i4_chromaRecon_stride;
5296     s_buffer_data_for_tu.ps_nbr_data_buf = ps_nbr_4x4;
5297     s_buffer_data_for_tu.pi2_deq_data = pi2_deq_data;
5298     s_buffer_data_for_tu.pi2_deq_data_chroma =
5299         pi2_deq_data + ps_final_prms->i4_chrm_deq_coeff_strt_idx;
5300     s_buffer_data_for_tu.i4_nbr_data_buf_stride = num_4x4_in_cu;
5301     s_buffer_data_for_tu.i4_deq_data_stride = cu_size;
5302     s_buffer_data_for_tu.i4_deq_data_stride_chroma = cu_size;
5303     s_buffer_data_for_tu.ppu1_ecd = &pu1_ecd_data;
5304 
5305     if(INCLUDE_CHROMA_DURING_TU_RECURSION && (ps_ctxt->i4_quality_preset <= IHEVCE_QUALITY_P0))
5306     {
5307         UWORD8 i;
5308 
5309         UWORD8 *pu1_pred = (UWORD8 *)s_buffer_data_for_tu.s_src_pred_rec_buf_chroma.pv_pred;
5310 
5311         for(i = 0; i < (!!ps_inter_cand->b3_part_size) + 1; i++)
5312         {
5313             pu_t *ps_pu;
5314 
5315             WORD32 inter_pu_wd;
5316             WORD32 inter_pu_ht;
5317 
5318             ps_pu = ps_inter_cand->as_inter_pu + i;
5319 
5320             inter_pu_wd = (ps_pu->b4_wd + 1) << 2; /* cb and cr pixel interleaved */
5321             inter_pu_ht = ((ps_pu->b4_ht + 1) << 2) >> 1;
5322             inter_pu_ht <<= (ps_ctxt->u1_chroma_array_type == 2);
5323             ihevce_chroma_inter_pred_pu(
5324                 &ps_ctxt->s_mc_ctxt,
5325                 ps_pu,
5326                 pu1_pred,
5327                 s_buffer_data_for_tu.s_src_pred_rec_buf_chroma.i4_pred_stride);
5328             if(!!ps_inter_cand->b3_part_size)
5329             {
5330                 /* 2Nx__ partition case */
5331                 if(inter_pu_wd == cu_size)
5332                 {
5333                     pu1_pred +=
5334                         (inter_pu_ht *
5335                          s_buffer_data_for_tu.s_src_pred_rec_buf_chroma.i4_pred_stride);
5336                 }
5337 
5338                 /* __x2N partition case */
5339                 if(inter_pu_ht == (cu_size >> !(ps_ctxt->u1_chroma_array_type == 2)))
5340                 {
5341                     pu1_pred += inter_pu_wd;
5342                 }
5343             }
5344         }
5345     }
5346 
5347 #if !ENABLE_TOP_DOWN_TU_RECURSION
5348     total_rdopt_cost = ihevce_tu_tree_selector(
5349         ps_ctxt,
5350         as_tu_nodes,
5351         &s_buffer_data_for_tu,
5352         &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[curr_buf_idx]
5353              .s_cabac_ctxt.au1_ctxt_models[0],
5354         recon_func_mode,
5355 #if USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
5356         i4_alpha_stim_multiplier,
5357         u1_is_cu_noisy,
5358 #endif
5359         0,
5360         ps_ctxt->u1_max_inter_tr_depth,
5361         ps_inter_cand->b3_part_size,
5362         u1_compute_spatial_ssd);
5363 #else
5364     total_rdopt_cost = ihevce_topDown_tu_tree_selector(
5365         ps_ctxt,
5366         as_tu_nodes,
5367         &s_buffer_data_for_tu,
5368         &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[curr_buf_idx]
5369              .s_cabac_ctxt.au1_ctxt_models[0],
5370         recon_func_mode,
5371 #if USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
5372         i4_alpha_stim_multiplier,
5373         u1_is_cu_noisy,
5374 #endif
5375         0,
5376         ps_ctxt->u1_max_inter_tr_depth,
5377         ps_inter_cand->b3_part_size,
5378         INCLUDE_CHROMA_DURING_TU_RECURSION && (ps_ctxt->i4_quality_preset <= IHEVCE_QUALITY_P0),
5379         u1_compute_spatial_ssd);
5380 #endif
5381 
5382     ps_final_prms->u2_num_tus_in_cu = 0;
5383     ps_final_prms->u4_cu_luma_res_bits = 0;
5384     ps_final_prms->u4_cu_sad = 0;
5385     total_rdopt_cost = 0;
5386     ecd_data_bytes_cons = 0;
5387     cu_bits = 0;
5388 #if ENABLE_INTER_ZCU_COST
5389     ps_ctxt->i8_cu_not_coded_cost = 0;
5390 #endif
5391     ps_final_prms->u1_is_cu_coded = 0;
5392     ps_final_prms->u1_cu_size = cu_size;
5393 
5394     ihevce_tu_selector_debriefer(
5395         as_tu_nodes,
5396         ps_final_prms,
5397         &total_rdopt_cost,
5398 #if ENABLE_INTER_ZCU_COST
5399         &ps_ctxt->i8_cu_not_coded_cost,
5400 #endif
5401         &ecd_data_bytes_cons,
5402         &cu_bits,
5403         &ps_final_prms->u2_num_tus_in_cu,
5404         ps_ctxt->i4_cu_qp,
5405         cu_pos_x * 4,
5406         cu_pos_y * 4,
5407         INCLUDE_CHROMA_DURING_TU_RECURSION && (ps_ctxt->i4_quality_preset <= IHEVCE_QUALITY_P0),
5408         (ps_ctxt->u1_chroma_array_type == 2),
5409         POS_TL);
5410 
5411     if(!(INCLUDE_CHROMA_DURING_TU_RECURSION && (ps_ctxt->i4_quality_preset <= IHEVCE_QUALITY_P0)))
5412     {
5413         ps_final_prms->i4_chrm_cu_coeff_strt_idx = ecd_data_bytes_cons;
5414     }
5415 
5416     /* Modify the cost function for this CU. */
5417     /* loop in for 8x8 blocks */
5418     if(ps_ctxt->u1_enable_psyRDOPT)
5419     {
5420         UWORD8 *pu1_recon_cu;
5421         WORD32 recon_stride;
5422         WORD32 curr_pos_x;
5423         WORD32 curr_pos_y;
5424         WORD32 start_index;
5425         WORD32 num_horz_cu_in_ctb;
5426         WORD32 had_block_size;
5427 
5428         /* tODO: sreenivasa ctb size has to be used appropriately */
5429         had_block_size = 8;
5430         num_horz_cu_in_ctb = 64 / had_block_size;
5431 
5432         curr_pos_x = cu_pos_x << 2; /* pel units */
5433         curr_pos_y = cu_pos_y << 2; /* pel units */
5434         recon_stride = ps_final_prms->s_recon_datastore.i4_lumaRecon_stride;
5435         pu1_recon_cu = ((UWORD8 *)ps_final_prms->s_recon_datastore
5436                             .apv_luma_recon_bufs[0]);  // already pointing to the current CU recon
5437         //+ \curr_pos_x + curr_pos_y * recon_stride;
5438 
5439         /* start index to index the source satd of curr cu int he current ctb*/
5440         start_index =
5441             (curr_pos_x / had_block_size) + (curr_pos_y / had_block_size) * num_horz_cu_in_ctb;
5442 
5443         {
5444             total_rdopt_cost += ihevce_psy_rd_cost(
5445                 ps_ctxt->ai4_source_satd_8x8,
5446                 pu1_recon_cu,
5447                 recon_stride,
5448                 1,  //howz stride
5449                 cu_size,
5450                 0,  // pic type
5451                 0,  //layer id
5452                 ps_ctxt->i4_satd_lamda,  // lambda
5453                 start_index,
5454                 ps_ctxt->u1_is_input_data_hbd,
5455                 ps_ctxt->u4_psy_strength,
5456                 &ps_ctxt->s_cmn_opt_func);  // 8 bit
5457         }
5458     }
5459 
5460     ps_final_prms->u1_chroma_intra_pred_mode = 4;
5461 
5462     /* update the bytes consumed */
5463     ps_final_prms->i4_num_bytes_ecd_data = ecd_data_bytes_cons;
5464 
5465     /* store the current cu size to final prms */
5466     ps_final_prms->u1_cu_size = cu_size;
5467     /* ------------- Chroma processing -------------- */
5468     /* Chroma rdopt eval for each luma candidate only for HIGH QUALITY/MEDIUM SPEDD preset*/
5469     if(ps_ctxt->s_chroma_rdopt_ctxt.u1_eval_chrm_rdopt &&
5470        !(INCLUDE_CHROMA_DURING_TU_RECURSION && (ps_ctxt->i4_quality_preset <= IHEVCE_QUALITY_P0)))
5471     {
5472         LWORD64 chrm_rdopt_cost;
5473         WORD32 chrm_rdopt_tu_bits;
5474 
5475         /* Store the current RDOPT cost to enable early exit in chrom_prcs */
5476         ps_ctxt->as_cu_prms[curr_buf_idx].i8_curr_rdopt_cost = total_rdopt_cost;
5477 
5478         chrm_rdopt_cost = ihevce_chroma_cu_prcs_rdopt(
5479             ps_ctxt,
5480             curr_buf_idx,
5481             0, /* TU mode : Don't care in Inter patrh */
5482             ps_chrm_cu_buf_prms->pu1_curr_src,
5483             ps_chrm_cu_buf_prms->i4_chrm_src_stride,
5484             ps_chrm_cu_buf_prms->pu1_cu_left,
5485             ps_chrm_cu_buf_prms->pu1_cu_top,
5486             ps_chrm_cu_buf_prms->pu1_cu_top_left,
5487             ps_chrm_cu_buf_prms->i4_cu_left_stride,
5488             (cu_pos_x >> 1),
5489             (cu_pos_y >> 1),
5490             &chrm_rdopt_tu_bits,
5491             i4_alpha_stim_multiplier,
5492             u1_is_cu_noisy);
5493 
5494 #if WEIGH_CHROMA_COST
5495         chrm_rdopt_cost = (LWORD64)(
5496             (chrm_rdopt_cost * ps_ctxt->u4_chroma_cost_weighing_factor +
5497              (1 << (CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT - 1))) >>
5498             CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT);
5499 #endif
5500 
5501 #if CHROMA_RDOPT_ENABLE
5502         total_rdopt_cost += chrm_rdopt_cost;
5503 #endif
5504         cu_bits += chrm_rdopt_tu_bits;
5505 
5506         /* during chroma evaluation if skip decision was over written     */
5507         /* then the current skip candidate is set to a non skip candidate */
5508         ps_inter_cand->b1_skip_flag = ps_final_prms->u1_skip_flag;
5509 
5510         /* cu bits for chroma residual if chroma rdopt is on       */
5511         /* if zero_cbf eval is disabled then cu bits will be zero  */
5512         ps_final_prms->u4_cu_chroma_res_bits = chrm_rdopt_tu_bits;
5513 
5514         if(ps_ctxt->i4_bitrate_instance_num || ps_ctxt->i4_num_bitrates == 1)
5515         {
5516             /* Early exit : If the current running cost exceeds
5517             the prev. best mode cost, break */
5518             if(total_rdopt_cost > prev_best_rdopt_cost)
5519             {
5520                 return (total_rdopt_cost);
5521             }
5522         }
5523     }
5524     else
5525     {}
5526 
5527 #if SHRINK_INTER_TUTREE
5528     /* ------------- Quadtree TU split  optimization ------------  */
5529     if(ps_final_prms->u1_is_cu_coded)
5530     {
5531         ps_final_prms->u2_num_tus_in_cu = ihevce_shrink_inter_tu_tree(
5532             &ps_final_prms->as_tu_enc_loop[0],
5533             &ps_final_prms->as_tu_enc_loop_temp_prms[0],
5534             &ps_final_prms->s_recon_datastore,
5535             ps_final_prms->u2_num_tus_in_cu,
5536             (ps_ctxt->u1_chroma_array_type == 2));
5537     }
5538 #endif
5539 
5540     /* RDOPT copy States :  Best after all luma TUs (and chroma,if enabled)to current */
5541     COPY_CABAC_STATES_FRM_CAB_COEFFX_PREFIX(
5542         &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[curr_buf_idx]
5543                 .s_cabac_ctxt.au1_ctxt_models[0] +
5544             IHEVC_CAB_COEFFX_PREFIX,
5545         &ps_ctxt->au1_rdopt_init_ctxt_models[0] + IHEVC_CAB_COEFFX_PREFIX,
5546         IHEVC_CAB_CTXT_END - IHEVC_CAB_COEFFX_PREFIX);
5547 
5548     /* -------- Bit estimate for RD opt -------------- */
5549     {
5550         nbr_avail_flags_t s_nbr;
5551         /*cbf_bits will account for both texture and cbf bits when zero cbf eval flag is 0*/
5552         WORD32 cbf_bits, header_bits;
5553 
5554         /* get the neighbour availability flags for current cu  */
5555         ihevce_get_only_nbr_flag(
5556             &s_nbr,
5557             ps_ctxt->pu1_ctb_nbr_map,
5558             ps_ctxt->i4_nbr_map_strd,
5559             cu_pos_x,
5560             cu_pos_y,
5561             (cu_size >> 2),
5562             (cu_size >> 2));
5563 
5564         /* call the entropy rdo encode to get the bit estimate for current cu */
5565         header_bits = ihevce_entropy_rdo_encode_cu(
5566             &ps_ctxt->s_rdopt_entropy_ctxt,
5567             ps_final_prms,
5568             (cu_pos_x >> 1), /*  back to 8x8 pel units   */
5569             (cu_pos_y >> 1), /*  back to 8x8 pel units   */
5570             cu_size,
5571             ps_ctxt->u1_disable_intra_eval ? !DISABLE_TOP_SYNC && s_nbr.u1_top_avail
5572                                            : s_nbr.u1_top_avail,
5573             s_nbr.u1_left_avail,
5574             &ps_final_prms->pu1_cu_coeffs[0],
5575             &cbf_bits);
5576 
5577         cu_bits += header_bits;
5578 
5579         /* cbf bits are excluded from header bits, instead considered as texture bits */
5580         /* incase if zero cbf eval is disabled then texture bits gets added here */
5581         ps_final_prms->u4_cu_hdr_bits = (header_bits - cbf_bits);
5582         ps_final_prms->u4_cu_cbf_bits = cbf_bits;
5583 
5584 #if RDOPT_ENABLE
5585         /* add the cost of coding the header bits */
5586         total_rdopt_cost +=
5587             COMPUTE_RATE_COST_CLIP30(header_bits, ps_ctxt->i8_cl_ssd_lambda_qf, LAMBDA_Q_SHIFT);
5588 
5589 #if ENABLE_INTER_ZCU_COST
5590         /* If cu is coded, Evaluate not coded cost and check if it improves over coded cost */
5591         if(ps_final_prms->u1_is_cu_coded && (ZCBF_ENABLE == ps_ctxt->i4_zcbf_rdo_level))
5592         {
5593             LWORD64 i8_cu_not_coded_cost = ps_ctxt->i8_cu_not_coded_cost;
5594 
5595             WORD32 is_2nx2n_mergecu = (SIZE_2Nx2N == ps_final_prms->u1_part_mode) &&
5596                                       (1 == ps_final_prms->as_pu_enc_loop[0].b1_merge_flag);
5597 
5598             cab_ctxt_t *ps_cab_ctxt =
5599                 &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[curr_buf_idx].s_cabac_ctxt;
5600 
5601             /* Read header bits generatated after ihevce_entropy_rdo_encode_cu() call  */
5602             UWORD32 u4_cu_hdr_bits_q12 = ps_cab_ctxt->u4_header_bits_estimated_q12;
5603 
5604             /* account for coding qt_root_cbf = 0 */
5605             /* First subtract cost for coding as 1 (part of header bits) and then add cost for coding as 0 */
5606             u4_cu_hdr_bits_q12 += gau2_ihevce_cabac_bin_to_bits[u1_qtroot_cbf_cabac_model ^ 0];
5607             if(u4_cu_hdr_bits_q12 < gau2_ihevce_cabac_bin_to_bits[u1_qtroot_cbf_cabac_model ^ 1])
5608                 u4_cu_hdr_bits_q12 = 0;
5609             else
5610                 u4_cu_hdr_bits_q12 -= gau2_ihevce_cabac_bin_to_bits[u1_qtroot_cbf_cabac_model ^ 1];
5611 
5612             /* add the cost of coding the header bits */
5613             i8_cu_not_coded_cost += COMPUTE_RATE_COST_CLIP30(
5614                 u4_cu_hdr_bits_q12 /* ps_final_prms->u4_cu_hdr_bits */,
5615                 ps_ctxt->i8_cl_ssd_lambda_qf,
5616                 (LAMBDA_Q_SHIFT + CABAC_FRAC_BITS_Q));
5617 
5618             if(ps_ctxt->u1_enable_psyRDOPT)
5619             {
5620                 i8_cu_not_coded_cost = total_rdopt_cost + 1;
5621             }
5622 
5623             /* Evaluate qtroot cbf rdo; exclude 2Nx2N Merge as skip cu is explicitly evaluated */
5624             if((i8_cu_not_coded_cost <= total_rdopt_cost) && (!is_2nx2n_mergecu))
5625             {
5626                 WORD32 tx_size;
5627 
5628                 /* force cu as not coded and update the cost */
5629                 ps_final_prms->u1_is_cu_coded = 0;
5630                 ps_final_prms->s_recon_datastore.au1_is_chromaRecon_available[0] = 0;
5631                 ps_final_prms->s_recon_datastore.u1_is_lumaRecon_available = 0;
5632 
5633                 total_rdopt_cost = i8_cu_not_coded_cost;
5634 
5635                 /* reset num TUs to 1 unless cu size id 64 */
5636                 ps_final_prms->u2_num_tus_in_cu = (64 == cu_size) ? 4 : 1;
5637                 trans_size = (64 == cu_size) ? 32 : cu_size;
5638                 GETRANGE(tx_size, trans_size);
5639 
5640                 /* reset the bytes consumed */
5641                 ps_final_prms->i4_num_bytes_ecd_data = 0;
5642 
5643                 /* reset texture related bits and roll back header bits*/
5644                 ps_final_prms->u4_cu_cbf_bits = 0;
5645                 ps_final_prms->u4_cu_luma_res_bits = 0;
5646                 ps_final_prms->u4_cu_chroma_res_bits = 0;
5647                 ps_final_prms->u4_cu_hdr_bits =
5648                     (u4_cu_hdr_bits_q12 + (1 << (CABAC_FRAC_BITS_Q - 1))) >> CABAC_FRAC_BITS_Q;
5649 
5650                 /* update cabac model with qtroot cbf = 0 decision */
5651                 ps_cab_ctxt->au1_ctxt_models[IHEVC_CAB_NORES_IDX] =
5652                     gau1_ihevc_next_state[u1_qtroot_cbf_cabac_model << 1];
5653 
5654                 /* restore untouched cabac models for, tusplit, cbfs, texture etc */
5655                 memcpy(
5656                     &ps_cab_ctxt->au1_ctxt_models[IHEVC_CAB_SPLIT_TFM],
5657                     &au1_rdopt_init_ctxt_models[IHEVC_CAB_SPLIT_TFM],
5658                     (IHEVC_CAB_CTXT_END - IHEVC_CAB_SPLIT_TFM));
5659 
5660                 /* mark all tus as not coded for final eval */
5661                 for(ctr = 0; ctr < ps_final_prms->u2_num_tus_in_cu; ctr++)
5662                 {
5663                     WORD32 curr_pos_x = (ctr & 0x1) ? (trans_size >> 2) : 0;
5664                     WORD32 curr_pos_y = (ctr & 0x2) ? (trans_size >> 2) : 0;
5665 
5666                     nbr_4x4_t *ps_cur_nbr_4x4 =
5667                         ps_nbr_4x4 + curr_pos_x + (curr_pos_y * num_4x4_in_cu);
5668 
5669                     num_4x4_in_tu = trans_size >> 2;
5670 
5671                     ps_final_prms->as_tu_enc_loop_temp_prms[ctr].i2_luma_bytes_consumed = 0;
5672                     ps_final_prms->as_tu_enc_loop_temp_prms[ctr].ai2_cb_bytes_consumed[0] = 0;
5673                     ps_final_prms->as_tu_enc_loop_temp_prms[ctr].ai2_cr_bytes_consumed[0] = 0;
5674 
5675                     ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_y_cbf = 0;
5676                     ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_cr_cbf = 0;
5677                     ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_cb_cbf = 0;
5678 
5679                     ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_cr_cbf_subtu1 = 0;
5680                     ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_cb_cbf_subtu1 = 0;
5681 
5682                     ps_final_prms->as_tu_enc_loop[ctr].s_tu.b3_size = tx_size - 3;
5683                     ps_final_prms->as_tu_enc_loop[ctr].s_tu.b4_pos_x = cu_pos_x + curr_pos_x;
5684                     ps_final_prms->as_tu_enc_loop[ctr].s_tu.b4_pos_y = cu_pos_y + curr_pos_y;
5685 
5686                     /* reset cbf for the all 4x4 in TU */
5687                     {
5688                         WORD32 i, j;
5689                         nbr_4x4_t *ps_tmp_4x4;
5690                         ps_tmp_4x4 = ps_cur_nbr_4x4;
5691 
5692                         for(i = 0; i < num_4x4_in_tu; i++)
5693                         {
5694                             for(j = 0; j < num_4x4_in_tu; j++)
5695                             {
5696                                 ps_tmp_4x4[j].b1_y_cbf = 0;
5697                             }
5698                             /* row level update*/
5699                             ps_tmp_4x4 += num_4x4_in_cu;
5700                         }
5701                     }
5702                 }
5703             }
5704         }
5705 #endif /* ENABLE_INTER_ZCU_COST */
5706 
5707 #endif /* RDOPT_ENABLE */
5708     }
5709 
5710     return (total_rdopt_cost);
5711 }
5712 #endif
5713 
5714 /*!
5715 ******************************************************************************
5716 * \if Function name : ihevce_inter_rdopt_cu_mc_mvp \endif
5717 *
5718 * \brief
5719 *    Inter Coding unit funtion which performs MC and MVP calc for RD opt mode
5720 *
5721 * \param[in] ps_ctxt       enc_loop module ctxt pointer
5722 * \param[in] ps_inter_cand pointer to inter candidate structure
5723 * \param[in] cu_size         Current CU size
5724 * \param[in] cu_pos_x        cu position x w.r.t to ctb
5725 * \param[in] cu_pos_y        cu position y w.r.t to ctb
5726 * \param[in] ps_left_nbr_4x4 Left neighbour 4x4 structure pointer
5727 * \param[in] ps_top_nbr_4x4  top neighbour 4x4 structure pointer
5728 * \param[in] ps_topleft_nbr_4x4  top left neighbour 4x4 structure pointer
5729 * \param[in] nbr_4x4_left_strd  left neighbour 4x4 buffer stride
5730 * \param[in] curr_buf_idx Current Buffer index
5731 *
5732 * \return
5733 *    Rdopt cost
5734 *
5735 * \author
5736 *  Ittiam
5737 *
5738 *****************************************************************************
5739 */
ihevce_inter_rdopt_cu_mc_mvp(ihevce_enc_loop_ctxt_t * ps_ctxt,cu_inter_cand_t * ps_inter_cand,WORD32 cu_size,WORD32 cu_pos_x,WORD32 cu_pos_y,nbr_4x4_t * ps_left_nbr_4x4,nbr_4x4_t * ps_top_nbr_4x4,nbr_4x4_t * ps_topleft_nbr_4x4,WORD32 nbr_4x4_left_strd,WORD32 curr_buf_idx)5740 LWORD64 ihevce_inter_rdopt_cu_mc_mvp(
5741     ihevce_enc_loop_ctxt_t *ps_ctxt,
5742     cu_inter_cand_t *ps_inter_cand,
5743     WORD32 cu_size,
5744     WORD32 cu_pos_x,
5745     WORD32 cu_pos_y,
5746     nbr_4x4_t *ps_left_nbr_4x4,
5747     nbr_4x4_t *ps_top_nbr_4x4,
5748     nbr_4x4_t *ps_topleft_nbr_4x4,
5749     WORD32 nbr_4x4_left_strd,
5750     WORD32 curr_buf_idx)
5751 {
5752     /* local variables */
5753     enc_loop_cu_final_prms_t *ps_final_prms;
5754     nbr_avail_flags_t s_nbr;
5755     nbr_4x4_t *ps_nbr_4x4;
5756 
5757     UWORD8 au1_is_top_used[2][MAX_MVP_LIST_CAND];
5758     UWORD8 *pu1_pred;
5759     WORD32 rdopt_cost;
5760     WORD32 ctr;
5761     WORD32 num_cu_part;
5762     WORD32 inter_pu_wd;
5763     WORD32 inter_pu_ht;
5764     WORD32 pred_stride;
5765 
5766     /* get the pointers based on curbuf idx */
5767     ps_nbr_4x4 = &ps_ctxt->as_cu_nbr[curr_buf_idx][0];
5768     ps_final_prms = &ps_ctxt->as_cu_prms[curr_buf_idx];
5769     pu1_pred = ps_inter_cand->pu1_pred_data;
5770 
5771     pred_stride = ps_inter_cand->i4_pred_data_stride;
5772 
5773     /* store the partition mode in final prms */
5774     ps_final_prms->u1_part_mode = ps_inter_cand->b3_part_size;
5775 
5776     /* since encoder does not support NXN part type */
5777     /* num parts can be either 1 or 2 only          */
5778     ASSERT(SIZE_NxN != ps_inter_cand->b3_part_size);
5779 
5780     num_cu_part = (SIZE_2Nx2N != ps_inter_cand->b3_part_size) + 1;
5781 
5782     /* get the 4x4 level position of current cu */
5783     cu_pos_x = cu_pos_x << 1;
5784     cu_pos_y = cu_pos_y << 1;
5785 
5786     /* populate cu level params */
5787     ps_final_prms->u1_intra_flag = PRED_MODE_INTER;
5788     ps_final_prms->u2_num_pus_in_cu = num_cu_part;
5789 
5790     /* run a loop over all the partitons in cu */
5791     for(ctr = 0; ctr < num_cu_part; ctr++)
5792     {
5793         pu_mv_t as_pred_mv[MAX_MVP_LIST_CAND];
5794         pu_t *ps_pu;
5795         WORD32 skip_or_merge_flag;
5796         UWORD8 u1_use_mvp_from_top_row;
5797 
5798         ps_pu = &ps_inter_cand->as_inter_pu[ctr];
5799 
5800         /* IF AMP then each partitions can have diff wd ht */
5801         inter_pu_wd = (ps_pu->b4_wd + 1) << 2;
5802         inter_pu_ht = (ps_pu->b4_ht + 1) << 2;
5803 
5804         /* populate reference pic buf id for bs compute */
5805 
5806         /* L0 */
5807         if(-1 != ps_pu->mv.i1_l0_ref_idx)
5808         {
5809             ps_pu->mv.i1_l0_ref_pic_buf_id =
5810                 ps_ctxt->s_mv_pred_ctxt.ps_ref_list[0][ps_pu->mv.i1_l0_ref_idx]->i4_buf_id;
5811         }
5812 
5813         /* L1 */
5814         if(-1 != ps_pu->mv.i1_l1_ref_idx)
5815         {
5816             ps_pu->mv.i1_l1_ref_pic_buf_id =
5817                 ps_ctxt->s_mv_pred_ctxt.ps_ref_list[1][ps_pu->mv.i1_l1_ref_idx]->i4_buf_id;
5818         }
5819 
5820         /* SKIP or merge check for every part */
5821         skip_or_merge_flag = ps_inter_cand->b1_skip_flag | ps_pu->b1_merge_flag;
5822 
5823         /* ----------- MV Prediction ----------------- */
5824         if(0 == skip_or_merge_flag)
5825         {
5826             /* get the neighbour availability flags */
5827             ihevce_get_only_nbr_flag(
5828                 &s_nbr,
5829                 ps_ctxt->pu1_ctb_nbr_map,
5830                 ps_ctxt->i4_nbr_map_strd,
5831                 cu_pos_x,
5832                 cu_pos_y,
5833                 inter_pu_wd >> 2,
5834                 inter_pu_ht >> 2);
5835 
5836             if(ps_ctxt->u1_disable_intra_eval && DISABLE_TOP_SYNC && (ps_pu->b4_pos_y == 0))
5837             {
5838                 u1_use_mvp_from_top_row = 0;
5839             }
5840             else
5841             {
5842                 u1_use_mvp_from_top_row = 1;
5843             }
5844 
5845             if(!u1_use_mvp_from_top_row)
5846             {
5847                 if(s_nbr.u1_top_avail || s_nbr.u1_top_lt_avail || s_nbr.u1_top_rt_avail)
5848                 {
5849                     if(!s_nbr.u1_left_avail && !s_nbr.u1_bot_lt_avail)
5850                     {
5851                         WORD32 curr_cu_pos_in_row, cu_top_right_offset, cu_top_right_dep_pos;
5852 
5853                         /* Ensure Top Right Sync */
5854                         if(!ps_ctxt->u1_use_top_at_ctb_boundary)
5855                         {
5856                             curr_cu_pos_in_row =
5857                                 ps_ctxt->s_mc_ctxt.i4_ctb_frm_pos_x + (cu_pos_x << 2);
5858 
5859                             if(ps_ctxt->s_mc_ctxt.i4_ctb_frm_pos_y == 0)
5860                             {
5861                                 /* No wait for 1st row */
5862                                 cu_top_right_offset = -(MAX_CTB_SIZE);
5863                                 {
5864                                     ihevce_tile_params_t *ps_col_tile_params =
5865                                         ((ihevce_tile_params_t *)ps_ctxt->pv_tile_params_base +
5866                                          ps_ctxt->i4_tile_col_idx);
5867 
5868                                     /* No wait for 1st row */
5869                                     cu_top_right_offset =
5870                                         -(ps_col_tile_params->i4_first_sample_x + (MAX_CTB_SIZE));
5871                                 }
5872                                 cu_top_right_dep_pos = 0;
5873                             }
5874                             else
5875                             {
5876                                 cu_top_right_offset = (cu_size) + 4;
5877                                 cu_top_right_dep_pos =
5878                                     (ps_ctxt->s_mc_ctxt.i4_ctb_frm_pos_y >> 6) - 1;
5879                             }
5880 
5881                             ihevce_dmgr_chk_row_row_sync(
5882                                 ps_ctxt->pv_dep_mngr_enc_loop_cu_top_right,
5883                                 curr_cu_pos_in_row,
5884                                 cu_top_right_offset,
5885                                 cu_top_right_dep_pos,
5886                                 ps_ctxt->i4_tile_col_idx, /* Col Tile No. */
5887                                 ps_ctxt->thrd_id);
5888                         }
5889 
5890                         u1_use_mvp_from_top_row = 1;
5891                     }
5892                     else
5893                     {
5894                         s_nbr.u1_top_avail = 0;
5895                         s_nbr.u1_top_lt_avail = 0;
5896                         s_nbr.u1_top_rt_avail = 0;
5897                     }
5898                 }
5899                 else
5900                 {
5901                     u1_use_mvp_from_top_row = 1;
5902                 }
5903             }
5904             /* Call the MV prediction module to get MVP */
5905             ihevce_mv_pred(
5906                 &ps_ctxt->s_mv_pred_ctxt,
5907                 ps_top_nbr_4x4,
5908                 ps_left_nbr_4x4,
5909                 ps_topleft_nbr_4x4,
5910                 nbr_4x4_left_strd,
5911                 &s_nbr,
5912                 NULL, /* colocated MV */
5913                 ps_pu,
5914                 &as_pred_mv[0],
5915                 au1_is_top_used);
5916         }
5917 
5918         /* store the nbr 4x4 structure */
5919         ps_nbr_4x4->b1_skip_flag = ps_inter_cand->b1_skip_flag;
5920         ps_nbr_4x4->b1_intra_flag = 0;
5921         ps_nbr_4x4->b1_pred_l0_flag = 0;
5922         ps_nbr_4x4->b1_pred_l1_flag = 0;
5923 
5924         /* DC is default mode for inter cu, required for intra mode signalling */
5925         ps_nbr_4x4->b6_luma_intra_mode = 1;
5926 
5927         /* copy the motion vectors to neighbour structure */
5928         ps_nbr_4x4->mv = ps_pu->mv;
5929 
5930         /* copy the PU to final out pu */
5931         ps_final_prms->as_pu_enc_loop[ctr] = *ps_pu;
5932 
5933         /* copy the PU to chroma */
5934         ps_final_prms->as_pu_chrm_proc[ctr] = *ps_pu;
5935 
5936         /* store the skip flag to final prms */
5937         ps_final_prms->u1_skip_flag = ps_inter_cand->b1_skip_flag;
5938 
5939         /* MVP index & MVD calc is gated on skip/merge flag */
5940         if(0 == skip_or_merge_flag)
5941         {
5942             /* calculate the MVDs and popluate the MVP idx for L0 */
5943             if((PRED_BI == ps_pu->b2_pred_mode) || (PRED_L0 == ps_pu->b2_pred_mode))
5944             {
5945                 WORD32 idx0_cost, idx1_cost;
5946 
5947                 /* calculate the ABS mvd for cand 0 */
5948                 idx0_cost = abs(ps_pu->mv.s_l0_mv.i2_mvx - as_pred_mv[0].s_l0_mv.i2_mvx);
5949                 idx0_cost += abs(ps_pu->mv.s_l0_mv.i2_mvy - as_pred_mv[0].s_l0_mv.i2_mvy);
5950 
5951                 /* calculate the ABS mvd for cand 1 */
5952                 if(u1_use_mvp_from_top_row)
5953                 {
5954                     idx1_cost = abs(ps_pu->mv.s_l0_mv.i2_mvx - as_pred_mv[1].s_l0_mv.i2_mvx);
5955                     idx1_cost += abs(ps_pu->mv.s_l0_mv.i2_mvy - as_pred_mv[1].s_l0_mv.i2_mvy);
5956                 }
5957                 else
5958                 {
5959                     idx1_cost = INT_MAX;
5960                 }
5961 
5962                 /* based on the least cost choose the mvp idx */
5963                 if(idx0_cost <= idx1_cost)
5964                 {
5965                     ps_final_prms->as_pu_enc_loop[ctr].mv.s_l0_mv.i2_mvx -=
5966                         as_pred_mv[0].s_l0_mv.i2_mvx;
5967                     ps_final_prms->as_pu_enc_loop[ctr].mv.s_l0_mv.i2_mvy -=
5968                         as_pred_mv[0].s_l0_mv.i2_mvy;
5969 
5970                     ps_final_prms->as_pu_enc_loop[ctr].b1_l0_mvp_idx = 0;
5971                 }
5972                 else
5973                 {
5974                     ps_final_prms->as_pu_enc_loop[ctr].mv.s_l0_mv.i2_mvx -=
5975                         as_pred_mv[1].s_l0_mv.i2_mvx;
5976                     ps_final_prms->as_pu_enc_loop[ctr].mv.s_l0_mv.i2_mvy -=
5977                         as_pred_mv[1].s_l0_mv.i2_mvy;
5978 
5979                     ps_final_prms->as_pu_enc_loop[ctr].b1_l0_mvp_idx = 1;
5980                 }
5981 
5982                 /* set the pred l0 flag for neighbour storage */
5983                 ps_nbr_4x4->b1_pred_l0_flag = 1;
5984             }
5985             /* calculate the MVDs and popluate the MVP idx for L1 */
5986             if((PRED_BI == ps_pu->b2_pred_mode) || (PRED_L1 == ps_pu->b2_pred_mode))
5987             {
5988                 WORD32 idx0_cost, idx1_cost;
5989 
5990                 /* calculate the ABS mvd for cand 0 */
5991                 idx0_cost = abs(ps_pu->mv.s_l1_mv.i2_mvx - as_pred_mv[0].s_l1_mv.i2_mvx);
5992                 idx0_cost += abs(ps_pu->mv.s_l1_mv.i2_mvy - as_pred_mv[0].s_l1_mv.i2_mvy);
5993 
5994                 /* calculate the ABS mvd for cand 1 */
5995                 if(u1_use_mvp_from_top_row)
5996                 {
5997                     idx1_cost = abs(ps_pu->mv.s_l1_mv.i2_mvx - as_pred_mv[1].s_l1_mv.i2_mvx);
5998                     idx1_cost += abs(ps_pu->mv.s_l1_mv.i2_mvy - as_pred_mv[1].s_l1_mv.i2_mvy);
5999                 }
6000                 else
6001                 {
6002                     idx1_cost = INT_MAX;
6003                 }
6004 
6005                 /* based on the least cost choose the mvp idx */
6006                 if(idx0_cost <= idx1_cost)
6007                 {
6008                     ps_final_prms->as_pu_enc_loop[ctr].mv.s_l1_mv.i2_mvx -=
6009                         as_pred_mv[0].s_l1_mv.i2_mvx;
6010                     ps_final_prms->as_pu_enc_loop[ctr].mv.s_l1_mv.i2_mvy -=
6011                         as_pred_mv[0].s_l1_mv.i2_mvy;
6012 
6013                     ps_final_prms->as_pu_enc_loop[ctr].b1_l1_mvp_idx = 0;
6014                 }
6015                 else
6016                 {
6017                     ps_final_prms->as_pu_enc_loop[ctr].mv.s_l1_mv.i2_mvx -=
6018                         as_pred_mv[1].s_l1_mv.i2_mvx;
6019                     ps_final_prms->as_pu_enc_loop[ctr].mv.s_l1_mv.i2_mvy -=
6020                         as_pred_mv[1].s_l1_mv.i2_mvy;
6021 
6022                     ps_final_prms->as_pu_enc_loop[ctr].b1_l1_mvp_idx = 1;
6023                 }
6024 
6025                 /* set the pred l1 flag for neighbour storage */
6026                 ps_nbr_4x4->b1_pred_l1_flag = 1;
6027             }
6028 
6029             /* set the merge flag to 0 */
6030             ps_final_prms->as_pu_enc_loop[ctr].b1_merge_flag = 0;
6031             ps_final_prms->as_pu_enc_loop[ctr].b3_merge_idx = 0;
6032         }
6033         else
6034         {
6035             /* copy the merge index from candidate */
6036             ps_final_prms->as_pu_enc_loop[ctr].b1_merge_flag = ps_pu->b1_merge_flag;
6037 
6038             ps_final_prms->as_pu_enc_loop[ctr].b3_merge_idx = ps_pu->b3_merge_idx;
6039 
6040             if((PRED_BI == ps_pu->b2_pred_mode) || (PRED_L0 == ps_pu->b2_pred_mode))
6041             {
6042                 /* set the pred l0 flag for neighbour storage */
6043                 ps_nbr_4x4->b1_pred_l0_flag = 1;
6044             }
6045 
6046             /* calculate the MVDs and popluate the MVP idx for L1 */
6047             if((PRED_BI == ps_pu->b2_pred_mode) || (PRED_L1 == ps_pu->b2_pred_mode))
6048             {
6049                 /* set the pred l1 flag for neighbour storage */
6050                 ps_nbr_4x4->b1_pred_l1_flag = 1;
6051             }
6052         }
6053 
6054         /* RD opt cost computation is part of cu_ntu func hence here it is set to 0 */
6055         rdopt_cost = 0;
6056 
6057         /* copy the MV to colocated Mv structure */
6058         ps_final_prms->as_col_pu_enc_loop[ctr].s_l0_mv = ps_pu->mv.s_l0_mv;
6059         ps_final_prms->as_col_pu_enc_loop[ctr].s_l1_mv = ps_pu->mv.s_l1_mv;
6060         ps_final_prms->as_col_pu_enc_loop[ctr].i1_l0_ref_idx = ps_pu->mv.i1_l0_ref_idx;
6061         ps_final_prms->as_col_pu_enc_loop[ctr].i1_l1_ref_idx = ps_pu->mv.i1_l1_ref_idx;
6062         ps_final_prms->as_col_pu_enc_loop[ctr].b2_pred_mode = ps_pu->b2_pred_mode;
6063         ps_final_prms->as_col_pu_enc_loop[ctr].b1_intra_flag = 0;
6064 
6065         /* replicate neighbour 4x4 strcuture for entire partition */
6066         {
6067             WORD32 i, j;
6068             nbr_4x4_t *ps_tmp_4x4;
6069 
6070             ps_tmp_4x4 = ps_nbr_4x4;
6071 
6072             for(i = 0; i < (inter_pu_ht >> 2); i++)
6073             {
6074                 for(j = 0; j < (inter_pu_wd >> 2); j++)
6075                 {
6076                     ps_tmp_4x4[j] = *ps_nbr_4x4;
6077                 }
6078                 /* row level update*/
6079                 ps_tmp_4x4 += (cu_size >> 2);
6080             }
6081         }
6082         /* set the neighbour map to 1 */
6083         ihevce_set_inter_nbr_map(
6084             ps_ctxt->pu1_ctb_nbr_map,
6085             ps_ctxt->i4_nbr_map_strd,
6086             cu_pos_x,
6087             cu_pos_y,
6088             (inter_pu_wd >> 2),
6089             (inter_pu_ht >> 2),
6090             1);
6091         /* ----------- Motion Compensation for Luma ----------- */
6092 #if !ENABLE_MIXED_INTER_MODE_EVAL
6093         {
6094             IV_API_CALL_STATUS_T valid_mv_cand;
6095 
6096             /*If the inter candidate is neither merge cand nor skip cand
6097             then calculate the mc.*/
6098             if(0 == skip_or_merge_flag || (ps_ctxt->u1_high_speed_cu_dec_on))
6099             {
6100                 valid_mv_cand =
6101                     ihevce_luma_inter_pred_pu(&ps_ctxt->s_mc_ctxt, ps_pu, pu1_pred, pred_stride, 0);
6102 
6103                 /* assert if the MC is given a valid mv candidate */
6104                 ASSERT(valid_mv_cand == IV_SUCCESS);
6105             }
6106         }
6107 #endif
6108         if((2 == num_cu_part) && (0 == ctr))
6109         {
6110             /* 2Nx__ partion case */
6111             if(inter_pu_wd == cu_size)
6112             {
6113                 cu_pos_y += (inter_pu_ht >> 2);
6114                 pu1_pred += (inter_pu_ht * pred_stride);
6115                 ps_nbr_4x4 += (inter_pu_ht >> 2) * (cu_size >> 2);
6116                 ps_left_nbr_4x4 += (inter_pu_ht >> 2) * nbr_4x4_left_strd;
6117                 ps_top_nbr_4x4 = ps_nbr_4x4 - (cu_size >> 2);
6118                 ps_topleft_nbr_4x4 = ps_left_nbr_4x4 - nbr_4x4_left_strd;
6119             }
6120 
6121             /* __x2N partion case */
6122             if(inter_pu_ht == cu_size)
6123             {
6124                 cu_pos_x += (inter_pu_wd >> 2);
6125                 pu1_pred += inter_pu_wd;
6126                 ps_nbr_4x4 += (inter_pu_wd >> 2);
6127                 ps_left_nbr_4x4 = ps_nbr_4x4 - 1;
6128                 ps_top_nbr_4x4 += (inter_pu_wd >> 2);
6129                 ps_topleft_nbr_4x4 = ps_top_nbr_4x4 - 1;
6130                 nbr_4x4_left_strd = (cu_size >> 2);
6131             }
6132         }
6133     }
6134 
6135     return (rdopt_cost);
6136 }
6137 
6138 /*!
6139 ******************************************************************************
6140 * \if Function name : ihevce_intra_chroma_pred_mode_selector \endif
6141 *
6142 * \brief
6143 *    Coding unit processing function for chroma special modes (Non-Luma modes)
6144 *
6145 * \param[in] ps_ctxt       enc_loop module ctxt pointer
6146 * \param[in] ps_chrm_cu_buf_prms    ctxt having chroma related prms
6147 * \param[in] ps_cu_analyse      pointer to cu analyse
6148 * \param[in] rd_opt_curr_idx    index in the array of RDopt params
6149 * \param[in] tu_mode            TU_EQ_CU or other case
6150 *
6151 * \return
*    Stores the best SATD mode, its RDOPT cost, CABAC state, TU bits
6153 *
6154 * \author
6155 *  Ittiam
6156 *
6157 *****************************************************************************
6158 */
/*!
******************************************************************************
* \if Function name : ihevce_distortion_based_intra_chroma_mode_selector \endif
*
* \brief
*    Chooses a chroma intra prediction mode purely on Hadamard SATD cost.
*
*    Four candidate modes (0, 1, 10, 26 - planar, DC, horizontal and vertical
*    in HEVC mode numbering) are tried in that order. When u1_is_422 is 1 each
*    candidate is first remapped through gau1_chroma422_intra_angle_mapping.
*    For every TU of the CU (and for both halves of a TU when u1_is_422 is 1)
*    the reference samples are taken from the source buffer itself (left,
*    top and top-left of the current block), the prediction is written to
*    pu1_pred and the Cb and Cr SATDs are added to the cost of the candidate.
*    If both u1_is_cu_noisy and i4_alpha_stim_multiplier are non-zero, each
*    SATD is passed through ihevce_inject_stim_into_distortion before being
*    accumulated.
*
*    Side effects visible to the caller:
*      - pu1_pred is overwritten by every candidate, so on return it holds the
*        prediction of the last candidate evaluated, not necessarily the
*        winning one.
*      - the CU area of pu1_ctb_nbr_map is marked TU by TU while a candidate
*        is evaluated and is reset to 0 after each candidate.
*
* \param[in] ps_cu_analyse       CU info; cu position and cu size are read
* \param[in] pf_ref_substitution chroma reference substitution function
* \param[in] ppf_chroma_ip       chroma intra pred functions, indexed through
*                                g_i4_ip_funcs[mode]
* \param[in] ppf_resd_trns_had   residue + HAD functions, indexed with
*                                (u1_trans_idx - 1)
* \param[in] pu1_src             chroma source of the CU (Cr is addressed one
*                                byte after Cb)
* \param[in] i4_src_stride       stride of pu1_src
* \param[out] pu1_pred           chroma prediction buffer
* \param[in] i4_pred_stride      stride of pu1_pred
* \param[in,out] pu1_ctb_nbr_map neighbour availability map of the CTB
* \param[in] i4_nbr_map_strd     stride of pu1_ctb_nbr_map
* \param[out] pu1_ref_sub_out    scratch buffer for substituted references
* \param[in] i4_alpha_stim_multiplier  stim weight; 0 disables stim injection
* \param[in] u1_is_cu_noisy      0 disables stim injection
* \param[in] u1_trans_size       chroma transform size
* \param[in] u1_trans_idx        1-based index into ppf_resd_trns_had
* \param[in] u1_num_tus_in_cu    number of TUs evaluated per candidate
* \param[in] u1_num_4x4_luma_blks_in_tu  TU extent in neighbour map units
* \param[in] u1_enable_psyRDOPT  forwarded to ihevce_inject_stim_into_distortion
* \param[in] u1_is_422           1 for 4:2:2 (two vertically stacked sub TUs)
*
* \return
*    Candidate mode (after the 4:2:2 remap, if any) with the least cost.
*    The earliest candidate wins a tie.
*
*****************************************************************************
*/
UWORD8 ihevce_distortion_based_intra_chroma_mode_selector(
    cu_analyse_t *ps_cu_analyse,
    ihevc_intra_pred_chroma_ref_substitution_ft *pf_ref_substitution,
    pf_intra_pred *ppf_chroma_ip,
    pf_res_trans_luma_had_chroma *ppf_resd_trns_had,
    UWORD8 *pu1_src,
    WORD32 i4_src_stride,
    UWORD8 *pu1_pred,
    WORD32 i4_pred_stride,
    UWORD8 *pu1_ctb_nbr_map,
    WORD32 i4_nbr_map_strd,
    UWORD8 *pu1_ref_sub_out,
    WORD32 i4_alpha_stim_multiplier,
    UWORD8 u1_is_cu_noisy,
    UWORD8 u1_trans_size,
    UWORD8 u1_trans_idx,
    UWORD8 u1_num_tus_in_cu,
    UWORD8 u1_num_4x4_luma_blks_in_tu,
    UWORD8 u1_enable_psyRDOPT,
    UWORD8 u1_is_422)
{
    /* candidate modes, tried in this order */
    UWORD8 au1_cand_modes[4] = { 0, 1, 10, 26 };

    WORD32 i4_cand_idx;
    UWORD8 u1_tu_idx;
    WORD32 i4_half_idx;

    /* CU origin in neighbour map units */
    UWORD8 u1_cu_pos_x = (ps_cu_analyse->b3_cu_pos_x << 1);
    UWORD8 u1_cu_pos_y = (ps_cu_analyse->b3_cu_pos_y << 1);

    /* 4:2:2 TUs are processed as two vertically stacked halves */
    WORD32 i4_halves_per_tu = u1_is_422 + 1;

    /* width of one TU row in bytes (Cb and Cr share the row) */
    WORD32 i4_tu_wd_bytes = u1_trans_size << 1;

    /* stim is injected only for noisy CUs with a non-zero multiplier */
    WORD32 i4_use_stim = (u1_is_cu_noisy && i4_alpha_stim_multiplier);

    WORD32 i4_min_cost = INT_MAX;
    UWORD8 u1_winner = 0;

    for(i4_cand_idx = 0; i4_cand_idx < 4; i4_cand_idx++)
    {
        UWORD8 u1_mode;
        WORD32 i4_mode_cost = 0;

        if(u1_is_422 == 1)
        {
            u1_mode = gau1_chroma422_intra_angle_mapping[au1_cand_modes[i4_cand_idx]];
        }
        else
        {
            u1_mode = au1_cand_modes[i4_cand_idx];
        }

        /* walk the TUs of the CU */
        for(u1_tu_idx = 0; u1_tu_idx < u1_num_tus_in_cu; u1_tu_idx++)
        {
            WORD32 i4_luma_avail;

            /* TU index bit 0 selects the right column, index > 1 the bottom row */
            WORD32 i4_in_right_col = (u1_tu_idx & 1);
            WORD32 i4_in_bottom_row = (u1_tu_idx > 1);

            WORD32 i4_tu_pos_x = u1_cu_pos_x + (i4_in_right_col * u1_num_4x4_luma_blks_in_tu);
            WORD32 i4_tu_pos_y = u1_cu_pos_y + (i4_in_bottom_row * u1_num_4x4_luma_blks_in_tu);

            WORD32 i4_col_offset = i4_in_right_col * i4_tu_wd_bytes;
            WORD32 i4_src_row_offset = (i4_in_bottom_row * u1_trans_size * i4_src_stride)
                                       << u1_is_422;
            WORD32 i4_pred_row_offset = (i4_in_bottom_row * u1_trans_size * i4_pred_stride)
                                        << u1_is_422;

            UWORD8 *pu1_tu_src = pu1_src + i4_col_offset + i4_src_row_offset;
            UWORD8 *pu1_tu_pred = pu1_pred + i4_col_offset + i4_pred_row_offset;

            i4_luma_avail = ihevce_get_nbr_intra_mxn_tu(
                pu1_ctb_nbr_map,
                i4_nbr_map_strd,
                i4_tu_pos_x,
                i4_tu_pos_y,
                u1_num_4x4_luma_blks_in_tu,
                u1_num_4x4_luma_blks_in_tu);

            for(i4_half_idx = 0; i4_half_idx < i4_halves_per_tu; i4_half_idx++)
            {
                WORD32 i4_chroma_avail;
                WORD32 i4_pred_fn_idx;

                WORD32 i4_is_lower_half = (i4_half_idx == 1);
                UWORD8 *pu1_blk_src =
                    pu1_tu_src + (i4_is_lower_half * u1_trans_size * i4_src_stride);
                UWORD8 *pu1_blk_pred =
                    pu1_tu_pred + (i4_is_lower_half * u1_trans_size * i4_pred_stride);

                /* neighbours are read straight from the source buffer */
                UWORD8 *pu1_above = pu1_blk_src - i4_src_stride;

                i4_chroma_avail = ihevce_get_intra_chroma_tu_nbr(
                    i4_luma_avail, i4_half_idx, u1_trans_size, u1_is_422);

                /* build the substituted reference array */
                pf_ref_substitution(
                    pu1_above - 2,
                    pu1_above,
                    pu1_blk_src - 2,
                    i4_src_stride,
                    u1_trans_size,
                    i4_chroma_avail,
                    pu1_ref_sub_out,
                    1);

                /* mode -> prediction function look up, then predict */
                i4_pred_fn_idx = g_i4_ip_funcs[u1_mode];

                ppf_chroma_ip[i4_pred_fn_idx](
                    pu1_ref_sub_out, 1, pu1_blk_pred, i4_pred_stride, u1_trans_size, u1_mode);

                if(i4_use_stim)
                {
                    WORD32 i4_plane_cost;

                    /* Cb : HAD satd followed by stim injection */
                    i4_plane_cost = ppf_resd_trns_had[u1_trans_idx - 1](
                        pu1_blk_src, i4_src_stride, pu1_blk_pred, i4_pred_stride, NULL, 0);

                    i4_plane_cost = ihevce_inject_stim_into_distortion(
                        pu1_blk_src,
                        i4_src_stride,
                        pu1_blk_pred,
                        i4_pred_stride,
                        i4_plane_cost,
                        i4_alpha_stim_multiplier,
                        u1_trans_size,
                        0,
                        u1_enable_psyRDOPT,
                        U_PLANE);

                    i4_mode_cost += i4_plane_cost;

                    /* Cr : HAD satd followed by stim injection */
                    i4_plane_cost = ppf_resd_trns_had[u1_trans_idx - 1](
                        pu1_blk_src + 1, i4_src_stride, pu1_blk_pred + 1, i4_pred_stride, NULL, 0);

                    i4_plane_cost = ihevce_inject_stim_into_distortion(
                        pu1_blk_src,
                        i4_src_stride,
                        pu1_blk_pred,
                        i4_pred_stride,
                        i4_plane_cost,
                        i4_alpha_stim_multiplier,
                        u1_trans_size,
                        0,
                        u1_enable_psyRDOPT,
                        V_PLANE);

                    i4_mode_cost += i4_plane_cost;
                }
                else
                {
                    /* Cb : plain HAD satd */
                    i4_mode_cost += ppf_resd_trns_had[u1_trans_idx - 1](
                        pu1_blk_src, i4_src_stride, pu1_blk_pred, i4_pred_stride, NULL, 0);

                    /* Cr : plain HAD satd */
                    i4_mode_cost += ppf_resd_trns_had[u1_trans_idx - 1](
                        pu1_blk_src + 1, i4_src_stride, pu1_blk_pred + 1, i4_pred_stride, NULL, 0);
                }
            }

            /* this TU is now available as a neighbour for the following TUs */
            ihevce_set_nbr_map(
                pu1_ctb_nbr_map,
                i4_nbr_map_strd,
                i4_tu_pos_x,
                i4_tu_pos_y,
                u1_num_4x4_luma_blks_in_tu,
                1);
        }

        /* candidate done : clear the whole CU in the neighbour map again */
        ihevce_set_nbr_map(
            pu1_ctb_nbr_map,
            i4_nbr_map_strd,
            (ps_cu_analyse->b3_cu_pos_x << 1),
            (ps_cu_analyse->b3_cu_pos_y << 1),
            (ps_cu_analyse->u1_cu_size >> 2),
            0);

        /* keep the cheapest candidate; the earlier one wins a tie */
        if(i4_mode_cost < i4_min_cost)
        {
            i4_min_cost = i4_mode_cost;
            u1_winner = u1_mode;
        }
    }

    return u1_winner;
}
6337 
ihevce_intra_chroma_pred_mode_selector(ihevce_enc_loop_ctxt_t * ps_ctxt,enc_loop_chrm_cu_buf_prms_t * ps_chrm_cu_buf_prms,cu_analyse_t * ps_cu_analyse,WORD32 rd_opt_curr_idx,WORD32 tu_mode,WORD32 i4_alpha_stim_multiplier,UWORD8 u1_is_cu_noisy)6338 void ihevce_intra_chroma_pred_mode_selector(
6339     ihevce_enc_loop_ctxt_t *ps_ctxt,
6340     enc_loop_chrm_cu_buf_prms_t *ps_chrm_cu_buf_prms,
6341     cu_analyse_t *ps_cu_analyse,
6342     WORD32 rd_opt_curr_idx,
6343     WORD32 tu_mode,
6344     WORD32 i4_alpha_stim_multiplier,
6345     UWORD8 u1_is_cu_noisy)
6346 {
6347     chroma_intra_satd_ctxt_t *ps_chr_intra_satd_ctxt;
6348 
6349     ihevc_intra_pred_chroma_ref_substitution_ft *ihevc_intra_pred_chroma_ref_substitution_fptr;
6350 
6351     UWORD8 *pu1_pred;
6352     WORD32 trans_size;
6353     WORD32 num_tus_in_cu;
6354     WORD32 pred_strd;
6355     WORD32 ctr;
6356     WORD32 i4_subtu_idx;
6357     WORD32 i4_num_sub_tus;
6358     WORD32 trans_idx;
6359     WORD32 scan_idx;
6360     WORD32 num_4x4_luma_in_tu;
6361     WORD32 cu_pos_x;
6362     WORD32 cu_pos_y;
6363 
6364     recon_datastore_t *aps_recon_datastore[2] = { &ps_ctxt->as_cu_prms[0].s_recon_datastore,
6365                                                   &ps_ctxt->as_cu_prms[1].s_recon_datastore };
6366 
6367     LWORD64 chrm_cod_cost = 0;
6368     WORD32 chrm_tu_bits = 0;
6369     WORD32 best_chrm_mode = DM_CHROMA_IDX;
6370     UWORD8 *pu1_chrm_src = ps_chrm_cu_buf_prms->pu1_curr_src;
6371     WORD32 chrm_src_stride = ps_chrm_cu_buf_prms->i4_chrm_src_stride;
6372     UWORD8 *pu1_cu_left = ps_chrm_cu_buf_prms->pu1_cu_left;
6373     UWORD8 *pu1_cu_top = ps_chrm_cu_buf_prms->pu1_cu_top;
6374     UWORD8 *pu1_cu_top_left = ps_chrm_cu_buf_prms->pu1_cu_top_left;
6375     WORD32 cu_left_stride = ps_chrm_cu_buf_prms->i4_cu_left_stride;
6376     WORD32 cu_size = ps_cu_analyse->u1_cu_size;
6377     WORD32 i4_perform_rdoq = ps_ctxt->s_rdoq_sbh_ctxt.i4_perform_all_cand_rdoq;
6378     WORD32 i4_perform_sbh = ps_ctxt->s_rdoq_sbh_ctxt.i4_perform_all_cand_sbh;
6379     UWORD8 u1_is_422 = (ps_ctxt->u1_chroma_array_type == 2);
6380 
6381     ihevc_intra_pred_chroma_ref_substitution_fptr =
6382         ps_ctxt->ps_func_selector->ihevc_intra_pred_chroma_ref_substitution_fptr;
6383     i4_num_sub_tus = (u1_is_422 == 1) + 1;
6384 
6385 #if DISABLE_RDOQ_INTRA
6386     i4_perform_rdoq = 0;
6387 #endif
6388 
6389     if(TU_EQ_CU == tu_mode)
6390     {
6391         num_tus_in_cu = 1;
6392         trans_size = cu_size >> 1;
6393         num_4x4_luma_in_tu = trans_size >> 1; /*at luma level*/
6394         ps_chr_intra_satd_ctxt = &ps_ctxt->s_chroma_rdopt_ctxt.as_chr_intra_satd_ctxt[tu_mode];
6395     }
6396     else
6397     {
6398         num_tus_in_cu = 4;
6399         trans_size = cu_size >> 2;
6400         num_4x4_luma_in_tu = trans_size >> 1; /*at luma level*/
6401 
6402         /* For 8x8 CU only one TU */
6403         if(MIN_TU_SIZE > trans_size)
6404         {
6405             trans_size = MIN_TU_SIZE;
6406             num_tus_in_cu = 1;
6407             /* chroma nbr avail. is derived based on luma.
6408             for 4x4 chrm use 8x8 luma's size */
6409             num_4x4_luma_in_tu = num_4x4_luma_in_tu << 1;
6410         }
6411 
6412         ps_chr_intra_satd_ctxt = &ps_ctxt->s_chroma_rdopt_ctxt.as_chr_intra_satd_ctxt[tu_mode];
6413     }
6414 
6415     /* Can't be TU_EQ_SUBCU case */
6416     ASSERT(TU_EQ_SUBCU != tu_mode);
6417 
6418     /* translate the transform size to index */
6419     trans_idx = trans_size >> 2;
6420 
6421     pu1_pred = (UWORD8 *)ps_chr_intra_satd_ctxt->pv_pred_data;
6422 
6423     pred_strd = ps_chr_intra_satd_ctxt->i4_pred_stride;
6424 
6425     /* for 16x16 cases */
6426     if(16 == trans_size)
6427     {
6428         trans_idx = 3;
6429     }
6430 
6431     best_chrm_mode = ihevce_distortion_based_intra_chroma_mode_selector(
6432         ps_cu_analyse,
6433         ihevc_intra_pred_chroma_ref_substitution_fptr,
6434         ps_ctxt->apf_chrm_ip,
6435         ps_ctxt->apf_chrm_resd_trns_had,
6436         pu1_chrm_src,
6437         chrm_src_stride,
6438         pu1_pred,
6439         pred_strd,
6440         ps_ctxt->pu1_ctb_nbr_map,
6441         ps_ctxt->i4_nbr_map_strd,
6442         (UWORD8 *)ps_ctxt->pv_ref_sub_out,
6443         i4_alpha_stim_multiplier,
6444         u1_is_cu_noisy,
6445         trans_size,
6446         trans_idx,
6447         num_tus_in_cu,
6448         num_4x4_luma_in_tu,
6449         ps_ctxt->u1_enable_psyRDOPT,
6450         u1_is_422);
6451 
6452     /* Store the best chroma mode */
6453     ps_chr_intra_satd_ctxt->u1_best_cr_mode = best_chrm_mode;
6454 
6455     /* evaluate RDOPT cost for the Best mode */
6456     {
6457         WORD32 i4_subtu_pos_x;
6458         WORD32 i4_subtu_pos_y;
6459         UWORD8 u1_compute_spatial_ssd;
6460 
6461         WORD32 ai4_total_bytes_offset_cb[2] = { 0, 0 };
6462         WORD32 ai4_total_bytes_offset_cr[2] = { 0, 0 };
6463         /* State for prefix bin of chroma intra pred mode before CU encode */
6464         UWORD8 u1_chroma_intra_mode_prefix_state =
6465             ps_ctxt->au1_rdopt_init_ctxt_models[IHEVC_CAB_CHROMA_PRED_MODE];
6466         WORD32 luma_trans_size = trans_size << 1;
6467         WORD32 calc_recon = 0;
6468         UWORD8 *pu1_left = pu1_cu_left;
6469         UWORD8 *pu1_top = pu1_cu_top;
6470         UWORD8 *pu1_top_left = pu1_cu_top_left;
6471         WORD32 left_strd = cu_left_stride;
6472 
6473         if(ps_ctxt->i1_cu_qp_delta_enable)
6474         {
6475             ihevce_update_cu_level_qp_lamda(ps_ctxt, ps_cu_analyse, luma_trans_size, 1);
6476         }
6477 
6478         u1_compute_spatial_ssd = (ps_ctxt->i4_cu_qp <= MAX_QP_WHERE_SPATIAL_SSD_ENABLED) &&
6479                                  (ps_ctxt->i4_quality_preset < IHEVCE_QUALITY_P3) &&
6480                                  CONVERT_SSDS_TO_SPATIAL_DOMAIN;
6481 
6482         if(u1_is_cu_noisy || ps_ctxt->u1_enable_psyRDOPT)
6483         {
6484             u1_compute_spatial_ssd = (ps_ctxt->i4_cu_qp <= MAX_HEVC_QP) &&
6485                                      CONVERT_SSDS_TO_SPATIAL_DOMAIN;
6486         }
6487 
6488         /* get the 4x4 level postion of current cu */
6489         cu_pos_x = (ps_cu_analyse->b3_cu_pos_x << 1);
6490         cu_pos_y = (ps_cu_analyse->b3_cu_pos_y << 1);
6491 
6492         calc_recon = !u1_compute_spatial_ssd && ((4 == num_tus_in_cu) || (u1_is_422 == 1));
6493 
6494         if(calc_recon || u1_compute_spatial_ssd)
6495         {
6496             aps_recon_datastore[0]->au1_is_chromaRecon_available[1 + (num_tus_in_cu > 1)] = 1;
6497             aps_recon_datastore[1]->au1_is_chromaRecon_available[1 + (num_tus_in_cu > 1)] = 1;
6498         }
6499         else
6500         {
6501             aps_recon_datastore[0]->au1_is_chromaRecon_available[1 + (num_tus_in_cu > 1)] = 0;
6502             aps_recon_datastore[1]->au1_is_chromaRecon_available[1 + (num_tus_in_cu > 1)] = 0;
6503         }
6504 
6505         /* loop based on num tus in a cu */
6506         for(ctr = 0; ctr < num_tus_in_cu; ctr++)
6507         {
6508             WORD16 *pi2_cur_deq_data_cb;
6509             WORD16 *pi2_cur_deq_data_cr;
6510 
6511             WORD32 deq_data_strd = ps_chr_intra_satd_ctxt->i4_iq_buff_stride;
6512             WORD32 luma_nbr_flags = 0;
6513 
6514             luma_nbr_flags = ihevce_get_nbr_intra_mxn_tu(
6515                 ps_ctxt->pu1_ctb_nbr_map,
6516                 ps_ctxt->i4_nbr_map_strd,
6517                 (ctr & 1) * (luma_trans_size >> 2) + cu_pos_x,
6518                 (ctr > 1) * (luma_trans_size >> 2) + cu_pos_y,
6519                 (luma_trans_size >> 2),
6520                 (luma_trans_size >> 2));
6521 
6522             for(i4_subtu_idx = 0; i4_subtu_idx < i4_num_sub_tus; i4_subtu_idx++)
6523             {
6524                 WORD32 cbf, num_bytes;
6525                 LWORD64 trans_ssd_u, trans_ssd_v;
6526                 UWORD8 u1_is_recon_available;
6527 
6528                 WORD32 trans_size_m2 = trans_size << 1;
6529                 UWORD8 *pu1_cur_src = pu1_chrm_src + ((ctr & 1) * trans_size_m2) +
6530                                       (((ctr > 1) * trans_size * chrm_src_stride) << u1_is_422) +
6531                                       (i4_subtu_idx * trans_size * chrm_src_stride);
6532                 UWORD8 *pu1_cur_pred = pu1_pred + ((ctr & 1) * trans_size_m2) +
6533                                        (((ctr > 1) * trans_size * pred_strd) << u1_is_422) +
6534                                        (i4_subtu_idx * trans_size * pred_strd);
6535                 WORD32 i4_recon_stride = aps_recon_datastore[0]->i4_chromaRecon_stride;
6536                 UWORD8 *pu1_cur_recon = ((UWORD8 *)aps_recon_datastore[0]
6537                                              ->apv_chroma_recon_bufs[1 + (num_tus_in_cu > 1)]) +
6538                                         ((ctr & 1) * trans_size_m2) +
6539                                         (((ctr > 1) * trans_size * i4_recon_stride) << u1_is_422) +
6540                                         (i4_subtu_idx * trans_size * i4_recon_stride);
6541 
6542                 /* Use Chroma coeff/iq buf of the cur. intra cand. Not rememb.
6543                 chroma coeff/iq for high quality intra SATD special modes. Will
6544                 be over written by coeff of luma mode in chroma_rdopt call */
6545                 UWORD8 *pu1_ecd_data_cb =
6546                     &ps_chr_intra_satd_ctxt->au1_scan_coeff_cb[i4_subtu_idx][0];
6547                 UWORD8 *pu1_ecd_data_cr =
6548                     &ps_chr_intra_satd_ctxt->au1_scan_coeff_cr[i4_subtu_idx][0];
6549 
6550                 WORD32 chrm_pred_func_idx = 0;
6551                 LWORD64 curr_cb_cod_cost = 0;
6552                 LWORD64 curr_cr_cod_cost = 0;
6553                 WORD32 nbr_flags = 0;
6554 
6555                 i4_subtu_pos_x = (((ctr & 1) * trans_size_m2) >> 2);
6556                 i4_subtu_pos_y = (((ctr > 1) * trans_size) >> (!u1_is_422 + 1)) +
6557                                  ((i4_subtu_idx * trans_size) >> 2);
6558                 pi2_cur_deq_data_cb = &ps_chr_intra_satd_ctxt->ai2_iq_data_cb[0] +
6559                                       ((ctr & 1) * trans_size) +
6560                                       (((ctr > 1) * trans_size * deq_data_strd) << u1_is_422) +
6561                                       (i4_subtu_idx * trans_size * deq_data_strd);
6562                 pi2_cur_deq_data_cr = &ps_chr_intra_satd_ctxt->ai2_iq_data_cr[0] +
6563                                       ((ctr & 1) * trans_size) +
6564                                       (((ctr > 1) * trans_size * deq_data_strd) << u1_is_422) +
6565                                       (i4_subtu_idx * trans_size * deq_data_strd);
6566 
6567                 /* left cu boundary */
6568                 if(0 == i4_subtu_pos_x)
6569                 {
6570                     left_strd = cu_left_stride;
6571                     pu1_left = pu1_cu_left + (i4_subtu_pos_y << 2) * left_strd;
6572                 }
6573                 else
6574                 {
6575                     pu1_left = pu1_cur_recon - 2;
6576                     left_strd = i4_recon_stride;
6577                 }
6578 
6579                 /* top cu boundary */
6580                 if(0 == i4_subtu_pos_y)
6581                 {
6582                     pu1_top = pu1_cu_top + (i4_subtu_pos_x << 2);
6583                 }
6584                 else
6585                 {
6586                     pu1_top = pu1_cur_recon - i4_recon_stride;
6587                 }
6588 
6589                 /* by default top left is set to cu top left */
6590                 pu1_top_left = pu1_cu_top_left;
6591 
6592                 /* top left based on position */
6593                 if((0 != i4_subtu_pos_y) && (0 == i4_subtu_pos_x))
6594                 {
6595                     pu1_top_left = pu1_left - left_strd;
6596                 }
6597                 else if(0 != i4_subtu_pos_x)
6598                 {
6599                     pu1_top_left = pu1_top - 2;
6600                 }
6601 
6602                 /* populate the coeffs scan idx */
6603                 scan_idx = SCAN_DIAG_UPRIGHT;
6604 
6605                 /* RDOPT copy States :  TU init (best until prev TU) to current */
6606                 COPY_CABAC_STATES(
6607                     &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[rd_opt_curr_idx]
6608                          .s_cabac_ctxt.au1_ctxt_models[0],
6609                     &ps_ctxt->au1_rdopt_init_ctxt_models[0],
6610                     IHEVC_CAB_CTXT_END);
6611 
6612                 /* for 4x4 transforms based on intra pred mode scan is choosen*/
6613                 if(4 == trans_size)
6614                 {
6615                     /* for modes from 22 upto 30 horizontal scan is used */
6616                     if((best_chrm_mode > 21) && (best_chrm_mode < 31))
6617                     {
6618                         scan_idx = SCAN_HORZ;
6619                     }
6620                     /* for modes from 6 upto 14 horizontal scan is used */
6621                     else if((best_chrm_mode > 5) && (best_chrm_mode < 15))
6622                     {
6623                         scan_idx = SCAN_VERT;
6624                     }
6625                 }
6626 
6627                 nbr_flags = ihevce_get_intra_chroma_tu_nbr(
6628                     luma_nbr_flags, i4_subtu_idx, trans_size, u1_is_422);
6629 
6630                 /* call the chroma reference array substitution */
6631                 ihevc_intra_pred_chroma_ref_substitution_fptr(
6632                     pu1_top_left,
6633                     pu1_top,
6634                     pu1_left,
6635                     left_strd,
6636                     trans_size,
6637                     nbr_flags,
6638                     (UWORD8 *)ps_ctxt->pv_ref_sub_out,
6639                     1);
6640 
6641                 /* use the look up to get the function idx */
6642                 chrm_pred_func_idx = g_i4_ip_funcs[best_chrm_mode];
6643 
6644                 /* call the intra prediction function */
6645                 ps_ctxt->apf_chrm_ip[chrm_pred_func_idx](
6646                     (UWORD8 *)ps_ctxt->pv_ref_sub_out,
6647                     1,
6648                     pu1_cur_pred,
6649                     pred_strd,
6650                     trans_size,
6651                     best_chrm_mode);
6652 
6653                 /* UPLANE RDOPT Loop */
6654                 {
6655                     WORD32 tu_bits;
6656 
6657                     cbf = ihevce_chroma_t_q_iq_ssd_scan_fxn(
6658                         ps_ctxt,
6659                         pu1_cur_pred,
6660                         pred_strd,
6661                         pu1_cur_src,
6662                         chrm_src_stride,
6663                         pi2_cur_deq_data_cb,
6664                         deq_data_strd,
6665                         pu1_cur_recon,
6666                         i4_recon_stride,
6667                         pu1_ecd_data_cb + ai4_total_bytes_offset_cb[i4_subtu_idx],
6668                         ps_ctxt->au1_cu_csbf,
6669                         ps_ctxt->i4_cu_csbf_strd,
6670                         trans_size,
6671                         scan_idx,
6672                         1,
6673                         &num_bytes,
6674                         &tu_bits,
6675                         &ps_chr_intra_satd_ctxt->ai4_zero_col_cb[i4_subtu_idx][ctr],
6676                         &ps_chr_intra_satd_ctxt->ai4_zero_row_cb[i4_subtu_idx][ctr],
6677                         &u1_is_recon_available,
6678                         i4_perform_sbh,
6679                         i4_perform_rdoq,
6680                         &trans_ssd_u,
6681 #if USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
6682                         i4_alpha_stim_multiplier,
6683                         u1_is_cu_noisy,
6684 #endif
6685                         0,
6686                         u1_compute_spatial_ssd ? SPATIAL_DOMAIN_SSD : FREQUENCY_DOMAIN_SSD,
6687                         U_PLANE);
6688 
6689 #if !USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS && COMPUTE_NOISE_TERM_AT_THE_TU_LEVEL
6690                     if(u1_is_cu_noisy && i4_alpha_stim_multiplier)
6691                     {
6692 #if !USE_RECON_TO_EVALUATE_STIM_IN_RDOPT
6693                         trans_ssd_u = ihevce_inject_stim_into_distortion(
6694                             pu1_cur_src,
6695                             chrm_src_stride,
6696                             pu1_cur_pred,
6697                             pred_strd,
6698                             trans_ssd_u,
6699                             i4_alpha_stim_multiplier,
6700                             trans_size,
6701                             0,
6702                             ps_ctxt->u1_enable_psyRDOPT,
6703                             U_PLANE);
6704 #else
6705                         if(u1_compute_spatial_ssd && u1_is_recon_available)
6706                         {
6707                             trans_ssd_u = ihevce_inject_stim_into_distortion(
6708                                 pu1_cur_src,
6709                                 chrm_src_stride,
6710                                 pu1_cur_recon,
6711                                 i4_recon_stride,
6712                                 trans_ssd_u,
6713                                 i4_alpha_stim_multiplier,
6714                                 trans_size,
6715                                 0,
6716                                 ps_ctxt->u1_enable_psyRDOPT,
6717                                 U_PLANE);
6718                         }
6719                         else
6720                         {
6721                             trans_ssd_u = ihevce_inject_stim_into_distortion(
6722                                 pu1_cur_src,
6723                                 chrm_src_stride,
6724                                 pu1_cur_pred,
6725                                 pred_strd,
6726                                 trans_ssd_u,
6727                                 i4_alpha_stim_multiplier,
6728                                 trans_size,
6729                                 0,
6730                                 ps_ctxt->u1_enable_psyRDOPT,
6731                                 U_PLANE);
6732                         }
6733 #endif
6734                     }
6735 #endif
6736 
6737                     /* RDOPT copy States :  New updated after curr TU to TU init */
6738                     if(0 != cbf)
6739                     {
6740                         memcpy(
6741                             &ps_ctxt->au1_rdopt_init_ctxt_models[0],
6742                             &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[rd_opt_curr_idx]
6743                                  .s_cabac_ctxt.au1_ctxt_models[0],
6744                             IHEVC_CAB_CTXT_END);
6745                     }
6746                     /* RDOPT copy States :  Restoring back the Cb init state to Cr */
6747                     else
6748                     {
6749                         memcpy(
6750                             &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[rd_opt_curr_idx]
6751                                  .s_cabac_ctxt.au1_ctxt_models[0],
6752                             &ps_ctxt->au1_rdopt_init_ctxt_models[0],
6753                             IHEVC_CAB_CTXT_END);
6754                     }
6755 
6756                     if(calc_recon || (!u1_is_recon_available && u1_compute_spatial_ssd))
6757                     {
6758                         ihevce_chroma_it_recon_fxn(
6759                             ps_ctxt,
6760                             pi2_cur_deq_data_cb,
6761                             deq_data_strd,
6762                             pu1_cur_pred,
6763                             pred_strd,
6764                             pu1_cur_recon,
6765                             i4_recon_stride,
6766                             (pu1_ecd_data_cb + ai4_total_bytes_offset_cb[i4_subtu_idx]),
6767                             trans_size,
6768                             cbf,
6769                             ps_chr_intra_satd_ctxt->ai4_zero_col_cb[i4_subtu_idx][ctr],
6770                             ps_chr_intra_satd_ctxt->ai4_zero_row_cb[i4_subtu_idx][ctr],
6771                             U_PLANE);
6772                     }
6773 
6774                     ps_chr_intra_satd_ctxt->au1_cbf_cb[i4_subtu_idx][ctr] = cbf;
6775                     curr_cb_cod_cost =
6776                         trans_ssd_u +
6777                         COMPUTE_RATE_COST_CLIP30(
6778                             tu_bits, ps_ctxt->i8_cl_ssd_lambda_chroma_qf, LAMBDA_Q_SHIFT);
6779                     chrm_tu_bits += tu_bits;
6780                     ai4_total_bytes_offset_cb[i4_subtu_idx] += num_bytes;
6781                     ps_chr_intra_satd_ctxt->ai4_num_bytes_scan_coeff_cb_per_tu[i4_subtu_idx][ctr] =
6782                         num_bytes;
6783                 }
6784 
6785                 /* VPLANE RDOPT Loop */
6786                 {
6787                     WORD32 tu_bits;
6788 
6789                     cbf = ihevce_chroma_t_q_iq_ssd_scan_fxn(
6790                         ps_ctxt,
6791                         pu1_cur_pred,
6792                         pred_strd,
6793                         pu1_cur_src,
6794                         chrm_src_stride,
6795                         pi2_cur_deq_data_cr,
6796                         deq_data_strd,
6797                         pu1_cur_recon,
6798                         i4_recon_stride,
6799                         pu1_ecd_data_cr + ai4_total_bytes_offset_cr[i4_subtu_idx],
6800                         ps_ctxt->au1_cu_csbf,
6801                         ps_ctxt->i4_cu_csbf_strd,
6802                         trans_size,
6803                         scan_idx,
6804                         1,
6805                         &num_bytes,
6806                         &tu_bits,
6807                         &ps_chr_intra_satd_ctxt->ai4_zero_col_cr[i4_subtu_idx][ctr],
6808                         &ps_chr_intra_satd_ctxt->ai4_zero_row_cr[i4_subtu_idx][ctr],
6809                         &u1_is_recon_available,
6810                         i4_perform_sbh,
6811                         i4_perform_rdoq,
6812                         &trans_ssd_v,
6813 #if USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
6814                         i4_alpha_stim_multiplier,
6815                         u1_is_cu_noisy,
6816 #endif
6817                         0,
6818                         u1_compute_spatial_ssd ? SPATIAL_DOMAIN_SSD : FREQUENCY_DOMAIN_SSD,
6819                         V_PLANE);
6820 
6821 #if !USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS && COMPUTE_NOISE_TERM_AT_THE_TU_LEVEL
6822                     if(u1_is_cu_noisy && i4_alpha_stim_multiplier)
6823                     {
6824 #if !USE_RECON_TO_EVALUATE_STIM_IN_RDOPT
6825                         trans_ssd_v = ihevce_inject_stim_into_distortion(
6826                             pu1_cur_src,
6827                             chrm_src_stride,
6828                             pu1_cur_pred,
6829                             pred_strd,
6830                             trans_ssd_v,
6831                             i4_alpha_stim_multiplier,
6832                             trans_size,
6833                             0,
6834                             ps_ctxt->u1_enable_psyRDOPT,
6835                             V_PLANE);
6836 #else
6837                         if(u1_compute_spatial_ssd && u1_is_recon_available)
6838                         {
6839                             trans_ssd_v = ihevce_inject_stim_into_distortion(
6840                                 pu1_cur_src,
6841                                 chrm_src_stride,
6842                                 pu1_cur_recon,
6843                                 i4_recon_stride,
6844                                 trans_ssd_v,
6845                                 i4_alpha_stim_multiplier,
6846                                 trans_size,
6847                                 0,
6848                                 ps_ctxt->u1_enable_psyRDOPT,
6849                                 V_PLANE);
6850                         }
6851                         else
6852                         {
6853                             trans_ssd_v = ihevce_inject_stim_into_distortion(
6854                                 pu1_cur_src,
6855                                 chrm_src_stride,
6856                                 pu1_cur_pred,
6857                                 pred_strd,
6858                                 trans_ssd_v,
6859                                 i4_alpha_stim_multiplier,
6860                                 trans_size,
6861                                 0,
6862                                 ps_ctxt->u1_enable_psyRDOPT,
6863                                 V_PLANE);
6864                         }
6865 #endif
6866                     }
6867 #endif
6868 
6869                     /* RDOPT copy States :  New updated after curr TU to TU init */
6870                     if(0 != cbf)
6871                     {
6872                         COPY_CABAC_STATES(
6873                             &ps_ctxt->au1_rdopt_init_ctxt_models[0],
6874                             &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[rd_opt_curr_idx]
6875                                  .s_cabac_ctxt.au1_ctxt_models[0],
6876                             IHEVC_CAB_CTXT_END);
6877                     }
6878                     /* RDOPT copy States :  Restoring back the Cb init state to Cr */
6879                     else
6880                     {
6881                         COPY_CABAC_STATES(
6882                             &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[rd_opt_curr_idx]
6883                                  .s_cabac_ctxt.au1_ctxt_models[0],
6884                             &ps_ctxt->au1_rdopt_init_ctxt_models[0],
6885                             IHEVC_CAB_CTXT_END);
6886                     }
6887 
6888                     if(calc_recon || (!u1_is_recon_available && u1_compute_spatial_ssd))
6889                     {
6890                         ihevce_chroma_it_recon_fxn(
6891                             ps_ctxt,
6892                             pi2_cur_deq_data_cr,
6893                             deq_data_strd,
6894                             pu1_cur_pred,
6895                             pred_strd,
6896                             pu1_cur_recon,
6897                             i4_recon_stride,
6898                             (pu1_ecd_data_cr + ai4_total_bytes_offset_cr[i4_subtu_idx]),
6899                             trans_size,
6900                             cbf,
6901                             ps_chr_intra_satd_ctxt->ai4_zero_col_cr[i4_subtu_idx][ctr],
6902                             ps_chr_intra_satd_ctxt->ai4_zero_row_cr[i4_subtu_idx][ctr],
6903                             V_PLANE);
6904                     }
6905 
6906                     ps_chr_intra_satd_ctxt->au1_cbf_cr[i4_subtu_idx][ctr] = cbf;
6907                     curr_cr_cod_cost =
6908                         trans_ssd_v +
6909                         COMPUTE_RATE_COST_CLIP30(
6910                             tu_bits, ps_ctxt->i8_cl_ssd_lambda_chroma_qf, LAMBDA_Q_SHIFT);
6911                     chrm_tu_bits += tu_bits;
6912                     ai4_total_bytes_offset_cr[i4_subtu_idx] += num_bytes;
6913                     ps_chr_intra_satd_ctxt->ai4_num_bytes_scan_coeff_cr_per_tu[i4_subtu_idx][ctr] =
6914                         num_bytes;
6915                 }
6916 
6917                 chrm_cod_cost += curr_cb_cod_cost;
6918                 chrm_cod_cost += curr_cr_cod_cost;
6919             }
6920 
6921             /* set the neighbour map to 1 */
6922             ihevce_set_nbr_map(
6923                 ps_ctxt->pu1_ctb_nbr_map,
6924                 ps_ctxt->i4_nbr_map_strd,
6925                 (ctr & 1) * (luma_trans_size >> 2) + cu_pos_x,
6926                 (ctr > 1) * (luma_trans_size >> 2) + cu_pos_y,
6927                 (luma_trans_size >> 2),
6928                 1);
6929         }
6930 
6931         /* set the neighbour map to 0 */
6932         ihevce_set_nbr_map(
6933             ps_ctxt->pu1_ctb_nbr_map,
6934             ps_ctxt->i4_nbr_map_strd,
6935             (ps_cu_analyse->b3_cu_pos_x << 1),
6936             (ps_cu_analyse->b3_cu_pos_y << 1),
6937             (ps_cu_analyse->u1_cu_size >> 2),
6938             0);
6939 
6940         /* Account for coding b3_chroma_intra_pred_mode prefix and suffix bins */
6941         /* This is done by adding the bits for signalling chroma mode (0-3)    */
6942         /* and subtracting the bits for chroma mode same as luma mode (4)      */
6943 #if CHROMA_RDOPT_ENABLE
6944         {
6945             /* Estimate bits to encode prefix bin as 1 for b3_chroma_intra_pred_mode */
6946             WORD32 bits_frac_1 =
6947                 gau2_ihevce_cabac_bin_to_bits[u1_chroma_intra_mode_prefix_state ^ 1];
6948 
6949             WORD32 bits_for_mode_0to3 = (2 << CABAC_FRAC_BITS_Q) + bits_frac_1;
6950 
6951             /* Estimate bits to encode prefix bin as 0 for b3_chroma_intra_pred_mode */
6952             WORD32 bits_for_mode4 =
6953                 gau2_ihevce_cabac_bin_to_bits[u1_chroma_intra_mode_prefix_state ^ 0];
6954 
6955             /* accumulate into final rd cost for chroma */
6956             ps_chr_intra_satd_ctxt->i8_cost_to_encode_chroma_mode = COMPUTE_RATE_COST_CLIP30(
6957                 (bits_for_mode_0to3 - bits_for_mode4),
6958                 ps_ctxt->i8_cl_ssd_lambda_chroma_qf,
6959                 (LAMBDA_Q_SHIFT + CABAC_FRAC_BITS_Q));
6960 
6961             chrm_cod_cost += ps_chr_intra_satd_ctxt->i8_cost_to_encode_chroma_mode;
6962         }
6963 #endif
6964 
6965         if(ps_ctxt->u1_enable_psyRDOPT)
6966         {
6967             UWORD8 *pu1_recon_cu;
6968             WORD32 recon_stride;
6969             WORD32 curr_pos_x;
6970             WORD32 curr_pos_y;
6971             WORD32 start_index;
6972             WORD32 num_horz_cu_in_ctb;
6973             WORD32 had_block_size;
6974 
6975             /* tODO: sreenivasa ctb size has to be used appropriately */
6976             had_block_size = 8;
6977             num_horz_cu_in_ctb = 2 * 64 / had_block_size;
6978             curr_pos_x = ps_cu_analyse->b3_cu_pos_x << 3; /* pel units */
6979             curr_pos_y = ps_cu_analyse->b3_cu_pos_x << 3; /* pel units */
6980             recon_stride = aps_recon_datastore[0]->i4_chromaRecon_stride;
6981             pu1_recon_cu =
6982                 aps_recon_datastore[0]->apv_chroma_recon_bufs[1 + (num_tus_in_cu > 1)];  //
6983 
6984             /* start index to index the source satd of curr cu int he current ctb*/
6985             start_index = 2 * (curr_pos_x / had_block_size) +
6986                           (curr_pos_y / had_block_size) * num_horz_cu_in_ctb;
6987 
6988             {
6989                 chrm_cod_cost += ihevce_psy_rd_cost_croma(
6990                     ps_ctxt->ai4_source_chroma_satd,
6991                     pu1_recon_cu,
6992                     recon_stride,
6993                     1,  //
6994                     cu_size,
6995                     0,  // pic type
6996                     0,  //layer id
6997                     ps_ctxt->i4_satd_lamda,  // lambda
6998                     start_index,
6999                     ps_ctxt->u1_is_input_data_hbd,  // 8 bit
7000                     ps_ctxt->u1_chroma_array_type,
7001                     &ps_ctxt->s_cmn_opt_func
7002 
7003                 );  // chroma subsampling 420
7004             }
7005         }
7006 
7007         ps_chr_intra_satd_ctxt->i8_chroma_best_rdopt = chrm_cod_cost;
7008         ps_chr_intra_satd_ctxt->i4_chrm_tu_bits = chrm_tu_bits;
7009 
7010         memcpy(
7011             &ps_chr_intra_satd_ctxt->au1_chrm_satd_updated_ctxt_models[0],
7012             &ps_ctxt->au1_rdopt_init_ctxt_models[0],
7013             IHEVC_CAB_CTXT_END);
7014     }
7015 }
7016 
7017 /*!
7018 ******************************************************************************
7019 * \if Function name : ihevce_chroma_cu_prcs_rdopt \endif
7020 *
7021 * \brief
7022 *    Coding unit processing function for chroma
7023 *
7024 * \param[in] ps_ctxt    enc_loop module ctxt pointer
7025 * \param[in] rd_opt_curr_idx index in the array of RDopt params
7026 * \param[in] func_proc_mode TU_EQ_CU or other case
7027 * \param[in] pu1_chrm_src  pointer to source data buffer
7028 * \param[in] chrm_src_stride   source buffer stride
7029 * \param[in] pu1_cu_left pointer to left recon data buffer
7030 * \param[in] pu1_cu_top  pointer to top recon data buffer
7031 * \param[in] pu1_cu_top_left pointer to top left recon data buffer
7032 * \param[in] cu_left_stride left recon buffer stride
7033 * \param[in] cu_pos_x position x of current CU in CTB
7034 * \param[in] cu_pos_y position y of current CU in CTB
7035 * \param[out] pi4_chrm_tu_bits pointer to store the total chroma bits
7036 *
7037 * \return
7038 *    Chroma coding cost (Cb and Cr included)
7039 *
7040 * \author
7041 *  Ittiam
7042 *
7043 *****************************************************************************
7044 */
ihevce_chroma_cu_prcs_rdopt(ihevce_enc_loop_ctxt_t * ps_ctxt,WORD32 rd_opt_curr_idx,WORD32 func_proc_mode,UWORD8 * pu1_chrm_src,WORD32 chrm_src_stride,UWORD8 * pu1_cu_left,UWORD8 * pu1_cu_top,UWORD8 * pu1_cu_top_left,WORD32 cu_left_stride,WORD32 cu_pos_x,WORD32 cu_pos_y,WORD32 * pi4_chrm_tu_bits,WORD32 i4_alpha_stim_multiplier,UWORD8 u1_is_cu_noisy)7045 LWORD64 ihevce_chroma_cu_prcs_rdopt(
7046     ihevce_enc_loop_ctxt_t *ps_ctxt,
7047     WORD32 rd_opt_curr_idx,
7048     WORD32 func_proc_mode,
7049     UWORD8 *pu1_chrm_src,
7050     WORD32 chrm_src_stride,
7051     UWORD8 *pu1_cu_left,
7052     UWORD8 *pu1_cu_top,
7053     UWORD8 *pu1_cu_top_left,
7054     WORD32 cu_left_stride,
7055     WORD32 cu_pos_x,
7056     WORD32 cu_pos_y,
7057     WORD32 *pi4_chrm_tu_bits,
7058     WORD32 i4_alpha_stim_multiplier,
7059     UWORD8 u1_is_cu_noisy)
7060 {
7061     tu_enc_loop_out_t *ps_tu;
7062     tu_enc_loop_temp_prms_t *ps_tu_temp_prms;
7063 
7064     ihevc_intra_pred_chroma_ref_substitution_ft *ihevc_intra_pred_chroma_ref_substitution_fptr;
7065 
7066     UWORD8 *pu1_pred;
7067     UWORD8 *pu1_recon;
7068     WORD32 i4_recon_stride;
7069     WORD32 cu_size, trans_size = 0;
7070     WORD32 pred_strd;
7071     WORD32 ctr, i4_subtu_idx;
7072     WORD32 scan_idx;
7073     WORD32 u1_is_cu_coded_old;
7074     WORD32 init_bytes_offset;
7075 
7076     enc_loop_cu_final_prms_t *ps_best_cu_prms = &ps_ctxt->as_cu_prms[rd_opt_curr_idx];
7077     recon_datastore_t *ps_recon_datastore = &ps_best_cu_prms->s_recon_datastore;
7078 
7079     WORD32 total_bytes_offset = 0;
7080     LWORD64 chrm_cod_cost = 0;
7081     WORD32 chrm_tu_bits = 0;
7082     WORD32 chrm_pred_mode = DM_CHROMA_IDX, luma_pred_mode = 35;
7083     LWORD64 i8_ssd_cb = 0;
7084     WORD32 i4_bits_cb = 0;
7085     LWORD64 i8_ssd_cr = 0;
7086     WORD32 i4_bits_cr = 0;
7087     UWORD8 u1_is_422 = (ps_ctxt->u1_chroma_array_type == 2);
7088     UWORD8 u1_num_tus =
7089         /* NumChromaTU's = 1, if TUSize = 4 and CUSize = 8 */
7090         (!ps_best_cu_prms->as_tu_enc_loop[0].s_tu.b3_size && ps_best_cu_prms->u1_intra_flag)
7091             ? 1
7092             : ps_best_cu_prms->u2_num_tus_in_cu;
7093     UWORD8 u1_num_subtus_in_tu = u1_is_422 + 1;
7094     UWORD8 u1_compute_spatial_ssd = (ps_ctxt->i4_cu_qp <= MAX_QP_WHERE_SPATIAL_SSD_ENABLED) &&
7095                                     (ps_ctxt->i4_quality_preset < IHEVCE_QUALITY_P3) &&
7096                                     CONVERT_SSDS_TO_SPATIAL_DOMAIN;
7097     /* Get the RDOPT cost of the best CU mode for early_exit */
7098     LWORD64 prev_best_rdopt_cost = ps_ctxt->as_cu_prms[!rd_opt_curr_idx].i8_best_rdopt_cost;
7099     /* Get the current running RDOPT (Luma RDOPT) for early_exit */
7100     LWORD64 curr_rdopt_cost = ps_ctxt->as_cu_prms[rd_opt_curr_idx].i8_curr_rdopt_cost;
7101     WORD32 i4_perform_rdoq = ps_ctxt->s_rdoq_sbh_ctxt.i4_perform_all_cand_rdoq;
7102     WORD32 i4_perform_sbh = ps_ctxt->s_rdoq_sbh_ctxt.i4_perform_all_cand_sbh;
7103 
7104     ihevc_intra_pred_chroma_ref_substitution_fptr =
7105         ps_ctxt->ps_func_selector->ihevc_intra_pred_chroma_ref_substitution_fptr;
7106 
7107     if(u1_is_cu_noisy || ps_ctxt->u1_enable_psyRDOPT)
7108     {
7109         u1_compute_spatial_ssd = (ps_ctxt->i4_cu_qp <= MAX_HEVC_QP) &&
7110                                  CONVERT_SSDS_TO_SPATIAL_DOMAIN;
7111     }
7112 
7113     /* Store the init bytes offset from luma */
7114     init_bytes_offset = ps_best_cu_prms->i4_num_bytes_ecd_data;
7115 
7116     /* Unused pred buffer in merge_skip_pred_data_t structure is used as
7117     Chroma pred storage buf. for final_recon function.
7118     The buffer is split into two and used as a ping-pong buffer */
7119     pu1_pred = ps_ctxt->s_cu_me_intra_pred_prms.pu1_pred_data[CU_ME_INTRA_PRED_CHROMA_IDX] +
7120                rd_opt_curr_idx * ((MAX_CTB_SIZE * MAX_CTB_SIZE >> 1) +
7121                                   (u1_is_422 * (MAX_CTB_SIZE * MAX_CTB_SIZE >> 1)));
7122 
7123     pred_strd = ps_ctxt->s_cu_me_intra_pred_prms.ai4_pred_data_stride[CU_ME_INTRA_PRED_CHROMA_IDX];
7124 
7125     pu1_recon = (UWORD8 *)ps_recon_datastore->apv_chroma_recon_bufs[0];
7126     i4_recon_stride = ps_recon_datastore->i4_chromaRecon_stride;
7127     cu_size = ps_best_cu_prms->u1_cu_size;
7128     chrm_tu_bits = 0;
7129 
7130     /* get the first TU pointer */
7131     ps_tu = &ps_best_cu_prms->as_tu_enc_loop[0];
7132     /* get the first TU enc_loop temp prms pointer */
7133     ps_tu_temp_prms = &ps_best_cu_prms->as_tu_enc_loop_temp_prms[0];
7134 
7135     if(PRED_MODE_INTRA == ps_best_cu_prms->u1_intra_flag)
7136     {
7137         /* Mode signalled by intra prediction for luma */
7138         luma_pred_mode = ps_best_cu_prms->au1_intra_pred_mode[0];
7139 
7140 #if DISABLE_RDOQ_INTRA
7141         i4_perform_rdoq = 0;
7142 #endif
7143     }
7144 
7145     else
7146     {
7147         UWORD8 *pu1_pred_org = pu1_pred;
7148 
7149         /* ------ Motion Compensation for Chroma -------- */
7150         for(ctr = 0; ctr < ps_best_cu_prms->u2_num_pus_in_cu; ctr++)
7151         {
7152             pu_t *ps_pu;
7153             WORD32 inter_pu_wd;
7154             WORD32 inter_pu_ht;
7155 
7156             ps_pu = &ps_best_cu_prms->as_pu_chrm_proc[ctr];
7157 
7158             inter_pu_wd = (ps_pu->b4_wd + 1) << 2; /* cb and cr pixel interleaved */
7159             inter_pu_ht = ((ps_pu->b4_ht + 1) << 2) >> 1;
7160             inter_pu_ht <<= u1_is_422;
7161 
7162             ihevce_chroma_inter_pred_pu(&ps_ctxt->s_mc_ctxt, ps_pu, pu1_pred, pred_strd);
7163 
7164             if(2 == ps_best_cu_prms->u2_num_pus_in_cu)
7165             {
7166                 /* 2Nx__ partion case */
7167                 if(inter_pu_wd == cu_size)
7168                 {
7169                     pu1_pred += (inter_pu_ht * pred_strd);
7170                 }
7171 
7172                 /* __x2N partion case */
7173                 if(inter_pu_ht == (cu_size >> (u1_is_422 == 0)))
7174                 {
7175                     pu1_pred += inter_pu_wd;
7176                 }
7177             }
7178         }
7179 
7180         /* restore the pred pointer to start for transform loop */
7181         pu1_pred = pu1_pred_org;
7182     }
7183 
7184     /* Used to store back only the luma based info. if SATD based chorma
7185     mode also comes */
7186     u1_is_cu_coded_old = ps_best_cu_prms->u1_is_cu_coded;
7187 
7188     /* evaluate chroma candidates (same as luma) and
7189     if INTRA & HIGH_QUALITY compare with best SATD mode */
7190     {
7191         WORD32 calc_recon = 0, deq_data_strd;
7192         WORD16 *pi2_deq_data;
7193         UWORD8 *pu1_ecd_data;
7194         UWORD8 u1_is_mode_eq_chroma_satd_mode = 0;
7195 
7196         pi2_deq_data = &ps_best_cu_prms->pi2_cu_deq_coeffs[0];
7197         pi2_deq_data += ps_best_cu_prms->i4_chrm_deq_coeff_strt_idx;
7198         deq_data_strd = cu_size;
7199         /* update ecd buffer for storing coeff. */
7200         pu1_ecd_data = &ps_best_cu_prms->pu1_cu_coeffs[0];
7201         pu1_ecd_data += init_bytes_offset;
7202         /* store chroma starting index */
7203         ps_best_cu_prms->i4_chrm_cu_coeff_strt_idx = init_bytes_offset;
7204 
7205         /* get the first TU pointer */
7206         ps_tu = &ps_best_cu_prms->as_tu_enc_loop[0];
7207         ps_tu_temp_prms = &ps_best_cu_prms->as_tu_enc_loop_temp_prms[0];
7208 
7209         /* Reset total_bytes_offset for each candidate */
7210         chrm_pred_mode = (u1_is_422 == 1) ? gau1_chroma422_intra_angle_mapping[luma_pred_mode]
7211                                           : luma_pred_mode;
7212 
7213         total_bytes_offset = 0;
7214 
7215         if(TU_EQ_SUBCU == func_proc_mode)
7216         {
7217             func_proc_mode = TU_EQ_CU_DIV2;
7218         }
7219 
7220         /* For cu_size=8 case, chroma cost will be same for TU_EQ_CU and
7221         TU_EQ_CU_DIV2 and  TU_EQ_SUBCU case */
7222         if(8 == cu_size)
7223         {
7224             func_proc_mode = TU_EQ_CU;
7225         }
7226 
7227         /* loop based on num tus in a cu */
7228         if(!ps_best_cu_prms->u1_intra_flag || !ps_ctxt->s_chroma_rdopt_ctxt.u1_eval_chrm_satd ||
7229            (ps_ctxt->s_chroma_rdopt_ctxt.u1_eval_chrm_satd &&
7230             (chrm_pred_mode !=
7231              ps_ctxt->s_chroma_rdopt_ctxt.as_chr_intra_satd_ctxt[func_proc_mode].u1_best_cr_mode)))
7232         {
7233             /* loop based on num tus in a cu */
7234             for(ctr = 0; ctr < u1_num_tus; ctr++)
7235             {
7236                 WORD32 num_bytes = 0;
7237                 LWORD64 curr_cb_cod_cost = 0;
7238                 LWORD64 curr_cr_cod_cost = 0;
7239                 WORD32 chrm_pred_func_idx = 0;
7240                 UWORD8 u1_is_early_exit_condition_satisfied = 0;
7241 
7242                 /* Default cb and cr offset initializatio for b3_chroma_intra_mode_idx=7   */
7243                 /* FIX for TU tree shrinkage caused by ecd data copies in final mode recon */
7244                 ps_tu->s_tu.b1_cb_cbf = ps_tu->s_tu.b1_cr_cbf = 0;
7245                 ps_tu->s_tu.b1_cb_cbf_subtu1 = ps_tu->s_tu.b1_cr_cbf_subtu1 = 0;
7246                 ps_tu->ai4_cb_coeff_offset[0] = total_bytes_offset + init_bytes_offset;
7247                 ps_tu->ai4_cr_coeff_offset[0] = total_bytes_offset + init_bytes_offset;
7248                 ps_tu->ai4_cb_coeff_offset[1] = total_bytes_offset + init_bytes_offset;
7249                 ps_tu->ai4_cr_coeff_offset[1] = total_bytes_offset + init_bytes_offset;
7250                 ps_tu_temp_prms->ai2_cb_bytes_consumed[0] = 0;
7251                 ps_tu_temp_prms->ai2_cr_bytes_consumed[0] = 0;
7252                 ps_tu_temp_prms->ai2_cb_bytes_consumed[1] = 0;
7253                 ps_tu_temp_prms->ai2_cr_bytes_consumed[1] = 0;
7254 
7255                 /* TU level inits */
7256                 /* check if chroma present flag is set */
7257                 if(1 == ps_tu->s_tu.b3_chroma_intra_mode_idx)
7258                 {
7259                     /* RDOPT copy States :  TU init (best until prev TU) to current */
7260                     COPY_CABAC_STATES(
7261                         &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[rd_opt_curr_idx]
7262                              .s_cabac_ctxt.au1_ctxt_models[0],
7263                         &ps_ctxt->au1_rdopt_init_ctxt_models[0],
7264                         IHEVC_CAB_CTXT_END);
7265 
7266                     /* get the current transform size */
7267                     trans_size = ps_tu->s_tu.b3_size;
7268                     trans_size = (1 << (trans_size + 1)); /* in chroma units */
7269 
7270                     /* since 2x2 transform is not allowed for chroma*/
7271                     if(2 == trans_size)
7272                     {
7273                         trans_size = 4;
7274                     }
7275                 }
7276 
7277                 for(i4_subtu_idx = 0; i4_subtu_idx < u1_num_subtus_in_tu; i4_subtu_idx++)
7278                 {
7279                     WORD32 cbf;
7280                     UWORD8 u1_is_recon_available;
7281 
7282                     WORD32 nbr_flags = 0;
7283                     WORD32 zero_cols = 0;
7284                     WORD32 zero_rows = 0;
7285 
7286                     /* check if chroma present flag is set */
7287                     if(1 == ps_tu->s_tu.b3_chroma_intra_mode_idx)
7288                     {
7289                         UWORD8 *pu1_cur_pred;
7290                         UWORD8 *pu1_cur_recon;
7291                         UWORD8 *pu1_cur_src;
7292                         WORD16 *pi2_cur_deq_data;
7293                         WORD32 curr_pos_x, curr_pos_y;
7294                         LWORD64 trans_ssd_u, trans_ssd_v;
7295 
7296                         /* get the current sub-tu posx and posy w.r.t to cu */
7297                         curr_pos_x = (ps_tu->s_tu.b4_pos_x << 2) - (cu_pos_x << 3);
7298                         curr_pos_y = (ps_tu->s_tu.b4_pos_y << 2) - (cu_pos_y << 3) +
7299                                      (i4_subtu_idx * trans_size);
7300 
7301                         /* 420sp case only vertical height will be half */
7302                         if(u1_is_422 == 0)
7303                         {
7304                             curr_pos_y >>= 1;
7305                         }
7306 
7307                         /* increment the pointers to start of current Sub-TU */
7308                         pu1_cur_recon = (pu1_recon + curr_pos_x);
7309                         pu1_cur_recon += (curr_pos_y * i4_recon_stride);
7310                         pu1_cur_src = (pu1_chrm_src + curr_pos_x);
7311                         pu1_cur_src += (curr_pos_y * chrm_src_stride);
7312                         pu1_cur_pred = (pu1_pred + curr_pos_x);
7313                         pu1_cur_pred += (curr_pos_y * pred_strd);
7314                         pi2_cur_deq_data = pi2_deq_data + curr_pos_x;
7315                         pi2_cur_deq_data += (curr_pos_y * deq_data_strd);
7316 
7317                         /* populate the coeffs scan idx */
7318                         scan_idx = SCAN_DIAG_UPRIGHT;
7319 
7320                         /* perform intra prediction only for Intra case */
7321                         if(PRED_MODE_INTRA == ps_best_cu_prms->u1_intra_flag)
7322                         {
7323                             UWORD8 *pu1_top_left;
7324                             UWORD8 *pu1_top;
7325                             UWORD8 *pu1_left;
7326                             WORD32 left_strd;
7327 
7328                             calc_recon = !u1_compute_spatial_ssd &&
7329                                          ((4 == u1_num_tus) || (u1_is_422 == 1)) &&
7330                                          (((u1_num_tus == 1) && (0 == i4_subtu_idx)) ||
7331                                           ((ctr == 3) && (0 == i4_subtu_idx) && (u1_is_422 == 1)) ||
7332                                           ((u1_num_tus == 4) && (ctr < 3)));
7333 
7334                             /* left cu boundary */
7335                             if(0 == curr_pos_x)
7336                             {
7337                                 pu1_left = pu1_cu_left + curr_pos_y * cu_left_stride;
7338                                 left_strd = cu_left_stride;
7339                             }
7340                             else
7341                             {
7342                                 pu1_left = pu1_cur_recon - 2;
7343                                 left_strd = i4_recon_stride;
7344                             }
7345 
7346                             /* top cu boundary */
7347                             if(0 == curr_pos_y)
7348                             {
7349                                 pu1_top = pu1_cu_top + curr_pos_x;
7350                             }
7351                             else
7352                             {
7353                                 pu1_top = pu1_cur_recon - i4_recon_stride;
7354                             }
7355 
7356                             /* by default top left is set to cu top left */
7357                             pu1_top_left = pu1_cu_top_left;
7358 
7359                             /* top left based on position */
7360                             if((0 != curr_pos_y) && (0 == curr_pos_x))
7361                             {
7362                                 pu1_top_left = pu1_left - cu_left_stride;
7363                             }
7364                             else if(0 != curr_pos_x)
7365                             {
7366                                 pu1_top_left = pu1_top - 2;
7367                             }
7368 
7369                             /* for 4x4 transforms based on intra pred mode scan is choosen*/
7370                             if(4 == trans_size)
7371                             {
7372                                 /* for modes from 22 upto 30 horizontal scan is used */
7373                                 if((chrm_pred_mode > 21) && (chrm_pred_mode < 31))
7374                                 {
7375                                     scan_idx = SCAN_HORZ;
7376                                 }
7377                                 /* for modes from 6 upto 14 horizontal scan is used */
7378                                 else if((chrm_pred_mode > 5) && (chrm_pred_mode < 15))
7379                                 {
7380                                     scan_idx = SCAN_VERT;
7381                                 }
7382                             }
7383 
7384                             nbr_flags = ihevce_get_intra_chroma_tu_nbr(
7385                                 ps_best_cu_prms->au4_nbr_flags[ctr],
7386                                 i4_subtu_idx,
7387                                 trans_size,
7388                                 u1_is_422);
7389 
7390                             /* call the chroma reference array substitution */
7391                             ihevc_intra_pred_chroma_ref_substitution_fptr(
7392                                 pu1_top_left,
7393                                 pu1_top,
7394                                 pu1_left,
7395                                 left_strd,
7396                                 trans_size,
7397                                 nbr_flags,
7398                                 (UWORD8 *)ps_ctxt->pv_ref_sub_out,
7399                                 1);
7400 
7401                             /* use the look up to get the function idx */
7402                             chrm_pred_func_idx = g_i4_ip_funcs[chrm_pred_mode];
7403 
7404                             /* call the intra prediction function */
7405                             ps_ctxt->apf_chrm_ip[chrm_pred_func_idx](
7406                                 (UWORD8 *)ps_ctxt->pv_ref_sub_out,
7407                                 1,
7408                                 pu1_cur_pred,
7409                                 pred_strd,
7410                                 trans_size,
7411                                 chrm_pred_mode);
7412                         }
7413 
7414                         if(!ctr && !i4_subtu_idx && (u1_compute_spatial_ssd || calc_recon))
7415                         {
7416                             ps_recon_datastore->au1_is_chromaRecon_available[0] =
7417                                 !ps_best_cu_prms->u1_skip_flag;
7418                         }
7419                         else if(!ctr && !i4_subtu_idx)
7420                         {
7421                             ps_recon_datastore->au1_is_chromaRecon_available[0] = 0;
7422                         }
7423                         /************************************************************/
7424                         /* recon loop is done for all cases including skip cu       */
7425                         /* This is because skipping chroma reisdual based on luma   */
7426                         /* skip decision can lead to chroma artifacts               */
7427                         /************************************************************/
7428                         /************************************************************/
7429                         /*In the high quality and medium speed modes, wherein chroma*/
7430                         /*and luma costs are included in the total cost calculation */
7431                         /*the cost is just a ssd cost, and not that obtained through*/
7432                         /*iq_it path                                                */
7433                         /************************************************************/
7434                         if(ps_best_cu_prms->u1_skip_flag == 0)
7435                         {
7436                             WORD32 tu_bits;
7437 
7438                             cbf = ihevce_chroma_t_q_iq_ssd_scan_fxn(
7439                                 ps_ctxt,
7440                                 pu1_cur_pred,
7441                                 pred_strd,
7442                                 pu1_cur_src,
7443                                 chrm_src_stride,
7444                                 pi2_cur_deq_data,
7445                                 deq_data_strd,
7446                                 pu1_cur_recon,
7447                                 i4_recon_stride,
7448                                 pu1_ecd_data + total_bytes_offset,
7449                                 ps_ctxt->au1_cu_csbf,
7450                                 ps_ctxt->i4_cu_csbf_strd,
7451                                 trans_size,
7452                                 scan_idx,
7453                                 PRED_MODE_INTRA == ps_best_cu_prms->u1_intra_flag,
7454                                 &num_bytes,
7455                                 &tu_bits,
7456                                 &zero_cols,
7457                                 &zero_rows,
7458                                 &u1_is_recon_available,
7459                                 i4_perform_sbh,
7460                                 i4_perform_rdoq,
7461                                 &trans_ssd_u,
7462 #if USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
7463                                 i4_alpha_stim_multiplier,
7464                                 u1_is_cu_noisy,
7465 #endif
7466                                 ps_best_cu_prms->u1_skip_flag,
7467                                 u1_compute_spatial_ssd ? SPATIAL_DOMAIN_SSD : FREQUENCY_DOMAIN_SSD,
7468                                 U_PLANE);
7469 
7470                             if(u1_compute_spatial_ssd && u1_is_recon_available)
7471                             {
7472                                 ps_recon_datastore
7473                                     ->au1_bufId_with_winning_ChromaRecon[U_PLANE][ctr]
7474                                                                         [i4_subtu_idx] = 0;
7475                             }
7476                             else
7477                             {
7478                                 ps_recon_datastore
7479                                     ->au1_bufId_with_winning_ChromaRecon[U_PLANE][ctr]
7480                                                                         [i4_subtu_idx] = UCHAR_MAX;
7481                             }
7482 
7483 #if !USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
7484                             if(u1_is_cu_noisy && i4_alpha_stim_multiplier)
7485                             {
7486 #if !USE_RECON_TO_EVALUATE_STIM_IN_RDOPT
7487                                 trans_ssd_u = ihevce_inject_stim_into_distortion(
7488                                     pu1_cur_src,
7489                                     chrm_src_stride,
7490                                     pu1_cur_pred,
7491                                     pred_strd,
7492                                     trans_ssd_u,
7493                                     i4_alpha_stim_multiplier,
7494                                     trans_size,
7495                                     0,
7496                                     ps_ctxt->u1_enable_psyRDOPT,
7497                                     U_PLANE);
7498 #else
7499                                 if(u1_compute_spatial_ssd && u1_is_recon_available)
7500                                 {
7501                                     trans_ssd_u = ihevce_inject_stim_into_distortion(
7502                                         pu1_cur_src,
7503                                         chrm_src_stride,
7504                                         pu1_cur_recon,
7505                                         i4_recon_stride,
7506                                         trans_ssd_u,
7507                                         i4_alpha_stim_multiplier,
7508                                         trans_size,
7509                                         0,
7510                                         ps_ctxt->u1_enable_psyRDOPT,
7511                                         U_PLANE);
7512                                 }
7513                                 else
7514                                 {
7515                                     trans_ssd_u = ihevce_inject_stim_into_distortion(
7516                                         pu1_cur_src,
7517                                         chrm_src_stride,
7518                                         pu1_cur_pred,
7519                                         pred_strd,
7520                                         trans_ssd_u,
7521                                         i4_alpha_stim_multiplier,
7522                                         trans_size,
7523                                         0,
7524                                         ps_ctxt->u1_enable_psyRDOPT,
7525                                         U_PLANE);
7526                                 }
7527 #endif
7528                             }
7529 #endif
7530 
7531                             curr_cb_cod_cost =
7532                                 trans_ssd_u +
7533                                 COMPUTE_RATE_COST_CLIP30(
7534                                     tu_bits, ps_ctxt->i8_cl_ssd_lambda_chroma_qf, LAMBDA_Q_SHIFT);
7535 
7536                             chrm_tu_bits += tu_bits;
7537                             i4_bits_cb += tu_bits;
7538 
7539                             /* RDOPT copy States :  New updated after curr TU to TU init */
7540                             if(0 != cbf)
7541                             {
7542                                 COPY_CABAC_STATES(
7543                                     &ps_ctxt->au1_rdopt_init_ctxt_models[0],
7544                                     &ps_ctxt->s_rdopt_entropy_ctxt
7545                                          .as_cu_entropy_ctxt[rd_opt_curr_idx]
7546                                          .s_cabac_ctxt.au1_ctxt_models[0],
7547                                     IHEVC_CAB_CTXT_END);
7548                             }
7549                             /* RDOPT copy States :  Restoring back the Cb init state to Cr */
7550                             else
7551                             {
7552                                 COPY_CABAC_STATES(
7553                                     &ps_ctxt->s_rdopt_entropy_ctxt
7554                                          .as_cu_entropy_ctxt[rd_opt_curr_idx]
7555                                          .s_cabac_ctxt.au1_ctxt_models[0],
7556                                     &ps_ctxt->au1_rdopt_init_ctxt_models[0],
7557                                     IHEVC_CAB_CTXT_END);
7558                             }
7559 
7560                             /* If Intra and TU=CU/2, need recon for next TUs */
7561                             if(calc_recon)
7562                             {
7563                                 ihevce_chroma_it_recon_fxn(
7564                                     ps_ctxt,
7565                                     pi2_cur_deq_data,
7566                                     deq_data_strd,
7567                                     pu1_cur_pred,
7568                                     pred_strd,
7569                                     pu1_cur_recon,
7570                                     i4_recon_stride,
7571                                     (pu1_ecd_data + total_bytes_offset),
7572                                     trans_size,
7573                                     cbf,
7574                                     zero_cols,
7575                                     zero_rows,
7576                                     U_PLANE);
7577 
7578                                 ps_recon_datastore
7579                                     ->au1_bufId_with_winning_ChromaRecon[U_PLANE][ctr]
7580                                                                         [i4_subtu_idx] = 0;
7581                             }
7582                             else
7583                             {
7584                                 ps_recon_datastore
7585                                     ->au1_bufId_with_winning_ChromaRecon[U_PLANE][ctr]
7586                                                                         [i4_subtu_idx] = UCHAR_MAX;
7587                             }
7588                         }
7589                         else
7590                         {
7591                             /* num bytes is set to 0 */
7592                             num_bytes = 0;
7593 
7594                             /* cbf is returned as 0 */
7595                             cbf = 0;
7596 
7597                             curr_cb_cod_cost = trans_ssd_u =
7598 
7599                                 ps_ctxt->s_cmn_opt_func.pf_chroma_interleave_ssd_calculator(
7600                                     pu1_cur_pred,
7601                                     pu1_cur_src,
7602                                     pred_strd,
7603                                     chrm_src_stride,
7604                                     trans_size,
7605                                     trans_size,
7606                                     U_PLANE);
7607 
7608                             if(u1_compute_spatial_ssd)
7609                             {
7610                                 /* buffer copy from pred to recon */
7611 
7612                                 ps_ctxt->s_cmn_opt_func.pf_chroma_interleave_2d_copy(
7613                                     pu1_cur_pred,
7614                                     pred_strd,
7615                                     pu1_cur_recon,
7616                                     i4_recon_stride,
7617                                     trans_size,
7618                                     trans_size,
7619                                     U_PLANE);
7620 
7621                                 ps_recon_datastore
7622                                     ->au1_bufId_with_winning_ChromaRecon[U_PLANE][ctr]
7623                                                                         [i4_subtu_idx] = 0;
7624                             }
7625 
7626                             if(u1_is_cu_noisy && i4_alpha_stim_multiplier)
7627                             {
7628                                 trans_ssd_u = ihevce_inject_stim_into_distortion(
7629                                     pu1_cur_src,
7630                                     chrm_src_stride,
7631                                     pu1_cur_pred,
7632                                     pred_strd,
7633                                     trans_ssd_u,
7634                                     i4_alpha_stim_multiplier,
7635                                     trans_size,
7636                                     0,
7637                                     ps_ctxt->u1_enable_psyRDOPT,
7638                                     U_PLANE);
7639                             }
7640 
7641 #if ENABLE_INTER_ZCU_COST
7642 #if !WEIGH_CHROMA_COST
7643                             /* cbf = 0, accumulate cu not coded cost */
7644                             ps_ctxt->i8_cu_not_coded_cost += curr_cb_cod_cost;
7645 #else
7646                             /* cbf = 0, accumulate cu not coded cost */
7647 
7648                             ps_ctxt->i8_cu_not_coded_cost += (LWORD64)(
7649                                 (curr_cb_cod_cost * ps_ctxt->u4_chroma_cost_weighing_factor +
7650                                  (1 << (CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT - 1))) >>
7651                                 CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT);
7652 #endif
7653 #endif
7654                         }
7655 
7656 #if !WEIGH_CHROMA_COST
7657                         curr_rdopt_cost += curr_cb_cod_cost;
7658 #else
7659                         curr_rdopt_cost +=
7660                             ((curr_cb_cod_cost * ps_ctxt->u4_chroma_cost_weighing_factor +
7661                               (1 << (CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT - 1))) >>
7662                              CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT);
7663 #endif
7664                         chrm_cod_cost += curr_cb_cod_cost;
7665                         i8_ssd_cb += trans_ssd_u;
7666 
7667                         if(ps_ctxt->i4_bitrate_instance_num || ps_ctxt->i4_num_bitrates == 1)
7668                         {
7669                             /* Early exit : If the current running cost exceeds
7670                             the prev. best mode cost, break */
7671                             if(curr_rdopt_cost > prev_best_rdopt_cost)
7672                             {
7673                                 u1_is_early_exit_condition_satisfied = 1;
7674                                 break;
7675                             }
7676                         }
7677 
7678                         /* inter cu is coded if any of the tu is coded in it */
7679                         ps_best_cu_prms->u1_is_cu_coded |= cbf;
7680 
7681                         /* update CB related params */
7682                         ps_tu->ai4_cb_coeff_offset[i4_subtu_idx] =
7683                             total_bytes_offset + init_bytes_offset;
7684 
7685                         if(0 == i4_subtu_idx)
7686                         {
7687                             ps_tu->s_tu.b1_cb_cbf = cbf;
7688                         }
7689                         else
7690                         {
7691                             ps_tu->s_tu.b1_cb_cbf_subtu1 = cbf;
7692                         }
7693 
7694                         total_bytes_offset += num_bytes;
7695 
7696                         ps_tu_temp_prms->au4_cb_zero_col[i4_subtu_idx] = zero_cols;
7697                         ps_tu_temp_prms->au4_cb_zero_row[i4_subtu_idx] = zero_rows;
7698                         ps_tu_temp_prms->ai2_cb_bytes_consumed[i4_subtu_idx] = num_bytes;
7699 
7700                         /* recon loop is done for non skip cases */
7701                         if(ps_best_cu_prms->u1_skip_flag == 0)
7702                         {
7703                             WORD32 tu_bits;
7704 
7705                             cbf = ihevce_chroma_t_q_iq_ssd_scan_fxn(
7706                                 ps_ctxt,
7707                                 pu1_cur_pred,
7708                                 pred_strd,
7709                                 pu1_cur_src,
7710                                 chrm_src_stride,
7711                                 pi2_cur_deq_data + trans_size,
7712                                 deq_data_strd,
7713                                 pu1_cur_recon,
7714                                 i4_recon_stride,
7715                                 pu1_ecd_data + total_bytes_offset,
7716                                 ps_ctxt->au1_cu_csbf,
7717                                 ps_ctxt->i4_cu_csbf_strd,
7718                                 trans_size,
7719                                 scan_idx,
7720                                 PRED_MODE_INTRA == ps_best_cu_prms->u1_intra_flag,
7721                                 &num_bytes,
7722                                 &tu_bits,
7723                                 &zero_cols,
7724                                 &zero_rows,
7725                                 &u1_is_recon_available,
7726                                 i4_perform_sbh,
7727                                 i4_perform_rdoq,
7728                                 &trans_ssd_v,
7729 #if USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
7730                                 i4_alpha_stim_multiplier,
7731                                 u1_is_cu_noisy,
7732 #endif
7733                                 ps_best_cu_prms->u1_skip_flag,
7734                                 u1_compute_spatial_ssd ? SPATIAL_DOMAIN_SSD : FREQUENCY_DOMAIN_SSD,
7735                                 V_PLANE);
7736 
7737                             if(u1_compute_spatial_ssd && u1_is_recon_available)
7738                             {
7739                                 ps_recon_datastore
7740                                     ->au1_bufId_with_winning_ChromaRecon[V_PLANE][ctr]
7741                                                                         [i4_subtu_idx] = 0;
7742                             }
7743                             else
7744                             {
7745                                 ps_recon_datastore
7746                                     ->au1_bufId_with_winning_ChromaRecon[V_PLANE][ctr]
7747                                                                         [i4_subtu_idx] = UCHAR_MAX;
7748                             }
7749 
7750 #if !USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
7751                             if(u1_is_cu_noisy && i4_alpha_stim_multiplier)
7752                             {
7753 #if !USE_RECON_TO_EVALUATE_STIM_IN_RDOPT
7754                                 trans_ssd_v = ihevce_inject_stim_into_distortion(
7755                                     pu1_cur_src,
7756                                     chrm_src_stride,
7757                                     pu1_cur_pred,
7758                                     pred_strd,
7759                                     trans_ssd_v,
7760                                     i4_alpha_stim_multiplier,
7761                                     trans_size,
7762                                     0,
7763                                     ps_ctxt->u1_enable_psyRDOPT,
7764                                     V_PLANE);
7765 #else
7766                                 if(u1_compute_spatial_ssd && u1_is_recon_available)
7767                                 {
7768                                     trans_ssd_v = ihevce_inject_stim_into_distortion(
7769                                         pu1_cur_src,
7770                                         chrm_src_stride,
7771                                         pu1_cur_recon,
7772                                         i4_recon_stride,
7773                                         trans_ssd_v,
7774                                         i4_alpha_stim_multiplier,
7775                                         trans_size,
7776                                         0,
7777                                         ps_ctxt->u1_enable_psyRDOPT,
7778                                         V_PLANE);
7779                                 }
7780                                 else
7781                                 {
7782                                     trans_ssd_v = ihevce_inject_stim_into_distortion(
7783                                         pu1_cur_src,
7784                                         chrm_src_stride,
7785                                         pu1_cur_pred,
7786                                         pred_strd,
7787                                         trans_ssd_v,
7788                                         i4_alpha_stim_multiplier,
7789                                         trans_size,
7790                                         0,
7791                                         ps_ctxt->u1_enable_psyRDOPT,
7792                                         V_PLANE);
7793                                 }
7794 #endif
7795                             }
7796 #endif
7797 
7798                             curr_cr_cod_cost =
7799                                 trans_ssd_v +
7800                                 COMPUTE_RATE_COST_CLIP30(
7801                                     tu_bits, ps_ctxt->i8_cl_ssd_lambda_chroma_qf, LAMBDA_Q_SHIFT);
7802                             chrm_tu_bits += tu_bits;
7803                             i4_bits_cr += tu_bits;
7804 
7805                             /* RDOPT copy States :  New updated after curr TU to TU init */
7806                             if(0 != cbf)
7807                             {
7808                                 COPY_CABAC_STATES(
7809                                     &ps_ctxt->au1_rdopt_init_ctxt_models[0],
7810                                     &ps_ctxt->s_rdopt_entropy_ctxt
7811                                          .as_cu_entropy_ctxt[rd_opt_curr_idx]
7812                                          .s_cabac_ctxt.au1_ctxt_models[0],
7813                                     IHEVC_CAB_CTXT_END);
7814                             }
7815                             /* RDOPT copy States :  Restoring back the Cb init state to Cr */
7816                             else
7817                             {
7818                                 COPY_CABAC_STATES(
7819                                     &ps_ctxt->s_rdopt_entropy_ctxt
7820                                          .as_cu_entropy_ctxt[rd_opt_curr_idx]
7821                                          .s_cabac_ctxt.au1_ctxt_models[0],
7822                                     &ps_ctxt->au1_rdopt_init_ctxt_models[0],
7823                                     IHEVC_CAB_CTXT_END);
7824                             }
7825 
7826                             /* If Intra and TU=CU/2, need recon for next TUs */
7827                             if(calc_recon)
7828                             {
7829                                 ihevce_chroma_it_recon_fxn(
7830                                     ps_ctxt,
7831                                     (pi2_cur_deq_data + trans_size),
7832                                     deq_data_strd,
7833                                     pu1_cur_pred,
7834                                     pred_strd,
7835                                     pu1_cur_recon,
7836                                     i4_recon_stride,
7837                                     (pu1_ecd_data + total_bytes_offset),
7838                                     trans_size,
7839                                     cbf,
7840                                     zero_cols,
7841                                     zero_rows,
7842                                     V_PLANE);
7843 
7844                                 ps_recon_datastore
7845                                     ->au1_bufId_with_winning_ChromaRecon[V_PLANE][ctr]
7846                                                                         [i4_subtu_idx] = 0;
7847                             }
7848                             else
7849                             {
7850                                 ps_recon_datastore
7851                                     ->au1_bufId_with_winning_ChromaRecon[V_PLANE][ctr]
7852                                                                         [i4_subtu_idx] = UCHAR_MAX;
7853                             }
7854                         }
7855                         else
7856                         {
7857                             /* num bytes is set to 0 */
7858                             num_bytes = 0;
7859 
7860                             /* cbf is returned as 0 */
7861                             cbf = 0;
7862 
7863                             curr_cr_cod_cost = trans_ssd_v =
7864 
7865                                 ps_ctxt->s_cmn_opt_func.pf_chroma_interleave_ssd_calculator(
7866                                     pu1_cur_pred,
7867                                     pu1_cur_src,
7868                                     pred_strd,
7869                                     chrm_src_stride,
7870                                     trans_size,
7871                                     trans_size,
7872                                     V_PLANE);
7873 
7874                             if(u1_compute_spatial_ssd)
7875                             {
7876                                 /* buffer copy from pred to recon */
7877                                 ps_ctxt->s_cmn_opt_func.pf_chroma_interleave_2d_copy(
7878                                     pu1_cur_pred,
7879                                     pred_strd,
7880                                     pu1_cur_recon,
7881                                     i4_recon_stride,
7882                                     trans_size,
7883                                     trans_size,
7884                                     V_PLANE);
7885 
7886                                 ps_recon_datastore
7887                                     ->au1_bufId_with_winning_ChromaRecon[V_PLANE][ctr]
7888                                                                         [i4_subtu_idx] = 0;
7889                             }
7890 
7891                             if(u1_is_cu_noisy && i4_alpha_stim_multiplier)
7892                             {
7893                                 trans_ssd_v = ihevce_inject_stim_into_distortion(
7894                                     pu1_cur_src,
7895                                     chrm_src_stride,
7896                                     pu1_cur_pred,
7897                                     pred_strd,
7898                                     trans_ssd_v,
7899                                     i4_alpha_stim_multiplier,
7900                                     trans_size,
7901                                     0,
7902                                     ps_ctxt->u1_enable_psyRDOPT,
7903                                     V_PLANE);
7904                             }
7905 
7906 #if ENABLE_INTER_ZCU_COST
7907 #if !WEIGH_CHROMA_COST
7908                             /* cbf = 0, accumulate cu not coded cost */
7909                             ps_ctxt->i8_cu_not_coded_cost += curr_cr_cod_cost;
7910 #else
7911                             /* cbf = 0, accumulate cu not coded cost */
7912 
7913                             ps_ctxt->i8_cu_not_coded_cost += (LWORD64)(
7914                                 (curr_cr_cod_cost * ps_ctxt->u4_chroma_cost_weighing_factor +
7915                                  (1 << (CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT - 1))) >>
7916                                 CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT);
7917 #endif
7918 #endif
7919                         }
7920 
7921 #if !WEIGH_CHROMA_COST
7922                         curr_rdopt_cost += curr_cr_cod_cost;
7923 #else
7924                         curr_rdopt_cost +=
7925                             ((curr_cr_cod_cost * ps_ctxt->u4_chroma_cost_weighing_factor +
7926                               (1 << (CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT - 1))) >>
7927                              CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT);
7928 #endif
7929 
7930                         chrm_cod_cost += curr_cr_cod_cost;
7931                         i8_ssd_cr += trans_ssd_v;
7932 
7933                         if(ps_ctxt->i4_bitrate_instance_num || ps_ctxt->i4_num_bitrates == 1)
7934                         {
7935                             /* Early exit : If the current running cost exceeds
7936                             the prev. best mode cost, break */
7937                             if(curr_rdopt_cost > prev_best_rdopt_cost)
7938                             {
7939                                 u1_is_early_exit_condition_satisfied = 1;
7940                                 break;
7941                             }
7942                         }
7943 
7944                         /* inter cu is coded if any of the tu is coded in it */
7945                         ps_best_cu_prms->u1_is_cu_coded |= cbf;
7946 
7947                         /* update CR related params */
7948                         ps_tu->ai4_cr_coeff_offset[i4_subtu_idx] =
7949                             total_bytes_offset + init_bytes_offset;
7950 
7951                         if(0 == i4_subtu_idx)
7952                         {
7953                             ps_tu->s_tu.b1_cr_cbf = cbf;
7954                         }
7955                         else
7956                         {
7957                             ps_tu->s_tu.b1_cr_cbf_subtu1 = cbf;
7958                         }
7959 
7960                         total_bytes_offset += num_bytes;
7961 
7962                         ps_tu_temp_prms->au4_cr_zero_col[i4_subtu_idx] = zero_cols;
7963                         ps_tu_temp_prms->au4_cr_zero_row[i4_subtu_idx] = zero_rows;
7964                         ps_tu_temp_prms->ai2_cr_bytes_consumed[i4_subtu_idx] = num_bytes;
7965                     }
7966                     else
7967                     {
7968                         ps_recon_datastore
7969                             ->au1_bufId_with_winning_ChromaRecon[U_PLANE][ctr][i4_subtu_idx] =
7970                             UCHAR_MAX;
7971                         ps_recon_datastore
7972                             ->au1_bufId_with_winning_ChromaRecon[V_PLANE][ctr][i4_subtu_idx] =
7973                             UCHAR_MAX;
7974                     }
7975                 }
7976 
7977                 if(u1_is_early_exit_condition_satisfied)
7978                 {
7979                     break;
7980                 }
7981 
7982                 /* loop increments */
7983                 ps_tu++;
7984                 ps_tu_temp_prms++;
7985             }
7986 
7987             /* Signal as luma mode. HIGH_QUALITY may update it */
7988             ps_best_cu_prms->u1_chroma_intra_pred_mode = 4;
7989 
7990             /* modify the cost chrm_cod_cost */
7991             if(ps_ctxt->u1_enable_psyRDOPT)
7992             {
7993                 UWORD8 *pu1_recon_cu;
7994                 WORD32 recon_stride;
7995                 WORD32 curr_pos_x;
7996                 WORD32 curr_pos_y;
7997                 WORD32 start_index;
7998                 WORD32 num_horz_cu_in_ctb;
7999                 WORD32 had_block_size;
8000                 /* TODO: sreenivasa ctb size has to be used appropriately */
8001                 had_block_size = 8;
8002                 num_horz_cu_in_ctb = 2 * 64 / had_block_size;
8003 
8004                 curr_pos_x = cu_pos_x << 3; /* pel units */
8005                 curr_pos_y = cu_pos_y << 3; /* pel units */
8006                 recon_stride = i4_recon_stride;
8007                 pu1_recon_cu = pu1_recon;
8008 
8009                 /* start index to index the source satd of curr cu in the current ctb */
8010                 start_index = 2 * (curr_pos_x / had_block_size) +
8011                               (curr_pos_y / had_block_size) * num_horz_cu_in_ctb;
8012 
8013                 {
8014                     chrm_cod_cost += ihevce_psy_rd_cost_croma(
8015                         ps_ctxt->ai4_source_chroma_satd,
8016                         pu1_recon,
8017                         recon_stride,
8018                         1,  //
8019                         cu_size,
8020                         0,  // pic type
8021                         0,  //layer id
8022                         ps_ctxt->i4_satd_lamda,  // lambda
8023                         start_index,
8024                         ps_ctxt->u1_is_input_data_hbd,  // 8 bit
8025                         ps_ctxt->u1_chroma_array_type,
8026                         &ps_ctxt->s_cmn_opt_func
8027 
8028                     );  // chroma subsampling 420
8029                 }
8030             }
8031         }
8032         else
8033         {
8034             u1_is_mode_eq_chroma_satd_mode = 1;
8035             chrm_cod_cost = MAX_COST_64;
8036         }
8037 
8038         /* If Intra Block and preset is HIGH QUALITY, then compare with best SATD mode */
8039         if((PRED_MODE_INTRA == ps_best_cu_prms->u1_intra_flag) &&
8040            (1 == ps_ctxt->s_chroma_rdopt_ctxt.u1_eval_chrm_satd))
8041         {
8042             if(64 == cu_size)
8043             {
8044                 ASSERT(TU_EQ_CU != func_proc_mode);
8045             }
8046 
8047             if(ps_ctxt->s_chroma_rdopt_ctxt.as_chr_intra_satd_ctxt[func_proc_mode]
8048                    .i8_chroma_best_rdopt < chrm_cod_cost)
8049             {
8050                 UWORD8 *pu1_src;
8051                 UWORD8 *pu1_ecd_data_src_cb;
8052                 UWORD8 *pu1_ecd_data_src_cr;
8053 
8054                 chroma_intra_satd_ctxt_t *ps_chr_intra_satd_ctxt =
8055                     &ps_ctxt->s_chroma_rdopt_ctxt.as_chr_intra_satd_ctxt[func_proc_mode];
8056 
8057                 UWORD8 *pu1_dst = &ps_ctxt->au1_rdopt_init_ctxt_models[0];
8058                 WORD32 ai4_ecd_data_cb_offset[2] = { 0, 0 };
8059                 WORD32 ai4_ecd_data_cr_offset[2] = { 0, 0 };
8060 
8061                 pu1_src = &ps_chr_intra_satd_ctxt->au1_chrm_satd_updated_ctxt_models[0];
8062                 chrm_cod_cost = ps_chr_intra_satd_ctxt->i8_chroma_best_rdopt;
8063                 chrm_pred_mode = ps_chr_intra_satd_ctxt->u1_best_cr_mode;
8064                 chrm_tu_bits = ps_chr_intra_satd_ctxt->i4_chrm_tu_bits;
8065 
8066                 if(u1_is_mode_eq_chroma_satd_mode)
8067                 {
8068                     chrm_cod_cost -= ps_chr_intra_satd_ctxt->i8_cost_to_encode_chroma_mode;
8069                 }
8070 
8071                 /*Resetting total_num_bytes_to 0*/
8072                 total_bytes_offset = 0;
8073 
8074                 /* Update the CABAC state corresponding to chroma only */
8075                 /* Chroma Cbf */
8076                 memcpy(pu1_dst + IHEVC_CAB_CBCR_IDX, pu1_src + IHEVC_CAB_CBCR_IDX, 2);
8077                 /* Chroma transform skip */
8078                 memcpy(pu1_dst + IHEVC_CAB_TFM_SKIP12, pu1_src + IHEVC_CAB_TFM_SKIP12, 1);
8079                 /* Chroma last coeff x prefix */
8080                 memcpy(
8081                     pu1_dst + IHEVC_CAB_COEFFX_PREFIX + 15,
8082                     pu1_src + IHEVC_CAB_COEFFX_PREFIX + 15,
8083                     3);
8084                 /* Chroma last coeff y prefix */
8085                 memcpy(
8086                     pu1_dst + IHEVC_CAB_COEFFY_PREFIX + 15,
8087                     pu1_src + IHEVC_CAB_COEFFY_PREFIX + 15,
8088                     3);
8089                 /* Chroma csbf */
8090                 memcpy(
8091                     pu1_dst + IHEVC_CAB_CODED_SUBLK_IDX + 2,
8092                     pu1_src + IHEVC_CAB_CODED_SUBLK_IDX + 2,
8093                     2);
8094                 /* Chroma sig coeff flags */
8095                 memcpy(
8096                     pu1_dst + IHEVC_CAB_COEFF_FLAG + 27, pu1_src + IHEVC_CAB_COEFF_FLAG + 27, 15);
8097                 /* Chroma absgt1 flags */
8098                 memcpy(
8099                     pu1_dst + IHEVC_CAB_COEFABS_GRTR1_FLAG + 16,
8100                     pu1_src + IHEVC_CAB_COEFABS_GRTR1_FLAG + 16,
8101                     8);
8102                 /* Chroma absgt2 flags */
8103                 memcpy(
8104                     pu1_dst + IHEVC_CAB_COEFABS_GRTR2_FLAG + 4,
8105                     pu1_src + IHEVC_CAB_COEFABS_GRTR2_FLAG + 4,
8106                     2);
8107 
8108                 ps_tu = &ps_best_cu_prms->as_tu_enc_loop[0];
8109                 ps_tu_temp_prms = &ps_best_cu_prms->as_tu_enc_loop_temp_prms[0];
8110 
8111                 /* update to luma decision as we update chroma in final mode */
8112                 ps_best_cu_prms->u1_is_cu_coded = u1_is_cu_coded_old;
8113 
8114                 for(ctr = 0; ctr < u1_num_tus; ctr++)
8115                 {
8116                     for(i4_subtu_idx = 0; i4_subtu_idx < u1_num_subtus_in_tu; i4_subtu_idx++)
8117                     {
8118                         WORD32 cbf;
8119                         WORD32 num_bytes;
8120 
8121                         pu1_ecd_data_src_cb =
8122                             &ps_chr_intra_satd_ctxt->au1_scan_coeff_cb[i4_subtu_idx][0];
8123                         pu1_ecd_data_src_cr =
8124                             &ps_chr_intra_satd_ctxt->au1_scan_coeff_cr[i4_subtu_idx][0];
8125 
8126                         /* check if chroma present flag is set */
8127                         if(1 == ps_tu->s_tu.b3_chroma_intra_mode_idx)
8128                         {
8129                             UWORD8 *pu1_cur_pred_dest;
8130                             UWORD8 *pu1_cur_pred_src;
8131                             WORD32 pred_src_strd;
8132                             WORD16 *pi2_cur_deq_data_dest;
8133                             WORD16 *pi2_cur_deq_data_src_cb;
8134                             WORD16 *pi2_cur_deq_data_src_cr;
8135                             WORD32 deq_src_strd;
8136 
8137                             WORD32 curr_pos_x, curr_pos_y;
8138 
8139                             trans_size = ps_tu->s_tu.b3_size;
8140                             trans_size = (1 << (trans_size + 1)); /* in chroma units */
8141 
8142                             /*Deriving stride values*/
8143                             pred_src_strd = ps_chr_intra_satd_ctxt->i4_pred_stride;
8144                             deq_src_strd = ps_chr_intra_satd_ctxt->i4_iq_buff_stride;
8145 
8146                             /* since 2x2 transform is not allowed for chroma*/
8147                             if(2 == trans_size)
8148                             {
8149                                 trans_size = 4;
8150                             }
8151 
8152                             /* get the current tu posx and posy w.r.t to cu */
8153                             curr_pos_x = (ps_tu->s_tu.b4_pos_x << 2) - (cu_pos_x << 3);
8154                             curr_pos_y = (ps_tu->s_tu.b4_pos_y << 2) - (cu_pos_y << 3) +
8155                                          (i4_subtu_idx * trans_size);
8156 
8157                             /* 420sp case only vertical height will be half */
8158                             if(0 == u1_is_422)
8159                             {
8160                                 curr_pos_y >>= 1;
8161                             }
8162 
8163                             /* increment the pointers to start of current TU  */
8164                             pu1_cur_pred_src =
8165                                 ((UWORD8 *)ps_chr_intra_satd_ctxt->pv_pred_data + curr_pos_x);
8166                             pu1_cur_pred_src += (curr_pos_y * pred_src_strd);
8167                             pu1_cur_pred_dest = (pu1_pred + curr_pos_x);
8168                             pu1_cur_pred_dest += (curr_pos_y * pred_strd);
8169 
8170                             pi2_cur_deq_data_src_cb =
8171                                 &ps_chr_intra_satd_ctxt->ai2_iq_data_cb[0] + (curr_pos_x >> 1);
8172                             pi2_cur_deq_data_src_cr =
8173                                 &ps_chr_intra_satd_ctxt->ai2_iq_data_cr[0] + (curr_pos_x >> 1);
8174                             pi2_cur_deq_data_src_cb += (curr_pos_y * deq_src_strd);
8175                             pi2_cur_deq_data_src_cr += (curr_pos_y * deq_src_strd);
8176                             pi2_cur_deq_data_dest = pi2_deq_data + curr_pos_x;
8177                             pi2_cur_deq_data_dest += (curr_pos_y * deq_data_strd);
8178 
8179                             /*Overwriting deq data with that belonging to the winning special mode
8180                             (luma mode !=  chroma mode)
8181                             ihevce_copy_2d takes source and dest arguments as UWORD8 *. We have to
8182                             correspondingly manipulate to copy WORD16 data*/
8183 
8184                             ps_ctxt->s_cmn_opt_func.pf_copy_2d(
8185                                 (UWORD8 *)pi2_cur_deq_data_dest,
8186                                 (deq_data_strd << 1),
8187                                 (UWORD8 *)pi2_cur_deq_data_src_cb,
8188                                 (deq_src_strd << 1),
8189                                 (trans_size << 1),
8190                                 trans_size);
8191 
8192                             ps_ctxt->s_cmn_opt_func.pf_copy_2d(
8193                                 (UWORD8 *)(pi2_cur_deq_data_dest + trans_size),
8194                                 (deq_data_strd << 1),
8195                                 (UWORD8 *)pi2_cur_deq_data_src_cr,
8196                                 (deq_src_strd << 1),
8197                                 (trans_size << 1),
8198                                 trans_size);
8199 
8200                             /*Overwriting pred data with that belonging to the winning special mode
8201                             (luma mode !=  chroma mode)*/
8202 
8203                             ps_ctxt->s_cmn_opt_func.pf_copy_2d(
8204                                 pu1_cur_pred_dest,
8205                                 pred_strd,
8206                                 pu1_cur_pred_src,
8207                                 pred_src_strd,
8208                                 (trans_size << 1),
8209                                 trans_size);
8210 
8211                             num_bytes = ps_chr_intra_satd_ctxt
8212                                             ->ai4_num_bytes_scan_coeff_cb_per_tu[i4_subtu_idx][ctr];
8213                             cbf = ps_chr_intra_satd_ctxt->au1_cbf_cb[i4_subtu_idx][ctr];
8214                             /* inter cu is coded if any of the tu is coded in it */
8215                             ps_best_cu_prms->u1_is_cu_coded |= cbf;
8216 
8217                             /* update CB related params */
8218                             ps_tu->ai4_cb_coeff_offset[i4_subtu_idx] =
8219                                 total_bytes_offset + init_bytes_offset;
8220 
8221                             if(0 == i4_subtu_idx)
8222                             {
8223                                 ps_tu->s_tu.b1_cb_cbf = cbf;
8224                             }
8225                             else
8226                             {
8227                                 ps_tu->s_tu.b1_cb_cbf_subtu1 = cbf;
8228                             }
8229 
8230                             /*Overwriting the cb ecd data corresponding to the special mode*/
8231                             if(0 != num_bytes)
8232                             {
8233                                 memcpy(
8234                                     (pu1_ecd_data + total_bytes_offset),
8235                                     pu1_ecd_data_src_cb + ai4_ecd_data_cb_offset[i4_subtu_idx],
8236                                     num_bytes);
8237                             }
8238 
8239                             total_bytes_offset += num_bytes;
8240                             ai4_ecd_data_cb_offset[i4_subtu_idx] += num_bytes;
8241                             ps_tu_temp_prms->ai2_cb_bytes_consumed[i4_subtu_idx] = num_bytes;
8242 
8243                             num_bytes = ps_chr_intra_satd_ctxt
8244                                             ->ai4_num_bytes_scan_coeff_cr_per_tu[i4_subtu_idx][ctr];
8245                             cbf = ps_chr_intra_satd_ctxt->au1_cbf_cr[i4_subtu_idx][ctr];
8246                             /* inter cu is coded if any of the tu is coded in it */
8247                             ps_best_cu_prms->u1_is_cu_coded |= cbf;
8248 
8249                             /*Overwriting the cr ecd data corresponding to the special mode*/
8250                             if(0 != num_bytes)
8251                             {
8252                                 memcpy(
8253                                     (pu1_ecd_data + total_bytes_offset),
8254                                     pu1_ecd_data_src_cr + ai4_ecd_data_cr_offset[i4_subtu_idx],
8255                                     num_bytes);
8256                             }
8257 
8258                             /* update CR related params */
8259                             ps_tu->ai4_cr_coeff_offset[i4_subtu_idx] =
8260                                 total_bytes_offset + init_bytes_offset;
8261 
8262                             if(0 == i4_subtu_idx)
8263                             {
8264                                 ps_tu->s_tu.b1_cr_cbf = cbf;
8265                             }
8266                             else
8267                             {
8268                                 ps_tu->s_tu.b1_cr_cbf_subtu1 = cbf;
8269                             }
8270 
8271                             total_bytes_offset += num_bytes;
8272                             ai4_ecd_data_cr_offset[i4_subtu_idx] += num_bytes;
8273 
8274                             /*Updating zero rows and zero cols*/
8275                             ps_tu_temp_prms->au4_cb_zero_col[i4_subtu_idx] =
8276                                 ps_chr_intra_satd_ctxt->ai4_zero_col_cb[i4_subtu_idx][ctr];
8277                             ps_tu_temp_prms->au4_cb_zero_row[i4_subtu_idx] =
8278                                 ps_chr_intra_satd_ctxt->ai4_zero_row_cb[i4_subtu_idx][ctr];
8279                             ps_tu_temp_prms->au4_cr_zero_col[i4_subtu_idx] =
8280                                 ps_chr_intra_satd_ctxt->ai4_zero_col_cr[i4_subtu_idx][ctr];
8281                             ps_tu_temp_prms->au4_cr_zero_row[i4_subtu_idx] =
8282                                 ps_chr_intra_satd_ctxt->ai4_zero_row_cr[i4_subtu_idx][ctr];
8283 
8284                             ps_tu_temp_prms->ai2_cr_bytes_consumed[i4_subtu_idx] = num_bytes;
8285 
8286                             if((u1_num_tus > 1) &&
8287                                ps_recon_datastore->au1_is_chromaRecon_available[2])
8288                             {
8289                                 ps_recon_datastore
8290                                     ->au1_bufId_with_winning_ChromaRecon[U_PLANE][ctr]
8291                                                                         [i4_subtu_idx] = 2;
8292                                 ps_recon_datastore
8293                                     ->au1_bufId_with_winning_ChromaRecon[V_PLANE][ctr]
8294                                                                         [i4_subtu_idx] = 2;
8295                             }
8296                             else if(
8297                                 (1 == u1_num_tus) &&
8298                                 ps_recon_datastore->au1_is_chromaRecon_available[1])
8299                             {
8300                                 ps_recon_datastore
8301                                     ->au1_bufId_with_winning_ChromaRecon[U_PLANE][ctr]
8302                                                                         [i4_subtu_idx] = 1;
8303                                 ps_recon_datastore
8304                                     ->au1_bufId_with_winning_ChromaRecon[V_PLANE][ctr]
8305                                                                         [i4_subtu_idx] = 1;
8306                             }
8307                             else
8308                             {
8309                                 ps_recon_datastore
8310                                     ->au1_bufId_with_winning_ChromaRecon[U_PLANE][ctr]
8311                                                                         [i4_subtu_idx] = UCHAR_MAX;
8312                                 ps_recon_datastore
8313                                     ->au1_bufId_with_winning_ChromaRecon[V_PLANE][ctr]
8314                                                                         [i4_subtu_idx] = UCHAR_MAX;
8315                             }
8316                         }
8317                     }
8318 
8319                     /* loop increments */
8320                     ps_tu++;
8321                     ps_tu_temp_prms++;
8322                 }
8323             }
8324 
8325             if(!u1_is_422)
8326             {
8327                 if(chrm_pred_mode == luma_pred_mode)
8328                 {
8329                     ps_best_cu_prms->u1_chroma_intra_pred_mode = 4;
8330                 }
8331                 else if(chrm_pred_mode == 0)
8332                 {
8333                     ps_best_cu_prms->u1_chroma_intra_pred_mode = 0;
8334                 }
8335                 else if(chrm_pred_mode == 1)
8336                 {
8337                     ps_best_cu_prms->u1_chroma_intra_pred_mode = 3;
8338                 }
8339                 else if(chrm_pred_mode == 10)
8340                 {
8341                     ps_best_cu_prms->u1_chroma_intra_pred_mode = 2;
8342                 }
8343                 else if(chrm_pred_mode == 26)
8344                 {
8345                     ps_best_cu_prms->u1_chroma_intra_pred_mode = 1;
8346                 }
8347                 else
8348                 {
8349                     ASSERT(0); /*Should not come here*/
8350                 }
8351             }
8352             else
8353             {
8354                 if(chrm_pred_mode == gau1_chroma422_intra_angle_mapping[luma_pred_mode])
8355                 {
8356                     ps_best_cu_prms->u1_chroma_intra_pred_mode = 4;
8357                 }
8358                 else if(chrm_pred_mode == gau1_chroma422_intra_angle_mapping[0])
8359                 {
8360                     ps_best_cu_prms->u1_chroma_intra_pred_mode = 0;
8361                 }
8362                 else if(chrm_pred_mode == gau1_chroma422_intra_angle_mapping[1])
8363                 {
8364                     ps_best_cu_prms->u1_chroma_intra_pred_mode = 3;
8365                 }
8366                 else if(chrm_pred_mode == gau1_chroma422_intra_angle_mapping[10])
8367                 {
8368                     ps_best_cu_prms->u1_chroma_intra_pred_mode = 2;
8369                 }
8370                 else if(chrm_pred_mode == gau1_chroma422_intra_angle_mapping[26])
8371                 {
8372                     ps_best_cu_prms->u1_chroma_intra_pred_mode = 1;
8373                 }
8374                 else
8375                 {
8376                     ASSERT(0); /*Should not come here*/
8377                 }
8378             }
8379         }
8380 
8381         /* Store the actual chroma mode */
8382         ps_best_cu_prms->u1_chroma_intra_pred_actual_mode = chrm_pred_mode;
8383     }
8384 
8385     /* update the total bytes produced */
8386     ps_best_cu_prms->i4_num_bytes_ecd_data = total_bytes_offset + init_bytes_offset;
8387 
8388     /* store the final chrm bits accumulated */
8389     *pi4_chrm_tu_bits = chrm_tu_bits;
8390 
8391     return (chrm_cod_cost);
8392 }
8393 
8394 /*!
8395 ******************************************************************************
8396 * \if Function name : ihevce_final_rdopt_mode_prcs \endif
8397 *
8398 * \brief
8399 *    Final RDOPT mode process function. Performs Recon computation for the
8400 *    final mode. Re-use or Compute pred, iq-data, coeff based on the flags.
8401 *
8402 * \param[in] ps_ctxt : pointer to enc_loop module context
8403 * \param[in] ps_prms : pointer to struct containing requisite parameters
8404 *
8405 * \return
8406 *    None
8407 *
8408 * \author
8409 *  Ittiam
8410 *
8411 *****************************************************************************
8412 */
ihevce_final_rdopt_mode_prcs(ihevce_enc_loop_ctxt_t * ps_ctxt,final_mode_process_prms_t * ps_prms)8413 void ihevce_final_rdopt_mode_prcs(
8414     ihevce_enc_loop_ctxt_t *ps_ctxt, final_mode_process_prms_t *ps_prms)
8415 {
8416     enc_loop_cu_final_prms_t *ps_best_cu_prms;
8417     tu_enc_loop_out_t *ps_tu_enc_loop;
8418     tu_enc_loop_temp_prms_t *ps_tu_enc_loop_temp_prms;
8419     nbr_avail_flags_t s_nbr;
8420     recon_datastore_t *ps_recon_datastore;
8421 
8422     ihevc_intra_pred_luma_ref_substitution_ft *ihevc_intra_pred_luma_ref_substitution_fptr;
8423     ihevc_intra_pred_chroma_ref_substitution_ft *ihevc_intra_pred_chroma_ref_substitution_fptr;
8424     ihevc_intra_pred_ref_filtering_ft *ihevc_intra_pred_ref_filtering_fptr;
8425 
8426     WORD32 num_tu_in_cu;
8427     LWORD64 rd_opt_cost;
8428     WORD32 ctr;
8429     WORD32 i4_subtu_idx;
8430     WORD32 cu_size;
8431     WORD32 cu_pos_x, cu_pos_y;
8432     WORD32 chrm_present_flag = 1;
8433     WORD32 num_bytes, total_bytes = 0;
8434     WORD32 chrm_ctr = 0;
8435     WORD32 u1_is_cu_coded;
8436     UWORD8 *pu1_old_ecd_data;
8437     UWORD8 *pu1_chrm_old_ecd_data;
8438     UWORD8 *pu1_cur_pred;
8439     WORD16 *pi2_deq_data;
8440     WORD16 *pi2_chrm_deq_data;
8441     WORD16 *pi2_cur_deq_data;
8442     WORD16 *pi2_cur_deq_data_chrm;
8443     UWORD8 *pu1_cur_luma_recon;
8444     UWORD8 *pu1_cur_chroma_recon;
8445     UWORD8 *pu1_cur_src;
8446     UWORD8 *pu1_cur_src_chrm;
8447     UWORD8 *pu1_cur_pred_chrm;
8448     UWORD8 *pu1_intra_pred_mode;
8449     UWORD32 *pu4_nbr_flags;
8450     LWORD64 i8_ssd;
8451 
8452     cu_nbr_prms_t *ps_cu_nbr_prms = ps_prms->ps_cu_nbr_prms;
8453     cu_inter_cand_t *ps_best_inter_cand = ps_prms->ps_best_inter_cand;
8454     enc_loop_chrm_cu_buf_prms_t *ps_chrm_cu_buf_prms = ps_prms->ps_chrm_cu_buf_prms;
8455 
8456     WORD32 packed_pred_mode = ps_prms->packed_pred_mode;
8457     WORD32 rd_opt_best_idx = ps_prms->rd_opt_best_idx;
8458     UWORD8 *pu1_src = (UWORD8 *)ps_prms->pv_src;
8459     WORD32 src_strd = ps_prms->src_strd;
8460     UWORD8 *pu1_pred = (UWORD8 *)ps_prms->pv_pred;
8461     WORD32 pred_strd = ps_prms->pred_strd;
8462     UWORD8 *pu1_pred_chrm = (UWORD8 *)ps_prms->pv_pred_chrm;
8463     WORD32 pred_chrm_strd = ps_prms->pred_chrm_strd;
8464     UWORD8 *pu1_final_ecd_data = ps_prms->pu1_final_ecd_data;
8465     UWORD8 *pu1_csbf_buf = ps_prms->pu1_csbf_buf;
8466     WORD32 csbf_strd = ps_prms->csbf_strd;
8467     UWORD8 *pu1_luma_recon = (UWORD8 *)ps_prms->pv_luma_recon;
8468     WORD32 recon_luma_strd = ps_prms->recon_luma_strd;
8469     UWORD8 *pu1_chrm_recon = (UWORD8 *)ps_prms->pv_chrm_recon;
8470     WORD32 recon_chrma_strd = ps_prms->recon_chrma_strd;
8471     UWORD8 u1_cu_pos_x = ps_prms->u1_cu_pos_x;
8472     UWORD8 u1_cu_pos_y = ps_prms->u1_cu_pos_y;
8473     UWORD8 u1_cu_size = ps_prms->u1_cu_size;
8474     WORD8 i1_cu_qp = ps_prms->i1_cu_qp;
8475     UWORD8 u1_is_422 = (ps_ctxt->u1_chroma_array_type == 2);
8476     UWORD8 u1_num_subtus = (u1_is_422 == 1) + 1;
8477     /* Get the Chroma pointer and parameters */
8478     UWORD8 *pu1_src_chrm = ps_chrm_cu_buf_prms->pu1_curr_src;
8479     WORD32 src_chrm_strd = ps_chrm_cu_buf_prms->i4_chrm_src_stride;
8480     UWORD8 u1_compute_spatial_ssd_luma = 0;
8481     UWORD8 u1_compute_spatial_ssd_chroma = 0;
8482     /* Get the pointer for function selector */
8483     ihevc_intra_pred_luma_ref_substitution_fptr =
8484         ps_ctxt->ps_func_selector->ihevc_intra_pred_luma_ref_substitution_fptr;
8485 
8486     ihevc_intra_pred_ref_filtering_fptr =
8487         ps_ctxt->ps_func_selector->ihevc_intra_pred_ref_filtering_fptr;
8488 
8489     ihevc_intra_pred_chroma_ref_substitution_fptr =
8490         ps_ctxt->ps_func_selector->ihevc_intra_pred_chroma_ref_substitution_fptr;
8491 
8492     /* Get the best CU parameters */
8493     ps_best_cu_prms = &ps_ctxt->as_cu_prms[rd_opt_best_idx];
8494     num_tu_in_cu = ps_best_cu_prms->u2_num_tus_in_cu;
8495     cu_size = ps_best_cu_prms->u1_cu_size;
8496     cu_pos_x = u1_cu_pos_x;
8497     cu_pos_y = u1_cu_pos_y;
8498     pu1_intra_pred_mode = &ps_best_cu_prms->au1_intra_pred_mode[0];
8499     pu4_nbr_flags = &ps_best_cu_prms->au4_nbr_flags[0];
8500     ps_recon_datastore = &ps_best_cu_prms->s_recon_datastore;
8501 
8502     /* get the first TU pointer */
8503     ps_tu_enc_loop = &ps_best_cu_prms->as_tu_enc_loop[0];
8504     /* get the first TU only enc_loop prms pointer */
8505     ps_tu_enc_loop_temp_prms = &ps_best_cu_prms->as_tu_enc_loop_temp_prms[0];
8506     /*modify quant related param in ctxt based on current cu qp*/
8507     if((ps_ctxt->i1_cu_qp_delta_enable))
8508     {
8509         /*recompute quant related param at every cu level*/
8510         ihevce_compute_quant_rel_param(ps_ctxt, i1_cu_qp);
8511 
8512         /* get frame level lambda params */
8513         ihevce_get_cl_cu_lambda_prms(
8514             ps_ctxt, MODULATE_LAMDA_WHEN_SPATIAL_MOD_ON ? i1_cu_qp : ps_ctxt->i4_frame_qp);
8515     }
8516 
8517     ps_best_cu_prms->i8_cu_ssd = 0;
8518     ps_best_cu_prms->u4_cu_open_intra_sad = 0;
8519 
8520     /* For skip case : Set TU_size = CU_size and make cbf = 0
8521     so that same TU loop can be used for all modes */
8522     if(PRED_MODE_SKIP == packed_pred_mode)
8523     {
8524         for(ctr = 0; ctr < num_tu_in_cu; ctr++)
8525         {
8526             ps_tu_enc_loop->s_tu.b1_y_cbf = 0;
8527 
8528             ps_tu_enc_loop_temp_prms->i2_luma_bytes_consumed = 0;
8529 
8530             ps_tu_enc_loop++;
8531             ps_tu_enc_loop_temp_prms++;
8532         }
8533 
8534         /* go back to the first TU pointer */
8535         ps_tu_enc_loop = &ps_best_cu_prms->as_tu_enc_loop[0];
8536         ps_tu_enc_loop_temp_prms = &ps_best_cu_prms->as_tu_enc_loop_temp_prms[0];
8537     }
8538     /**   For inter case, pred calculation is outside the loop     **/
8539     if(PRED_MODE_INTRA != packed_pred_mode)
8540     {
8541         /**------------- Compute pred data if required --------------**/
8542         if((1 == ps_ctxt->s_cu_final_recon_flags.u1_eval_luma_pred_data))
8543         {
8544             nbr_4x4_t *ps_topleft_nbr_4x4;
8545             nbr_4x4_t *ps_left_nbr_4x4;
8546             nbr_4x4_t *ps_top_nbr_4x4;
8547             WORD32 nbr_4x4_left_strd;
8548 
8549             ps_best_inter_cand->pu1_pred_data = pu1_pred;
8550             ps_best_inter_cand->i4_pred_data_stride = pred_strd;
8551 
8552             /* Get the CU nbr information */
8553             ps_topleft_nbr_4x4 = ps_cu_nbr_prms->ps_topleft_nbr_4x4;
8554             ps_left_nbr_4x4 = ps_cu_nbr_prms->ps_left_nbr_4x4;
8555             ps_top_nbr_4x4 = ps_cu_nbr_prms->ps_top_nbr_4x4;
8556             nbr_4x4_left_strd = ps_cu_nbr_prms->nbr_4x4_left_strd;
8557 
8558             /* MVP ,MVD calc and Motion compensation */
8559             rd_opt_cost = ((pf_inter_rdopt_cu_mc_mvp)ps_ctxt->pv_inter_rdopt_cu_mc_mvp)(
8560                 ps_ctxt,
8561                 ps_best_inter_cand,
8562                 u1_cu_size,
8563                 cu_pos_x,
8564                 cu_pos_y,
8565                 ps_left_nbr_4x4,
8566                 ps_top_nbr_4x4,
8567                 ps_topleft_nbr_4x4,
8568                 nbr_4x4_left_strd,
8569                 rd_opt_best_idx);
8570         }
8571 
8572         /** ------ Motion Compensation for Chroma -------- **/
8573         if(1 == ps_ctxt->s_cu_final_recon_flags.u1_eval_chroma_pred_data)
8574         {
8575             UWORD8 *pu1_cur_pred;
8576             pu1_cur_pred = pu1_pred_chrm;
8577 
8578             /* run a loop over all the partitons in cu */
8579             for(ctr = 0; ctr < ps_best_cu_prms->u2_num_pus_in_cu; ctr++)
8580             {
8581                 pu_t *ps_pu;
8582                 WORD32 inter_pu_wd, inter_pu_ht;
8583 
8584                 ps_pu = &ps_best_cu_prms->as_pu_chrm_proc[ctr];
8585 
8586                 /* IF AMP then each partitions can have diff wd ht */
8587                 inter_pu_wd = (ps_pu->b4_wd + 1) << 2; /* cb and cr pixel interleaved */
8588                 inter_pu_ht = ((ps_pu->b4_ht + 1) << 2) >> 1;
8589                 inter_pu_ht <<= u1_is_422;
8590                 /* chroma mc func */
8591                 ihevce_chroma_inter_pred_pu(
8592                     &ps_ctxt->s_mc_ctxt, ps_pu, pu1_cur_pred, pred_chrm_strd);
8593                 if(2 == ps_best_cu_prms->u2_num_pus_in_cu)
8594                 {
8595                     /* 2Nx__ partion case */
8596                     if(inter_pu_wd == ps_best_cu_prms->u1_cu_size)
8597                     {
8598                         pu1_cur_pred += (inter_pu_ht * pred_chrm_strd);
8599                     }
8600                     /* __x2N partion case */
8601                     if(inter_pu_ht == (ps_best_cu_prms->u1_cu_size >> (u1_is_422 == 0)))
8602                     {
8603                         pu1_cur_pred += inter_pu_wd;
8604                     }
8605                 }
8606             }
8607         }
8608     }
8609     pi2_deq_data = &ps_best_cu_prms->pi2_cu_deq_coeffs[0];
8610     pi2_chrm_deq_data =
8611         &ps_best_cu_prms->pi2_cu_deq_coeffs[0] + ps_best_cu_prms->i4_chrm_deq_coeff_strt_idx;
8612     pu1_old_ecd_data = &ps_best_cu_prms->pu1_cu_coeffs[0];
8613     pu1_chrm_old_ecd_data =
8614         &ps_best_cu_prms->pu1_cu_coeffs[0] + ps_best_cu_prms->i4_chrm_cu_coeff_strt_idx;
8615 
8616     /* default value for cu coded flag */
8617     u1_is_cu_coded = 0;
8618 
8619     /* If we are re-computing coeff, set sad to 0 and start accumulating */
8620     /* else use the best cand. sad from RDOPT stage                    */
8621     if(1 == ps_tu_enc_loop_temp_prms->b1_eval_luma_iq_and_coeff_data)
8622     {
8623         /*init of ssd of CU accuumulated over all TU*/
8624         ps_best_cu_prms->u4_cu_sad = 0;
8625 
8626         /* reset the luma residual bits */
8627         ps_best_cu_prms->u4_cu_luma_res_bits = 0;
8628     }
8629 
8630     if(1 == ps_tu_enc_loop_temp_prms->b1_eval_chroma_iq_and_coeff_data)
8631     {
8632         /* reset the chroma residual bits */
8633         ps_best_cu_prms->u4_cu_chroma_res_bits = 0;
8634     }
8635 
8636     if((1 == ps_tu_enc_loop_temp_prms->b1_eval_luma_iq_and_coeff_data) ||
8637        (1 == ps_tu_enc_loop_temp_prms->b1_eval_chroma_iq_and_coeff_data))
8638     {
8639         /*Header bits have to be reevaluated if luma and chroma reevaluation is done, as
8640         the quantized coefficients might be changed.
8641         We are copying only those states which correspond to the header from the cabac state
8642         of the previous CU, because the header is going to be recomputed for this condition*/
8643         ps_ctxt->s_cu_final_recon_flags.u1_eval_header_data = 1;
8644         memcpy(
8645             &ps_ctxt->au1_rdopt_init_ctxt_models[0],
8646             &ps_ctxt->s_rdopt_entropy_ctxt.au1_init_cabac_ctxt_states[0],
8647             IHEVC_CAB_COEFFX_PREFIX);
8648 
8649         if((1 == ps_tu_enc_loop_temp_prms->b1_eval_luma_iq_and_coeff_data))
8650         {
8651             COPY_CABAC_STATES_FRM_CAB_COEFFX_PREFIX(
8652                 (&ps_ctxt->au1_rdopt_init_ctxt_models[0] + IHEVC_CAB_COEFFX_PREFIX),
8653                 (&ps_ctxt->s_rdopt_entropy_ctxt.au1_init_cabac_ctxt_states[0] +
8654                  IHEVC_CAB_COEFFX_PREFIX),
8655                 (IHEVC_CAB_CTXT_END - IHEVC_CAB_COEFFX_PREFIX));
8656         }
8657         else
8658         {
8659             COPY_CABAC_STATES_FRM_CAB_COEFFX_PREFIX(
8660                 (&ps_ctxt->au1_rdopt_init_ctxt_models[0] + IHEVC_CAB_COEFFX_PREFIX),
8661                 (&ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[rd_opt_best_idx]
8662                       .s_cabac_ctxt.au1_ctxt_models[0] +
8663                  IHEVC_CAB_COEFFX_PREFIX),
8664                 (IHEVC_CAB_CTXT_END - IHEVC_CAB_COEFFX_PREFIX));
8665         }
8666         ps_ctxt->s_rdopt_entropy_ctxt.i4_curr_buf_idx = rd_opt_best_idx;
8667     }
8668     else
8669     {
8670         ps_ctxt->s_cu_final_recon_flags.u1_eval_header_data = 0;
8671     }
8672 
8673     /* Zero cbf tool is disabled for intra CUs */
8674     if(PRED_MODE_INTRA == packed_pred_mode)
8675     {
8676 #if ENABLE_ZERO_CBF_IN_INTRA
8677         ps_ctxt->i4_zcbf_rdo_level = ZCBF_ENABLE;
8678 #else
8679         ps_ctxt->i4_zcbf_rdo_level = NO_ZCBF;
8680 #endif
8681     }
8682     else
8683     {
8684 #if DISABLE_ZERO_ZBF_IN_INTER
8685         ps_ctxt->i4_zcbf_rdo_level = NO_ZCBF;
8686 #else
8687         ps_ctxt->i4_zcbf_rdo_level = ZCBF_ENABLE;
8688 #endif
8689     }
8690 
8691     /** Loop for all tu blocks in current cu and do reconstruction **/
8692     for(ctr = 0; ctr < num_tu_in_cu; ctr++)
8693     {
8694         tu_t *ps_tu;
8695         WORD32 trans_size, num_4x4_in_tu;
8696         WORD32 cbf, zero_rows, zero_cols;
8697         WORD32 cu_pos_x_in_4x4, cu_pos_y_in_4x4;
8698         WORD32 cu_pos_x_in_pix, cu_pos_y_in_pix;
8699         WORD32 luma_pred_mode, chroma_pred_mode = 0;
8700         UWORD8 au1_is_recon_available[2];
8701 
8702         ps_tu = &(ps_tu_enc_loop->s_tu); /* Points to the TU property ctxt */
8703 
8704         u1_compute_spatial_ssd_luma = 0;
8705         u1_compute_spatial_ssd_chroma = 0;
8706 
8707         trans_size = 1 << (ps_tu->b3_size + 2);
8708         num_4x4_in_tu = (trans_size >> 2);
8709         cu_pos_x_in_4x4 = ps_tu->b4_pos_x;
8710         cu_pos_y_in_4x4 = ps_tu->b4_pos_y;
8711 
8712         /* populate the coeffs scan idx */
8713         ps_ctxt->i4_scan_idx = SCAN_DIAG_UPRIGHT;
8714 
8715         /* get the current pos x and pos y in pixels */
8716         cu_pos_x_in_pix = (cu_pos_x_in_4x4 << 2) - (cu_pos_x << 3);
8717         cu_pos_y_in_pix = (cu_pos_y_in_4x4 << 2) - (cu_pos_y << 3);
8718 
8719         /* Update pointers based on the location */
8720         pu1_cur_src = pu1_src + cu_pos_x_in_pix;
8721         pu1_cur_src += (cu_pos_y_in_pix * src_strd);
8722         pu1_cur_pred = pu1_pred + cu_pos_x_in_pix;
8723         pu1_cur_pred += (cu_pos_y_in_pix * pred_strd);
8724 
8725         pu1_cur_luma_recon = pu1_luma_recon + cu_pos_x_in_pix;
8726         pu1_cur_luma_recon += (cu_pos_y_in_pix * recon_luma_strd);
8727 
8728         pi2_cur_deq_data = pi2_deq_data + cu_pos_x_in_pix;
8729         pi2_cur_deq_data += cu_pos_y_in_pix * cu_size;
8730 
8731         pu1_cur_src_chrm = pu1_src_chrm + cu_pos_x_in_pix;
8732         pu1_cur_src_chrm += ((cu_pos_y_in_pix >> 1) * src_chrm_strd) +
8733                             (u1_is_422 * ((cu_pos_y_in_pix >> 1) * src_chrm_strd));
8734 
8735         pu1_cur_pred_chrm = pu1_pred_chrm + cu_pos_x_in_pix;
8736         pu1_cur_pred_chrm += ((cu_pos_y_in_pix >> 1) * pred_chrm_strd) +
8737                              (u1_is_422 * ((cu_pos_y_in_pix >> 1) * pred_chrm_strd));
8738 
8739         pu1_cur_chroma_recon = pu1_chrm_recon + cu_pos_x_in_pix;
8740         pu1_cur_chroma_recon += ((cu_pos_y_in_pix >> 1) * recon_chrma_strd) +
8741                                 (u1_is_422 * ((cu_pos_y_in_pix >> 1) * recon_chrma_strd));
8742 
8743         pi2_cur_deq_data_chrm = pi2_chrm_deq_data + cu_pos_x_in_pix;
8744         pi2_cur_deq_data_chrm +=
8745             ((cu_pos_y_in_pix >> 1) * cu_size) + (u1_is_422 * ((cu_pos_y_in_pix >> 1) * cu_size));
8746 
8747         /* if transform size is 4x4 then only first luma 4x4 will have chroma*/
8748         chrm_present_flag = 1; /* by default chroma present is set to 1*/
8749 
8750         if(4 == trans_size)
8751         {
8752             /* if tusize is 4x4 then only first luma 4x4 will have chroma*/
8753             if(0 != chrm_ctr)
8754             {
8755                 chrm_present_flag = INTRA_PRED_CHROMA_IDX_NONE;
8756             }
8757 
8758             /* increment the chrm ctr unconditionally */
8759             chrm_ctr++;
8760             /* after ctr reached 4 reset it */
8761             if(4 == chrm_ctr)
8762             {
8763                 chrm_ctr = 0;
8764             }
8765         }
8766 
8767         /**------------- Compute pred data if required --------------**/
8768         if(PRED_MODE_INTRA == packed_pred_mode) /* Inter pred calc. is done outside loop */
8769         {
8770             /* Get the pred mode for scan idx calculation, even if pred is not required */
8771             luma_pred_mode = *pu1_intra_pred_mode;
8772 
8773             if((ps_ctxt->i4_rc_pass == 1) ||
8774                (1 == ps_ctxt->s_cu_final_recon_flags.u1_eval_luma_pred_data))
8775             {
8776                 WORD32 nbr_flags;
8777                 WORD32 luma_pred_func_idx;
8778                 UWORD8 *pu1_left;
8779                 UWORD8 *pu1_top;
8780                 UWORD8 *pu1_top_left;
8781                 WORD32 left_strd;
8782 
8783                 /* left cu boundary */
8784                 if(0 == cu_pos_x_in_pix)
8785                 {
8786                     left_strd = ps_cu_nbr_prms->cu_left_stride;
8787                     pu1_left = ps_cu_nbr_prms->pu1_cu_left + cu_pos_y_in_pix * left_strd;
8788                 }
8789                 else
8790                 {
8791                     pu1_left = pu1_cur_luma_recon - 1;
8792                     left_strd = recon_luma_strd;
8793                 }
8794 
8795                 /* top cu boundary */
8796                 if(0 == cu_pos_y_in_pix)
8797                 {
8798                     pu1_top = ps_cu_nbr_prms->pu1_cu_top + cu_pos_x_in_pix;
8799                 }
8800                 else
8801                 {
8802                     pu1_top = pu1_cur_luma_recon - recon_luma_strd;
8803                 }
8804 
8805                 /* by default top left is set to cu top left */
8806                 pu1_top_left = ps_cu_nbr_prms->pu1_cu_top_left;
8807 
8808                 /* top left based on position */
8809                 if((0 != cu_pos_y_in_pix) && (0 == cu_pos_x_in_pix))
8810                 {
8811                     pu1_top_left = pu1_left - left_strd;
8812                 }
8813                 else if(0 != cu_pos_x_in_pix)
8814                 {
8815                     pu1_top_left = pu1_top - 1;
8816                 }
8817 
8818                 /* get the neighbour availability flags */
8819                 nbr_flags = ihevce_get_nbr_intra(
8820                     &s_nbr,
8821                     ps_ctxt->pu1_ctb_nbr_map,
8822                     ps_ctxt->i4_nbr_map_strd,
8823                     cu_pos_x_in_4x4,
8824                     cu_pos_y_in_4x4,
8825                     num_4x4_in_tu);
8826 
8827                 if(1 == ps_ctxt->s_cu_final_recon_flags.u1_eval_luma_pred_data)
8828                 {
8829                     /* copy the nbr flags for chroma reuse */
8830                     if(4 != trans_size)
8831                     {
8832                         *pu4_nbr_flags = nbr_flags;
8833                     }
8834                     else if(1 == chrm_present_flag)
8835                     {
8836                         /* compute the avail flags assuming luma trans is 8x8 */
8837                         /* get the neighbour availability flags */
8838                         *pu4_nbr_flags = ihevce_get_nbr_intra_mxn_tu(
8839                             ps_ctxt->pu1_ctb_nbr_map,
8840                             ps_ctxt->i4_nbr_map_strd,
8841                             cu_pos_x_in_4x4,
8842                             cu_pos_y_in_4x4,
8843                             (num_4x4_in_tu << 1),
8844                             (num_4x4_in_tu << 1));
8845                     }
8846 
8847                     /* call reference array substitution */
8848                     ihevc_intra_pred_luma_ref_substitution_fptr(
8849                         pu1_top_left,
8850                         pu1_top,
8851                         pu1_left,
8852                         left_strd,
8853                         trans_size,
8854                         nbr_flags,
8855                         (UWORD8 *)ps_ctxt->pv_ref_sub_out,
8856                         1);
8857 
8858                     /* call reference filtering */
8859                     ihevc_intra_pred_ref_filtering_fptr(
8860                         (UWORD8 *)ps_ctxt->pv_ref_sub_out,
8861                         trans_size,
8862                         (UWORD8 *)ps_ctxt->pv_ref_filt_out,
8863                         luma_pred_mode,
8864                         ps_ctxt->i1_strong_intra_smoothing_enable_flag);
8865 
8866                     /* use the look up to get the function idx */
8867                     luma_pred_func_idx = g_i4_ip_funcs[luma_pred_mode];
8868 
8869                     /* call the intra prediction function */
8870                     ps_ctxt->apf_lum_ip[luma_pred_func_idx](
8871                         (UWORD8 *)ps_ctxt->pv_ref_filt_out,
8872                         1,
8873                         pu1_cur_pred,
8874                         pred_strd,
8875                         trans_size,
8876                         luma_pred_mode);
8877                 }
8878             }
8879             else if(
8880                 (1 == chrm_present_flag) &&
8881                 (1 == ps_ctxt->s_cu_final_recon_flags.u1_eval_chroma_pred_data))
8882             {
8883                 WORD32 temp_num_4x4_in_tu = num_4x4_in_tu;
8884 
8885                 if(4 == trans_size) /* compute the avail flags assuming luma trans is 8x8 */
8886                 {
8887                     temp_num_4x4_in_tu = num_4x4_in_tu << 1;
8888                 }
8889 
8890                 *pu4_nbr_flags = ihevce_get_nbr_intra_mxn_tu(
8891                     ps_ctxt->pu1_ctb_nbr_map,
8892                     ps_ctxt->i4_nbr_map_strd,
8893                     cu_pos_x_in_4x4,
8894                     cu_pos_y_in_4x4,
8895                     temp_num_4x4_in_tu,
8896                     temp_num_4x4_in_tu);
8897             }
8898 
8899             /* Get the pred mode for scan idx calculation, even if pred is not required */
8900             chroma_pred_mode = ps_best_cu_prms->u1_chroma_intra_pred_actual_mode;
8901         }
8902 
8903         if(1 == ps_tu_enc_loop_temp_prms->b1_eval_luma_iq_and_coeff_data)
8904         {
8905             WORD32 temp_bits;
8906             LWORD64 temp_cost;
8907             UWORD32 u4_tu_sad;
8908             WORD32 perform_sbh, perform_rdoq;
8909 
8910             if(PRED_MODE_INTRA == packed_pred_mode)
8911             {
8912                 /* for luma 4x4 and 8x8 transforms based on intra pred mode scan is choosen*/
8913                 if(trans_size < 16)
8914                 {
8915                     /* for modes from 22 upto 30 horizontal scan is used */
8916                     if((luma_pred_mode > 21) && (luma_pred_mode < 31))
8917                     {
8918                         ps_ctxt->i4_scan_idx = SCAN_HORZ;
8919                     }
8920                     /* for modes from 6 upto 14 vertical scan is used */
8921                     else if((luma_pred_mode > 5) && (luma_pred_mode < 15))
8922                     {
8923                         ps_ctxt->i4_scan_idx = SCAN_VERT;
8924                     }
8925                 }
8926             }
8927 
8928             /* RDOPT copy States :  TU init (best until prev TU) to current */
8929             COPY_CABAC_STATES_FRM_CAB_COEFFX_PREFIX(
8930                 &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[rd_opt_best_idx]
8931                         .s_cabac_ctxt.au1_ctxt_models[0] +
8932                     IHEVC_CAB_COEFFX_PREFIX,
8933                 &ps_ctxt->au1_rdopt_init_ctxt_models[0] + IHEVC_CAB_COEFFX_PREFIX,
8934                 IHEVC_CAB_CTXT_END - IHEVC_CAB_COEFFX_PREFIX);
8935 
8936             if(ps_prms->u1_recompute_sbh_and_rdoq)
8937             {
8938                 perform_sbh = (ps_ctxt->i4_sbh_level != NO_SBH);
8939                 perform_rdoq = (ps_ctxt->i4_rdoq_level != NO_RDOQ);
8940             }
8941             else
8942             {
8943                 /* RDOQ will change the coefficients. If coefficients are changed, we will have to do sbh again*/
8944                 perform_sbh = ps_ctxt->s_rdoq_sbh_ctxt.i4_perform_best_cand_sbh;
8945                 /* To do SBH we need the quant and iquant data. This would mean we need to do quantization again, which would mean
8946                 we would have to do RDOQ again.*/
8947                 perform_rdoq = ps_ctxt->s_rdoq_sbh_ctxt.i4_perform_best_cand_rdoq;
8948             }
8949 
8950 #if DISABLE_RDOQ_INTRA
8951             if(PRED_MODE_INTRA == packed_pred_mode)
8952             {
8953                 perform_rdoq = 0;
8954             }
8955 #endif
8956             /* If BEST candidate RDOQ is enabled, either no coef level rdoq or CU level rdoq has to be enabled
8957             so that all candidates and best candidate are quantized with same rounding factor  */
8958             if(1 == perform_rdoq)
8959             {
8960                 ASSERT(ps_ctxt->i4_quant_rounding_level != TU_LEVEL_QUANT_ROUNDING);
8961             }
8962 
8963             cbf = ihevce_t_q_iq_ssd_scan_fxn(
8964                 ps_ctxt,
8965                 pu1_cur_pred,
8966                 pred_strd,
8967                 pu1_cur_src,
8968                 src_strd,
8969                 pi2_cur_deq_data,
8970                 cu_size, /*deq_data stride is cu_size*/
8971                 pu1_cur_luma_recon,
8972                 recon_luma_strd,
8973                 pu1_final_ecd_data,
8974                 pu1_csbf_buf,
8975                 csbf_strd,
8976                 trans_size,
8977                 packed_pred_mode,
8978                 &temp_cost,
8979                 &num_bytes,
8980                 &temp_bits,
8981                 &u4_tu_sad,
8982                 &zero_cols,
8983                 &zero_rows,
8984                 &au1_is_recon_available[0],
8985                 perform_rdoq,  //(BEST_CAND_RDOQ == ps_ctxt->i4_rdoq_level),
8986                 perform_sbh,
8987 #if USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
8988                 !ps_ctxt->u1_is_refPic ? ALPHA_FOR_NOISE_TERM_IN_RDOPT
8989                                        : ((100 - ALPHA_DISCOUNT_IN_REF_PICS_IN_RDOPT) *
8990                                           (double)ALPHA_FOR_NOISE_TERM_IN_RDOPT) /
8991                                              100.0,
8992                 ps_prms->u1_is_cu_noisy,
8993 #endif
8994                 u1_compute_spatial_ssd_luma ? SPATIAL_DOMAIN_SSD : FREQUENCY_DOMAIN_SSD,
8995                 1 /*early cbf*/
8996             );  //(BEST_CAND_SBH == ps_ctxt->i4_sbh_level));
8997 
8998             /* Accumulate luma residual bits */
8999             ps_best_cu_prms->u4_cu_luma_res_bits += temp_bits;
9000 
9001             /* RDOPT copy States :  New updated after curr TU to TU init */
9002             if(0 != cbf)
9003             {
9004                 /* update to new state only if CBF is non zero */
9005                 COPY_CABAC_STATES_FRM_CAB_COEFFX_PREFIX(
9006                     &ps_ctxt->au1_rdopt_init_ctxt_models[0] + IHEVC_CAB_COEFFX_PREFIX,
9007                     &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[rd_opt_best_idx]
9008                             .s_cabac_ctxt.au1_ctxt_models[0] +
9009                         IHEVC_CAB_COEFFX_PREFIX,
9010                     IHEVC_CAB_CTXT_END - IHEVC_CAB_COEFFX_PREFIX);
9011             }
9012 
9013             /* accumulate the TU sad into cu sad */
9014             ps_best_cu_prms->u4_cu_sad += u4_tu_sad;
9015             ps_tu->b1_y_cbf = cbf;
9016             ps_tu_enc_loop_temp_prms->i2_luma_bytes_consumed = num_bytes;
9017 
9018             /* If somebody updates cbf (RDOQ or SBH), update in nbr str. for BS */
9019             if((ps_prms->u1_will_cabac_state_change) && (!ps_prms->u1_is_first_pass))
9020             {
9021                 WORD32 num_4x4_in_cu = u1_cu_size >> 2;
9022                 nbr_4x4_t *ps_cur_nbr_4x4 = &ps_ctxt->as_cu_nbr[rd_opt_best_idx][0];
9023                 ps_cur_nbr_4x4 = (ps_cur_nbr_4x4 + (cu_pos_x_in_pix >> 2));
9024                 ps_cur_nbr_4x4 += ((cu_pos_y_in_pix >> 2) * num_4x4_in_cu);
9025                 /* replicate the nbr 4x4 structure for all 4x4 blocks of the current TU */
9026                 ps_cur_nbr_4x4->b1_y_cbf = cbf;
9027                 /*copy the cu qp. This will be overwritten by qp calculated based on skip flag at final stage of cu mode decide*/
9028                 ps_cur_nbr_4x4->b8_qp = ps_ctxt->i4_cu_qp;
9029                 /* Qp and cbf are stored for the all 4x4 in TU */
9030                 {
9031                     WORD32 i, j;
9032                     nbr_4x4_t *ps_tmp_4x4;
9033                     ps_tmp_4x4 = ps_cur_nbr_4x4;
9034 
9035                     for(i = 0; i < num_4x4_in_tu; i++)
9036                     {
9037                         for(j = 0; j < num_4x4_in_tu; j++)
9038                         {
9039                             ps_tmp_4x4[j].b8_qp = ps_ctxt->i4_cu_qp;
9040                             ps_tmp_4x4[j].b1_y_cbf = cbf;
9041                         }
9042                         /* row level update*/
9043                         ps_tmp_4x4 += num_4x4_in_cu;
9044                     }
9045                 }
9046             }
9047         }
9048         else
9049         {
9050             zero_cols = ps_tu_enc_loop_temp_prms->u4_luma_zero_col;
9051             zero_rows = ps_tu_enc_loop_temp_prms->u4_luma_zero_row;
9052 
9053             if(ps_prms->u1_will_cabac_state_change)
9054             {
9055                 num_bytes = ps_tu_enc_loop_temp_prms->i2_luma_bytes_consumed;
9056             }
9057             else
9058             {
9059                 num_bytes = 0;
9060             }
9061 
9062             /* copy luma ecd data to final buffer */
9063             memcpy(pu1_final_ecd_data, pu1_old_ecd_data, num_bytes);
9064 
9065             pu1_old_ecd_data += num_bytes;
9066 
9067             au1_is_recon_available[0] = 0;
9068         }
9069 
9070         /**-------- Compute Recon data (Do IT & Recon) : Luma  -----------**/
9071         if(ps_ctxt->s_cu_final_recon_flags.u1_eval_recon_data &&
9072            (!u1_compute_spatial_ssd_luma ||
9073             (!au1_is_recon_available[0] && u1_compute_spatial_ssd_luma)))
9074         {
9075             if(!ps_recon_datastore->u1_is_lumaRecon_available ||
9076                (ps_recon_datastore->u1_is_lumaRecon_available &&
9077                 (UCHAR_MAX == ps_recon_datastore->au1_bufId_with_winning_LumaRecon[ctr])))
9078             {
9079                 ihevce_it_recon_fxn(
9080                     ps_ctxt,
9081                     pi2_cur_deq_data,
9082                     cu_size,
9083                     pu1_cur_pred,
9084                     pred_strd,
9085                     pu1_cur_luma_recon,
9086                     recon_luma_strd,
9087                     pu1_final_ecd_data,
9088                     trans_size,
9089                     packed_pred_mode,
9090                     ps_tu->b1_y_cbf,
9091                     zero_cols,
9092                     zero_rows);
9093             }
9094             else if(
9095                 ps_recon_datastore->u1_is_lumaRecon_available &&
9096                 (UCHAR_MAX != ps_recon_datastore->au1_bufId_with_winning_LumaRecon[ctr]))
9097             {
9098                 UWORD8 *pu1_recon_src =
9099                     ((UWORD8 *)ps_recon_datastore->apv_luma_recon_bufs
9100                          [ps_recon_datastore->au1_bufId_with_winning_LumaRecon[ctr]]) +
9101                     cu_pos_x_in_pix + cu_pos_y_in_pix * ps_recon_datastore->i4_lumaRecon_stride;
9102 
9103                 ps_ctxt->s_cmn_opt_func.pf_copy_2d(
9104                     pu1_cur_luma_recon,
9105                     recon_luma_strd,
9106                     pu1_recon_src,
9107                     ps_recon_datastore->i4_lumaRecon_stride,
9108                     trans_size,
9109                     trans_size);
9110             }
9111         }
9112 
9113         if(ps_prms->u1_will_cabac_state_change)
9114         {
9115             ps_tu_enc_loop->i4_luma_coeff_offset = total_bytes;
9116         }
9117 
9118         pu1_final_ecd_data += num_bytes;
9119         /* update total bytes consumed */
9120         total_bytes += num_bytes;
9121 
9122         u1_is_cu_coded |= ps_tu->b1_y_cbf;
9123 
9124         /***************** Compute T,Q,IQ,IT & Recon for Chroma ********************/
9125         if(1 == chrm_present_flag)
9126         {
9127             pu1_cur_src_chrm = pu1_src_chrm + cu_pos_x_in_pix;
9128             pu1_cur_src_chrm += ((cu_pos_y_in_pix >> 1) * src_chrm_strd) +
9129                                 (u1_is_422 * ((cu_pos_y_in_pix >> 1) * src_chrm_strd));
9130 
9131             pu1_cur_pred_chrm = pu1_pred_chrm + cu_pos_x_in_pix;
9132             pu1_cur_pred_chrm += ((cu_pos_y_in_pix >> 1) * pred_chrm_strd) +
9133                                  (u1_is_422 * ((cu_pos_y_in_pix >> 1) * pred_chrm_strd));
9134 
9135             pu1_cur_chroma_recon = pu1_chrm_recon + cu_pos_x_in_pix;
9136             pu1_cur_chroma_recon += ((cu_pos_y_in_pix >> 1) * recon_chrma_strd) +
9137                                     (u1_is_422 * ((cu_pos_y_in_pix >> 1) * recon_chrma_strd));
9138 
9139             pi2_cur_deq_data_chrm = pi2_chrm_deq_data + cu_pos_x_in_pix;
9140             pi2_cur_deq_data_chrm += ((cu_pos_y_in_pix >> 1) * cu_size) +
9141                                      (u1_is_422 * ((cu_pos_y_in_pix >> 1) * cu_size));
9142 
9143             if(INCLUDE_CHROMA_DURING_TU_RECURSION &&
9144                (ps_ctxt->i4_quality_preset == IHEVCE_QUALITY_P0) &&
9145                (PRED_MODE_INTRA != packed_pred_mode))
9146             {
9147                 WORD32 i4_num_bytes;
9148                 UWORD8 *pu1_chroma_pred;
9149                 UWORD8 *pu1_chroma_recon;
9150                 WORD16 *pi2_chroma_deq;
9151                 UWORD32 u4_zero_col;
9152                 UWORD32 u4_zero_row;
9153 
9154                 for(i4_subtu_idx = 0; i4_subtu_idx < u1_num_subtus; i4_subtu_idx++)
9155                 {
9156                     WORD32 chroma_trans_size = MAX(4, trans_size >> 1);
9157                     WORD32 i4_subtu_pos_x = cu_pos_x_in_pix;
9158                     WORD32 i4_subtu_pos_y = cu_pos_y_in_pix + (i4_subtu_idx * chroma_trans_size);
9159 
9160                     if(0 == u1_is_422)
9161                     {
9162                         i4_subtu_pos_y >>= 1;
9163                     }
9164 
9165                     pu1_chroma_pred =
9166                         pu1_cur_pred_chrm + (i4_subtu_idx * chroma_trans_size * pred_chrm_strd);
9167                     pu1_chroma_recon = pu1_cur_chroma_recon +
9168                                        (i4_subtu_idx * chroma_trans_size * recon_chrma_strd);
9169                     pi2_chroma_deq =
9170                         pi2_cur_deq_data_chrm + (i4_subtu_idx * chroma_trans_size * cu_size);
9171 
9172                     u4_zero_col = ps_tu_enc_loop_temp_prms->au4_cb_zero_col[i4_subtu_idx];
9173                     u4_zero_row = ps_tu_enc_loop_temp_prms->au4_cb_zero_row[i4_subtu_idx];
9174 
9175                     if(ps_prms->u1_will_cabac_state_change)
9176                     {
9177                         i4_num_bytes =
9178                             ps_tu_enc_loop_temp_prms->ai2_cb_bytes_consumed[i4_subtu_idx];
9179                     }
9180                     else
9181                     {
9182                         i4_num_bytes = 0;
9183                     }
9184 
9185                     memcpy(pu1_final_ecd_data, pu1_old_ecd_data, i4_num_bytes);
9186 
9187                     pu1_old_ecd_data += i4_num_bytes;
9188 
9189                     au1_is_recon_available[U_PLANE] = 0;
9190 
9191                     if(ps_ctxt->s_cu_final_recon_flags.u1_eval_recon_data &&
9192                        (!u1_compute_spatial_ssd_chroma ||
9193                         (!au1_is_recon_available[U_PLANE] && u1_compute_spatial_ssd_chroma)))
9194                     {
9195                         if(!ps_recon_datastore->au1_is_chromaRecon_available[0] ||
9196                            (ps_recon_datastore->au1_is_chromaRecon_available[0] &&
9197                             (UCHAR_MAX ==
9198                              ps_recon_datastore
9199                                  ->au1_bufId_with_winning_ChromaRecon[U_PLANE][ctr][i4_subtu_idx])))
9200                         {
9201                             ihevce_chroma_it_recon_fxn(
9202                                 ps_ctxt,
9203                                 pi2_chroma_deq,
9204                                 cu_size,
9205                                 pu1_chroma_pred,
9206                                 pred_chrm_strd,
9207                                 pu1_chroma_recon,
9208                                 recon_chrma_strd,
9209                                 pu1_final_ecd_data,
9210                                 chroma_trans_size,
9211                                 (i4_subtu_idx == 0) ? ps_tu->b1_cb_cbf : ps_tu->b1_cb_cbf_subtu1,
9212                                 u4_zero_col,
9213                                 u4_zero_row,
9214                                 U_PLANE);
9215                         }
9216                         else if(
9217                             ps_recon_datastore->au1_is_chromaRecon_available[0] &&
9218                             (UCHAR_MAX !=
9219                              ps_recon_datastore
9220                                  ->au1_bufId_with_winning_ChromaRecon[U_PLANE][ctr][i4_subtu_idx]))
9221                         {
9222                             UWORD8 *pu1_recon_src =
9223                                 ((UWORD8 *)ps_recon_datastore->apv_chroma_recon_bufs
9224                                      [ps_recon_datastore->au1_bufId_with_winning_ChromaRecon
9225                                           [U_PLANE][ctr][i4_subtu_idx]]) +
9226                                 i4_subtu_pos_x +
9227                                 i4_subtu_pos_y * ps_recon_datastore->i4_chromaRecon_stride;
9228 
9229                             ps_ctxt->s_cmn_opt_func.pf_chroma_interleave_2d_copy(
9230                                 pu1_recon_src,
9231                                 ps_recon_datastore->i4_lumaRecon_stride,
9232                                 pu1_chroma_recon,
9233                                 recon_chrma_strd,
9234                                 chroma_trans_size,
9235                                 chroma_trans_size,
9236                                 U_PLANE);
9237                         }
9238                     }
9239 
9240                     u1_is_cu_coded |=
9241                         ((1 == i4_subtu_idx) ? ps_tu->b1_cb_cbf_subtu1 : ps_tu->b1_cb_cbf);
9242 
9243                     pu1_final_ecd_data += i4_num_bytes;
9244                     total_bytes += i4_num_bytes;
9245                 }
9246 
9247                 for(i4_subtu_idx = 0; i4_subtu_idx < u1_num_subtus; i4_subtu_idx++)
9248                 {
9249                     WORD32 chroma_trans_size = MAX(4, trans_size >> 1);
9250                     WORD32 i4_subtu_pos_x = cu_pos_x_in_pix;
9251                     WORD32 i4_subtu_pos_y = cu_pos_y_in_pix + (i4_subtu_idx * chroma_trans_size);
9252 
9253                     if(0 == u1_is_422)
9254                     {
9255                         i4_subtu_pos_y >>= 1;
9256                     }
9257 
9258                     pu1_chroma_pred =
9259                         pu1_cur_pred_chrm + (i4_subtu_idx * chroma_trans_size * pred_chrm_strd);
9260                     pu1_chroma_recon = pu1_cur_chroma_recon +
9261                                        (i4_subtu_idx * chroma_trans_size * recon_chrma_strd);
9262                     pi2_chroma_deq = pi2_cur_deq_data_chrm +
9263                                      (i4_subtu_idx * chroma_trans_size * cu_size) +
9264                                      chroma_trans_size;
9265 
9266                     u4_zero_col = ps_tu_enc_loop_temp_prms->au4_cr_zero_col[i4_subtu_idx];
9267                     u4_zero_row = ps_tu_enc_loop_temp_prms->au4_cr_zero_row[i4_subtu_idx];
9268 
9269                     if(ps_prms->u1_will_cabac_state_change)
9270                     {
9271                         i4_num_bytes =
9272                             ps_tu_enc_loop_temp_prms->ai2_cr_bytes_consumed[i4_subtu_idx];
9273                     }
9274                     else
9275                     {
9276                         i4_num_bytes = 0;
9277                     }
9278 
9279                     memcpy(pu1_final_ecd_data, pu1_old_ecd_data, i4_num_bytes);
9280 
9281                     pu1_old_ecd_data += i4_num_bytes;
9282 
9283                     au1_is_recon_available[V_PLANE] = 0;
9284 
9285                     if(ps_ctxt->s_cu_final_recon_flags.u1_eval_recon_data &&
9286                        (!u1_compute_spatial_ssd_chroma ||
9287                         (!au1_is_recon_available[V_PLANE] && u1_compute_spatial_ssd_chroma)))
9288                     {
9289                         if(!ps_recon_datastore->au1_is_chromaRecon_available[0] ||
9290                            (ps_recon_datastore->au1_is_chromaRecon_available[0] &&
9291                             (UCHAR_MAX ==
9292                              ps_recon_datastore
9293                                  ->au1_bufId_with_winning_ChromaRecon[V_PLANE][ctr][i4_subtu_idx])))
9294                         {
9295                             ihevce_chroma_it_recon_fxn(
9296                                 ps_ctxt,
9297                                 pi2_chroma_deq,
9298                                 cu_size,
9299                                 pu1_chroma_pred,
9300                                 pred_chrm_strd,
9301                                 pu1_chroma_recon,
9302                                 recon_chrma_strd,
9303                                 pu1_final_ecd_data,
9304                                 chroma_trans_size,
9305                                 (i4_subtu_idx == 0) ? ps_tu->b1_cr_cbf : ps_tu->b1_cr_cbf_subtu1,
9306                                 u4_zero_col,
9307                                 u4_zero_row,
9308                                 V_PLANE);
9309                         }
9310                         else if(
9311                             ps_recon_datastore->au1_is_chromaRecon_available[0] &&
9312                             (UCHAR_MAX !=
9313                              ps_recon_datastore
9314                                  ->au1_bufId_with_winning_ChromaRecon[V_PLANE][ctr][i4_subtu_idx]))
9315                         {
9316                             UWORD8 *pu1_recon_src =
9317                                 ((UWORD8 *)ps_recon_datastore->apv_chroma_recon_bufs
9318                                      [ps_recon_datastore->au1_bufId_with_winning_ChromaRecon
9319                                           [V_PLANE][ctr][i4_subtu_idx]]) +
9320                                 i4_subtu_pos_x +
9321                                 i4_subtu_pos_y * ps_recon_datastore->i4_chromaRecon_stride;
9322 
9323                             ps_ctxt->s_cmn_opt_func.pf_chroma_interleave_2d_copy(
9324                                 pu1_recon_src,
9325                                 ps_recon_datastore->i4_lumaRecon_stride,
9326                                 pu1_chroma_recon,
9327                                 recon_chrma_strd,
9328                                 chroma_trans_size,
9329                                 chroma_trans_size,
9330                                 V_PLANE);
9331                         }
9332                     }
9333 
9334                     u1_is_cu_coded |=
9335                         ((1 == i4_subtu_idx) ? ps_tu->b1_cr_cbf_subtu1 : ps_tu->b1_cr_cbf);
9336 
9337                     pu1_final_ecd_data += i4_num_bytes;
9338                     total_bytes += i4_num_bytes;
9339                 }
9340             }
9341             else
9342             {
9343                 WORD32 cb_zero_col, cb_zero_row, cr_zero_col, cr_zero_row;
9344 
9345                 for(i4_subtu_idx = 0; i4_subtu_idx < u1_num_subtus; i4_subtu_idx++)
9346                 {
9347                     WORD32 cb_cbf, cr_cbf;
9348                     WORD32 cb_num_bytes, cr_num_bytes;
9349 
9350                     WORD32 chroma_trans_size = MAX(4, trans_size >> 1);
9351 
9352                     WORD32 i4_subtu_pos_x = cu_pos_x_in_pix;
9353                     WORD32 i4_subtu_pos_y = cu_pos_y_in_pix + (i4_subtu_idx * chroma_trans_size);
9354 
9355                     if(0 == u1_is_422)
9356                     {
9357                         i4_subtu_pos_y >>= 1;
9358                     }
9359 
9360                     pu1_cur_src_chrm += (i4_subtu_idx * chroma_trans_size * src_chrm_strd);
9361                     pu1_cur_pred_chrm += (i4_subtu_idx * chroma_trans_size * pred_chrm_strd);
9362                     pu1_cur_chroma_recon += (i4_subtu_idx * chroma_trans_size * recon_chrma_strd);
9363                     pi2_cur_deq_data_chrm += (i4_subtu_idx * chroma_trans_size * cu_size);
9364 
9365                     if((PRED_MODE_INTRA == packed_pred_mode) &&
9366                        (1 == ps_ctxt->s_cu_final_recon_flags.u1_eval_chroma_pred_data))
9367                     {
9368                         WORD32 nbr_flags, left_strd_chrm, chrm_pred_func_idx;
9369                         UWORD8 *pu1_left_chrm;
9370                         UWORD8 *pu1_top_chrm;
9371                         UWORD8 *pu1_top_left_chrm;
9372 
9373                         nbr_flags = ihevce_get_intra_chroma_tu_nbr(
9374                             *pu4_nbr_flags, i4_subtu_idx, chroma_trans_size, u1_is_422);
9375 
9376                         /* left cu boundary */
9377                         if(0 == i4_subtu_pos_x)
9378                         {
9379                             left_strd_chrm = ps_chrm_cu_buf_prms->i4_cu_left_stride;
9380                             pu1_left_chrm =
9381                                 ps_chrm_cu_buf_prms->pu1_cu_left + i4_subtu_pos_y * left_strd_chrm;
9382                         }
9383                         else
9384                         {
9385                             pu1_left_chrm = pu1_cur_chroma_recon - 2;
9386                             left_strd_chrm = recon_chrma_strd;
9387                         }
9388 
9389                         /* top cu boundary */
9390                         if(0 == i4_subtu_pos_y)
9391                         {
9392                             pu1_top_chrm = ps_chrm_cu_buf_prms->pu1_cu_top + i4_subtu_pos_x;
9393                         }
9394                         else
9395                         {
9396                             pu1_top_chrm = pu1_cur_chroma_recon - recon_chrma_strd;
9397                         }
9398 
9399                         /* by default top left is set to cu top left */
9400                         pu1_top_left_chrm = ps_chrm_cu_buf_prms->pu1_cu_top_left;
9401 
9402                         /* top left based on position */
9403                         if((0 != i4_subtu_pos_y) && (0 == i4_subtu_pos_x))
9404                         {
9405                             pu1_top_left_chrm = pu1_left_chrm - left_strd_chrm;
9406                         }
9407                         else if(0 != i4_subtu_pos_x)
9408                         {
9409                             pu1_top_left_chrm = pu1_top_chrm - 2;
9410                         }
9411 
9412                         /* call the chroma reference array substitution */
9413                         ihevc_intra_pred_chroma_ref_substitution_fptr(
9414                             pu1_top_left_chrm,
9415                             pu1_top_chrm,
9416                             pu1_left_chrm,
9417                             left_strd_chrm,
9418                             chroma_trans_size,
9419                             nbr_flags,
9420                             (UWORD8 *)ps_ctxt->pv_ref_sub_out,
9421                             1);
9422 
9423                         /* use the look up to get the function idx */
9424                         chrm_pred_func_idx = g_i4_ip_funcs[chroma_pred_mode];
9425 
9426                         /* call the intra prediction function */
9427                         ps_ctxt->apf_chrm_ip[chrm_pred_func_idx](
9428                             (UWORD8 *)ps_ctxt->pv_ref_sub_out,
9429                             1,
9430                             pu1_cur_pred_chrm,
9431                             pred_chrm_strd,
9432                             chroma_trans_size,
9433                             chroma_pred_mode);
9434                     }
9435 
9436                     /**---------- Compute iq&coeff data if required : Chroma ------------**/
9437                     if(1 == ps_tu_enc_loop_temp_prms->b1_eval_chroma_iq_and_coeff_data)
9438                     {
9439                         WORD32 perform_sbh, perform_rdoq, temp_bits;
9440 
9441                         if(ps_prms->u1_recompute_sbh_and_rdoq)
9442                         {
9443                             perform_sbh = (ps_ctxt->i4_sbh_level != NO_SBH);
9444                             perform_rdoq = (ps_ctxt->i4_rdoq_level != NO_RDOQ);
9445                         }
9446                         else
9447                         {
9448                             /* RDOQ will change the coefficients. If coefficients are changed, we will have to do sbh again*/
9449                             perform_sbh = ps_ctxt->s_rdoq_sbh_ctxt.i4_perform_best_cand_sbh;
9450                             /* To do SBH we need the quant and iquant data. This would mean we need to do quantization again, which would mean
9451                         we would have to do RDOQ again.*/
9452                             perform_rdoq = ps_ctxt->s_rdoq_sbh_ctxt.i4_perform_best_cand_rdoq;
9453                         }
9454 
9455                         /* populate the coeffs scan idx */
9456                         ps_ctxt->i4_scan_idx = SCAN_DIAG_UPRIGHT;
9457 
9458                         if(PRED_MODE_INTRA == packed_pred_mode)
9459                         {
9460                             /* for 4x4 transforms based on intra pred mode scan is choosen*/
9461                             if(4 == chroma_trans_size)
9462                             {
9463                                 /* for modes from 22 upto 30 horizontal scan is used */
9464                                 if((chroma_pred_mode > 21) && (chroma_pred_mode < 31))
9465                                 {
9466                                     ps_ctxt->i4_scan_idx = SCAN_HORZ;
9467                                 }
9468                                 /* for modes from 6 upto 14 vertical scan is used */
9469                                 else if((chroma_pred_mode > 5) && (chroma_pred_mode < 15))
9470                                 {
9471                                     ps_ctxt->i4_scan_idx = SCAN_VERT;
9472                                 }
9473                             }
9474                         }
9475 
9476 #if DISABLE_RDOQ_INTRA
9477                         if(PRED_MODE_INTRA == packed_pred_mode)
9478                         {
9479                             perform_rdoq = 0;
9480                         }
9481 #endif
9482 
9483                         /* RDOPT copy States :  TU init (best until prev TU) to current */
9484                         COPY_CABAC_STATES_FRM_CAB_COEFFX_PREFIX(
9485                             &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[rd_opt_best_idx]
9486                                     .s_cabac_ctxt.au1_ctxt_models[0] +
9487                                 IHEVC_CAB_COEFFX_PREFIX,
9488                             &ps_ctxt->au1_rdopt_init_ctxt_models[0] + IHEVC_CAB_COEFFX_PREFIX,
9489                             IHEVC_CAB_CTXT_END - IHEVC_CAB_COEFFX_PREFIX);
9490 
9491                         ASSERT(rd_opt_best_idx == ps_ctxt->s_rdopt_entropy_ctxt.i4_curr_buf_idx);
9492                         /*If BEST candidate RDOQ is enabled, either no coef level rdoq or CU level rdoq has to be enabled
9493                     so that all candidates and best candidate are quantized with same rounding factor  */
9494                         if(1 == perform_rdoq)
9495                         {
9496                             ASSERT(ps_ctxt->i4_quant_rounding_level != TU_LEVEL_QUANT_ROUNDING);
9497                         }
9498 
9499                         if(!ps_best_cu_prms->u1_skip_flag ||
9500                            !ps_ctxt->s_chroma_rdopt_ctxt.u1_eval_chrm_rdopt)
9501                         {
9502                             /* Cb */
9503                             cb_cbf = ihevce_chroma_t_q_iq_ssd_scan_fxn(
9504                                 ps_ctxt,
9505                                 pu1_cur_pred_chrm,
9506                                 pred_chrm_strd,
9507                                 pu1_cur_src_chrm,
9508                                 src_chrm_strd,
9509                                 pi2_cur_deq_data_chrm,
9510                                 cu_size,
9511                                 pu1_chrm_recon,
9512                                 recon_chrma_strd,
9513                                 pu1_final_ecd_data,
9514                                 pu1_csbf_buf,
9515                                 csbf_strd,
9516                                 chroma_trans_size,
9517                                 ps_ctxt->i4_scan_idx,
9518                                 (PRED_MODE_INTRA == packed_pred_mode),
9519                                 &cb_num_bytes,
9520                                 &temp_bits,
9521                                 &cb_zero_col,
9522                                 &cb_zero_row,
9523                                 &au1_is_recon_available[U_PLANE],
9524                                 perform_sbh,
9525                                 perform_rdoq,
9526                                 &i8_ssd,
9527 #if USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
9528                                 !ps_ctxt->u1_is_refPic
9529                                     ? ALPHA_FOR_NOISE_TERM_IN_RDOPT
9530                                     : ((100 - ALPHA_DISCOUNT_IN_REF_PICS_IN_RDOPT) *
9531                                        (double)ALPHA_FOR_NOISE_TERM_IN_RDOPT) /
9532                                           100.0,
9533                                 ps_prms->u1_is_cu_noisy,
9534 #endif
9535                                 ps_best_cu_prms->u1_skip_flag &&
9536                                     ps_ctxt->s_chroma_rdopt_ctxt.u1_eval_chrm_rdopt,
9537                                 u1_compute_spatial_ssd_chroma ? SPATIAL_DOMAIN_SSD
9538                                                               : FREQUENCY_DOMAIN_SSD,
9539                                 U_PLANE);
9540                         }
9541                         else
9542                         {
9543                             cb_cbf = 0;
9544                             temp_bits = 0;
9545                             cb_num_bytes = 0;
9546                             au1_is_recon_available[U_PLANE] = 0;
9547                             cb_zero_col = 0;
9548                             cb_zero_row = 0;
9549                         }
9550 
9551                         /* Accumulate chroma residual bits */
9552                         ps_best_cu_prms->u4_cu_chroma_res_bits += temp_bits;
9553 
9554                         /* RDOPT copy States :  New updated after curr TU to TU init */
9555                         if(0 != cb_cbf)
9556                         {
9557                             COPY_CABAC_STATES_FRM_CAB_COEFFX_PREFIX(
9558                                 &ps_ctxt->au1_rdopt_init_ctxt_models[0] + IHEVC_CAB_COEFFX_PREFIX,
9559                                 &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[rd_opt_best_idx]
9560                                         .s_cabac_ctxt.au1_ctxt_models[0] +
9561                                     IHEVC_CAB_COEFFX_PREFIX,
9562                                 IHEVC_CAB_CTXT_END - IHEVC_CAB_COEFFX_PREFIX);
9563                         }
9564                         /* RDOPT copy States :  Restoring back the Cb init state to Cr */
9565                         else
9566                         {
9567                             COPY_CABAC_STATES_FRM_CAB_COEFFX_PREFIX(
9568                                 &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[rd_opt_best_idx]
9569                                         .s_cabac_ctxt.au1_ctxt_models[0] +
9570                                     IHEVC_CAB_COEFFX_PREFIX,
9571                                 &ps_ctxt->au1_rdopt_init_ctxt_models[0] + IHEVC_CAB_COEFFX_PREFIX,
9572                                 IHEVC_CAB_CTXT_END - IHEVC_CAB_COEFFX_PREFIX);
9573                         }
9574 
9575                         if(!ps_best_cu_prms->u1_skip_flag ||
9576                            !ps_ctxt->s_chroma_rdopt_ctxt.u1_eval_chrm_rdopt)
9577                         {
9578                             /* Cr */
9579                             cr_cbf = ihevce_chroma_t_q_iq_ssd_scan_fxn(
9580                                 ps_ctxt,
9581                                 pu1_cur_pred_chrm,
9582                                 pred_chrm_strd,
9583                                 pu1_cur_src_chrm,
9584                                 src_chrm_strd,
9585                                 pi2_cur_deq_data_chrm + chroma_trans_size,
9586                                 cu_size,
9587                                 pu1_chrm_recon,
9588                                 recon_chrma_strd,
9589                                 pu1_final_ecd_data + cb_num_bytes,
9590                                 pu1_csbf_buf,
9591                                 csbf_strd,
9592                                 chroma_trans_size,
9593                                 ps_ctxt->i4_scan_idx,
9594                                 (PRED_MODE_INTRA == packed_pred_mode),
9595                                 &cr_num_bytes,
9596                                 &temp_bits,
9597                                 &cr_zero_col,
9598                                 &cr_zero_row,
9599                                 &au1_is_recon_available[V_PLANE],
9600                                 perform_sbh,
9601                                 perform_rdoq,
9602                                 &i8_ssd,
9603 #if USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
9604                                 !ps_ctxt->u1_is_refPic
9605                                     ? ALPHA_FOR_NOISE_TERM_IN_RDOPT
9606                                     : ((100 - ALPHA_DISCOUNT_IN_REF_PICS_IN_RDOPT) *
9607                                        (double)ALPHA_FOR_NOISE_TERM_IN_RDOPT) /
9608                                           100.0,
9609                                 ps_prms->u1_is_cu_noisy,
9610 #endif
9611                                 ps_best_cu_prms->u1_skip_flag &&
9612                                     ps_ctxt->s_chroma_rdopt_ctxt.u1_eval_chrm_rdopt,
9613                                 u1_compute_spatial_ssd_chroma ? SPATIAL_DOMAIN_SSD
9614                                                               : FREQUENCY_DOMAIN_SSD,
9615                                 V_PLANE);
9616                         }
9617                         else
9618                         {
9619                             cr_cbf = 0;
9620                             temp_bits = 0;
9621                             cr_num_bytes = 0;
9622                             au1_is_recon_available[V_PLANE] = 0;
9623                             cr_zero_col = 0;
9624                             cr_zero_row = 0;
9625                         }
9626 
9627                         /* Accumulate chroma residual bits */
9628                         ps_best_cu_prms->u4_cu_chroma_res_bits += temp_bits;
9629 
9630                         /* RDOPT copy States :  New updated after curr TU to TU init */
9631                         if(0 != cr_cbf)
9632                         {
9633                             COPY_CABAC_STATES_FRM_CAB_COEFFX_PREFIX(
9634                                 &ps_ctxt->au1_rdopt_init_ctxt_models[0] + IHEVC_CAB_COEFFX_PREFIX,
9635                                 &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[rd_opt_best_idx]
9636                                         .s_cabac_ctxt.au1_ctxt_models[0] +
9637                                     IHEVC_CAB_COEFFX_PREFIX,
9638                                 IHEVC_CAB_CTXT_END - IHEVC_CAB_COEFFX_PREFIX);
9639                         }
9640 
9641                         if(0 == i4_subtu_idx)
9642                         {
9643                             ps_tu->b1_cb_cbf = cb_cbf;
9644                             ps_tu->b1_cr_cbf = cr_cbf;
9645                         }
9646                         else
9647                         {
9648                             ps_tu->b1_cb_cbf_subtu1 = cb_cbf;
9649                             ps_tu->b1_cr_cbf_subtu1 = cr_cbf;
9650                         }
9651                     }
9652                     else
9653                     {
9654                         cb_zero_col = ps_tu_enc_loop_temp_prms->au4_cb_zero_col[i4_subtu_idx];
9655                         cb_zero_row = ps_tu_enc_loop_temp_prms->au4_cb_zero_row[i4_subtu_idx];
9656                         cr_zero_col = ps_tu_enc_loop_temp_prms->au4_cr_zero_col[i4_subtu_idx];
9657                         cr_zero_row = ps_tu_enc_loop_temp_prms->au4_cr_zero_row[i4_subtu_idx];
9658 
9659                         if(ps_prms->u1_will_cabac_state_change)
9660                         {
9661                             cb_num_bytes =
9662                                 ps_tu_enc_loop_temp_prms->ai2_cb_bytes_consumed[i4_subtu_idx];
9663                         }
9664                         else
9665                         {
9666                             cb_num_bytes = 0;
9667                         }
9668 
9669                         if(ps_prms->u1_will_cabac_state_change)
9670                         {
9671                             cr_num_bytes =
9672                                 ps_tu_enc_loop_temp_prms->ai2_cr_bytes_consumed[i4_subtu_idx];
9673                         }
9674                         else
9675                         {
9676                             cr_num_bytes = 0;
9677                         }
9678 
9679                         /* copy cb ecd data to final buffer */
9680                         memcpy(pu1_final_ecd_data, pu1_chrm_old_ecd_data, cb_num_bytes);
9681 
9682                         pu1_chrm_old_ecd_data += cb_num_bytes;
9683 
9684                         /* copy cr ecd data to final buffer */
9685                         memcpy(
9686                             (pu1_final_ecd_data + cb_num_bytes),
9687                             pu1_chrm_old_ecd_data,
9688                             cr_num_bytes);
9689 
9690                         pu1_chrm_old_ecd_data += cr_num_bytes;
9691 
9692                         au1_is_recon_available[U_PLANE] = 0;
9693                         au1_is_recon_available[V_PLANE] = 0;
9694                     }
9695 
9696                     /**-------- Compute Recon data (Do IT & Recon) : Chroma  -----------**/
9697                     if(ps_ctxt->s_cu_final_recon_flags.u1_eval_recon_data &&
9698                        (!u1_compute_spatial_ssd_chroma ||
9699                         (!au1_is_recon_available[U_PLANE] && u1_compute_spatial_ssd_chroma)))
9700                     {
9701                         if(!ps_recon_datastore->au1_is_chromaRecon_available[0] ||
9702                            (ps_recon_datastore->au1_is_chromaRecon_available[0] &&
9703                             (UCHAR_MAX ==
9704                              ps_recon_datastore
9705                                  ->au1_bufId_with_winning_ChromaRecon[U_PLANE][ctr][i4_subtu_idx])))
9706                         {
9707                             ihevce_chroma_it_recon_fxn(
9708                                 ps_ctxt,
9709                                 pi2_cur_deq_data_chrm,
9710                                 cu_size,
9711                                 pu1_cur_pred_chrm,
9712                                 pred_chrm_strd,
9713                                 pu1_cur_chroma_recon,
9714                                 recon_chrma_strd,
9715                                 pu1_final_ecd_data,
9716                                 chroma_trans_size,
9717                                 (i4_subtu_idx == 0) ? ps_tu->b1_cb_cbf : ps_tu->b1_cb_cbf_subtu1,
9718                                 cb_zero_col,
9719                                 cb_zero_row,
9720                                 U_PLANE);
9721                         }
9722                         else if(
9723                             ps_recon_datastore->au1_is_chromaRecon_available[0] &&
9724                             (UCHAR_MAX !=
9725                              ps_recon_datastore
9726                                  ->au1_bufId_with_winning_ChromaRecon[U_PLANE][ctr][i4_subtu_idx]))
9727                         {
9728                             UWORD8 *pu1_recon_src =
9729                                 ((UWORD8 *)ps_recon_datastore->apv_chroma_recon_bufs
9730                                      [ps_recon_datastore->au1_bufId_with_winning_ChromaRecon
9731                                           [U_PLANE][ctr][i4_subtu_idx]]) +
9732                                 i4_subtu_pos_x +
9733                                 i4_subtu_pos_y * ps_recon_datastore->i4_chromaRecon_stride;
9734 
9735                             ps_ctxt->s_cmn_opt_func.pf_chroma_interleave_2d_copy(
9736                                 pu1_recon_src,
9737                                 ps_recon_datastore->i4_lumaRecon_stride,
9738                                 pu1_cur_chroma_recon,
9739                                 recon_chrma_strd,
9740                                 chroma_trans_size,
9741                                 chroma_trans_size,
9742                                 U_PLANE);
9743                         }
9744                     }
9745 
9746                     u1_is_cu_coded |=
9747                         ((1 == i4_subtu_idx) ? ps_tu->b1_cb_cbf_subtu1 : ps_tu->b1_cb_cbf);
9748 
9749                     if(ps_prms->u1_will_cabac_state_change)
9750                     {
9751                         ps_tu_enc_loop->ai4_cb_coeff_offset[i4_subtu_idx] = total_bytes;
9752                     }
9753 
9754                     pu1_final_ecd_data += cb_num_bytes;
9755                     /* update total bytes consumed */
9756                     total_bytes += cb_num_bytes;
9757 
9758                     if(ps_ctxt->s_cu_final_recon_flags.u1_eval_recon_data &&
9759                        (!u1_compute_spatial_ssd_chroma ||
9760                         (!au1_is_recon_available[V_PLANE] && u1_compute_spatial_ssd_chroma)))
9761                     {
9762                         if(!ps_recon_datastore->au1_is_chromaRecon_available[0] ||
9763                            (ps_recon_datastore->au1_is_chromaRecon_available[0] &&
9764                             (UCHAR_MAX ==
9765                              ps_recon_datastore
9766                                  ->au1_bufId_with_winning_ChromaRecon[V_PLANE][ctr][i4_subtu_idx])))
9767                         {
9768                             ihevce_chroma_it_recon_fxn(
9769                                 ps_ctxt,
9770                                 pi2_cur_deq_data_chrm + chroma_trans_size,
9771                                 cu_size,
9772                                 pu1_cur_pred_chrm,
9773                                 pred_chrm_strd,
9774                                 pu1_cur_chroma_recon,
9775                                 recon_chrma_strd,
9776                                 pu1_final_ecd_data,
9777                                 chroma_trans_size,
9778                                 (i4_subtu_idx == 0) ? ps_tu->b1_cr_cbf : ps_tu->b1_cr_cbf_subtu1,
9779                                 cr_zero_col,
9780                                 cr_zero_row,
9781                                 V_PLANE);
9782                         }
9783                         else if(
9784                             ps_recon_datastore->au1_is_chromaRecon_available[0] &&
9785                             (UCHAR_MAX !=
9786                              ps_recon_datastore
9787                                  ->au1_bufId_with_winning_ChromaRecon[V_PLANE][ctr][i4_subtu_idx]))
9788                         {
9789                             UWORD8 *pu1_recon_src =
9790                                 ((UWORD8 *)ps_recon_datastore->apv_chroma_recon_bufs
9791                                      [ps_recon_datastore->au1_bufId_with_winning_ChromaRecon
9792                                           [V_PLANE][ctr][i4_subtu_idx]]) +
9793                                 i4_subtu_pos_x +
9794                                 i4_subtu_pos_y * ps_recon_datastore->i4_chromaRecon_stride;
9795 
9796                             ps_ctxt->s_cmn_opt_func.pf_chroma_interleave_2d_copy(
9797                                 pu1_recon_src,
9798                                 ps_recon_datastore->i4_lumaRecon_stride,
9799                                 pu1_cur_chroma_recon,
9800                                 recon_chrma_strd,
9801                                 chroma_trans_size,
9802                                 chroma_trans_size,
9803                                 V_PLANE);
9804                         }
9805                     }
9806 
9807                     u1_is_cu_coded |=
9808                         ((1 == i4_subtu_idx) ? ps_tu->b1_cr_cbf_subtu1 : ps_tu->b1_cr_cbf);
9809 
9810                     if(ps_prms->u1_will_cabac_state_change)
9811                     {
9812                         ps_tu_enc_loop->ai4_cr_coeff_offset[i4_subtu_idx] = total_bytes;
9813                     }
9814 
9815                     pu1_final_ecd_data += cr_num_bytes;
9816                     /* update total bytes consumed */
9817                     total_bytes += cr_num_bytes;
9818                 }
9819             }
9820         }
9821         else
9822         {
9823             ps_tu_enc_loop->ai4_cb_coeff_offset[0] = total_bytes;
9824             ps_tu_enc_loop->ai4_cr_coeff_offset[0] = total_bytes;
9825             ps_tu_enc_loop->ai4_cb_coeff_offset[1] = total_bytes;
9826             ps_tu_enc_loop->ai4_cr_coeff_offset[1] = total_bytes;
9827             ps_tu->b1_cb_cbf = 0;
9828             ps_tu->b1_cr_cbf = 0;
9829             ps_tu->b1_cb_cbf_subtu1 = 0;
9830             ps_tu->b1_cr_cbf_subtu1 = 0;
9831         }
9832 
9833         /* Update to next TU */
9834         ps_tu_enc_loop++;
9835         ps_tu_enc_loop_temp_prms++;
9836 
9837         pu4_nbr_flags++;
9838         pu1_intra_pred_mode++;
9839 
9840         /*Do not set the nbr map for last pu in cu */
9841         if((num_tu_in_cu - 1) != ctr)
9842         {
9843             /* set the neighbour map to 1 */
9844             ihevce_set_nbr_map(
9845                 ps_ctxt->pu1_ctb_nbr_map,
9846                 ps_ctxt->i4_nbr_map_strd,
9847                 cu_pos_x_in_4x4,
9848                 cu_pos_y_in_4x4,
9849                 (trans_size >> 2),
9850                 1);
9851         }
9852     }
9853 
9854     if(ps_prms->u1_will_cabac_state_change)
9855     {
9856         ps_best_cu_prms->u1_is_cu_coded = u1_is_cu_coded;
9857 
9858         /* Modify skip flag, if luma is skipped & Chroma is coded */
9859         if((1 == u1_is_cu_coded) && (PRED_MODE_SKIP == packed_pred_mode))
9860         {
9861             ps_best_cu_prms->u1_skip_flag = 0;
9862         }
9863     }
9864 
9865     /* during chroma evaluation if skip decision was over written     */
9866     /* then the current skip candidate is set to a non skip candidate */
9867     if(PRED_MODE_INTRA != packed_pred_mode)
9868     {
9869         ps_best_inter_cand->b1_skip_flag = ps_best_cu_prms->u1_skip_flag;
9870     }
9871 
9872     /**------------- Compute header data if required --------------**/
9873     if(1 == ps_ctxt->s_cu_final_recon_flags.u1_eval_header_data)
9874     {
9875         WORD32 cbf_bits;
9876         WORD32 cu_bits;
9877         WORD32 unit_4x4_size = cu_size >> 2;
9878 
9879         /*Restoring the running reference into the best rdopt_ctxt cabac states which will then
9880         be copied as the base reference for the next cu
9881         Assumption : We are ensuring that the u1_eval_header_data flag is set to 1 only if either
9882         luma and chroma are being reevaluated*/
9883         COPY_CABAC_STATES(
9884             &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[rd_opt_best_idx]
9885                  .s_cabac_ctxt.au1_ctxt_models[0],
9886             &ps_ctxt->au1_rdopt_init_ctxt_models[0],
9887             IHEVC_CAB_CTXT_END);
9888 
9889         /* get the neighbour availability flags for current cu  */
9890         ihevce_get_only_nbr_flag(
9891             &s_nbr,
9892             ps_ctxt->pu1_ctb_nbr_map,
9893             ps_ctxt->i4_nbr_map_strd,
9894             (cu_pos_x << 1),
9895             (cu_pos_y << 1),
9896             unit_4x4_size,
9897             unit_4x4_size);
9898 
9899         cu_bits = ihevce_entropy_rdo_encode_cu(
9900             &ps_ctxt->s_rdopt_entropy_ctxt,
9901             ps_best_cu_prms,
9902             cu_pos_x,
9903             cu_pos_y,
9904             cu_size,
9905             ps_ctxt->u1_disable_intra_eval ? !DISABLE_TOP_SYNC && s_nbr.u1_top_avail
9906                                            : s_nbr.u1_top_avail,
9907             s_nbr.u1_left_avail,
9908             (pu1_final_ecd_data - total_bytes),
9909             &cbf_bits);
9910 
9911         /* cbf bits are excluded from header bits, instead considered as texture bits */
9912         ps_best_cu_prms->u4_cu_hdr_bits = cu_bits - cbf_bits;
9913         ps_best_cu_prms->u4_cu_cbf_bits = cbf_bits;
9914     }
9915 
9916     if(ps_prms->u1_will_cabac_state_change)
9917     {
9918         ps_best_cu_prms->i4_num_bytes_ecd_data = total_bytes;
9919     }
9920 }
9921 
9922 /*!
9923 ******************************************************************************
9924 * \if Function name : ihevce_set_eval_flags \endif
9925 *
9926 * \brief
9927 *    Function which decides which eval flags have to be set based on the
9928 *    quality preset and the best-candidate RDOQ/SBH conditions
9929 *
9930 * \param[in,out] ps_ctxt : encoder ctxt pointer
9931 * \param[in,out] ps_enc_loop_bestprms : pointer to final cu params
9932 *
9933 * \return
9934 *    None
9935 *
9936 * \author
9937 *  Ittiam
9938 *
9939 *****************************************************************************
9940 */
ihevce_set_eval_flags(ihevce_enc_loop_ctxt_t * ps_ctxt,enc_loop_cu_final_prms_t * ps_enc_loop_bestprms)9941 void ihevce_set_eval_flags(
9942     ihevce_enc_loop_ctxt_t *ps_ctxt, enc_loop_cu_final_prms_t *ps_enc_loop_bestprms)
9943 {
9944     WORD32 count = 0;
9945 
9946     ps_ctxt->s_cu_final_recon_flags.u1_eval_luma_pred_data = 0;
9947 
9948     ps_ctxt->s_cu_final_recon_flags.u1_eval_chroma_pred_data =
9949         !ps_ctxt->s_chroma_rdopt_ctxt.u1_eval_chrm_rdopt;
9950 
9951     if(ps_ctxt->u1_disable_intra_eval && (!(ps_ctxt->i4_deblk_pad_hpel_cur_pic & 0x1)))
9952     {
9953         ps_ctxt->s_cu_final_recon_flags.u1_eval_recon_data = 0;
9954     }
9955     else
9956     {
9957         ps_ctxt->s_cu_final_recon_flags.u1_eval_recon_data = 1;
9958     }
9959 
9960     if((1 == ps_ctxt->s_rdoq_sbh_ctxt.i4_perform_best_cand_rdoq) ||
9961        (1 == ps_ctxt->s_rdoq_sbh_ctxt.i4_perform_best_cand_sbh))
9962     {
9963         /* When rdoq is enabled only for the best candidate, in case of in Intra nTU
9964         RDOQ might have altered the coeffs of the neighbour CU. As a result, the pred
9965         for the current CU will change. Therefore, we need to reevaluate the pred data*/
9966         if((ps_enc_loop_bestprms->u2_num_tus_in_cu > 1) &&
9967            (ps_enc_loop_bestprms->u1_intra_flag == 1))
9968         {
9969             ps_ctxt->s_cu_final_recon_flags.u1_eval_luma_pred_data = 1;
9970             ps_ctxt->s_cu_final_recon_flags.u1_eval_chroma_pred_data = 1;
9971         }
9972         if(ps_enc_loop_bestprms->u1_skip_flag == 1)
9973         {
9974             for(count = 0; count < ps_enc_loop_bestprms->u2_num_tus_in_cu; count++)
9975             {
9976                 ps_enc_loop_bestprms->as_tu_enc_loop_temp_prms[count]
9977                     .b1_eval_luma_iq_and_coeff_data = 0;
9978                 ps_enc_loop_bestprms->as_tu_enc_loop_temp_prms[count]
9979                     .b1_eval_chroma_iq_and_coeff_data = 0;
9980             }
9981         }
9982         else
9983         {
9984             for(count = 0; count < ps_enc_loop_bestprms->u2_num_tus_in_cu; count++)
9985             {
9986                 ps_enc_loop_bestprms->as_tu_enc_loop_temp_prms[count]
9987                     .b1_eval_luma_iq_and_coeff_data = 1;
9988                 ps_enc_loop_bestprms->as_tu_enc_loop_temp_prms[count]
9989                     .b1_eval_chroma_iq_and_coeff_data = 1;
9990             }
9991         }
9992     }
9993     else
9994     {
9995         switch(ps_ctxt->i4_quality_preset)
9996         {
9997         case IHEVCE_QUALITY_P0:
9998         case IHEVCE_QUALITY_P2:
9999         case IHEVCE_QUALITY_P3:
10000         {
10001             for(count = 0; count < ps_enc_loop_bestprms->u2_num_tus_in_cu; count++)
10002             {
10003                 ps_enc_loop_bestprms->as_tu_enc_loop_temp_prms[count]
10004                     .b1_eval_luma_iq_and_coeff_data = 0;
10005                 ps_enc_loop_bestprms->as_tu_enc_loop_temp_prms[count]
10006                     .b1_eval_chroma_iq_and_coeff_data =
10007                     !ps_ctxt->s_chroma_rdopt_ctxt.u1_eval_chrm_rdopt;
10008             }
10009 
10010             break;
10011         }
10012         case IHEVCE_QUALITY_P4:
10013         case IHEVCE_QUALITY_P5:
10014         {
10015             for(count = 0; count < ps_enc_loop_bestprms->u2_num_tus_in_cu; count++)
10016             {
10017                 ps_enc_loop_bestprms->as_tu_enc_loop_temp_prms[count]
10018                     .b1_eval_luma_iq_and_coeff_data = 0;
10019                 ps_enc_loop_bestprms->as_tu_enc_loop_temp_prms[count]
10020                     .b1_eval_chroma_iq_and_coeff_data =
10021                     !ps_ctxt->s_chroma_rdopt_ctxt.u1_eval_chrm_rdopt;
10022             }
10023 
10024             break;
10025         }
10026         case IHEVCE_QUALITY_P6:
10027         {
10028             for(count = 0; count < ps_enc_loop_bestprms->u2_num_tus_in_cu; count++)
10029             {
10030                 ps_enc_loop_bestprms->as_tu_enc_loop_temp_prms[count]
10031                     .b1_eval_luma_iq_and_coeff_data = 0;
10032 #if !ENABLE_CHROMA_TRACKING_OF_LUMA_CBF_IN_XS25
10033                 ps_enc_loop_bestprms->as_tu_enc_loop_temp_prms[count]
10034                     .b1_eval_chroma_iq_and_coeff_data =
10035                     !ps_ctxt->s_chroma_rdopt_ctxt.u1_eval_chrm_rdopt;
10036 #else
10037                 if((ps_ctxt->i1_slice_type == BSLICE) && (ps_ctxt->i4_temporal_layer_id > 1) &&
10038                    (ps_enc_loop_bestprms->as_tu_enc_loop[count].s_tu.b3_size >= 2))
10039                 {
10040                     ps_enc_loop_bestprms->as_tu_enc_loop_temp_prms[count]
10041                         .b1_eval_chroma_iq_and_coeff_data =
10042                         ps_enc_loop_bestprms->as_tu_enc_loop[count].s_tu.b1_y_cbf;
10043                 }
10044                 else
10045                 {
10046                     ps_enc_loop_bestprms->as_tu_enc_loop_temp_prms[count]
10047                         .b1_eval_chroma_iq_and_coeff_data =
10048                         !ps_ctxt->s_chroma_rdopt_ctxt.u1_eval_chrm_rdopt;
10049                 }
10050 #endif
10051             }
10052 
10053             break;
10054         }
10055         default:
10056         {
10057             break;
10058         }
10059         }
10060     }
10061 
10062     /* Not recomputing Luma pred-data and header data for any preset now */
10063     ps_ctxt->s_cu_final_recon_flags.u1_eval_header_data = 1;
10064 }
10065 
10066 /**
10067 ******************************************************************************
10068 *
10069 *  @brief Shrink's TU tree of inter CUs by merging redundnant child nodes
10070 *         (not coded children) into a parent node(not coded).
10071 *
10072 *  @par   Description
10073 *         This is required post RDO evaluation as TU decisions are
10074 *         pre-determined(pre RDO) based on recursive SATD,
10075 *         while the quad children TU's can be skipped during RDO
10076 *
10077 *         The shrink process is applied iteratively till there are no
10078 *         more modes to shrink
10079 *
10080 *  @param[inout]   ps_tu_enc_loop
10081 *       pointer to tu enc loop params of inter cu
10082 *
10083 *  @param[inout]   ps_tu_enc_loop_temp_prms
10084 *       pointer to temp tu enc loop params of inter cu
10085 *
10086 *  @param[in]   num_tu_in_cu
10087 *       number of tus in cu
10088 *
10089 *  @return      modified number of tus in cu
10090 *
10091 ******************************************************************************
10092 */
ihevce_shrink_inter_tu_tree(tu_enc_loop_out_t * ps_tu_enc_loop,tu_enc_loop_temp_prms_t * ps_tu_enc_loop_temp_prms,recon_datastore_t * ps_recon_datastore,WORD32 num_tu_in_cu,UWORD8 u1_is_422)10093 WORD32 ihevce_shrink_inter_tu_tree(
10094     tu_enc_loop_out_t *ps_tu_enc_loop,
10095     tu_enc_loop_temp_prms_t *ps_tu_enc_loop_temp_prms,
10096     recon_datastore_t *ps_recon_datastore,
10097     WORD32 num_tu_in_cu,
10098     UWORD8 u1_is_422)
10099 {
10100     WORD32 recurse = 1;
10101     WORD32 ctr;
10102 
10103     /* ------------- Quadtree TU Split Transform flag optimization ------------  */
10104     /* Post RDO, if all 4 child nodes are not coded the overheads of split TU    */
10105     /* flags and cbf flags are saved by merging to parent node and marking       */
10106     /* parent TU as not coded                                                    */
10107     /*                                                                           */
10108     /*                               ParentTUSplit=1                             */
10109     /*                                      |                                    */
10110     /*       ---------------------------------------------------------           */
10111     /*       |C0(Not coded) | C1(Not coded) | C2(Not coded) | C3(Not coded)      */
10112     /*                                     ||                                    */
10113     /*                                     \/                                    */
10114     /*                                                                           */
10115     /*                              ParentTUSplit=0 (Not Coded)                  */
10116     /*                                                                           */
10117     /* ------------- Quadtree TU Split Transform flag optimization ------------  */
10118     while((num_tu_in_cu > 4) && recurse)
10119     {
10120         recurse = 0;
10121 
10122         /* Validate inter CU */
10123         //ASSERT(ps_tu_enc_loop[0].s_tu.s_tu.b1_intra_flag == 0); /*b1_intra_flag no longer a member of tu structure */
10124 
10125         /* loop for all tu blocks in current cu */
10126         for(ctr = 0; ctr < num_tu_in_cu;)
10127         {
10128             /* Get current tu posx, posy and size */
10129             WORD32 curr_pos_x = ps_tu_enc_loop[ctr].s_tu.b4_pos_x << 2;
10130             WORD32 curr_pos_y = ps_tu_enc_loop[ctr].s_tu.b4_pos_y << 2;
10131             /* +1 is for parents size */
10132             WORD32 parent_tu_size = 1 << (ps_tu_enc_loop[ctr].s_tu.b3_size + 2 + 1);
10133 
10134             /* eval merge if leaf nodes reached i.e all child tus are of same size and first tu pos is same as parent pos */
10135             WORD32 eval_merge = ((curr_pos_x & (parent_tu_size - 1)) == 0);
10136             eval_merge &= ((curr_pos_y & (parent_tu_size - 1)) == 0);
10137 
10138             /* As TUs are published in encode order (Z SCAN),                      */
10139             /* Four consecutive TUS of same size implies we have hit leaf nodes.   */
10140             if(((ps_tu_enc_loop[ctr].s_tu.b3_size) == (ps_tu_enc_loop[ctr + 1].s_tu.b3_size)) &&
10141                ((ps_tu_enc_loop[ctr].s_tu.b3_size) == (ps_tu_enc_loop[ctr + 2].s_tu.b3_size)) &&
10142                ((ps_tu_enc_loop[ctr].s_tu.b3_size) == (ps_tu_enc_loop[ctr + 3].s_tu.b3_size)) &&
10143                eval_merge)
10144             {
10145                 WORD32 merge_parent = 1;
10146 
10147                 /* If any leaf noded is coded, it cannot be merged to parent */
10148                 if((ps_tu_enc_loop[ctr].s_tu.b1_y_cbf) || (ps_tu_enc_loop[ctr].s_tu.b1_cb_cbf) ||
10149                    (ps_tu_enc_loop[ctr].s_tu.b1_cr_cbf) ||
10150 
10151                    (ps_tu_enc_loop[ctr + 1].s_tu.b1_y_cbf) ||
10152                    (ps_tu_enc_loop[ctr + 1].s_tu.b1_cb_cbf) ||
10153                    (ps_tu_enc_loop[ctr + 1].s_tu.b1_cr_cbf) ||
10154 
10155                    (ps_tu_enc_loop[ctr + 2].s_tu.b1_y_cbf) ||
10156                    (ps_tu_enc_loop[ctr + 2].s_tu.b1_cb_cbf) ||
10157                    (ps_tu_enc_loop[ctr + 2].s_tu.b1_cr_cbf) ||
10158 
10159                    (ps_tu_enc_loop[ctr + 3].s_tu.b1_y_cbf) ||
10160                    (ps_tu_enc_loop[ctr + 3].s_tu.b1_cb_cbf) ||
10161                    (ps_tu_enc_loop[ctr + 3].s_tu.b1_cr_cbf))
10162                 {
10163                     merge_parent = 0;
10164                 }
10165 
10166                 if(u1_is_422)
10167                 {
10168                     if((ps_tu_enc_loop[ctr].s_tu.b1_cb_cbf_subtu1) ||
10169                        (ps_tu_enc_loop[ctr].s_tu.b1_cr_cbf_subtu1) ||
10170 
10171                        (ps_tu_enc_loop[ctr + 1].s_tu.b1_cb_cbf_subtu1) ||
10172                        (ps_tu_enc_loop[ctr + 1].s_tu.b1_cr_cbf_subtu1) ||
10173 
10174                        (ps_tu_enc_loop[ctr + 2].s_tu.b1_cb_cbf_subtu1) ||
10175                        (ps_tu_enc_loop[ctr + 2].s_tu.b1_cr_cbf_subtu1) ||
10176 
10177                        (ps_tu_enc_loop[ctr + 3].s_tu.b1_cb_cbf_subtu1) ||
10178                        (ps_tu_enc_loop[ctr + 3].s_tu.b1_cr_cbf_subtu1))
10179                     {
10180                         merge_parent = 0;
10181                     }
10182                 }
10183 
10184                 if(merge_parent)
10185                 {
10186                     /* Merge all the children (ctr,ctr+1,ctr+2,ctr+3) to parent (ctr) */
10187 
10188                     if(ps_recon_datastore->u1_is_lumaRecon_available)
10189                     {
10190                         ps_recon_datastore->au1_bufId_with_winning_LumaRecon[ctr] = UCHAR_MAX;
10191 
10192                         memmove(
10193                             &ps_recon_datastore->au1_bufId_with_winning_LumaRecon[ctr + 1],
10194                             &ps_recon_datastore->au1_bufId_with_winning_LumaRecon[ctr + 4],
10195                             (num_tu_in_cu - ctr - 4) * sizeof(UWORD8));
10196                     }
10197 
10198                     if(ps_recon_datastore->au1_is_chromaRecon_available[0])
10199                     {
10200                         ps_recon_datastore->au1_bufId_with_winning_ChromaRecon[U_PLANE][ctr][0] =
10201                             UCHAR_MAX;
10202                         ps_recon_datastore->au1_bufId_with_winning_ChromaRecon[V_PLANE][ctr][0] =
10203                             UCHAR_MAX;
10204 
10205                         memmove(
10206                             &ps_recon_datastore
10207                                  ->au1_bufId_with_winning_ChromaRecon[U_PLANE][ctr + 1][0],
10208                             &ps_recon_datastore
10209                                  ->au1_bufId_with_winning_ChromaRecon[U_PLANE][ctr + 4][0],
10210                             (num_tu_in_cu - ctr - 4) * sizeof(UWORD8));
10211 
10212                         memmove(
10213                             &ps_recon_datastore
10214                                  ->au1_bufId_with_winning_ChromaRecon[V_PLANE][ctr + 1][0],
10215                             &ps_recon_datastore
10216                                  ->au1_bufId_with_winning_ChromaRecon[V_PLANE][ctr + 4][0],
10217                             (num_tu_in_cu - ctr - 4) * sizeof(UWORD8));
10218 
10219                         if(u1_is_422)
10220                         {
10221                             ps_recon_datastore->au1_bufId_with_winning_ChromaRecon[U_PLANE][ctr][1] =
10222                                 UCHAR_MAX;
10223                             ps_recon_datastore->au1_bufId_with_winning_ChromaRecon[V_PLANE][ctr][1] =
10224                                 UCHAR_MAX;
10225 
10226                             memmove(
10227                                 &ps_recon_datastore
10228                                      ->au1_bufId_with_winning_ChromaRecon[U_PLANE][ctr + 1][1],
10229                                 &ps_recon_datastore
10230                                      ->au1_bufId_with_winning_ChromaRecon[U_PLANE][ctr + 4][1],
10231                                 (num_tu_in_cu - ctr - 4) * sizeof(UWORD8));
10232 
10233                             memmove(
10234                                 &ps_recon_datastore
10235                                      ->au1_bufId_with_winning_ChromaRecon[V_PLANE][ctr + 1][1],
10236                                 &ps_recon_datastore
10237                                      ->au1_bufId_with_winning_ChromaRecon[V_PLANE][ctr + 4][1],
10238                                 (num_tu_in_cu - ctr - 4) * sizeof(UWORD8));
10239                         }
10240                     }
10241 
10242                     /* Parent node size is one more than that of child */
10243                     ps_tu_enc_loop[ctr].s_tu.b3_size++;
10244 
10245                     ctr++;
10246 
10247                     /* move the subsequent TUs to next element */
10248                     ASSERT(num_tu_in_cu >= (ctr + 3));
10249                     memmove(
10250                         (void *)(ps_tu_enc_loop + ctr),
10251                         (void *)(ps_tu_enc_loop + ctr + 3),
10252                         (num_tu_in_cu - ctr - 3) * sizeof(tu_enc_loop_out_t));
10253 
10254                     /* Also memmove the temp TU params */
10255                     memmove(
10256                         (void *)(ps_tu_enc_loop_temp_prms + ctr),
10257                         (void *)(ps_tu_enc_loop_temp_prms + ctr + 3),
10258                         (num_tu_in_cu - ctr - 3) * sizeof(tu_enc_loop_temp_prms_t));
10259 
10260                     /* Number of TUs in CU are now less by 3 */
10261                     num_tu_in_cu -= 3;
10262 
10263                     /* Recurse again as new parent also be can be merged later */
10264                     recurse = 1;
10265                 }
10266                 else
10267                 {
10268                     /* Go to next set of leaf nodes */
10269                     ctr += 4;
10270                 }
10271             }
10272             else
10273             {
10274                 ctr++;
10275             }
10276         }
10277     }
10278 
10279     /* return the modified num TUs*/
10280     ASSERT(num_tu_in_cu > 0);
10281     return (num_tu_in_cu);
10282 }
10283 
ihevce_intra_mode_nxn_hash_updater(UWORD8 * pu1_mode_array,UWORD8 * pu1_hash_table,UWORD8 u1_num_ipe_modes)10284 UWORD8 ihevce_intra_mode_nxn_hash_updater(
10285     UWORD8 *pu1_mode_array, UWORD8 *pu1_hash_table, UWORD8 u1_num_ipe_modes)
10286 {
10287     WORD32 i;
10288     WORD32 i4_mode;
10289 
10290     for(i = 0; i < MAX_INTRA_CU_CANDIDATES; i++)
10291     {
10292         if(pu1_mode_array[i] < 35)
10293         {
10294             if(pu1_mode_array[i] != 0)
10295             {
10296                 i4_mode = pu1_mode_array[i] - 1;
10297 
10298                 if(!pu1_hash_table[i4_mode])
10299                 {
10300                     pu1_hash_table[i4_mode] = 1;
10301                     pu1_mode_array[u1_num_ipe_modes] = i4_mode;
10302                     u1_num_ipe_modes++;
10303                 }
10304             }
10305 
10306             if(pu1_mode_array[i] != 34)
10307             {
10308                 i4_mode = pu1_mode_array[i] + 1;
10309 
10310                 if((!pu1_hash_table[i4_mode]))
10311                 {
10312                     pu1_hash_table[i4_mode] = 1;
10313                     pu1_mode_array[u1_num_ipe_modes] = i4_mode;
10314                     u1_num_ipe_modes++;
10315                 }
10316             }
10317         }
10318     }
10319 
10320     if(!pu1_hash_table[INTRA_PLANAR])
10321     {
10322         pu1_hash_table[INTRA_PLANAR] = 1;
10323         pu1_mode_array[u1_num_ipe_modes] = INTRA_PLANAR;
10324         u1_num_ipe_modes++;
10325     }
10326 
10327     if(!pu1_hash_table[INTRA_DC])
10328     {
10329         pu1_hash_table[INTRA_DC] = 1;
10330         pu1_mode_array[u1_num_ipe_modes] = INTRA_DC;
10331         u1_num_ipe_modes++;
10332     }
10333 
10334     return u1_num_ipe_modes;
10335 }
10336 
10337 #if ENABLE_TU_TREE_DETERMINATION_IN_RDOPT
/**
******************************************************************************
*
*  @brief Runs the hme_evalsatd_pt_pu_*_tu_rec function matching the CU size
*         on the candidate's prediction and the input, with the candidate's
*         ai4_tu_split_flag array handed over as the split flag storage
*
*  @param[inout]   ps_cu_data
*       inter candidate; supplies pred data, pred stride and part size,
*       and owns the tu split flags
*
*  @param[in]   ps_func_selector
*       forwarded unchanged to the selected function
*
*  @param[in]   pi2_scratch_mem
*       scratch memory, passed on as the working memory of the evaluation
*
*  @param[in]   pu1_inp, i4_inp_stride
*       input buffer and its stride
*
*  @param[in]   i4_lambda, u1_lambda_q_shift
*       forwarded unchanged to the selected function
*
*  @param[in]   u1_cu_size
*       cu size; selects the function via hme_get_range(u1_cu_size) - 4
*
*  @param[in]   u1_max_tr_depth
*       max transform depth; limited to 1 for a cu size of 64
*
*  @return      value returned by the selected function
*
******************************************************************************
*/
WORD32 ihevce_determine_tu_tree_distribution(
    cu_inter_cand_t *ps_cu_data,
    me_func_selector_t *ps_func_selector,
    WORD16 *pi2_scratch_mem,
    UWORD8 *pu1_inp,
    WORD32 i4_inp_stride,
    WORD32 i4_lambda,
    UWORD8 u1_lambda_q_shift,
    UWORD8 u1_cu_size,
    UWORD8 u1_max_tr_depth)
{
    PF_SAD_FXN_TU_REC apf_tu_rec_satd[4];
    err_prms_t s_prms;
    WORD32 i4_tu_tree_satd;
    WORD32 i4_fxn_idx;

    /* One evaluation function per cu size */
    apf_tu_rec_satd[CU_8x8] = hme_evalsatd_pt_pu_8x8_tu_rec;
    apf_tu_rec_satd[CU_16x16] = hme_evalsatd_pt_pu_16x16_tu_rec;
    apf_tu_rec_satd[CU_32x32] = hme_evalsatd_pt_pu_32x32_tu_rec;
    apf_tu_rec_satd[CU_64x64] = hme_evalsatd_pt_pu_64x64_tu_rec;

    /* Source and prediction buffers */
    s_prms.pu1_inp = pu1_inp;
    s_prms.i4_inp_stride = i4_inp_stride;
    s_prms.pu1_ref = ps_cu_data->pu1_pred_data;
    s_prms.i4_ref_stride = ps_cu_data->i4_pred_data_stride;

    /* Output and working storage */
    s_prms.pi4_sad_grid = &i4_tu_tree_satd;
    s_prms.pi4_tu_split_flags = ps_cu_data->ai4_tu_split_flag;
    s_prms.pu1_wkg_mem = (UWORD8 *)pi2_scratch_mem;

    /* The transform depth handed over is limited to 1 for a 64x64 cu */
    s_prms.u1_max_tr_depth = (64 == u1_cu_size) ? MIN(1, u1_max_tr_depth) : u1_max_tr_depth;

    i4_fxn_idx = hme_get_range(u1_cu_size) - 4;

    i4_tu_tree_satd = apf_tu_rec_satd[i4_fxn_idx](
        &s_prms, i4_lambda, u1_lambda_q_shift, 0, ps_func_selector);

    /* With zero transform depth, a non-zero part size and a cu size other  */
    /* than 64, the first split flag entry is forced to 1                   */
    if((u1_cu_size != 64) && (0 == u1_max_tr_depth) && (ps_cu_data->b3_part_size != 0))
    {
        ps_cu_data->ai4_tu_split_flag[0] = 1;
    }

    return i4_tu_tree_satd;
}
10387 #endif
10388 
/**
******************************************************************************
*
*  @brief Fills the neighbour 4x4 entries covered by a PU with that PU's
*         inter data (intra flag cleared, L0/L1 pred flags and mv)
*
*  @param[inout]   ps_nbr_4x4
*       first neighbour entry of the PU; it is updated in place and then
*       replicated over (b4_wd + 1) columns and (b4_ht + 1) rows
*
*  @param[in]   ps_pu
*       PU supplying b2_pred_mode, mv, b4_wd and b4_ht
*
*  @param[in]   i4_nbr_buf_stride
*       distance, in entries, between two rows of the neighbour buffer
*
*  @return      none
*
******************************************************************************
*/
void ihevce_populate_nbr_4x4_with_pu_data(
    nbr_4x4_t *ps_nbr_4x4, pu_t *ps_pu, WORD32 i4_nbr_buf_stride)
{
    WORD32 num_rows = (ps_pu->b4_ht + 1);
    WORD32 num_cols = (ps_pu->b4_wd + 1);
    nbr_4x4_t *ps_row = ps_nbr_4x4;
    WORD32 row, col;

    /* Build the template in the first entry */
    ps_nbr_4x4->mv = ps_pu->mv;
    ps_nbr_4x4->b1_pred_l1_flag = (ps_pu->b2_pred_mode > PRED_L0);
    ps_nbr_4x4->b1_pred_l0_flag = !(ps_pu->b2_pred_mode & 1);
    ps_nbr_4x4->b1_intra_flag = 0;

    /* Replicate the template over the whole PU area, row by row */
    for(row = 0; row < num_rows; row++, ps_row += i4_nbr_buf_stride)
    {
        for(col = 0; col < num_cols; col++)
        {
            ps_row[col] = ps_nbr_4x4[0];
        }
    }
}
10414 
/**
******************************************************************************
*
*  @brief Generates the luma inter prediction of every PU of an inter
*         candidate into the candidate's scratch prediction buffer
*
*  @par   Description
*         ihevce_luma_inter_pred_pu is called for each PU of the candidate,
*         irrespective of its skip / merge flags. For a two partition cu the
*         destination pointer is moved past the first partition (down for
*         2Nx__ partitions, right for __x2N partitions) before the second
*         partition is predicted
*
*  @param[in]   ps_ctxt
*       enc loop context; its s_mc_ctxt is passed to the prediction call
*
*  @param[inout]   ps_inter_cand
*       inter candidate supplying part size, PUs, pu1_pred_data_scr
*       (destination) and i4_pred_data_stride
*
*  @param[in]   cu_size
*       cu size, compared against the PU width / height to identify the
*       partition direction
*
*  @return      none
*
******************************************************************************
*/
void ihevce_call_luma_inter_pred_rdopt_pass1(
    ihevce_enc_loop_ctxt_t *ps_ctxt, cu_inter_cand_t *ps_inter_cand, WORD32 cu_size)
{
    UWORD8 *pu1_pred = ps_inter_cand->pu1_pred_data_scr;
    WORD32 pred_stride = ps_inter_cand->i4_pred_data_stride;
    /* 1 partition for 2Nx2N, 2 partitions otherwise */
    WORD32 num_cu_part = (SIZE_2Nx2N != ps_inter_cand->b3_part_size) + 1;
    WORD32 ctr;

    for(ctr = 0; ctr < num_cu_part; ctr++)
    {
        pu_t *ps_pu = &ps_inter_cand->as_inter_pu[ctr];

        /* IF AMP then each partitions can have diff wd ht */
        WORD32 inter_pu_wd = (ps_pu->b4_wd + 1) << 2;
        WORD32 inter_pu_ht = (ps_pu->b4_ht + 1) << 2;

        ihevce_luma_inter_pred_pu(&ps_ctxt->s_mc_ctxt, ps_pu, pu1_pred, pred_stride, 1);

        /* Only the first of two partitions moves the destination pointer */
        if((2 == num_cu_part) && (0 == ctr))
        {
            /* 2Nx__ partition case */
            if(inter_pu_wd == cu_size)
            {
                pu1_pred += (inter_pu_ht * pred_stride);
            }

            /* __x2N partition case */
            if(inter_pu_ht == cu_size)
            {
                pu1_pred += inter_pu_wd;
            }
        }
    }
}
10456 
ihevce_it_recon_ssd(ihevce_enc_loop_ctxt_t * ps_ctxt,UWORD8 * pu1_src,WORD32 i4_src_strd,UWORD8 * pu1_pred,WORD32 i4_pred_strd,WORD16 * pi2_deq_data,WORD32 i4_deq_data_strd,UWORD8 * pu1_recon,WORD32 i4_recon_stride,UWORD8 * pu1_ecd_data,UWORD8 u1_trans_size,UWORD8 u1_pred_mode,WORD32 i4_cbf,WORD32 i4_zero_col,WORD32 i4_zero_row,CHROMA_PLANE_ID_T e_chroma_plane)10457 LWORD64 ihevce_it_recon_ssd(
10458     ihevce_enc_loop_ctxt_t *ps_ctxt,
10459     UWORD8 *pu1_src,
10460     WORD32 i4_src_strd,
10461     UWORD8 *pu1_pred,
10462     WORD32 i4_pred_strd,
10463     WORD16 *pi2_deq_data,
10464     WORD32 i4_deq_data_strd,
10465     UWORD8 *pu1_recon,
10466     WORD32 i4_recon_stride,
10467     UWORD8 *pu1_ecd_data,
10468     UWORD8 u1_trans_size,
10469     UWORD8 u1_pred_mode,
10470     WORD32 i4_cbf,
10471     WORD32 i4_zero_col,
10472     WORD32 i4_zero_row,
10473     CHROMA_PLANE_ID_T e_chroma_plane)
10474 {
10475     if(NULL_PLANE == e_chroma_plane)
10476     {
10477         ihevce_it_recon_fxn(
10478             ps_ctxt,
10479             pi2_deq_data,
10480             i4_deq_data_strd,
10481             pu1_pred,
10482             i4_pred_strd,
10483             pu1_recon,
10484             i4_recon_stride,
10485             pu1_ecd_data,
10486             u1_trans_size,
10487             u1_pred_mode,
10488             i4_cbf,
10489             i4_zero_col,
10490             i4_zero_row);
10491 
10492         return ps_ctxt->s_cmn_opt_func.pf_ssd_calculator(
10493             pu1_recon, pu1_src, i4_recon_stride, i4_src_strd, u1_trans_size, u1_trans_size,
10494             e_chroma_plane);
10495     }
10496     else
10497     {
10498         ihevce_chroma_it_recon_fxn(
10499             ps_ctxt,
10500             pi2_deq_data,
10501             i4_deq_data_strd,
10502             pu1_pred,
10503             i4_pred_strd,
10504             pu1_recon,
10505             i4_recon_stride,
10506             pu1_ecd_data,
10507             u1_trans_size,
10508             i4_cbf,
10509             i4_zero_col,
10510             i4_zero_row,
10511             e_chroma_plane);
10512 
10513         return ps_ctxt->s_cmn_opt_func.pf_chroma_interleave_ssd_calculator(
10514             pu1_recon,
10515             pu1_src,
10516             i4_recon_stride,
10517             i4_src_strd,
10518             u1_trans_size,
10519             u1_trans_size,
10520             e_chroma_plane);
10521     }
10522 }
10523 
10524 /*!
10525 ******************************************************************************
* \if Function name : ihevce_chroma_t_q_iq_ssd_scan_fxn \endif
10527 *
10528 * \brief
10529 *    Transform unit level (Chroma) enc_loop function
10530 *
10531 * \param[in] ps_ctxt    enc_loop module ctxt pointer
10532 * \param[in] pu1_pred       pointer to predicted data buffer
10533 * \param[in] pred_strd      predicted buffer stride
10534 * \param[in] pu1_src    pointer to source data buffer
10535 * \param[in] src_strd   source buffer stride
10536 * \param[in] pi2_deq_data   pointer to store iq data
10537 * \param[in] deq_data_strd  iq data buffer stride
10538 * \param[out] pu1_ecd_data  pointer coeff output buffer (input to ent cod)
10539 * \param[out] pu1_csbf_buf  pointer to store the csbf for all 4x4 in a current
10540 *                           block
10541 * \param[out] csbf_strd     csbf buffer stride
10542 * \param[in] trans_size     transform size (4, 8, 16)
10543 * \param[in] intra_flag     0:Inter/Skip 1:Intra
* \param[out] pi4_coeff_off pointer to store the number of bytes produced in
*                           coeff buffer for the current TU in RDopt Mode
10547 * \param[out] pi4_zero_col  pointer to store the zero_col info for the TU
10548 * \param[out] pi4_zero_row  pointer to store the zero_row info for the TU
10549 *
10550 * \return
10551 *    CBF of the current block
10552 *
10553 * \author
10554 *  Ittiam
10555 *
10556 *****************************************************************************
10557 */
ihevce_chroma_t_q_iq_ssd_scan_fxn(ihevce_enc_loop_ctxt_t * ps_ctxt,UWORD8 * pu1_pred,WORD32 pred_strd,UWORD8 * pu1_src,WORD32 src_strd,WORD16 * pi2_deq_data,WORD32 deq_data_strd,UWORD8 * pu1_recon,WORD32 i4_recon_stride,UWORD8 * pu1_ecd_data,UWORD8 * pu1_csbf_buf,WORD32 csbf_strd,WORD32 trans_size,WORD32 i4_scan_idx,WORD32 intra_flag,WORD32 * pi4_coeff_off,WORD32 * pi4_tu_bits,WORD32 * pi4_zero_col,WORD32 * pi4_zero_row,UWORD8 * pu1_is_recon_available,WORD32 i4_perform_sbh,WORD32 i4_perform_rdoq,LWORD64 * pi8_cost,WORD32 i4_alpha_stim_multiplier,UWORD8 u1_is_cu_noisy,UWORD8 u1_is_skip,SSD_TYPE_T e_ssd_type,CHROMA_PLANE_ID_T e_chroma_plane)10558 WORD32 ihevce_chroma_t_q_iq_ssd_scan_fxn(
10559     ihevce_enc_loop_ctxt_t *ps_ctxt,
10560     UWORD8 *pu1_pred,
10561     WORD32 pred_strd,
10562     UWORD8 *pu1_src,
10563     WORD32 src_strd,
10564     WORD16 *pi2_deq_data,
10565     WORD32 deq_data_strd,
10566     UWORD8 *pu1_recon,
10567     WORD32 i4_recon_stride,
10568     UWORD8 *pu1_ecd_data,
10569     UWORD8 *pu1_csbf_buf,
10570     WORD32 csbf_strd,
10571     WORD32 trans_size,
10572     WORD32 i4_scan_idx,
10573     WORD32 intra_flag,
10574     WORD32 *pi4_coeff_off,
10575     WORD32 *pi4_tu_bits,
10576     WORD32 *pi4_zero_col,
10577     WORD32 *pi4_zero_row,
10578     UWORD8 *pu1_is_recon_available,
10579     WORD32 i4_perform_sbh,
10580     WORD32 i4_perform_rdoq,
10581     LWORD64 *pi8_cost,
10582 #if USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
10583     WORD32 i4_alpha_stim_multiplier,
10584     UWORD8 u1_is_cu_noisy,
10585 #endif
10586     UWORD8 u1_is_skip,
10587     SSD_TYPE_T e_ssd_type,
10588     CHROMA_PLANE_ID_T e_chroma_plane)
10589 {
10590     WORD32 trans_idx, cbf, u4_blk_sad;
10591     WORD16 *pi2_quant_coeffs;
10592     WORD16 *pi2_trans_values;
10593     WORD32 quant_scale_mat_offset;
10594     WORD32 *pi4_trans_scratch;
10595     WORD32 *pi4_subBlock2csbfId_map = NULL;
10596 
10597 #if PROHIBIT_INTRA_QUANT_ROUNDING_FACTOR_TO_DROP_BELOW_1BY3
10598     WORD32 ai4_quant_rounding_factors[3][MAX_TU_SIZE * MAX_TU_SIZE], i;
10599 #endif
10600 
10601     rdoq_sbh_ctxt_t *ps_rdoq_sbh_ctxt = &ps_ctxt->s_rdoq_sbh_ctxt;
10602 
10603     WORD32 i4_perform_zcbf = (ps_ctxt->i4_zcbf_rdo_level == ZCBF_ENABLE) ||
10604                              (!intra_flag && ENABLE_INTER_ZCU_COST);
10605     WORD32 i4_perform_coeff_level_rdoq =
10606         (ps_ctxt->i4_quant_rounding_level != FIXED_QUANT_ROUNDING) &&
10607         (ps_ctxt->i4_chroma_quant_rounding_level == CHROMA_QUANT_ROUNDING);
10608 
10609     ASSERT((e_chroma_plane == U_PLANE) || (e_chroma_plane == V_PLANE));
10610     ASSERT(csbf_strd == MAX_TU_IN_CTB_ROW);
10611 
10612     *pi4_coeff_off = 0;
10613     *pi4_tu_bits = 0;
10614     pu1_is_recon_available[0] = 0;
10615 
10616     pi4_trans_scratch = (WORD32 *)&ps_ctxt->ai2_scratch[0];
10617     pi2_quant_coeffs = &ps_ctxt->ai2_scratch[0];
10618     pi2_trans_values = &ps_ctxt->ai2_scratch[0] + (MAX_TRANS_SIZE * 2);
10619 
10620     if(2 == trans_size)
10621     {
10622         trans_size = 4;
10623     }
10624 
10625     /* translate the transform size to index */
10626     trans_idx = trans_size >> 2;
10627 
10628     if(16 == trans_size)
10629     {
10630         trans_idx = 3;
10631     }
10632 
10633     if(u1_is_skip)
10634     {
10635         pi8_cost[0] = ps_ctxt->s_cmn_opt_func.pf_chroma_interleave_ssd_calculator(
10636             pu1_pred,
10637             pu1_src,
10638             pred_strd,
10639             src_strd,
10640             trans_size,
10641             trans_size,
10642             e_chroma_plane);
10643 
10644         if(e_ssd_type == SPATIAL_DOMAIN_SSD)
10645         {
10646             /* buffer copy fromp pred to recon */
10647             ps_ctxt->s_cmn_opt_func.pf_chroma_interleave_2d_copy(
10648                 pu1_pred,
10649                 pred_strd,
10650                 pu1_recon,
10651                 i4_recon_stride,
10652                 trans_size,
10653                 trans_size,
10654                 e_chroma_plane);
10655 
10656             pu1_is_recon_available[0] = 1;
10657         }
10658 
10659 #if USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
10660         if(u1_is_cu_noisy && i4_alpha_stim_multiplier)
10661         {
10662             pi8_cost[0] = ihevce_inject_stim_into_distortion(
10663                 pu1_src,
10664                 src_strd,
10665                 pu1_pred,
10666                 pred_strd,
10667                 pi8_cost[0],
10668                 i4_alpha_stim_multiplier,
10669                 trans_size,
10670                 0,
10671                 ps_ctxt->u1_enable_psyRDOPT,
10672                 e_chroma_plane);
10673         }
10674 #endif
10675 
10676 #if ENABLE_INTER_ZCU_COST
10677 #if !WEIGH_CHROMA_COST
10678         /* cbf = 0, accumulate cu not coded cost */
10679         ps_ctxt->i8_cu_not_coded_cost += pi8_cost[0];
10680 #else
10681         ps_ctxt->i8_cu_not_coded_cost += (pi8_cost[0] * ps_ctxt->u4_chroma_cost_weighing_factor +
10682                                           (1 << (CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT - 1))) >>
10683                                          CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT;
10684 #endif
10685 #endif
10686 
10687         return 0;
10688     }
10689 
10690     if(intra_flag == 1)
10691     {
10692         quant_scale_mat_offset = 0;
10693 
10694 #if PROHIBIT_INTRA_QUANT_ROUNDING_FACTOR_TO_DROP_BELOW_1BY3
10695         ai4_quant_rounding_factors[0][0] =
10696             MAX(ps_ctxt->i4_quant_rnd_factor[intra_flag], (1 << QUANT_ROUND_FACTOR_Q) / 3);
10697 
10698         for(i = 0; i < trans_size * trans_size; i++)
10699         {
10700             ai4_quant_rounding_factors[1][i] =
10701                 MAX(ps_ctxt->pi4_quant_round_factor_cr_cu_ctb_0_1[trans_size >> 3][i],
10702                     (1 << QUANT_ROUND_FACTOR_Q) / 3);
10703             ai4_quant_rounding_factors[2][i] =
10704                 MAX(ps_ctxt->pi4_quant_round_factor_cr_cu_ctb_1_2[trans_size >> 3][i],
10705                     (1 << QUANT_ROUND_FACTOR_Q) / 3);
10706         }
10707 #endif
10708     }
10709     else
10710     {
10711         quant_scale_mat_offset = NUM_TRANS_TYPES;
10712     }
10713 
10714     switch(trans_size)
10715     {
10716     case 4:
10717     {
10718         pi4_subBlock2csbfId_map = gai4_subBlock2csbfId_map4x4TU;
10719 
10720         break;
10721     }
10722     case 8:
10723     {
10724         pi4_subBlock2csbfId_map = gai4_subBlock2csbfId_map8x8TU;
10725 
10726         break;
10727     }
10728     case 16:
10729     {
10730         pi4_subBlock2csbfId_map = gai4_subBlock2csbfId_map16x16TU;
10731 
10732         break;
10733     }
10734     case 32:
10735     {
10736         pi4_subBlock2csbfId_map = gai4_subBlock2csbfId_map32x32TU;
10737 
10738         break;
10739     }
10740     }
10741 
10742     /* ---------- call residue and transform block ------- */
10743     u4_blk_sad = ps_ctxt->apf_chrm_resd_trns[trans_idx - 1](
10744         pu1_src,
10745         pu1_pred,
10746         pi4_trans_scratch,
10747         pi2_trans_values,
10748         src_strd,
10749         pred_strd,
10750         trans_size,
10751         e_chroma_plane);
10752     (void)u4_blk_sad;
10753     /* -------- calculate SSD calculation in Transform Domain ------ */
10754 
10755     cbf = ps_ctxt->apf_quant_iquant_ssd
10756               [i4_perform_coeff_level_rdoq + (e_ssd_type != FREQUENCY_DOMAIN_SSD) * 2]
10757 
10758           (pi2_trans_values,
10759            ps_ctxt->api2_rescal_mat[trans_idx + quant_scale_mat_offset],
10760            pi2_quant_coeffs,
10761            pi2_deq_data,
10762            trans_size,
10763            ps_ctxt->i4_chrm_cu_qp_div6,
10764            ps_ctxt->i4_chrm_cu_qp_mod6,
10765 #if !PROHIBIT_INTRA_QUANT_ROUNDING_FACTOR_TO_DROP_BELOW_1BY3
10766            ps_ctxt->i4_quant_rnd_factor[intra_flag],
10767            ps_ctxt->pi4_quant_round_factor_cr_cu_ctb_0_1[trans_size >> 3],
10768            ps_ctxt->pi4_quant_round_factor_cr_cu_ctb_1_2[trans_size >> 3],
10769 #else
10770            intra_flag ? ai4_quant_rounding_factors[0][0] : ps_ctxt->i4_quant_rnd_factor[intra_flag],
10771            intra_flag ? ai4_quant_rounding_factors[1]
10772                       : ps_ctxt->pi4_quant_round_factor_cr_cu_ctb_0_1[trans_size >> 3],
10773            intra_flag ? ai4_quant_rounding_factors[2]
10774                       : ps_ctxt->pi4_quant_round_factor_cr_cu_ctb_1_2[trans_size >> 3],
10775 #endif
10776            trans_size,
10777            trans_size,
10778            deq_data_strd,
10779            pu1_csbf_buf,
10780            csbf_strd,
10781            pi4_zero_col,
10782            pi4_zero_row,
10783            ps_ctxt->api2_scal_mat[trans_idx + quant_scale_mat_offset],
10784            pi8_cost);
10785 
10786     if(e_ssd_type != FREQUENCY_DOMAIN_SSD)
10787     {
10788         pi8_cost[0] = UINT_MAX;
10789     }
10790 
10791     if(0 != cbf)
10792     {
10793         if(i4_perform_sbh || i4_perform_rdoq)
10794         {
10795             ps_rdoq_sbh_ctxt->i4_iq_data_strd = deq_data_strd;
10796             ps_rdoq_sbh_ctxt->i4_q_data_strd = trans_size;
10797 
10798             ps_rdoq_sbh_ctxt->i4_qp_div = ps_ctxt->i4_chrm_cu_qp_div6;
10799             ps_rdoq_sbh_ctxt->i2_qp_rem = ps_ctxt->i4_chrm_cu_qp_mod6;
10800             ps_rdoq_sbh_ctxt->i4_scan_idx = i4_scan_idx;
10801             ps_rdoq_sbh_ctxt->i8_ssd_cost = *pi8_cost;
10802             ps_rdoq_sbh_ctxt->i4_trans_size = trans_size;
10803 
10804             ps_rdoq_sbh_ctxt->pi2_dequant_coeff =
10805                 ps_ctxt->api2_scal_mat[trans_idx + quant_scale_mat_offset];
10806             ps_rdoq_sbh_ctxt->pi2_iquant_coeffs = pi2_deq_data;
10807             ps_rdoq_sbh_ctxt->pi2_quant_coeffs = pi2_quant_coeffs;
10808             ps_rdoq_sbh_ctxt->pi2_trans_values = pi2_trans_values;
10809             ps_rdoq_sbh_ctxt->pu1_csbf_buf = pu1_csbf_buf;
10810             ps_rdoq_sbh_ctxt->pi4_subBlock2csbfId_map = pi4_subBlock2csbfId_map;
10811 
10812             if((!i4_perform_rdoq))
10813             {
10814                 ihevce_sign_data_hiding(ps_rdoq_sbh_ctxt);
10815 
10816                 pi8_cost[0] = ps_rdoq_sbh_ctxt->i8_ssd_cost;
10817             }
10818         }
10819 
10820         /* ------- call coeffs scan function ------- */
10821         *pi4_coeff_off = ps_ctxt->s_cmn_opt_func.pf_scan_coeffs(
10822             pi2_quant_coeffs,
10823             pi4_subBlock2csbfId_map,
10824             i4_scan_idx,
10825             trans_size,
10826             pu1_ecd_data,
10827             pu1_csbf_buf,
10828             csbf_strd);
10829     }
10830 
10831     /*  Normalize Cost. Note : trans_idx, not (trans_idx-1) */
10832     pi8_cost[0] >>= ga_trans_shift[trans_idx];
10833 
10834 #if RDOPT_ZERO_CBF_ENABLE
10835     if((0 != cbf))
10836     {
10837         WORD32 tu_bits;
10838         LWORD64 zero_cbf_cost_u, curr_cb_cod_cost;
10839 
10840         zero_cbf_cost_u = 0;
10841 
        /* Populating the fields of rdoq_ctxt structure */
10843         if(i4_perform_rdoq)
10844         {
10845             //memset(ps_rdoq_sbh_ctxt,0,sizeof(rdoq_sbh_ctxt_t));
10846             /* transform size to log2transform size */
10847             GETRANGE(ps_rdoq_sbh_ctxt->i4_log2_trans_size, trans_size);
10848             ps_rdoq_sbh_ctxt->i4_log2_trans_size -= 1;
10849 
10850             ps_rdoq_sbh_ctxt->i8_cl_ssd_lambda_qf = ps_ctxt->i8_cl_ssd_lambda_chroma_qf;
10851             ps_rdoq_sbh_ctxt->i4_is_luma = 0;
10852             ps_rdoq_sbh_ctxt->i4_shift_val_ssd_in_td = ga_trans_shift[trans_idx];
10853             ps_rdoq_sbh_ctxt->i4_round_val_ssd_in_td =
10854                 (1 << (ps_rdoq_sbh_ctxt->i4_shift_val_ssd_in_td - 1));
10855             ps_rdoq_sbh_ctxt->i1_tu_is_coded = 0;
10856             ps_rdoq_sbh_ctxt->pi4_zero_col = pi4_zero_col;
10857             ps_rdoq_sbh_ctxt->pi4_zero_row = pi4_zero_row;
10858         }
10859         else if(i4_perform_zcbf)
10860         {
10861             /* cost of zero cbf encoding */
10862             zero_cbf_cost_u =
10863 
10864                 ps_ctxt->s_cmn_opt_func.pf_chroma_interleave_ssd_calculator(
10865                     pu1_pred,
10866                     pu1_src,
10867                     pred_strd,
10868                     src_strd,
10869                     trans_size,
10870                     trans_size,
10871                     e_chroma_plane);
10872         }
10873 
10874         /************************************************************************/
10875         /* call the entropy rdo encode to get the bit estimate for current tu   */
10876         /* note that tu includes only residual coding bits and does not include */
10877         /* tu split, cbf and qp delta encoding bits for a TU                    */
10878         /************************************************************************/
10879         if(i4_perform_rdoq)
10880         {
10881             tu_bits = ihevce_entropy_rdo_encode_tu_rdoq(
10882                 &ps_ctxt->s_rdopt_entropy_ctxt,
10883                 pu1_ecd_data,
10884                 trans_size,
10885                 0,
10886                 ps_rdoq_sbh_ctxt,
10887                 pi8_cost,
10888                 &zero_cbf_cost_u,
10889                 0);
10890             //Currently, we are not accounting for sign bit in RDOPT bits calculation when RDOQ is turned on
10891 
10892             if(ps_rdoq_sbh_ctxt->i1_tu_is_coded == 0)
10893             {
10894                 cbf = 0;
10895 
10896                 /* num bytes is set to 0 */
10897                 *pi4_coeff_off = 0;
10898             }
10899 
10900             (*pi4_tu_bits) += tu_bits;
10901 
10902             if((i4_perform_sbh) && (0 != cbf))
10903             {
10904                 ps_rdoq_sbh_ctxt->i8_ssd_cost = pi8_cost[0];
10905 
10906                 ihevce_sign_data_hiding(ps_rdoq_sbh_ctxt);
10907 
10908                 pi8_cost[0] = ps_rdoq_sbh_ctxt->i8_ssd_cost;
10909             }
10910 
10911             /*Add round value before normalizing*/
10912             pi8_cost[0] += ps_rdoq_sbh_ctxt->i4_round_val_ssd_in_td;
10913             pi8_cost[0] >>= ga_trans_shift[trans_idx];
10914 
10915             if(ps_rdoq_sbh_ctxt->i1_tu_is_coded == 1)
10916             {
10917                 *pi4_coeff_off = ps_ctxt->s_cmn_opt_func.pf_scan_coeffs(
10918                     pi2_quant_coeffs,
10919                     pi4_subBlock2csbfId_map,
10920                     i4_scan_idx,
10921                     trans_size,
10922                     pu1_ecd_data,
10923                     ps_rdoq_sbh_ctxt->pu1_csbf_buf,
10924                     csbf_strd);
10925             }
10926         }
10927         else
10928         {
10929             /************************************************************************/
10930             /* call the entropy rdo encode to get the bit estimate for current tu   */
10931             /* note that tu includes only residual coding bits and does not include */
10932             /* tu split, cbf and qp delta encoding bits for a TU                    */
10933             /************************************************************************/
10934             tu_bits = ihevce_entropy_rdo_encode_tu(
10935                 &ps_ctxt->s_rdopt_entropy_ctxt, pu1_ecd_data, trans_size, 0, i4_perform_sbh);
10936 
10937             (*pi4_tu_bits) += tu_bits;
10938         }
10939 
10940         if(e_ssd_type == SPATIAL_DOMAIN_SSD)
10941         {
10942             pi8_cost[0] = ihevce_it_recon_ssd(
10943                 ps_ctxt,
10944                 pu1_src,
10945                 src_strd,
10946                 pu1_pred,
10947                 pred_strd,
10948                 pi2_deq_data,
10949                 deq_data_strd,
10950                 pu1_recon,
10951                 i4_recon_stride,
10952                 pu1_ecd_data,
10953                 trans_size,
10954                 PRED_MODE_INTRA,
10955                 cbf,
10956                 pi4_zero_col[0],
10957                 pi4_zero_row[0],
10958                 e_chroma_plane);
10959 
10960             pu1_is_recon_available[0] = 1;
10961         }
10962 
10963 #if USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
10964         if(u1_is_cu_noisy && (e_ssd_type == SPATIAL_DOMAIN_SSD) && i4_alpha_stim_multiplier)
10965         {
10966             pi8_cost[0] = ihevce_inject_stim_into_distortion(
10967                 pu1_src,
10968                 src_strd,
10969                 pu1_recon,
10970                 i4_recon_stride,
10971                 pi8_cost[0],
10972                 i4_alpha_stim_multiplier,
10973                 trans_size,
10974                 0,
10975                 ps_ctxt->u1_enable_psyRDOPT,
10976                 e_chroma_plane);
10977         }
10978         else if(u1_is_cu_noisy && (e_ssd_type == FREQUENCY_DOMAIN_SSD) && i4_alpha_stim_multiplier)
10979         {
10980             pi8_cost[0] = ihevce_inject_stim_into_distortion(
10981                 pu1_src,
10982                 src_strd,
10983                 pu1_pred,
10984                 pred_strd,
10985                 pi8_cost[0],
10986                 i4_alpha_stim_multiplier,
10987                 trans_size,
10988                 0,
10989                 ps_ctxt->u1_enable_psyRDOPT,
10990                 e_chroma_plane);
10991         }
10992 #endif
10993 
10994         curr_cb_cod_cost = pi8_cost[0];
10995 
10996         /* add the SSD cost to bits estimate given by ECD */
10997         curr_cb_cod_cost +=
10998             COMPUTE_RATE_COST_CLIP30(tu_bits, ps_ctxt->i8_cl_ssd_lambda_chroma_qf, LAMBDA_Q_SHIFT);
10999 
11000         if(i4_perform_zcbf)
11001         {
11002 #if USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
11003             if(u1_is_cu_noisy && i4_alpha_stim_multiplier)
11004             {
11005                 zero_cbf_cost_u = ihevce_inject_stim_into_distortion(
11006                     pu1_src,
11007                     src_strd,
11008                     pu1_pred,
11009                     pred_strd,
11010                     zero_cbf_cost_u,
11011                     !ps_ctxt->u1_is_refPic ? ALPHA_FOR_ZERO_CODING_DECISIONS
11012                                            : ((100 - ALPHA_DISCOUNT_IN_REF_PICS_IN_RDOPT) *
11013                                               (double)ALPHA_FOR_ZERO_CODING_DECISIONS) /
11014                                                  100.0,
11015                     trans_size,
11016                     0,
11017                     ps_ctxt->u1_enable_psyRDOPT,
11018                     e_chroma_plane);
11019             }
11020 #endif
11021             /* force the tu as zero cbf if zero_cbf_cost is lower */
11022             if(zero_cbf_cost_u < curr_cb_cod_cost)
11023             {
11024                 *pi4_coeff_off = 0;
11025                 cbf = 0;
11026                 (*pi4_tu_bits) = 0;
11027                 pi8_cost[0] = zero_cbf_cost_u;
11028 
11029                 pu1_is_recon_available[0] = 0;
11030 
11031                 if(e_ssd_type == SPATIAL_DOMAIN_SSD)
11032                 {
11033                     ps_ctxt->s_cmn_opt_func.pf_chroma_interleave_2d_copy(
11034                         pu1_pred,
11035                         pred_strd,
11036                         pu1_recon,
11037                         i4_recon_stride,
11038                         trans_size,
11039                         trans_size,
11040                         e_chroma_plane);
11041 
11042                     pu1_is_recon_available[0] = 1;
11043                 }
11044             }
11045 
11046 #if ENABLE_INTER_ZCU_COST
11047             if(!intra_flag)
11048             {
11049 #if !WEIGH_CHROMA_COST
11050                 ps_ctxt->i8_cu_not_coded_cost += zero_cbf_cost_u;
11051 #else
11052                 ps_ctxt->i8_cu_not_coded_cost += (LWORD64)(
11053                     (zero_cbf_cost_u * ps_ctxt->u4_chroma_cost_weighing_factor +
11054                      (1 << (CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT - 1))) >>
11055                     CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT);
11056 #endif
11057             }
11058 #endif
11059         }
11060     }
11061     else
11062     {
11063         if(e_ssd_type == SPATIAL_DOMAIN_SSD)
11064         {
11065             pi8_cost[0] = ihevce_it_recon_ssd(
11066                 ps_ctxt,
11067                 pu1_src,
11068                 src_strd,
11069                 pu1_pred,
11070                 pred_strd,
11071                 pi2_deq_data,
11072                 deq_data_strd,
11073                 pu1_recon,
11074                 i4_recon_stride,
11075                 pu1_ecd_data,
11076                 trans_size,
11077                 PRED_MODE_INTRA,
11078                 cbf,
11079                 pi4_zero_col[0],
11080                 pi4_zero_row[0],
11081                 e_chroma_plane);
11082 
11083             pu1_is_recon_available[0] = 1;
11084         }
11085 
11086 #if USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
11087         if(u1_is_cu_noisy && (e_ssd_type == SPATIAL_DOMAIN_SSD) && i4_alpha_stim_multiplier)
11088         {
11089             pi8_cost[0] = ihevce_inject_stim_into_distortion(
11090                 pu1_src,
11091                 src_strd,
11092                 pu1_recon,
11093                 i4_recon_stride,
11094                 pi8_cost[0],
11095                 !ps_ctxt->u1_is_refPic ? ALPHA_FOR_ZERO_CODING_DECISIONS
11096                                        : ((100 - ALPHA_DISCOUNT_IN_REF_PICS_IN_RDOPT) *
11097                                           (double)ALPHA_FOR_ZERO_CODING_DECISIONS) /
11098                                              100.0,
11099                 trans_size,
11100                 0,
11101                 ps_ctxt->u1_enable_psyRDOPT,
11102                 e_chroma_plane);
11103         }
11104         else if(u1_is_cu_noisy && (e_ssd_type == FREQUENCY_DOMAIN_SSD) && i4_alpha_stim_multiplier)
11105         {
11106             pi8_cost[0] = ihevce_inject_stim_into_distortion(
11107                 pu1_src,
11108                 src_strd,
11109                 pu1_pred,
11110                 pred_strd,
11111                 pi8_cost[0],
11112                 !ps_ctxt->u1_is_refPic ? ALPHA_FOR_ZERO_CODING_DECISIONS
11113                                        : ((100 - ALPHA_DISCOUNT_IN_REF_PICS_IN_RDOPT) *
11114                                           (double)ALPHA_FOR_ZERO_CODING_DECISIONS) /
11115                                              100.0,
11116                 trans_size,
11117                 0,
11118                 ps_ctxt->u1_enable_psyRDOPT,
11119                 e_chroma_plane);
11120         }
11121 #endif
11122 
11123 #if ENABLE_INTER_ZCU_COST
11124         if(!intra_flag)
11125         {
11126 #if !WEIGH_CHROMA_COST
11127             /* cbf = 0, accumulate cu not coded cost */
11128             ps_ctxt->i8_cu_not_coded_cost += pi8_cost[0];
11129 #else
11130             /* cbf = 0, accumulate cu not coded cost */
11131 
11132             ps_ctxt->i8_cu_not_coded_cost += (LWORD64)(
11133                 (pi8_cost[0] * ps_ctxt->u4_chroma_cost_weighing_factor +
11134                  (1 << (CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT - 1))) >>
11135                 CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT);
11136 #endif
11137         }
11138 #endif
11139     }
11140 #endif /* RDOPT_ZERO_CBF_ENABLE */
11141 
11142     return (cbf);
11143 }
11144