1 /******************************************************************************
2  *
3  * Copyright (C) 2018 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 
21 /*!
22 ******************************************************************************
23 * \file ihevce_enc_loop_utils.c
24 *
25 * \brief
26 *    This file contains utility functions of Encode loop
27 *
28 * \date
29 *    18/09/2012
30 *
31 * \author
32 *    Ittiam
33 *
34 *
35 * List of Functions
36 *
37 *
38 ******************************************************************************
39 */
40 
41 /*****************************************************************************/
42 /* File Includes                                                             */
43 /*****************************************************************************/
44 /* System include files */
45 #include <stdio.h>
46 #include <string.h>
47 #include <stdlib.h>
48 #include <assert.h>
49 #include <stdarg.h>
50 #include <math.h>
51 #include <limits.h>
52 
53 /* User include files */
54 #include "ihevc_typedefs.h"
55 #include "itt_video_api.h"
56 #include "ihevce_api.h"
57 
58 #include "rc_cntrl_param.h"
59 #include "rc_frame_info_collector.h"
60 #include "rc_look_ahead_params.h"
61 
62 #include "ihevc_defs.h"
63 #include "ihevc_macros.h"
64 #include "ihevc_debug.h"
65 #include "ihevc_structs.h"
66 #include "ihevc_platform_macros.h"
67 #include "ihevc_deblk.h"
68 #include "ihevc_itrans_recon.h"
69 #include "ihevc_chroma_itrans_recon.h"
70 #include "ihevc_chroma_intra_pred.h"
71 #include "ihevc_intra_pred.h"
72 #include "ihevc_inter_pred.h"
73 #include "ihevc_mem_fns.h"
74 #include "ihevc_padding.h"
75 #include "ihevc_weighted_pred.h"
76 #include "ihevc_sao.h"
77 #include "ihevc_resi_trans.h"
78 #include "ihevc_quant_iquant_ssd.h"
79 #include "ihevc_cabac_tables.h"
80 #include "ihevc_common_tables.h"
81 
82 #include "ihevce_defs.h"
83 #include "ihevce_hle_interface.h"
84 #include "ihevce_lap_enc_structs.h"
85 #include "ihevce_multi_thrd_structs.h"
86 #include "ihevce_multi_thrd_funcs.h"
87 #include "ihevce_me_common_defs.h"
88 #include "ihevce_had_satd.h"
89 #include "ihevce_error_codes.h"
90 #include "ihevce_bitstream.h"
91 #include "ihevce_cabac.h"
92 #include "ihevce_rdoq_macros.h"
93 #include "ihevce_function_selector.h"
94 #include "ihevce_enc_structs.h"
95 #include "ihevce_entropy_structs.h"
96 #include "ihevce_cmn_utils_instr_set_router.h"
97 #include "ihevce_ipe_instr_set_router.h"
98 #include "ihevce_decomp_pre_intra_structs.h"
99 #include "ihevce_decomp_pre_intra_pass.h"
100 #include "ihevce_enc_loop_structs.h"
101 #include "ihevce_nbr_avail.h"
102 #include "ihevce_enc_loop_utils.h"
103 #include "ihevce_sub_pic_rc.h"
104 #include "ihevce_global_tables.h"
105 #include "ihevce_bs_compute_ctb.h"
106 #include "ihevce_cabac_rdo.h"
107 #include "ihevce_deblk.h"
108 #include "ihevce_frame_process.h"
109 #include "ihevce_rc_enc_structs.h"
110 #include "hme_datatype.h"
111 #include "hme_interface.h"
112 #include "hme_common_defs.h"
113 #include "hme_defs.h"
114 #include "hme_common_utils.h"
115 #include "ihevce_me_instr_set_router.h"
116 #include "ihevce_enc_subpel_gen.h"
117 #include "ihevce_inter_pred.h"
118 #include "ihevce_mv_pred.h"
119 #include "ihevce_mv_pred_merge.h"
120 #include "ihevce_enc_loop_inter_mode_sifter.h"
121 #include "ihevce_enc_cu_recursion.h"
122 #include "ihevce_enc_loop_pass.h"
123 #include "ihevce_common_utils.h"
124 #include "ihevce_dep_mngr_interface.h"
125 #include "ihevce_sao.h"
126 #include "ihevce_tile_interface.h"
127 #include "ihevce_profile.h"
128 #include "ihevce_stasino_helpers.h"
129 #include "ihevce_tu_tree_selector.h"
130 
131 /*****************************************************************************/
132 /* Globals                                                                   */
133 /*****************************************************************************/
134 
135 extern UWORD16 gau2_ihevce_cabac_bin_to_bits[64 * 2];
136 extern const UWORD8 gu1_hevce_scan4x4[3][16];
137 extern const UWORD8 gu1_hevce_sigcoeff_ctxtinc[4][16];
138 extern const UWORD8 gu1_hevce_sigcoeff_ctxtinc_tr4[16];
139 extern const UWORD8 gu1_hevce_sigcoeff_ctxtinc_00[16];
140 
141 /*****************************************************************************/
142 /* Constant Macros                                                           */
143 /*****************************************************************************/
144 #define ENABLE_ZERO_CBF 1
145 #define DISABLE_RDOQ_INTRA 0
146 
147 /*****************************************************************************/
148 /* Function Definitions                                                      */
149 /*****************************************************************************/
/*!
******************************************************************************
* \if Function name : ihevce_tu_tree_update \endif
*
* \brief
*    Recursively expands packed TU split / early-CBF flags into consecutive
*    tu_prms_t entries (size, x/y offset, early CBF) and counts them.
*
*    Flag layout, as decoded below:
*      - bit 0 of tu_split_flag : split flag of the current node
*      - node size 32           : four 5-bit child fields at bits 16, 11, 6, 1
*                                 (TL, TR, BL, BR)
*      - other nodes >= 16      : four 1-bit child fields at bits 4, 3, 2, 1
*                                 (TL, TR, BL, BR)
*    tu_early_cbf uses the same layout.
*
* \param[in,out] ps_tu_prms : entry to fill next; on entry its u1_tu_size
*                             holds the size of the root TU
* \param[in,out] pnum_tu_in_cu : running count of TU entries written
* \param[in] depth : depth of the current node below the root
* \param[in] tu_split_flag : packed split flags of the current node
* \param[in] tu_early_cbf : packed early CBF flags of the current node
* \param[in] i4_x_off : x offset of the current node
* \param[in] i4_y_off : y offset of the current node
*
* \return
*    Pointer to the entry the next call should fill (returned as void *)
*
* \author
*  Ittiam
*
*****************************************************************************
*/
void *ihevce_tu_tree_update(
    tu_prms_t *ps_tu_prms,
    WORD32 *pnum_tu_in_cu,
    WORD32 depth,
    WORD32 tu_split_flag,
    WORD32 tu_early_cbf,
    WORD32 i4_x_off,
    WORD32 i4_y_off)
{
    /* Root TU size is carried in the entry that is about to be filled */
    const WORD32 root_size = ps_tu_prms->u1_tu_size;
    const WORD32 node_size = root_size >> depth;
    const WORD32 node_is_split = tu_split_flag & 0x1;
    WORD32 quad;

    if((node_size >= 16) && node_is_split)
    {
        /* Interior node: descend into the four quadrants in TL, TR, BL, BR order */
        const WORD32 half_size = root_size >> (depth + 1);
        const WORD32 field_bits = (32 == node_size) ? 5 : 1;
        const WORD32 field_mask = (32 == node_size) ? 0x1F : 0x1;

        for(quad = 0; quad < 4; quad++)
        {
            /* Quadrant 0 sits in the most significant field, quadrant 3 just above bit 0 */
            const WORD32 field_shift = 1 + ((3 - quad) * field_bits);

            ps_tu_prms = (tu_prms_t *)ihevce_tu_tree_update(
                ps_tu_prms,
                pnum_tu_in_cu,
                depth + 1,
                (tu_split_flag >> field_shift) & field_mask,
                (tu_early_cbf >> field_shift) & field_mask,
                i4_x_off + ((quad & 1) ? half_size : 0),
                i4_y_off + ((quad & 2) ? half_size : 0));
        }

        return ps_tu_prms;
    }

    if(node_is_split)
    {
        /* Split node smaller than 16 (the 8x8 case per the original comment): */
        /* emit its four children directly, without further recursion         */
        const WORD32 half_size = root_size >> (depth + 1);

        (*pnum_tu_in_cu) += 4;

        for(quad = 0; quad < 4; quad++)
        {
            ps_tu_prms->u1_tu_size = half_size;
            ps_tu_prms->u1_x_off = i4_x_off + ((quad & 1) ? half_size : 0);
            ps_tu_prms->u1_y_off = i4_y_off + ((quad & 2) ? half_size : 0);

            /* Early CBF is not done for 4x4 transforms, so it is forced to 1 */
            ps_tu_prms->i4_early_cbf = 1;

            /* Stay on the last child; the common tail below does the final advance */
            if(quad < 3)
            {
                ps_tu_prms++;
            }
        }
    }
    else
    {
        /* Unsplit node: a single TU covering the whole node */
        ps_tu_prms->u1_tu_size = node_size;
        ps_tu_prms->u1_x_off = i4_x_off;
        ps_tu_prms->u1_y_off = i4_y_off;
        ps_tu_prms->i4_early_cbf = tu_early_cbf & 0x1;

        (*pnum_tu_in_cu)++;
    }

    if((*pnum_tu_in_cu) < MAX_TU_IN_CTB)
    {
        /* Move to the next free entry and seed it with the root size, */
        /* which the next invocation reads back on entry               */
        ps_tu_prms++;
        ps_tu_prms->u1_tu_size = root_size;
    }

    return ps_tu_prms;
}
311 
312 /*!
313 ******************************************************************************
314 * \if Function name : ihevce_compute_quant_rel_param \endif
315 *
316 * \brief
317 *    This function updates quantization related parameters like qp_mod_6 etc in
318 *       context according to new qp
319 *
320 * \date
321 *    08/01/2013
322 *
323 * \author
324 *    Ittiam
325 *
326 * \return
327 *
328 * List of Functions
329 *
330 *
331 ******************************************************************************
332 */
ihevce_compute_quant_rel_param(ihevce_enc_loop_ctxt_t * ps_ctxt,WORD8 i1_cu_qp)333 void ihevce_compute_quant_rel_param(ihevce_enc_loop_ctxt_t *ps_ctxt, WORD8 i1_cu_qp)
334 {
335     WORD32 i4_div_factor;
336 
337     ps_ctxt->i4_chrm_cu_qp =
338         (ps_ctxt->u1_chroma_array_type == 2)
339             ? MIN(i1_cu_qp + ps_ctxt->i4_chroma_qp_offset, 51)
340             : gai1_ihevc_chroma_qp_scale[i1_cu_qp + ps_ctxt->i4_chroma_qp_offset + MAX_QP_BD_OFFSET];
341     ps_ctxt->i4_cu_qp_div6 = (i1_cu_qp + (6 * (ps_ctxt->u1_bit_depth - 8))) / 6;
342     i4_div_factor = (i1_cu_qp + 3) / 6;
343     i4_div_factor = CLIP3(i4_div_factor, 3, 6);
344     ps_ctxt->i4_cu_qp_mod6 = (i1_cu_qp + (6 * (ps_ctxt->u1_bit_depth - 8))) % 6;
345     ps_ctxt->i4_chrm_cu_qp_div6 = (ps_ctxt->i4_chrm_cu_qp + (6 * (ps_ctxt->u1_bit_depth - 8))) / 6;
346     ps_ctxt->i4_chrm_cu_qp_mod6 = (ps_ctxt->i4_chrm_cu_qp + (6 * (ps_ctxt->u1_bit_depth - 8))) % 6;
347 
348 #define INTER_RND_QP_BY_6
349 #ifdef INTER_RND_QP_BY_6
350     /* quant factor without RDOQ is 1/6th of shift for inter : like in H264 */
351     {
352         ps_ctxt->i4_quant_rnd_factor[PRED_MODE_INTER] =
353             (WORD32)(((1 << QUANT_ROUND_FACTOR_Q) / (float)6) + 0.5f);
354     }
355 #else
356     /* quant factor without RDOQ is 1/6th of shift for inter : like in H264 */
357     ps_ctxt->i4_quant_rnd_factor[PRED_MODE_INTER] = (1 << QUANT_ROUND_FACTOR_Q) / 3;
358 #endif
359 
360     if(ISLICE == ps_ctxt->i1_slice_type)
361     {
362         /* quant factor without RDOQ is 1/3rd of shift for intra : like in H264 */
363         ps_ctxt->i4_quant_rnd_factor[PRED_MODE_INTRA] =
364             (WORD32)(((1 << QUANT_ROUND_FACTOR_Q) / (float)3) + 0.5f);
365     }
366     else
367     {
368         if(0) /*TRAQO_EXT_ENABLE_ONE_THIRD_RND*/
369         {
370             /* quant factor without RDOQ is 1/3rd of shift for intra : like in H264 */
371             ps_ctxt->i4_quant_rnd_factor[PRED_MODE_INTRA] =
372                 (WORD32)(((1 << QUANT_ROUND_FACTOR_Q) / (float)3) + 0.5f);
373         }
374         else
375         {
376             /* quant factor without RDOQ is 1/6th of shift for intra in inter pic */
377             ps_ctxt->i4_quant_rnd_factor[PRED_MODE_INTRA] =
378                 ps_ctxt->i4_quant_rnd_factor[PRED_MODE_INTER];
379             /* (1 << QUANT_ROUND_FACTOR_Q) / 6; */
380         }
381     }
382 }
383 
384 /*!
385 ******************************************************************************
386 * \if Function name : ihevce_populate_cl_cu_lambda_prms \endif
387 *
388 * \brief
389 *    Function whihc calculates the Lambda params for current picture
390 *
391 * \param[in] ps_enc_ctxt : encoder ctxt pointer
392 * \param[in] ps_cur_pic_ctxt : current pic ctxt
393 * \param[in] i4_cur_frame_qp : current pic QP
394 * \param[in] first_field : is first field flag
395 * \param[in] i4_temporal_lyr_id : Current picture layer id
396 *
397 * \return
398 *    None
399 *
400 * \author
401 *  Ittiam
402 *
403 *****************************************************************************
404 */
ihevce_populate_cl_cu_lambda_prms(ihevce_enc_loop_ctxt_t * ps_ctxt,frm_lambda_ctxt_t * ps_frm_lamda,WORD32 i4_slice_type,WORD32 i4_temporal_lyr_id,WORD32 i4_lambda_type)405 void ihevce_populate_cl_cu_lambda_prms(
406     ihevce_enc_loop_ctxt_t *ps_ctxt,
407     frm_lambda_ctxt_t *ps_frm_lamda,
408     WORD32 i4_slice_type,
409     WORD32 i4_temporal_lyr_id,
410     WORD32 i4_lambda_type)
411 {
412     WORD32 i4_curr_cu_qp, i4_curr_cu_qp_offset;
413     double lambda_modifier;
414     double lambda_uv_modifier;
415     double lambda;
416     double lambda_uv;
417 
418     WORD32 i4_qp_bdoffset = 6 * (ps_ctxt->u1_bit_depth - 8);
419 
420     /*Populate lamda modifier */
421     ps_ctxt->i4_lamda_modifier = ps_frm_lamda->lambda_modifier;
422     ps_ctxt->i4_uv_lamda_modifier = ps_frm_lamda->lambda_uv_modifier;
423     ps_ctxt->i4_temporal_layer_id = i4_temporal_lyr_id;
424 
425     for(i4_curr_cu_qp = ps_ctxt->ps_rc_quant_ctxt->i2_min_qp;
426         i4_curr_cu_qp <= ps_ctxt->ps_rc_quant_ctxt->i2_max_qp;
427         i4_curr_cu_qp++)
428     {
429         WORD32 chroma_qp = (ps_ctxt->i4_chroma_format == IV_YUV_422SP_UV)
430                                ? MIN(i4_curr_cu_qp, 51)
431                                : gai1_ihevc_chroma_qp_scale[i4_curr_cu_qp + MAX_QP_BD_OFFSET];
432 
433         i4_curr_cu_qp_offset = i4_curr_cu_qp + ps_ctxt->ps_rc_quant_ctxt->i1_qp_offset;
434 
435         lambda = pow(2.0, (((double)(i4_curr_cu_qp + i4_qp_bdoffset - 12)) / 3.0));
436         lambda_uv = pow(2.0, (((double)(chroma_qp + i4_qp_bdoffset - 12)) / 3.0));
437 
438         if((BSLICE == i4_slice_type) && (i4_temporal_lyr_id))
439         {
440             lambda_modifier = ps_frm_lamda->lambda_modifier *
441                               CLIP3((((double)(i4_curr_cu_qp - 12)) / 6.0), 2.00, 4.00);
442             lambda_uv_modifier = ps_frm_lamda->lambda_uv_modifier *
443                                  CLIP3((((double)(chroma_qp - 12)) / 6.0), 2.00, 4.00);
444         }
445         else
446         {
447             lambda_modifier = ps_frm_lamda->lambda_modifier;
448             lambda_uv_modifier = ps_frm_lamda->lambda_uv_modifier;
449         }
450         if(ps_ctxt->i4_use_const_lamda_modifier)
451         {
452             if(ISLICE == ps_ctxt->i1_slice_type)
453             {
454                 lambda_modifier = ps_ctxt->f_i_pic_lamda_modifier;
455                 lambda_uv_modifier = ps_ctxt->f_i_pic_lamda_modifier;
456             }
457             else
458             {
459                 lambda_modifier = CONST_LAMDA_MOD_VAL;
460                 lambda_uv_modifier = CONST_LAMDA_MOD_VAL;
461             }
462         }
463         switch(i4_lambda_type)
464         {
465         case 0:
466         {
467             i4_qp_bdoffset = 0;
468 
469             lambda = pow(2.0, (((double)(i4_curr_cu_qp + i4_qp_bdoffset - 12)) / 3.0));
470             lambda_uv = pow(2.0, (((double)(chroma_qp + i4_qp_bdoffset - 12)) / 3.0));
471 
472             lambda *= lambda_modifier;
473             lambda_uv *= lambda_uv_modifier;
474 
475             ps_ctxt->au4_chroma_cost_weighing_factor_array[i4_curr_cu_qp_offset] =
476                 (UWORD32)((lambda / lambda_uv) * (1 << CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT));
477 
478             ps_ctxt->i8_cl_ssd_lambda_qf_array[i4_curr_cu_qp_offset] =
479                 (LWORD64)(lambda * (1 << LAMBDA_Q_SHIFT));
480 
481             ps_ctxt->i8_cl_ssd_lambda_chroma_qf_array[i4_curr_cu_qp_offset] =
482                 (LWORD64)(lambda_uv * (1 << LAMBDA_Q_SHIFT));
483             if(ps_ctxt->i4_use_const_lamda_modifier)
484             {
485                 ps_ctxt->i4_satd_lamda_array[i4_curr_cu_qp_offset] =
486                     (WORD32)(sqrt(lambda) * (1 << LAMBDA_Q_SHIFT));
487             }
488             else
489             {
490                 ps_ctxt->i4_satd_lamda_array[i4_curr_cu_qp_offset] =
491                     (WORD32)(sqrt(lambda * 1.9) * (1 << LAMBDA_Q_SHIFT));
492             }
493 
494             ps_ctxt->i4_sad_lamda_array[i4_curr_cu_qp_offset] =
495                 (WORD32)(sqrt(lambda) * (1 << LAMBDA_Q_SHIFT));
496 
497             ps_ctxt->i8_cl_ssd_type2_lambda_qf_array[i4_curr_cu_qp_offset] =
498                 ps_ctxt->i8_cl_ssd_lambda_qf_array[i4_curr_cu_qp_offset];
499 
500             ps_ctxt->i8_cl_ssd_type2_lambda_chroma_qf_array[i4_curr_cu_qp_offset] =
501                 ps_ctxt->i8_cl_ssd_lambda_chroma_qf_array[i4_curr_cu_qp_offset];
502 
503             ps_ctxt->i4_satd_type2_lamda_array[i4_curr_cu_qp_offset] =
504                 ps_ctxt->i4_satd_lamda_array[i4_curr_cu_qp_offset];
505 
506             ps_ctxt->i4_sad_type2_lamda_array[i4_curr_cu_qp_offset] =
507                 ps_ctxt->i4_sad_lamda_array[i4_curr_cu_qp_offset];
508 
509             break;
510         }
511         case 1:
512         {
513             lambda = pow(2.0, (((double)(i4_curr_cu_qp + i4_qp_bdoffset - 12)) / 3.0));
514             lambda_uv = pow(2.0, (((double)(chroma_qp + i4_qp_bdoffset - 12)) / 3.0));
515 
516             lambda *= lambda_modifier;
517             lambda_uv *= lambda_uv_modifier;
518 
519             ps_ctxt->au4_chroma_cost_weighing_factor_array[i4_curr_cu_qp_offset] =
520                 (UWORD32)((lambda / lambda_uv) * (1 << CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT));
521 
522             ps_ctxt->i8_cl_ssd_lambda_qf_array[i4_curr_cu_qp_offset] =
523                 (LWORD64)(lambda * (1 << LAMBDA_Q_SHIFT));
524 
525             ps_ctxt->i8_cl_ssd_lambda_chroma_qf_array[i4_curr_cu_qp_offset] =
526                 (LWORD64)(lambda_uv * (1 << LAMBDA_Q_SHIFT));
527             if(ps_ctxt->i4_use_const_lamda_modifier)
528             {
529                 ps_ctxt->i4_satd_lamda_array[i4_curr_cu_qp_offset] =
530                     (WORD32)(sqrt(lambda) * (1 << LAMBDA_Q_SHIFT));
531             }
532             else
533             {
534                 ps_ctxt->i4_satd_lamda_array[i4_curr_cu_qp_offset] =
535                     (WORD32)(sqrt(lambda * 1.9) * (1 << LAMBDA_Q_SHIFT));
536             }
537             ps_ctxt->i4_sad_lamda_array[i4_curr_cu_qp_offset] =
538                 (WORD32)(sqrt(lambda) * (1 << LAMBDA_Q_SHIFT));
539 
540             ps_ctxt->i8_cl_ssd_type2_lambda_qf_array[i4_curr_cu_qp_offset] =
541                 ps_ctxt->i8_cl_ssd_lambda_qf_array[i4_curr_cu_qp_offset];
542 
543             ps_ctxt->i8_cl_ssd_type2_lambda_chroma_qf_array[i4_curr_cu_qp_offset] =
544                 ps_ctxt->i8_cl_ssd_lambda_chroma_qf_array[i4_curr_cu_qp_offset];
545 
546             ps_ctxt->i4_satd_type2_lamda_array[i4_curr_cu_qp_offset] =
547                 ps_ctxt->i4_satd_lamda_array[i4_curr_cu_qp_offset];
548 
549             ps_ctxt->i4_sad_type2_lamda_array[i4_curr_cu_qp_offset] =
550                 ps_ctxt->i4_sad_lamda_array[i4_curr_cu_qp_offset];
551 
552             break;
553         }
554         case 2:
555         {
556             lambda = pow(2.0, (((double)(i4_curr_cu_qp + i4_qp_bdoffset - 12)) / 3.0));
557             lambda_uv = pow(2.0, (((double)(chroma_qp + i4_qp_bdoffset - 12)) / 3.0));
558 
559             lambda *= lambda_modifier;
560             lambda_uv *= lambda_uv_modifier;
561 
562             ps_ctxt->au4_chroma_cost_weighing_factor_array[i4_curr_cu_qp_offset] =
563                 (UWORD32)((lambda / lambda_uv) * (1 << CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT));
564 
565             ps_ctxt->i8_cl_ssd_lambda_qf_array[i4_curr_cu_qp_offset] =
566                 (LWORD64)(lambda * (1 << LAMBDA_Q_SHIFT));
567 
568             ps_ctxt->i8_cl_ssd_lambda_chroma_qf_array[i4_curr_cu_qp_offset] =
569                 (LWORD64)(lambda_uv * (1 << LAMBDA_Q_SHIFT));
570 
571             if(ps_ctxt->i4_use_const_lamda_modifier)
572             {
573                 ps_ctxt->i4_satd_lamda_array[i4_curr_cu_qp_offset] =
574                     (WORD32)(sqrt(lambda) * (1 << LAMBDA_Q_SHIFT));
575             }
576             else
577             {
578                 ps_ctxt->i4_satd_lamda_array[i4_curr_cu_qp_offset] =
579                     (WORD32)(sqrt(lambda * 1.9) * (1 << LAMBDA_Q_SHIFT));
580             }
581             ps_ctxt->i4_sad_lamda_array[i4_curr_cu_qp_offset] =
582                 (WORD32)(sqrt(lambda) * (1 << LAMBDA_Q_SHIFT));
583 
584             /* lambda corresponding to 8- bit, for metrics based on 8- bit ( Example 8bit SAD in encloop)*/
585             lambda = pow(2.0, (((double)(i4_curr_cu_qp - 12)) / 3.0));
586             lambda_uv = pow(2.0, (((double)(chroma_qp - 12)) / 3.0));
587 
588             lambda *= lambda_modifier;
589             lambda_uv *= lambda_uv_modifier;
590 
591             ps_ctxt->au4_chroma_cost_weighing_factor_array[i4_curr_cu_qp_offset] =
592                 (UWORD32)((lambda / lambda_uv) * (1 << CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT));
593 
594             ps_ctxt->i8_cl_ssd_type2_lambda_qf_array[i4_curr_cu_qp_offset] =
595                 (LWORD64)(lambda * (1 << LAMBDA_Q_SHIFT));
596 
597             ps_ctxt->i8_cl_ssd_type2_lambda_chroma_qf_array[i4_curr_cu_qp_offset] =
598                 (LWORD64)(lambda_uv * (1 << LAMBDA_Q_SHIFT));
599             if(ps_ctxt->i4_use_const_lamda_modifier)
600             {
601                 ps_ctxt->i4_satd_type2_lamda_array[i4_curr_cu_qp_offset] =
602                     (WORD32)(sqrt(lambda) * (1 << LAMBDA_Q_SHIFT));
603             }
604             else
605             {
606                 ps_ctxt->i4_satd_type2_lamda_array[i4_curr_cu_qp_offset] =
607                     (WORD32)(sqrt(lambda * 1.9) * (1 << LAMBDA_Q_SHIFT));
608             }
609 
610             ps_ctxt->i4_sad_type2_lamda_array[i4_curr_cu_qp_offset] =
611                 (WORD32)(sqrt(lambda) * (1 << LAMBDA_Q_SHIFT));
612 
613             break;
614         }
615         default:
616         {
617             /* Intended to be a barren wasteland! */
618             ASSERT(0);
619         }
620         }
621     }
622 }
623 
624 /*!
625 ******************************************************************************
626 * \if Function name : ihevce_get_cl_cu_lambda_prms \endif
627 *
628 * \brief
629 *    Function whihc calculates the Lambda params for current picture
630 *
631 * \param[in] ps_enc_ctxt : encoder ctxt pointer
632 * \param[in] ps_cur_pic_ctxt : current pic ctxt
633 * \param[in] i4_cur_frame_qp : current pic QP
634 * \param[in] first_field : is first field flag
635 * \param[in] i4_temporal_lyr_id : Current picture layer id
636 *
637 * \return
638 *    None
639 *
640 * \author
641 *  Ittiam
642 *
643 *****************************************************************************
644 */
ihevce_get_cl_cu_lambda_prms(ihevce_enc_loop_ctxt_t * ps_ctxt,WORD32 i4_cur_cu_qp)645 void ihevce_get_cl_cu_lambda_prms(ihevce_enc_loop_ctxt_t *ps_ctxt, WORD32 i4_cur_cu_qp)
646 {
647     WORD32 chroma_qp = (ps_ctxt->u1_chroma_array_type == 2)
648                            ? MIN(i4_cur_cu_qp + ps_ctxt->i4_chroma_qp_offset, 51)
649                            : gai1_ihevc_chroma_qp_scale
650                                  [i4_cur_cu_qp + ps_ctxt->i4_chroma_qp_offset + MAX_QP_BD_OFFSET];
651 
652     /* closed loop ssd lambda is same as final lambda */
653     ps_ctxt->i8_cl_ssd_lambda_qf =
654         ps_ctxt->i8_cl_ssd_lambda_qf_array[i4_cur_cu_qp + ps_ctxt->ps_rc_quant_ctxt->i1_qp_offset];
655     ps_ctxt->i8_cl_ssd_lambda_chroma_qf =
656         ps_ctxt
657             ->i8_cl_ssd_lambda_chroma_qf_array[chroma_qp + ps_ctxt->ps_rc_quant_ctxt->i1_qp_offset];
658     ps_ctxt->u4_chroma_cost_weighing_factor =
659         ps_ctxt->au4_chroma_cost_weighing_factor_array
660             [chroma_qp + ps_ctxt->ps_rc_quant_ctxt->i1_qp_offset];
661     /* --- Initialized the lambda for SATD computations --- */
662     /* --- 0.95 is the multiplication factor as per HM --- */
663     /* --- 1.9 is the multiplication factor for Hadamard Transform --- */
664     ps_ctxt->i4_satd_lamda =
665         ps_ctxt->i4_satd_lamda_array[i4_cur_cu_qp + ps_ctxt->ps_rc_quant_ctxt->i1_qp_offset];
666     ps_ctxt->i4_sad_lamda =
667         ps_ctxt->i4_sad_type2_lamda_array[i4_cur_cu_qp + ps_ctxt->ps_rc_quant_ctxt->i1_qp_offset];
668 }
669 
670 /*!
671 ******************************************************************************
672 * \if Function name : ihevce_update_pred_qp \endif
673 *
674 * \brief
675 *    Computes pred qp for the given CU
676 *
677 * \param[in]
678 *
679 * \return
680 *
681 *
682 * \author
683 *  Ittiam
684 *
685 *****************************************************************************
686 */
ihevce_update_pred_qp(ihevce_enc_loop_ctxt_t * ps_ctxt,WORD32 cu_pos_x,WORD32 cu_pos_y)687 void ihevce_update_pred_qp(ihevce_enc_loop_ctxt_t *ps_ctxt, WORD32 cu_pos_x, WORD32 cu_pos_y)
688 {
689     WORD32 i4_pred_qp = 0x7FFFFFFF;
690     WORD32 i4_top, i4_left;
691     if(cu_pos_x == 0 && cu_pos_y == 0) /*CTB start*/
692     {
693         i4_pred_qp = ps_ctxt->i4_prev_QP;
694     }
695     else
696     {
697         if(cu_pos_y == 0) /*CTB boundary*/
698         {
699             i4_top = ps_ctxt->i4_prev_QP;
700         }
701         else /*within CTB*/
702         {
703             i4_top = ps_ctxt->ai4_qp_qg[(cu_pos_y - 1) * 8 + (cu_pos_x)];
704         }
705         if(cu_pos_x == 0) /*CTB boundary*/
706         {
707             i4_left = ps_ctxt->i4_prev_QP;
708         }
709         else /*within CTB*/
710         {
711             i4_left = ps_ctxt->ai4_qp_qg[(cu_pos_y)*8 + (cu_pos_x - 1)];
712         }
713         i4_pred_qp = (i4_left + i4_top + 1) >> 1;
714     }
715     ps_ctxt->i4_pred_qp = i4_pred_qp;
716     return;
717 }
718 /*!
719 ******************************************************************************
720 * \if Function name : ihevce_compute_cu_level_QP \endif
721 *
722 * \brief
723 *    Computes cu level QP with Traqo,Spatial Mod and In-frame RC
724 *
725 * \param[in]
726 *
727 * \return
728 *
729 *
730 * \author
731 *  Ittiam
732 *
733 *****************************************************************************
734 */
ihevce_compute_cu_level_QP(ihevce_enc_loop_ctxt_t * ps_ctxt,WORD32 i4_activity_for_qp,WORD32 i4_activity_for_lamda,WORD32 i4_reduce_qp)735 void ihevce_compute_cu_level_QP(
736     ihevce_enc_loop_ctxt_t *ps_ctxt,
737     WORD32 i4_activity_for_qp,
738     WORD32 i4_activity_for_lamda,
739     WORD32 i4_reduce_qp)
740 {
741     /*modify quant related param in ctxt based on current cu qp*/
742     WORD32 i4_input_QP = ps_ctxt->i4_frame_mod_qp;
743     WORD32 cu_qp = i4_input_QP + ps_ctxt->ps_rc_quant_ctxt->i1_qp_offset;
744 
745     WORD32 i4_max_qp_allowed;
746     WORD32 i4_min_qp_allowed;
747     WORD32 i4_pred_qp;
748 
749     i4_pred_qp = ps_ctxt->i4_pred_qp;
750 
751     if(ps_ctxt->i4_sub_pic_level_rc)
752     {
753         i4_max_qp_allowed = (i4_pred_qp + (25 + (ps_ctxt->ps_rc_quant_ctxt->i1_qp_offset / 2)));
754         i4_min_qp_allowed = (i4_pred_qp - (26 + (ps_ctxt->ps_rc_quant_ctxt->i1_qp_offset / 2)));
755     }
756     else
757     {
758         i4_max_qp_allowed = (i4_input_QP + (7 + (ps_ctxt->ps_rc_quant_ctxt->i1_qp_offset / 4)));
759         i4_min_qp_allowed = (i4_input_QP - (18 + (ps_ctxt->ps_rc_quant_ctxt->i1_qp_offset / 4)));
760     }
761     if((ps_ctxt->i1_slice_type == BSLICE) && (ps_ctxt->i4_quality_preset == IHEVCE_QUALITY_P6))
762         return;
763 
764 #if LAMDA_BASED_ON_QUANT
765     i4_activity_for_lamda = i4_activity_for_qp;
766 #endif
767 
768     if(i4_activity_for_qp != -1)
769     {
770         cu_qp = (ps_ctxt->ps_rc_quant_ctxt
771                      ->pi4_qp_to_qscale[i4_input_QP + ps_ctxt->ps_rc_quant_ctxt->i1_qp_offset]);
772         if(ps_ctxt->i4_qp_mod)
773         {
774             /*Recompute the Qp as per enc thread's frame level Qp*/
775             ASSERT(i4_activity_for_qp > 0);
776             cu_qp = ((cu_qp * i4_activity_for_qp) + (1 << (QP_LEVEL_MOD_ACT_FACTOR - 1))) >>
777                     QP_LEVEL_MOD_ACT_FACTOR;
778         }
779 
780         // To avoid access of uninitialised Qscale to qp conversion table
781         if(cu_qp > ps_ctxt->ps_rc_quant_ctxt->i2_max_qscale)
782             cu_qp = ps_ctxt->ps_rc_quant_ctxt->i2_max_qscale;
783         else if(cu_qp < ps_ctxt->ps_rc_quant_ctxt->i2_min_qscale)
784             cu_qp = ps_ctxt->ps_rc_quant_ctxt->i2_min_qscale;
785 
786         cu_qp = ps_ctxt->ps_rc_quant_ctxt->pi4_qscale_to_qp[cu_qp];
787 
788         if((1 == i4_reduce_qp) && (cu_qp > 1))
789             cu_qp--;
790 
791         /*CLIP the delta to obey standard allowed QP variation of (-26 + offset/2) to (25 + offset/2)*/
792         if(cu_qp > i4_max_qp_allowed)
793             cu_qp = i4_max_qp_allowed;
794         else if(cu_qp < i4_min_qp_allowed)
795             cu_qp = i4_min_qp_allowed;
796 
797         /* CLIP to maintain Qp between user configured and min and max Qp values*/
798         if(cu_qp > ps_ctxt->ps_rc_quant_ctxt->i2_max_qp)
799             cu_qp = ps_ctxt->ps_rc_quant_ctxt->i2_max_qp;
800         else if(cu_qp < ps_ctxt->ps_rc_quant_ctxt->i2_min_qp)
801             cu_qp = ps_ctxt->ps_rc_quant_ctxt->i2_min_qp;
802 
803         /*cu qp must be populated in cu_analyse_t struct*/
804         ps_ctxt->i4_cu_qp = cu_qp;
805         /*recompute quant related param at every cu level*/
806         ihevce_compute_quant_rel_param(ps_ctxt, cu_qp);
807     }
808 
809     /*Decoupling qp and lamda calculation */
810     if(i4_activity_for_lamda != -1)
811     {
812         cu_qp = (ps_ctxt->ps_rc_quant_ctxt
813                      ->pi4_qp_to_qscale[i4_input_QP + ps_ctxt->ps_rc_quant_ctxt->i1_qp_offset]);
814 
815         if(ps_ctxt->i4_qp_mod)
816         {
817 #if MODULATE_LAMDA_WHEN_SPATIAL_MOD_ON
818             /*Recompute the Qp as per enc thread's frame level Qp*/
819             ASSERT(i4_activity_for_lamda > 0);
820             cu_qp = ((cu_qp * i4_activity_for_lamda) + (1 << (QP_LEVEL_MOD_ACT_FACTOR - 1))) >>
821                     QP_LEVEL_MOD_ACT_FACTOR;
822 #endif
823         }
824         if(cu_qp > ps_ctxt->ps_rc_quant_ctxt->i2_max_qscale)
825             cu_qp = ps_ctxt->ps_rc_quant_ctxt->i2_max_qscale;
826         else if(cu_qp < ps_ctxt->ps_rc_quant_ctxt->i2_min_qscale)
827             cu_qp = ps_ctxt->ps_rc_quant_ctxt->i2_min_qscale;
828 
829         cu_qp = ps_ctxt->ps_rc_quant_ctxt->pi4_qscale_to_qp[cu_qp];
830 
831         /*CLIP the delta to obey standard allowed QP variation of (-26 + offset/2) to (25 + offset/2)*/
832         if(cu_qp > i4_max_qp_allowed)
833             cu_qp = i4_max_qp_allowed;
834         else if(cu_qp < i4_min_qp_allowed)
835             cu_qp = i4_min_qp_allowed;
836 
837         /* CLIP to maintain Qp between user configured and min and max Qp values*/
838         if(cu_qp > ps_ctxt->ps_rc_quant_ctxt->i2_max_qp)
839             cu_qp = ps_ctxt->ps_rc_quant_ctxt->i2_max_qp;
840         else if(cu_qp < ps_ctxt->ps_rc_quant_ctxt->i2_min_qp)
841             cu_qp = ps_ctxt->ps_rc_quant_ctxt->i2_min_qp;
842         /* get frame level lambda params */
843         ihevce_get_cl_cu_lambda_prms(
844             ps_ctxt, MODULATE_LAMDA_WHEN_SPATIAL_MOD_ON ? cu_qp : ps_ctxt->i4_frame_qp);
845     }
846 }
847 
848 /**
849 *******************************************************************************
850 * \if Function name : ihevce_scan_coeffs \endif
851 *
852 * @brief * Computes the coeff buffer for a coded TU for entropy coding
853 *
854 * @par   Description
855 * Computes the coeff buffer for a coded TU for entropy coding
856 *
857 * \param[in] pi2_quan_coeffs Quantized coefficient context
858 *
859 * \param[in] scan_idx Scan index specifying the scan order
860 *
861 * \param[in] trans_size Transform unit size
862 *
863 * \param[inout] pu1_out_data output coeff buffer for a coded TU for entropy coding
864 *
865 * \param[in] pu1_csbf_buf csb flag buffer
866 *
867 * @returns num_bytes
868 * Number of bytes written to pu1_out_data
869 *
870 * @remarks
871 *
872 * \author
873 *  Ittiam
874 *
875 *******************************************************************************
876 */
877 
ihevce_scan_coeffs(WORD16 * pi2_quant_coeffs,WORD32 * pi4_subBlock2csbfId_map,WORD32 scan_idx,WORD32 trans_size,UWORD8 * pu1_out_data,UWORD8 * pu1_csbf_buf,WORD32 i4_csbf_stride)878 WORD32 ihevce_scan_coeffs(
879     WORD16 *pi2_quant_coeffs,
880     WORD32 *pi4_subBlock2csbfId_map,
881     WORD32 scan_idx,
882     WORD32 trans_size,
883     UWORD8 *pu1_out_data,
884     UWORD8 *pu1_csbf_buf,
885     WORD32 i4_csbf_stride)
886 {
887     WORD32 i, trans_unit_idx, num_gt1_flag;
888     UWORD16 u2_csbf0flags;
889     WORD32 num_bytes = 0;
890     UWORD8 *pu1_trans_table;
891     UWORD8 *pu1_csb_table;
892     WORD32 shift_value, mask_value;
893     UWORD16 u2_sig_coeff_abs_gt0_flags = 0, u2_sig_coeff_abs_gt1_flags = 0;
894     UWORD16 u2_sign_flags;
895     UWORD16 u2_abs_coeff_remaining[16];
896     WORD32 blk_row, blk_col;
897 
898     UWORD8 *pu1_out_data_header;
899     UWORD16 *pu2_out_data_coeff;
900 
901     WORD32 x_pos, y_pos;
902     WORD32 quant_coeff;
903 
904     WORD32 num_gt0_flag;
905     (void)i4_csbf_stride;
906     pu1_out_data_header = pu1_out_data;
907     /* Need only last 3 bits, rest are reserved for debugging and making */
908     /* WORD alignment */
909     u2_csbf0flags = 0xBAD0;
910 
911     /* Select proper order for your transform unit and csb based on scan_idx*/
912     /* and the trans_size */
913 
914     /* scan order inside a csb */
915     pu1_csb_table = (UWORD8 *)&(g_u1_scan_table_4x4[scan_idx][0]);
916     /* GETRANGE will give the log_2 of trans_size to shift_value */
917     GETRANGE(shift_value, trans_size);
918     shift_value = shift_value - 3; /* for finding. row no. from scan index */
919     mask_value = (trans_size / 4) - 1; /*for finding the col. no. from scan index*/
920     switch(trans_size)
921     {
922     case 32:
923         pu1_trans_table = (UWORD8 *)&(g_u1_scan_table_8x8[scan_idx][0]);
924         break;
925     case 16:
926         pu1_trans_table = (UWORD8 *)&(g_u1_scan_table_4x4[scan_idx][0]);
927         break;
928     case 8:
929         pu1_trans_table = (UWORD8 *)&(g_u1_scan_table_2x2[scan_idx][0]);
930         break;
931     case 4:
932         pu1_trans_table = (UWORD8 *)&(g_u1_scan_table_1x1[0]);
933         break;
934     default:
935         DBG_PRINTF("Invalid Trans Size\n");
936         return -1;
937         break;
938     }
939 
940     /*go through each csb in the scan order for first non-zero coded sub-block*/
941     for(trans_unit_idx = (trans_size * trans_size / 16) - 1; trans_unit_idx >= 0; trans_unit_idx--)
942     {
943         /* check for the first csb flag in our scan order */
944         if(pu1_csbf_buf[pi4_subBlock2csbfId_map[pu1_trans_table[trans_unit_idx]]])
945         {
946             UWORD8 u1_last_x, u1_last_y;
947             /* row of csb */
948             blk_row = pu1_trans_table[trans_unit_idx] >> shift_value;
949             /* col of csb */
950             blk_col = pu1_trans_table[trans_unit_idx] & mask_value;
951 
952             /*check for the 1st non-0 values inside the csb in our scan order*/
953             for(i = 15; i >= 0; i--)
954             {
955                 x_pos = (pu1_csb_table[i] & 0x3) + blk_col * 4;
956                 y_pos = (pu1_csb_table[i] >> 2) + blk_row * 4;
957 
958                 quant_coeff = pi2_quant_coeffs[x_pos + (y_pos * trans_size)];
959 
960                 if(quant_coeff != 0)
961                     break;
962             }
963 
964             ASSERT(i >= 0);
965 
966             u1_last_x = x_pos;
967             u1_last_y = y_pos;
968 
969             /* storing last_x and last_y */
970             *pu1_out_data_header = u1_last_x;
971             pu1_out_data_header++;
972             num_bytes++;
973             *pu1_out_data_header = u1_last_y;
974             pu1_out_data_header++;
975             num_bytes++;
976 
977             /* storing the scan order */
978             *pu1_out_data_header = scan_idx;
979             pu1_out_data_header++;
980             num_bytes++;
981             /* storing last_sub_block pos. in scan order count */
982             *pu1_out_data_header = trans_unit_idx;
983             pu1_out_data_header++;
984             num_bytes++;
985 
986             /*stored the first 4 bytes, now all are word16. So word16 pointer*/
987             pu2_out_data_coeff = (UWORD16 *)pu1_out_data_header;
988 
989             /* u2_csbf0flags word */
990             u2_csbf0flags = 0xBAD0 | 1; /*since right&bottom csbf is 0*/
991             /* storing u2_csbf0flags word */
992             *pu2_out_data_coeff = u2_csbf0flags;
993             pu2_out_data_coeff++;
994             num_bytes += 2;
995 
996             num_gt0_flag = 1;
997             num_gt1_flag = 0;
998             u2_sign_flags = 0;
999 
1000             /* set the i th bit of u2_sig_coeff_abs_gt0_flags */
1001             u2_sig_coeff_abs_gt0_flags = u2_sig_coeff_abs_gt0_flags | (1 << i);
1002             if(abs(quant_coeff) > 1)
1003             {
1004                 /* set the i th bit of u2_sig_coeff_abs_gt1_flags */
1005                 u2_sig_coeff_abs_gt1_flags = u2_sig_coeff_abs_gt1_flags | (1 << i);
1006                 /* update u2_abs_coeff_remaining */
1007                 u2_abs_coeff_remaining[num_gt1_flag] = (UWORD16)abs(quant_coeff) - 1;
1008 
1009                 num_gt1_flag++;
1010             }
1011 
1012             if(quant_coeff < 0)
1013             {
1014                 /* set the i th bit of u2_sign_flags */
1015                 u2_sign_flags = u2_sign_flags | (1 << i);
1016             }
1017 
1018             /* Test remaining elements in our scan order */
1019             /* Can optimize further by CLZ macro */
1020             for(i = i - 1; i >= 0; i--)
1021             {
1022                 x_pos = (pu1_csb_table[i] & 0x3) + blk_col * 4;
1023                 y_pos = (pu1_csb_table[i] >> 2) + blk_row * 4;
1024 
1025                 quant_coeff = pi2_quant_coeffs[x_pos + (y_pos * trans_size)];
1026 
1027                 if(quant_coeff != 0)
1028                 {
1029                     /* set the i th bit of u2_sig_coeff_abs_gt0_flags */
1030                     u2_sig_coeff_abs_gt0_flags |= (1 << i);
1031 
1032                     if((abs(quant_coeff) > 1) || (num_gt0_flag >= MAX_GT_ONE))
1033                     {
1034                         /* set the i th bit of u2_sig_coeff_abs_gt1_flags */
1035                         u2_sig_coeff_abs_gt1_flags |= (1 << i);
1036 
1037                         /* update u2_abs_coeff_remaining */
1038                         u2_abs_coeff_remaining[num_gt1_flag] = (UWORD16)abs(quant_coeff) - 1;
1039 
1040                         num_gt1_flag++; /*n0. of Ones in sig_coeff_abs_gt1_flag*/
1041                     }
1042 
1043                     if(quant_coeff < 0)
1044                     {
1045                         /* set the i th bit of u2_sign_flags */
1046                         u2_sign_flags |= (1 << i);
1047                     }
1048 
1049                     num_gt0_flag++;
1050                 }
1051             }
1052 
1053             /* storing u2_sig_coeff_abs_gt0_flags 2 bytes */
1054             *pu2_out_data_coeff = u2_sig_coeff_abs_gt0_flags;
1055             pu2_out_data_coeff++;
1056             num_bytes += 2;
1057             /* storing u2_sig_coeff_abs_gt1_flags 2 bytes */
1058             *pu2_out_data_coeff = u2_sig_coeff_abs_gt1_flags;
1059             pu2_out_data_coeff++;
1060             num_bytes += 2;
1061             /* storing u2_sign_flags 2 bytes */
1062             *pu2_out_data_coeff = u2_sign_flags;
1063             pu2_out_data_coeff++;
1064             num_bytes += 2;
1065 
1066             /* Store the u2_abs_coeff_remaining[] */
1067             for(i = 0; i < num_gt1_flag; i++)
1068             {
1069                 /* storing u2_abs_coeff_remaining[i] 2 bytes */
1070                 *pu2_out_data_coeff = u2_abs_coeff_remaining[i];
1071                 pu2_out_data_coeff++;
1072                 num_bytes += 2;
1073             }
1074 
1075             break; /*We just need this loop for finding 1st non-zero csb only*/
1076         }
1077     }
1078 
1079     /* go through remaining csb in the scan order */
1080     for(trans_unit_idx = trans_unit_idx - 1; trans_unit_idx >= 0; trans_unit_idx--)
1081     {
1082         blk_row = pu1_trans_table[trans_unit_idx] >> shift_value; /*row of csb*/
1083         blk_col = pu1_trans_table[trans_unit_idx] & mask_value; /*col of csb*/
1084 
1085         /* u2_csbf0flags word */
1086         u2_csbf0flags = 0xBAD0 | /* assuming csbf_buf has only 0 or 1 values */
1087                         (pu1_csbf_buf[pi4_subBlock2csbfId_map[pu1_trans_table[trans_unit_idx]]]);
1088 
1089         /********************************************************************/
1090         /* Minor hack: As per HEVC spec csbf in not signalled in stream for */
1091         /* block0, instead sig coeff map is directly signalled. This is     */
1092         /* taken care by forcing csbf for block0 to be 1 even if it is 0    */
1093         /********************************************************************/
1094         if(0 == trans_unit_idx)
1095         {
1096             u2_csbf0flags |= 1;
1097         }
1098 
1099         if((blk_col + 1 < trans_size / 4)) /* checking right boundary */
1100         {
1101             if(pu1_csbf_buf[pi4_subBlock2csbfId_map[blk_row * trans_size / 4 + blk_col + 1]])
1102             {
1103                 /* set the 2nd bit of u2_csbf0flags for right csbf */
1104                 u2_csbf0flags = u2_csbf0flags | (1 << 1);
1105             }
1106         }
1107         if((blk_row + 1 < trans_size / 4)) /* checking bottom oundary */
1108         {
1109             if(pu1_csbf_buf[pi4_subBlock2csbfId_map[(blk_row + 1) * trans_size / 4 + blk_col]])
1110             {
1111                 /* set the 3rd bit of u2_csbf0flags  for bottom csbf */
1112                 u2_csbf0flags = u2_csbf0flags | (1 << 2);
1113             }
1114         }
1115 
1116         /* storing u2_csbf0flags word */
1117         *pu2_out_data_coeff = u2_csbf0flags;
1118         pu2_out_data_coeff++;
1119         num_bytes += 2;
1120 
1121         /* check for the csb flag in our scan order */
1122         if(u2_csbf0flags & 0x1)
1123         {
1124             u2_sig_coeff_abs_gt0_flags = 0;
1125             u2_sig_coeff_abs_gt1_flags = 0;
1126             u2_sign_flags = 0;
1127 
1128             num_gt0_flag = 0;
1129             num_gt1_flag = 0;
1130             /* check for the non-0 values inside the csb in our scan order */
1131             /* Can optimize further by CLZ macro */
1132             for(i = 15; i >= 0; i--)
1133             {
1134                 x_pos = (pu1_csb_table[i] & 0x3) + blk_col * 4;
1135                 y_pos = (pu1_csb_table[i] >> 2) + blk_row * 4;
1136 
1137                 quant_coeff = pi2_quant_coeffs[x_pos + (y_pos * trans_size)];
1138 
1139                 if(quant_coeff != 0)
1140                 {
1141                     /* set the i th bit of u2_sig_coeff_abs_gt0_flags */
1142                     u2_sig_coeff_abs_gt0_flags |= (1 << i);
1143 
1144                     if((abs(quant_coeff) > 1) || (num_gt0_flag >= MAX_GT_ONE))
1145                     {
1146                         /* set the i th bit of u2_sig_coeff_abs_gt1_flags */
1147                         u2_sig_coeff_abs_gt1_flags |= (1 << i);
1148 
1149                         /* update u2_abs_coeff_remaining */
1150                         u2_abs_coeff_remaining[num_gt1_flag] = (UWORD16)abs(quant_coeff) - 1;
1151 
1152                         num_gt1_flag++;
1153                     }
1154 
1155                     if(quant_coeff < 0)
1156                     {
1157                         /* set the i th bit of u2_sign_flags */
1158                         u2_sign_flags = u2_sign_flags | (1 << i);
1159                     }
1160 
1161                     num_gt0_flag++;
1162                 }
1163             }
1164 
1165             /* storing u2_sig_coeff_abs_gt0_flags 2 bytes */
1166             *pu2_out_data_coeff = u2_sig_coeff_abs_gt0_flags;
1167             pu2_out_data_coeff++;
1168             num_bytes += 2;
1169 
1170             /* storing u2_sig_coeff_abs_gt1_flags 2 bytes */
1171             *pu2_out_data_coeff = u2_sig_coeff_abs_gt1_flags;
1172             pu2_out_data_coeff++;
1173             num_bytes += 2;
1174 
1175             /* storing u2_sign_flags 2 bytes */
1176             *pu2_out_data_coeff = u2_sign_flags;
1177             pu2_out_data_coeff++;
1178             num_bytes += 2;
1179 
1180             /* Store the u2_abs_coeff_remaining[] */
1181             for(i = 0; i < num_gt1_flag; i++)
1182             {
1183                 /* storing u2_abs_coeff_remaining[i] 2 bytes */
1184                 *pu2_out_data_coeff = u2_abs_coeff_remaining[i];
1185                 pu2_out_data_coeff++;
1186                 num_bytes += 2;
1187             }
1188         }
1189     }
1190 
1191     return num_bytes; /* Return the number of bytes written to out_data */
1192 }
1193 
1194 /**
1195 *******************************************************************************
1196 * \if Function name : ihevce_populate_intra_pred_mode \endif
1197 *
1198 * \brief * populates intra pred modes,b2_mpm_idx,b1_prev_intra_luma_pred_flag &
1199 * b5_rem_intra_pred_mode for a CU based on nieghbouring CUs,
1200 *
1201 * \par   Description
1202 * Computes the b1_prev_intra_luma_pred_flag, b2_mpm_idx & b5_rem_intra_pred_mode
1203 * for a CU
1204 *
1205 * \param[in] top_intra_mode Top intra mode
1206 * \param[in] left_intra_mode Left intra mode
1207 * \param[in] available_top Top availability flag
1208 * \param[in] available_left Left availability flag
1209 * \param[in] cu_pos_y CU 'y' position
1210 * \param[in] ps_cand_mode_list pointer to populate candidate list
1211 *
1212 * \returns none
1213 *
1214 * \author
1215 *  Ittiam
1216 *
1217 *******************************************************************************
1218 */
1219 
ihevce_populate_intra_pred_mode(WORD32 top_intra_mode,WORD32 left_intra_mode,WORD32 available_top,WORD32 available_left,WORD32 cu_pos_y,WORD32 * ps_cand_mode_list)1220 void ihevce_populate_intra_pred_mode(
1221     WORD32 top_intra_mode,
1222     WORD32 left_intra_mode,
1223     WORD32 available_top,
1224     WORD32 available_left,
1225     WORD32 cu_pos_y,
1226     WORD32 *ps_cand_mode_list)
1227 {
1228     /* local variables */
1229     WORD32 cand_intra_pred_mode_left, cand_intra_pred_mode_top;
1230 
1231     /* Calculate cand_intra_pred_mode_N as per sec. 8.4.2 in JCTVC-J1003_d7 */
1232     /* N = top */
1233     if(0 == available_top)
1234     {
1235         cand_intra_pred_mode_top = INTRA_DC;
1236     }
1237     /* for neighbour != INTRA, setting DC is done outside */
1238     else if(0 == cu_pos_y) /* It's on the CTB boundary */
1239     {
1240         cand_intra_pred_mode_top = INTRA_DC;
1241     }
1242     else
1243     {
1244         cand_intra_pred_mode_top = top_intra_mode;
1245     }
1246 
1247     /* N = left */
1248     if(0 == available_left)
1249     {
1250         cand_intra_pred_mode_left = INTRA_DC;
1251     }
1252     /* for neighbour != INTRA, setting DC is done outside */
1253     else
1254     {
1255         cand_intra_pred_mode_left = left_intra_mode;
1256     }
1257 
1258     /* Calculate cand_mode_list as per sec. 8.4.2 in JCTVC-J1003_d7 */
1259     if(cand_intra_pred_mode_left == cand_intra_pred_mode_top)
1260     {
1261         if(cand_intra_pred_mode_left < 2)
1262         {
1263             ps_cand_mode_list[0] = INTRA_PLANAR;
1264             ps_cand_mode_list[1] = INTRA_DC;
1265             ps_cand_mode_list[2] = INTRA_ANGULAR(26); /* angular 26 = Vertical */
1266         }
1267         else
1268         {
1269             ps_cand_mode_list[0] = cand_intra_pred_mode_left;
1270             ps_cand_mode_list[1] = 2 + ((cand_intra_pred_mode_left + 29) % 32);
1271             ps_cand_mode_list[2] = 2 + ((cand_intra_pred_mode_left - 2 + 1) % 32);
1272         }
1273     }
1274     else
1275     {
1276         ps_cand_mode_list[0] = cand_intra_pred_mode_left;
1277         ps_cand_mode_list[1] = cand_intra_pred_mode_top;
1278 
1279         if((cand_intra_pred_mode_left != INTRA_PLANAR) &&
1280            (cand_intra_pred_mode_top != INTRA_PLANAR))
1281         {
1282             ps_cand_mode_list[2] = INTRA_PLANAR;
1283         }
1284         else if((cand_intra_pred_mode_left != INTRA_DC) && (cand_intra_pred_mode_top != INTRA_DC))
1285         {
1286             ps_cand_mode_list[2] = INTRA_DC;
1287         }
1288         else
1289         {
1290             ps_cand_mode_list[2] = INTRA_ANGULAR(26);
1291         }
1292     }
1293 }
1294 /**
1295 *******************************************************************************
1296 * \if Function name : ihevce_intra_pred_mode_signaling \endif
1297 *
1298 * \brief * Computes the b1_prev_intra_luma_pred_flag, b2_mpm_idx &
1299 * b5_rem_intra_pred_mode for a CU
1300 *
1301 * \par   Description
1302 * Computes the b1_prev_intra_luma_pred_flag, b2_mpm_idx & b5_rem_intra_pred_mode
1303 * for a CU
1304 *
1305 * \param[in] ps_nbr_top Top neighbour context
1306 * \param[in] ps_nbr_left Left neighbour context
1307 * \param[in] available_top Top availability flag
1308 * \param[in] available_left Left availability flag
1309 * \param[in] cu_pos_y CU 'y' position
1310 * \param[in] luma_intra_pred_mode_current the intra_pred_mode of current block
1311 * \param[inout] ps_intra_pred_mode_current
1312 * Pointer to structure having b1_prev_intra_luma_pred_flag, b2_mpm_idx and
1313 * b5_rem_intra_pred_mode
1314 *
1315 * \returns none
1316 *
1317 * \author
1318 *  Ittiam
1319 *
1320 *******************************************************************************
1321 */
1322 
ihevce_intra_pred_mode_signaling(WORD32 top_intra_mode,WORD32 left_intra_mode,WORD32 available_top,WORD32 available_left,WORD32 cu_pos_y,WORD32 luma_intra_pred_mode_current,intra_prev_rem_flags_t * ps_intra_pred_mode_current)1323 void ihevce_intra_pred_mode_signaling(
1324     WORD32 top_intra_mode,
1325     WORD32 left_intra_mode,
1326     WORD32 available_top,
1327     WORD32 available_left,
1328     WORD32 cu_pos_y,
1329     WORD32 luma_intra_pred_mode_current,
1330     intra_prev_rem_flags_t *ps_intra_pred_mode_current)
1331 {
1332     /* local variables */
1333     WORD32 cand_intra_pred_mode_left, cand_intra_pred_mode_top;
1334     WORD32 cand_mode_list[3];
1335 
1336     ps_intra_pred_mode_current->b1_prev_intra_luma_pred_flag = 0;
1337     ps_intra_pred_mode_current->b2_mpm_idx = 0;  // for safety purpose
1338     ps_intra_pred_mode_current->b5_rem_intra_pred_mode = 0;
1339 
1340     /* Calculate cand_intra_pred_mode_N as per sec. 8.4.2 in JCTVC-J1003_d7 */
1341     /* N = top */
1342     if(0 == available_top)
1343     {
1344         cand_intra_pred_mode_top = INTRA_DC;
1345     }
1346     /* for neighbour != INTRA, setting DC is done outside */
1347     else if(0 == cu_pos_y) /* It's on the CTB boundary */
1348     {
1349         cand_intra_pred_mode_top = INTRA_DC;
1350     }
1351     else
1352     {
1353         cand_intra_pred_mode_top = top_intra_mode;
1354     }
1355 
1356     /* N = left */
1357     if(0 == available_left)
1358     {
1359         cand_intra_pred_mode_left = INTRA_DC;
1360     }
1361     /* for neighbour != INTRA, setting DC is done outside */
1362     else
1363     {
1364         cand_intra_pred_mode_left = left_intra_mode;
1365     }
1366 
1367     /* Calculate cand_mode_list as per sec. 8.4.2 in JCTVC-J1003_d7 */
1368     if(cand_intra_pred_mode_left == cand_intra_pred_mode_top)
1369     {
1370         if(cand_intra_pred_mode_left < 2)
1371         {
1372             cand_mode_list[0] = INTRA_PLANAR;
1373             cand_mode_list[1] = INTRA_DC;
1374             cand_mode_list[2] = INTRA_ANGULAR(26); /* angular 26 = Vertical */
1375         }
1376         else
1377         {
1378             cand_mode_list[0] = cand_intra_pred_mode_left;
1379             cand_mode_list[1] = 2 + ((cand_intra_pred_mode_left + 29) % 32);
1380             cand_mode_list[2] = 2 + ((cand_intra_pred_mode_left - 2 + 1) % 32);
1381         }
1382     }
1383     else
1384     {
1385         cand_mode_list[0] = cand_intra_pred_mode_left;
1386         cand_mode_list[1] = cand_intra_pred_mode_top;
1387 
1388         if((cand_intra_pred_mode_left != INTRA_PLANAR) &&
1389            (cand_intra_pred_mode_top != INTRA_PLANAR))
1390         {
1391             cand_mode_list[2] = INTRA_PLANAR;
1392         }
1393         else if((cand_intra_pred_mode_left != INTRA_DC) && (cand_intra_pred_mode_top != INTRA_DC))
1394         {
1395             cand_mode_list[2] = INTRA_DC;
1396         }
1397         else
1398         {
1399             cand_mode_list[2] = INTRA_ANGULAR(26);
1400         }
1401     }
1402 
1403     /* Signal Generation */
1404 
1405     /* Flag & mpm_index generation */
1406     if(cand_mode_list[0] == luma_intra_pred_mode_current)
1407     {
1408         ps_intra_pred_mode_current->b1_prev_intra_luma_pred_flag = 1;
1409         ps_intra_pred_mode_current->b2_mpm_idx = 0;
1410     }
1411     else if(cand_mode_list[1] == luma_intra_pred_mode_current)
1412     {
1413         ps_intra_pred_mode_current->b1_prev_intra_luma_pred_flag = 1;
1414         ps_intra_pred_mode_current->b2_mpm_idx = 1;
1415     }
1416     else if(cand_mode_list[2] == luma_intra_pred_mode_current)
1417     {
1418         ps_intra_pred_mode_current->b1_prev_intra_luma_pred_flag = 1;
1419         ps_intra_pred_mode_current->b2_mpm_idx = 2;
1420     }
1421     /* Flag & b5_rem_intra_pred_mode generation */
1422     else
1423     {
1424         WORD32 rem_mode;
1425 
1426         ps_intra_pred_mode_current->b1_prev_intra_luma_pred_flag = 0;
1427 
1428         /* sorting cand_mode_list */
1429         if(cand_mode_list[0] > cand_mode_list[1])
1430         {
1431             SWAP(cand_mode_list[0], cand_mode_list[1]);
1432         }
1433         if(cand_mode_list[0] > cand_mode_list[2])
1434         {
1435             SWAP(cand_mode_list[0], cand_mode_list[2]);
1436         }
1437         if(cand_mode_list[1] > cand_mode_list[2])
1438         {
1439             SWAP(cand_mode_list[1], cand_mode_list[2]);
1440         }
1441 
1442         rem_mode = luma_intra_pred_mode_current;
1443 
1444         if((rem_mode) >= cand_mode_list[2])
1445         {
1446             (rem_mode)--;
1447         }
1448         if((rem_mode) >= cand_mode_list[1])
1449         {
1450             (rem_mode)--;
1451         }
1452         if((rem_mode) >= cand_mode_list[0])
1453         {
1454             (rem_mode)--;
1455         }
1456         ps_intra_pred_mode_current->b5_rem_intra_pred_mode = rem_mode;
1457     }
1458 }
1459 
ihevce_quant_rounding_factor_gen(WORD32 i4_trans_size,WORD32 is_luma,rdopt_entropy_ctxt_t * ps_rdopt_entropy_ctxt,WORD32 * pi4_quant_round_0_1,WORD32 * pi4_quant_round_1_2,double i4_lamda_modifier,UWORD8 i4_is_tu_level_quant_rounding)1460 void ihevce_quant_rounding_factor_gen(
1461     WORD32 i4_trans_size,
1462     WORD32 is_luma,
1463     rdopt_entropy_ctxt_t *ps_rdopt_entropy_ctxt,
1464     WORD32 *pi4_quant_round_0_1,
1465     WORD32 *pi4_quant_round_1_2,
1466     double i4_lamda_modifier,
1467     UWORD8 i4_is_tu_level_quant_rounding)
1468 {
1469     //WORD32 i4_scan_idx = ps_ctxt->i4_scan_idx;
1470     UWORD8 *pu1_ctxt_model;
1471     WORD32 scan_pos;
1472     WORD32 sig_coeff_base_ctxt; /* cabac context for sig coeff flag    */
1473     WORD32 abs_gt1_base_ctxt;
1474     WORD32 log2_tr_size, i;
1475     UWORD16 u4_bits_estimated_r0, u4_bits_estimated_r1, u4_bits_estimated_r2;
1476     UWORD16 u4_bits_estimated_r1_temp;
1477     WORD32 j = 0;
1478     WORD32 k = 0;
1479     WORD32 temp2;
1480 
1481     double i4_lamda_mod = i4_lamda_modifier * pow(2.0, (-8.0 / 3.0));
1482     LWORD64 lamda_mod = (LWORD64)(i4_lamda_mod * (1 << LAMDA_Q_SHIFT_FACT));
1483     /* transform size to log2transform size */
1484     GETRANGE(log2_tr_size, i4_trans_size);
1485     log2_tr_size -= 1;
1486 
1487     if(1 == i4_is_tu_level_quant_rounding)
1488     {
1489         entropy_context_t *ps_cur_tu_entropy;
1490         cab_ctxt_t *ps_cabac;
1491         WORD32 curr_buf_idx = ps_rdopt_entropy_ctxt->i4_curr_buf_idx;
1492         ps_cur_tu_entropy = &ps_rdopt_entropy_ctxt->as_cu_entropy_ctxt[curr_buf_idx];
1493 
1494         ps_cabac = &ps_cur_tu_entropy->s_cabac_ctxt;
1495 
1496         pu1_ctxt_model = &ps_cabac->au1_ctxt_models[0];
1497     }
1498     else
1499     {
1500         pu1_ctxt_model = &ps_rdopt_entropy_ctxt->au1_init_cabac_ctxt_states[0];
1501     }
1502     /*If transform size is 4x4, then only one sub-block*/
1503     if(is_luma)
1504     {
1505         sig_coeff_base_ctxt = IHEVC_CAB_COEFF_FLAG;
1506         abs_gt1_base_ctxt = IHEVC_CAB_COEFABS_GRTR1_FLAG;
1507 
1508         if(3 == log2_tr_size)
1509         {
1510             /* 8x8 transform size */
1511             /* Assuming diagnol scan idx for now */
1512             sig_coeff_base_ctxt += 9;
1513         }
1514         else if(3 < log2_tr_size)
1515         {
1516             /* larger transform sizes */
1517             sig_coeff_base_ctxt += 21;
1518         }
1519     }
1520     else
1521     {
1522         /* chroma context initializations */
1523         sig_coeff_base_ctxt = IHEVC_CAB_COEFF_FLAG + 27;
1524         abs_gt1_base_ctxt = IHEVC_CAB_COEFABS_GRTR1_FLAG + 16;
1525 
1526         if(3 == log2_tr_size)
1527         {
1528             /* 8x8 transform size */
1529             sig_coeff_base_ctxt += 9;
1530         }
1531         else if(3 < log2_tr_size)
1532         {
1533             /* larger transform sizes */
1534             sig_coeff_base_ctxt += 12;
1535         }
1536     }
1537 
1538     /*Transform size of 4x4 will have only a single CSB */
1539     /* derive the context inc as per section 9.3.3.1.4 */
1540 
1541     if(2 == log2_tr_size)
1542     {
1543         UWORD8 sig_ctxinc;
1544         WORD32 state_mps;
1545         WORD32 gt1_ctxt = 0;
1546         WORD32 ctxt_set = 0;
1547         WORD32 ctxt_idx = 0;
1548 
1549         /* context set based on luma subblock pos */
1550 
1551         /* Encodet the abs level gt1 bins */
1552         /* Currently calculating trade off between mps(2) and mps(1)*/
1553         /* The estimation has to be further done for mps(11) and mps(111)*/
1554         /*ctxt_set = 0 as transform 4x4 has only one csb with DC */
1555         /* gt1_ctxt = 0 for the co-ef value to be 2 */
1556 
1557         ctxt_set = gt1_ctxt = 0;
1558         ctxt_idx = (ctxt_set * 4) + abs_gt1_base_ctxt + gt1_ctxt;
1559 
1560         state_mps = pu1_ctxt_model[ctxt_idx];
1561 
1562         u4_bits_estimated_r2 = gau2_ihevce_cabac_bin_to_bits[state_mps ^ 1];
1563 
1564         u4_bits_estimated_r1_temp = gau2_ihevce_cabac_bin_to_bits[state_mps ^ 0];
1565 
1566         QUANT_ROUND_FACTOR(temp2, u4_bits_estimated_r2, u4_bits_estimated_r1_temp, lamda_mod);
1567         for(scan_pos = 0; scan_pos < 16; scan_pos++)
1568         {
1569             *(pi4_quant_round_1_2 + scan_pos) = temp2;
1570         }
1571 
1572         for(scan_pos = 0; scan_pos < 16; scan_pos++)
1573         {
1574             //UWORD8 nbr_csbf = 1;
1575             /* derive the x,y pos */
1576             UWORD8 y_pos_x_pos = scan_pos;  //gu1_hevce_scan4x4[i4_scan_idx][scan_pos];
1577 
1578             /* 4x4 transform size increment uses lookup */
1579             sig_ctxinc = gu1_hevce_sigcoeff_ctxtinc_tr4[y_pos_x_pos];
1580 
1581             /*Get the mps state based on ctxt modes */
1582             state_mps = pu1_ctxt_model[sig_ctxinc + sig_coeff_base_ctxt];
1583 
1584             /* Bits taken to encode sig co-ef flag as 0 */
1585             u4_bits_estimated_r0 = gau2_ihevce_cabac_bin_to_bits[state_mps ^ 0];
1586 
1587             /* Bits taken to encode sig co-ef flag as 1, also account for sign bit worst case */
1588             //
1589             u4_bits_estimated_r1 =
1590                 (gau2_ihevce_cabac_bin_to_bits[state_mps ^ 1] + ROUND_Q12(1.000000000));
1591 
1592             /* Bits taken to encode sig co-ef flag as 1, also account for sign bit worst case */
1593             u4_bits_estimated_r1 += u4_bits_estimated_r1_temp;
1594 
1595             QUANT_ROUND_FACTOR(temp2, u4_bits_estimated_r1, u4_bits_estimated_r0, lamda_mod);
1596             *(pi4_quant_round_0_1 + scan_pos) = temp2;
1597         }
1598     }
1599     else
1600     {
1601         UWORD8 *pu1_hevce_sigcoeff_ctxtinc;
1602         WORD32 is_nbr_csb_state_mps;
1603 
1604         WORD32 state_mps;
1605         WORD32 gt1_ctxt = 0;
1606         WORD32 ctxt_set = 0;
1607         WORD32 ctxt_idx;
1608         /*1to2 rounding factor is same for all sub blocks except for sub-block = 0*/
1609         /*Hence will write all the sub-block with i >=1 coeff, and then overwrite for i = 0*/
1610 
1611         /*ctxt_set = 0 DC subblock, the previous state did not have 2
1612         ctxt_set = 1 DC subblock, the previous state did have >= 2
1613         ctxt_set = 2 AC subblock, the previous state did not have 2
1614         ctxt_set = 3 AC subblock, the previous state did have >= 2*/
1615         i = 1;
1616         ctxt_set = (i && is_luma) ? 2 : 0;
1617 
1618         ctxt_set++;
1619 
1620         /*0th position indicates the probability of 2 */
1621         /*1th position indicates the probability of 1 */
1622         /*2th position indicates the probability of 11 */
1623         /*3th position indicates the probability of 111 */
1624 
1625         gt1_ctxt = 0;
1626         ctxt_idx = (ctxt_set * 4) + abs_gt1_base_ctxt + gt1_ctxt;
1627 
1628         state_mps = pu1_ctxt_model[ctxt_idx];
1629 
1630         u4_bits_estimated_r2 = gau2_ihevce_cabac_bin_to_bits[state_mps ^ 1];
1631 
1632         u4_bits_estimated_r1 = gau2_ihevce_cabac_bin_to_bits[state_mps ^ 0];
1633         QUANT_ROUND_FACTOR(temp2, u4_bits_estimated_r2, u4_bits_estimated_r1, lamda_mod);
1634 
1635         for(scan_pos = 0; scan_pos < (16 * (i4_trans_size * i4_trans_size >> 4)); scan_pos++)
1636         {
1637             *(pi4_quant_round_1_2 + scan_pos) = temp2;
1638         }
1639 
1640         i = 0;
1641         ctxt_set = (i && is_luma) ? 2 : 0;
1642         ctxt_set++;
1643 
1644         /*0th position indicates the probability of 2 */
1645         /*1th position indicates the probability of 1 */
1646         /*2th position indicates the probability of 11 */
1647         /*3th position indicates the probability of 111 */
1648 
1649         gt1_ctxt = 0;
1650         ctxt_idx = (ctxt_set * 4) + abs_gt1_base_ctxt + gt1_ctxt;
1651 
1652         state_mps = pu1_ctxt_model[ctxt_idx];
1653 
1654         u4_bits_estimated_r2 = gau2_ihevce_cabac_bin_to_bits[state_mps ^ 1];
1655 
1656         u4_bits_estimated_r1 = gau2_ihevce_cabac_bin_to_bits[state_mps ^ 0];
1657         QUANT_ROUND_FACTOR(temp2, u4_bits_estimated_r2, u4_bits_estimated_r1, lamda_mod);
1658 
1659         for(scan_pos = 0; scan_pos < 16; scan_pos++)
1660         {
1661             *(pi4_quant_round_1_2 + ((scan_pos % 4) + ((scan_pos >> 2) * i4_trans_size))) = temp2;
1662         }
1663 
1664         {
1665             WORD32 ctxt_idx;
1666 
1667             WORD32 nbr_csbf_0, nbr_csbf_1;
1668             WORD32 state_mps_0, state_mps_1;
1669             ctxt_idx = IHEVC_CAB_CODED_SUBLK_IDX;
1670             ctxt_idx += is_luma ? 0 : 2;
1671 
1672             /* ctxt based on right / bottom avail csbf, section 9.3.3.1.3 */
1673             /* if neighbor not available, ctxt idx = 0*/
1674             nbr_csbf_0 = 0;
1675             ctxt_idx += nbr_csbf_0 ? 1 : 0;
1676             state_mps_0 = pu1_ctxt_model[ctxt_idx];
1677 
1678             nbr_csbf_1 = 1;
1679             ctxt_idx += nbr_csbf_1 ? 1 : 0;
1680             state_mps_1 = pu1_ctxt_model[ctxt_idx];
1681 
1682             is_nbr_csb_state_mps = ((state_mps_0 % 2) == 1) && ((state_mps_1 % 2) == 1);
1683         }
1684 
1685         if(1 == is_nbr_csb_state_mps)
1686         {
1687             for(i = 0; i < (i4_trans_size * i4_trans_size >> 4); i++)
1688             {
1689                 UWORD8 sig_ctxinc;
1690                 WORD32 state_mps;
1691                 WORD32 gt1_ctxt = 0;
1692                 WORD32 ctxt_set = 0;
1693 
1694                 WORD32 ctxt_idx;
1695 
1696                 /*Check if the cabac states had previous nbr available */
1697 
1698                 if(i == 0)
1699                     pu1_hevce_sigcoeff_ctxtinc = (UWORD8 *)&gu1_hevce_sigcoeff_ctxtinc[3][0];
1700                 else if(i < (i4_trans_size >> 2))
1701                     pu1_hevce_sigcoeff_ctxtinc = (UWORD8 *)&gu1_hevce_sigcoeff_ctxtinc[1][0];
1702                 else if((i % (i4_trans_size >> 2)) == 0)
1703                     pu1_hevce_sigcoeff_ctxtinc = (UWORD8 *)&gu1_hevce_sigcoeff_ctxtinc[2][0];
1704                 else
1705                     pu1_hevce_sigcoeff_ctxtinc = (UWORD8 *)&gu1_hevce_sigcoeff_ctxtinc[0][0];
1706 
1707                 if(((i % (i4_trans_size >> 2)) == 0) && (i != 0))
1708                     k++;
1709 
1710                 j = ((i4_trans_size * 4) * k) + ((i % (i4_trans_size >> 2)) * 4);
1711                 /*ctxt_set = 0 DC subblock, the previous state did not have 2
1712                 ctxt_set = 1 DC subblock, the previous state did have >= 2
1713                 ctxt_set = 2 AC subblock, the previous state did not have 2
1714                 ctxt_set = 3 AC subblock, the previous state did have >= 2*/
1715 
1716                 ctxt_set = (i && is_luma) ? 2 : 0;
1717 
1718                 /* gt1_ctxt = 1 for the co-ef value to be 1 */
1719                 gt1_ctxt = 0;
1720                 ctxt_idx = (ctxt_set * 4) + abs_gt1_base_ctxt + gt1_ctxt;
1721 
1722                 state_mps = pu1_ctxt_model[ctxt_idx];
1723 
1724                 /* Bits taken to encode sig co-ef flag as 1, also account for sign bit worst case */
1725                 u4_bits_estimated_r1_temp = gau2_ihevce_cabac_bin_to_bits[state_mps ^ 0];
1726 
1727                 for(scan_pos = 0; scan_pos < 16; scan_pos++)
1728                 {
1729                     UWORD8 y_pos_x_pos;
1730 
1731                     if(scan_pos || i)
1732                     {
1733                         y_pos_x_pos = scan_pos;  // gu1_hevce_scan4x4[i4_scan_idx][scan_pos];
1734                         /* ctxt for AC coeff depends on curpos and neigbour csbf */
1735                         sig_ctxinc = pu1_hevce_sigcoeff_ctxtinc[y_pos_x_pos];
1736 
1737                         /* based on luma subblock pos */
1738                         sig_ctxinc += (i && is_luma) ? 3 : 0;
1739 
1740                         sig_ctxinc += sig_coeff_base_ctxt;
1741                     }
1742                     else
1743                     {
1744                         /*MAM : both scan pos and i being 0 implies the DC coef of 1st block only */
1745                         /* DC coeff has fixed context for luma and chroma */
1746                         sig_ctxinc = is_luma ? IHEVC_CAB_COEFF_FLAG : IHEVC_CAB_COEFF_FLAG + 27;
1747                     }
1748 
1749                     /*Get the mps state based on ctxt modes */
1750                     state_mps = pu1_ctxt_model[sig_ctxinc];
1751 
1752                     /* Bits taken to encode sig co-ef flag as 0 */
1753                     u4_bits_estimated_r0 = gau2_ihevce_cabac_bin_to_bits[state_mps ^ 0];
1754 
1755                     u4_bits_estimated_r1 =
1756                         (gau2_ihevce_cabac_bin_to_bits[state_mps ^ 1] + ROUND_Q12(1.000000000));
1757 
1758                     /* Bits taken to encode sig co-ef flag as 1, also account for sign bit worst case */
1759                     u4_bits_estimated_r1 += u4_bits_estimated_r1_temp;
1760                     {
1761                         QUANT_ROUND_FACTOR(
1762                             temp2, u4_bits_estimated_r1, u4_bits_estimated_r0, lamda_mod);
1763                         *(pi4_quant_round_0_1 +
1764                           ((scan_pos % 4) + ((scan_pos >> 2) * i4_trans_size)) + j) = temp2;
1765                     }
1766                 }
1767             }
1768         }
1769         else
1770         {
1771             /*If Both nbr csbfs are 0, then all the coef in sub-blocks will have same value except for 1st subblock,
1772             Hence will write the same value to all sub block, and overwrite for the 1st one */
1773             i = 1;
1774             {
1775                 UWORD8 sig_ctxinc;
1776                 UWORD8 y_pos_x_pos;
1777                 WORD32 quant_rounding_0_1;
1778 
1779                 pu1_hevce_sigcoeff_ctxtinc = (UWORD8 *)&gu1_hevce_sigcoeff_ctxtinc_00[0];
1780 
1781                 scan_pos = 0;
1782                 y_pos_x_pos = scan_pos;  // gu1_hevce_scan4x4[i4_scan_idx][scan_pos];
1783                 /* ctxt for AC coeff depends on curpos and neigbour csbf */
1784                 sig_ctxinc = pu1_hevce_sigcoeff_ctxtinc[y_pos_x_pos];
1785 
1786                 /* based on luma subblock pos */
1787                 sig_ctxinc += (is_luma) ? 3 : 0;
1788 
1789                 sig_ctxinc += sig_coeff_base_ctxt;
1790 
1791                 /*Get the mps state based on ctxt modes */
1792                 state_mps = pu1_ctxt_model[sig_ctxinc];
1793 
1794                 /* Bits taken to encode sig co-ef flag as 0 */
1795                 u4_bits_estimated_r0 = gau2_ihevce_cabac_bin_to_bits[state_mps ^ 0];
1796 
1797                 u4_bits_estimated_r1 =
1798                     (gau2_ihevce_cabac_bin_to_bits[state_mps ^ 1] + ROUND_Q12(1.000000000));
1799 
1800                 /*ctxt_set = 0 DC subblock, the previous state did not have 2
1801                 ctxt_set = 1 DC subblock, the previous state did have >= 2
1802                 ctxt_set = 2 AC subblock, the previous state did not have 2
1803                 ctxt_set = 3 AC subblock, the previous state did have >= 2*/
1804 
1805                 ctxt_set = (i && is_luma) ? 2 : 0;
1806 
1807                 /* gt1_ctxt = 1 for the co-ef value to be 1 */
1808                 gt1_ctxt = 0;
1809                 ctxt_idx = (ctxt_set * 4) + abs_gt1_base_ctxt + gt1_ctxt;
1810 
1811                 state_mps = pu1_ctxt_model[ctxt_idx];
1812 
1813                 /* Bits taken to encode sig co-ef flag as 1, also account for sign bit worst case */
1814                 u4_bits_estimated_r1 += gau2_ihevce_cabac_bin_to_bits[state_mps ^ 0];
1815 
1816                 QUANT_ROUND_FACTOR(
1817                     quant_rounding_0_1, u4_bits_estimated_r1, u4_bits_estimated_r0, lamda_mod);
1818 
1819                 for(scan_pos = 0; scan_pos < (16 * (i4_trans_size * i4_trans_size >> 4));
1820                     scan_pos++)
1821                 {
1822                     *(pi4_quant_round_0_1 + scan_pos) = quant_rounding_0_1;
1823                 }
1824             }
1825 
1826             /*First Subblock*/
1827             i = 0;
1828 
1829             {
1830                 UWORD8 sig_ctxinc;
1831                 WORD32 state_mps;
1832                 WORD32 gt1_ctxt = 0;
1833                 WORD32 ctxt_set = 0;
1834 
1835                 WORD32 ctxt_idx;
1836 
1837                 /*Check if the cabac states had previous nbr available */
1838 
1839                 {
1840                     pu1_hevce_sigcoeff_ctxtinc = (UWORD8 *)&gu1_hevce_sigcoeff_ctxtinc[0][0];
1841 
1842                     /*ctxt_set = 0 DC subblock, the previous state did not have 2
1843                     ctxt_set = 1 DC subblock, the previous state did have >= 2
1844                     ctxt_set = 2 AC subblock, the previous state did not have 2
1845                     ctxt_set = 3 AC subblock, the previous state did have >= 2*/
1846                     ctxt_set = (i && is_luma) ? 2 : 0;
1847 
1848                     /* gt1_ctxt = 1 for the co-ef value to be 1 */
1849                     gt1_ctxt = 0;
1850                     ctxt_idx = (ctxt_set * 4) + abs_gt1_base_ctxt + gt1_ctxt;
1851 
1852                     state_mps = pu1_ctxt_model[ctxt_idx];
1853 
1854                     /* Bits taken to encode sig co-ef flag as 1, also account for sign bit worst case */
1855                     u4_bits_estimated_r1_temp = gau2_ihevce_cabac_bin_to_bits[state_mps ^ 0];
1856 
1857                     for(scan_pos = 0; scan_pos < 16; scan_pos++)
1858                     {
1859                         UWORD8 y_pos_x_pos;
1860 
1861                         if(scan_pos)
1862                         {
1863                             y_pos_x_pos = scan_pos;  // gu1_hevce_scan4x4[i4_scan_idx][scan_pos];
1864                             /* ctxt for AC coeff depends on curpos and neigbour csbf */
1865                             sig_ctxinc = pu1_hevce_sigcoeff_ctxtinc[y_pos_x_pos];
1866 
1867                             /* based on luma subblock pos */
1868                             sig_ctxinc += (i && is_luma) ? 3 : 0;
1869 
1870                             sig_ctxinc += sig_coeff_base_ctxt;
1871                         }
1872                         else
1873                         {
1874                             /*MAM : both scan pos and i being 0 implies the DC coef of 1st block only */
1875                             /* DC coeff has fixed context for luma and chroma */
1876                             sig_ctxinc = is_luma ? IHEVC_CAB_COEFF_FLAG : IHEVC_CAB_COEFF_FLAG + 27;
1877                         }
1878 
1879                         /*Get the mps state based on ctxt modes */
1880                         state_mps = pu1_ctxt_model[sig_ctxinc];
1881 
1882                         /* Bits taken to encode sig co-ef flag as 0 */
1883                         u4_bits_estimated_r0 = gau2_ihevce_cabac_bin_to_bits[state_mps ^ 0];
1884 
1885                         u4_bits_estimated_r1 =
1886                             (gau2_ihevce_cabac_bin_to_bits[state_mps ^ 1] + ROUND_Q12(1.000000000));
1887 
1888                         /* Bits taken to encode sig co-ef flag as 1, also account for sign bit worst case */
1889                         u4_bits_estimated_r1 += u4_bits_estimated_r1_temp;
1890                         {
1891                             QUANT_ROUND_FACTOR(
1892                                 temp2, u4_bits_estimated_r1, u4_bits_estimated_r0, lamda_mod);
1893                             *(pi4_quant_round_0_1 +
1894                               ((scan_pos % 4) + ((scan_pos >> 2) * i4_trans_size))) = temp2;
1895                         }
1896                     }
1897                 }
1898             }
1899         }
1900     }
1901     return;
1902 }
1903 
1904 /*!
1905 ******************************************************************************
1906 * \if Function name : ihevce_t_q_iq_ssd_scan_fxn \endif
1907 *
1908 * \brief
1909 *    Transform unit level (Luma) enc_loop function
1910 *
1911 * \param[in] ps_ctxt    enc_loop module ctxt pointer
1912 * \param[in] pu1_pred   pointer to predicted data buffer
1913 * \param[in] pred_strd  predicted buffer stride
1914 * \param[in] pu1_src    pointer to source data buffer
1915 * \param[in] src_strd   source buffer stride
1916 * \param[in] pi2_deq_data   pointer to store iq data
1917 * \param[in] deq_data_strd  iq data buffer stride
1918 * \param[out] pu1_ecd_data pointer coeff output buffer (input to ent cod)
1919 * \param[out] pu1_csbf_buf  pointer to store the csbf for all 4x4 in a current
1920 *                           block
1921 * \param[out] csbf_strd  csbf buffer stride
1922 * \param[in] trans_size transform size (4, 8, 16,32)
1923 * \param[in] packed_pred_mode   0:Inter 1:Intra 2:Skip
1924 * \param[out] pi8_cost      pointer to store the cost
1925 * \param[out] pi4_coeff_off pointer to store the number of bytes produced in
1926 *                           coeff buffer
1927 * \param[out] pi4_tu_bits   pointer to store the best TU bits required to encode
1928 *                           the current TU in RDopt mode
1929 * \param[out] pu4_blk_sad   pointer to store the block sad for RC
1930 * \param[out] pi4_zero_col  pointer to store the zero_col info for the TU
1931 * \param[out] pi4_zero_row  pointer to store the zero_row info for the TU
1932 * \param[in]  i4_perform_rdoq Indicates if RDOQ should be performed or not
1933 * \param[in]  i4_perform_sbh Indicates if SBH should be performed or not
1934 *
1935 * \return
1936 *    CBF of the current block
1937 *
1938 * \author
1939 *  Ittiam
1940 *
1941 *****************************************************************************
1942 */
1943 
ihevce_t_q_iq_ssd_scan_fxn(ihevce_enc_loop_ctxt_t * ps_ctxt,UWORD8 * pu1_pred,WORD32 pred_strd,UWORD8 * pu1_src,WORD32 src_strd,WORD16 * pi2_deq_data,WORD32 deq_data_strd,UWORD8 * pu1_recon,WORD32 i4_recon_stride,UWORD8 * pu1_ecd_data,UWORD8 * pu1_csbf_buf,WORD32 csbf_strd,WORD32 trans_size,WORD32 packed_pred_mode,LWORD64 * pi8_cost,WORD32 * pi4_coeff_off,WORD32 * pi4_tu_bits,UWORD32 * pu4_blk_sad,WORD32 * pi4_zero_col,WORD32 * pi4_zero_row,UWORD8 * pu1_is_recon_available,WORD32 i4_perform_rdoq,WORD32 i4_perform_sbh,WORD32 i4_alpha_stim_multiplier,UWORD8 u1_is_cu_noisy,SSD_TYPE_T e_ssd_type,WORD32 early_cbf)1944 WORD32 ihevce_t_q_iq_ssd_scan_fxn(
1945     ihevce_enc_loop_ctxt_t *ps_ctxt,
1946     UWORD8 *pu1_pred,
1947     WORD32 pred_strd,
1948     UWORD8 *pu1_src,
1949     WORD32 src_strd,
1950     WORD16 *pi2_deq_data,
1951     WORD32 deq_data_strd,
1952     UWORD8 *pu1_recon,
1953     WORD32 i4_recon_stride,
1954     UWORD8 *pu1_ecd_data,
1955     UWORD8 *pu1_csbf_buf,
1956     WORD32 csbf_strd,
1957     WORD32 trans_size,
1958     WORD32 packed_pred_mode,
1959     LWORD64 *pi8_cost,
1960     WORD32 *pi4_coeff_off,
1961     WORD32 *pi4_tu_bits,
1962     UWORD32 *pu4_blk_sad,
1963     WORD32 *pi4_zero_col,
1964     WORD32 *pi4_zero_row,
1965     UWORD8 *pu1_is_recon_available,
1966     WORD32 i4_perform_rdoq,
1967     WORD32 i4_perform_sbh,
1968 #if USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
1969     WORD32 i4_alpha_stim_multiplier,
1970     UWORD8 u1_is_cu_noisy,
1971 #endif
1972     SSD_TYPE_T e_ssd_type,
1973     WORD32 early_cbf)
1974 {
1975     WORD32 cbf = 0;
1976     WORD32 trans_idx;
1977     WORD32 quant_scale_mat_offset;
1978     WORD32 *pi4_trans_scratch;
1979     WORD16 *pi2_trans_values;
1980     WORD16 *pi2_quant_coeffs;
1981     WORD32 *pi4_subBlock2csbfId_map = NULL;
1982 
1983 #if PROHIBIT_INTRA_QUANT_ROUNDING_FACTOR_TO_DROP_BELOW_1BY3
1984     WORD32 ai4_quant_rounding_factors[3][MAX_TU_SIZE * MAX_TU_SIZE], i;
1985 #endif
1986 
1987     rdoq_sbh_ctxt_t *ps_rdoq_sbh_ctxt = &ps_ctxt->s_rdoq_sbh_ctxt;
1988 
1989     WORD32 i4_perform_zcbf = (ENABLE_INTER_ZCU_COST && (PRED_MODE_INTRA != packed_pred_mode)) ||
1990                              (ps_ctxt->i4_zcbf_rdo_level == ZCBF_ENABLE);
1991     WORD32 i4_perform_coeff_level_rdoq = (ps_ctxt->i4_quant_rounding_level != FIXED_QUANT_ROUNDING);
1992     WORD8 intra_flag = 0;
1993     ASSERT(csbf_strd == MAX_TU_IN_CTB_ROW);
1994 
1995     *pi4_tu_bits = 0;
1996     *pi4_coeff_off = 0;
1997     pu1_is_recon_available[0] = 0;
1998 
1999     if((PRED_MODE_SKIP == packed_pred_mode) || (0 == early_cbf))
2000     {
2001         if(e_ssd_type != NULL_TYPE)
2002         {
2003             /* SSD cost is stored to the pointer */
2004             pi8_cost[0] =
2005 
2006                 ps_ctxt->s_cmn_opt_func.pf_ssd_and_sad_calculator(
2007                     pu1_pred, pred_strd, pu1_src, src_strd, trans_size, pu4_blk_sad);
2008 
2009 #if USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
2010             if(u1_is_cu_noisy && i4_alpha_stim_multiplier)
2011             {
2012                 pi8_cost[0] = ihevce_inject_stim_into_distortion(
2013                     pu1_src,
2014                     src_strd,
2015                     pu1_pred,
2016                     pred_strd,
2017                     pi8_cost[0],
2018                     !ps_ctxt->u1_is_refPic ? ALPHA_FOR_ZERO_CODING_DECISIONS
2019                                            : ((100 - ALPHA_DISCOUNT_IN_REF_PICS_IN_RDOPT) *
2020                                               (double)ALPHA_FOR_ZERO_CODING_DECISIONS) /
2021                                                  100.0,
2022                     trans_size,
2023                     0,
2024                     ps_ctxt->u1_enable_psyRDOPT,
2025                     NULL_PLANE);
2026             }
2027 #endif
2028 
2029             /* copy pred to recon for skip mode */
2030             if(SPATIAL_DOMAIN_SSD == e_ssd_type)
2031             {
2032                 ps_ctxt->s_cmn_opt_func.pf_copy_2d(
2033                     pu1_recon, i4_recon_stride, pu1_pred, pred_strd, trans_size, trans_size);
2034                 pu1_is_recon_available[0] = 1;
2035             }
2036             else
2037             {
2038                 pu1_is_recon_available[0] = 0;
2039             }
2040 
2041 #if ENABLE_INTER_ZCU_COST
2042             ps_ctxt->i8_cu_not_coded_cost += pi8_cost[0];
2043 #endif
2044         }
2045         else
2046         {
2047             pi8_cost[0] = UINT_MAX;
2048         }
2049 
2050         /* cbf is returned as 0 */
2051         return (0);
2052     }
2053 
2054     /* derive context variables */
2055     pi4_trans_scratch = (WORD32 *)&ps_ctxt->ai2_scratch[0];
2056     pi2_quant_coeffs = &ps_ctxt->ai2_scratch[0];
2057     pi2_trans_values = &ps_ctxt->ai2_scratch[0] + (MAX_TRANS_SIZE * 2);
2058 
2059     /* translate the transform size to index for 4x4 and 8x8 */
2060     trans_idx = trans_size >> 2;
2061 
2062     if(PRED_MODE_INTRA == packed_pred_mode)
2063     {
2064         quant_scale_mat_offset = 0;
2065         intra_flag = 1;
2066 #if PROHIBIT_INTRA_QUANT_ROUNDING_FACTOR_TO_DROP_BELOW_1BY3
2067         ai4_quant_rounding_factors[0][0] =
2068             MAX(ps_ctxt->i4_quant_rnd_factor[intra_flag], (1 << QUANT_ROUND_FACTOR_Q) / 3);
2069 
2070         for(i = 0; i < trans_size * trans_size; i++)
2071         {
2072             ai4_quant_rounding_factors[1][i] =
2073                 MAX(ps_ctxt->pi4_quant_round_factor_tu_0_1[trans_size >> 3][i],
2074                     (1 << QUANT_ROUND_FACTOR_Q) / 3);
2075             ai4_quant_rounding_factors[2][i] =
2076                 MAX(ps_ctxt->pi4_quant_round_factor_tu_1_2[trans_size >> 3][i],
2077                     (1 << QUANT_ROUND_FACTOR_Q) / 3);
2078         }
2079 #endif
2080     }
2081     else
2082     {
2083         quant_scale_mat_offset = NUM_TRANS_TYPES;
2084     }
2085     /* for intra 4x4 DST transform should be used */
2086     if((1 == trans_idx) && (1 == intra_flag))
2087     {
2088         trans_idx = 0;
2089     }
2090     /* for 16x16 cases */
2091     else if(16 == trans_size)
2092     {
2093         trans_idx = 3;
2094     }
2095     /* for 32x32 cases */
2096     else if(32 == trans_size)
2097     {
2098         trans_idx = 4;
2099     }
2100 
2101     switch(trans_size)
2102     {
2103     case 4:
2104     {
2105         pi4_subBlock2csbfId_map = gai4_subBlock2csbfId_map4x4TU;
2106 
2107         break;
2108     }
2109     case 8:
2110     {
2111         pi4_subBlock2csbfId_map = gai4_subBlock2csbfId_map8x8TU;
2112 
2113         break;
2114     }
2115     case 16:
2116     {
2117         pi4_subBlock2csbfId_map = gai4_subBlock2csbfId_map16x16TU;
2118 
2119         break;
2120     }
2121     case 32:
2122     {
2123         pi4_subBlock2csbfId_map = gai4_subBlock2csbfId_map32x32TU;
2124 
2125         break;
2126     }
2127     }
2128 
2129     /* Do not call the FT and Quant functions if early_cbf is 0 */
2130     if(1 == early_cbf)
2131     {
2132         /* ---------- call residue and transform block ------- */
2133         *pu4_blk_sad = ps_ctxt->apf_resd_trns[trans_idx](
2134             pu1_src,
2135             pu1_pred,
2136             pi4_trans_scratch,
2137             pi2_trans_values,
2138             src_strd,
2139             pred_strd,
2140             ((trans_size << 16) + 0)); /* dst strd and chroma flag are packed together */
2141 
2142         cbf = ps_ctxt->apf_quant_iquant_ssd
2143                   [i4_perform_coeff_level_rdoq + (e_ssd_type != FREQUENCY_DOMAIN_SSD) * 2](
2144                       pi2_trans_values,
2145                       ps_ctxt->api2_rescal_mat[trans_idx + quant_scale_mat_offset],
2146                       pi2_quant_coeffs,
2147                       pi2_deq_data,
2148                       trans_size,
2149                       ps_ctxt->i4_cu_qp_div6,
2150                       ps_ctxt->i4_cu_qp_mod6,
2151 #if !PROHIBIT_INTRA_QUANT_ROUNDING_FACTOR_TO_DROP_BELOW_1BY3
2152                       ps_ctxt->i4_quant_rnd_factor[intra_flag],
2153                       ps_ctxt->pi4_quant_round_factor_tu_0_1[trans_size >> 3],
2154                       ps_ctxt->pi4_quant_round_factor_tu_1_2[trans_size >> 3],
2155 #else
2156                       intra_flag ? ai4_quant_rounding_factors[0][0]
2157                                  : ps_ctxt->i4_quant_rnd_factor[intra_flag],
2158                       intra_flag ? ai4_quant_rounding_factors[1]
2159                                  : ps_ctxt->pi4_quant_round_factor_tu_0_1[trans_size >> 3],
2160                       intra_flag ? ai4_quant_rounding_factors[2]
2161                                  : ps_ctxt->pi4_quant_round_factor_tu_1_2[trans_size >> 3],
2162 #endif
2163                       trans_size,
2164                       trans_size,
2165                       deq_data_strd,
2166                       pu1_csbf_buf,
2167                       csbf_strd,
2168                       pi4_zero_col,
2169                       pi4_zero_row,
2170                       ps_ctxt->api2_scal_mat[trans_idx + quant_scale_mat_offset],
2171                       pi8_cost);
2172 
2173         if(e_ssd_type != FREQUENCY_DOMAIN_SSD)
2174         {
2175             pi8_cost[0] = UINT_MAX;
2176         }
2177     }
2178 
2179     if(0 != cbf)
2180     {
2181         if(i4_perform_sbh || i4_perform_rdoq)
2182         {
2183             ps_rdoq_sbh_ctxt->i4_iq_data_strd = deq_data_strd;
2184             ps_rdoq_sbh_ctxt->i4_q_data_strd = trans_size;
2185             ps_rdoq_sbh_ctxt->pi4_subBlock2csbfId_map = pi4_subBlock2csbfId_map;
2186 
2187             ps_rdoq_sbh_ctxt->i4_qp_div = ps_ctxt->i4_cu_qp_div6;
2188             ps_rdoq_sbh_ctxt->i2_qp_rem = ps_ctxt->i4_cu_qp_mod6;
2189             ps_rdoq_sbh_ctxt->i4_scan_idx = ps_ctxt->i4_scan_idx;
2190             ps_rdoq_sbh_ctxt->i8_ssd_cost = *pi8_cost;
2191             ps_rdoq_sbh_ctxt->i4_trans_size = trans_size;
2192 
2193             ps_rdoq_sbh_ctxt->pi2_dequant_coeff =
2194                 ps_ctxt->api2_scal_mat[trans_idx + quant_scale_mat_offset];
2195             ps_rdoq_sbh_ctxt->pi2_iquant_coeffs = pi2_deq_data;
2196             ps_rdoq_sbh_ctxt->pi2_quant_coeffs = pi2_quant_coeffs;
2197             ps_rdoq_sbh_ctxt->pi2_trans_values = pi2_trans_values;
2198             ps_rdoq_sbh_ctxt->pu1_csbf_buf = pu1_csbf_buf;
2199 
2200             /* ------- call coeffs scan function ------- */
2201             if((!i4_perform_rdoq))
2202             {
2203                 ihevce_sign_data_hiding(ps_rdoq_sbh_ctxt);
2204 
2205                 pi8_cost[0] = ps_rdoq_sbh_ctxt->i8_ssd_cost;
2206             }
2207         }
2208 
2209         *pi4_coeff_off = ps_ctxt->s_cmn_opt_func.pf_scan_coeffs(
2210             pi2_quant_coeffs,
2211             pi4_subBlock2csbfId_map,
2212             ps_ctxt->i4_scan_idx,
2213             trans_size,
2214             pu1_ecd_data,
2215             pu1_csbf_buf,
2216             csbf_strd);
2217     }
2218     *pi8_cost >>= ga_trans_shift[trans_idx];
2219 
2220 #if RDOPT_ZERO_CBF_ENABLE
2221     /* compare null cbf cost with encode tu rd-cost */
2222     if(cbf != 0)
2223     {
2224         WORD32 tu_bits;
2225         LWORD64 tu_rd_cost;
2226 
2227         LWORD64 zero_cbf_cost = 0;
2228 
2229         /*Populating the fields of rdoq_ctxt structure*/
2230         if(i4_perform_rdoq)
2231         {
2232             /* transform size to log2transform size */
2233             GETRANGE(ps_rdoq_sbh_ctxt->i4_log2_trans_size, trans_size);
2234             ps_rdoq_sbh_ctxt->i4_log2_trans_size -= 1;
2235             ps_rdoq_sbh_ctxt->i8_cl_ssd_lambda_qf = ps_ctxt->i8_cl_ssd_lambda_qf;
2236             ps_rdoq_sbh_ctxt->i4_is_luma = 1;
2237             ps_rdoq_sbh_ctxt->i4_shift_val_ssd_in_td = ga_trans_shift[trans_idx];
2238             ps_rdoq_sbh_ctxt->i4_round_val_ssd_in_td =
2239                 (1 << ps_rdoq_sbh_ctxt->i4_shift_val_ssd_in_td) / 2;
2240             ps_rdoq_sbh_ctxt->i1_tu_is_coded = 0;
2241             ps_rdoq_sbh_ctxt->pi4_zero_col = pi4_zero_col;
2242             ps_rdoq_sbh_ctxt->pi4_zero_row = pi4_zero_row;
2243         }
2244         else if(i4_perform_zcbf)
2245         {
2246             zero_cbf_cost =
2247 
2248                 ps_ctxt->s_cmn_opt_func.pf_ssd_calculator(
2249                     pu1_src, pu1_pred, src_strd, pred_strd, trans_size, trans_size);
2250         }
2251 
2252         /************************************************************************/
2253         /* call the entropy rdo encode to get the bit estimate for current tu   */
2254         /* note that tu includes only residual coding bits and does not include */
2255         /* tu split, cbf and qp delta encoding bits for a TU                    */
2256         /************************************************************************/
2257         if(i4_perform_rdoq)
2258         {
2259             tu_bits = ihevce_entropy_rdo_encode_tu_rdoq(
2260                 &ps_ctxt->s_rdopt_entropy_ctxt,
2261                 (pu1_ecd_data),
2262                 trans_size,
2263                 1,
2264                 ps_rdoq_sbh_ctxt,
2265                 pi8_cost,
2266                 &zero_cbf_cost,
2267                 0);
2268 
2269             if(ps_rdoq_sbh_ctxt->i1_tu_is_coded == 0)
2270             {
2271                 cbf = 0;
2272                 *pi4_coeff_off = 0;
2273             }
2274 
2275             if((i4_perform_sbh) && (0 != cbf))
2276             {
2277                 ps_rdoq_sbh_ctxt->i8_ssd_cost = *pi8_cost;
2278                 ihevce_sign_data_hiding(ps_rdoq_sbh_ctxt);
2279                 *pi8_cost = ps_rdoq_sbh_ctxt->i8_ssd_cost;
2280             }
2281 
2282             /*Add round value before normalizing*/
2283             *pi8_cost += ps_rdoq_sbh_ctxt->i4_round_val_ssd_in_td;
2284             *pi8_cost >>= ga_trans_shift[trans_idx];
2285 
2286             if(ps_rdoq_sbh_ctxt->i1_tu_is_coded == 1)
2287             {
2288                 pi2_quant_coeffs = &ps_ctxt->ai2_scratch[0];
2289                 *pi4_coeff_off = ps_ctxt->s_cmn_opt_func.pf_scan_coeffs(
2290                     pi2_quant_coeffs,
2291                     pi4_subBlock2csbfId_map,
2292                     ps_ctxt->i4_scan_idx,
2293                     trans_size,
2294                     pu1_ecd_data,
2295                     pu1_csbf_buf,
2296                     csbf_strd);
2297             }
2298         }
2299         else
2300         {
2301             tu_bits = ihevce_entropy_rdo_encode_tu(
2302                 &ps_ctxt->s_rdopt_entropy_ctxt, pu1_ecd_data, trans_size, 1, i4_perform_sbh);
2303         }
2304 
2305         *pi4_tu_bits = tu_bits;
2306 
2307         if(e_ssd_type == SPATIAL_DOMAIN_SSD)
2308         {
2309             *pi8_cost = ihevce_it_recon_ssd(
2310                 ps_ctxt,
2311                 pu1_src,
2312                 src_strd,
2313                 pu1_pred,
2314                 pred_strd,
2315                 pi2_deq_data,
2316                 deq_data_strd,
2317                 pu1_recon,
2318                 i4_recon_stride,
2319                 pu1_ecd_data,
2320                 trans_size,
2321                 packed_pred_mode,
2322                 cbf,
2323                 *pi4_zero_col,
2324                 *pi4_zero_row,
2325                 NULL_PLANE);
2326 
2327             pu1_is_recon_available[0] = 1;
2328         }
2329 
2330 #if USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
2331         if(u1_is_cu_noisy && (e_ssd_type == SPATIAL_DOMAIN_SSD) && i4_alpha_stim_multiplier)
2332         {
2333             pi8_cost[0] = ihevce_inject_stim_into_distortion(
2334                 pu1_src,
2335                 src_strd,
2336                 pu1_recon,
2337                 i4_recon_stride,
2338                 pi8_cost[0],
2339                 i4_alpha_stim_multiplier,
2340                 trans_size,
2341                 0,
2342                 ps_ctxt->u1_enable_psyRDOPT,
2343                 NULL_PLANE);
2344         }
2345         else if(u1_is_cu_noisy && (e_ssd_type == FREQUENCY_DOMAIN_SSD) && i4_alpha_stim_multiplier)
2346         {
2347             pi8_cost[0] = ihevce_inject_stim_into_distortion(
2348                 pu1_src,
2349                 src_strd,
2350                 pu1_pred,
2351                 pred_strd,
2352                 pi8_cost[0],
2353                 i4_alpha_stim_multiplier,
2354                 trans_size,
2355                 0,
2356                 ps_ctxt->u1_enable_psyRDOPT,
2357                 NULL_PLANE);
2358         }
2359 #endif
2360 
2361         /* add the SSD cost to bits estimate given by ECD */
2362         tu_rd_cost = *pi8_cost + COMPUTE_RATE_COST_CLIP30(
2363                                      tu_bits, ps_ctxt->i8_cl_ssd_lambda_qf, LAMBDA_Q_SHIFT);
2364 
2365         if(i4_perform_zcbf)
2366         {
2367 #if USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
2368             if(u1_is_cu_noisy && i4_alpha_stim_multiplier)
2369             {
2370                 zero_cbf_cost = ihevce_inject_stim_into_distortion(
2371                     pu1_src,
2372                     src_strd,
2373                     pu1_pred,
2374                     pred_strd,
2375                     zero_cbf_cost,
2376                     !ps_ctxt->u1_is_refPic ? ALPHA_FOR_ZERO_CODING_DECISIONS
2377                                            : ((100 - ALPHA_DISCOUNT_IN_REF_PICS_IN_RDOPT) *
2378                                               (double)ALPHA_FOR_ZERO_CODING_DECISIONS) /
2379                                                  100.0,
2380                     trans_size,
2381                     0,
2382                     ps_ctxt->u1_enable_psyRDOPT,
2383                     NULL_PLANE);
2384             }
2385 #endif
2386 
2387             /* force the tu as zero cbf if zero_cbf_cost is lower */
2388             if(zero_cbf_cost < tu_rd_cost)
2389             {
2390                 /* num bytes is set to 0 */
2391                 *pi4_coeff_off = 0;
2392 
2393                 /* cbf is returned as 0 */
2394                 cbf = 0;
2395 
2396                 /* cost is returned as 0 cbf cost */
2397                 *pi8_cost = zero_cbf_cost;
2398 
2399                 /* TU bits is set to 0 */
2400                 *pi4_tu_bits = 0;
2401                 pu1_is_recon_available[0] = 0;
2402 
2403                 if(SPATIAL_DOMAIN_SSD == e_ssd_type)
2404                 {
2405                     /* copy pred to recon for zcbf mode */
2406 
2407                     ps_ctxt->s_cmn_opt_func.pf_copy_2d(
2408                         pu1_recon, i4_recon_stride, pu1_pred, pred_strd, trans_size, trans_size);
2409 
2410                     pu1_is_recon_available[0] = 1;
2411                 }
2412             }
2413             /* accumulate cu not coded cost with zcbf cost */
2414 #if ENABLE_INTER_ZCU_COST
2415             ps_ctxt->i8_cu_not_coded_cost += zero_cbf_cost;
2416 #endif
2417         }
2418     }
2419     else
2420     {
2421         /* cbf = 0, accumulate cu not coded cost */
2422         if(e_ssd_type == SPATIAL_DOMAIN_SSD)
2423         {
2424             *pi8_cost = ihevce_it_recon_ssd(
2425                 ps_ctxt,
2426                 pu1_src,
2427                 src_strd,
2428                 pu1_pred,
2429                 pred_strd,
2430                 pi2_deq_data,
2431                 deq_data_strd,
2432                 pu1_recon,
2433                 i4_recon_stride,
2434                 pu1_ecd_data,
2435                 trans_size,
2436                 packed_pred_mode,
2437                 cbf,
2438                 *pi4_zero_col,
2439                 *pi4_zero_row,
2440                 NULL_PLANE);
2441 
2442             pu1_is_recon_available[0] = 1;
2443         }
2444 
2445 #if ENABLE_INTER_ZCU_COST
2446         {
2447 #if USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
2448             if(u1_is_cu_noisy && (e_ssd_type == SPATIAL_DOMAIN_SSD) && i4_alpha_stim_multiplier)
2449             {
2450                 pi8_cost[0] = ihevce_inject_stim_into_distortion(
2451                     pu1_src,
2452                     src_strd,
2453                     pu1_recon,
2454                     i4_recon_stride,
2455                     pi8_cost[0],
2456                     !ps_ctxt->u1_is_refPic ? ALPHA_FOR_ZERO_CODING_DECISIONS
2457                                            : ((100 - ALPHA_DISCOUNT_IN_REF_PICS_IN_RDOPT) *
2458                                               (double)ALPHA_FOR_ZERO_CODING_DECISIONS) /
2459                                                  100.0,
2460                     trans_size,
2461                     0,
2462                     ps_ctxt->u1_enable_psyRDOPT,
2463                     NULL_PLANE);
2464             }
2465             else if(u1_is_cu_noisy && (e_ssd_type == FREQUENCY_DOMAIN_SSD) && i4_alpha_stim_multiplier)
2466             {
2467                 pi8_cost[0] = ihevce_inject_stim_into_distortion(
2468                     pu1_src,
2469                     src_strd,
2470                     pu1_pred,
2471                     pred_strd,
2472                     pi8_cost[0],
2473                     !ps_ctxt->u1_is_refPic ? ALPHA_FOR_ZERO_CODING_DECISIONS
2474                                            : ((100 - ALPHA_DISCOUNT_IN_REF_PICS_IN_RDOPT) *
2475                                               (double)ALPHA_FOR_ZERO_CODING_DECISIONS) /
2476                                                  100.0,
2477                     trans_size,
2478                     0,
2479                     ps_ctxt->u1_enable_psyRDOPT,
2480                     NULL_PLANE);
2481             }
2482 #endif
2483 
2484             ps_ctxt->i8_cu_not_coded_cost += *pi8_cost;
2485         }
2486 #endif /* ENABLE_INTER_ZCU_COST */
2487     }
2488 #endif
2489 
2490     return (cbf);
2491 }
2492 
2493 /*!
2494 ******************************************************************************
2495 * \if Function name : ihevce_it_recon_fxn \endif
2496 *
2497 * \brief
2498 *    Transform unit level (Luma) IT Recon function
2499 *
2500 * \param[in] ps_ctxt        enc_loop module ctxt pointer
2501 * \param[in] pi2_deq_data   pointer to iq data
2502 * \param[in] deq_data_strd  iq data buffer stride
2503 * \param[in] pu1_pred       pointer to predicted data buffer
2504 * \param[in] pred_strd      predicted buffer stride
2505 * \param[in] pu1_recon      pointer to recon buffer
2506 * \param[in] recon_strd     recon buffer stride
2507 * \param[out] pu1_ecd_data  pointer coeff output buffer (input to ent cod)
2508 * \param[in] trans_size     transform size (4, 8, 16,32)
2509 * \param[in] packed_pred_mode   0:Inter 1:Intra 2:Skip
2510 * \param[in] cbf            CBF of the current block
2511 * \param[in] zero_cols      zero_cols of the current block
2512 * \param[in] zero_rows      zero_rows of the current block
2513 *
2514 * \return
2515 *
2516 * \author
2517 *  Ittiam
2518 *
2519 *****************************************************************************
2520 */
2521 
ihevce_it_recon_fxn(ihevce_enc_loop_ctxt_t * ps_ctxt,WORD16 * pi2_deq_data,WORD32 deq_dat_strd,UWORD8 * pu1_pred,WORD32 pred_strd,UWORD8 * pu1_recon,WORD32 recon_strd,UWORD8 * pu1_ecd_data,WORD32 trans_size,WORD32 packed_pred_mode,WORD32 cbf,WORD32 zero_cols,WORD32 zero_rows)2522 void ihevce_it_recon_fxn(
2523     ihevce_enc_loop_ctxt_t *ps_ctxt,
2524     WORD16 *pi2_deq_data,
2525     WORD32 deq_dat_strd,
2526     UWORD8 *pu1_pred,
2527     WORD32 pred_strd,
2528     UWORD8 *pu1_recon,
2529     WORD32 recon_strd,
2530     UWORD8 *pu1_ecd_data,
2531     WORD32 trans_size,
2532     WORD32 packed_pred_mode,
2533     WORD32 cbf,
2534     WORD32 zero_cols,
2535     WORD32 zero_rows)
2536 {
2537     WORD32 dc_add_flag = 0;
2538     WORD32 trans_idx;
2539 
2540     /* translate the transform size to index for 4x4 and 8x8 */
2541     trans_idx = trans_size >> 2;
2542 
2543     /* if SKIP mode needs to be evaluated the pred is copied to recon */
2544     if(PRED_MODE_SKIP == packed_pred_mode)
2545     {
2546         UWORD8 *pu1_curr_recon, *pu1_curr_pred;
2547 
2548         pu1_curr_pred = pu1_pred;
2549         pu1_curr_recon = pu1_recon;
2550 
2551         /* 2D copy of data */
2552 
2553         ps_ctxt->s_cmn_opt_func.pf_2d_square_copy(
2554             pu1_curr_recon, recon_strd, pu1_curr_pred, pred_strd, trans_size, sizeof(UWORD8));
2555 
2556         return;
2557     }
2558 
2559     /* for intra 4x4 DST transform should be used */
2560     if((1 == trans_idx) && (PRED_MODE_INTRA == packed_pred_mode))
2561     {
2562         trans_idx = 0;
2563     }
2564     /* for 16x16 cases */
2565     else if(16 == trans_size)
2566     {
2567         trans_idx = 3;
2568     }
2569     /* for 32x32 cases */
2570     else if(32 == trans_size)
2571     {
2572         trans_idx = 4;
2573     }
2574 
2575     /*if (lastx == 0 && lasty == 0) , ie only 1 coefficient */
2576     if((0 == pu1_ecd_data[0]) && (0 == pu1_ecd_data[1]))
2577     {
2578         dc_add_flag = 1;
2579     }
2580 
2581     if(0 == cbf)
2582     {
2583         /* buffer copy */
2584         ps_ctxt->s_cmn_opt_func.pf_2d_square_copy(
2585             pu1_recon, recon_strd, pu1_pred, pred_strd, trans_size, 1);
2586     }
2587     else if((1 == dc_add_flag) && (0 != trans_idx))
2588     {
2589         /* dc add */
2590         ps_ctxt->s_cmn_opt_func.pf_itrans_recon_dc(
2591             pu1_pred,
2592             pred_strd,
2593             pu1_recon,
2594             recon_strd,
2595             trans_size,
2596             pi2_deq_data[0],
2597             NULL_PLANE /* luma */
2598         );
2599     }
2600     else
2601     {
2602         ps_ctxt->apf_it_recon[trans_idx](
2603             pi2_deq_data,
2604             &ps_ctxt->ai2_scratch[0],
2605             pu1_pred,
2606             pu1_recon,
2607             deq_dat_strd,
2608             pred_strd,
2609             recon_strd,
2610             zero_cols,
2611             zero_rows);
2612     }
2613 }
2614 
2615 /*!
2616 ******************************************************************************
2617 * \if Function name : ihevce_chroma_it_recon_fxn \endif
2618 *
2619 * \brief
2620 *    Transform unit level (Chroma) IT Recon function
2621 *
2622 * \param[in] ps_ctxt        enc_loop module ctxt pointer
2623 * \param[in] pi2_deq_data   pointer to iq data
2624 * \param[in] deq_data_strd  iq data buffer stride
2625 * \param[in] pu1_pred       pointer to predicted data buffer
2626 * \param[in] pred_strd      predicted buffer stride
2627 * \param[in] pu1_recon      pointer to recon buffer
2628 * \param[in] recon_strd     recon buffer stride
2629 * \param[out] pu1_ecd_data  pointer coeff output buffer (input to ent cod)
2630 * \param[in] trans_size     transform size (4, 8, 16)
2631 * \param[in] cbf            CBF of the current block
2632 * \param[in] zero_cols      zero_cols of the current block
2633 * \param[in] zero_rows      zero_rows of the current block
2634 *
2635 * \return
2636 *
2637 * \author
2638 *  Ittiam
2639 *
2640 *****************************************************************************
2641 */
2642 
ihevce_chroma_it_recon_fxn(ihevce_enc_loop_ctxt_t * ps_ctxt,WORD16 * pi2_deq_data,WORD32 deq_dat_strd,UWORD8 * pu1_pred,WORD32 pred_strd,UWORD8 * pu1_recon,WORD32 recon_strd,UWORD8 * pu1_ecd_data,WORD32 trans_size,WORD32 cbf,WORD32 zero_cols,WORD32 zero_rows,CHROMA_PLANE_ID_T e_chroma_plane)2643 void ihevce_chroma_it_recon_fxn(
2644     ihevce_enc_loop_ctxt_t *ps_ctxt,
2645     WORD16 *pi2_deq_data,
2646     WORD32 deq_dat_strd,
2647     UWORD8 *pu1_pred,
2648     WORD32 pred_strd,
2649     UWORD8 *pu1_recon,
2650     WORD32 recon_strd,
2651     UWORD8 *pu1_ecd_data,
2652     WORD32 trans_size,
2653     WORD32 cbf,
2654     WORD32 zero_cols,
2655     WORD32 zero_rows,
2656     CHROMA_PLANE_ID_T e_chroma_plane)
2657 {
2658     WORD32 trans_idx;
2659 
2660     ASSERT((e_chroma_plane == U_PLANE) || (e_chroma_plane == V_PLANE));
2661 
2662     /* since 2x2 transform is not allowed for chroma*/
2663     if(2 == trans_size)
2664     {
2665         trans_size = 4;
2666     }
2667 
2668     /* translate the transform size to index */
2669     trans_idx = trans_size >> 2;
2670 
2671     /* for 16x16 cases */
2672     if(16 == trans_size)
2673     {
2674         trans_idx = 3;
2675     }
2676 
2677     if(0 == cbf)
2678     {
2679         /* buffer copy */
2680         ps_ctxt->s_cmn_opt_func.pf_chroma_interleave_2d_copy(
2681             pu1_pred, pred_strd, pu1_recon, recon_strd, trans_size, trans_size, e_chroma_plane);
2682     }
2683     else if((0 == pu1_ecd_data[0]) && (0 == pu1_ecd_data[1]))
2684     {
2685         /* dc add */
2686         ps_ctxt->s_cmn_opt_func.pf_itrans_recon_dc(
2687             pu1_pred,
2688             pred_strd,
2689             pu1_recon,
2690             recon_strd,
2691             trans_size,
2692             pi2_deq_data[0],
2693             e_chroma_plane /* chroma plane */
2694         );
2695     }
2696     else
2697     {
2698         ps_ctxt->apf_chrm_it_recon[trans_idx - 1](
2699             pi2_deq_data,
2700             &ps_ctxt->ai2_scratch[0],
2701             pu1_pred + (WORD32)e_chroma_plane,
2702             pu1_recon + (WORD32)e_chroma_plane,
2703             deq_dat_strd,
2704             pred_strd,
2705             recon_strd,
2706             zero_cols,
2707             zero_rows);
2708     }
2709 }
2710 
2711 /**
2712 *******************************************************************************
2713 * \if Function name : ihevce_mpm_idx_based_filter_RDOPT_cand \endif
2714 *
2715 * \brief * Filters the RDOPT candidates based on mpm_idx
2716 *
2717 * \par   Description
2718 * Computes the b1_prev_intra_luma_pred_flag, b2_mpm_idx & b5_rem_intra_pred_mode
2719 * for a CU
2720 *
2721 * \param[in] ps_ctxt : ptr to enc loop context
2722 * \param[in] ps_cu_analyse : ptr to CU analyse structure
2723 * \param[in] ps_top_nbr_4x4 top 4x4 neighbour pointer
2724 * \param[in] ps_left_nbr_4x4 left 4x4 neighbour pointer
2725 * \param[in] pu1_luma_mode luma mode
2726 *
2727 * \returns none
2728 *
2729 * \author
2730 *  Ittiam
2731 *
2732 *******************************************************************************
2733 */
2734 
ihevce_mpm_idx_based_filter_RDOPT_cand(ihevce_enc_loop_ctxt_t * ps_ctxt,cu_analyse_t * ps_cu_analyse,nbr_4x4_t * ps_left_nbr_4x4,nbr_4x4_t * ps_top_nbr_4x4,UWORD8 * pu1_luma_mode,UWORD8 * pu1_eval_mark)2735 void ihevce_mpm_idx_based_filter_RDOPT_cand(
2736     ihevce_enc_loop_ctxt_t *ps_ctxt,
2737     cu_analyse_t *ps_cu_analyse,
2738     nbr_4x4_t *ps_left_nbr_4x4,
2739     nbr_4x4_t *ps_top_nbr_4x4,
2740     UWORD8 *pu1_luma_mode,
2741     UWORD8 *pu1_eval_mark)
2742 {
2743     WORD32 cu_pos_x;
2744     WORD32 cu_pos_y;
2745     nbr_avail_flags_t s_nbr;
2746     WORD32 trans_size;
2747     WORD32 au4_cand_mode_list[3];
2748     WORD32 nbr_flags;
2749     UWORD8 *pu1_intra_luma_modes;
2750     WORD32 rdopt_cand_ctr = 0;
2751     UWORD8 *pu1_luma_eval_mark;
2752 
2753     cu_pos_x = ps_cu_analyse->b3_cu_pos_x << 1;
2754     cu_pos_y = ps_cu_analyse->b3_cu_pos_y << 1;
2755     trans_size = ps_cu_analyse->u1_cu_size;
2756 
2757     /* get the neighbour availability flags */
2758     nbr_flags = ihevce_get_nbr_intra(
2759         &s_nbr,
2760         ps_ctxt->pu1_ctb_nbr_map,
2761         ps_ctxt->i4_nbr_map_strd,
2762         cu_pos_x,
2763         cu_pos_y,
2764         trans_size >> 2);
2765     (void)nbr_flags;
2766     /*Call the fun to populate luma intra pred mode fro TU=CU and use the same list fro
2767     *TU=CU/2 also since the modes are same in both the cases.
2768     */
2769     ihevce_populate_intra_pred_mode(
2770         ps_top_nbr_4x4->b6_luma_intra_mode,
2771         ps_left_nbr_4x4->b6_luma_intra_mode,
2772         s_nbr.u1_top_avail,
2773         s_nbr.u1_left_avail,
2774         cu_pos_y,
2775         &au4_cand_mode_list[0]);
2776 
2777     /*Loop through all the RDOPT candidates of TU=CU and TU=CU/2 and check if the current RDOPT
2778     *cand is present in a4_cand_mode_list, If yes set eval flag to 1 else set it to zero
2779     */
2780 
2781     pu1_intra_luma_modes = pu1_luma_mode;
2782     pu1_luma_eval_mark = pu1_eval_mark;
2783 
2784     while(pu1_intra_luma_modes[rdopt_cand_ctr] != 255)
2785     {
2786         WORD32 i;
2787         WORD32 found_flag = 0;
2788 
2789         /*1st candidate of TU=CU list and TU=CU/2 list must go through RDOPT stage
2790         *irrespective of whether the cand is present in the mpm idx list or not
2791         */
2792         if(rdopt_cand_ctr == 0)
2793         {
2794             rdopt_cand_ctr++;
2795             continue;
2796         }
2797 
2798         for(i = 0; i < 3; i++)
2799         {
2800             if(pu1_intra_luma_modes[rdopt_cand_ctr] == au4_cand_mode_list[i])
2801             {
2802                 found_flag = 1;
2803                 break;
2804             }
2805         }
2806 
2807         if(found_flag == 0)
2808         {
2809             pu1_luma_eval_mark[rdopt_cand_ctr] = 0;
2810         }
2811 
2812         rdopt_cand_ctr++;
2813     }
2814 }
2815 
2816 /*!
2817 ******************************************************************************
2818 * \if Function name : ihevce_intra_rdopt_cu_ntu \endif
2819 *
2820 * \brief
*    Intra Coding unit function for RD opt mode
2822 *
2823 * \param[in] ps_ctxt    enc_loop module ctxt pointer
2824 * \param[in] ps_chrm_cu_buf_prms pointer to chroma buffer pointers structure
2825 * \param[in] pu1_luma_mode : pointer to luma mode
2826 * \param[in] ps_cu_analyse  pointer to cu analyse pointer
2827 * \param[in] pu1_src    pointer to source data buffer
2828 * \param[in] src_strd   source buffer stride
2829 * \param[in] pu1_cu_left pointer to left recon data buffer
2830 * \param[in] pu1_cu_top  pointer to top recon data buffer
2831 * \param[in] pu1_cu_top_left pointer to top left recon data buffer
2832 * \param[in] ps_left_nbr_4x4 : left 4x4 neighbour pointer
2833 * \param[in] ps_top_nbr_4x4 : top 4x4 neighbour pointer
2834 * \param[in] nbr_4x4_left_strd left nbr4x4 stride
2835 * \param[in] cu_left_stride left recon buffer stride
2836 * \param[in] curr_buf_idx RD opt buffer index for current usage
* \param[in] func_proc_mode : function processing mode @sa TU_SIZE_WRT_CU_T
2838 *
2839 * \return
2840 *    RDopt cost
2841 *
2842 * \author
2843 *  Ittiam
2844 *
2845 *****************************************************************************
2846 */
ihevce_intra_rdopt_cu_ntu(ihevce_enc_loop_ctxt_t * ps_ctxt,enc_loop_cu_prms_t * ps_cu_prms,void * pv_pred_org,WORD32 pred_strd_org,enc_loop_chrm_cu_buf_prms_t * ps_chrm_cu_buf_prms,UWORD8 * pu1_luma_mode,cu_analyse_t * ps_cu_analyse,void * pv_curr_src,void * pv_cu_left,void * pv_cu_top,void * pv_cu_top_left,nbr_4x4_t * ps_left_nbr_4x4,nbr_4x4_t * ps_top_nbr_4x4,WORD32 nbr_4x4_left_strd,WORD32 cu_left_stride,WORD32 curr_buf_idx,WORD32 func_proc_mode,WORD32 i4_alpha_stim_multiplier)2847 LWORD64 ihevce_intra_rdopt_cu_ntu(
2848     ihevce_enc_loop_ctxt_t *ps_ctxt,
2849     enc_loop_cu_prms_t *ps_cu_prms,
2850     void *pv_pred_org,
2851     WORD32 pred_strd_org,
2852     enc_loop_chrm_cu_buf_prms_t *ps_chrm_cu_buf_prms,
2853     UWORD8 *pu1_luma_mode,
2854     cu_analyse_t *ps_cu_analyse,
2855     void *pv_curr_src,
2856     void *pv_cu_left,
2857     void *pv_cu_top,
2858     void *pv_cu_top_left,
2859     nbr_4x4_t *ps_left_nbr_4x4,
2860     nbr_4x4_t *ps_top_nbr_4x4,
2861     WORD32 nbr_4x4_left_strd,
2862     WORD32 cu_left_stride,
2863     WORD32 curr_buf_idx,
2864     WORD32 func_proc_mode,
2865     WORD32 i4_alpha_stim_multiplier)
2866 {
2867     enc_loop_cu_final_prms_t *ps_final_prms;
2868     nbr_avail_flags_t s_nbr;
2869     nbr_4x4_t *ps_nbr_4x4;
2870     nbr_4x4_t *ps_tmp_lt_4x4;
2871     recon_datastore_t *ps_recon_datastore;
2872 
2873     ihevc_intra_pred_luma_ref_substitution_ft *ihevc_intra_pred_luma_ref_substitution_fptr;
2874 
2875     UWORD32 *pu4_nbr_flags;
2876     UWORD8 *pu1_intra_pred_mode;
2877     WORD32 cu_pos_x;
2878     WORD32 cu_pos_y;
2879     WORD32 trans_size = 0;
2880     UWORD8 *pu1_left;
2881     UWORD8 *pu1_top;
2882     UWORD8 *pu1_top_left;
2883     UWORD8 *pu1_recon;
2884     UWORD8 *pu1_csbf_buf;
2885     UWORD8 *pu1_ecd_data;
2886     WORD16 *pi2_deq_data;
2887     WORD32 deq_data_strd;
2888     LWORD64 total_rdopt_cost;
2889     WORD32 ctr;
2890     WORD32 left_strd;
2891     WORD32 i4_recon_stride;
2892     WORD32 csbf_strd;
2893     WORD32 ecd_data_bytes_cons;
2894     WORD32 num_4x4_in_tu;
2895     WORD32 num_4x4_in_cu;
2896     WORD32 chrm_present_flag;
2897     WORD32 tx_size;
2898     WORD32 cu_bits;
2899     WORD32 num_cu_parts = 0;
2900     WORD32 num_cands = 0;
2901     WORD32 cu_pos_x_8pelunits;
2902     WORD32 cu_pos_y_8pelunits;
2903     WORD32 i4_perform_rdoq;
2904     WORD32 i4_perform_sbh;
2905     UWORD8 u1_compute_spatial_ssd;
2906     UWORD8 u1_compute_recon;
2907     UWORD8 au1_intra_nxn_rdopt_ctxt_models[2][IHEVC_CAB_CTXT_END];
2908 
2909     UWORD16 u2_num_tus_in_cu = 0;
2910     WORD32 is_sub_pu_in_hq = 0;
2911     /* Get the RDOPT cost of the best CU mode for early_exit */
2912     LWORD64 prev_best_rdopt_cost = ps_ctxt->as_cu_prms[!curr_buf_idx].i8_best_rdopt_cost;
2913     /* cabac context of prev intra luma pred flag */
2914     UWORD8 u1_prev_flag_cabac_ctxt =
2915         ps_ctxt->au1_rdopt_init_ctxt_models[IHEVC_CAB_INTRA_LUMA_PRED_FLAG];
2916     WORD32 src_strd = ps_cu_prms->i4_luma_src_stride;
2917 
2918     UWORD8 u1_is_cu_noisy = ps_cu_prms->u1_is_cu_noisy && !DISABLE_INTRA_WHEN_NOISY;
2919 
2920     total_rdopt_cost = 0;
2921     ps_final_prms = &ps_ctxt->as_cu_prms[curr_buf_idx];
2922     ps_recon_datastore = &ps_final_prms->s_recon_datastore;
2923     i4_recon_stride = ps_final_prms->s_recon_datastore.i4_lumaRecon_stride;
2924     csbf_strd = ps_ctxt->i4_cu_csbf_strd;
2925     pu1_csbf_buf = &ps_ctxt->au1_cu_csbf[0];
2926     pu1_ecd_data = &ps_final_prms->pu1_cu_coeffs[0];
2927     pi2_deq_data = &ps_final_prms->pi2_cu_deq_coeffs[0];
2928     deq_data_strd = ps_cu_analyse->u1_cu_size; /* deq_data stride is cu size */
2929     ps_nbr_4x4 = &ps_ctxt->as_cu_nbr[curr_buf_idx][0];
2930     ps_tmp_lt_4x4 = ps_left_nbr_4x4;
2931     pu4_nbr_flags = &ps_final_prms->au4_nbr_flags[0];
2932     pu1_intra_pred_mode = &ps_final_prms->au1_intra_pred_mode[0];
2933     cu_pos_x = ps_cu_analyse->b3_cu_pos_x;
2934     cu_pos_y = ps_cu_analyse->b3_cu_pos_y;
2935     cu_pos_x_8pelunits = cu_pos_x;
2936     cu_pos_y_8pelunits = cu_pos_y;
2937 
2938     /* reset cu not coded cost */
2939     ps_ctxt->i8_cu_not_coded_cost = 0;
2940 
2941     /* based on the Processng mode */
2942     if(TU_EQ_CU == func_proc_mode)
2943     {
2944         ps_final_prms->u1_part_mode = SIZE_2Nx2N;
2945         trans_size = ps_cu_analyse->u1_cu_size;
2946         num_cu_parts = 1;
2947         num_cands = 1;
2948         u2_num_tus_in_cu = 1;
2949     }
2950     else if(TU_EQ_CU_DIV2 == func_proc_mode)
2951     {
2952         ps_final_prms->u1_part_mode = SIZE_2Nx2N;
2953         trans_size = ps_cu_analyse->u1_cu_size >> 1;
2954         num_cu_parts = 4;
2955         num_cands = 1;
2956         u2_num_tus_in_cu = 4;
2957     }
2958     else if(TU_EQ_SUBCU == func_proc_mode)
2959     {
2960         ps_final_prms->u1_part_mode = SIZE_NxN;
2961         trans_size = ps_cu_analyse->u1_cu_size >> 1;
2962         num_cu_parts = 4;
2963         /*In HQ for TU = SUBPU, all 35 modes used for RDOPT instead of 3 modes */
2964         if(IHEVCE_QUALITY_P3 > ps_ctxt->i4_quality_preset)
2965         {
2966             if(ps_ctxt->i1_slice_type != BSLICE)
2967             {
2968                 num_cands = (4 * MAX_INTRA_CU_CANDIDATES) + 2;
2969             }
2970             else
2971             {
2972                 num_cands = (2 * MAX_INTRA_CU_CANDIDATES);
2973             }
2974         }
2975         else
2976         {
2977             num_cands = MAX_INTRA_CU_CANDIDATES;
2978         }
2979         u2_num_tus_in_cu = 4;
2980     }
2981     else
2982     {
2983         /* should not enter here */
2984         ASSERT(0);
2985     }
2986 
2987     if(ps_ctxt->i1_cu_qp_delta_enable)
2988     {
2989         WORD32 i4_act_counter = 0, i4_act_counter_lamda = 0;
2990         if(ps_cu_analyse->u1_cu_size == 64)
2991         {
2992             ASSERT(
2993                 (trans_size == 32) || (trans_size == 16) || (trans_size == 8) || (trans_size == 4));
2994             i4_act_counter = (trans_size == 16) + 2 * ((trans_size == 8) || (trans_size == 4));
2995             i4_act_counter_lamda = 3;
2996         }
2997         else if(ps_cu_analyse->u1_cu_size == 32)
2998         {
2999             ASSERT(
3000                 (trans_size == 32) || (trans_size == 16) || (trans_size == 8) || (trans_size == 4));
3001             i4_act_counter = (trans_size == 16) + 2 * ((trans_size == 8) || (trans_size == 4));
3002             i4_act_counter_lamda = 0;
3003         }
3004         else if(ps_cu_analyse->u1_cu_size == 16)
3005         {
3006             ASSERT((trans_size == 16) || (trans_size == 8) || (trans_size == 4));
3007             i4_act_counter = (trans_size == 8) || (trans_size == 4);
3008             i4_act_counter_lamda = 0;
3009         }
3010         else if(ps_cu_analyse->u1_cu_size == 8)
3011         {
3012             ASSERT((trans_size == 8) || (trans_size == 4));
3013             i4_act_counter = 1;
3014             i4_act_counter_lamda = 0;
3015         }
3016         else
3017         {
3018             ASSERT(0);
3019         }
3020         if(ps_ctxt->i4_use_ctb_level_lamda)
3021         {
3022             ihevce_compute_cu_level_QP(
3023                 ps_ctxt, ps_cu_analyse->i4_act_factor[i4_act_counter][1], -1, 0);
3024         }
3025         else
3026         {
3027             ihevce_compute_cu_level_QP(
3028                 ps_ctxt,
3029                 ps_cu_analyse->i4_act_factor[i4_act_counter][1],
3030                 ps_cu_analyse->i4_act_factor[i4_act_counter_lamda][1],
3031                 0);
3032         }
3033 
3034         ps_cu_analyse->i1_cu_qp = ps_ctxt->i4_cu_qp;
3035     }
3036     if(u1_is_cu_noisy && !ps_ctxt->u1_enable_psyRDOPT)
3037     {
3038         ps_ctxt->i8_cl_ssd_lambda_qf =
3039             ((float)ps_ctxt->i8_cl_ssd_lambda_qf * (100.0f - RDOPT_LAMBDA_DISCOUNT_WHEN_NOISY) /
3040              100.0f);
3041         ps_ctxt->i8_cl_ssd_lambda_chroma_qf =
3042             ((float)ps_ctxt->i8_cl_ssd_lambda_chroma_qf *
3043              (100.0f - RDOPT_LAMBDA_DISCOUNT_WHEN_NOISY) / 100.0f);
3044     }
3045 
3046     u1_compute_spatial_ssd = (ps_ctxt->i4_cu_qp <= MAX_QP_WHERE_SPATIAL_SSD_ENABLED) &&
3047                              (ps_ctxt->i4_quality_preset < IHEVCE_QUALITY_P3) &&
3048                              CONVERT_SSDS_TO_SPATIAL_DOMAIN;
3049 
3050     if(u1_is_cu_noisy || ps_ctxt->u1_enable_psyRDOPT)
3051     {
3052         u1_compute_spatial_ssd = (ps_ctxt->i4_cu_qp <= MAX_HEVC_QP) &&
3053                                  CONVERT_SSDS_TO_SPATIAL_DOMAIN;
3054     }
3055 
3056     /* populate the neigbours */
3057     pu1_left = (UWORD8 *)pv_cu_left;
3058     pu1_top = (UWORD8 *)pv_cu_top;
3059     pu1_top_left = (UWORD8 *)pv_cu_top_left;
3060     left_strd = cu_left_stride;
3061     num_4x4_in_tu = (trans_size >> 2);
3062     num_4x4_in_cu = (ps_cu_analyse->u1_cu_size >> 2);
3063     chrm_present_flag = 1;
3064     ecd_data_bytes_cons = 0;
3065     cu_bits = 0;
3066 
3067     /* get the 4x4 level postion of current cu */
3068     cu_pos_x = cu_pos_x << 1;
3069     cu_pos_y = cu_pos_y << 1;
3070 
3071     /* pouplate cu level params knowing that current is intra */
3072     ps_final_prms->u1_skip_flag = 0;
3073     ps_final_prms->u1_intra_flag = PRED_MODE_INTRA;
3074     ps_final_prms->u2_num_pus_in_cu = 1;
3075     /*init the is_cu_coded flag*/
3076     ps_final_prms->u1_is_cu_coded = 0;
3077     ps_final_prms->u4_cu_sad = 0;
3078 
3079     ps_final_prms->as_pu_enc_loop[0].b1_intra_flag = PRED_MODE_INTRA;
3080     ps_final_prms->as_pu_enc_loop[0].b4_wd = (trans_size >> 1) - 1;
3081     ps_final_prms->as_pu_enc_loop[0].b4_ht = (trans_size >> 1) - 1;
3082     ps_final_prms->as_pu_enc_loop[0].b4_pos_x = cu_pos_x;
3083     ps_final_prms->as_pu_enc_loop[0].b4_pos_y = cu_pos_y;
3084     ps_final_prms->as_pu_enc_loop[0].b1_merge_flag = 0;
3085 
3086     ps_final_prms->as_col_pu_enc_loop[0].b1_intra_flag = 1;
3087 
3088     /*copy qp directly as intra cant be skip*/
3089     ps_nbr_4x4->b8_qp = ps_ctxt->i4_cu_qp;
3090     ps_nbr_4x4->mv.s_l0_mv.i2_mvx = 0;
3091     ps_nbr_4x4->mv.s_l0_mv.i2_mvy = 0;
3092     ps_nbr_4x4->mv.s_l1_mv.i2_mvx = 0;
3093     ps_nbr_4x4->mv.s_l1_mv.i2_mvy = 0;
3094     ps_nbr_4x4->mv.i1_l0_ref_pic_buf_id = -1;
3095     ps_nbr_4x4->mv.i1_l1_ref_pic_buf_id = -1;
3096     ps_nbr_4x4->mv.i1_l0_ref_idx = -1;
3097     ps_nbr_4x4->mv.i1_l1_ref_idx = -1;
3098 
3099     /* RDOPT copy States :  TU init (best until prev TU) to current */
3100     memcpy(
3101         &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[curr_buf_idx]
3102              .s_cabac_ctxt.au1_ctxt_models[0],
3103         &ps_ctxt->au1_rdopt_init_ctxt_models[0],
3104         IHEVC_CAB_COEFFX_PREFIX);
3105 
3106     /* RDOPT copy States :update to init state if 0 cbf */
3107     memcpy(
3108         &au1_intra_nxn_rdopt_ctxt_models[0][0],
3109         &ps_ctxt->au1_rdopt_init_ctxt_models[0],
3110         IHEVC_CAB_COEFFX_PREFIX);
3111     memcpy(
3112         &au1_intra_nxn_rdopt_ctxt_models[1][0],
3113         &ps_ctxt->au1_rdopt_init_ctxt_models[0],
3114         IHEVC_CAB_COEFFX_PREFIX);
3115 
3116     /* loop for all partitions in CU  blocks */
3117     for(ctr = 0; ctr < num_cu_parts; ctr++)
3118     {
3119         UWORD8 *pu1_curr_mode;
3120         WORD32 cand_ctr;
3121         WORD32 nbr_flags;
3122 
3123         /* for NxN case to track the best mode       */
3124         /* for other cases zeroth index will be used */
3125         intra_prev_rem_flags_t as_intra_prev_rem[2];
3126         LWORD64 ai8_cand_rdopt_cost[2];
3127         UWORD32 au4_tu_sad[2];
3128         WORD32 ai4_tu_bits[2];
3129         WORD32 ai4_cbf[2];
3130         WORD32 ai4_curr_bytes[2];
3131         WORD32 ai4_zero_col[2];
3132         WORD32 ai4_zero_row[2];
3133         /* To store the pred, coeff and dequant for TU_EQ_SUBCU case (since mul.
3134         cand. are there) ping-pong buffer to store the best and current */
3135         UWORD8 au1_cur_pred_data[2][MIN_TU_SIZE * MIN_TU_SIZE];
3136         UWORD8 au1_intra_coeffs[2][MAX_SCAN_COEFFS_BYTES_4x4];
3137         WORD16 ai2_intra_deq_coeffs[2][MIN_TU_SIZE * MIN_TU_SIZE];
3138         /* Context models stored for RDopt store and restore purpose */
3139 
3140         UWORD8 au1_recon_availability[2];
3141 
3142         WORD32 best_cand_idx = 0;
3143         LWORD64 best_cand_cost = MAX_COST_64;
3144         /* counters to toggle b/w best and current */
3145         WORD32 best_intra_buf_idx = 1;
3146         WORD32 curr_intra_buf_idx = 0;
3147 
3148         /* copy the mode pointer to be used in inner loop */
3149         pu1_curr_mode = pu1_luma_mode;
3150 
3151         /* get the neighbour availability flags */
3152         nbr_flags = ihevce_get_nbr_intra(
3153             &s_nbr,
3154             ps_ctxt->pu1_ctb_nbr_map,
3155             ps_ctxt->i4_nbr_map_strd,
3156             cu_pos_x,
3157             cu_pos_y,
3158             num_4x4_in_tu);
3159 
3160         /* copy the nbr flags for chroma reuse */
3161         if(4 != trans_size)
3162         {
3163             *pu4_nbr_flags = nbr_flags;
3164         }
3165         else if(1 == chrm_present_flag)
3166         {
3167             /* compute the avail flags assuming luma trans is 8x8 */
3168             /* get the neighbour availability flags */
3169             *pu4_nbr_flags = ihevce_get_nbr_intra_mxn_tu(
3170                 ps_ctxt->pu1_ctb_nbr_map,
3171                 ps_ctxt->i4_nbr_map_strd,
3172                 cu_pos_x,
3173                 cu_pos_y,
3174                 (num_4x4_in_tu << 1),
3175                 (num_4x4_in_tu << 1));
3176         }
3177 
3178         u1_compute_recon = !u1_compute_spatial_ssd && ((num_cu_parts > 1) && (ctr < 3));
3179 
3180         if(!ctr && (u1_compute_spatial_ssd || u1_compute_recon))
3181         {
3182             ps_recon_datastore->u1_is_lumaRecon_available = 1;
3183         }
3184         else if(!ctr)
3185         {
3186             ps_recon_datastore->u1_is_lumaRecon_available = 0;
3187         }
3188 
3189         ihevc_intra_pred_luma_ref_substitution_fptr =
3190             ps_ctxt->ps_func_selector->ihevc_intra_pred_luma_ref_substitution_fptr;
3191 
3192         /* call reference array substitution */
3193         ihevc_intra_pred_luma_ref_substitution_fptr(
3194             pu1_top_left,
3195             pu1_top,
3196             pu1_left,
3197             left_strd,
3198             trans_size,
3199             nbr_flags,
3200             (UWORD8 *)ps_ctxt->pv_ref_sub_out,
3201             1);
3202 
3203         /* Intra Mode gating based on MPM cand list and encoder quality preset */
3204         if((ps_ctxt->i1_slice_type != ISLICE) && (TU_EQ_SUBCU == func_proc_mode) &&
3205            (ps_ctxt->i4_quality_preset >= IHEVCE_QUALITY_P3))
3206         {
3207             ihevce_mpm_idx_based_filter_RDOPT_cand(
3208                 ps_ctxt,
3209                 ps_cu_analyse,
3210                 ps_left_nbr_4x4,
3211                 ps_top_nbr_4x4,
3212                 pu1_luma_mode,
3213                 &ps_cu_analyse->s_cu_intra_cand.au1_nxn_eval_mark[ctr][0]);
3214         }
3215 
3216         if((TU_EQ_SUBCU == func_proc_mode) && (ps_ctxt->i4_quality_preset < IHEVCE_QUALITY_P3) &&
3217            (ps_cu_analyse->s_cu_intra_cand.au1_num_modes_added[ctr] >= MAX_INTRA_CU_CANDIDATES))
3218         {
3219             WORD32 ai4_mpm_mode_list[3];
3220             WORD32 i;
3221 
3222             WORD32 i4_curr_index = ps_cu_analyse->s_cu_intra_cand.au1_num_modes_added[ctr];
3223 
3224             ihevce_populate_intra_pred_mode(
3225                 ps_top_nbr_4x4->b6_luma_intra_mode,
3226                 ps_tmp_lt_4x4->b6_luma_intra_mode,
3227                 s_nbr.u1_top_avail,
3228                 s_nbr.u1_left_avail,
3229                 cu_pos_y,
3230                 &ai4_mpm_mode_list[0]);
3231 
3232             for(i = 0; i < 3; i++)
3233             {
3234                 if(ps_cu_analyse->s_cu_intra_cand
3235                        .au1_intra_luma_mode_nxn_hash[ctr][ai4_mpm_mode_list[i]] == 0)
3236                 {
3237                     ASSERT(ai4_mpm_mode_list[i] < 35);
3238 
3239                     ps_cu_analyse->s_cu_intra_cand
3240                         .au1_intra_luma_mode_nxn_hash[ctr][ai4_mpm_mode_list[i]] = 1;
3241                     pu1_luma_mode[i4_curr_index] = ai4_mpm_mode_list[i];
3242                     ps_cu_analyse->s_cu_intra_cand.au1_num_modes_added[ctr]++;
3243                     i4_curr_index++;
3244                 }
3245             }
3246 
3247             pu1_luma_mode[i4_curr_index] = 255;
3248         }
3249 
3250         /* loop over candidates for each partition */
3251         for(cand_ctr = 0; cand_ctr < num_cands; cand_ctr++)
3252         {
3253             WORD32 curr_pred_mode;
3254             WORD32 bits = 0;
3255             LWORD64 curr_cost;
3256             WORD32 luma_pred_func_idx;
3257             UWORD8 *pu1_curr_ecd_data;
3258             WORD16 *pi2_curr_deq_data;
3259             WORD32 curr_deq_data_strd;
3260             WORD32 pred_strd;
3261             UWORD8 *pu1_pred;
3262 
3263             /* if NXN case the recon and ecd data is stored in temp buffers */
3264             if(TU_EQ_SUBCU == func_proc_mode)
3265             {
3266                 pu1_pred = &au1_cur_pred_data[curr_intra_buf_idx][0];
3267                 pred_strd = trans_size;
3268                 pu1_curr_ecd_data = &au1_intra_coeffs[curr_intra_buf_idx][0];
3269                 pi2_curr_deq_data = &ai2_intra_deq_coeffs[curr_intra_buf_idx][0];
3270                 curr_deq_data_strd = trans_size;
3271 
3272                 ASSERT(trans_size == MIN_TU_SIZE);
3273             }
3274             else
3275             {
3276                 pu1_pred = (UWORD8 *)pv_pred_org;
3277                 pred_strd = pred_strd_org;
3278                 pu1_curr_ecd_data = pu1_ecd_data;
3279                 pi2_curr_deq_data = pi2_deq_data;
3280                 curr_deq_data_strd = deq_data_strd;
3281             }
3282 
3283             pu1_recon = ((UWORD8 *)ps_recon_datastore->apv_luma_recon_bufs[curr_intra_buf_idx]) +
3284                         (ctr & 1) * trans_size + (ctr > 1) * trans_size * i4_recon_stride;
3285 
3286             if(is_sub_pu_in_hq == 1)
3287             {
3288                 curr_pred_mode = cand_ctr;
3289             }
3290             else
3291             {
3292                 curr_pred_mode = pu1_curr_mode[cand_ctr];
3293             }
3294 
3295             /* If the candidate mode is 255, then break */
3296             if(255 == curr_pred_mode)
3297             {
3298                 break;
3299             }
3300             else if(250 == curr_pred_mode)
3301             {
3302                 continue;
3303             }
3304 
3305             /* check if this mode needs to be evaluated or not. For 2nx2n cases, this   */
3306             /* function will be called once per candidate, so this check has been done  */
3307             /* outside this function call. For NxN case, this function will be called   */
3308             /* only once, and all the candidates will be evaluated here.                */
3309             if(ps_ctxt->i4_quality_preset >= IHEVCE_QUALITY_P3)
3310             {
3311                 if((TU_EQ_SUBCU == func_proc_mode) &&
3312                    (0 == ps_cu_analyse->s_cu_intra_cand.au1_nxn_eval_mark[ctr][cand_ctr]))
3313                 {
3314                     continue;
3315                 }
3316             }
3317 
3318             /* call reference filtering */
3319             ps_ctxt->ps_func_selector->ihevc_intra_pred_ref_filtering_fptr(
3320                 (UWORD8 *)ps_ctxt->pv_ref_sub_out,
3321                 trans_size,
3322                 (UWORD8 *)ps_ctxt->pv_ref_filt_out,
3323                 curr_pred_mode,
3324                 ps_ctxt->i1_strong_intra_smoothing_enable_flag);
3325 
3326             /* use the look up to get the function idx */
3327             luma_pred_func_idx = g_i4_ip_funcs[curr_pred_mode];
3328 
3329             /* call the intra prediction function */
3330             ps_ctxt->apf_lum_ip[luma_pred_func_idx](
3331                 (UWORD8 *)ps_ctxt->pv_ref_filt_out,
3332                 1,
3333                 pu1_pred,
3334                 pred_strd,
3335                 trans_size,
3336                 curr_pred_mode);
3337 
3338             /* populate the coeffs scan idx */
3339             ps_ctxt->i4_scan_idx = SCAN_DIAG_UPRIGHT;
3340 
            /* for luma 4x4 and 8x8 transforms, the scan is chosen based on the intra pred mode */
3342             if(trans_size < 16)
3343             {
3344                 /* for modes from 22 upto 30 horizontal scan is used */
3345                 if((curr_pred_mode > 21) && (curr_pred_mode < 31))
3346                 {
3347                     ps_ctxt->i4_scan_idx = SCAN_HORZ;
3348                 }
                /* for modes from 6 upto 14 vertical scan is used */
3350                 else if((curr_pred_mode > 5) && (curr_pred_mode < 15))
3351                 {
3352                     ps_ctxt->i4_scan_idx = SCAN_VERT;
3353                 }
3354             }
3355 
3356             /* RDOPT copy States :  TU init (best until prev TU) to current */
3357             COPY_CABAC_STATES_FRM_CAB_COEFFX_PREFIX(
3358                 &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[curr_buf_idx]
3359                         .s_cabac_ctxt.au1_ctxt_models[0] +
3360                     IHEVC_CAB_COEFFX_PREFIX,
3361                 &ps_ctxt->au1_rdopt_init_ctxt_models[0] + IHEVC_CAB_COEFFX_PREFIX,
3362                 IHEVC_CAB_CTXT_END - IHEVC_CAB_COEFFX_PREFIX);
3363 
3364             i4_perform_rdoq = ps_ctxt->s_rdoq_sbh_ctxt.i4_perform_all_cand_rdoq;
3365             i4_perform_sbh = ps_ctxt->s_rdoq_sbh_ctxt.i4_perform_all_cand_sbh;
3366 
3367 #if DISABLE_RDOQ_INTRA
3368             i4_perform_rdoq = 0;
3369 #endif
3370 
            /* 2 multi-dimensional arrays of rounding factors (based on trans size) to be added here */
            /* The arrays hold rounding factors corresponding to the 0-1 decision and the 1-2 decision */
            /* Currently the complete array will contain only a single value */
3374             /*The rounding factor is calculated with the formula
3375             Deadzone val = (((R1 - R0) * (2^(-8/3)) * lamMod) + 1)/2
3376             rounding factor = (1 - DeadZone Val)
3377 
3378             Assumption: Cabac states of All the sub-blocks in the TU are considered independent
3379             */
3380             if((ps_ctxt->i4_quant_rounding_level != FIXED_QUANT_ROUNDING))
3381             {
3382                 if((ps_ctxt->i4_quant_rounding_level == TU_LEVEL_QUANT_ROUNDING) && (ctr != 0))
3383                 {
3384                     double i4_lamda_modifier;
3385 
3386                     if((BSLICE == ps_ctxt->i1_slice_type) && (ps_ctxt->i4_temporal_layer_id))
3387                     {
3388                         i4_lamda_modifier =
3389                             ps_ctxt->i4_lamda_modifier *
3390                             CLIP3((((double)(ps_ctxt->i4_cu_qp - 12)) / 6.0), 2.00, 4.00);
3391                     }
3392                     else
3393                     {
3394                         i4_lamda_modifier = ps_ctxt->i4_lamda_modifier;
3395                     }
3396                     if(ps_ctxt->i4_use_const_lamda_modifier)
3397                     {
3398                         if(ISLICE == ps_ctxt->i1_slice_type)
3399                         {
3400                             i4_lamda_modifier = ps_ctxt->f_i_pic_lamda_modifier;
3401                         }
3402                         else
3403                         {
3404                             i4_lamda_modifier = CONST_LAMDA_MOD_VAL;
3405                         }
3406                     }
3407 
3408                     ps_ctxt->pi4_quant_round_factor_tu_0_1[trans_size >> 3] =
3409                         &ps_ctxt->i4_quant_round_tu[0][0];
3410                     ps_ctxt->pi4_quant_round_factor_tu_1_2[trans_size >> 3] =
3411                         &ps_ctxt->i4_quant_round_tu[1][0];
3412 
3413                     memset(
3414                         ps_ctxt->pi4_quant_round_factor_tu_0_1[trans_size >> 3],
3415                         0,
3416                         trans_size * trans_size * sizeof(WORD32));
3417                     memset(
3418                         ps_ctxt->pi4_quant_round_factor_tu_1_2[trans_size >> 3],
3419                         0,
3420                         trans_size * trans_size * sizeof(WORD32));
3421 
3422                     ihevce_quant_rounding_factor_gen(
3423                         trans_size,
3424                         1,
3425                         &ps_ctxt->s_rdopt_entropy_ctxt,
3426                         ps_ctxt->pi4_quant_round_factor_tu_0_1[trans_size >> 3],
3427                         ps_ctxt->pi4_quant_round_factor_tu_1_2[trans_size >> 3],
3428                         i4_lamda_modifier,
3429                         1);
3430                 }
3431                 else
3432                 {
3433                     ps_ctxt->pi4_quant_round_factor_tu_0_1[trans_size >> 3] =
3434                         ps_ctxt->pi4_quant_round_factor_cu_ctb_0_1[trans_size >> 3];
3435                     ps_ctxt->pi4_quant_round_factor_tu_1_2[trans_size >> 3] =
3436                         ps_ctxt->pi4_quant_round_factor_cu_ctb_1_2[trans_size >> 3];
3437                 }
3438             }
3439 
3440             /* call T Q IT IQ and recon function */
3441             ai4_cbf[curr_intra_buf_idx] = ihevce_t_q_iq_ssd_scan_fxn(
3442                 ps_ctxt,
3443                 pu1_pred,
3444                 pred_strd,
3445                 (UWORD8 *)pv_curr_src,
3446                 src_strd,
3447                 pi2_curr_deq_data,
3448                 curr_deq_data_strd,
3449                 pu1_recon,
3450                 i4_recon_stride,
3451                 pu1_curr_ecd_data,
3452                 pu1_csbf_buf,
3453                 csbf_strd,
3454                 trans_size,
3455                 PRED_MODE_INTRA,
3456                 &ai8_cand_rdopt_cost[curr_intra_buf_idx],
3457                 &ai4_curr_bytes[curr_intra_buf_idx],
3458                 &ai4_tu_bits[curr_intra_buf_idx],
3459                 &au4_tu_sad[curr_intra_buf_idx],
3460                 &ai4_zero_col[curr_intra_buf_idx],
3461                 &ai4_zero_row[curr_intra_buf_idx],
3462                 &au1_recon_availability[curr_intra_buf_idx],
3463                 i4_perform_rdoq,
3464                 i4_perform_sbh,
3465 #if USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
3466                 i4_alpha_stim_multiplier,
3467                 u1_is_cu_noisy,
3468 #endif
3469                 u1_compute_spatial_ssd ? SPATIAL_DOMAIN_SSD : FREQUENCY_DOMAIN_SSD,
3470                 1 /*early_cbf */
3471             );
3472 
3473 #if COMPUTE_NOISE_TERM_AT_THE_TU_LEVEL && !USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
3474             if(u1_is_cu_noisy && i4_alpha_stim_multiplier)
3475             {
3476 #if !USE_RECON_TO_EVALUATE_STIM_IN_RDOPT
3477                 ai8_cand_rdopt_cost[curr_intra_buf_idx] = ihevce_inject_stim_into_distortion(
3478                     pv_curr_src,
3479                     src_strd,
3480                     pu1_pred,
3481                     pred_strd,
3482                     ai8_cand_rdopt_cost[curr_intra_buf_idx],
3483                     i4_alpha_stim_multiplier,
3484                     trans_size,
3485                     0,
3486                     ps_ctxt->u1_enable_psyRDOPT,
3487                     NULL_PLANE);
3488 #else
3489                 if(u1_compute_spatial_ssd && au1_recon_availability[curr_intra_buf_idx])
3490                 {
3491                     ai8_cand_rdopt_cost[curr_intra_buf_idx] = ihevce_inject_stim_into_distortion(
3492                         pv_curr_src,
3493                         src_strd,
3494                         pu1_recon,
3495                         i4_recon_stride,
3496                         ai8_cand_rdopt_cost[curr_intra_buf_idx],
3497                         i4_alpha_stim_multiplier,
3498                         trans_size,
3499                         0,
3500                         ps_ctxt->u1_enable_psyRDOPT,
3501                         NULL_PLANE);
3502                 }
3503                 else
3504                 {
3505                     ai8_cand_rdopt_cost[curr_intra_buf_idx] = ihevce_inject_stim_into_distortion(
3506                         pv_curr_src,
3507                         src_strd,
3508                         pu1_pred,
3509                         pred_strd,
3510                         ai8_cand_rdopt_cost[curr_intra_buf_idx],
3511                         i4_alpha_stim_multiplier,
3512                         trans_size,
3513                         0,
3514                         ps_ctxt->u1_enable_psyRDOPT,
3515                         NULL_PLANE);
3516                 }
3517 #endif
3518             }
3519 #endif
3520 
3521             if(TU_EQ_SUBCU == func_proc_mode)
3522             {
3523                 ASSERT(ai4_curr_bytes[curr_intra_buf_idx] < MAX_SCAN_COEFFS_BYTES_4x4);
3524             }
3525 
3526             /* based on CBF/No CBF copy the corresponding state */
3527             if(0 == ai4_cbf[curr_intra_buf_idx])
3528             {
3529                 /* RDOPT copy States :update to init state if 0 cbf */
3530                 COPY_CABAC_STATES_FRM_CAB_COEFFX_PREFIX(
3531                     &au1_intra_nxn_rdopt_ctxt_models[curr_intra_buf_idx][0] +
3532                         IHEVC_CAB_COEFFX_PREFIX,
3533                     &ps_ctxt->au1_rdopt_init_ctxt_models[0] + IHEVC_CAB_COEFFX_PREFIX,
3534                     IHEVC_CAB_CTXT_END - IHEVC_CAB_COEFFX_PREFIX);
3535             }
3536             else
3537             {
3538                 /* RDOPT copy States :update to new state only if CBF is non zero */
3539                 COPY_CABAC_STATES_FRM_CAB_COEFFX_PREFIX(
3540                     &au1_intra_nxn_rdopt_ctxt_models[curr_intra_buf_idx][0] +
3541                         IHEVC_CAB_COEFFX_PREFIX,
3542                     &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[curr_buf_idx]
3543                             .s_cabac_ctxt.au1_ctxt_models[0] +
3544                         IHEVC_CAB_COEFFX_PREFIX,
3545                     IHEVC_CAB_CTXT_END - IHEVC_CAB_COEFFX_PREFIX);
3546             }
3547 
3548             /* call the function which perform intra mode prediction */
3549             ihevce_intra_pred_mode_signaling(
3550                 ps_top_nbr_4x4->b6_luma_intra_mode,
3551                 ps_tmp_lt_4x4->b6_luma_intra_mode,
3552                 s_nbr.u1_top_avail,
3553                 s_nbr.u1_left_avail,
3554                 cu_pos_y,
3555                 curr_pred_mode,
3556                 &as_intra_prev_rem[curr_intra_buf_idx]);
3557             /******************************************************************/
3558             /* PREV INTRA LUMA FLAG, MPM MODE and REM INTRA MODE bits for I_NxN
3559             The bits for these are evaluated for every RDO mode of current subcu
3560             as they can significantly contribute to RDO cost.  Note that these
3561             bits are not accounted for here (ai8_cand_rdopt_cost) as they
3562             are accounted for in encode_cu call later */
3563 
3564             /******************************************************************/
3565             /* PREV INTRA LUMA FLAG, MPM MODE and REM INTRA MODE bits for I_NxN
3566             The bits for these are evaluated for every RDO mode of current subcu
3567             as they can significantly contribute to RDO cost.  Note that these
3568             bits are not accounted for here (ai8_cand_rdopt_cost) as they
3569             are accounted for in encode_cu call later */
3570 
3571             /* Estimate bits to encode prev rem flag  for NXN mode */
3572             {
3573                 WORD32 bits_frac = gau2_ihevce_cabac_bin_to_bits
3574                     [u1_prev_flag_cabac_ctxt ^
3575                      as_intra_prev_rem[curr_intra_buf_idx].b1_prev_intra_luma_pred_flag];
3576 
3577                 /* rounding the fractional bits to nearest integer */
3578                 bits = ((bits_frac + (1 << (CABAC_FRAC_BITS_Q - 1))) >> CABAC_FRAC_BITS_Q);
3579             }
3580 
3581             /* based on prev flag all the mpmidx bits and rem bits */
3582             if(1 == as_intra_prev_rem[curr_intra_buf_idx].b1_prev_intra_luma_pred_flag)
3583             {
3584                 /* mpm_idx */
3585                 bits += as_intra_prev_rem[curr_intra_buf_idx].b2_mpm_idx ? 2 : 1;
3586             }
3587             else
3588             {
3589                 /* rem intra mode */
3590                 bits += 5;
3591             }
3592 
3593             bits += ai4_tu_bits[curr_intra_buf_idx];
3594 
3595             /* compute the total cost for current candidate */
3596             curr_cost = ai8_cand_rdopt_cost[curr_intra_buf_idx];
3597 
3598             /* get the final ssd cost */
3599             curr_cost +=
3600                 COMPUTE_RATE_COST_CLIP30(bits, ps_ctxt->i8_cl_ssd_lambda_qf, LAMBDA_Q_SHIFT);
3601 
            /* check for the best candidate cost */
3603             if(curr_cost < best_cand_cost)
3604             {
3605                 best_cand_cost = curr_cost;
3606                 best_cand_idx = cand_ctr;
3607                 best_intra_buf_idx = curr_intra_buf_idx;
3608                 curr_intra_buf_idx = !curr_intra_buf_idx;
3609             }
3610         }
3611 
3612         /***************    For TU_EQ_SUBCU case    *****************/
3613         /* Copy the pred for best cand. to the final pred array     */
3614         /* Copy the iq-coeff for best cand. to the final array      */
3615         /* copy the best coeffs data to final buffer                */
3616         if(TU_EQ_SUBCU == func_proc_mode)
3617         {
3618             /* Copy the pred for best cand. to the final pred array */
3619 
3620             ps_ctxt->s_cmn_opt_func.pf_copy_2d(
3621                 (UWORD8 *)pv_pred_org,
3622                 pred_strd_org,
3623                 &au1_cur_pred_data[best_intra_buf_idx][0],
3624                 trans_size,
3625                 trans_size,
3626                 trans_size);
3627 
3628             /* Copy the deq-coeff for best cand. to the final array */
3629 
3630             ps_ctxt->s_cmn_opt_func.pf_copy_2d(
3631                 (UWORD8 *)pi2_deq_data,
3632                 deq_data_strd << 1,
3633                 (UWORD8 *)&ai2_intra_deq_coeffs[best_intra_buf_idx][0],
3634                 trans_size << 1,
3635                 trans_size << 1,
3636                 trans_size);
3637             /* copy the coeffs to final cu ecd bytes buffer */
3638             memcpy(
3639                 pu1_ecd_data,
3640                 &au1_intra_coeffs[best_intra_buf_idx][0],
3641                 ai4_curr_bytes[best_intra_buf_idx]);
3642 
3643             pu1_recon = ((UWORD8 *)ps_recon_datastore->apv_luma_recon_bufs[best_intra_buf_idx]) +
3644                         (ctr & 1) * trans_size + (ctr > 1) * trans_size * i4_recon_stride;
3645         }
3646 
3647         /*----------   Calculate Recon for the best INTRA mode     ---------*/
3648         /* TU_EQ_CU case : No need for recon, otherwise recon is required   */
3649         /* Compute recon only for the best mode for TU_EQ_SUBCU case        */
3650         if(u1_compute_recon)
3651         {
3652             ihevce_it_recon_fxn(
3653                 ps_ctxt,
3654                 pi2_deq_data,
3655                 deq_data_strd,
3656                 (UWORD8 *)pv_pred_org,
3657                 pred_strd_org,
3658                 pu1_recon,
3659                 i4_recon_stride,
3660                 pu1_ecd_data,
3661                 trans_size,
3662                 PRED_MODE_INTRA,
3663                 ai4_cbf[best_intra_buf_idx],
3664                 ai4_zero_col[best_intra_buf_idx],
3665                 ai4_zero_row[best_intra_buf_idx]);
3666 
3667             ps_recon_datastore->au1_bufId_with_winning_LumaRecon[ctr] = best_intra_buf_idx;
3668         }
3669         else if(u1_compute_spatial_ssd && au1_recon_availability[best_intra_buf_idx])
3670         {
3671             ps_recon_datastore->au1_bufId_with_winning_LumaRecon[ctr] = best_intra_buf_idx;
3672         }
3673         else
3674         {
3675             ps_recon_datastore->au1_bufId_with_winning_LumaRecon[ctr] = UCHAR_MAX;
3676         }
3677 
3678         /* RDOPT copy States :update to best modes state */
3679         COPY_CABAC_STATES_FRM_CAB_COEFFX_PREFIX(
3680             &ps_ctxt->au1_rdopt_init_ctxt_models[0] + IHEVC_CAB_COEFFX_PREFIX,
3681             &au1_intra_nxn_rdopt_ctxt_models[best_intra_buf_idx][0] + IHEVC_CAB_COEFFX_PREFIX,
3682             IHEVC_CAB_CTXT_END - IHEVC_CAB_COEFFX_PREFIX);
3683 
3684         /* copy the prev,mpm_idx and rem modes from best cand */
3685         ps_final_prms->as_intra_prev_rem[ctr] = as_intra_prev_rem[best_intra_buf_idx];
3686 
3687         /* update the cabac context of prev intra pred mode flag */
3688         u1_prev_flag_cabac_ctxt = gau1_ihevc_next_state
3689             [(u1_prev_flag_cabac_ctxt << 1) |
3690              as_intra_prev_rem[best_intra_buf_idx].b1_prev_intra_luma_pred_flag];
3691 
3692         /* accumulate the TU bits into cu bits */
3693         cu_bits += ai4_tu_bits[best_intra_buf_idx];
3694 
3695         /* copy the intra pred mode for chroma reuse */
3696         if(is_sub_pu_in_hq == 0)
3697         {
3698             *pu1_intra_pred_mode = pu1_curr_mode[best_cand_idx];
3699         }
3700         else
3701         {
3702             *pu1_intra_pred_mode = best_cand_idx;
3703         }
3704 
3705         /* Store luma mode as chroma mode. If chroma prcs happens, and
3706         if a diff. mode wins, it should update this!! */
3707         if(1 == chrm_present_flag)
3708         {
3709             if(is_sub_pu_in_hq == 0)
3710             {
3711                 ps_final_prms->u1_chroma_intra_pred_actual_mode =
3712                     ((ps_ctxt->u1_chroma_array_type == 2)
3713                          ? gau1_chroma422_intra_angle_mapping[pu1_curr_mode[best_cand_idx]]
3714                          : pu1_curr_mode[best_cand_idx]);
3715             }
3716             else
3717             {
3718                 ps_final_prms->u1_chroma_intra_pred_actual_mode =
3719                     ((ps_ctxt->u1_chroma_array_type == 2)
3720                          ? gau1_chroma422_intra_angle_mapping[best_cand_idx]
3721                          : best_cand_idx);
3722             }
3723 
3724             ps_final_prms->u1_chroma_intra_pred_mode = 4;
3725         }
3726 
3727         /*remember the cbf flag to replicate qp for 4x4 neighbour*/
3728         ps_final_prms->u1_is_cu_coded |= ai4_cbf[best_intra_buf_idx];
3729 
3730         /*accumulate ssd over all TU of intra CU*/
3731         ps_final_prms->u4_cu_sad += au4_tu_sad[best_intra_buf_idx];
3732 
3733         /* update the bytes */
3734         ps_final_prms->as_tu_enc_loop[ctr].i4_luma_coeff_offset = ecd_data_bytes_cons;
3735         ps_final_prms->as_tu_enc_loop_temp_prms[ctr].i2_luma_bytes_consumed =
3736             ai4_curr_bytes[best_intra_buf_idx];
3737         /* update the zero_row and col info for the final mode */
3738         ps_final_prms->as_tu_enc_loop_temp_prms[ctr].u4_luma_zero_col =
3739             ai4_zero_col[best_intra_buf_idx];
3740         ps_final_prms->as_tu_enc_loop_temp_prms[ctr].u4_luma_zero_row =
3741             ai4_zero_row[best_intra_buf_idx];
3742 
3743         ps_final_prms->as_tu_enc_loop[ctr].i4_luma_coeff_offset = ecd_data_bytes_cons;
3744 
3745         /* update the total bytes cons */
3746         ecd_data_bytes_cons += ai4_curr_bytes[best_intra_buf_idx];
3747         pu1_ecd_data += ai4_curr_bytes[best_intra_buf_idx];
3748 
3749         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_y_cbf = ai4_cbf[best_intra_buf_idx];
3750         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_cb_cbf = 0;
3751         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_cr_cbf = 0;
3752         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_cb_cbf_subtu1 = 0;
3753         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_cr_cbf_subtu1 = 0;
3754         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b3_chroma_intra_mode_idx = chrm_present_flag;
3755         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b7_qp = ps_ctxt->i4_cu_qp;
3756         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_first_tu_in_cu = 0;
3757         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_transquant_bypass = 0;
3758         GETRANGE(tx_size, trans_size);
3759         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b3_size = tx_size - 3;
3760         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b4_pos_x = cu_pos_x;
3761         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b4_pos_y = cu_pos_y;
3762 
        /* replicate the nbr 4x4 structure for all 4x4 blocks of the current TU */
3764         ps_nbr_4x4->b1_skip_flag = 0;
3765         ps_nbr_4x4->b1_intra_flag = 1;
3766         ps_nbr_4x4->b1_pred_l0_flag = 0;
3767         ps_nbr_4x4->b1_pred_l1_flag = 0;
3768 
3769         if(is_sub_pu_in_hq == 0)
3770         {
3771             ps_nbr_4x4->b6_luma_intra_mode = pu1_curr_mode[best_cand_idx];
3772         }
3773         else
3774         {
3775             ps_nbr_4x4->b6_luma_intra_mode = best_cand_idx;
3776         }
3777 
3778         ps_nbr_4x4->b1_y_cbf = ai4_cbf[best_intra_buf_idx];
3779 
3780         /* since tu size can be less than cusize, replication is done with strd */
3781         {
3782             WORD32 i, j;
3783             nbr_4x4_t *ps_tmp_4x4;
3784 
3785             ps_tmp_4x4 = ps_nbr_4x4;
3786 
3787             for(i = 0; i < num_4x4_in_tu; i++)
3788             {
3789                 for(j = 0; j < num_4x4_in_tu; j++)
3790                 {
3791                     ps_tmp_4x4[j] = *ps_nbr_4x4;
3792                 }
3793                 /* row level update*/
3794                 ps_tmp_4x4 += num_4x4_in_cu;
3795             }
3796         }
3797 
3798         if(TU_EQ_SUBCU == func_proc_mode)
3799         {
3800             pu1_luma_mode += ((MAX_INTRA_CU_CANDIDATES * 4) + 2 + 1);
3801         }
3802 
3803         if((num_cu_parts > 1) && (ctr < 3))
3804         {
3805             /* set the neighbour map to 1 */
3806             ihevce_set_nbr_map(
3807                 ps_ctxt->pu1_ctb_nbr_map,
3808                 ps_ctxt->i4_nbr_map_strd,
3809                 cu_pos_x,
3810                 cu_pos_y,
3811                 trans_size >> 2,
3812                 1);
3813 
3814             /* block level updates block number (1 & 3 )*/
3815             pv_curr_src = (UWORD8 *)pv_curr_src + trans_size;
3816             pv_pred_org = (UWORD8 *)pv_pred_org + trans_size;
3817             pi2_deq_data += trans_size;
3818 
3819             switch(ctr)
3820             {
3821             case 0:
3822             {
3823                 pu1_left = pu1_recon + trans_size - 1;
3824                 pu1_top += trans_size;
3825                 pu1_top_left = pu1_top - 1;
3826                 left_strd = i4_recon_stride;
3827 
3828                 break;
3829             }
3830             case 1:
3831             {
3832                 ASSERT(
3833                     (ps_recon_datastore->au1_bufId_with_winning_LumaRecon[0] == 0) ||
3834                     (ps_recon_datastore->au1_bufId_with_winning_LumaRecon[0] == 1));
3835 
3836                 /* Since the 'lumaRefSubstitution' function expects both Top and */
3837                 /* TopRight recon pixels to be present in the same buffer */
3838                 if(ps_recon_datastore->au1_bufId_with_winning_LumaRecon[0] !=
3839                    ps_recon_datastore->au1_bufId_with_winning_LumaRecon[1])
3840                 {
3841                     UWORD8 *pu1_src =
3842                         ((UWORD8 *)ps_recon_datastore->apv_luma_recon_bufs
3843                              [ps_recon_datastore->au1_bufId_with_winning_LumaRecon[1]]) +
3844                         trans_size;
3845                     UWORD8 *pu1_dst =
3846                         ((UWORD8 *)ps_recon_datastore->apv_luma_recon_bufs
3847                              [ps_recon_datastore->au1_bufId_with_winning_LumaRecon[0]]) +
3848                         trans_size;
3849 
3850                     ps_ctxt->s_cmn_opt_func.pf_copy_2d(
3851                         pu1_dst, i4_recon_stride, pu1_src, i4_recon_stride, trans_size, trans_size);
3852 
3853                     ps_recon_datastore->au1_bufId_with_winning_LumaRecon[1] =
3854                         ps_recon_datastore->au1_bufId_with_winning_LumaRecon[0];
3855                 }
3856 
3857                 pu1_left = (UWORD8 *)pv_cu_left + trans_size * cu_left_stride;
3858                 pu1_top = ((UWORD8 *)ps_recon_datastore->apv_luma_recon_bufs
3859                                [ps_recon_datastore->au1_bufId_with_winning_LumaRecon[0]]) +
3860                           (trans_size - 1) * i4_recon_stride;
3861                 pu1_top_left = pu1_left - cu_left_stride;
3862                 left_strd = cu_left_stride;
3863 
3864                 break;
3865             }
3866             case 2:
3867             {
3868                 ASSERT(
3869                     (ps_recon_datastore->au1_bufId_with_winning_LumaRecon[1] == 0) ||
3870                     (ps_recon_datastore->au1_bufId_with_winning_LumaRecon[1] == 1));
3871 
3872                 pu1_left = pu1_recon + trans_size - 1;
3873                 pu1_top = ((UWORD8 *)ps_recon_datastore->apv_luma_recon_bufs
3874                                [ps_recon_datastore->au1_bufId_with_winning_LumaRecon[1]]) +
3875                           (trans_size - 1) * i4_recon_stride + trans_size;
3876                 pu1_top_left = pu1_top - 1;
3877                 left_strd = i4_recon_stride;
3878 
3879                 break;
3880             }
3881             }
3882 
3883             pu1_csbf_buf += num_4x4_in_tu;
3884             cu_pos_x += num_4x4_in_tu;
3885             ps_nbr_4x4 += num_4x4_in_tu;
3886             ps_top_nbr_4x4 += num_4x4_in_tu;
3887             ps_tmp_lt_4x4 = ps_nbr_4x4 - 1;
3888 
3889             pu1_intra_pred_mode++;
3890 
3891             /* after 2 blocks increment the pointers to bottom blocks */
3892             if(1 == ctr)
3893             {
3894                 pv_curr_src = (UWORD8 *)pv_curr_src - (trans_size << 1);
3895                 pv_curr_src = (UWORD8 *)pv_curr_src + (trans_size * src_strd);
3896 
3897                 pv_pred_org = (UWORD8 *)pv_pred_org - (trans_size << 1);
3898                 pv_pred_org = (UWORD8 *)pv_pred_org + (trans_size * pred_strd_org);
3899                 pi2_deq_data -= (trans_size << 1);
3900                 pi2_deq_data += (trans_size * deq_data_strd);
3901 
3902                 pu1_csbf_buf -= (num_4x4_in_tu << 1);
3903                 pu1_csbf_buf += (num_4x4_in_tu * csbf_strd);
3904 
3905                 ps_nbr_4x4 -= (num_4x4_in_tu << 1);
3906                 ps_nbr_4x4 += (num_4x4_in_tu * num_4x4_in_cu);
3907                 ps_top_nbr_4x4 = ps_nbr_4x4 - num_4x4_in_cu;
3908                 ps_tmp_lt_4x4 = ps_left_nbr_4x4 + (num_4x4_in_tu * nbr_4x4_left_strd);
3909 
3910                 /* decrement pos x to start */
3911                 cu_pos_x -= (num_4x4_in_tu << 1);
3912                 cu_pos_y += num_4x4_in_tu;
3913             }
3914         }
3915 
3916 #if RDOPT_ENABLE
3917         /* compute the RDOPT cost for the current TU */
3918         ai8_cand_rdopt_cost[best_intra_buf_idx] += COMPUTE_RATE_COST_CLIP30(
3919             ai4_tu_bits[best_intra_buf_idx], ps_ctxt->i8_cl_ssd_lambda_qf, LAMBDA_Q_SHIFT);
3920 #endif
3921 
3922         /* accumulate the costs */
3923         total_rdopt_cost += ai8_cand_rdopt_cost[best_intra_buf_idx];
3924 
3925         if(ps_ctxt->i4_bitrate_instance_num || ps_ctxt->i4_num_bitrates == 1)
3926         {
3927             /* Early exit : If the current running cost exceeds
3928             the prev. best mode cost, break */
3929             if(total_rdopt_cost > prev_best_rdopt_cost)
3930             {
3931                 return (total_rdopt_cost);
3932             }
3933         }
3934 
3935         /* if transfrom size is 4x4 then only first luma 4x4 will have chroma*/
3936         chrm_present_flag = (4 != trans_size) ? 1 : INTRA_PRED_CHROMA_IDX_NONE;
3937 
3938         pu4_nbr_flags++;
3939     }
3940     /* Modify the cost function for this CU. */
3941     /* loop in for 8x8 blocks */
3942     if(ps_ctxt->u1_enable_psyRDOPT)
3943     {
3944         UWORD8 *pu1_recon_cu;
3945         WORD32 recon_stride;
3946         WORD32 curr_pos_x;
3947         WORD32 curr_pos_y;
3948         WORD32 start_index;
3949         WORD32 num_horz_cu_in_ctb;
3950         WORD32 cu_size;
3951         WORD32 had_block_size;
3952 
3953         /* tODO: sreenivasa ctb size has to be used appropriately */
3954         had_block_size = 8;
3955         cu_size = ps_cu_analyse->u1_cu_size; /* todo */
3956         num_horz_cu_in_ctb = 64 / had_block_size;
3957 
3958         curr_pos_x = ps_cu_analyse->b3_cu_pos_x << 3; /* pel units */
3959         curr_pos_y = ps_cu_analyse->b3_cu_pos_y << 3; /* pel units */
3960         recon_stride = ps_final_prms->s_recon_datastore.i4_lumaRecon_stride;
3961         pu1_recon_cu =
3962             ((UWORD8 *)ps_final_prms->s_recon_datastore
3963                  .apv_luma_recon_bufs[ps_recon_datastore->au1_bufId_with_winning_LumaRecon[0]]);
3964         /* + \  curr_pos_x + curr_pos_y * recon_stride; */
3965 
3966         /* start index to index the source satd of curr cu int he current ctb*/
3967         start_index =
3968             (curr_pos_x / had_block_size) + (curr_pos_y / had_block_size) * num_horz_cu_in_ctb;
3969 
3970         {
3971             total_rdopt_cost += ihevce_psy_rd_cost(
3972                 ps_ctxt->ai4_source_satd_8x8,
3973                 pu1_recon_cu,
3974                 recon_stride,
3975                 1,  //
3976                 cu_size,
3977                 0,  // pic type
3978                 0,  //layer id
3979                 ps_ctxt->i4_satd_lamda,  // lambda
3980                 start_index,
3981                 ps_ctxt->u1_is_input_data_hbd,
3982                 ps_ctxt->u4_psy_strength,
3983                 &ps_ctxt->s_cmn_opt_func
3984 
3985             );  // 8 bit
3986         }
3987     }
3988 
3989 #if !FORCE_INTRA_TU_DEPTH_TO_0  //RATIONALISE_NUM_RDO_MODES_IN_PQ_AND_HQ
3990     if(TU_EQ_SUBCU == func_proc_mode)
3991     {
3992         UWORD8 au1_tu_eq_cu_div2_modes[4];
3993         UWORD8 au1_freq_of_mode[4];
3994 
3995         WORD32 i4_num_clusters = ihevce_find_num_clusters_of_identical_points_1D(
3996             ps_final_prms->au1_intra_pred_mode, au1_tu_eq_cu_div2_modes, au1_freq_of_mode, 4);
3997 
3998         if(1 == i4_num_clusters)
3999         {
4000             ps_final_prms->u2_num_pus_in_cu = 1;
4001             ps_final_prms->u1_part_mode = SIZE_2Nx2N;
4002         }
4003     }
4004 #endif
4005 
4006     /* store the num TUs*/
4007     ps_final_prms->u2_num_tus_in_cu = u2_num_tus_in_cu;
4008 
4009     /* update the bytes consumed */
4010     ps_final_prms->i4_num_bytes_ecd_data = ecd_data_bytes_cons;
4011 
4012     /* store the current cu size to final prms */
4013     ps_final_prms->u1_cu_size = ps_cu_analyse->u1_cu_size;
4014 
4015     /* cu bits will be having luma residual bits till this point    */
4016     /* if zero_cbf eval is disabled then cu bits will be zero       */
4017     ps_final_prms->u4_cu_luma_res_bits = cu_bits;
4018 
4019     /* ------------- Chroma processing -------------- */
4020     /* Chroma rdopt eval for each luma candidate only for HIGH QUALITY/MEDIUM SPEDD preset*/
4021     if(1 == ps_ctxt->s_chroma_rdopt_ctxt.u1_eval_chrm_rdopt)
4022     {
4023         LWORD64 chrm_rdopt_cost;
4024         WORD32 chrm_rdopt_tu_bits;
4025 
4026         /* Store the current RDOPT cost to enable early exit in chrom_prcs */
4027         ps_ctxt->as_cu_prms[curr_buf_idx].i8_curr_rdopt_cost = total_rdopt_cost;
4028 
4029         chrm_rdopt_cost = ihevce_chroma_cu_prcs_rdopt(
4030             ps_ctxt,
4031             curr_buf_idx,
4032             func_proc_mode,
4033             ps_chrm_cu_buf_prms->pu1_curr_src,
4034             ps_chrm_cu_buf_prms->i4_chrm_src_stride,
4035             ps_chrm_cu_buf_prms->pu1_cu_left,
4036             ps_chrm_cu_buf_prms->pu1_cu_top,
4037             ps_chrm_cu_buf_prms->pu1_cu_top_left,
4038             ps_chrm_cu_buf_prms->i4_cu_left_stride,
4039             cu_pos_x_8pelunits,
4040             cu_pos_y_8pelunits,
4041             &chrm_rdopt_tu_bits,
4042             i4_alpha_stim_multiplier,
4043             u1_is_cu_noisy);
4044 
4045 #if WEIGH_CHROMA_COST
4046         chrm_rdopt_cost = (LWORD64)(
4047             (chrm_rdopt_cost * ps_ctxt->u4_chroma_cost_weighing_factor +
4048              (1 << (CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT - 1))) >>
4049             CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT);
4050 #endif
4051 
4052 #if CHROMA_RDOPT_ENABLE
4053         total_rdopt_cost += chrm_rdopt_cost;
4054 #endif
4055         cu_bits += chrm_rdopt_tu_bits;
4056 
4057         /* cu bits for chroma residual if chroma rdopt is on       */
4058         /* if zero_cbf eval is disabled then cu bits will be zero  */
4059         ps_final_prms->u4_cu_chroma_res_bits = chrm_rdopt_tu_bits;
4060 
4061         if(ps_ctxt->i4_bitrate_instance_num || ps_ctxt->i4_num_bitrates == 1)
4062         {
4063             /* Early exit : If the current running cost exceeds
4064             the prev. best mode cost, break */
4065             if(total_rdopt_cost > prev_best_rdopt_cost)
4066             {
4067                 return (total_rdopt_cost);
4068             }
4069         }
4070     }
4071     else
4072     {}
4073 
4074     /* RDOPT copy States :  Best after all luma TUs to current */
4075     COPY_CABAC_STATES_FRM_CAB_COEFFX_PREFIX(
4076         &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[curr_buf_idx]
4077                 .s_cabac_ctxt.au1_ctxt_models[0] +
4078             IHEVC_CAB_COEFFX_PREFIX,
4079         &ps_ctxt->au1_rdopt_init_ctxt_models[0] + IHEVC_CAB_COEFFX_PREFIX,
4080         IHEVC_CAB_CTXT_END - IHEVC_CAB_COEFFX_PREFIX);
4081 
4082     /* get the neighbour availability flags for current cu  */
4083     ihevce_get_only_nbr_flag(
4084         &s_nbr,
4085         ps_ctxt->pu1_ctb_nbr_map,
4086         ps_ctxt->i4_nbr_map_strd,
4087         (cu_pos_x_8pelunits << 1),
4088         (cu_pos_y_8pelunits << 1),
4089         (trans_size << 1),
4090         (trans_size << 1));
4091 
4092     /* call the entropy rdo encode to get the bit estimate for current cu */
4093     /*if ZERO_CBF eval is enabled then this function will return only CU header bits */
4094     {
4095         /*cbf_bits will account for both texture and cbf bits when zero cbf eval flag is 0*/
4096         WORD32 cbf_bits, header_bits;
4097 
4098         header_bits = ihevce_entropy_rdo_encode_cu(
4099             &ps_ctxt->s_rdopt_entropy_ctxt,
4100             ps_final_prms,
4101             cu_pos_x_8pelunits,
4102             cu_pos_y_8pelunits,
4103             ps_cu_analyse->u1_cu_size,
4104             s_nbr.u1_top_avail,
4105             s_nbr.u1_left_avail,
4106             &ps_final_prms->pu1_cu_coeffs[0],
4107             &cbf_bits);
4108 
4109         cu_bits += header_bits;
4110 
4111         /* cbf bits are excluded from header bits, instead considered as texture bits */
4112         /* incase if zero cbf eval is disabled then texture bits gets added here */
4113         ps_final_prms->u4_cu_hdr_bits = (header_bits - cbf_bits);
4114         ps_final_prms->u4_cu_cbf_bits = cbf_bits;
4115 
4116 #if RDOPT_ENABLE
4117         /* add the cost of coding the cu bits */
4118         total_rdopt_cost +=
4119             COMPUTE_RATE_COST_CLIP30(header_bits, ps_ctxt->i8_cl_ssd_lambda_qf, LAMBDA_Q_SHIFT);
4120 #endif
4121     }
4122     return (total_rdopt_cost);
4123 }
4124 /*!
4125 ******************************************************************************
4126 * \if Function name : ihevce_inter_rdopt_cu_ntu \endif
4127 *
4128 * \brief
4129 *    Inter Coding unit function which performs the TQ IT IQ recon for luma
4130 *
4131 * \param[in] ps_ctxt       enc_loop module ctxt pointer
4132 * \param[in] ps_inter_cand pointer to inter candidate structure
4133 * \param[in] pv_src        pointer to source data buffer
4134 * \param[in] cu_size       Current CU size
4135 * \param[in] cu_pos_x      cu position x w.r.t to ctb
4136 * \param[in] cu_pos_y      cu position y w.r.t to ctb
4137 * \param[in] ps_cu_prms    pointer to CU params structure (provides the luma source buffer stride)
4138 * \param[in] curr_buf_idx  buffer index for current output storage
4139 * \param[in] ps_chrm_cu_buf_prms pointer to chroma buffer pointers structure
4140 *
4141 * \return
4142 *    Rdopt cost
4143 *
4144 * \author
4145 *  Ittiam
4146 *
4147 *****************************************************************************
4148 */
ihevce_inter_rdopt_cu_ntu(ihevce_enc_loop_ctxt_t * ps_ctxt,enc_loop_cu_prms_t * ps_cu_prms,void * pv_src,WORD32 cu_size,WORD32 cu_pos_x,WORD32 cu_pos_y,WORD32 curr_buf_idx,enc_loop_chrm_cu_buf_prms_t * ps_chrm_cu_buf_prms,cu_inter_cand_t * ps_inter_cand,cu_analyse_t * ps_cu_analyse,WORD32 i4_alpha_stim_multiplier)4149 LWORD64 ihevce_inter_rdopt_cu_ntu(
4150     ihevce_enc_loop_ctxt_t *ps_ctxt,
4151     enc_loop_cu_prms_t *ps_cu_prms,
4152     void *pv_src,
4153     WORD32 cu_size,
4154     WORD32 cu_pos_x,
4155     WORD32 cu_pos_y,
4156     WORD32 curr_buf_idx,
4157     enc_loop_chrm_cu_buf_prms_t *ps_chrm_cu_buf_prms,
4158     cu_inter_cand_t *ps_inter_cand,
4159     cu_analyse_t *ps_cu_analyse,
4160     WORD32 i4_alpha_stim_multiplier)
4161 {
4162     enc_loop_cu_final_prms_t *ps_final_prms;
4163     nbr_4x4_t *ps_nbr_4x4;
4164     tu_prms_t s_tu_prms[64 * 4];
4165     tu_prms_t *ps_tu_prms;
4166 
4167     WORD32 i4_perform_rdoq;
4168     WORD32 i4_perform_sbh;
4169     WORD32 ai4_tu_split_flags[4];
4170     WORD32 ai4_tu_early_cbf[4];
4171     WORD32 num_split_flags = 1;
4172     WORD32 i;
4173     UWORD8 u1_tu_size;
4174     UWORD8 *pu1_pred;
4175     UWORD8 *pu1_ecd_data;
4176     WORD16 *pi2_deq_data;
4177     UWORD8 *pu1_csbf_buf;
4178     UWORD8 *pu1_tu_sz_sft;
4179     UWORD8 *pu1_tu_posx;
4180     UWORD8 *pu1_tu_posy;
4181     LWORD64 total_rdopt_cost;
4182     WORD32 ctr;
4183     WORD32 chrm_ctr;
4184     WORD32 num_tu_in_cu = 0;
4185     WORD32 pred_stride;
4186     WORD32 recon_stride;
4187     WORD32 trans_size = ps_cu_analyse->u1_cu_size;
4188     WORD32 csbf_strd;
4189     WORD32 chrm_present_flag;
4190     WORD32 ecd_data_bytes_cons;
4191     WORD32 num_4x4_in_cu;
4192     WORD32 num_4x4_in_tu;
4193     WORD32 recon_func_mode;
4194     WORD32 cu_bits;
4195     UWORD8 u1_compute_spatial_ssd;
4196 
4197     /* min_trans_size is initialized to some huge number than usual TU sizes */
4198     WORD32 i4_min_trans_size = 256;
4199     /* Get the RDOPT cost of the best CU mode for early_exit */
4200     LWORD64 prev_best_rdopt_cost = ps_ctxt->as_cu_prms[!curr_buf_idx].i8_best_rdopt_cost;
4201     WORD32 src_strd = ps_cu_prms->i4_luma_src_stride;
4202 
4203     /* model for no residue syntax qt root cbf flag */
4204     UWORD8 u1_qtroot_cbf_cabac_model = ps_ctxt->au1_rdopt_init_ctxt_models[IHEVC_CAB_NORES_IDX];
4205 
4206     /* backup copy of cabac states for restoration if zero cu reside rdo wins later */
4207     UWORD8 au1_rdopt_init_ctxt_models[IHEVC_CAB_CTXT_END];
4208 
4209     /* for skip cases tables are not reqquired */
4210     UWORD8 u1_skip_tu_sz_sft = 0;
4211     UWORD8 u1_skip_tu_posx = 0;
4212     UWORD8 u1_skip_tu_posy = 0;
4213     UWORD8 u1_is_cu_noisy = ps_cu_prms->u1_is_cu_noisy;
4214 
4215     /* get the pointers based on curbuf idx */
4216     ps_final_prms = &ps_ctxt->as_cu_prms[curr_buf_idx];
4217     ps_nbr_4x4 = &ps_ctxt->as_cu_nbr[curr_buf_idx][0];
4218     pu1_ecd_data = &ps_final_prms->pu1_cu_coeffs[0];
4219     pi2_deq_data = &ps_final_prms->pi2_cu_deq_coeffs[0];
4220     csbf_strd = ps_ctxt->i4_cu_csbf_strd;
4221     pu1_csbf_buf = &ps_ctxt->au1_cu_csbf[0];
4222 
4223     pred_stride = ps_inter_cand->i4_pred_data_stride;
4224     recon_stride = cu_size;
4225     pu1_pred = ps_inter_cand->pu1_pred_data;
4226     chrm_ctr = 0;
4227     ecd_data_bytes_cons = 0;
4228     total_rdopt_cost = 0;
4229     num_4x4_in_cu = cu_size >> 2;
4230     recon_func_mode = PRED_MODE_INTER;
4231     cu_bits = 0;
4232 
4233     /* get the 4x4 level postion of current cu */
4234     cu_pos_x = cu_pos_x << 1;
4235     cu_pos_y = cu_pos_y << 1;
4236 
4237     /* default value for cu coded flag */
4238     ps_final_prms->u1_is_cu_coded = 0;
4239 
4240     /*init of ssd of CU accuumulated over all TU*/
4241     ps_final_prms->u4_cu_sad = 0;
4242 
4243     /* populate the coeffs scan idx */
4244     ps_ctxt->i4_scan_idx = SCAN_DIAG_UPRIGHT;
4245 
4246 #if ENABLE_INTER_ZCU_COST
4247     /* reset cu not coded cost */
4248     ps_ctxt->i8_cu_not_coded_cost = 0;
4249 
4250     /* backup copy of cabac states for restoration if zero cu reside rdo wins later */
4251     memcpy(au1_rdopt_init_ctxt_models, &ps_ctxt->au1_rdopt_init_ctxt_models[0], IHEVC_CAB_CTXT_END);
4252 #endif
4253 
4254     if(ps_cu_analyse->u1_cu_size == 64)
4255     {
4256         num_split_flags = 4;
4257         u1_tu_size = 32;
4258     }
4259     else
4260     {
4261         num_split_flags = 1;
4262         u1_tu_size = ps_cu_analyse->u1_cu_size;
4263     }
4264 
4265     /* ckeck for skip mode */
4266     if(1 == ps_final_prms->u1_skip_flag)
4267     {
4268         if(64 == cu_size)
4269         {
4270             /* TU = CU/2 is set but no trnaform is evaluated  */
4271             num_tu_in_cu = 4;
4272             pu1_tu_sz_sft = &gau1_inter_tu_shft_amt[0];
4273             pu1_tu_posx = &gau1_inter_tu_posx_scl_amt[0];
4274             pu1_tu_posy = &gau1_inter_tu_posy_scl_amt[0];
4275         }
4276         else
4277         {
4278             /* TU = CU is set but no trnaform is evaluated  */
4279             num_tu_in_cu = 1;
4280             pu1_tu_sz_sft = &u1_skip_tu_sz_sft;
4281             pu1_tu_posx = &u1_skip_tu_posx;
4282             pu1_tu_posy = &u1_skip_tu_posy;
4283         }
4284 
4285         recon_func_mode = PRED_MODE_SKIP;
4286     }
4287     /* check for PU part mode being AMP or No AMP */
4288     else if(ps_final_prms->u1_part_mode < SIZE_2NxnU)
4289     {
4290         if((SIZE_2Nx2N == ps_final_prms->u1_part_mode) && (cu_size < 64))
4291         {
4292             /* TU= CU is evaluated 2Nx2N inter case */
4293             num_tu_in_cu = 1;
4294             pu1_tu_sz_sft = &u1_skip_tu_sz_sft;
4295             pu1_tu_posx = &u1_skip_tu_posx;
4296             pu1_tu_posy = &u1_skip_tu_posy;
4297         }
4298         else
4299         {
4300             /* currently TU= CU/2 is evaluated for all inter case */
4301             num_tu_in_cu = 4;
4302             pu1_tu_sz_sft = &gau1_inter_tu_shft_amt[0];
4303             pu1_tu_posx = &gau1_inter_tu_posx_scl_amt[0];
4304             pu1_tu_posy = &gau1_inter_tu_posy_scl_amt[0];
4305         }
4306     }
4307     else
4308     {
4309         /* for AMP cases one level of TU recurssion is done */
4310         /* based on oreintation of the partitions           */
4311         num_tu_in_cu = 10;
4312         pu1_tu_sz_sft = &gau1_inter_tu_shft_amt_amp[ps_final_prms->u1_part_mode - 4][0];
4313         pu1_tu_posx = &gau1_inter_tu_posx_scl_amt_amp[ps_final_prms->u1_part_mode - 4][0];
4314         pu1_tu_posy = &gau1_inter_tu_posy_scl_amt_amp[ps_final_prms->u1_part_mode - 4][0];
4315     }
4316 
4317     ps_tu_prms = &s_tu_prms[0];
4318     num_tu_in_cu = 0;
4319 
4320     for(i = 0; i < num_split_flags; i++)
4321     {
4322         WORD32 i4_x_off = 0, i4_y_off = 0;
4323 
4324         if(i == 1 || i == 3)
4325         {
4326             i4_x_off = 32;
4327         }
4328 
4329         if(i == 2 || i == 3)
4330         {
4331             i4_y_off = 32;
4332         }
4333 
4334         if(1 == ps_final_prms->u1_skip_flag)
4335         {
4336             ai4_tu_split_flags[0] = 0;
4337             ps_inter_cand->ai4_tu_split_flag[i] = 0;
4338 
4339             ai4_tu_early_cbf[0] = 0;
4340         }
4341         else
4342         {
4343             ai4_tu_split_flags[0] = ps_inter_cand->ai4_tu_split_flag[i];
4344             ai4_tu_early_cbf[0] = ps_inter_cand->ai4_tu_early_cbf[i];
4345         }
4346 
4347         ps_tu_prms->u1_tu_size = u1_tu_size;
4348 
4349         ps_tu_prms = (tu_prms_t *)ihevce_tu_tree_update(
4350             ps_tu_prms,
4351             &num_tu_in_cu,
4352             0,
4353             ai4_tu_split_flags[0],
4354             ai4_tu_early_cbf[0],
4355             i4_x_off,
4356             i4_y_off);
4357     }
4358 
4359     /* loop for all tu blocks in current cu */
4360     ps_tu_prms = &s_tu_prms[0];
4361     for(ctr = 0; ctr < num_tu_in_cu; ctr++)
4362     {
4363         trans_size = ps_tu_prms->u1_tu_size;
4364 
4365         if(i4_min_trans_size > trans_size)
4366         {
4367             i4_min_trans_size = trans_size;
4368         }
4369         ps_tu_prms++;
4370     }
4371 
4372     if(ps_ctxt->i1_cu_qp_delta_enable)
4373     {
4374         WORD32 i4_act_counter = 0, i4_act_counter_lamda = 0;
4375 
4376         if(ps_cu_analyse->u1_cu_size == 64)
4377         {
4378             ASSERT(
4379                 (i4_min_trans_size == 32) || (i4_min_trans_size == 16) ||
4380                 (i4_min_trans_size == 8) || (i4_min_trans_size == 4));
4381             i4_act_counter = (i4_min_trans_size == 16) +
4382                              2 * ((i4_min_trans_size == 8) || (i4_min_trans_size == 4));
4383             i4_act_counter_lamda = 3;
4384         }
4385         else if(ps_cu_analyse->u1_cu_size == 32)
4386         {
4387             ASSERT(
4388                 (i4_min_trans_size == 32) || (i4_min_trans_size == 16) ||
4389                 (i4_min_trans_size == 8) || (i4_min_trans_size == 4));
4390             i4_act_counter = (i4_min_trans_size == 16) +
4391                              2 * ((i4_min_trans_size == 8) || (i4_min_trans_size == 4));
4392             i4_act_counter_lamda = 0;
4393         }
4394         else if(ps_cu_analyse->u1_cu_size == 16)
4395         {
4396             ASSERT(
4397                 (i4_min_trans_size == 16) || (i4_min_trans_size == 8) || (i4_min_trans_size == 4));
4398             i4_act_counter = (i4_min_trans_size == 8) || (i4_min_trans_size == 4);
4399             i4_act_counter_lamda = 0;
4400         }
4401         else if(ps_cu_analyse->u1_cu_size == 8)
4402         {
4403             ASSERT((i4_min_trans_size == 8) || (i4_min_trans_size == 4));
4404             i4_act_counter = 1;
4405             i4_act_counter_lamda = 0;
4406         }
4407         else
4408         {
4409             ASSERT(0);
4410         }
4411         if(ps_ctxt->i4_use_ctb_level_lamda)
4412         {
4413             ihevce_compute_cu_level_QP(
4414                 ps_ctxt, ps_cu_analyse->i4_act_factor[i4_act_counter][0], -1, 0);
4415         }
4416         else
4417         {
4418             ihevce_compute_cu_level_QP(
4419                 ps_ctxt,
4420                 ps_cu_analyse->i4_act_factor[i4_act_counter][0],
4421                 ps_cu_analyse->i4_act_factor[i4_act_counter_lamda][0],
4422                 0);
4423         }
4424 
4425         ps_cu_analyse->i1_cu_qp = ps_ctxt->i4_cu_qp;
4426     }
4427     if(u1_is_cu_noisy && !ps_ctxt->u1_enable_psyRDOPT)
4428     {
4429         ps_ctxt->i8_cl_ssd_lambda_qf =
4430             ((float)ps_ctxt->i8_cl_ssd_lambda_qf * (100.0f - RDOPT_LAMBDA_DISCOUNT_WHEN_NOISY) /
4431              100.0f);
4432         ps_ctxt->i8_cl_ssd_lambda_chroma_qf =
4433             ((float)ps_ctxt->i8_cl_ssd_lambda_chroma_qf *
4434              (100.0f - RDOPT_LAMBDA_DISCOUNT_WHEN_NOISY) / 100.0f);
4435     }
4436 
4437     u1_compute_spatial_ssd = (ps_ctxt->i4_cu_qp <= MAX_QP_WHERE_SPATIAL_SSD_ENABLED) &&
4438                              (ps_ctxt->i4_quality_preset < IHEVCE_QUALITY_P3) &&
4439                              CONVERT_SSDS_TO_SPATIAL_DOMAIN;
4440 
4441     if(u1_is_cu_noisy || ps_ctxt->u1_enable_psyRDOPT)
4442     {
4443         u1_compute_spatial_ssd = (ps_ctxt->i4_cu_qp <= MAX_HEVC_QP) &&
4444                                  CONVERT_SSDS_TO_SPATIAL_DOMAIN;
4445     }
4446 
4447     if(!u1_compute_spatial_ssd)
4448     {
4449         ps_final_prms->s_recon_datastore.u1_is_lumaRecon_available = 0;
4450         ps_final_prms->s_recon_datastore.au1_is_chromaRecon_available[0] = 0;
4451     }
4452     else
4453     {
4454         ps_final_prms->s_recon_datastore.u1_is_lumaRecon_available = 1;
4455     }
4456 
4457     ps_tu_prms = &s_tu_prms[0];
4458 
4459     ASSERT(num_tu_in_cu <= 256);
4460 
4461     /* RDOPT copy States :  TU init (best until prev TU) to current */
4462     memcpy(
4463         &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[curr_buf_idx]
4464              .s_cabac_ctxt.au1_ctxt_models[0],
4465         &ps_ctxt->au1_rdopt_init_ctxt_models[0],
4466         IHEVC_CAB_COEFFX_PREFIX);
4467 
4468     for(ctr = 0; ctr < num_tu_in_cu; ctr++)
4469     {
4470         WORD32 curr_bytes;
4471         WORD32 tx_size;
4472         WORD32 cbf, zero_col, zero_row;
4473         LWORD64 rdopt_cost;
4474         UWORD8 u1_is_recon_available;
4475 
4476         WORD32 curr_pos_x;
4477         WORD32 curr_pos_y;
4478         nbr_4x4_t *ps_cur_nbr_4x4;
4479         UWORD8 *pu1_cur_pred;
4480         UWORD8 *pu1_cur_src;
4481         UWORD8 *pu1_cur_recon;
4482         WORD16 *pi2_cur_deq_data;
4483         UWORD32 u4_tu_sad;
4484         WORD32 tu_bits;
4485 
4486         WORD32 i4_recon_stride = ps_final_prms->s_recon_datastore.i4_lumaRecon_stride;
4487 
4488         trans_size = ps_tu_prms->u1_tu_size;
4489         /* get the current pos x and pos y in pixels */
4490         curr_pos_x = ps_tu_prms->u1_x_off;  //((cu_size >> 2) * pu1_tu_posx[ctr]);
4491         curr_pos_y = ps_tu_prms->u1_y_off;  //((cu_size >> 2) * pu1_tu_posy[ctr]);
4492 
4493         num_4x4_in_tu = trans_size >> 2;
4494 
4495 #if FORCE_8x8_TFR
4496         if(cu_size == 64)
4497         {
4498             curr_pos_x = ((cu_size >> 3) * pu1_tu_posx[ctr]);
4499             curr_pos_y = ((cu_size >> 3) * pu1_tu_posy[ctr]);
4500         }
4501 #endif
4502 
4503         /* increment the pointers to start of current TU  */
4504         pu1_cur_src = ((UWORD8 *)pv_src + curr_pos_x);
4505         pu1_cur_src += (curr_pos_y * src_strd);
4506         pu1_cur_pred = (pu1_pred + curr_pos_x);
4507         pu1_cur_pred += (curr_pos_y * pred_stride);
4508         pi2_cur_deq_data = pi2_deq_data + curr_pos_x;
4509         pi2_cur_deq_data += (curr_pos_y * cu_size);
4510         pu1_cur_recon = ((UWORD8 *)ps_final_prms->s_recon_datastore.apv_luma_recon_bufs[0]) +
4511                         curr_pos_x + curr_pos_y * i4_recon_stride;
4512 
4513         ps_cur_nbr_4x4 = (ps_nbr_4x4 + (curr_pos_x >> 2));
4514         ps_cur_nbr_4x4 += ((curr_pos_y >> 2) * num_4x4_in_cu);
4515 
4516         /* RDOPT copy States :  TU init (best until prev TU) to current */
4517         COPY_CABAC_STATES_FRM_CAB_COEFFX_PREFIX(
4518             &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[curr_buf_idx]
4519                     .s_cabac_ctxt.au1_ctxt_models[0] +
4520                 IHEVC_CAB_COEFFX_PREFIX,
4521             &ps_ctxt->au1_rdopt_init_ctxt_models[0] + IHEVC_CAB_COEFFX_PREFIX,
4522             IHEVC_CAB_CTXT_END - IHEVC_CAB_COEFFX_PREFIX);
4523 
4524         i4_perform_rdoq = ps_ctxt->s_rdoq_sbh_ctxt.i4_perform_all_cand_rdoq;
4525         i4_perform_sbh = ps_ctxt->s_rdoq_sbh_ctxt.i4_perform_all_cand_sbh;
4526 
4527         /*2 Multi- dimensinal array based on trans size  of rounding factor to be added here */
4528         /* arrays are for rounding factor corr. to 0-1 decision and 1-2 decision */
4529         /* Currently the complete array will contain only single value*/
4530         /*The rounding factor is calculated with the formula
4531         Deadzone val = (((R1 - R0) * (2^(-8/3)) * lamMod) + 1)/2
4532         rounding factor = (1 - DeadZone Val)
4533 
4534         Assumption: Cabac states of All the sub-blocks in the TU are considered independent
4535         */
4536         if((ps_ctxt->i4_quant_rounding_level == TU_LEVEL_QUANT_ROUNDING) && (ctr != 0))
4537         {
4538             double i4_lamda_modifier;
4539 
4540             if((BSLICE == ps_ctxt->i1_slice_type) && (ps_ctxt->i4_temporal_layer_id))
4541             {
4542                 i4_lamda_modifier = ps_ctxt->i4_lamda_modifier *
4543                                     CLIP3((((double)(ps_ctxt->i4_cu_qp - 12)) / 6.0), 2.00, 4.00);
4544             }
4545             else
4546             {
4547                 i4_lamda_modifier = ps_ctxt->i4_lamda_modifier;
4548             }
4549             if(ps_ctxt->i4_use_const_lamda_modifier)
4550             {
4551                 if(ISLICE == ps_ctxt->i1_slice_type)
4552                 {
4553                     i4_lamda_modifier = ps_ctxt->f_i_pic_lamda_modifier;
4554                 }
4555                 else
4556                 {
4557                     i4_lamda_modifier = CONST_LAMDA_MOD_VAL;
4558                 }
4559             }
4560             ps_ctxt->pi4_quant_round_factor_tu_0_1[trans_size >> 3] =
4561                 &ps_ctxt->i4_quant_round_tu[0][0];
4562             ps_ctxt->pi4_quant_round_factor_tu_1_2[trans_size >> 3] =
4563                 &ps_ctxt->i4_quant_round_tu[1][0];
4564 
4565             memset(
4566                 ps_ctxt->pi4_quant_round_factor_tu_0_1[trans_size >> 3],
4567                 0,
4568                 trans_size * trans_size * sizeof(WORD32));
4569             memset(
4570                 ps_ctxt->pi4_quant_round_factor_tu_1_2[trans_size >> 3],
4571                 0,
4572                 trans_size * trans_size * sizeof(WORD32));
4573 
4574             ihevce_quant_rounding_factor_gen(
4575                 trans_size,
4576                 1,
4577                 &ps_ctxt->s_rdopt_entropy_ctxt,
4578                 ps_ctxt->pi4_quant_round_factor_tu_0_1[trans_size >> 3],
4579                 ps_ctxt->pi4_quant_round_factor_tu_1_2[trans_size >> 3],
4580                 i4_lamda_modifier,
4581                 1);
4582         }
4583         else
4584         {
4585             ps_ctxt->pi4_quant_round_factor_tu_0_1[trans_size >> 3] =
4586                 ps_ctxt->pi4_quant_round_factor_cu_ctb_0_1[trans_size >> 3];
4587             ps_ctxt->pi4_quant_round_factor_tu_1_2[trans_size >> 3] =
4588                 ps_ctxt->pi4_quant_round_factor_cu_ctb_1_2[trans_size >> 3];
4589         }
4590 
4591         /* call T Q IT IQ and recon function */
4592         cbf = ihevce_t_q_iq_ssd_scan_fxn(
4593             ps_ctxt,
4594             pu1_cur_pred,
4595             pred_stride,
4596             pu1_cur_src,
4597             src_strd,
4598             pi2_cur_deq_data,
4599             cu_size,
4600             pu1_cur_recon,
4601             i4_recon_stride,
4602             pu1_ecd_data,
4603             pu1_csbf_buf,
4604             csbf_strd,
4605             trans_size,
4606             recon_func_mode,
4607             &rdopt_cost,
4608             &curr_bytes,
4609             &tu_bits,
4610             &u4_tu_sad,
4611             &zero_col,
4612             &zero_row,
4613             &u1_is_recon_available,
4614             i4_perform_rdoq,
4615             i4_perform_sbh,
4616 #if USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
4617             i4_alpha_stim_multiplier,
4618             u1_is_cu_noisy,
4619 #endif
4620             u1_compute_spatial_ssd ? SPATIAL_DOMAIN_SSD : FREQUENCY_DOMAIN_SSD,
4621             ps_ctxt->u1_use_early_cbf_data ? ps_tu_prms->i4_early_cbf : 1);
4622 
4623 #if COMPUTE_NOISE_TERM_AT_THE_TU_LEVEL && !USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
4624         if(u1_is_cu_noisy && i4_alpha_stim_multiplier)
4625         {
4626 #if !USE_RECON_TO_EVALUATE_STIM_IN_RDOPT
4627             rdopt_cost = ihevce_inject_stim_into_distortion(
4628                 pu1_cur_src,
4629                 src_strd,
4630                 pu1_cur_pred,
4631                 pred_stride,
4632                 rdopt_cost,
4633                 i4_alpha_stim_multiplier,
4634                 trans_size,
4635                 0,
4636                 ps_ctxt->u1_enable_psyRDOPT,
4637                 NULL_PLANE);
4638 #else
4639             if(u1_compute_spatial_ssd && u1_is_recon_available)
4640             {
4641                 rdopt_cost = ihevce_inject_stim_into_distortion(
4642                     pu1_cur_src,
4643                     src_strd,
4644                     pu1_cur_recon,
4645                     i4_recon_stride,
4646                     rdopt_cost,
4647                     i4_alpha_stim_multiplier,
4648                     trans_size,
4649                     0,
4650                     NULL_PLANE);
4651             }
4652             else
4653             {
4654                 rdopt_cost = ihevce_inject_stim_into_distortion(
4655                     pu1_cur_src,
4656                     src_strd,
4657                     pu1_cur_pred,
4658                     pred_stride,
4659                     rdopt_cost,
4660                     i4_alpha_stim_multiplier,
4661                     trans_size,
4662                     0,
4663                     ps_ctxt->u1_enable_psyRDOPT,
4664                     NULL_PLANE);
4665             }
4666 #endif
4667         }
4668 #endif
4669 
4670         if(u1_compute_spatial_ssd && u1_is_recon_available)
4671         {
4672             ps_final_prms->s_recon_datastore.au1_bufId_with_winning_LumaRecon[ctr] = 0;
4673         }
4674         else
4675         {
4676             ps_final_prms->s_recon_datastore.au1_bufId_with_winning_LumaRecon[ctr] = UCHAR_MAX;
4677         }
4678 
4679         /* accumulate the TU sad into cu sad */
4680         ps_final_prms->u4_cu_sad += u4_tu_sad;
4681 
4682         /* accumulate the TU bits into cu bits */
4683         cu_bits += tu_bits;
4684 
4685         /* inter cu is coded if any of the tu is coded in it */
4686         ps_final_prms->u1_is_cu_coded |= cbf;
4687 
4688         /* call the entropy function to get the bits */
4689         /* add that to rd opt cost(SSD)              */
4690 
4691         /* update the bytes */
4692         ps_final_prms->as_tu_enc_loop[ctr].i4_luma_coeff_offset = ecd_data_bytes_cons;
4693         ps_final_prms->as_tu_enc_loop_temp_prms[ctr].i2_luma_bytes_consumed = curr_bytes;
4694         /* update the zero_row and col info for the final mode */
4695         ps_final_prms->as_tu_enc_loop_temp_prms[ctr].u4_luma_zero_col = zero_col;
4696         ps_final_prms->as_tu_enc_loop_temp_prms[ctr].u4_luma_zero_row = zero_row;
4697 
4698         /* update the bytes */
4699         ps_final_prms->as_tu_enc_loop[ctr].i4_luma_coeff_offset = ecd_data_bytes_cons;
4700 
4701         /* update the total bytes cons */
4702         ecd_data_bytes_cons += curr_bytes;
4703         pu1_ecd_data += curr_bytes;
4704 
4705         /* RDOPT copy States :  New updated after curr TU to TU init */
4706         if(0 != cbf)
4707         {
4708             /* update to new state only if CBF is non zero */
4709             COPY_CABAC_STATES_FRM_CAB_COEFFX_PREFIX(
4710                 &ps_ctxt->au1_rdopt_init_ctxt_models[0] + IHEVC_CAB_COEFFX_PREFIX,
4711                 &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[curr_buf_idx]
4712                         .s_cabac_ctxt.au1_ctxt_models[0] +
4713                     IHEVC_CAB_COEFFX_PREFIX,
4714                 IHEVC_CAB_CTXT_END - IHEVC_CAB_COEFFX_PREFIX);
4715         }
4716 
4717         /* by default chroma present is set to 1*/
4718         chrm_present_flag = 1;
4719         if(4 == trans_size)
4720         {
4721             /* if tusize is 4x4 then only first luma 4x4 will have chroma*/
4722             if(0 != chrm_ctr)
4723             {
4724                 chrm_present_flag = INTRA_PRED_CHROMA_IDX_NONE;
4725             }
4726 
4727             /* increment the chrm ctr unconditionally */
4728             chrm_ctr++;
4729 
4730             /* after ctr reached 4 reset it */
4731             if(4 == chrm_ctr)
4732             {
4733                 chrm_ctr = 0;
4734             }
4735         }
4736 
4737         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_y_cbf = cbf;
4738         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_cb_cbf = 0;
4739         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_cr_cbf = 0;
4740         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_cb_cbf_subtu1 = 0;
4741         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_cr_cbf_subtu1 = 0;
4742         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b3_chroma_intra_mode_idx = chrm_present_flag;
4743         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b7_qp = ps_ctxt->i4_cu_qp;
4744         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_first_tu_in_cu = 0;
4745         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_transquant_bypass = 0;
4746         GETRANGE(tx_size, trans_size);
4747         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b3_size = tx_size - 3;
4748         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b4_pos_x = cu_pos_x + (curr_pos_x >> 2);
4749         ps_final_prms->as_tu_enc_loop[ctr].s_tu.b4_pos_y = cu_pos_y + (curr_pos_y >> 2);
4750 
4751         /* replicate the nbr 4x4 structure for all 4x4 blocks of the current TU */
4752         ps_cur_nbr_4x4->b1_y_cbf = cbf;
4753         /*copy the cu qp. This will be overwritten by qp calculated based on skip flag at final stage of cu mode decide*/
4754         ps_cur_nbr_4x4->b8_qp = ps_ctxt->i4_cu_qp;
4755 
4756         /* Qp and cbf are stored for the all 4x4 in TU */
4757         {
4758             WORD32 i, j;
4759             nbr_4x4_t *ps_tmp_4x4;
4760             ps_tmp_4x4 = ps_cur_nbr_4x4;
4761 
4762             for(i = 0; i < num_4x4_in_tu; i++)
4763             {
4764                 for(j = 0; j < num_4x4_in_tu; j++)
4765                 {
4766                     ps_tmp_4x4[j].b8_qp = ps_ctxt->i4_cu_qp;
4767                     ps_tmp_4x4[j].b1_y_cbf = cbf;
4768                 }
4769                 /* row level update*/
4770                 ps_tmp_4x4 += num_4x4_in_cu;
4771             }
4772         }
4773 
4774 #if RDOPT_ENABLE
4775         /* compute the rdopt cost */
4776         rdopt_cost +=
4777             COMPUTE_RATE_COST_CLIP30(tu_bits, ps_ctxt->i8_cl_ssd_lambda_qf, LAMBDA_Q_SHIFT);
4778 #endif
4779         /* accumulate the costs */
4780         total_rdopt_cost += rdopt_cost;
4781 
4782         ps_tu_prms++;
4783 
4784         if(ps_ctxt->i4_bitrate_instance_num || ps_ctxt->i4_num_bitrates == 1)
4785         {
4786             /* Early exit : If the current running cost exceeds
4787             the prev. best mode cost, break */
4788             if(total_rdopt_cost > prev_best_rdopt_cost)
4789             {
4790                 return (total_rdopt_cost);
4791             }
4792         }
4793     }
4794 
4795     /* Modify the cost function for this CU. */
4796     /* loop in for 8x8 blocks */
4797     if(ps_ctxt->u1_enable_psyRDOPT)
4798     {
4799         UWORD8 *pu1_recon_cu;
4800         WORD32 recon_stride;
4801         WORD32 curr_pos_x;
4802         WORD32 curr_pos_y;
4803         WORD32 start_index;
4804         WORD32 num_horz_cu_in_ctb;
4805         WORD32 had_block_size;
4806 
4807         /* TODO: sreenivasa ctb size has to be used appropriately */
4808         had_block_size = 8;
4809         num_horz_cu_in_ctb = 64 / had_block_size;
4810 
4811         curr_pos_x = cu_pos_x << 2; /* pel units */
4812         curr_pos_y = cu_pos_y << 2; /* pel units */
4813         recon_stride = ps_final_prms->s_recon_datastore.i4_lumaRecon_stride;
4814         pu1_recon_cu = ((UWORD8 *)ps_final_prms->s_recon_datastore
4815                             .apv_luma_recon_bufs[0]);  // already pointing to the current CU recon
4816         //+ \curr_pos_x + curr_pos_y * recon_stride;
4817 
4818         /* start index to index the source satd of curr cu in the current ctb */
4819         start_index =
4820             (curr_pos_x / had_block_size) + (curr_pos_y / had_block_size) * num_horz_cu_in_ctb;
4821 
4822         {
4823             total_rdopt_cost += ihevce_psy_rd_cost(
4824                 ps_ctxt->ai4_source_satd_8x8,
4825                 pu1_recon_cu,
4826                 recon_stride,
4827                 1,  //howz stride
4828                 cu_size,
4829                 0,  // pic type
4830                 0,  //layer id
4831                 ps_ctxt->i4_satd_lamda,  // lambda
4832                 start_index,
4833                 ps_ctxt->u1_is_input_data_hbd,
4834                 ps_ctxt->u4_psy_strength,
4835                 &ps_ctxt->s_cmn_opt_func);  // 8 bit
4836         }
4837     }
4838 
4839     /* store the num TUs*/
4840     ps_final_prms->u2_num_tus_in_cu = num_tu_in_cu;
4841 
4842     /* update the bytes consumed */
4843     ps_final_prms->i4_num_bytes_ecd_data = ecd_data_bytes_cons;
4844 
4845     /* store the current cu size to final prms */
4846     ps_final_prms->u1_cu_size = cu_size;
4847 
4848     /* cu bits will be having luma residual bits till this point    */
4849     /* if zero_cbf eval is disabled then cu bits will be zero       */
4850     ps_final_prms->u4_cu_luma_res_bits = cu_bits;
4851 
4852     /* ------------- Chroma processing -------------- */
4853     /* Chroma rdopt eval for each luma candidate only for HIGH QUALITY/MEDIUM SPEED preset */
4854     if(1 == ps_ctxt->s_chroma_rdopt_ctxt.u1_eval_chrm_rdopt)
4855     {
4856         LWORD64 chrm_rdopt_cost;
4857         WORD32 chrm_rdopt_tu_bits;
4858 
4859         /* Store the current RDOPT cost to enable early exit in chrom_prcs */
4860         ps_ctxt->as_cu_prms[curr_buf_idx].i8_curr_rdopt_cost = total_rdopt_cost;
4861 
4862         chrm_rdopt_cost = ihevce_chroma_cu_prcs_rdopt(
4863             ps_ctxt,
4864             curr_buf_idx,
4865             0, /* TU mode : Don't care in Inter patrh */
4866             ps_chrm_cu_buf_prms->pu1_curr_src,
4867             ps_chrm_cu_buf_prms->i4_chrm_src_stride,
4868             ps_chrm_cu_buf_prms->pu1_cu_left,
4869             ps_chrm_cu_buf_prms->pu1_cu_top,
4870             ps_chrm_cu_buf_prms->pu1_cu_top_left,
4871             ps_chrm_cu_buf_prms->i4_cu_left_stride,
4872             (cu_pos_x >> 1),
4873             (cu_pos_y >> 1),
4874             &chrm_rdopt_tu_bits,
4875             i4_alpha_stim_multiplier,
4876             u1_is_cu_noisy);
4877 
4878 #if WEIGH_CHROMA_COST
4879         chrm_rdopt_cost = (LWORD64)(
4880             (chrm_rdopt_cost * ps_ctxt->u4_chroma_cost_weighing_factor +
4881              (1 << (CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT - 1))) >>
4882             CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT);
4883 #endif
4884 
4885 #if CHROMA_RDOPT_ENABLE
4886         total_rdopt_cost += chrm_rdopt_cost;
4887 #endif
4888         cu_bits += chrm_rdopt_tu_bits;
4889 
4890         /* during chroma evaluation if skip decision was over written     */
4891         /* then the current skip candidate is set to a non skip candidate */
4892         ps_inter_cand->b1_skip_flag = ps_final_prms->u1_skip_flag;
4893 
4894         /* cu bits for chroma residual if chroma rdopt is on       */
4895         /* if zero_cbf eval is disabled then cu bits will be zero  */
4896         ps_final_prms->u4_cu_chroma_res_bits = chrm_rdopt_tu_bits;
4897 
4898         if(ps_ctxt->i4_bitrate_instance_num || ps_ctxt->i4_num_bitrates == 1)
4899         {
4900             /* Early exit : If the current running cost exceeds
4901             the prev. best mode cost, break */
4902             if(total_rdopt_cost > prev_best_rdopt_cost)
4903             {
4904                 return (total_rdopt_cost);
4905             }
4906         }
4907     }
4908     else
4909     {}
4910 
4911 #if SHRINK_INTER_TUTREE
4912     /* ------------- Quadtree TU split  optimization ------------  */
4913     if(ps_final_prms->u1_is_cu_coded)
4914     {
4915         ps_final_prms->u2_num_tus_in_cu = ihevce_shrink_inter_tu_tree(
4916             &ps_final_prms->as_tu_enc_loop[0],
4917             &ps_final_prms->as_tu_enc_loop_temp_prms[0],
4918             &ps_final_prms->s_recon_datastore,
4919             num_tu_in_cu,
4920             (ps_ctxt->u1_chroma_array_type == 2));
4921     }
4922 #endif
4923 
4924     /* RDOPT copy States :  Best after all luma TUs (and chroma,if enabled)to current */
4925     COPY_CABAC_STATES_FRM_CAB_COEFFX_PREFIX(
4926         &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[curr_buf_idx]
4927                 .s_cabac_ctxt.au1_ctxt_models[0] +
4928             IHEVC_CAB_COEFFX_PREFIX,
4929         &ps_ctxt->au1_rdopt_init_ctxt_models[0] + IHEVC_CAB_COEFFX_PREFIX,
4930         IHEVC_CAB_CTXT_END - IHEVC_CAB_COEFFX_PREFIX);
4931 
4932     /* -------- Bit estimate for RD opt -------------- */
4933     {
4934         nbr_avail_flags_t s_nbr;
4935         /*cbf_bits will account for both texture and cbf bits when zero cbf eval flag is 0*/
4936         WORD32 cbf_bits, header_bits;
4937 
4938         /* get the neighbour availability flags for current cu  */
4939         ihevce_get_only_nbr_flag(
4940             &s_nbr,
4941             ps_ctxt->pu1_ctb_nbr_map,
4942             ps_ctxt->i4_nbr_map_strd,
4943             cu_pos_x,
4944             cu_pos_y,
4945             (cu_size >> 2),
4946             (cu_size >> 2));
4947 
4948         /* call the entropy rdo encode to get the bit estimate for current cu */
4949         header_bits = ihevce_entropy_rdo_encode_cu(
4950             &ps_ctxt->s_rdopt_entropy_ctxt,
4951             ps_final_prms,
4952             (cu_pos_x >> 1), /*  back to 8x8 pel units   */
4953             (cu_pos_y >> 1), /*  back to 8x8 pel units   */
4954             cu_size,
4955             ps_ctxt->u1_disable_intra_eval ? !DISABLE_TOP_SYNC && s_nbr.u1_top_avail
4956                                            : s_nbr.u1_top_avail,
4957             s_nbr.u1_left_avail,
4958             &ps_final_prms->pu1_cu_coeffs[0],
4959             &cbf_bits);
4960 
4961         cu_bits += header_bits;
4962 
4963         /* cbf bits are excluded from header bits, instead considered as texture bits */
4964         /* incase if zero cbf eval is disabled then texture bits gets added here */
4965         ps_final_prms->u4_cu_hdr_bits = (header_bits - cbf_bits);
4966         ps_final_prms->u4_cu_cbf_bits = cbf_bits;
4967 
4968 #if RDOPT_ENABLE
4969         /* add the cost of coding the header bits */
4970         total_rdopt_cost +=
4971             COMPUTE_RATE_COST_CLIP30(header_bits, ps_ctxt->i8_cl_ssd_lambda_qf, LAMBDA_Q_SHIFT);
4972 
4973 #if ENABLE_INTER_ZCU_COST
4974         /* If cu is coded, Evaluate not coded cost and check if it improves over coded cost */
4975         if(ps_final_prms->u1_is_cu_coded && (ZCBF_ENABLE == ps_ctxt->i4_zcbf_rdo_level))
4976         {
4977             LWORD64 i8_cu_not_coded_cost = ps_ctxt->i8_cu_not_coded_cost;
4978 
4979             WORD32 is_2nx2n_mergecu = (SIZE_2Nx2N == ps_final_prms->u1_part_mode) &&
4980                                       (1 == ps_final_prms->as_pu_enc_loop[0].b1_merge_flag);
4981 
4982             cab_ctxt_t *ps_cab_ctxt =
4983                 &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[curr_buf_idx].s_cabac_ctxt;
4984 
4985             /* Read header bits generated after ihevce_entropy_rdo_encode_cu() call  */
4986             UWORD32 u4_cu_hdr_bits_q12 = ps_cab_ctxt->u4_header_bits_estimated_q12;
4987 
4988             /* account for coding qt_root_cbf = 0 */
4989             /* First subtract cost for coding as 1 (part of header bits) and then add cost for coding as 0 */
4990             u4_cu_hdr_bits_q12 += gau2_ihevce_cabac_bin_to_bits[u1_qtroot_cbf_cabac_model ^ 0];
4991             if(u4_cu_hdr_bits_q12 < gau2_ihevce_cabac_bin_to_bits[u1_qtroot_cbf_cabac_model ^ 1])
4992                 u4_cu_hdr_bits_q12 = 0;
4993             else
4994                 u4_cu_hdr_bits_q12 -= gau2_ihevce_cabac_bin_to_bits[u1_qtroot_cbf_cabac_model ^ 1];
4995 
4996             /* add the cost of coding the header bits */
4997             i8_cu_not_coded_cost += COMPUTE_RATE_COST_CLIP30(
4998                 u4_cu_hdr_bits_q12 /* ps_final_prms->u4_cu_hdr_bits */,
4999                 ps_ctxt->i8_cl_ssd_lambda_qf,
5000                 (LAMBDA_Q_SHIFT + CABAC_FRAC_BITS_Q));
5001 
5002             if(ps_ctxt->u1_enable_psyRDOPT)
5003             {
5004                 i8_cu_not_coded_cost = total_rdopt_cost + 1;
5005             }
5006 
5007             /* Evaluate qtroot cbf rdo; exclude 2Nx2N Merge as skip cu is explicitly evaluated */
5008             if((i8_cu_not_coded_cost <= total_rdopt_cost) && (!is_2nx2n_mergecu))
5009             {
5010                 WORD32 tx_size;
5011 
5012                 /* force cu as not coded and update the cost */
5013                 ps_final_prms->u1_is_cu_coded = 0;
5014                 ps_final_prms->s_recon_datastore.au1_is_chromaRecon_available[0] = 0;
5015                 ps_final_prms->s_recon_datastore.u1_is_lumaRecon_available = 0;
5016 
5017                 total_rdopt_cost = i8_cu_not_coded_cost;
5018 
5019                 /* reset num TUs to 1 unless cu size is 64 */
5020                 ps_final_prms->u2_num_tus_in_cu = (64 == cu_size) ? 4 : 1;
5021                 trans_size = (64 == cu_size) ? 32 : cu_size;
5022                 GETRANGE(tx_size, trans_size);
5023 
5024                 /* reset the bytes consumed */
5025                 ps_final_prms->i4_num_bytes_ecd_data = 0;
5026 
5027                 /* reset texture related bits and roll back header bits*/
5028                 ps_final_prms->u4_cu_cbf_bits = 0;
5029                 ps_final_prms->u4_cu_luma_res_bits = 0;
5030                 ps_final_prms->u4_cu_chroma_res_bits = 0;
5031                 ps_final_prms->u4_cu_hdr_bits =
5032                     (u4_cu_hdr_bits_q12 + (1 << (CABAC_FRAC_BITS_Q - 1))) >> CABAC_FRAC_BITS_Q;
5033 
5034                 /* update cabac model with qtroot cbf = 0 decision */
5035                 ps_cab_ctxt->au1_ctxt_models[IHEVC_CAB_NORES_IDX] =
5036                     gau1_ihevc_next_state[u1_qtroot_cbf_cabac_model << 1];
5037 
5038                 /* restore untouched cabac models for, tusplit, cbfs, texture etc */
5039                 memcpy(
5040                     &ps_cab_ctxt->au1_ctxt_models[IHEVC_CAB_SPLIT_TFM],
5041                     &au1_rdopt_init_ctxt_models[IHEVC_CAB_SPLIT_TFM],
5042                     (IHEVC_CAB_CTXT_END - IHEVC_CAB_SPLIT_TFM));
5043 
5044                 /* mark all tus as not coded for final eval */
5045                 for(ctr = 0; ctr < ps_final_prms->u2_num_tus_in_cu; ctr++)
5046                 {
5047                     WORD32 curr_pos_x = (ctr & 0x1) ? (trans_size >> 2) : 0;
5048                     WORD32 curr_pos_y = (ctr & 0x2) ? (trans_size >> 2) : 0;
5049 
5050                     nbr_4x4_t *ps_cur_nbr_4x4 =
5051                         ps_nbr_4x4 + curr_pos_x + (curr_pos_y * num_4x4_in_cu);
5052 
5053                     num_4x4_in_tu = trans_size >> 2;
5054 
5055                     ps_final_prms->as_tu_enc_loop_temp_prms[ctr].i2_luma_bytes_consumed = 0;
5056                     ps_final_prms->as_tu_enc_loop_temp_prms[ctr].ai2_cb_bytes_consumed[0] = 0;
5057                     ps_final_prms->as_tu_enc_loop_temp_prms[ctr].ai2_cr_bytes_consumed[0] = 0;
5058 
5059                     ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_y_cbf = 0;
5060                     ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_cr_cbf = 0;
5061                     ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_cb_cbf = 0;
5062 
5063                     ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_cr_cbf_subtu1 = 0;
5064                     ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_cb_cbf_subtu1 = 0;
5065 
5066                     ps_final_prms->as_tu_enc_loop[ctr].s_tu.b3_size = tx_size - 3;
5067                     ps_final_prms->as_tu_enc_loop[ctr].s_tu.b4_pos_x = cu_pos_x + curr_pos_x;
5068                     ps_final_prms->as_tu_enc_loop[ctr].s_tu.b4_pos_y = cu_pos_y + curr_pos_y;
5069 
5070                     /* reset cbf for the all 4x4 in TU */
5071                     {
5072                         WORD32 i, j;
5073                         nbr_4x4_t *ps_tmp_4x4;
5074                         ps_tmp_4x4 = ps_cur_nbr_4x4;
5075 
5076                         for(i = 0; i < num_4x4_in_tu; i++)
5077                         {
5078                             for(j = 0; j < num_4x4_in_tu; j++)
5079                             {
5080                                 ps_tmp_4x4[j].b1_y_cbf = 0;
5081                             }
5082                             /* row level update*/
5083                             ps_tmp_4x4 += num_4x4_in_cu;
5084                         }
5085                     }
5086                 }
5087             }
5088         }
5089 #endif /* ENABLE_INTER_ZCU_COST */
5090 
5091 #endif /* RDOPT_ENABLE */
5092     }
5093 
5094     return (total_rdopt_cost);
5095 }
5096 
5097 #if ENABLE_RDO_BASED_TU_RECURSION
5098 LWORD64 ihevce_inter_tu_tree_selector_and_rdopt_cost_computer(
5099     ihevce_enc_loop_ctxt_t *ps_ctxt,
5100     enc_loop_cu_prms_t *ps_cu_prms,
5101     void *pv_src,
5102     WORD32 cu_size,
5103     WORD32 cu_pos_x,
5104     WORD32 cu_pos_y,
5105     WORD32 curr_buf_idx,
5106     enc_loop_chrm_cu_buf_prms_t *ps_chrm_cu_buf_prms,
5107     cu_inter_cand_t *ps_inter_cand,
5108     cu_analyse_t *ps_cu_analyse,
5109     WORD32 i4_alpha_stim_multiplier)
5110 {
5111     tu_tree_node_t as_tu_nodes[256 + 64 + 16 + 4 + 1];
5112     buffer_data_for_tu_t s_buffer_data_for_tu;
5113     enc_loop_cu_final_prms_t *ps_final_prms;
5114     nbr_4x4_t *ps_nbr_4x4;
5115 
5116     WORD32 num_split_flags = 1;
5117     UWORD8 u1_tu_size;
5118     UWORD8 *pu1_pred;
5119     UWORD8 *pu1_ecd_data;
5120     WORD16 *pi2_deq_data;
5121     UWORD8 *pu1_csbf_buf;
5122     UWORD8 *pu1_tu_sz_sft;
5123     UWORD8 *pu1_tu_posx;
5124     UWORD8 *pu1_tu_posy;
5125     LWORD64 total_rdopt_cost;
5126     WORD32 ctr;
5127     WORD32 chrm_ctr;
5128     WORD32 pred_stride;
5129     WORD32 recon_stride;
5130     WORD32 trans_size = ps_cu_analyse->u1_cu_size;
5131     WORD32 csbf_strd;
5132     WORD32 ecd_data_bytes_cons;
5133     WORD32 num_4x4_in_cu;
5134     WORD32 num_4x4_in_tu;
5135     WORD32 recon_func_mode;
5136     WORD32 cu_bits;
5137     UWORD8 u1_compute_spatial_ssd;
5138     /* backup copy of cabac states for restoration if zero cu residue rdo wins later */
5139     UWORD8 au1_rdopt_init_ctxt_models[IHEVC_CAB_CTXT_END];
5140 
5141     WORD32 i4_min_trans_size = 256;
5142     LWORD64 prev_best_rdopt_cost = ps_ctxt->as_cu_prms[!curr_buf_idx].i8_best_rdopt_cost;
5143     WORD32 src_strd = ps_cu_prms->i4_luma_src_stride;
5144     /* model for no residue syntax qt root cbf flag */
5145     UWORD8 u1_qtroot_cbf_cabac_model = ps_ctxt->au1_rdopt_init_ctxt_models[IHEVC_CAB_NORES_IDX];
5146     UWORD8 u1_skip_tu_sz_sft = 0;
5147     UWORD8 u1_skip_tu_posx = 0;
5148     UWORD8 u1_skip_tu_posy = 0;
5149     UWORD8 u1_is_cu_noisy = ps_cu_prms->u1_is_cu_noisy;
5150 
5151     ps_final_prms = &ps_ctxt->as_cu_prms[curr_buf_idx];
5152     ps_nbr_4x4 = &ps_ctxt->as_cu_nbr[curr_buf_idx][0];
5153     pu1_ecd_data = &ps_final_prms->pu1_cu_coeffs[0];
5154     pi2_deq_data = &ps_final_prms->pi2_cu_deq_coeffs[0];
5155     csbf_strd = ps_ctxt->i4_cu_csbf_strd;
5156     pu1_csbf_buf = &ps_ctxt->au1_cu_csbf[0];
5157     pred_stride = ps_inter_cand->i4_pred_data_stride;
5158     recon_stride = cu_size;
5159     pu1_pred = ps_inter_cand->pu1_pred_data;
5160     chrm_ctr = 0;
5161     ecd_data_bytes_cons = 0;
5162     total_rdopt_cost = 0;
5163     num_4x4_in_cu = cu_size >> 2;
5164     recon_func_mode = PRED_MODE_INTER;
5165     cu_bits = 0;
5166 
5167     /* get the 4x4 level position of current cu */
5168     cu_pos_x = cu_pos_x << 1;
5169     cu_pos_y = cu_pos_y << 1;
5170 
5171     ps_final_prms->u1_is_cu_coded = 0;
5172     ps_final_prms->u4_cu_sad = 0;
5173 
5174     /* populate the coeffs scan idx */
5175     ps_ctxt->i4_scan_idx = SCAN_DIAG_UPRIGHT;
5176 
5177 #if ENABLE_INTER_ZCU_COST
5178     /* reset cu not coded cost */
5179     ps_ctxt->i8_cu_not_coded_cost = 0;
5180 
5181     /* backup copy of cabac states for restoration if zero cu residue rdo wins later */
5182     memcpy(au1_rdopt_init_ctxt_models, &ps_ctxt->au1_rdopt_init_ctxt_models[0], IHEVC_CAB_CTXT_END);
5183 #endif
5184 
5185     if(ps_cu_analyse->u1_cu_size == 64)
5186     {
5187         num_split_flags = 4;
5188         u1_tu_size = 32;
5189     }
5190     else
5191     {
5192         num_split_flags = 1;
5193         u1_tu_size = ps_cu_analyse->u1_cu_size;
5194     }
5195 
5196     if(1 == ps_final_prms->u1_skip_flag)
5197     {
5198         if(64 == cu_size)
5199         {
5200             /* TU = CU/2 is set but no transform is evaluated  */
5201             pu1_tu_sz_sft = &gau1_inter_tu_shft_amt[0];
5202             pu1_tu_posx = &gau1_inter_tu_posx_scl_amt[0];
5203             pu1_tu_posy = &gau1_inter_tu_posy_scl_amt[0];
5204         }
5205         else
5206         {
5207             /* TU = CU is set but no transform is evaluated  */
5208             pu1_tu_sz_sft = &u1_skip_tu_sz_sft;
5209             pu1_tu_posx = &u1_skip_tu_posx;
5210             pu1_tu_posy = &u1_skip_tu_posy;
5211         }
5212 
5213         recon_func_mode = PRED_MODE_SKIP;
5214     }
5215     /* check for PU part mode being AMP or No AMP */
5216     else if(ps_final_prms->u1_part_mode < SIZE_2NxnU)
5217     {
5218         if((SIZE_2Nx2N == ps_final_prms->u1_part_mode) && (cu_size < 64))
5219         {
5220             /* TU= CU is evaluated 2Nx2N inter case */
5221             pu1_tu_sz_sft = &u1_skip_tu_sz_sft;
5222             pu1_tu_posx = &u1_skip_tu_posx;
5223             pu1_tu_posy = &u1_skip_tu_posy;
5224         }
5225         else
5226         {
5227             /* currently TU= CU/2 is evaluated for all inter case */
5228             pu1_tu_sz_sft = &gau1_inter_tu_shft_amt[0];
5229             pu1_tu_posx = &gau1_inter_tu_posx_scl_amt[0];
5230             pu1_tu_posy = &gau1_inter_tu_posy_scl_amt[0];
5231         }
5232     }
5233     else
5234     {
5235         /* for AMP cases one level of TU recursion is done  */
5236         /* based on orientation of the partitions           */
5237         pu1_tu_sz_sft = &gau1_inter_tu_shft_amt_amp[ps_final_prms->u1_part_mode - 4][0];
5238         pu1_tu_posx = &gau1_inter_tu_posx_scl_amt_amp[ps_final_prms->u1_part_mode - 4][0];
5239         pu1_tu_posy = &gau1_inter_tu_posy_scl_amt_amp[ps_final_prms->u1_part_mode - 4][0];
5240     }
5241 
5242     i4_min_trans_size = 4;
5243 
5244     if(ps_ctxt->i1_cu_qp_delta_enable)
5245     {
5246         WORD32 i4_act_counter = 0, i4_act_counter_lamda = 0;
5247         if(ps_cu_analyse->u1_cu_size == 64)
5248         {
5249             ASSERT(
5250                 (i4_min_trans_size == 32) || (i4_min_trans_size == 16) ||
5251                 (i4_min_trans_size == 8) || (i4_min_trans_size == 4));
5252             i4_act_counter = (i4_min_trans_size == 16) +
5253                              2 * ((i4_min_trans_size == 8) || (i4_min_trans_size == 4));
5254             i4_act_counter_lamda = 3;
5255         }
5256         else if(ps_cu_analyse->u1_cu_size == 32)
5257         {
5258             ASSERT(
5259                 (i4_min_trans_size == 32) || (i4_min_trans_size == 16) ||
5260                 (i4_min_trans_size == 8) || (i4_min_trans_size == 4));
5261             i4_act_counter = (i4_min_trans_size == 16) +
5262                              2 * ((i4_min_trans_size == 8) || (i4_min_trans_size == 4));
5263             i4_act_counter_lamda = 0;
5264         }
5265         else if(ps_cu_analyse->u1_cu_size == 16)
5266         {
5267             ASSERT(
5268                 (i4_min_trans_size == 16) || (i4_min_trans_size == 8) || (i4_min_trans_size == 4));
5269             i4_act_counter = (i4_min_trans_size == 8) || (i4_min_trans_size == 4);
5270             i4_act_counter_lamda = 0;
5271         }
5272         else if(ps_cu_analyse->u1_cu_size == 8)
5273         {
5274             ASSERT((i4_min_trans_size == 8) || (i4_min_trans_size == 4));
5275             i4_act_counter = 1;
5276             i4_act_counter_lamda = 0;
5277         }
5278         else
5279         {
5280             ASSERT(0);
5281         }
5282         if(ps_ctxt->i4_use_ctb_level_lamda)
5283         {
5284             ihevce_compute_cu_level_QP(
5285                 ps_ctxt, ps_cu_analyse->i4_act_factor[i4_act_counter][0], -1, 0);
5286         }
5287         else
5288         {
5289             ihevce_compute_cu_level_QP(
5290                 ps_ctxt,
5291                 ps_cu_analyse->i4_act_factor[i4_act_counter][0],
5292                 ps_cu_analyse->i4_act_factor[i4_act_counter_lamda][0],
5293                 0);
5294         }
5295 
5296         ps_cu_analyse->i1_cu_qp = ps_ctxt->i4_cu_qp;
5297     }
5298 
5299     if(u1_is_cu_noisy && !ps_ctxt->u1_enable_psyRDOPT)
5300     {
5301         ps_ctxt->i8_cl_ssd_lambda_qf =
5302             ((float)ps_ctxt->i8_cl_ssd_lambda_qf * (100.0f - RDOPT_LAMBDA_DISCOUNT_WHEN_NOISY) /
5303              100.0f);
5304         ps_ctxt->i8_cl_ssd_lambda_chroma_qf =
5305             ((float)ps_ctxt->i8_cl_ssd_lambda_chroma_qf *
5306              (100.0f - RDOPT_LAMBDA_DISCOUNT_WHEN_NOISY) / 100.0f);
5307     }
5308 
5309     u1_compute_spatial_ssd = (ps_ctxt->i4_cu_qp <= MAX_QP_WHERE_SPATIAL_SSD_ENABLED) &&
5310                              (ps_ctxt->i4_quality_preset < IHEVCE_QUALITY_P3) &&
5311                              CONVERT_SSDS_TO_SPATIAL_DOMAIN;
5312 
5313     if(u1_is_cu_noisy || ps_ctxt->u1_enable_psyRDOPT)
5314     {
5315         u1_compute_spatial_ssd = (ps_ctxt->i4_cu_qp <= MAX_HEVC_QP) &&
5316                                  CONVERT_SSDS_TO_SPATIAL_DOMAIN;
5317     }
5318 
5319     if(!u1_compute_spatial_ssd)
5320     {
5321         ps_final_prms->s_recon_datastore.u1_is_lumaRecon_available = 0;
5322         ps_final_prms->s_recon_datastore.au1_is_chromaRecon_available[0] = 0;
5323     }
5324     else
5325     {
5326         ps_final_prms->s_recon_datastore.u1_is_lumaRecon_available = 1;
5327 
5328         if(INCLUDE_CHROMA_DURING_TU_RECURSION && (ps_ctxt->i4_quality_preset <= IHEVCE_QUALITY_P0))
5329         {
5330             ps_final_prms->s_recon_datastore.au1_is_chromaRecon_available[0] = 1;
5331         }
5332     }
5333 
5334     /* RDOPT copy States :  TU init (best until prev TU) to current */
5335     memcpy(
5336         &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[curr_buf_idx]
5337              .s_cabac_ctxt.au1_ctxt_models[0],
5338         &ps_ctxt->au1_rdopt_init_ctxt_models[0],
5339         IHEVC_CAB_COEFFX_PREFIX);
5340 
5341     ihevce_tu_tree_init(
5342         as_tu_nodes,
5343         cu_size,
5344         (cu_size == 64) ? !ps_inter_cand->b1_skip_flag : 0,
5345         ps_inter_cand->b1_skip_flag ? 0 : ps_ctxt->u1_max_inter_tr_depth,
5346         INCLUDE_CHROMA_DURING_TU_RECURSION && (ps_ctxt->i4_quality_preset <= IHEVCE_QUALITY_P0),
5347         ps_ctxt->u1_chroma_array_type == 2);
5348 
5349     if(!ps_inter_cand->b1_skip_flag && (ps_ctxt->i4_quality_preset >= IHEVCE_QUALITY_P3))
5350     {
5351         ihevce_tuSplitArray_to_tuTree_mapper(
5352             as_tu_nodes,
5353             ps_inter_cand->ai4_tu_split_flag,
5354             cu_size,
5355             cu_size,
5356             MAX(MIN_TU_SIZE, (cu_size >> ps_ctxt->u1_max_inter_tr_depth)),
5357             MIN(MAX_TU_SIZE, cu_size),
5358             ps_inter_cand->b1_skip_flag);
5359     }
5360 
5361     ASSERT(ihevce_tu_tree_coverage_in_cu(as_tu_nodes) == cu_size * cu_size);
5362 
5363 #if ENABLE_INTER_ZCU_COST
5364     ps_ctxt->i8_cu_not_coded_cost = 0;
5365 #endif
5366 
5367     s_buffer_data_for_tu.s_src_pred_rec_buf_luma.pv_src = pv_src;
5368     s_buffer_data_for_tu.s_src_pred_rec_buf_luma.pv_pred = pu1_pred;
5369     s_buffer_data_for_tu.s_src_pred_rec_buf_luma.pv_recon =
5370         ps_final_prms->s_recon_datastore.apv_luma_recon_bufs[0];
5371     s_buffer_data_for_tu.s_src_pred_rec_buf_luma.i4_src_stride = src_strd;
5372     s_buffer_data_for_tu.s_src_pred_rec_buf_luma.i4_pred_stride = pred_stride;
5373     s_buffer_data_for_tu.s_src_pred_rec_buf_luma.i4_recon_stride =
5374         ps_final_prms->s_recon_datastore.i4_lumaRecon_stride;
5375     s_buffer_data_for_tu.s_src_pred_rec_buf_chroma.pv_src = ps_chrm_cu_buf_prms->pu1_curr_src;
5376     s_buffer_data_for_tu.s_src_pred_rec_buf_chroma.pv_pred =
5377         ps_ctxt->s_cu_me_intra_pred_prms.pu1_pred_data[CU_ME_INTRA_PRED_CHROMA_IDX] +
5378         curr_buf_idx * ((MAX_CTB_SIZE * MAX_CTB_SIZE >> 1) + ((ps_ctxt->u1_chroma_array_type == 2) *
5379                                                               (MAX_CTB_SIZE * MAX_CTB_SIZE >> 1)));
5380     s_buffer_data_for_tu.s_src_pred_rec_buf_chroma.pv_recon =
5381         ps_final_prms->s_recon_datastore.apv_chroma_recon_bufs[0];
5382     s_buffer_data_for_tu.s_src_pred_rec_buf_chroma.i4_src_stride =
5383         ps_chrm_cu_buf_prms->i4_chrm_src_stride;
5384     s_buffer_data_for_tu.s_src_pred_rec_buf_chroma.i4_pred_stride =
5385         ps_ctxt->s_cu_me_intra_pred_prms.ai4_pred_data_stride[CU_ME_INTRA_PRED_CHROMA_IDX];
5386     s_buffer_data_for_tu.s_src_pred_rec_buf_chroma.i4_recon_stride =
5387         ps_final_prms->s_recon_datastore.i4_chromaRecon_stride;
5388     s_buffer_data_for_tu.ps_nbr_data_buf = ps_nbr_4x4;
5389     s_buffer_data_for_tu.pi2_deq_data = pi2_deq_data;
5390     s_buffer_data_for_tu.pi2_deq_data_chroma =
5391         pi2_deq_data + ps_final_prms->i4_chrm_deq_coeff_strt_idx;
5392     s_buffer_data_for_tu.i4_nbr_data_buf_stride = num_4x4_in_cu;
5393     s_buffer_data_for_tu.i4_deq_data_stride = cu_size;
5394     s_buffer_data_for_tu.i4_deq_data_stride_chroma = cu_size;
5395     s_buffer_data_for_tu.ppu1_ecd = &pu1_ecd_data;
5396 
5397     if(INCLUDE_CHROMA_DURING_TU_RECURSION && (ps_ctxt->i4_quality_preset <= IHEVCE_QUALITY_P0))
5398     {
5399         UWORD8 i;
5400 
5401         UWORD8 *pu1_pred = (UWORD8 *)s_buffer_data_for_tu.s_src_pred_rec_buf_chroma.pv_pred;
5402 
5403         for(i = 0; i < (!!ps_inter_cand->b3_part_size) + 1; i++)
5404         {
5405             pu_t *ps_pu;
5406 
5407             WORD32 inter_pu_wd;
5408             WORD32 inter_pu_ht;
5409 
5410             ps_pu = ps_inter_cand->as_inter_pu + i;
5411 
5412             inter_pu_wd = (ps_pu->b4_wd + 1) << 2; /* cb and cr pixel interleaved */
5413             inter_pu_ht = ((ps_pu->b4_ht + 1) << 2) >> 1;
5414             inter_pu_ht <<= (ps_ctxt->u1_chroma_array_type == 2);
5415             ihevce_chroma_inter_pred_pu(
5416                 &ps_ctxt->s_mc_ctxt,
5417                 ps_pu,
5418                 pu1_pred,
5419                 s_buffer_data_for_tu.s_src_pred_rec_buf_chroma.i4_pred_stride);
5420             if(!!ps_inter_cand->b3_part_size)
5421             {
5422                 /* 2Nx__ partion case */
5423                 if(inter_pu_wd == cu_size)
5424                 {
5425                     pu1_pred +=
5426                         (inter_pu_ht *
5427                          s_buffer_data_for_tu.s_src_pred_rec_buf_chroma.i4_pred_stride);
5428                 }
5429 
5430                 /* __x2N partion case */
5431                 if(inter_pu_ht == (cu_size >> !(ps_ctxt->u1_chroma_array_type == 2)))
5432                 {
5433                     pu1_pred += inter_pu_wd;
5434                 }
5435             }
5436         }
5437     }
5438 
5439 #if !ENABLE_TOP_DOWN_TU_RECURSION
5440     total_rdopt_cost = ihevce_tu_tree_selector(
5441         ps_ctxt,
5442         as_tu_nodes,
5443         &s_buffer_data_for_tu,
5444         &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[curr_buf_idx]
5445              .s_cabac_ctxt.au1_ctxt_models[0],
5446         recon_func_mode,
5447 #if USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
5448         i4_alpha_stim_multiplier,
5449         u1_is_cu_noisy,
5450 #endif
5451         0,
5452         ps_ctxt->u1_max_inter_tr_depth,
5453         ps_inter_cand->b3_part_size,
5454         u1_compute_spatial_ssd);
5455 #else
5456     total_rdopt_cost = ihevce_topDown_tu_tree_selector(
5457         ps_ctxt,
5458         as_tu_nodes,
5459         &s_buffer_data_for_tu,
5460         &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[curr_buf_idx]
5461              .s_cabac_ctxt.au1_ctxt_models[0],
5462         recon_func_mode,
5463 #if USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
5464         i4_alpha_stim_multiplier,
5465         u1_is_cu_noisy,
5466 #endif
5467         0,
5468         ps_ctxt->u1_max_inter_tr_depth,
5469         ps_inter_cand->b3_part_size,
5470         INCLUDE_CHROMA_DURING_TU_RECURSION && (ps_ctxt->i4_quality_preset <= IHEVCE_QUALITY_P0),
5471         u1_compute_spatial_ssd);
5472 #endif
5473 
5474     ps_final_prms->u2_num_tus_in_cu = 0;
5475     ps_final_prms->u4_cu_luma_res_bits = 0;
5476     ps_final_prms->u4_cu_sad = 0;
5477     total_rdopt_cost = 0;
5478     ecd_data_bytes_cons = 0;
5479     cu_bits = 0;
5480 #if ENABLE_INTER_ZCU_COST
5481     ps_ctxt->i8_cu_not_coded_cost = 0;
5482 #endif
5483     ps_final_prms->u1_is_cu_coded = 0;
5484     ps_final_prms->u1_cu_size = cu_size;
5485 
5486     ihevce_tu_selector_debriefer(
5487         as_tu_nodes,
5488         ps_final_prms,
5489         &total_rdopt_cost,
5490 #if ENABLE_INTER_ZCU_COST
5491         &ps_ctxt->i8_cu_not_coded_cost,
5492 #endif
5493         &ecd_data_bytes_cons,
5494         &cu_bits,
5495         &ps_final_prms->u2_num_tus_in_cu,
5496         ps_ctxt->i4_cu_qp,
5497         cu_pos_x * 4,
5498         cu_pos_y * 4,
5499         INCLUDE_CHROMA_DURING_TU_RECURSION && (ps_ctxt->i4_quality_preset <= IHEVCE_QUALITY_P0),
5500         (ps_ctxt->u1_chroma_array_type == 2),
5501         POS_TL);
5502 
5503     if(!(INCLUDE_CHROMA_DURING_TU_RECURSION && (ps_ctxt->i4_quality_preset <= IHEVCE_QUALITY_P0)))
5504     {
5505         ps_final_prms->i4_chrm_cu_coeff_strt_idx = ecd_data_bytes_cons;
5506     }
5507 
5508     /* Modify the cost function for this CU. */
5509     /* loop in for 8x8 blocks */
5510     if(ps_ctxt->u1_enable_psyRDOPT)
5511     {
5512         UWORD8 *pu1_recon_cu;
5513         WORD32 recon_stride;
5514         WORD32 curr_pos_x;
5515         WORD32 curr_pos_y;
5516         WORD32 start_index;
5517         WORD32 num_horz_cu_in_ctb;
5518         WORD32 had_block_size;
5519 
5520         /* tODO: sreenivasa ctb size has to be used appropriately */
5521         had_block_size = 8;
5522         num_horz_cu_in_ctb = 64 / had_block_size;
5523 
5524         curr_pos_x = cu_pos_x << 2; /* pel units */
5525         curr_pos_y = cu_pos_y << 2; /* pel units */
5526         recon_stride = ps_final_prms->s_recon_datastore.i4_lumaRecon_stride;
5527         pu1_recon_cu = ((UWORD8 *)ps_final_prms->s_recon_datastore
5528                             .apv_luma_recon_bufs[0]);  // already pointing to the current CU recon
5529         //+ \curr_pos_x + curr_pos_y * recon_stride;
5530 
5531         /* start index to index the source satd of curr cu int he current ctb*/
5532         start_index =
5533             (curr_pos_x / had_block_size) + (curr_pos_y / had_block_size) * num_horz_cu_in_ctb;
5534 
5535         {
5536             total_rdopt_cost += ihevce_psy_rd_cost(
5537                 ps_ctxt->ai4_source_satd_8x8,
5538                 pu1_recon_cu,
5539                 recon_stride,
5540                 1,  //howz stride
5541                 cu_size,
5542                 0,  // pic type
5543                 0,  //layer id
5544                 ps_ctxt->i4_satd_lamda,  // lambda
5545                 start_index,
5546                 ps_ctxt->u1_is_input_data_hbd,
5547                 ps_ctxt->u4_psy_strength,
5548                 &ps_ctxt->s_cmn_opt_func);  // 8 bit
5549         }
5550     }
5551 
5552     ps_final_prms->u1_chroma_intra_pred_mode = 4;
5553 
5554     /* update the bytes consumed */
5555     ps_final_prms->i4_num_bytes_ecd_data = ecd_data_bytes_cons;
5556 
5557     /* store the current cu size to final prms */
5558     ps_final_prms->u1_cu_size = cu_size;
5559     /* ------------- Chroma processing -------------- */
5560     /* Chroma rdopt eval for each luma candidate only for HIGH QUALITY/MEDIUM SPEDD preset*/
5561     if(ps_ctxt->s_chroma_rdopt_ctxt.u1_eval_chrm_rdopt &&
5562        !(INCLUDE_CHROMA_DURING_TU_RECURSION && (ps_ctxt->i4_quality_preset <= IHEVCE_QUALITY_P0)))
5563     {
5564         LWORD64 chrm_rdopt_cost;
5565         WORD32 chrm_rdopt_tu_bits;
5566 
5567         /* Store the current RDOPT cost to enable early exit in chrom_prcs */
5568         ps_ctxt->as_cu_prms[curr_buf_idx].i8_curr_rdopt_cost = total_rdopt_cost;
5569 
5570         chrm_rdopt_cost = ihevce_chroma_cu_prcs_rdopt(
5571             ps_ctxt,
5572             curr_buf_idx,
5573             0, /* TU mode : Don't care in Inter patrh */
5574             ps_chrm_cu_buf_prms->pu1_curr_src,
5575             ps_chrm_cu_buf_prms->i4_chrm_src_stride,
5576             ps_chrm_cu_buf_prms->pu1_cu_left,
5577             ps_chrm_cu_buf_prms->pu1_cu_top,
5578             ps_chrm_cu_buf_prms->pu1_cu_top_left,
5579             ps_chrm_cu_buf_prms->i4_cu_left_stride,
5580             (cu_pos_x >> 1),
5581             (cu_pos_y >> 1),
5582             &chrm_rdopt_tu_bits,
5583             i4_alpha_stim_multiplier,
5584             u1_is_cu_noisy);
5585 
5586 #if WEIGH_CHROMA_COST
5587         chrm_rdopt_cost = (LWORD64)(
5588             (chrm_rdopt_cost * ps_ctxt->u4_chroma_cost_weighing_factor +
5589              (1 << (CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT - 1))) >>
5590             CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT);
5591 #endif
5592 
5593 #if CHROMA_RDOPT_ENABLE
5594         total_rdopt_cost += chrm_rdopt_cost;
5595 #endif
5596         cu_bits += chrm_rdopt_tu_bits;
5597 
5598         /* during chroma evaluation if skip decision was over written     */
5599         /* then the current skip candidate is set to a non skip candidate */
5600         ps_inter_cand->b1_skip_flag = ps_final_prms->u1_skip_flag;
5601 
5602         /* cu bits for chroma residual if chroma rdopt is on       */
5603         /* if zero_cbf eval is disabled then cu bits will be zero  */
5604         ps_final_prms->u4_cu_chroma_res_bits = chrm_rdopt_tu_bits;
5605 
5606         if(ps_ctxt->i4_bitrate_instance_num || ps_ctxt->i4_num_bitrates == 1)
5607         {
5608             /* Early exit : If the current running cost exceeds
5609             the prev. best mode cost, break */
5610             if(total_rdopt_cost > prev_best_rdopt_cost)
5611             {
5612                 return (total_rdopt_cost);
5613             }
5614         }
5615     }
5616     else
5617     {}
5618 
5619 #if SHRINK_INTER_TUTREE
5620     /* ------------- Quadtree TU split  optimization ------------  */
5621     if(ps_final_prms->u1_is_cu_coded)
5622     {
5623         ps_final_prms->u2_num_tus_in_cu = ihevce_shrink_inter_tu_tree(
5624             &ps_final_prms->as_tu_enc_loop[0],
5625             &ps_final_prms->as_tu_enc_loop_temp_prms[0],
5626             &ps_final_prms->s_recon_datastore,
5627             ps_final_prms->u2_num_tus_in_cu,
5628             (ps_ctxt->u1_chroma_array_type == 2));
5629     }
5630 #endif
5631 
5632     /* RDOPT copy States :  Best after all luma TUs (and chroma,if enabled)to current */
5633     COPY_CABAC_STATES_FRM_CAB_COEFFX_PREFIX(
5634         &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[curr_buf_idx]
5635                 .s_cabac_ctxt.au1_ctxt_models[0] +
5636             IHEVC_CAB_COEFFX_PREFIX,
5637         &ps_ctxt->au1_rdopt_init_ctxt_models[0] + IHEVC_CAB_COEFFX_PREFIX,
5638         IHEVC_CAB_CTXT_END - IHEVC_CAB_COEFFX_PREFIX);
5639 
5640     /* -------- Bit estimate for RD opt -------------- */
5641     {
5642         nbr_avail_flags_t s_nbr;
5643         /*cbf_bits will account for both texture and cbf bits when zero cbf eval flag is 0*/
5644         WORD32 cbf_bits, header_bits;
5645 
5646         /* get the neighbour availability flags for current cu  */
5647         ihevce_get_only_nbr_flag(
5648             &s_nbr,
5649             ps_ctxt->pu1_ctb_nbr_map,
5650             ps_ctxt->i4_nbr_map_strd,
5651             cu_pos_x,
5652             cu_pos_y,
5653             (cu_size >> 2),
5654             (cu_size >> 2));
5655 
5656         /* call the entropy rdo encode to get the bit estimate for current cu */
5657         header_bits = ihevce_entropy_rdo_encode_cu(
5658             &ps_ctxt->s_rdopt_entropy_ctxt,
5659             ps_final_prms,
5660             (cu_pos_x >> 1), /*  back to 8x8 pel units   */
5661             (cu_pos_y >> 1), /*  back to 8x8 pel units   */
5662             cu_size,
5663             ps_ctxt->u1_disable_intra_eval ? !DISABLE_TOP_SYNC && s_nbr.u1_top_avail
5664                                            : s_nbr.u1_top_avail,
5665             s_nbr.u1_left_avail,
5666             &ps_final_prms->pu1_cu_coeffs[0],
5667             &cbf_bits);
5668 
5669         cu_bits += header_bits;
5670 
5671         /* cbf bits are excluded from header bits, instead considered as texture bits */
5672         /* incase if zero cbf eval is disabled then texture bits gets added here */
5673         ps_final_prms->u4_cu_hdr_bits = (header_bits - cbf_bits);
5674         ps_final_prms->u4_cu_cbf_bits = cbf_bits;
5675 
5676 #if RDOPT_ENABLE
5677         /* add the cost of coding the header bits */
5678         total_rdopt_cost +=
5679             COMPUTE_RATE_COST_CLIP30(header_bits, ps_ctxt->i8_cl_ssd_lambda_qf, LAMBDA_Q_SHIFT);
5680 
5681 #if ENABLE_INTER_ZCU_COST
5682         /* If cu is coded, Evaluate not coded cost and check if it improves over coded cost */
5683         if(ps_final_prms->u1_is_cu_coded && (ZCBF_ENABLE == ps_ctxt->i4_zcbf_rdo_level))
5684         {
5685             LWORD64 i8_cu_not_coded_cost = ps_ctxt->i8_cu_not_coded_cost;
5686 
5687             WORD32 is_2nx2n_mergecu = (SIZE_2Nx2N == ps_final_prms->u1_part_mode) &&
5688                                       (1 == ps_final_prms->as_pu_enc_loop[0].b1_merge_flag);
5689 
5690             cab_ctxt_t *ps_cab_ctxt =
5691                 &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[curr_buf_idx].s_cabac_ctxt;
5692 
5693             /* Read header bits generatated after ihevce_entropy_rdo_encode_cu() call  */
5694             UWORD32 u4_cu_hdr_bits_q12 = ps_cab_ctxt->u4_header_bits_estimated_q12;
5695 
5696             /* account for coding qt_root_cbf = 0 */
5697             /* First subtract cost for coding as 1 (part of header bits) and then add cost for coding as 0 */
5698             u4_cu_hdr_bits_q12 += gau2_ihevce_cabac_bin_to_bits[u1_qtroot_cbf_cabac_model ^ 0];
5699             if(u4_cu_hdr_bits_q12 < gau2_ihevce_cabac_bin_to_bits[u1_qtroot_cbf_cabac_model ^ 1])
5700                 u4_cu_hdr_bits_q12 = 0;
5701             else
5702                 u4_cu_hdr_bits_q12 -= gau2_ihevce_cabac_bin_to_bits[u1_qtroot_cbf_cabac_model ^ 1];
5703 
5704             /* add the cost of coding the header bits */
5705             i8_cu_not_coded_cost += COMPUTE_RATE_COST_CLIP30(
5706                 u4_cu_hdr_bits_q12 /* ps_final_prms->u4_cu_hdr_bits */,
5707                 ps_ctxt->i8_cl_ssd_lambda_qf,
5708                 (LAMBDA_Q_SHIFT + CABAC_FRAC_BITS_Q));
5709 
5710             if(ps_ctxt->u1_enable_psyRDOPT)
5711             {
5712                 i8_cu_not_coded_cost = total_rdopt_cost + 1;
5713             }
5714 
5715             /* Evaluate qtroot cbf rdo; exclude 2Nx2N Merge as skip cu is explicitly evaluated */
5716             if((i8_cu_not_coded_cost <= total_rdopt_cost) && (!is_2nx2n_mergecu))
5717             {
5718                 WORD32 tx_size;
5719 
5720                 /* force cu as not coded and update the cost */
5721                 ps_final_prms->u1_is_cu_coded = 0;
5722                 ps_final_prms->s_recon_datastore.au1_is_chromaRecon_available[0] = 0;
5723                 ps_final_prms->s_recon_datastore.u1_is_lumaRecon_available = 0;
5724 
5725                 total_rdopt_cost = i8_cu_not_coded_cost;
5726 
5727                 /* reset num TUs to 1 unless cu size id 64 */
5728                 ps_final_prms->u2_num_tus_in_cu = (64 == cu_size) ? 4 : 1;
5729                 trans_size = (64 == cu_size) ? 32 : cu_size;
5730                 GETRANGE(tx_size, trans_size);
5731 
5732                 /* reset the bytes consumed */
5733                 ps_final_prms->i4_num_bytes_ecd_data = 0;
5734 
5735                 /* reset texture related bits and roll back header bits*/
5736                 ps_final_prms->u4_cu_cbf_bits = 0;
5737                 ps_final_prms->u4_cu_luma_res_bits = 0;
5738                 ps_final_prms->u4_cu_chroma_res_bits = 0;
5739                 ps_final_prms->u4_cu_hdr_bits =
5740                     (u4_cu_hdr_bits_q12 + (1 << (CABAC_FRAC_BITS_Q - 1))) >> CABAC_FRAC_BITS_Q;
5741 
5742                 /* update cabac model with qtroot cbf = 0 decision */
5743                 ps_cab_ctxt->au1_ctxt_models[IHEVC_CAB_NORES_IDX] =
5744                     gau1_ihevc_next_state[u1_qtroot_cbf_cabac_model << 1];
5745 
5746                 /* restore untouched cabac models for, tusplit, cbfs, texture etc */
5747                 memcpy(
5748                     &ps_cab_ctxt->au1_ctxt_models[IHEVC_CAB_SPLIT_TFM],
5749                     &au1_rdopt_init_ctxt_models[IHEVC_CAB_SPLIT_TFM],
5750                     (IHEVC_CAB_CTXT_END - IHEVC_CAB_SPLIT_TFM));
5751 
5752                 /* mark all tus as not coded for final eval */
5753                 for(ctr = 0; ctr < ps_final_prms->u2_num_tus_in_cu; ctr++)
5754                 {
5755                     WORD32 curr_pos_x = (ctr & 0x1) ? (trans_size >> 2) : 0;
5756                     WORD32 curr_pos_y = (ctr & 0x2) ? (trans_size >> 2) : 0;
5757 
5758                     nbr_4x4_t *ps_cur_nbr_4x4 =
5759                         ps_nbr_4x4 + curr_pos_x + (curr_pos_y * num_4x4_in_cu);
5760 
5761                     num_4x4_in_tu = trans_size >> 2;
5762 
5763                     ps_final_prms->as_tu_enc_loop_temp_prms[ctr].i2_luma_bytes_consumed = 0;
5764                     ps_final_prms->as_tu_enc_loop_temp_prms[ctr].ai2_cb_bytes_consumed[0] = 0;
5765                     ps_final_prms->as_tu_enc_loop_temp_prms[ctr].ai2_cr_bytes_consumed[0] = 0;
5766 
5767                     ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_y_cbf = 0;
5768                     ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_cr_cbf = 0;
5769                     ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_cb_cbf = 0;
5770 
5771                     ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_cr_cbf_subtu1 = 0;
5772                     ps_final_prms->as_tu_enc_loop[ctr].s_tu.b1_cb_cbf_subtu1 = 0;
5773 
5774                     ps_final_prms->as_tu_enc_loop[ctr].s_tu.b3_size = tx_size - 3;
5775                     ps_final_prms->as_tu_enc_loop[ctr].s_tu.b4_pos_x = cu_pos_x + curr_pos_x;
5776                     ps_final_prms->as_tu_enc_loop[ctr].s_tu.b4_pos_y = cu_pos_y + curr_pos_y;
5777 
5778                     /* reset cbf for the all 4x4 in TU */
5779                     {
5780                         WORD32 i, j;
5781                         nbr_4x4_t *ps_tmp_4x4;
5782                         ps_tmp_4x4 = ps_cur_nbr_4x4;
5783 
5784                         for(i = 0; i < num_4x4_in_tu; i++)
5785                         {
5786                             for(j = 0; j < num_4x4_in_tu; j++)
5787                             {
5788                                 ps_tmp_4x4[j].b1_y_cbf = 0;
5789                             }
5790                             /* row level update*/
5791                             ps_tmp_4x4 += num_4x4_in_cu;
5792                         }
5793                     }
5794                 }
5795             }
5796         }
5797 #endif /* ENABLE_INTER_ZCU_COST */
5798 
5799 #endif /* RDOPT_ENABLE */
5800     }
5801 
5802     return (total_rdopt_cost);
5803 }
5804 #endif
5805 
5806 /*!
5807 ******************************************************************************
5808 * \if Function name : ihevce_inter_rdopt_cu_mc_mvp \endif
5809 *
5810 * \brief
5811 *    Inter coding unit function which performs MC and MVP calculation for RD opt mode
5812 *
5813 * \param[in] ps_ctxt       enc_loop module ctxt pointer
5814 * \param[in] ps_inter_cand pointer to inter candidate structure
5815 * \param[in] cu_size         Current CU size
5816 * \param[in] cu_pos_x        cu position x w.r.t. the ctb
5817 * \param[in] cu_pos_y        cu position y w.r.t. the ctb
5818 * \param[in] ps_left_nbr_4x4 Left neighbour 4x4 structure pointer
5819 * \param[in] ps_top_nbr_4x4  top neighbour 4x4 structure pointer
5820 * \param[in] ps_topleft_nbr_4x4  top left neighbour 4x4 structure pointer
5821 * \param[in] nbr_4x4_left_strd  left neighbour 4x4 buffer stride
5822 * \param[in] curr_buf_idx Current Buffer index
5823 *
5824 * \return
5825 *    Rdopt cost
5826 *
5827 * \author
5828 *  Ittiam
5829 *
5830 *****************************************************************************
5831 */
ihevce_inter_rdopt_cu_mc_mvp(ihevce_enc_loop_ctxt_t * ps_ctxt,cu_inter_cand_t * ps_inter_cand,WORD32 cu_size,WORD32 cu_pos_x,WORD32 cu_pos_y,nbr_4x4_t * ps_left_nbr_4x4,nbr_4x4_t * ps_top_nbr_4x4,nbr_4x4_t * ps_topleft_nbr_4x4,WORD32 nbr_4x4_left_strd,WORD32 curr_buf_idx)5832 LWORD64 ihevce_inter_rdopt_cu_mc_mvp(
5833     ihevce_enc_loop_ctxt_t *ps_ctxt,
5834     cu_inter_cand_t *ps_inter_cand,
5835     WORD32 cu_size,
5836     WORD32 cu_pos_x,
5837     WORD32 cu_pos_y,
5838     nbr_4x4_t *ps_left_nbr_4x4,
5839     nbr_4x4_t *ps_top_nbr_4x4,
5840     nbr_4x4_t *ps_topleft_nbr_4x4,
5841     WORD32 nbr_4x4_left_strd,
5842     WORD32 curr_buf_idx)
5843 {
5844     /* local variables */
5845     enc_loop_cu_final_prms_t *ps_final_prms;
5846     nbr_avail_flags_t s_nbr;
5847     nbr_4x4_t *ps_nbr_4x4;
5848 
5849     UWORD8 au1_is_top_used[2][MAX_MVP_LIST_CAND];
5850     UWORD8 *pu1_pred;
5851     WORD32 rdopt_cost;
5852     WORD32 ctr;
5853     WORD32 num_cu_part;
5854     WORD32 inter_pu_wd;
5855     WORD32 inter_pu_ht;
5856     WORD32 pred_stride;
5857 
5858     /* get the pointers based on curbuf idx */
5859     ps_nbr_4x4 = &ps_ctxt->as_cu_nbr[curr_buf_idx][0];
5860     ps_final_prms = &ps_ctxt->as_cu_prms[curr_buf_idx];
5861     pu1_pred = ps_inter_cand->pu1_pred_data;
5862 
5863     pred_stride = ps_inter_cand->i4_pred_data_stride;
5864 
5865     /* store the partition mode in final prms */
5866     ps_final_prms->u1_part_mode = ps_inter_cand->b3_part_size;
5867 
5868     /* since encoder does not support NXN part type */
5869     /* num parts can be either 1 or 2 only          */
5870     ASSERT(SIZE_NxN != ps_inter_cand->b3_part_size);
5871 
5872     num_cu_part = (SIZE_2Nx2N != ps_inter_cand->b3_part_size) + 1;
5873 
5874     /* get the 4x4 level position of current cu */
5875     cu_pos_x = cu_pos_x << 1;
5876     cu_pos_y = cu_pos_y << 1;
5877 
5878     /* populate cu level params */
5879     ps_final_prms->u1_intra_flag = PRED_MODE_INTER;
5880     ps_final_prms->u2_num_pus_in_cu = num_cu_part;
5881 
5882     /* run a loop over all the partitons in cu */
5883     for(ctr = 0; ctr < num_cu_part; ctr++)
5884     {
5885         pu_mv_t as_pred_mv[MAX_MVP_LIST_CAND];
5886         pu_t *ps_pu;
5887         WORD32 skip_or_merge_flag;
5888         UWORD8 u1_use_mvp_from_top_row;
5889 
5890         ps_pu = &ps_inter_cand->as_inter_pu[ctr];
5891 
5892         /* IF AMP then each partitions can have diff wd ht */
5893         inter_pu_wd = (ps_pu->b4_wd + 1) << 2;
5894         inter_pu_ht = (ps_pu->b4_ht + 1) << 2;
5895 
5896         /* populate reference pic buf id for bs compute */
5897 
5898         /* L0 */
5899         if(-1 != ps_pu->mv.i1_l0_ref_idx)
5900         {
5901             ps_pu->mv.i1_l0_ref_pic_buf_id =
5902                 ps_ctxt->s_mv_pred_ctxt.ps_ref_list[0][ps_pu->mv.i1_l0_ref_idx]->i4_buf_id;
5903         }
5904 
5905         /* L1 */
5906         if(-1 != ps_pu->mv.i1_l1_ref_idx)
5907         {
5908             ps_pu->mv.i1_l1_ref_pic_buf_id =
5909                 ps_ctxt->s_mv_pred_ctxt.ps_ref_list[1][ps_pu->mv.i1_l1_ref_idx]->i4_buf_id;
5910         }
5911 
5912         /* SKIP or merge check for every part */
5913         skip_or_merge_flag = ps_inter_cand->b1_skip_flag | ps_pu->b1_merge_flag;
5914 
5915         /* ----------- MV Prediction ----------------- */
5916         if(0 == skip_or_merge_flag)
5917         {
5918             /* get the neighbour availability flags */
5919             ihevce_get_only_nbr_flag(
5920                 &s_nbr,
5921                 ps_ctxt->pu1_ctb_nbr_map,
5922                 ps_ctxt->i4_nbr_map_strd,
5923                 cu_pos_x,
5924                 cu_pos_y,
5925                 inter_pu_wd >> 2,
5926                 inter_pu_ht >> 2);
5927 
5928             if(ps_ctxt->u1_disable_intra_eval && DISABLE_TOP_SYNC && (ps_pu->b4_pos_y == 0))
5929             {
5930                 u1_use_mvp_from_top_row = 0;
5931             }
5932             else
5933             {
5934                 u1_use_mvp_from_top_row = 1;
5935             }
5936 
5937             if(!u1_use_mvp_from_top_row)
5938             {
5939                 if(s_nbr.u1_top_avail || s_nbr.u1_top_lt_avail || s_nbr.u1_top_rt_avail)
5940                 {
5941                     if(!s_nbr.u1_left_avail && !s_nbr.u1_bot_lt_avail)
5942                     {
5943                         WORD32 curr_cu_pos_in_row, cu_top_right_offset, cu_top_right_dep_pos;
5944 
5945                         /* Ensure Top Right Sync */
5946                         if(!ps_ctxt->u1_use_top_at_ctb_boundary)
5947                         {
5948                             curr_cu_pos_in_row =
5949                                 ps_ctxt->s_mc_ctxt.i4_ctb_frm_pos_x + (cu_pos_x << 2);
5950 
5951                             if(ps_ctxt->s_mc_ctxt.i4_ctb_frm_pos_y == 0)
5952                             {
5953                                 /* No wait for 1st row */
5954                                 cu_top_right_offset = -(MAX_CTB_SIZE);
5955                                 {
5956                                     ihevce_tile_params_t *ps_col_tile_params =
5957                                         ((ihevce_tile_params_t *)ps_ctxt->pv_tile_params_base +
5958                                          ps_ctxt->i4_tile_col_idx);
5959 
5960                                     /* No wait for 1st row */
5961                                     cu_top_right_offset =
5962                                         -(ps_col_tile_params->i4_first_sample_x + (MAX_CTB_SIZE));
5963                                 }
5964                                 cu_top_right_dep_pos = 0;
5965                             }
5966                             else
5967                             {
5968                                 cu_top_right_offset = (cu_size) + 4;
5969                                 cu_top_right_dep_pos =
5970                                     (ps_ctxt->s_mc_ctxt.i4_ctb_frm_pos_y >> 6) - 1;
5971                             }
5972 
5973                             ihevce_dmgr_chk_row_row_sync(
5974                                 ps_ctxt->pv_dep_mngr_enc_loop_cu_top_right,
5975                                 curr_cu_pos_in_row,
5976                                 cu_top_right_offset,
5977                                 cu_top_right_dep_pos,
5978                                 ps_ctxt->i4_tile_col_idx, /* Col Tile No. */
5979                                 ps_ctxt->thrd_id);
5980                         }
5981 
5982                         u1_use_mvp_from_top_row = 1;
5983                     }
5984                     else
5985                     {
5986                         s_nbr.u1_top_avail = 0;
5987                         s_nbr.u1_top_lt_avail = 0;
5988                         s_nbr.u1_top_rt_avail = 0;
5989                     }
5990                 }
5991                 else
5992                 {
5993                     u1_use_mvp_from_top_row = 1;
5994                 }
5995             }
5996             /* Call the MV prediction module to get MVP */
5997             ihevce_mv_pred(
5998                 &ps_ctxt->s_mv_pred_ctxt,
5999                 ps_top_nbr_4x4,
6000                 ps_left_nbr_4x4,
6001                 ps_topleft_nbr_4x4,
6002                 nbr_4x4_left_strd,
6003                 &s_nbr,
6004                 NULL, /* colocated MV */
6005                 ps_pu,
6006                 &as_pred_mv[0],
6007                 au1_is_top_used);
6008         }
6009 
6010         /* store the nbr 4x4 structure */
6011         ps_nbr_4x4->b1_skip_flag = ps_inter_cand->b1_skip_flag;
6012         ps_nbr_4x4->b1_intra_flag = 0;
6013         ps_nbr_4x4->b1_pred_l0_flag = 0;
6014         ps_nbr_4x4->b1_pred_l1_flag = 0;
6015 
6016         /* DC is default mode for inter cu, required for intra mode signalling */
6017         ps_nbr_4x4->b6_luma_intra_mode = 1;
6018 
6019         /* copy the motion vectors to neighbour structure */
6020         ps_nbr_4x4->mv = ps_pu->mv;
6021 
6022         /* copy the PU to final out pu */
6023         ps_final_prms->as_pu_enc_loop[ctr] = *ps_pu;
6024 
6025         /* copy the PU to chroma */
6026         ps_final_prms->as_pu_chrm_proc[ctr] = *ps_pu;
6027 
6028         /* store the skip flag to final prms */
6029         ps_final_prms->u1_skip_flag = ps_inter_cand->b1_skip_flag;
6030 
6031         /* MVP index & MVD calc is gated on skip/merge flag */
6032         if(0 == skip_or_merge_flag)
6033         {
6034             /* calculate the MVDs and popluate the MVP idx for L0 */
6035             if((PRED_BI == ps_pu->b2_pred_mode) || (PRED_L0 == ps_pu->b2_pred_mode))
6036             {
6037                 WORD32 idx0_cost, idx1_cost;
6038 
6039                 /* calculate the ABS mvd for cand 0 */
6040                 idx0_cost = abs(ps_pu->mv.s_l0_mv.i2_mvx - as_pred_mv[0].s_l0_mv.i2_mvx);
6041                 idx0_cost += abs(ps_pu->mv.s_l0_mv.i2_mvy - as_pred_mv[0].s_l0_mv.i2_mvy);
6042 
6043                 /* calculate the ABS mvd for cand 1 */
6044                 if(u1_use_mvp_from_top_row)
6045                 {
6046                     idx1_cost = abs(ps_pu->mv.s_l0_mv.i2_mvx - as_pred_mv[1].s_l0_mv.i2_mvx);
6047                     idx1_cost += abs(ps_pu->mv.s_l0_mv.i2_mvy - as_pred_mv[1].s_l0_mv.i2_mvy);
6048                 }
6049                 else
6050                 {
6051                     idx1_cost = INT_MAX;
6052                 }
6053 
6054                 /* based on the least cost choose the mvp idx */
6055                 if(idx0_cost <= idx1_cost)
6056                 {
6057                     ps_final_prms->as_pu_enc_loop[ctr].mv.s_l0_mv.i2_mvx -=
6058                         as_pred_mv[0].s_l0_mv.i2_mvx;
6059                     ps_final_prms->as_pu_enc_loop[ctr].mv.s_l0_mv.i2_mvy -=
6060                         as_pred_mv[0].s_l0_mv.i2_mvy;
6061 
6062                     ps_final_prms->as_pu_enc_loop[ctr].b1_l0_mvp_idx = 0;
6063                 }
6064                 else
6065                 {
6066                     ps_final_prms->as_pu_enc_loop[ctr].mv.s_l0_mv.i2_mvx -=
6067                         as_pred_mv[1].s_l0_mv.i2_mvx;
6068                     ps_final_prms->as_pu_enc_loop[ctr].mv.s_l0_mv.i2_mvy -=
6069                         as_pred_mv[1].s_l0_mv.i2_mvy;
6070 
6071                     ps_final_prms->as_pu_enc_loop[ctr].b1_l0_mvp_idx = 1;
6072                 }
6073 
6074                 /* set the pred l0 flag for neighbour storage */
6075                 ps_nbr_4x4->b1_pred_l0_flag = 1;
6076             }
6077             /* calculate the MVDs and popluate the MVP idx for L1 */
6078             if((PRED_BI == ps_pu->b2_pred_mode) || (PRED_L1 == ps_pu->b2_pred_mode))
6079             {
6080                 WORD32 idx0_cost, idx1_cost;
6081 
6082                 /* calculate the ABS mvd for cand 0 */
6083                 idx0_cost = abs(ps_pu->mv.s_l1_mv.i2_mvx - as_pred_mv[0].s_l1_mv.i2_mvx);
6084                 idx0_cost += abs(ps_pu->mv.s_l1_mv.i2_mvy - as_pred_mv[0].s_l1_mv.i2_mvy);
6085 
6086                 /* calculate the ABS mvd for cand 1 */
6087                 if(u1_use_mvp_from_top_row)
6088                 {
6089                     idx1_cost = abs(ps_pu->mv.s_l1_mv.i2_mvx - as_pred_mv[1].s_l1_mv.i2_mvx);
6090                     idx1_cost += abs(ps_pu->mv.s_l1_mv.i2_mvy - as_pred_mv[1].s_l1_mv.i2_mvy);
6091                 }
6092                 else
6093                 {
6094                     idx1_cost = INT_MAX;
6095                 }
6096 
6097                 /* based on the least cost choose the mvp idx */
6098                 if(idx0_cost <= idx1_cost)
6099                 {
6100                     ps_final_prms->as_pu_enc_loop[ctr].mv.s_l1_mv.i2_mvx -=
6101                         as_pred_mv[0].s_l1_mv.i2_mvx;
6102                     ps_final_prms->as_pu_enc_loop[ctr].mv.s_l1_mv.i2_mvy -=
6103                         as_pred_mv[0].s_l1_mv.i2_mvy;
6104 
6105                     ps_final_prms->as_pu_enc_loop[ctr].b1_l1_mvp_idx = 0;
6106                 }
6107                 else
6108                 {
6109                     ps_final_prms->as_pu_enc_loop[ctr].mv.s_l1_mv.i2_mvx -=
6110                         as_pred_mv[1].s_l1_mv.i2_mvx;
6111                     ps_final_prms->as_pu_enc_loop[ctr].mv.s_l1_mv.i2_mvy -=
6112                         as_pred_mv[1].s_l1_mv.i2_mvy;
6113 
6114                     ps_final_prms->as_pu_enc_loop[ctr].b1_l1_mvp_idx = 1;
6115                 }
6116 
6117                 /* set the pred l1 flag for neighbour storage */
6118                 ps_nbr_4x4->b1_pred_l1_flag = 1;
6119             }
6120 
6121             /* set the merge flag to 0 */
6122             ps_final_prms->as_pu_enc_loop[ctr].b1_merge_flag = 0;
6123             ps_final_prms->as_pu_enc_loop[ctr].b3_merge_idx = 0;
6124         }
6125         else
6126         {
6127             /* copy the merge index from candidate */
6128             ps_final_prms->as_pu_enc_loop[ctr].b1_merge_flag = ps_pu->b1_merge_flag;
6129 
6130             ps_final_prms->as_pu_enc_loop[ctr].b3_merge_idx = ps_pu->b3_merge_idx;
6131 
6132             if((PRED_BI == ps_pu->b2_pred_mode) || (PRED_L0 == ps_pu->b2_pred_mode))
6133             {
6134                 /* set the pred l0 flag for neighbour storage */
6135                 ps_nbr_4x4->b1_pred_l0_flag = 1;
6136             }
6137 
6138             /* calculate the MVDs and popluate the MVP idx for L1 */
6139             if((PRED_BI == ps_pu->b2_pred_mode) || (PRED_L1 == ps_pu->b2_pred_mode))
6140             {
6141                 /* set the pred l1 flag for neighbour storage */
6142                 ps_nbr_4x4->b1_pred_l1_flag = 1;
6143             }
6144         }
6145 
6146         /* RD opt cost computation is part of cu_ntu func hence here it is set to 0 */
6147         rdopt_cost = 0;
6148 
6149         /* copy the MV to colocated Mv structure */
6150         ps_final_prms->as_col_pu_enc_loop[ctr].s_l0_mv = ps_pu->mv.s_l0_mv;
6151         ps_final_prms->as_col_pu_enc_loop[ctr].s_l1_mv = ps_pu->mv.s_l1_mv;
6152         ps_final_prms->as_col_pu_enc_loop[ctr].i1_l0_ref_idx = ps_pu->mv.i1_l0_ref_idx;
6153         ps_final_prms->as_col_pu_enc_loop[ctr].i1_l1_ref_idx = ps_pu->mv.i1_l1_ref_idx;
6154         ps_final_prms->as_col_pu_enc_loop[ctr].b2_pred_mode = ps_pu->b2_pred_mode;
6155         ps_final_prms->as_col_pu_enc_loop[ctr].b1_intra_flag = 0;
6156 
6157         /* replicate neighbour 4x4 strcuture for entire partition */
6158         {
6159             WORD32 i, j;
6160             nbr_4x4_t *ps_tmp_4x4;
6161 
6162             ps_tmp_4x4 = ps_nbr_4x4;
6163 
6164             for(i = 0; i < (inter_pu_ht >> 2); i++)
6165             {
6166                 for(j = 0; j < (inter_pu_wd >> 2); j++)
6167                 {
6168                     ps_tmp_4x4[j] = *ps_nbr_4x4;
6169                 }
6170                 /* row level update*/
6171                 ps_tmp_4x4 += (cu_size >> 2);
6172             }
6173         }
6174         /* set the neighbour map to 1 */
6175         ihevce_set_inter_nbr_map(
6176             ps_ctxt->pu1_ctb_nbr_map,
6177             ps_ctxt->i4_nbr_map_strd,
6178             cu_pos_x,
6179             cu_pos_y,
6180             (inter_pu_wd >> 2),
6181             (inter_pu_ht >> 2),
6182             1);
6183         /* ----------- Motion Compensation for Luma ----------- */
6184 #if !ENABLE_MIXED_INTER_MODE_EVAL
6185         {
6186             IV_API_CALL_STATUS_T valid_mv_cand;
6187 
6188             /*If the inter candidate is neither merge cand nor skip cand
6189             then calculate the mc.*/
6190             if(0 == skip_or_merge_flag || (ps_ctxt->u1_high_speed_cu_dec_on))
6191             {
6192                 valid_mv_cand =
6193                     ihevce_luma_inter_pred_pu(&ps_ctxt->s_mc_ctxt, ps_pu, pu1_pred, pred_stride, 0);
6194 
6195                 /* assert if the MC is given a valid mv candidate */
6196                 ASSERT(valid_mv_cand == IV_SUCCESS);
6197             }
6198         }
6199 #endif
6200         if((2 == num_cu_part) && (0 == ctr))
6201         {
6202             /* 2Nx__ partion case */
6203             if(inter_pu_wd == cu_size)
6204             {
6205                 cu_pos_y += (inter_pu_ht >> 2);
6206                 pu1_pred += (inter_pu_ht * pred_stride);
6207                 ps_nbr_4x4 += (inter_pu_ht >> 2) * (cu_size >> 2);
6208                 ps_left_nbr_4x4 += (inter_pu_ht >> 2) * nbr_4x4_left_strd;
6209                 ps_top_nbr_4x4 = ps_nbr_4x4 - (cu_size >> 2);
6210                 ps_topleft_nbr_4x4 = ps_left_nbr_4x4 - nbr_4x4_left_strd;
6211             }
6212 
6213             /* __x2N partion case */
6214             if(inter_pu_ht == cu_size)
6215             {
6216                 cu_pos_x += (inter_pu_wd >> 2);
6217                 pu1_pred += inter_pu_wd;
6218                 ps_nbr_4x4 += (inter_pu_wd >> 2);
6219                 ps_left_nbr_4x4 = ps_nbr_4x4 - 1;
6220                 ps_top_nbr_4x4 += (inter_pu_wd >> 2);
6221                 ps_topleft_nbr_4x4 = ps_top_nbr_4x4 - 1;
6222                 nbr_4x4_left_strd = (cu_size >> 2);
6223             }
6224         }
6225     }
6226 
6227     return (rdopt_cost);
6228 }
6229 
6230 /*!
6231 ******************************************************************************
6232 * \if Function name : ihevce_intra_chroma_pred_mode_selector \endif
6233 *
6234 * \brief
6235 *    Coding unit processing function for chroma special modes (Non-Luma modes)
6236 *
6237 * \param[in] ps_ctxt       enc_loop module ctxt pointer
6238 * \param[in] ps_chrm_cu_buf_prms    ctxt having chroma related prms
6239 * \param[in] ps_cu_analyse      pointer to cu analyse
6240 * \param[in] rd_opt_curr_idx    index in the array of RDopt params
6241 * \param[in] tu_mode            TU_EQ_CU or other case
6242 *
6243 * \return
*    Stores the best SATD mode, its RDOPT cost, CABAC state, TU bits
6245 *
6246 * \author
6247 *  Ittiam
6248 *
6249 *****************************************************************************
6250 */
/*!
******************************************************************************
* \if Function name : ihevce_distortion_based_intra_chroma_mode_selector \endif
*
* \brief
*    Chooses an intra chroma prediction mode for a CU purely on distortion.
*    Four candidate modes (0, 1, 10 and 26) are each predicted for every
*    TU / sub-TU of the CU and the Hadamard SATD of Cb and Cr is accumulated;
*    the candidate with the least total is returned.
*
* \param[in] ps_cu_analyse       cu analyse pointer; b3_cu_pos_x, b3_cu_pos_y
*                                and u1_cu_size are read
* \param[in] pf_ref_substitution chroma reference array substitution function
* \param[in] ppf_chroma_ip       chroma intra pred functions, indexed through
*                                g_i4_ip_funcs[mode]
* \param[in] ppf_resd_trns_had   SATD functions, indexed by (u1_trans_idx - 1)
* \param[in] pu1_src             chroma source of the CU; Cr is read at +1 from
*                                Cb (presumably pixel-interleaved CbCr)
* \param[in] i4_src_stride       stride of pu1_src
* \param[out] pu1_pred           scratch buffer receiving every prediction
* \param[in] i4_pred_stride      stride of pu1_pred
* \param[in,out] pu1_ctb_nbr_map neighbour map; set to 1 TU by TU while a mode
*                                is evaluated, cleared to 0 for the whole CU
*                                after every mode
* \param[in] i4_nbr_map_strd     stride of pu1_ctb_nbr_map
* \param[out] pu1_ref_sub_out    scratch buffer for the substituted references
* \param[in] i4_alpha_stim_multiplier  see u1_is_cu_noisy
* \param[in] u1_is_cu_noisy      when this and i4_alpha_stim_multiplier are both
*                                non-zero, each plane's SATD is passed through
*                                ihevce_inject_stim_into_distortion
* \param[in] u1_trans_size       transform size handed to prediction; a TU
*                                column is (u1_trans_size << 1) bytes wide
* \param[in] u1_trans_idx        one-based index into ppf_resd_trns_had
* \param[in] u1_num_tus_in_cu    number of TUs evaluated (laid out 2 per row)
* \param[in] u1_num_4x4_luma_blks_in_tu  TU extent used for the neighbour map
* \param[in] u1_enable_psyRDOPT  forwarded to ihevce_inject_stim_into_distortion
* \param[in] u1_is_422           1 => two vertically stacked sub-TUs per TU and
*                                candidates are remapped through
*                                gau1_chroma422_intra_angle_mapping
*
* \return
*    The least-SATD mode (after the 4:2:2 remap when applicable). On a tie the
*    earlier candidate is kept. 0 is returned if no candidate scores below
*    INT_MAX.
*
* \author
*  Ittiam
*
*****************************************************************************
*/
UWORD8 ihevce_distortion_based_intra_chroma_mode_selector(
    cu_analyse_t *ps_cu_analyse,
    ihevc_intra_pred_chroma_ref_substitution_ft *pf_ref_substitution,
    pf_intra_pred *ppf_chroma_ip,
    pf_res_trans_luma_had_chroma *ppf_resd_trns_had,
    UWORD8 *pu1_src,
    WORD32 i4_src_stride,
    UWORD8 *pu1_pred,
    WORD32 i4_pred_stride,
    UWORD8 *pu1_ctb_nbr_map,
    WORD32 i4_nbr_map_strd,
    UWORD8 *pu1_ref_sub_out,
    WORD32 i4_alpha_stim_multiplier,
    UWORD8 u1_is_cu_noisy,
    UWORD8 u1_trans_size,
    UWORD8 u1_trans_idx,
    UWORD8 u1_num_tus_in_cu,
    UWORD8 u1_num_4x4_luma_blks_in_tu,
    UWORD8 u1_enable_psyRDOPT,
    UWORD8 u1_is_422)
{
    /* Candidate modes, evaluated in this order */
    static const UWORD8 au1_cand_modes[] = { 0, 1, 10, 26 };
    const WORD32 i4_num_cands = (WORD32)(sizeof(au1_cand_modes) / sizeof(au1_cand_modes[0]));

    /* CU position in 4x4 units */
    const UWORD8 u1_cu_x_4x4 = (ps_cu_analyse->b3_cu_pos_x << 1);
    const UWORD8 u1_cu_y_4x4 = (ps_cu_analyse->b3_cu_pos_y << 1);

    const WORD32 i4_sub_tu_count = u1_is_422 + 1;
    const WORD32 i4_tu_wd_bytes = u1_trans_size << 1;
    /* left references are read from the source buffer itself */
    const WORD32 i4_ref_left_strd = i4_src_stride;
    /* noise (stim) term is applied only for noisy CUs with a non-zero alpha */
    const WORD32 i4_use_stim = (u1_is_cu_noisy && i4_alpha_stim_multiplier);

    WORD32 i4_least_satd = INT_MAX;
    UWORD8 u1_selected_mode = 0;
    WORD32 i4_cand;

    for(i4_cand = 0; i4_cand < i4_num_cands; i4_cand++)
    {
        WORD32 i4_cand_satd = 0;
        WORD32 i4_tu;
        UWORD8 u1_mode = au1_cand_modes[i4_cand];

        if(1 == u1_is_422)
        {
            u1_mode = gau1_chroma422_intra_angle_mapping[u1_mode];
        }

        /* walk the TUs of the CU : 0 1 on the first row, 2 3 on the second */
        for(i4_tu = 0; i4_tu < u1_num_tus_in_cu; i4_tu++)
        {
            const WORD32 i4_tu_col = (i4_tu & 1);
            const WORD32 i4_tu_row = (i4_tu > 1);
            const WORD32 i4_tu_x_4x4 = u1_cu_x_4x4 + (i4_tu_col * u1_num_4x4_luma_blks_in_tu);
            const WORD32 i4_tu_y_4x4 = u1_cu_y_4x4 + (i4_tu_row * u1_num_4x4_luma_blks_in_tu);

            UWORD8 *pu1_tu_src = pu1_src + (i4_tu_col * i4_tu_wd_bytes) +
                                 ((i4_tu_row * u1_trans_size * i4_src_stride) << u1_is_422);
            UWORD8 *pu1_tu_pred = pu1_pred + (i4_tu_col * i4_tu_wd_bytes) +
                                  ((i4_tu_row * u1_trans_size * i4_pred_stride) << u1_is_422);

            WORD32 i4_luma_avail;
            WORD32 i4_sub_tu;

            i4_luma_avail = ihevce_get_nbr_intra_mxn_tu(
                pu1_ctb_nbr_map,
                i4_nbr_map_strd,
                i4_tu_x_4x4,
                i4_tu_y_4x4,
                u1_num_4x4_luma_blks_in_tu,
                u1_num_4x4_luma_blks_in_tu);

            for(i4_sub_tu = 0; i4_sub_tu < i4_sub_tu_count; i4_sub_tu++)
            {
                /* only sub-TU 1 is displaced, by one transform height */
                const WORD32 i4_is_lower_half = (1 == i4_sub_tu);

                UWORD8 *pu1_blk_src =
                    pu1_tu_src + (i4_is_lower_half * u1_trans_size * i4_src_stride);
                UWORD8 *pu1_blk_pred =
                    pu1_tu_pred + (i4_is_lower_half * u1_trans_size * i4_pred_stride);
                UWORD8 *pu1_ref_left = pu1_blk_src - 2;
                UWORD8 *pu1_ref_top = pu1_blk_src - i4_src_stride;
                UWORD8 *pu1_ref_top_left = pu1_ref_top - 2;

                WORD32 i4_avail_flags;
                WORD32 i4_ip_func_idx;
                pf_res_trans_luma_had_chroma pf_satd;

                i4_avail_flags = ihevce_get_intra_chroma_tu_nbr(
                    i4_luma_avail, i4_sub_tu, u1_trans_size, u1_is_422);

                /* build the reference array for this block */
                pf_ref_substitution(
                    pu1_ref_top_left,
                    pu1_ref_top,
                    pu1_ref_left,
                    i4_ref_left_strd,
                    u1_trans_size,
                    i4_avail_flags,
                    pu1_ref_sub_out,
                    1);

                /* map the mode to its prediction function and predict */
                i4_ip_func_idx = g_i4_ip_funcs[u1_mode];

                ppf_chroma_ip[i4_ip_func_idx](
                    pu1_ref_sub_out, 1, pu1_blk_pred, i4_pred_stride, u1_trans_size, u1_mode);

                pf_satd = ppf_resd_trns_had[u1_trans_idx - 1];

                if(i4_use_stim)
                {
                    WORD32 i4_plane_satd;

                    /* Cb : SATD followed by noise term injection */
                    i4_plane_satd = pf_satd(
                        pu1_blk_src, i4_src_stride, pu1_blk_pred, i4_pred_stride, NULL, 0);

                    i4_plane_satd = ihevce_inject_stim_into_distortion(
                        pu1_blk_src,
                        i4_src_stride,
                        pu1_blk_pred,
                        i4_pred_stride,
                        i4_plane_satd,
                        i4_alpha_stim_multiplier,
                        u1_trans_size,
                        0,
                        u1_enable_psyRDOPT,
                        U_PLANE);

                    i4_cand_satd += i4_plane_satd;

                    /* Cr : SATD followed by noise term injection */
                    i4_plane_satd = pf_satd(
                        pu1_blk_src + 1, i4_src_stride, pu1_blk_pred + 1, i4_pred_stride, NULL, 0);

                    /* un-offset pointers are passed here as well; the plane */
                    /* is conveyed through the last argument                 */
                    i4_plane_satd = ihevce_inject_stim_into_distortion(
                        pu1_blk_src,
                        i4_src_stride,
                        pu1_blk_pred,
                        i4_pred_stride,
                        i4_plane_satd,
                        i4_alpha_stim_multiplier,
                        u1_trans_size,
                        0,
                        u1_enable_psyRDOPT,
                        V_PLANE);

                    i4_cand_satd += i4_plane_satd;
                }
                else
                {
                    /* Cb */
                    i4_cand_satd += pf_satd(
                        pu1_blk_src, i4_src_stride, pu1_blk_pred, i4_pred_stride, NULL, 0);

                    /* Cr */
                    i4_cand_satd += pf_satd(
                        pu1_blk_src + 1, i4_src_stride, pu1_blk_pred + 1, i4_pred_stride, NULL, 0);
                }
            }

            /* mark this TU as available for the TUs that follow */
            ihevce_set_nbr_map(
                pu1_ctb_nbr_map,
                i4_nbr_map_strd,
                i4_tu_x_4x4,
                i4_tu_y_4x4,
                u1_num_4x4_luma_blks_in_tu,
                1);
        }

        /* undo the marking for the whole CU before the next candidate */
        ihevce_set_nbr_map(
            pu1_ctb_nbr_map,
            i4_nbr_map_strd,
            (ps_cu_analyse->b3_cu_pos_x << 1),
            (ps_cu_analyse->b3_cu_pos_y << 1),
            (ps_cu_analyse->u1_cu_size >> 2),
            0);

        /* strict comparison : the earlier candidate wins a tie */
        if(i4_cand_satd < i4_least_satd)
        {
            i4_least_satd = i4_cand_satd;
            u1_selected_mode = u1_mode;
        }
    }

    return u1_selected_mode;
}
6429 
ihevce_intra_chroma_pred_mode_selector(ihevce_enc_loop_ctxt_t * ps_ctxt,enc_loop_chrm_cu_buf_prms_t * ps_chrm_cu_buf_prms,cu_analyse_t * ps_cu_analyse,WORD32 rd_opt_curr_idx,WORD32 tu_mode,WORD32 i4_alpha_stim_multiplier,UWORD8 u1_is_cu_noisy)6430 void ihevce_intra_chroma_pred_mode_selector(
6431     ihevce_enc_loop_ctxt_t *ps_ctxt,
6432     enc_loop_chrm_cu_buf_prms_t *ps_chrm_cu_buf_prms,
6433     cu_analyse_t *ps_cu_analyse,
6434     WORD32 rd_opt_curr_idx,
6435     WORD32 tu_mode,
6436     WORD32 i4_alpha_stim_multiplier,
6437     UWORD8 u1_is_cu_noisy)
6438 {
6439     chroma_intra_satd_ctxt_t *ps_chr_intra_satd_ctxt;
6440 
6441     ihevc_intra_pred_chroma_ref_substitution_ft *ihevc_intra_pred_chroma_ref_substitution_fptr;
6442 
6443     UWORD8 *pu1_pred;
6444     WORD32 trans_size;
6445     WORD32 num_tus_in_cu;
6446     WORD32 pred_strd;
6447     WORD32 ctr;
6448     WORD32 i4_subtu_idx;
6449     WORD32 i4_num_sub_tus;
6450     WORD32 trans_idx;
6451     WORD32 scan_idx;
6452     WORD32 num_4x4_luma_in_tu;
6453     WORD32 cu_pos_x;
6454     WORD32 cu_pos_y;
6455 
6456     recon_datastore_t *aps_recon_datastore[2] = { &ps_ctxt->as_cu_prms[0].s_recon_datastore,
6457                                                   &ps_ctxt->as_cu_prms[1].s_recon_datastore };
6458 
6459     LWORD64 chrm_cod_cost = 0;
6460     WORD32 chrm_tu_bits = 0;
6461     WORD32 best_chrm_mode = DM_CHROMA_IDX;
6462     UWORD8 *pu1_chrm_src = ps_chrm_cu_buf_prms->pu1_curr_src;
6463     WORD32 chrm_src_stride = ps_chrm_cu_buf_prms->i4_chrm_src_stride;
6464     UWORD8 *pu1_cu_left = ps_chrm_cu_buf_prms->pu1_cu_left;
6465     UWORD8 *pu1_cu_top = ps_chrm_cu_buf_prms->pu1_cu_top;
6466     UWORD8 *pu1_cu_top_left = ps_chrm_cu_buf_prms->pu1_cu_top_left;
6467     WORD32 cu_left_stride = ps_chrm_cu_buf_prms->i4_cu_left_stride;
6468     WORD32 cu_size = ps_cu_analyse->u1_cu_size;
6469     WORD32 i4_perform_rdoq = ps_ctxt->s_rdoq_sbh_ctxt.i4_perform_all_cand_rdoq;
6470     WORD32 i4_perform_sbh = ps_ctxt->s_rdoq_sbh_ctxt.i4_perform_all_cand_sbh;
6471     UWORD8 u1_is_422 = (ps_ctxt->u1_chroma_array_type == 2);
6472 
6473     ihevc_intra_pred_chroma_ref_substitution_fptr =
6474         ps_ctxt->ps_func_selector->ihevc_intra_pred_chroma_ref_substitution_fptr;
6475     i4_num_sub_tus = (u1_is_422 == 1) + 1;
6476 
6477 #if DISABLE_RDOQ_INTRA
6478     i4_perform_rdoq = 0;
6479 #endif
6480 
6481     if(TU_EQ_CU == tu_mode)
6482     {
6483         num_tus_in_cu = 1;
6484         trans_size = cu_size >> 1;
6485         num_4x4_luma_in_tu = trans_size >> 1; /*at luma level*/
6486         ps_chr_intra_satd_ctxt = &ps_ctxt->s_chroma_rdopt_ctxt.as_chr_intra_satd_ctxt[tu_mode];
6487     }
6488     else
6489     {
6490         num_tus_in_cu = 4;
6491         trans_size = cu_size >> 2;
6492         num_4x4_luma_in_tu = trans_size >> 1; /*at luma level*/
6493 
6494         /* For 8x8 CU only one TU */
6495         if(MIN_TU_SIZE > trans_size)
6496         {
6497             trans_size = MIN_TU_SIZE;
6498             num_tus_in_cu = 1;
6499             /* chroma nbr avail. is derived based on luma.
6500             for 4x4 chrm use 8x8 luma's size */
6501             num_4x4_luma_in_tu = num_4x4_luma_in_tu << 1;
6502         }
6503 
6504         ps_chr_intra_satd_ctxt = &ps_ctxt->s_chroma_rdopt_ctxt.as_chr_intra_satd_ctxt[tu_mode];
6505     }
6506 
6507     /* Can't be TU_EQ_SUBCU case */
6508     ASSERT(TU_EQ_SUBCU != tu_mode);
6509 
6510     /* translate the transform size to index */
6511     trans_idx = trans_size >> 2;
6512 
6513     pu1_pred = (UWORD8 *)ps_chr_intra_satd_ctxt->pv_pred_data;
6514 
6515     pred_strd = ps_chr_intra_satd_ctxt->i4_pred_stride;
6516 
6517     /* for 16x16 cases */
6518     if(16 == trans_size)
6519     {
6520         trans_idx = 3;
6521     }
6522 
6523     best_chrm_mode = ihevce_distortion_based_intra_chroma_mode_selector(
6524         ps_cu_analyse,
6525         ihevc_intra_pred_chroma_ref_substitution_fptr,
6526         ps_ctxt->apf_chrm_ip,
6527         ps_ctxt->apf_chrm_resd_trns_had,
6528         pu1_chrm_src,
6529         chrm_src_stride,
6530         pu1_pred,
6531         pred_strd,
6532         ps_ctxt->pu1_ctb_nbr_map,
6533         ps_ctxt->i4_nbr_map_strd,
6534         (UWORD8 *)ps_ctxt->pv_ref_sub_out,
6535         i4_alpha_stim_multiplier,
6536         u1_is_cu_noisy,
6537         trans_size,
6538         trans_idx,
6539         num_tus_in_cu,
6540         num_4x4_luma_in_tu,
6541         ps_ctxt->u1_enable_psyRDOPT,
6542         u1_is_422);
6543 
6544     /* Store the best chroma mode */
6545     ps_chr_intra_satd_ctxt->u1_best_cr_mode = best_chrm_mode;
6546 
6547     /* evaluate RDOPT cost for the Best mode */
6548     {
6549         WORD32 i4_subtu_pos_x;
6550         WORD32 i4_subtu_pos_y;
6551         UWORD8 u1_compute_spatial_ssd;
6552 
6553         WORD32 ai4_total_bytes_offset_cb[2] = { 0, 0 };
6554         WORD32 ai4_total_bytes_offset_cr[2] = { 0, 0 };
6555         /* State for prefix bin of chroma intra pred mode before CU encode */
6556         UWORD8 u1_chroma_intra_mode_prefix_state =
6557             ps_ctxt->au1_rdopt_init_ctxt_models[IHEVC_CAB_CHROMA_PRED_MODE];
6558         WORD32 luma_trans_size = trans_size << 1;
6559         WORD32 calc_recon = 0;
6560         UWORD8 *pu1_left = pu1_cu_left;
6561         UWORD8 *pu1_top = pu1_cu_top;
6562         UWORD8 *pu1_top_left = pu1_cu_top_left;
6563         WORD32 left_strd = cu_left_stride;
6564 
6565         if(ps_ctxt->i1_cu_qp_delta_enable)
6566         {
6567             WORD32 i4_act_counter = 0, i4_act_counter_lamda = 0;
6568             if(ps_cu_analyse->u1_cu_size == 64)
6569             {
6570                 ASSERT(
6571                     (luma_trans_size == 32) || (luma_trans_size == 16) || (luma_trans_size == 8) ||
6572                     (luma_trans_size == 4));
6573                 i4_act_counter = (luma_trans_size == 16) +
6574                                  2 * ((luma_trans_size == 8) || (luma_trans_size == 4));
6575                 i4_act_counter_lamda = 3;
6576             }
6577             else if(ps_cu_analyse->u1_cu_size == 32)
6578             {
6579                 ASSERT(
6580                     (luma_trans_size == 32) || (luma_trans_size == 16) || (luma_trans_size == 8) ||
6581                     (luma_trans_size == 4));
6582                 i4_act_counter = (luma_trans_size == 16) +
6583                                  2 * ((luma_trans_size == 8) || (luma_trans_size == 4));
6584                 i4_act_counter_lamda = 0;
6585             }
6586             else if(ps_cu_analyse->u1_cu_size == 16)
6587             {
6588                 ASSERT((luma_trans_size == 16) || (luma_trans_size == 8) || (luma_trans_size == 4));
6589                 i4_act_counter = (luma_trans_size == 8) || (luma_trans_size == 4);
6590                 i4_act_counter_lamda = 0;
6591             }
6592             else if(ps_cu_analyse->u1_cu_size == 8)
6593             {
6594                 ASSERT((luma_trans_size == 8) || (luma_trans_size == 4));
6595                 i4_act_counter = 1;
6596                 i4_act_counter_lamda = 0;
6597             }
6598             else
6599             {
6600                 ASSERT(0);
6601             }
6602             /*assumption is that control comes here for intras*/
6603             if(ps_ctxt->i4_use_ctb_level_lamda)
6604             {
6605                 ihevce_compute_cu_level_QP(
6606                     ps_ctxt, ps_cu_analyse->i4_act_factor[i4_act_counter][1], -1, 0);
6607             }
6608             else
6609             {
6610                 ihevce_compute_cu_level_QP(
6611                     ps_ctxt,
6612                     ps_cu_analyse->i4_act_factor[i4_act_counter][1],
6613                     ps_cu_analyse->i4_act_factor[i4_act_counter_lamda][1],
6614                     0);
6615             }
6616 
6617             ps_cu_analyse->i1_cu_qp = ps_ctxt->i4_cu_qp;
6618         }
6619 
6620         u1_compute_spatial_ssd = (ps_ctxt->i4_cu_qp <= MAX_QP_WHERE_SPATIAL_SSD_ENABLED) &&
6621                                  (ps_ctxt->i4_quality_preset < IHEVCE_QUALITY_P3) &&
6622                                  CONVERT_SSDS_TO_SPATIAL_DOMAIN;
6623 
6624         if(u1_is_cu_noisy || ps_ctxt->u1_enable_psyRDOPT)
6625         {
6626             u1_compute_spatial_ssd = (ps_ctxt->i4_cu_qp <= MAX_HEVC_QP) &&
6627                                      CONVERT_SSDS_TO_SPATIAL_DOMAIN;
6628         }
6629 
6630         /* get the 4x4 level postion of current cu */
6631         cu_pos_x = (ps_cu_analyse->b3_cu_pos_x << 1);
6632         cu_pos_y = (ps_cu_analyse->b3_cu_pos_y << 1);
6633 
6634         calc_recon = !u1_compute_spatial_ssd && ((4 == num_tus_in_cu) || (u1_is_422 == 1));
6635 
6636         if(calc_recon || u1_compute_spatial_ssd)
6637         {
6638             aps_recon_datastore[0]->au1_is_chromaRecon_available[1 + (num_tus_in_cu > 1)] = 1;
6639             aps_recon_datastore[1]->au1_is_chromaRecon_available[1 + (num_tus_in_cu > 1)] = 1;
6640         }
6641         else
6642         {
6643             aps_recon_datastore[0]->au1_is_chromaRecon_available[1 + (num_tus_in_cu > 1)] = 0;
6644             aps_recon_datastore[1]->au1_is_chromaRecon_available[1 + (num_tus_in_cu > 1)] = 0;
6645         }
6646 
6647         /* loop based on num tus in a cu */
6648         for(ctr = 0; ctr < num_tus_in_cu; ctr++)
6649         {
6650             WORD16 *pi2_cur_deq_data_cb;
6651             WORD16 *pi2_cur_deq_data_cr;
6652 
6653             WORD32 deq_data_strd = ps_chr_intra_satd_ctxt->i4_iq_buff_stride;
6654             WORD32 luma_nbr_flags = 0;
6655 
6656             luma_nbr_flags = ihevce_get_nbr_intra_mxn_tu(
6657                 ps_ctxt->pu1_ctb_nbr_map,
6658                 ps_ctxt->i4_nbr_map_strd,
6659                 (ctr & 1) * (luma_trans_size >> 2) + cu_pos_x,
6660                 (ctr > 1) * (luma_trans_size >> 2) + cu_pos_y,
6661                 (luma_trans_size >> 2),
6662                 (luma_trans_size >> 2));
6663 
6664             for(i4_subtu_idx = 0; i4_subtu_idx < i4_num_sub_tus; i4_subtu_idx++)
6665             {
6666                 WORD32 cbf, num_bytes;
6667                 LWORD64 trans_ssd_u, trans_ssd_v;
6668                 UWORD8 u1_is_recon_available;
6669 
6670                 WORD32 trans_size_m2 = trans_size << 1;
6671                 UWORD8 *pu1_cur_src = pu1_chrm_src + ((ctr & 1) * trans_size_m2) +
6672                                       (((ctr > 1) * trans_size * chrm_src_stride) << u1_is_422) +
6673                                       (i4_subtu_idx * trans_size * chrm_src_stride);
6674                 UWORD8 *pu1_cur_pred = pu1_pred + ((ctr & 1) * trans_size_m2) +
6675                                        (((ctr > 1) * trans_size * pred_strd) << u1_is_422) +
6676                                        (i4_subtu_idx * trans_size * pred_strd);
6677                 WORD32 i4_recon_stride = aps_recon_datastore[0]->i4_chromaRecon_stride;
6678                 UWORD8 *pu1_cur_recon = ((UWORD8 *)aps_recon_datastore[0]
6679                                              ->apv_chroma_recon_bufs[1 + (num_tus_in_cu > 1)]) +
6680                                         ((ctr & 1) * trans_size_m2) +
6681                                         (((ctr > 1) * trans_size * i4_recon_stride) << u1_is_422) +
6682                                         (i4_subtu_idx * trans_size * i4_recon_stride);
6683 
6684                 /* Use Chroma coeff/iq buf of the cur. intra cand. Not rememb.
6685                 chroma coeff/iq for high quality intra SATD special modes. Will
6686                 be over written by coeff of luma mode in chroma_rdopt call */
6687                 UWORD8 *pu1_ecd_data_cb =
6688                     &ps_chr_intra_satd_ctxt->au1_scan_coeff_cb[i4_subtu_idx][0];
6689                 UWORD8 *pu1_ecd_data_cr =
6690                     &ps_chr_intra_satd_ctxt->au1_scan_coeff_cr[i4_subtu_idx][0];
6691 
6692                 WORD32 chrm_pred_func_idx = 0;
6693                 LWORD64 curr_cb_cod_cost = 0;
6694                 LWORD64 curr_cr_cod_cost = 0;
6695                 WORD32 nbr_flags = 0;
6696 
6697                 i4_subtu_pos_x = (((ctr & 1) * trans_size_m2) >> 2);
6698                 i4_subtu_pos_y = (((ctr > 1) * trans_size) >> (!u1_is_422 + 1)) +
6699                                  ((i4_subtu_idx * trans_size) >> 2);
6700                 pi2_cur_deq_data_cb = &ps_chr_intra_satd_ctxt->ai2_iq_data_cb[0] +
6701                                       ((ctr & 1) * trans_size) +
6702                                       (((ctr > 1) * trans_size * deq_data_strd) << u1_is_422) +
6703                                       (i4_subtu_idx * trans_size * deq_data_strd);
6704                 pi2_cur_deq_data_cr = &ps_chr_intra_satd_ctxt->ai2_iq_data_cr[0] +
6705                                       ((ctr & 1) * trans_size) +
6706                                       (((ctr > 1) * trans_size * deq_data_strd) << u1_is_422) +
6707                                       (i4_subtu_idx * trans_size * deq_data_strd);
6708 
6709                 /* left cu boundary */
6710                 if(0 == i4_subtu_pos_x)
6711                 {
6712                     left_strd = cu_left_stride;
6713                     pu1_left = pu1_cu_left + (i4_subtu_pos_y << 2) * left_strd;
6714                 }
6715                 else
6716                 {
6717                     pu1_left = pu1_cur_recon - 2;
6718                     left_strd = i4_recon_stride;
6719                 }
6720 
6721                 /* top cu boundary */
6722                 if(0 == i4_subtu_pos_y)
6723                 {
6724                     pu1_top = pu1_cu_top + (i4_subtu_pos_x << 2);
6725                 }
6726                 else
6727                 {
6728                     pu1_top = pu1_cur_recon - i4_recon_stride;
6729                 }
6730 
6731                 /* by default top left is set to cu top left */
6732                 pu1_top_left = pu1_cu_top_left;
6733 
6734                 /* top left based on position */
6735                 if((0 != i4_subtu_pos_y) && (0 == i4_subtu_pos_x))
6736                 {
6737                     pu1_top_left = pu1_left - left_strd;
6738                 }
6739                 else if(0 != i4_subtu_pos_x)
6740                 {
6741                     pu1_top_left = pu1_top - 2;
6742                 }
6743 
6744                 /* populate the coeffs scan idx */
6745                 scan_idx = SCAN_DIAG_UPRIGHT;
6746 
6747                 /* RDOPT copy States :  TU init (best until prev TU) to current */
6748                 COPY_CABAC_STATES(
6749                     &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[rd_opt_curr_idx]
6750                          .s_cabac_ctxt.au1_ctxt_models[0],
6751                     &ps_ctxt->au1_rdopt_init_ctxt_models[0],
6752                     IHEVC_CAB_CTXT_END);
6753 
6754                 /* for 4x4 transforms based on intra pred mode scan is choosen*/
6755                 if(4 == trans_size)
6756                 {
6757                     /* for modes from 22 upto 30 horizontal scan is used */
6758                     if((best_chrm_mode > 21) && (best_chrm_mode < 31))
6759                     {
6760                         scan_idx = SCAN_HORZ;
6761                     }
6762                     /* for modes from 6 upto 14 horizontal scan is used */
6763                     else if((best_chrm_mode > 5) && (best_chrm_mode < 15))
6764                     {
6765                         scan_idx = SCAN_VERT;
6766                     }
6767                 }
6768 
6769                 nbr_flags = ihevce_get_intra_chroma_tu_nbr(
6770                     luma_nbr_flags, i4_subtu_idx, trans_size, u1_is_422);
6771 
6772                 /* call the chroma reference array substitution */
6773                 ihevc_intra_pred_chroma_ref_substitution_fptr(
6774                     pu1_top_left,
6775                     pu1_top,
6776                     pu1_left,
6777                     left_strd,
6778                     trans_size,
6779                     nbr_flags,
6780                     (UWORD8 *)ps_ctxt->pv_ref_sub_out,
6781                     1);
6782 
6783                 /* use the look up to get the function idx */
6784                 chrm_pred_func_idx = g_i4_ip_funcs[best_chrm_mode];
6785 
6786                 /* call the intra prediction function */
6787                 ps_ctxt->apf_chrm_ip[chrm_pred_func_idx](
6788                     (UWORD8 *)ps_ctxt->pv_ref_sub_out,
6789                     1,
6790                     pu1_cur_pred,
6791                     pred_strd,
6792                     trans_size,
6793                     best_chrm_mode);
6794 
6795                 /* UPLANE RDOPT Loop */
6796                 {
6797                     WORD32 tu_bits;
6798 
6799                     cbf = ihevce_chroma_t_q_iq_ssd_scan_fxn(
6800                         ps_ctxt,
6801                         pu1_cur_pred,
6802                         pred_strd,
6803                         pu1_cur_src,
6804                         chrm_src_stride,
6805                         pi2_cur_deq_data_cb,
6806                         deq_data_strd,
6807                         pu1_cur_recon,
6808                         i4_recon_stride,
6809                         pu1_ecd_data_cb + ai4_total_bytes_offset_cb[i4_subtu_idx],
6810                         ps_ctxt->au1_cu_csbf,
6811                         ps_ctxt->i4_cu_csbf_strd,
6812                         trans_size,
6813                         scan_idx,
6814                         1,
6815                         &num_bytes,
6816                         &tu_bits,
6817                         &ps_chr_intra_satd_ctxt->ai4_zero_col_cb[i4_subtu_idx][ctr],
6818                         &ps_chr_intra_satd_ctxt->ai4_zero_row_cb[i4_subtu_idx][ctr],
6819                         &u1_is_recon_available,
6820                         i4_perform_sbh,
6821                         i4_perform_rdoq,
6822                         &trans_ssd_u,
6823 #if USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
6824                         i4_alpha_stim_multiplier,
6825                         u1_is_cu_noisy,
6826 #endif
6827                         0,
6828                         u1_compute_spatial_ssd ? SPATIAL_DOMAIN_SSD : FREQUENCY_DOMAIN_SSD,
6829                         U_PLANE);
6830 
6831 #if !USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS && COMPUTE_NOISE_TERM_AT_THE_TU_LEVEL
6832                     if(u1_is_cu_noisy && i4_alpha_stim_multiplier)
6833                     {
6834 #if !USE_RECON_TO_EVALUATE_STIM_IN_RDOPT
6835                         trans_ssd_u = ihevce_inject_stim_into_distortion(
6836                             pu1_cur_src,
6837                             chrm_src_stride,
6838                             pu1_cur_pred,
6839                             pred_strd,
6840                             trans_ssd_u,
6841                             i4_alpha_stim_multiplier,
6842                             trans_size,
6843                             0,
6844                             ps_ctxt->u1_enable_psyRDOPT,
6845                             U_PLANE);
6846 #else
6847                         if(u1_compute_spatial_ssd && u1_is_recon_available)
6848                         {
6849                             trans_ssd_u = ihevce_inject_stim_into_distortion(
6850                                 pu1_cur_src,
6851                                 chrm_src_stride,
6852                                 pu1_cur_recon,
6853                                 i4_recon_stride,
6854                                 trans_ssd_u,
6855                                 i4_alpha_stim_multiplier,
6856                                 trans_size,
6857                                 0,
6858                                 ps_ctxt->u1_enable_psyRDOPT,
6859                                 U_PLANE);
6860                         }
6861                         else
6862                         {
6863                             trans_ssd_u = ihevce_inject_stim_into_distortion(
6864                                 pu1_cur_src,
6865                                 chrm_src_stride,
6866                                 pu1_cur_pred,
6867                                 pred_strd,
6868                                 trans_ssd_u,
6869                                 i4_alpha_stim_multiplier,
6870                                 trans_size,
6871                                 0,
6872                                 ps_ctxt->u1_enable_psyRDOPT,
6873                                 U_PLANE);
6874                         }
6875 #endif
6876                     }
6877 #endif
6878 
6879                     /* RDOPT copy States :  New updated after curr TU to TU init */
6880                     if(0 != cbf)
6881                     {
6882                         memcpy(
6883                             &ps_ctxt->au1_rdopt_init_ctxt_models[0],
6884                             &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[rd_opt_curr_idx]
6885                                  .s_cabac_ctxt.au1_ctxt_models[0],
6886                             IHEVC_CAB_CTXT_END);
6887                     }
6888                     /* RDOPT copy States :  Restoring back the Cb init state to Cr */
6889                     else
6890                     {
6891                         memcpy(
6892                             &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[rd_opt_curr_idx]
6893                                  .s_cabac_ctxt.au1_ctxt_models[0],
6894                             &ps_ctxt->au1_rdopt_init_ctxt_models[0],
6895                             IHEVC_CAB_CTXT_END);
6896                     }
6897 
6898                     if(calc_recon || (!u1_is_recon_available && u1_compute_spatial_ssd))
6899                     {
6900                         ihevce_chroma_it_recon_fxn(
6901                             ps_ctxt,
6902                             pi2_cur_deq_data_cb,
6903                             deq_data_strd,
6904                             pu1_cur_pred,
6905                             pred_strd,
6906                             pu1_cur_recon,
6907                             i4_recon_stride,
6908                             (pu1_ecd_data_cb + ai4_total_bytes_offset_cb[i4_subtu_idx]),
6909                             trans_size,
6910                             cbf,
6911                             ps_chr_intra_satd_ctxt->ai4_zero_col_cb[i4_subtu_idx][ctr],
6912                             ps_chr_intra_satd_ctxt->ai4_zero_row_cb[i4_subtu_idx][ctr],
6913                             U_PLANE);
6914                     }
6915 
6916                     ps_chr_intra_satd_ctxt->au1_cbf_cb[i4_subtu_idx][ctr] = cbf;
6917                     curr_cb_cod_cost =
6918                         trans_ssd_u +
6919                         COMPUTE_RATE_COST_CLIP30(
6920                             tu_bits, ps_ctxt->i8_cl_ssd_lambda_chroma_qf, LAMBDA_Q_SHIFT);
6921                     chrm_tu_bits += tu_bits;
6922                     ai4_total_bytes_offset_cb[i4_subtu_idx] += num_bytes;
6923                     ps_chr_intra_satd_ctxt->ai4_num_bytes_scan_coeff_cb_per_tu[i4_subtu_idx][ctr] =
6924                         num_bytes;
6925                 }
6926 
6927                 /* VPLANE RDOPT Loop */
6928                 {
6929                     WORD32 tu_bits;
6930 
6931                     cbf = ihevce_chroma_t_q_iq_ssd_scan_fxn(
6932                         ps_ctxt,
6933                         pu1_cur_pred,
6934                         pred_strd,
6935                         pu1_cur_src,
6936                         chrm_src_stride,
6937                         pi2_cur_deq_data_cr,
6938                         deq_data_strd,
6939                         pu1_cur_recon,
6940                         i4_recon_stride,
6941                         pu1_ecd_data_cr + ai4_total_bytes_offset_cr[i4_subtu_idx],
6942                         ps_ctxt->au1_cu_csbf,
6943                         ps_ctxt->i4_cu_csbf_strd,
6944                         trans_size,
6945                         scan_idx,
6946                         1,
6947                         &num_bytes,
6948                         &tu_bits,
6949                         &ps_chr_intra_satd_ctxt->ai4_zero_col_cr[i4_subtu_idx][ctr],
6950                         &ps_chr_intra_satd_ctxt->ai4_zero_row_cr[i4_subtu_idx][ctr],
6951                         &u1_is_recon_available,
6952                         i4_perform_sbh,
6953                         i4_perform_rdoq,
6954                         &trans_ssd_v,
6955 #if USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
6956                         i4_alpha_stim_multiplier,
6957                         u1_is_cu_noisy,
6958 #endif
6959                         0,
6960                         u1_compute_spatial_ssd ? SPATIAL_DOMAIN_SSD : FREQUENCY_DOMAIN_SSD,
6961                         V_PLANE);
6962 
6963 #if !USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS && COMPUTE_NOISE_TERM_AT_THE_TU_LEVEL
6964                     if(u1_is_cu_noisy && i4_alpha_stim_multiplier)
6965                     {
6966 #if !USE_RECON_TO_EVALUATE_STIM_IN_RDOPT
6967                         trans_ssd_v = ihevce_inject_stim_into_distortion(
6968                             pu1_cur_src,
6969                             chrm_src_stride,
6970                             pu1_cur_pred,
6971                             pred_strd,
6972                             trans_ssd_v,
6973                             i4_alpha_stim_multiplier,
6974                             trans_size,
6975                             0,
6976                             ps_ctxt->u1_enable_psyRDOPT,
6977                             V_PLANE);
6978 #else
6979                         if(u1_compute_spatial_ssd && u1_is_recon_available)
6980                         {
6981                             trans_ssd_v = ihevce_inject_stim_into_distortion(
6982                                 pu1_cur_src,
6983                                 chrm_src_stride,
6984                                 pu1_cur_recon,
6985                                 i4_recon_stride,
6986                                 trans_ssd_v,
6987                                 i4_alpha_stim_multiplier,
6988                                 trans_size,
6989                                 0,
6990                                 ps_ctxt->u1_enable_psyRDOPT,
6991                                 V_PLANE);
6992                         }
6993                         else
6994                         {
6995                             trans_ssd_v = ihevce_inject_stim_into_distortion(
6996                                 pu1_cur_src,
6997                                 chrm_src_stride,
6998                                 pu1_cur_pred,
6999                                 pred_strd,
7000                                 trans_ssd_v,
7001                                 i4_alpha_stim_multiplier,
7002                                 trans_size,
7003                                 0,
7004                                 ps_ctxt->u1_enable_psyRDOPT,
7005                                 V_PLANE);
7006                         }
7007 #endif
7008                     }
7009 #endif
7010 
7011                     /* RDOPT copy States :  New updated after curr TU to TU init */
7012                     if(0 != cbf)
7013                     {
7014                         COPY_CABAC_STATES(
7015                             &ps_ctxt->au1_rdopt_init_ctxt_models[0],
7016                             &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[rd_opt_curr_idx]
7017                                  .s_cabac_ctxt.au1_ctxt_models[0],
7018                             IHEVC_CAB_CTXT_END);
7019                     }
7020                     /* RDOPT copy States :  Restoring back the Cb init state to Cr */
7021                     else
7022                     {
7023                         COPY_CABAC_STATES(
7024                             &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[rd_opt_curr_idx]
7025                                  .s_cabac_ctxt.au1_ctxt_models[0],
7026                             &ps_ctxt->au1_rdopt_init_ctxt_models[0],
7027                             IHEVC_CAB_CTXT_END);
7028                     }
7029 
7030                     if(calc_recon || (!u1_is_recon_available && u1_compute_spatial_ssd))
7031                     {
7032                         ihevce_chroma_it_recon_fxn(
7033                             ps_ctxt,
7034                             pi2_cur_deq_data_cr,
7035                             deq_data_strd,
7036                             pu1_cur_pred,
7037                             pred_strd,
7038                             pu1_cur_recon,
7039                             i4_recon_stride,
7040                             (pu1_ecd_data_cr + ai4_total_bytes_offset_cr[i4_subtu_idx]),
7041                             trans_size,
7042                             cbf,
7043                             ps_chr_intra_satd_ctxt->ai4_zero_col_cr[i4_subtu_idx][ctr],
7044                             ps_chr_intra_satd_ctxt->ai4_zero_row_cr[i4_subtu_idx][ctr],
7045                             V_PLANE);
7046                     }
7047 
7048                     ps_chr_intra_satd_ctxt->au1_cbf_cr[i4_subtu_idx][ctr] = cbf;
7049                     curr_cr_cod_cost =
7050                         trans_ssd_v +
7051                         COMPUTE_RATE_COST_CLIP30(
7052                             tu_bits, ps_ctxt->i8_cl_ssd_lambda_chroma_qf, LAMBDA_Q_SHIFT);
7053                     chrm_tu_bits += tu_bits;
7054                     ai4_total_bytes_offset_cr[i4_subtu_idx] += num_bytes;
7055                     ps_chr_intra_satd_ctxt->ai4_num_bytes_scan_coeff_cr_per_tu[i4_subtu_idx][ctr] =
7056                         num_bytes;
7057                 }
7058 
7059                 chrm_cod_cost += curr_cb_cod_cost;
7060                 chrm_cod_cost += curr_cr_cod_cost;
7061             }
7062 
7063             /* set the neighbour map to 1 */
7064             ihevce_set_nbr_map(
7065                 ps_ctxt->pu1_ctb_nbr_map,
7066                 ps_ctxt->i4_nbr_map_strd,
7067                 (ctr & 1) * (luma_trans_size >> 2) + cu_pos_x,
7068                 (ctr > 1) * (luma_trans_size >> 2) + cu_pos_y,
7069                 (luma_trans_size >> 2),
7070                 1);
7071         }
7072 
7073         /* set the neighbour map to 0 */
7074         ihevce_set_nbr_map(
7075             ps_ctxt->pu1_ctb_nbr_map,
7076             ps_ctxt->i4_nbr_map_strd,
7077             (ps_cu_analyse->b3_cu_pos_x << 1),
7078             (ps_cu_analyse->b3_cu_pos_y << 1),
7079             (ps_cu_analyse->u1_cu_size >> 2),
7080             0);
7081 
7082         /* Account for coding b3_chroma_intra_pred_mode prefix and suffix bins */
7083         /* This is done by adding the bits for signalling chroma mode (0-3)    */
7084         /* and subtracting the bits for chroma mode same as luma mode (4)      */
7085 #if CHROMA_RDOPT_ENABLE
7086         {
7087             /* Estimate bits to encode prefix bin as 1 for b3_chroma_intra_pred_mode */
7088             WORD32 bits_frac_1 =
7089                 gau2_ihevce_cabac_bin_to_bits[u1_chroma_intra_mode_prefix_state ^ 1];
7090 
7091             WORD32 bits_for_mode_0to3 = (2 << CABAC_FRAC_BITS_Q) + bits_frac_1;
7092 
7093             /* Estimate bits to encode prefix bin as 0 for b3_chroma_intra_pred_mode */
7094             WORD32 bits_for_mode4 =
7095                 gau2_ihevce_cabac_bin_to_bits[u1_chroma_intra_mode_prefix_state ^ 0];
7096 
7097             /* accumulate into final rd cost for chroma */
7098             ps_chr_intra_satd_ctxt->i8_cost_to_encode_chroma_mode = COMPUTE_RATE_COST_CLIP30(
7099                 (bits_for_mode_0to3 - bits_for_mode4),
7100                 ps_ctxt->i8_cl_ssd_lambda_chroma_qf,
7101                 (LAMBDA_Q_SHIFT + CABAC_FRAC_BITS_Q));
7102 
7103             chrm_cod_cost += ps_chr_intra_satd_ctxt->i8_cost_to_encode_chroma_mode;
7104         }
7105 #endif
7106 
7107         if(ps_ctxt->u1_enable_psyRDOPT)
7108         {
7109             UWORD8 *pu1_recon_cu;
7110             WORD32 recon_stride;
7111             WORD32 curr_pos_x;
7112             WORD32 curr_pos_y;
7113             WORD32 start_index;
7114             WORD32 num_horz_cu_in_ctb;
7115             WORD32 had_block_size;
7116 
7117             /* tODO: sreenivasa ctb size has to be used appropriately */
7118             had_block_size = 8;
7119             num_horz_cu_in_ctb = 2 * 64 / had_block_size;
7120             curr_pos_x = ps_cu_analyse->b3_cu_pos_x << 3; /* pel units */
7121             curr_pos_y = ps_cu_analyse->b3_cu_pos_x << 3; /* pel units */
7122             recon_stride = aps_recon_datastore[0]->i4_chromaRecon_stride;
7123             pu1_recon_cu =
7124                 aps_recon_datastore[0]->apv_chroma_recon_bufs[1 + (num_tus_in_cu > 1)];  //
7125 
7126             /* start index to index the source satd of curr cu int he current ctb*/
7127             start_index = 2 * (curr_pos_x / had_block_size) +
7128                           (curr_pos_y / had_block_size) * num_horz_cu_in_ctb;
7129 
7130             {
7131                 chrm_cod_cost += ihevce_psy_rd_cost_croma(
7132                     ps_ctxt->ai4_source_chroma_satd,
7133                     pu1_recon_cu,
7134                     recon_stride,
7135                     1,  //
7136                     cu_size,
7137                     0,  // pic type
7138                     0,  //layer id
7139                     ps_ctxt->i4_satd_lamda,  // lambda
7140                     start_index,
7141                     ps_ctxt->u1_is_input_data_hbd,  // 8 bit
7142                     ps_ctxt->u1_chroma_array_type,
7143                     &ps_ctxt->s_cmn_opt_func
7144 
7145                 );  // chroma subsampling 420
7146             }
7147         }
7148 
7149         ps_chr_intra_satd_ctxt->i8_chroma_best_rdopt = chrm_cod_cost;
7150         ps_chr_intra_satd_ctxt->i4_chrm_tu_bits = chrm_tu_bits;
7151 
7152         memcpy(
7153             &ps_chr_intra_satd_ctxt->au1_chrm_satd_updated_ctxt_models[0],
7154             &ps_ctxt->au1_rdopt_init_ctxt_models[0],
7155             IHEVC_CAB_CTXT_END);
7156     }
7157 }
7158 
7159 /*!
7160 ******************************************************************************
7161 * \if Function name : ihevce_chroma_cu_prcs_rdopt \endif
7162 *
7163 * \brief
7164 *    Coding unit processing function for chroma
7165 *
7166 * \param[in] ps_ctxt    enc_loop module ctxt pointer
7167 * \param[in] rd_opt_curr_idx index in the array of RDopt params
7168 * \param[in] func_proc_mode TU_EQ_CU or other case
7169 * \param[in] pu1_chrm_src  pointer to source data buffer
7170 * \param[in] chrm_src_stride   source buffer stride
7171 * \param[in] pu1_cu_left pointer to left recon data buffer
7172 * \param[in] pu1_cu_top  pointer to top recon data buffer
7173 * \param[in] pu1_cu_top_left pointer to top left recon data buffer
7174 * \param[in] cu_left_stride left recon buffer stride
7175 * \param[in] cu_pos_x position x of current CU in CTB
7176 * \param[in] cu_pos_y position y of current CU in CTB
7177 * \param[out] pi4_chrm_tu_bits pointer to store the total chroma bits
7178 *
7179 * \return
7180 *    Chroma coding cost (Cb and Cr included)
7181 *
7182 * \author
7183 *  Ittiam
7184 *
7185 *****************************************************************************
7186 */
ihevce_chroma_cu_prcs_rdopt(ihevce_enc_loop_ctxt_t * ps_ctxt,WORD32 rd_opt_curr_idx,WORD32 func_proc_mode,UWORD8 * pu1_chrm_src,WORD32 chrm_src_stride,UWORD8 * pu1_cu_left,UWORD8 * pu1_cu_top,UWORD8 * pu1_cu_top_left,WORD32 cu_left_stride,WORD32 cu_pos_x,WORD32 cu_pos_y,WORD32 * pi4_chrm_tu_bits,WORD32 i4_alpha_stim_multiplier,UWORD8 u1_is_cu_noisy)7187 LWORD64 ihevce_chroma_cu_prcs_rdopt(
7188     ihevce_enc_loop_ctxt_t *ps_ctxt,
7189     WORD32 rd_opt_curr_idx,
7190     WORD32 func_proc_mode,
7191     UWORD8 *pu1_chrm_src,
7192     WORD32 chrm_src_stride,
7193     UWORD8 *pu1_cu_left,
7194     UWORD8 *pu1_cu_top,
7195     UWORD8 *pu1_cu_top_left,
7196     WORD32 cu_left_stride,
7197     WORD32 cu_pos_x,
7198     WORD32 cu_pos_y,
7199     WORD32 *pi4_chrm_tu_bits,
7200     WORD32 i4_alpha_stim_multiplier,
7201     UWORD8 u1_is_cu_noisy)
7202 {
7203     tu_enc_loop_out_t *ps_tu;
7204     tu_enc_loop_temp_prms_t *ps_tu_temp_prms;
7205 
7206     ihevc_intra_pred_chroma_ref_substitution_ft *ihevc_intra_pred_chroma_ref_substitution_fptr;
7207 
7208     UWORD8 *pu1_pred;
7209     UWORD8 *pu1_recon;
7210     WORD32 i4_recon_stride;
7211     WORD32 cu_size, trans_size = 0;
7212     WORD32 pred_strd;
7213     WORD32 ctr, i4_subtu_idx;
7214     WORD32 scan_idx;
7215     WORD32 u1_is_cu_coded_old;
7216     WORD32 init_bytes_offset;
7217 
7218     enc_loop_cu_final_prms_t *ps_best_cu_prms = &ps_ctxt->as_cu_prms[rd_opt_curr_idx];
7219     recon_datastore_t *ps_recon_datastore = &ps_best_cu_prms->s_recon_datastore;
7220 
7221     WORD32 total_bytes_offset = 0;
7222     LWORD64 chrm_cod_cost = 0;
7223     WORD32 chrm_tu_bits = 0;
7224     WORD32 chrm_pred_mode = DM_CHROMA_IDX, luma_pred_mode = 35;
7225     LWORD64 i8_ssd_cb = 0;
7226     WORD32 i4_bits_cb = 0;
7227     LWORD64 i8_ssd_cr = 0;
7228     WORD32 i4_bits_cr = 0;
7229     UWORD8 u1_is_422 = (ps_ctxt->u1_chroma_array_type == 2);
7230     UWORD8 u1_num_tus =
7231         /* NumChromaTU's = 1, if TUSize = 4 and CUSize = 8 */
7232         (!ps_best_cu_prms->as_tu_enc_loop[0].s_tu.b3_size && ps_best_cu_prms->u1_intra_flag)
7233             ? 1
7234             : ps_best_cu_prms->u2_num_tus_in_cu;
7235     UWORD8 u1_num_subtus_in_tu = u1_is_422 + 1;
7236     UWORD8 u1_compute_spatial_ssd = (ps_ctxt->i4_cu_qp <= MAX_QP_WHERE_SPATIAL_SSD_ENABLED) &&
7237                                     (ps_ctxt->i4_quality_preset < IHEVCE_QUALITY_P3) &&
7238                                     CONVERT_SSDS_TO_SPATIAL_DOMAIN;
7239     /* Get the RDOPT cost of the best CU mode for early_exit */
7240     LWORD64 prev_best_rdopt_cost = ps_ctxt->as_cu_prms[!rd_opt_curr_idx].i8_best_rdopt_cost;
7241     /* Get the current running RDOPT (Luma RDOPT) for early_exit */
7242     LWORD64 curr_rdopt_cost = ps_ctxt->as_cu_prms[rd_opt_curr_idx].i8_curr_rdopt_cost;
7243     WORD32 i4_perform_rdoq = ps_ctxt->s_rdoq_sbh_ctxt.i4_perform_all_cand_rdoq;
7244     WORD32 i4_perform_sbh = ps_ctxt->s_rdoq_sbh_ctxt.i4_perform_all_cand_sbh;
7245 
7246     ihevc_intra_pred_chroma_ref_substitution_fptr =
7247         ps_ctxt->ps_func_selector->ihevc_intra_pred_chroma_ref_substitution_fptr;
7248 
7249     if(u1_is_cu_noisy || ps_ctxt->u1_enable_psyRDOPT)
7250     {
7251         u1_compute_spatial_ssd = (ps_ctxt->i4_cu_qp <= MAX_HEVC_QP) &&
7252                                  CONVERT_SSDS_TO_SPATIAL_DOMAIN;
7253     }
7254 
7255     /* Store the init bytes offset from luma */
7256     init_bytes_offset = ps_best_cu_prms->i4_num_bytes_ecd_data;
7257 
7258     /* Unused pred buffer in merge_skip_pred_data_t structure is used as
7259     Chroma pred storage buf. for final_recon function.
7260     The buffer is split into two and used as a ping-pong buffer */
7261     pu1_pred = ps_ctxt->s_cu_me_intra_pred_prms.pu1_pred_data[CU_ME_INTRA_PRED_CHROMA_IDX] +
7262                rd_opt_curr_idx * ((MAX_CTB_SIZE * MAX_CTB_SIZE >> 1) +
7263                                   (u1_is_422 * (MAX_CTB_SIZE * MAX_CTB_SIZE >> 1)));
7264 
7265     pred_strd = ps_ctxt->s_cu_me_intra_pred_prms.ai4_pred_data_stride[CU_ME_INTRA_PRED_CHROMA_IDX];
7266 
7267     pu1_recon = (UWORD8 *)ps_recon_datastore->apv_chroma_recon_bufs[0];
7268     i4_recon_stride = ps_recon_datastore->i4_chromaRecon_stride;
7269     cu_size = ps_best_cu_prms->u1_cu_size;
7270     chrm_tu_bits = 0;
7271 
7272     /* get the first TU pointer */
7273     ps_tu = &ps_best_cu_prms->as_tu_enc_loop[0];
7274     /* get the first TU enc_loop temp prms pointer */
7275     ps_tu_temp_prms = &ps_best_cu_prms->as_tu_enc_loop_temp_prms[0];
7276 
7277     if(PRED_MODE_INTRA == ps_best_cu_prms->u1_intra_flag)
7278     {
7279         /* Mode signalled by intra prediction for luma */
7280         luma_pred_mode = ps_best_cu_prms->au1_intra_pred_mode[0];
7281 
7282 #if DISABLE_RDOQ_INTRA
7283         i4_perform_rdoq = 0;
7284 #endif
7285     }
7286 
7287     else
7288     {
7289         UWORD8 *pu1_pred_org = pu1_pred;
7290 
7291         /* ------ Motion Compensation for Chroma -------- */
7292         for(ctr = 0; ctr < ps_best_cu_prms->u2_num_pus_in_cu; ctr++)
7293         {
7294             pu_t *ps_pu;
7295             WORD32 inter_pu_wd;
7296             WORD32 inter_pu_ht;
7297 
7298             ps_pu = &ps_best_cu_prms->as_pu_chrm_proc[ctr];
7299 
7300             inter_pu_wd = (ps_pu->b4_wd + 1) << 2; /* cb and cr pixel interleaved */
7301             inter_pu_ht = ((ps_pu->b4_ht + 1) << 2) >> 1;
7302             inter_pu_ht <<= u1_is_422;
7303 
7304             ihevce_chroma_inter_pred_pu(&ps_ctxt->s_mc_ctxt, ps_pu, pu1_pred, pred_strd);
7305 
7306             if(2 == ps_best_cu_prms->u2_num_pus_in_cu)
7307             {
7308                 /* 2Nx__ partion case */
7309                 if(inter_pu_wd == cu_size)
7310                 {
7311                     pu1_pred += (inter_pu_ht * pred_strd);
7312                 }
7313 
7314                 /* __x2N partion case */
7315                 if(inter_pu_ht == (cu_size >> (u1_is_422 == 0)))
7316                 {
7317                     pu1_pred += inter_pu_wd;
7318                 }
7319             }
7320         }
7321 
7322         /* restore the pred pointer to start for transform loop */
7323         pu1_pred = pu1_pred_org;
7324     }
7325 
7326     /* Used to store back only the luma based info. if SATD based chorma
7327     mode also comes */
7328     u1_is_cu_coded_old = ps_best_cu_prms->u1_is_cu_coded;
7329 
7330     /* evaluate chroma candidates (same as luma) and
7331     if INTRA & HIGH_QUALITY compare with best SATD mode */
7332     {
7333         WORD32 calc_recon = 0, deq_data_strd;
7334         WORD16 *pi2_deq_data;
7335         UWORD8 *pu1_ecd_data;
7336         UWORD8 u1_is_mode_eq_chroma_satd_mode = 0;
7337 
7338         pi2_deq_data = &ps_best_cu_prms->pi2_cu_deq_coeffs[0];
7339         pi2_deq_data += ps_best_cu_prms->i4_chrm_deq_coeff_strt_idx;
7340         deq_data_strd = cu_size;
7341         /* update ecd buffer for storing coeff. */
7342         pu1_ecd_data = &ps_best_cu_prms->pu1_cu_coeffs[0];
7343         pu1_ecd_data += init_bytes_offset;
7344         /* store chroma starting index */
7345         ps_best_cu_prms->i4_chrm_cu_coeff_strt_idx = init_bytes_offset;
7346 
7347         /* get the first TU pointer */
7348         ps_tu = &ps_best_cu_prms->as_tu_enc_loop[0];
7349         ps_tu_temp_prms = &ps_best_cu_prms->as_tu_enc_loop_temp_prms[0];
7350 
7351         /* Reset total_bytes_offset for each candidate */
7352         chrm_pred_mode = (u1_is_422 == 1) ? gau1_chroma422_intra_angle_mapping[luma_pred_mode]
7353                                           : luma_pred_mode;
7354 
7355         total_bytes_offset = 0;
7356 
7357         if(TU_EQ_SUBCU == func_proc_mode)
7358         {
7359             func_proc_mode = TU_EQ_CU_DIV2;
7360         }
7361 
7362         /* For cu_size=8 case, chroma cost will be same for TU_EQ_CU and
7363         TU_EQ_CU_DIV2 and  TU_EQ_SUBCU case */
7364         if(8 == cu_size)
7365         {
7366             func_proc_mode = TU_EQ_CU;
7367         }
7368 
7369         /* loop based on num tus in a cu */
7370         if(!ps_best_cu_prms->u1_intra_flag || !ps_ctxt->s_chroma_rdopt_ctxt.u1_eval_chrm_satd ||
7371            (ps_ctxt->s_chroma_rdopt_ctxt.u1_eval_chrm_satd &&
7372             (chrm_pred_mode !=
7373              ps_ctxt->s_chroma_rdopt_ctxt.as_chr_intra_satd_ctxt[func_proc_mode].u1_best_cr_mode)))
7374         {
7375             /* loop based on num tus in a cu */
7376             for(ctr = 0; ctr < u1_num_tus; ctr++)
7377             {
7378                 WORD32 num_bytes = 0;
7379                 LWORD64 curr_cb_cod_cost = 0;
7380                 LWORD64 curr_cr_cod_cost = 0;
7381                 WORD32 chrm_pred_func_idx = 0;
7382                 UWORD8 u1_is_early_exit_condition_satisfied = 0;
7383 
7384                 /* Default cb and cr offset initializatio for b3_chroma_intra_mode_idx=7   */
7385                 /* FIX for TU tree shrinkage caused by ecd data copies in final mode recon */
7386                 ps_tu->s_tu.b1_cb_cbf = ps_tu->s_tu.b1_cr_cbf = 0;
7387                 ps_tu->s_tu.b1_cb_cbf_subtu1 = ps_tu->s_tu.b1_cr_cbf_subtu1 = 0;
7388                 ps_tu->ai4_cb_coeff_offset[0] = total_bytes_offset + init_bytes_offset;
7389                 ps_tu->ai4_cr_coeff_offset[0] = total_bytes_offset + init_bytes_offset;
7390                 ps_tu->ai4_cb_coeff_offset[1] = total_bytes_offset + init_bytes_offset;
7391                 ps_tu->ai4_cr_coeff_offset[1] = total_bytes_offset + init_bytes_offset;
7392                 ps_tu_temp_prms->ai2_cb_bytes_consumed[0] = 0;
7393                 ps_tu_temp_prms->ai2_cr_bytes_consumed[0] = 0;
7394                 ps_tu_temp_prms->ai2_cb_bytes_consumed[1] = 0;
7395                 ps_tu_temp_prms->ai2_cr_bytes_consumed[1] = 0;
7396 
7397                 /* TU level inits */
7398                 /* check if chroma present flag is set */
7399                 if(1 == ps_tu->s_tu.b3_chroma_intra_mode_idx)
7400                 {
7401                     /* RDOPT copy States :  TU init (best until prev TU) to current */
7402                     COPY_CABAC_STATES(
7403                         &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[rd_opt_curr_idx]
7404                              .s_cabac_ctxt.au1_ctxt_models[0],
7405                         &ps_ctxt->au1_rdopt_init_ctxt_models[0],
7406                         IHEVC_CAB_CTXT_END);
7407 
7408                     /* get the current transform size */
7409                     trans_size = ps_tu->s_tu.b3_size;
7410                     trans_size = (1 << (trans_size + 1)); /* in chroma units */
7411 
7412                     /* since 2x2 transform is not allowed for chroma*/
7413                     if(2 == trans_size)
7414                     {
7415                         trans_size = 4;
7416                     }
7417                 }
7418 
7419                 for(i4_subtu_idx = 0; i4_subtu_idx < u1_num_subtus_in_tu; i4_subtu_idx++)
7420                 {
7421                     WORD32 cbf;
7422                     UWORD8 u1_is_recon_available;
7423 
7424                     WORD32 nbr_flags = 0;
7425                     WORD32 zero_cols = 0;
7426                     WORD32 zero_rows = 0;
7427 
7428                     /* check if chroma present flag is set */
7429                     if(1 == ps_tu->s_tu.b3_chroma_intra_mode_idx)
7430                     {
7431                         UWORD8 *pu1_cur_pred;
7432                         UWORD8 *pu1_cur_recon;
7433                         UWORD8 *pu1_cur_src;
7434                         WORD16 *pi2_cur_deq_data;
7435                         WORD32 curr_pos_x, curr_pos_y;
7436                         LWORD64 trans_ssd_u, trans_ssd_v;
7437 
7438                         /* get the current sub-tu posx and posy w.r.t to cu */
7439                         curr_pos_x = (ps_tu->s_tu.b4_pos_x << 2) - (cu_pos_x << 3);
7440                         curr_pos_y = (ps_tu->s_tu.b4_pos_y << 2) - (cu_pos_y << 3) +
7441                                      (i4_subtu_idx * trans_size);
7442 
7443                         /* 420sp case only vertical height will be half */
7444                         if(u1_is_422 == 0)
7445                         {
7446                             curr_pos_y >>= 1;
7447                         }
7448 
7449                         /* increment the pointers to start of current Sub-TU */
7450                         pu1_cur_recon = (pu1_recon + curr_pos_x);
7451                         pu1_cur_recon += (curr_pos_y * i4_recon_stride);
7452                         pu1_cur_src = (pu1_chrm_src + curr_pos_x);
7453                         pu1_cur_src += (curr_pos_y * chrm_src_stride);
7454                         pu1_cur_pred = (pu1_pred + curr_pos_x);
7455                         pu1_cur_pred += (curr_pos_y * pred_strd);
7456                         pi2_cur_deq_data = pi2_deq_data + curr_pos_x;
7457                         pi2_cur_deq_data += (curr_pos_y * deq_data_strd);
7458 
7459                         /* populate the coeffs scan idx */
7460                         scan_idx = SCAN_DIAG_UPRIGHT;
7461 
7462                         /* perform intra prediction only for Intra case */
7463                         if(PRED_MODE_INTRA == ps_best_cu_prms->u1_intra_flag)
7464                         {
7465                             UWORD8 *pu1_top_left;
7466                             UWORD8 *pu1_top;
7467                             UWORD8 *pu1_left;
7468                             WORD32 left_strd;
7469 
7470                             calc_recon = !u1_compute_spatial_ssd &&
7471                                          ((4 == u1_num_tus) || (u1_is_422 == 1)) &&
7472                                          (((u1_num_tus == 1) && (0 == i4_subtu_idx)) ||
7473                                           ((ctr == 3) && (0 == i4_subtu_idx) && (u1_is_422 == 1)) ||
7474                                           ((u1_num_tus == 4) && (ctr < 3)));
7475 
7476                             /* left cu boundary */
7477                             if(0 == curr_pos_x)
7478                             {
7479                                 pu1_left = pu1_cu_left + curr_pos_y * cu_left_stride;
7480                                 left_strd = cu_left_stride;
7481                             }
7482                             else
7483                             {
7484                                 pu1_left = pu1_cur_recon - 2;
7485                                 left_strd = i4_recon_stride;
7486                             }
7487 
7488                             /* top cu boundary */
7489                             if(0 == curr_pos_y)
7490                             {
7491                                 pu1_top = pu1_cu_top + curr_pos_x;
7492                             }
7493                             else
7494                             {
7495                                 pu1_top = pu1_cur_recon - i4_recon_stride;
7496                             }
7497 
7498                             /* by default top left is set to cu top left */
7499                             pu1_top_left = pu1_cu_top_left;
7500 
7501                             /* top left based on position */
7502                             if((0 != curr_pos_y) && (0 == curr_pos_x))
7503                             {
7504                                 pu1_top_left = pu1_left - cu_left_stride;
7505                             }
7506                             else if(0 != curr_pos_x)
7507                             {
7508                                 pu1_top_left = pu1_top - 2;
7509                             }
7510 
7511                             /* for 4x4 transforms based on intra pred mode scan is chosen */
7512                             if(4 == trans_size)
7513                             {
7514                                 /* for modes from 22 upto 30 horizontal scan is used */
7515                                 if((chrm_pred_mode > 21) && (chrm_pred_mode < 31))
7516                                 {
7517                                     scan_idx = SCAN_HORZ;
7518                                 }
7519                                 /* for modes from 6 upto 14 vertical scan is used */
7520                                 else if((chrm_pred_mode > 5) && (chrm_pred_mode < 15))
7521                                 {
7522                                     scan_idx = SCAN_VERT;
7523                                 }
7524                             }
7525 
7526                             nbr_flags = ihevce_get_intra_chroma_tu_nbr(
7527                                 ps_best_cu_prms->au4_nbr_flags[ctr],
7528                                 i4_subtu_idx,
7529                                 trans_size,
7530                                 u1_is_422);
7531 
7532                             /* call the chroma reference array substitution */
7533                             ihevc_intra_pred_chroma_ref_substitution_fptr(
7534                                 pu1_top_left,
7535                                 pu1_top,
7536                                 pu1_left,
7537                                 left_strd,
7538                                 trans_size,
7539                                 nbr_flags,
7540                                 (UWORD8 *)ps_ctxt->pv_ref_sub_out,
7541                                 1);
7542 
7543                             /* use the look up to get the function idx */
7544                             chrm_pred_func_idx = g_i4_ip_funcs[chrm_pred_mode];
7545 
7546                             /* call the intra prediction function */
7547                             ps_ctxt->apf_chrm_ip[chrm_pred_func_idx](
7548                                 (UWORD8 *)ps_ctxt->pv_ref_sub_out,
7549                                 1,
7550                                 pu1_cur_pred,
7551                                 pred_strd,
7552                                 trans_size,
7553                                 chrm_pred_mode);
7554                         }
7555 
7556                         if(!ctr && !i4_subtu_idx && (u1_compute_spatial_ssd || calc_recon))
7557                         {
7558                             ps_recon_datastore->au1_is_chromaRecon_available[0] =
7559                                 !ps_best_cu_prms->u1_skip_flag;
7560                         }
7561                         else if(!ctr && !i4_subtu_idx)
7562                         {
7563                             ps_recon_datastore->au1_is_chromaRecon_available[0] = 0;
7564                         }
7565                         /************************************************************/
7566                         /* recon loop is done for all cases including skip cu       */
7567                         /* This is because skipping chroma residual based on luma   */
7568                         /* skip decision can lead to chroma artifacts               */
7569                         /************************************************************/
7570                         /************************************************************/
7571                         /*In the high quality and medium speed modes, wherein chroma*/
7572                         /*and luma costs are included in the total cost calculation */
7573                         /*the cost is just a ssd cost, and not that obtained through*/
7574                         /*iq_it path                                                */
7575                         /************************************************************/
7576                         if(ps_best_cu_prms->u1_skip_flag == 0)
7577                         {
7578                             WORD32 tu_bits;
7579 
7580                             cbf = ihevce_chroma_t_q_iq_ssd_scan_fxn(
7581                                 ps_ctxt,
7582                                 pu1_cur_pred,
7583                                 pred_strd,
7584                                 pu1_cur_src,
7585                                 chrm_src_stride,
7586                                 pi2_cur_deq_data,
7587                                 deq_data_strd,
7588                                 pu1_cur_recon,
7589                                 i4_recon_stride,
7590                                 pu1_ecd_data + total_bytes_offset,
7591                                 ps_ctxt->au1_cu_csbf,
7592                                 ps_ctxt->i4_cu_csbf_strd,
7593                                 trans_size,
7594                                 scan_idx,
7595                                 PRED_MODE_INTRA == ps_best_cu_prms->u1_intra_flag,
7596                                 &num_bytes,
7597                                 &tu_bits,
7598                                 &zero_cols,
7599                                 &zero_rows,
7600                                 &u1_is_recon_available,
7601                                 i4_perform_sbh,
7602                                 i4_perform_rdoq,
7603                                 &trans_ssd_u,
7604 #if USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
7605                                 i4_alpha_stim_multiplier,
7606                                 u1_is_cu_noisy,
7607 #endif
7608                                 ps_best_cu_prms->u1_skip_flag,
7609                                 u1_compute_spatial_ssd ? SPATIAL_DOMAIN_SSD : FREQUENCY_DOMAIN_SSD,
7610                                 U_PLANE);
7611 
7612                             if(u1_compute_spatial_ssd && u1_is_recon_available)
7613                             {
7614                                 ps_recon_datastore
7615                                     ->au1_bufId_with_winning_ChromaRecon[U_PLANE][ctr]
7616                                                                         [i4_subtu_idx] = 0;
7617                             }
7618                             else
7619                             {
7620                                 ps_recon_datastore
7621                                     ->au1_bufId_with_winning_ChromaRecon[U_PLANE][ctr]
7622                                                                         [i4_subtu_idx] = UCHAR_MAX;
7623                             }
7624 
7625 #if !USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
7626                             if(u1_is_cu_noisy && i4_alpha_stim_multiplier)
7627                             {
7628 #if !USE_RECON_TO_EVALUATE_STIM_IN_RDOPT
7629                                 trans_ssd_u = ihevce_inject_stim_into_distortion(
7630                                     pu1_cur_src,
7631                                     chrm_src_stride,
7632                                     pu1_cur_pred,
7633                                     pred_strd,
7634                                     trans_ssd_u,
7635                                     i4_alpha_stim_multiplier,
7636                                     trans_size,
7637                                     0,
7638                                     ps_ctxt->u1_enable_psyRDOPT,
7639                                     U_PLANE);
7640 #else
7641                                 if(u1_compute_spatial_ssd && u1_is_recon_available)
7642                                 {
7643                                     trans_ssd_u = ihevce_inject_stim_into_distortion(
7644                                         pu1_cur_src,
7645                                         chrm_src_stride,
7646                                         pu1_cur_recon,
7647                                         i4_recon_stride,
7648                                         trans_ssd_u,
7649                                         i4_alpha_stim_multiplier,
7650                                         trans_size,
7651                                         0,
7652                                         ps_ctxt->u1_enable_psyRDOPT,
7653                                         U_PLANE);
7654                                 }
7655                                 else
7656                                 {
7657                                     trans_ssd_u = ihevce_inject_stim_into_distortion(
7658                                         pu1_cur_src,
7659                                         chrm_src_stride,
7660                                         pu1_cur_pred,
7661                                         pred_strd,
7662                                         trans_ssd_u,
7663                                         i4_alpha_stim_multiplier,
7664                                         trans_size,
7665                                         0,
7666                                         ps_ctxt->u1_enable_psyRDOPT,
7667                                         U_PLANE);
7668                                 }
7669 #endif
7670                             }
7671 #endif
7672 
7673                             curr_cb_cod_cost =
7674                                 trans_ssd_u +
7675                                 COMPUTE_RATE_COST_CLIP30(
7676                                     tu_bits, ps_ctxt->i8_cl_ssd_lambda_chroma_qf, LAMBDA_Q_SHIFT);
7677 
7678                             chrm_tu_bits += tu_bits;
7679                             i4_bits_cb += tu_bits;
7680 
7681                             /* RDOPT copy States :  New updated after curr TU to TU init */
7682                             if(0 != cbf)
7683                             {
7684                                 COPY_CABAC_STATES(
7685                                     &ps_ctxt->au1_rdopt_init_ctxt_models[0],
7686                                     &ps_ctxt->s_rdopt_entropy_ctxt
7687                                          .as_cu_entropy_ctxt[rd_opt_curr_idx]
7688                                          .s_cabac_ctxt.au1_ctxt_models[0],
7689                                     IHEVC_CAB_CTXT_END);
7690                             }
7691                             /* RDOPT copy States :  Restoring back the Cb init state to Cr */
7692                             else
7693                             {
7694                                 COPY_CABAC_STATES(
7695                                     &ps_ctxt->s_rdopt_entropy_ctxt
7696                                          .as_cu_entropy_ctxt[rd_opt_curr_idx]
7697                                          .s_cabac_ctxt.au1_ctxt_models[0],
7698                                     &ps_ctxt->au1_rdopt_init_ctxt_models[0],
7699                                     IHEVC_CAB_CTXT_END);
7700                             }
7701 
7702                             /* If Intra and TU=CU/2, need recon for next TUs */
7703                             if(calc_recon)
7704                             {
7705                                 ihevce_chroma_it_recon_fxn(
7706                                     ps_ctxt,
7707                                     pi2_cur_deq_data,
7708                                     deq_data_strd,
7709                                     pu1_cur_pred,
7710                                     pred_strd,
7711                                     pu1_cur_recon,
7712                                     i4_recon_stride,
7713                                     (pu1_ecd_data + total_bytes_offset),
7714                                     trans_size,
7715                                     cbf,
7716                                     zero_cols,
7717                                     zero_rows,
7718                                     U_PLANE);
7719 
7720                                 ps_recon_datastore
7721                                     ->au1_bufId_with_winning_ChromaRecon[U_PLANE][ctr]
7722                                                                         [i4_subtu_idx] = 0;
7723                             }
7724                             else
7725                             {
7726                                 ps_recon_datastore
7727                                     ->au1_bufId_with_winning_ChromaRecon[U_PLANE][ctr]
7728                                                                         [i4_subtu_idx] = UCHAR_MAX;
7729                             }
7730                         }
7731                         else
7732                         {
7733                             /* num bytes is set to 0 */
7734                             num_bytes = 0;
7735 
7736                             /* cbf is returned as 0 */
7737                             cbf = 0;
7738 
7739                             curr_cb_cod_cost = trans_ssd_u =
7740 
7741                                 ps_ctxt->s_cmn_opt_func.pf_chroma_interleave_ssd_calculator(
7742                                     pu1_cur_pred,
7743                                     pu1_cur_src,
7744                                     pred_strd,
7745                                     chrm_src_stride,
7746                                     trans_size,
7747                                     trans_size);
7748 
7749                             if(u1_compute_spatial_ssd)
7750                             {
7751                                 /* buffer copy from pred to recon */
7752 
7753                                 ps_ctxt->s_cmn_opt_func.pf_chroma_interleave_2d_copy(
7754                                     pu1_cur_pred,
7755                                     pred_strd,
7756                                     pu1_cur_recon,
7757                                     i4_recon_stride,
7758                                     trans_size,
7759                                     trans_size,
7760                                     U_PLANE);
7761 
7762                                 ps_recon_datastore
7763                                     ->au1_bufId_with_winning_ChromaRecon[U_PLANE][ctr]
7764                                                                         [i4_subtu_idx] = 0;
7765                             }
7766 
7767                             if(u1_is_cu_noisy && i4_alpha_stim_multiplier)
7768                             {
7769                                 trans_ssd_u = ihevce_inject_stim_into_distortion(
7770                                     pu1_cur_src,
7771                                     chrm_src_stride,
7772                                     pu1_cur_pred,
7773                                     pred_strd,
7774                                     trans_ssd_u,
7775                                     i4_alpha_stim_multiplier,
7776                                     trans_size,
7777                                     0,
7778                                     ps_ctxt->u1_enable_psyRDOPT,
7779                                     U_PLANE);
7780                             }
7781 
7782 #if ENABLE_INTER_ZCU_COST
7783 #if !WEIGH_CHROMA_COST
7784                             /* cbf = 0, accumulate cu not coded cost */
7785                             ps_ctxt->i8_cu_not_coded_cost += curr_cb_cod_cost;
7786 #else
7787                             /* cbf = 0, accumulate cu not coded cost */
7788 
7789                             ps_ctxt->i8_cu_not_coded_cost += (LWORD64)(
7790                                 (curr_cb_cod_cost * ps_ctxt->u4_chroma_cost_weighing_factor +
7791                                  (1 << (CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT - 1))) >>
7792                                 CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT);
7793 #endif
7794 #endif
7795                         }
7796 
7797 #if !WEIGH_CHROMA_COST
7798                         curr_rdopt_cost += curr_cb_cod_cost;
7799 #else
7800                         curr_rdopt_cost +=
7801                             ((curr_cb_cod_cost * ps_ctxt->u4_chroma_cost_weighing_factor +
7802                               (1 << (CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT - 1))) >>
7803                              CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT);
7804 #endif
7805                         chrm_cod_cost += curr_cb_cod_cost;
7806                         i8_ssd_cb += trans_ssd_u;
7807 
7808                         if(ps_ctxt->i4_bitrate_instance_num || ps_ctxt->i4_num_bitrates == 1)
7809                         {
7810                             /* Early exit : If the current running cost exceeds
7811                             the prev. best mode cost, break */
7812                             if(curr_rdopt_cost > prev_best_rdopt_cost)
7813                             {
7814                                 u1_is_early_exit_condition_satisfied = 1;
7815                                 break;
7816                             }
7817                         }
7818 
7819                         /* inter cu is coded if any of the tu is coded in it */
7820                         ps_best_cu_prms->u1_is_cu_coded |= cbf;
7821 
7822                         /* update CB related params */
7823                         ps_tu->ai4_cb_coeff_offset[i4_subtu_idx] =
7824                             total_bytes_offset + init_bytes_offset;
7825 
7826                         if(0 == i4_subtu_idx)
7827                         {
7828                             ps_tu->s_tu.b1_cb_cbf = cbf;
7829                         }
7830                         else
7831                         {
7832                             ps_tu->s_tu.b1_cb_cbf_subtu1 = cbf;
7833                         }
7834 
7835                         total_bytes_offset += num_bytes;
7836 
7837                         ps_tu_temp_prms->au4_cb_zero_col[i4_subtu_idx] = zero_cols;
7838                         ps_tu_temp_prms->au4_cb_zero_row[i4_subtu_idx] = zero_rows;
7839                         ps_tu_temp_prms->ai2_cb_bytes_consumed[i4_subtu_idx] = num_bytes;
7840 
7841                         /* recon loop is done for non skip cases */
7842                         if(ps_best_cu_prms->u1_skip_flag == 0)
7843                         {
7844                             WORD32 tu_bits;
7845 
7846                             cbf = ihevce_chroma_t_q_iq_ssd_scan_fxn(
7847                                 ps_ctxt,
7848                                 pu1_cur_pred,
7849                                 pred_strd,
7850                                 pu1_cur_src,
7851                                 chrm_src_stride,
7852                                 pi2_cur_deq_data + trans_size,
7853                                 deq_data_strd,
7854                                 pu1_cur_recon,
7855                                 i4_recon_stride,
7856                                 pu1_ecd_data + total_bytes_offset,
7857                                 ps_ctxt->au1_cu_csbf,
7858                                 ps_ctxt->i4_cu_csbf_strd,
7859                                 trans_size,
7860                                 scan_idx,
7861                                 PRED_MODE_INTRA == ps_best_cu_prms->u1_intra_flag,
7862                                 &num_bytes,
7863                                 &tu_bits,
7864                                 &zero_cols,
7865                                 &zero_rows,
7866                                 &u1_is_recon_available,
7867                                 i4_perform_sbh,
7868                                 i4_perform_rdoq,
7869                                 &trans_ssd_v,
7870 #if USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
7871                                 i4_alpha_stim_multiplier,
7872                                 u1_is_cu_noisy,
7873 #endif
7874                                 ps_best_cu_prms->u1_skip_flag,
7875                                 u1_compute_spatial_ssd ? SPATIAL_DOMAIN_SSD : FREQUENCY_DOMAIN_SSD,
7876                                 V_PLANE);
7877 
7878                             if(u1_compute_spatial_ssd && u1_is_recon_available)
7879                             {
7880                                 ps_recon_datastore
7881                                     ->au1_bufId_with_winning_ChromaRecon[V_PLANE][ctr]
7882                                                                         [i4_subtu_idx] = 0;
7883                             }
7884                             else
7885                             {
7886                                 ps_recon_datastore
7887                                     ->au1_bufId_with_winning_ChromaRecon[V_PLANE][ctr]
7888                                                                         [i4_subtu_idx] = UCHAR_MAX;
7889                             }
7890 
7891 #if !USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
7892                             if(u1_is_cu_noisy && i4_alpha_stim_multiplier)
7893                             {
7894 #if !USE_RECON_TO_EVALUATE_STIM_IN_RDOPT
7895                                 trans_ssd_v = ihevce_inject_stim_into_distortion(
7896                                     pu1_cur_src,
7897                                     chrm_src_stride,
7898                                     pu1_cur_pred,
7899                                     pred_strd,
7900                                     trans_ssd_v,
7901                                     i4_alpha_stim_multiplier,
7902                                     trans_size,
7903                                     0,
7904                                     ps_ctxt->u1_enable_psyRDOPT,
7905                                     V_PLANE);
7906 #else
7907                                 if(u1_compute_spatial_ssd && u1_is_recon_available)
7908                                 {
7909                                     trans_ssd_v = ihevce_inject_stim_into_distortion(
7910                                         pu1_cur_src,
7911                                         chrm_src_stride,
7912                                         pu1_cur_recon,
7913                                         i4_recon_stride,
7914                                         trans_ssd_v,
7915                                         i4_alpha_stim_multiplier,
7916                                         trans_size,
7917                                         0,
7918                                         ps_ctxt->u1_enable_psyRDOPT,
7919                                         V_PLANE);
7920                                 }
7921                                 else
7922                                 {
7923                                     trans_ssd_v = ihevce_inject_stim_into_distortion(
7924                                         pu1_cur_src,
7925                                         chrm_src_stride,
7926                                         pu1_cur_pred,
7927                                         pred_strd,
7928                                         trans_ssd_v,
7929                                         i4_alpha_stim_multiplier,
7930                                         trans_size,
7931                                         0,
7932                                         ps_ctxt->u1_enable_psyRDOPT,
7933                                         V_PLANE);
7934                                 }
7935 #endif
7936                             }
7937 #endif
7938 
7939                             curr_cr_cod_cost =
7940                                 trans_ssd_v +
7941                                 COMPUTE_RATE_COST_CLIP30(
7942                                     tu_bits, ps_ctxt->i8_cl_ssd_lambda_chroma_qf, LAMBDA_Q_SHIFT);
7943                             chrm_tu_bits += tu_bits;
7944                             i4_bits_cr += tu_bits;
7945 
7946                             /* RDOPT copy States :  New updated after curr TU to TU init */
7947                             if(0 != cbf)
7948                             {
7949                                 COPY_CABAC_STATES(
7950                                     &ps_ctxt->au1_rdopt_init_ctxt_models[0],
7951                                     &ps_ctxt->s_rdopt_entropy_ctxt
7952                                          .as_cu_entropy_ctxt[rd_opt_curr_idx]
7953                                          .s_cabac_ctxt.au1_ctxt_models[0],
7954                                     IHEVC_CAB_CTXT_END);
7955                             }
7956                             /* RDOPT copy States :  Restoring back the Cb init state to Cr */
7957                             else
7958                             {
7959                                 COPY_CABAC_STATES(
7960                                     &ps_ctxt->s_rdopt_entropy_ctxt
7961                                          .as_cu_entropy_ctxt[rd_opt_curr_idx]
7962                                          .s_cabac_ctxt.au1_ctxt_models[0],
7963                                     &ps_ctxt->au1_rdopt_init_ctxt_models[0],
7964                                     IHEVC_CAB_CTXT_END);
7965                             }
7966 
7967                             /* If Intra and TU=CU/2, need recon for next TUs */
7968                             if(calc_recon)
7969                             {
7970                                 ihevce_chroma_it_recon_fxn(
7971                                     ps_ctxt,
7972                                     (pi2_cur_deq_data + trans_size),
7973                                     deq_data_strd,
7974                                     pu1_cur_pred,
7975                                     pred_strd,
7976                                     pu1_cur_recon,
7977                                     i4_recon_stride,
7978                                     (pu1_ecd_data + total_bytes_offset),
7979                                     trans_size,
7980                                     cbf,
7981                                     zero_cols,
7982                                     zero_rows,
7983                                     V_PLANE);
7984 
7985                                 ps_recon_datastore
7986                                     ->au1_bufId_with_winning_ChromaRecon[V_PLANE][ctr]
7987                                                                         [i4_subtu_idx] = 0;
7988                             }
7989                             else
7990                             {
7991                                 ps_recon_datastore
7992                                     ->au1_bufId_with_winning_ChromaRecon[V_PLANE][ctr]
7993                                                                         [i4_subtu_idx] = UCHAR_MAX;
7994                             }
7995                         }
7996                         else
7997                         {
7998                             /* num bytes is set to 0 */
7999                             num_bytes = 0;
8000 
8001                             /* cbf is returned as 0 */
8002                             cbf = 0;
8003 
8004                             curr_cr_cod_cost = trans_ssd_v =
8005 
8006                                 ps_ctxt->s_cmn_opt_func.pf_chroma_interleave_ssd_calculator(
8007                                     pu1_cur_pred + 1,
8008                                     pu1_cur_src + 1,
8009                                     pred_strd,
8010                                     chrm_src_stride,
8011                                     trans_size,
8012                                     trans_size);
8013 
8014                             if(u1_compute_spatial_ssd)
8015                             {
8016                                 /* buffer copy from pred to recon */
8017                                 ps_ctxt->s_cmn_opt_func.pf_chroma_interleave_2d_copy(
8018                                     pu1_cur_pred,
8019                                     pred_strd,
8020                                     pu1_cur_recon,
8021                                     i4_recon_stride,
8022                                     trans_size,
8023                                     trans_size,
8024                                     V_PLANE);
8025 
8026                                 ps_recon_datastore
8027                                     ->au1_bufId_with_winning_ChromaRecon[V_PLANE][ctr]
8028                                                                         [i4_subtu_idx] = 0;
8029                             }
8030 
8031                             if(u1_is_cu_noisy && i4_alpha_stim_multiplier)
8032                             {
8033                                 trans_ssd_v = ihevce_inject_stim_into_distortion(
8034                                     pu1_cur_src,
8035                                     chrm_src_stride,
8036                                     pu1_cur_pred,
8037                                     pred_strd,
8038                                     trans_ssd_v,
8039                                     i4_alpha_stim_multiplier,
8040                                     trans_size,
8041                                     0,
8042                                     ps_ctxt->u1_enable_psyRDOPT,
8043                                     V_PLANE);
8044                             }
8045 
8046 #if ENABLE_INTER_ZCU_COST
8047 #if !WEIGH_CHROMA_COST
8048                             /* cbf = 0, accumulate cu not coded cost */
8049                             ps_ctxt->i8_cu_not_coded_cost += curr_cr_cod_cost;
8050 #else
8051                             /* cbf = 0, accumulate cu not coded cost */
8052 
8053                             ps_ctxt->i8_cu_not_coded_cost += (LWORD64)(
8054                                 (curr_cr_cod_cost * ps_ctxt->u4_chroma_cost_weighing_factor +
8055                                  (1 << (CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT - 1))) >>
8056                                 CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT);
8057 #endif
8058 #endif
8059                         }
8060 
8061 #if !WEIGH_CHROMA_COST
8062                         curr_rdopt_cost += curr_cr_cod_cost;
8063 #else
8064                         curr_rdopt_cost +=
8065                             ((curr_cr_cod_cost * ps_ctxt->u4_chroma_cost_weighing_factor +
8066                               (1 << (CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT - 1))) >>
8067                              CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT);
8068 #endif
8069 
8070                         chrm_cod_cost += curr_cr_cod_cost;
8071                         i8_ssd_cr += trans_ssd_v;
8072 
8073                         if(ps_ctxt->i4_bitrate_instance_num || ps_ctxt->i4_num_bitrates == 1)
8074                         {
8075                             /* Early exit : If the current running cost exceeds
8076                             the prev. best mode cost, break */
8077                             if(curr_rdopt_cost > prev_best_rdopt_cost)
8078                             {
8079                                 u1_is_early_exit_condition_satisfied = 1;
8080                                 break;
8081                             }
8082                         }
8083 
8084                         /* inter cu is coded if any of the tu is coded in it */
8085                         ps_best_cu_prms->u1_is_cu_coded |= cbf;
8086 
8087                         /* update CR related params */
8088                         ps_tu->ai4_cr_coeff_offset[i4_subtu_idx] =
8089                             total_bytes_offset + init_bytes_offset;
8090 
8091                         if(0 == i4_subtu_idx)
8092                         {
8093                             ps_tu->s_tu.b1_cr_cbf = cbf;
8094                         }
8095                         else
8096                         {
8097                             ps_tu->s_tu.b1_cr_cbf_subtu1 = cbf;
8098                         }
8099 
8100                         total_bytes_offset += num_bytes;
8101 
8102                         ps_tu_temp_prms->au4_cr_zero_col[i4_subtu_idx] = zero_cols;
8103                         ps_tu_temp_prms->au4_cr_zero_row[i4_subtu_idx] = zero_rows;
8104                         ps_tu_temp_prms->ai2_cr_bytes_consumed[i4_subtu_idx] = num_bytes;
8105                     }
8106                     else
8107                     {
8108                         ps_recon_datastore
8109                             ->au1_bufId_with_winning_ChromaRecon[U_PLANE][ctr][i4_subtu_idx] =
8110                             UCHAR_MAX;
8111                         ps_recon_datastore
8112                             ->au1_bufId_with_winning_ChromaRecon[V_PLANE][ctr][i4_subtu_idx] =
8113                             UCHAR_MAX;
8114                     }
8115                 }
8116 
8117                 if(u1_is_early_exit_condition_satisfied)
8118                 {
8119                     break;
8120                 }
8121 
8122                 /* loop increments */
8123                 ps_tu++;
8124                 ps_tu_temp_prms++;
8125             }
8126 
8127             /* Signal as luma mode. HIGH_QUALITY may update it */
8128             ps_best_cu_prms->u1_chroma_intra_pred_mode = 4;
8129 
8130             /* modify the cost chrm_cod_cost */
8131             if(ps_ctxt->u1_enable_psyRDOPT)
8132             {
8133                 UWORD8 *pu1_recon_cu;
8134                 WORD32 recon_stride;
8135                 WORD32 curr_pos_x;
8136                 WORD32 curr_pos_y;
8137                 WORD32 start_index;
8138                 WORD32 num_horz_cu_in_ctb;
8139                 WORD32 had_block_size;
8140                 /* tODO: sreenivasa ctb size has to be used appropriately */
8141                 had_block_size = 8;
8142                 num_horz_cu_in_ctb = 2 * 64 / had_block_size;
8143 
8144                 curr_pos_x = cu_pos_x << 3; /* pel units */
8145                 curr_pos_y = cu_pos_y << 3; /* pel units */
8146                 recon_stride = i4_recon_stride;
8147                 pu1_recon_cu = pu1_recon;
8148 
8149                 /* start index to index the source satd of curr cu int he current ctb*/
8150                 start_index = 2 * (curr_pos_x / had_block_size) +
8151                               (curr_pos_y / had_block_size) * num_horz_cu_in_ctb;
8152 
8153                 {
8154                     chrm_cod_cost += ihevce_psy_rd_cost_croma(
8155                         ps_ctxt->ai4_source_chroma_satd,
8156                         pu1_recon,
8157                         recon_stride,
8158                         1,  //
8159                         cu_size,
8160                         0,  // pic type
8161                         0,  //layer id
8162                         ps_ctxt->i4_satd_lamda,  // lambda
8163                         start_index,
8164                         ps_ctxt->u1_is_input_data_hbd,  // 8 bit
8165                         ps_ctxt->u1_chroma_array_type,
8166                         &ps_ctxt->s_cmn_opt_func
8167 
8168                     );  // chroma subsampling 420
8169                 }
8170             }
8171         }
8172         else
8173         {
8174             u1_is_mode_eq_chroma_satd_mode = 1;
8175             chrm_cod_cost = MAX_COST_64;
8176         }
8177 
8178         /* If Intra Block and preset is HIGH QUALITY, then compare with best SATD mode */
8179         if((PRED_MODE_INTRA == ps_best_cu_prms->u1_intra_flag) &&
8180            (1 == ps_ctxt->s_chroma_rdopt_ctxt.u1_eval_chrm_satd))
8181         {
8182             if(64 == cu_size)
8183             {
8184                 ASSERT(TU_EQ_CU != func_proc_mode);
8185             }
8186 
8187             if(ps_ctxt->s_chroma_rdopt_ctxt.as_chr_intra_satd_ctxt[func_proc_mode]
8188                    .i8_chroma_best_rdopt < chrm_cod_cost)
8189             {
8190                 UWORD8 *pu1_src;
8191                 UWORD8 *pu1_ecd_data_src_cb;
8192                 UWORD8 *pu1_ecd_data_src_cr;
8193 
8194                 chroma_intra_satd_ctxt_t *ps_chr_intra_satd_ctxt =
8195                     &ps_ctxt->s_chroma_rdopt_ctxt.as_chr_intra_satd_ctxt[func_proc_mode];
8196 
8197                 UWORD8 *pu1_dst = &ps_ctxt->au1_rdopt_init_ctxt_models[0];
8198                 WORD32 ai4_ecd_data_cb_offset[2] = { 0, 0 };
8199                 WORD32 ai4_ecd_data_cr_offset[2] = { 0, 0 };
8200 
8201                 pu1_src = &ps_chr_intra_satd_ctxt->au1_chrm_satd_updated_ctxt_models[0];
8202                 chrm_cod_cost = ps_chr_intra_satd_ctxt->i8_chroma_best_rdopt;
8203                 chrm_pred_mode = ps_chr_intra_satd_ctxt->u1_best_cr_mode;
8204                 chrm_tu_bits = ps_chr_intra_satd_ctxt->i4_chrm_tu_bits;
8205 
8206                 if(u1_is_mode_eq_chroma_satd_mode)
8207                 {
8208                     chrm_cod_cost -= ps_chr_intra_satd_ctxt->i8_cost_to_encode_chroma_mode;
8209                 }
8210 
8211                 /*Resetting total_num_bytes_to 0*/
8212                 total_bytes_offset = 0;
8213 
8214                 /* Update the CABAC state corresponding to chroma only */
8215                 /* Chroma Cbf */
8216                 memcpy(pu1_dst + IHEVC_CAB_CBCR_IDX, pu1_src + IHEVC_CAB_CBCR_IDX, 2);
8217                 /* Chroma transform skip */
8218                 memcpy(pu1_dst + IHEVC_CAB_TFM_SKIP12, pu1_src + IHEVC_CAB_TFM_SKIP12, 1);
8219                 /* Chroma last coeff x prefix */
8220                 memcpy(
8221                     pu1_dst + IHEVC_CAB_COEFFX_PREFIX + 15,
8222                     pu1_src + IHEVC_CAB_COEFFX_PREFIX + 15,
8223                     3);
8224                 /* Chroma last coeff y prefix */
8225                 memcpy(
8226                     pu1_dst + IHEVC_CAB_COEFFY_PREFIX + 15,
8227                     pu1_src + IHEVC_CAB_COEFFY_PREFIX + 15,
8228                     3);
8229                 /* Chroma csbf */
8230                 memcpy(
8231                     pu1_dst + IHEVC_CAB_CODED_SUBLK_IDX + 2,
8232                     pu1_src + IHEVC_CAB_CODED_SUBLK_IDX + 2,
8233                     2);
8234                 /* Chroma sig coeff flags */
8235                 memcpy(
8236                     pu1_dst + IHEVC_CAB_COEFF_FLAG + 27, pu1_src + IHEVC_CAB_COEFF_FLAG + 27, 15);
8237                 /* Chroma absgt1 flags */
8238                 memcpy(
8239                     pu1_dst + IHEVC_CAB_COEFABS_GRTR1_FLAG + 16,
8240                     pu1_src + IHEVC_CAB_COEFABS_GRTR1_FLAG + 16,
8241                     8);
8242                 /* Chroma absgt2 flags */
8243                 memcpy(
8244                     pu1_dst + IHEVC_CAB_COEFABS_GRTR2_FLAG + 4,
8245                     pu1_src + IHEVC_CAB_COEFABS_GRTR2_FLAG + 4,
8246                     2);
8247 
8248                 ps_tu = &ps_best_cu_prms->as_tu_enc_loop[0];
8249                 ps_tu_temp_prms = &ps_best_cu_prms->as_tu_enc_loop_temp_prms[0];
8250 
8251                 /* update to luma decision as we update chroma in final mode */
8252                 ps_best_cu_prms->u1_is_cu_coded = u1_is_cu_coded_old;
8253 
8254                 for(ctr = 0; ctr < u1_num_tus; ctr++)
8255                 {
8256                     for(i4_subtu_idx = 0; i4_subtu_idx < u1_num_subtus_in_tu; i4_subtu_idx++)
8257                     {
8258                         WORD32 cbf;
8259                         WORD32 num_bytes;
8260 
8261                         pu1_ecd_data_src_cb =
8262                             &ps_chr_intra_satd_ctxt->au1_scan_coeff_cb[i4_subtu_idx][0];
8263                         pu1_ecd_data_src_cr =
8264                             &ps_chr_intra_satd_ctxt->au1_scan_coeff_cr[i4_subtu_idx][0];
8265 
8266                         /* check if chroma present flag is set */
8267                         if(1 == ps_tu->s_tu.b3_chroma_intra_mode_idx)
8268                         {
8269                             UWORD8 *pu1_cur_pred_dest;
8270                             UWORD8 *pu1_cur_pred_src;
8271                             WORD32 pred_src_strd;
8272                             WORD16 *pi2_cur_deq_data_dest;
8273                             WORD16 *pi2_cur_deq_data_src_cb;
8274                             WORD16 *pi2_cur_deq_data_src_cr;
8275                             WORD32 deq_src_strd;
8276 
8277                             WORD32 curr_pos_x, curr_pos_y;
8278 
8279                             trans_size = ps_tu->s_tu.b3_size;
8280                             trans_size = (1 << (trans_size + 1)); /* in chroma units */
8281 
8282                             /*Deriving stride values*/
8283                             pred_src_strd = ps_chr_intra_satd_ctxt->i4_pred_stride;
8284                             deq_src_strd = ps_chr_intra_satd_ctxt->i4_iq_buff_stride;
8285 
8286                             /* since 2x2 transform is not allowed for chroma*/
8287                             if(2 == trans_size)
8288                             {
8289                                 trans_size = 4;
8290                             }
8291 
8292                             /* get the current tu posx and posy w.r.t to cu */
8293                             curr_pos_x = (ps_tu->s_tu.b4_pos_x << 2) - (cu_pos_x << 3);
8294                             curr_pos_y = (ps_tu->s_tu.b4_pos_y << 2) - (cu_pos_y << 3) +
8295                                          (i4_subtu_idx * trans_size);
8296 
8297                             /* 420sp case only vertical height will be half */
8298                             if(0 == u1_is_422)
8299                             {
8300                                 curr_pos_y >>= 1;
8301                             }
8302 
8303                             /* increment the pointers to start of current TU  */
8304                             pu1_cur_pred_src =
8305                                 ((UWORD8 *)ps_chr_intra_satd_ctxt->pv_pred_data + curr_pos_x);
8306                             pu1_cur_pred_src += (curr_pos_y * pred_src_strd);
8307                             pu1_cur_pred_dest = (pu1_pred + curr_pos_x);
8308                             pu1_cur_pred_dest += (curr_pos_y * pred_strd);
8309 
8310                             pi2_cur_deq_data_src_cb =
8311                                 &ps_chr_intra_satd_ctxt->ai2_iq_data_cb[0] + (curr_pos_x >> 1);
8312                             pi2_cur_deq_data_src_cr =
8313                                 &ps_chr_intra_satd_ctxt->ai2_iq_data_cr[0] + (curr_pos_x >> 1);
8314                             pi2_cur_deq_data_src_cb += (curr_pos_y * deq_src_strd);
8315                             pi2_cur_deq_data_src_cr += (curr_pos_y * deq_src_strd);
8316                             pi2_cur_deq_data_dest = pi2_deq_data + curr_pos_x;
8317                             pi2_cur_deq_data_dest += (curr_pos_y * deq_data_strd);
8318 
8319                             /*Overwriting deq data with that belonging to the winning special mode
8320                             (luma mode !=  chroma mode)
8321                             ihevce_copy_2d takes source and dest arguments as UWORD8 *. We have to
8322                             correspondingly manipulate to copy WORD16 data*/
8323 
8324                             ps_ctxt->s_cmn_opt_func.pf_copy_2d(
8325                                 (UWORD8 *)pi2_cur_deq_data_dest,
8326                                 (deq_data_strd << 1),
8327                                 (UWORD8 *)pi2_cur_deq_data_src_cb,
8328                                 (deq_src_strd << 1),
8329                                 (trans_size << 1),
8330                                 trans_size);
8331 
8332                             ps_ctxt->s_cmn_opt_func.pf_copy_2d(
8333                                 (UWORD8 *)(pi2_cur_deq_data_dest + trans_size),
8334                                 (deq_data_strd << 1),
8335                                 (UWORD8 *)pi2_cur_deq_data_src_cr,
8336                                 (deq_src_strd << 1),
8337                                 (trans_size << 1),
8338                                 trans_size);
8339 
8340                             /*Overwriting pred data with that belonging to the winning special mode
8341                             (luma mode !=  chroma mode)*/
8342 
8343                             ps_ctxt->s_cmn_opt_func.pf_copy_2d(
8344                                 pu1_cur_pred_dest,
8345                                 pred_strd,
8346                                 pu1_cur_pred_src,
8347                                 pred_src_strd,
8348                                 (trans_size << 1),
8349                                 trans_size);
8350 
8351                             num_bytes = ps_chr_intra_satd_ctxt
8352                                             ->ai4_num_bytes_scan_coeff_cb_per_tu[i4_subtu_idx][ctr];
8353                             cbf = ps_chr_intra_satd_ctxt->au1_cbf_cb[i4_subtu_idx][ctr];
8354                             /* inter cu is coded if any of the tu is coded in it */
8355                             ps_best_cu_prms->u1_is_cu_coded |= cbf;
8356 
8357                             /* update CB related params */
8358                             ps_tu->ai4_cb_coeff_offset[i4_subtu_idx] =
8359                                 total_bytes_offset + init_bytes_offset;
8360 
8361                             if(0 == i4_subtu_idx)
8362                             {
8363                                 ps_tu->s_tu.b1_cb_cbf = cbf;
8364                             }
8365                             else
8366                             {
8367                                 ps_tu->s_tu.b1_cb_cbf_subtu1 = cbf;
8368                             }
8369 
8370                             /*Overwriting the cb ecd data corresponding to the special mode*/
8371                             if(0 != num_bytes)
8372                             {
8373                                 memcpy(
8374                                     (pu1_ecd_data + total_bytes_offset),
8375                                     pu1_ecd_data_src_cb + ai4_ecd_data_cb_offset[i4_subtu_idx],
8376                                     num_bytes);
8377                             }
8378 
8379                             total_bytes_offset += num_bytes;
8380                             ai4_ecd_data_cb_offset[i4_subtu_idx] += num_bytes;
8381                             ps_tu_temp_prms->ai2_cb_bytes_consumed[i4_subtu_idx] = num_bytes;
8382 
8383                             num_bytes = ps_chr_intra_satd_ctxt
8384                                             ->ai4_num_bytes_scan_coeff_cr_per_tu[i4_subtu_idx][ctr];
8385                             cbf = ps_chr_intra_satd_ctxt->au1_cbf_cr[i4_subtu_idx][ctr];
8386                             /* inter cu is coded if any of the tu is coded in it */
8387                             ps_best_cu_prms->u1_is_cu_coded |= cbf;
8388 
8389                             /*Overwriting the cr ecd data corresponding to the special mode*/
8390                             if(0 != num_bytes)
8391                             {
8392                                 memcpy(
8393                                     (pu1_ecd_data + total_bytes_offset),
8394                                     pu1_ecd_data_src_cr + ai4_ecd_data_cr_offset[i4_subtu_idx],
8395                                     num_bytes);
8396                             }
8397 
8398                             /* update CR related params */
8399                             ps_tu->ai4_cr_coeff_offset[i4_subtu_idx] =
8400                                 total_bytes_offset + init_bytes_offset;
8401 
8402                             if(0 == i4_subtu_idx)
8403                             {
8404                                 ps_tu->s_tu.b1_cr_cbf = cbf;
8405                             }
8406                             else
8407                             {
8408                                 ps_tu->s_tu.b1_cr_cbf_subtu1 = cbf;
8409                             }
8410 
8411                             total_bytes_offset += num_bytes;
8412                             ai4_ecd_data_cr_offset[i4_subtu_idx] += num_bytes;
8413 
8414                             /*Updating zero rows and zero cols*/
8415                             ps_tu_temp_prms->au4_cb_zero_col[i4_subtu_idx] =
8416                                 ps_chr_intra_satd_ctxt->ai4_zero_col_cb[i4_subtu_idx][ctr];
8417                             ps_tu_temp_prms->au4_cb_zero_row[i4_subtu_idx] =
8418                                 ps_chr_intra_satd_ctxt->ai4_zero_row_cb[i4_subtu_idx][ctr];
8419                             ps_tu_temp_prms->au4_cr_zero_col[i4_subtu_idx] =
8420                                 ps_chr_intra_satd_ctxt->ai4_zero_col_cr[i4_subtu_idx][ctr];
8421                             ps_tu_temp_prms->au4_cr_zero_row[i4_subtu_idx] =
8422                                 ps_chr_intra_satd_ctxt->ai4_zero_row_cr[i4_subtu_idx][ctr];
8423 
8424                             ps_tu_temp_prms->ai2_cr_bytes_consumed[i4_subtu_idx] = num_bytes;
8425 
8426                             if((u1_num_tus > 1) &&
8427                                ps_recon_datastore->au1_is_chromaRecon_available[2])
8428                             {
8429                                 ps_recon_datastore
8430                                     ->au1_bufId_with_winning_ChromaRecon[U_PLANE][ctr]
8431                                                                         [i4_subtu_idx] = 2;
8432                                 ps_recon_datastore
8433                                     ->au1_bufId_with_winning_ChromaRecon[V_PLANE][ctr]
8434                                                                         [i4_subtu_idx] = 2;
8435                             }
8436                             else if(
8437                                 (1 == u1_num_tus) &&
8438                                 ps_recon_datastore->au1_is_chromaRecon_available[1])
8439                             {
8440                                 ps_recon_datastore
8441                                     ->au1_bufId_with_winning_ChromaRecon[U_PLANE][ctr]
8442                                                                         [i4_subtu_idx] = 1;
8443                                 ps_recon_datastore
8444                                     ->au1_bufId_with_winning_ChromaRecon[V_PLANE][ctr]
8445                                                                         [i4_subtu_idx] = 1;
8446                             }
8447                             else
8448                             {
8449                                 ps_recon_datastore
8450                                     ->au1_bufId_with_winning_ChromaRecon[U_PLANE][ctr]
8451                                                                         [i4_subtu_idx] = UCHAR_MAX;
8452                                 ps_recon_datastore
8453                                     ->au1_bufId_with_winning_ChromaRecon[V_PLANE][ctr]
8454                                                                         [i4_subtu_idx] = UCHAR_MAX;
8455                             }
8456                         }
8457                     }
8458 
8459                     /* loop increments */
8460                     ps_tu++;
8461                     ps_tu_temp_prms++;
8462                 }
8463             }
8464 
8465             if(!u1_is_422)
8466             {
8467                 if(chrm_pred_mode == luma_pred_mode)
8468                 {
8469                     ps_best_cu_prms->u1_chroma_intra_pred_mode = 4;
8470                 }
8471                 else if(chrm_pred_mode == 0)
8472                 {
8473                     ps_best_cu_prms->u1_chroma_intra_pred_mode = 0;
8474                 }
8475                 else if(chrm_pred_mode == 1)
8476                 {
8477                     ps_best_cu_prms->u1_chroma_intra_pred_mode = 3;
8478                 }
8479                 else if(chrm_pred_mode == 10)
8480                 {
8481                     ps_best_cu_prms->u1_chroma_intra_pred_mode = 2;
8482                 }
8483                 else if(chrm_pred_mode == 26)
8484                 {
8485                     ps_best_cu_prms->u1_chroma_intra_pred_mode = 1;
8486                 }
8487                 else
8488                 {
8489                     ASSERT(0); /*Should not come here*/
8490                 }
8491             }
8492             else
8493             {
8494                 if(chrm_pred_mode == gau1_chroma422_intra_angle_mapping[luma_pred_mode])
8495                 {
8496                     ps_best_cu_prms->u1_chroma_intra_pred_mode = 4;
8497                 }
8498                 else if(chrm_pred_mode == gau1_chroma422_intra_angle_mapping[0])
8499                 {
8500                     ps_best_cu_prms->u1_chroma_intra_pred_mode = 0;
8501                 }
8502                 else if(chrm_pred_mode == gau1_chroma422_intra_angle_mapping[1])
8503                 {
8504                     ps_best_cu_prms->u1_chroma_intra_pred_mode = 3;
8505                 }
8506                 else if(chrm_pred_mode == gau1_chroma422_intra_angle_mapping[10])
8507                 {
8508                     ps_best_cu_prms->u1_chroma_intra_pred_mode = 2;
8509                 }
8510                 else if(chrm_pred_mode == gau1_chroma422_intra_angle_mapping[26])
8511                 {
8512                     ps_best_cu_prms->u1_chroma_intra_pred_mode = 1;
8513                 }
8514                 else
8515                 {
8516                     ASSERT(0); /*Should not come here*/
8517                 }
8518             }
8519         }
8520 
8521         /* Store the actual chroma mode */
8522         ps_best_cu_prms->u1_chroma_intra_pred_actual_mode = chrm_pred_mode;
8523     }
8524 
8525     /* update the total bytes produced */
8526     ps_best_cu_prms->i4_num_bytes_ecd_data = total_bytes_offset + init_bytes_offset;
8527 
8528     /* store the final chrm bits accumulated */
8529     *pi4_chrm_tu_bits = chrm_tu_bits;
8530 
8531     return (chrm_cod_cost);
8532 }
8533 
8534 /*!
8535 ******************************************************************************
8536 * \if Function name : ihevce_final_rdopt_mode_prcs \endif
8537 *
8538 * \brief
8539 *    Final RDOPT mode process function. Performs Recon computation for the
8540 *    final mode. Re-use or Compute pred, iq-data, coeff based on the flags.
8541 *
8542 * \param[in] ps_ctxt : pointer to enc_loop module context
8543 * \param[in] ps_prms : pointer to struct containing requisite parameters
8544 *
8545 * \return
8546 *    None
8547 *
8548 * \author
8549 *  Ittiam
8550 *
8551 *****************************************************************************
8552 */
ihevce_final_rdopt_mode_prcs(ihevce_enc_loop_ctxt_t * ps_ctxt,final_mode_process_prms_t * ps_prms)8553 void ihevce_final_rdopt_mode_prcs(
8554     ihevce_enc_loop_ctxt_t *ps_ctxt, final_mode_process_prms_t *ps_prms)
8555 {
8556     enc_loop_cu_final_prms_t *ps_best_cu_prms;
8557     tu_enc_loop_out_t *ps_tu_enc_loop;
8558     tu_enc_loop_temp_prms_t *ps_tu_enc_loop_temp_prms;
8559     nbr_avail_flags_t s_nbr;
8560     recon_datastore_t *ps_recon_datastore;
8561 
8562     ihevc_intra_pred_luma_ref_substitution_ft *ihevc_intra_pred_luma_ref_substitution_fptr;
8563     ihevc_intra_pred_chroma_ref_substitution_ft *ihevc_intra_pred_chroma_ref_substitution_fptr;
8564     ihevc_intra_pred_ref_filtering_ft *ihevc_intra_pred_ref_filtering_fptr;
8565 
8566     WORD32 num_tu_in_cu;
8567     LWORD64 rd_opt_cost;
8568     WORD32 ctr;
8569     WORD32 i4_subtu_idx;
8570     WORD32 cu_size;
8571     WORD32 cu_pos_x, cu_pos_y;
8572     WORD32 chrm_present_flag = 1;
8573     WORD32 num_bytes, total_bytes = 0;
8574     WORD32 chrm_ctr = 0;
8575     WORD32 u1_is_cu_coded;
8576     UWORD8 *pu1_old_ecd_data;
8577     UWORD8 *pu1_chrm_old_ecd_data;
8578     UWORD8 *pu1_cur_pred;
8579     WORD16 *pi2_deq_data;
8580     WORD16 *pi2_chrm_deq_data;
8581     WORD16 *pi2_cur_deq_data;
8582     WORD16 *pi2_cur_deq_data_chrm;
8583     UWORD8 *pu1_cur_luma_recon;
8584     UWORD8 *pu1_cur_chroma_recon;
8585     UWORD8 *pu1_cur_src;
8586     UWORD8 *pu1_cur_src_chrm;
8587     UWORD8 *pu1_cur_pred_chrm;
8588     UWORD8 *pu1_intra_pred_mode;
8589     UWORD32 *pu4_nbr_flags;
8590     LWORD64 i8_ssd;
8591 
8592     cu_nbr_prms_t *ps_cu_nbr_prms = ps_prms->ps_cu_nbr_prms;
8593     cu_inter_cand_t *ps_best_inter_cand = ps_prms->ps_best_inter_cand;
8594     enc_loop_chrm_cu_buf_prms_t *ps_chrm_cu_buf_prms = ps_prms->ps_chrm_cu_buf_prms;
8595 
8596     WORD32 packed_pred_mode = ps_prms->packed_pred_mode;
8597     WORD32 rd_opt_best_idx = ps_prms->rd_opt_best_idx;
8598     UWORD8 *pu1_src = (UWORD8 *)ps_prms->pv_src;
8599     WORD32 src_strd = ps_prms->src_strd;
8600     UWORD8 *pu1_pred = (UWORD8 *)ps_prms->pv_pred;
8601     WORD32 pred_strd = ps_prms->pred_strd;
8602     UWORD8 *pu1_pred_chrm = (UWORD8 *)ps_prms->pv_pred_chrm;
8603     WORD32 pred_chrm_strd = ps_prms->pred_chrm_strd;
8604     UWORD8 *pu1_final_ecd_data = ps_prms->pu1_final_ecd_data;
8605     UWORD8 *pu1_csbf_buf = ps_prms->pu1_csbf_buf;
8606     WORD32 csbf_strd = ps_prms->csbf_strd;
8607     UWORD8 *pu1_luma_recon = (UWORD8 *)ps_prms->pv_luma_recon;
8608     WORD32 recon_luma_strd = ps_prms->recon_luma_strd;
8609     UWORD8 *pu1_chrm_recon = (UWORD8 *)ps_prms->pv_chrm_recon;
8610     WORD32 recon_chrma_strd = ps_prms->recon_chrma_strd;
8611     UWORD8 u1_cu_pos_x = ps_prms->u1_cu_pos_x;
8612     UWORD8 u1_cu_pos_y = ps_prms->u1_cu_pos_y;
8613     UWORD8 u1_cu_size = ps_prms->u1_cu_size;
8614     WORD8 i1_cu_qp = ps_prms->i1_cu_qp;
8615     UWORD8 u1_is_422 = (ps_ctxt->u1_chroma_array_type == 2);
8616     UWORD8 u1_num_subtus = (u1_is_422 == 1) + 1;
8617     /* Get the Chroma pointer and parameters */
8618     UWORD8 *pu1_src_chrm = ps_chrm_cu_buf_prms->pu1_curr_src;
8619     WORD32 src_chrm_strd = ps_chrm_cu_buf_prms->i4_chrm_src_stride;
8620     UWORD8 u1_compute_spatial_ssd_luma = 0;
8621     UWORD8 u1_compute_spatial_ssd_chroma = 0;
8622     /* Get the pointer for function selector */
8623     ihevc_intra_pred_luma_ref_substitution_fptr =
8624         ps_ctxt->ps_func_selector->ihevc_intra_pred_luma_ref_substitution_fptr;
8625 
8626     ihevc_intra_pred_ref_filtering_fptr =
8627         ps_ctxt->ps_func_selector->ihevc_intra_pred_ref_filtering_fptr;
8628 
8629     ihevc_intra_pred_chroma_ref_substitution_fptr =
8630         ps_ctxt->ps_func_selector->ihevc_intra_pred_chroma_ref_substitution_fptr;
8631 
8632     /* Get the best CU parameters */
8633     ps_best_cu_prms = &ps_ctxt->as_cu_prms[rd_opt_best_idx];
8634     num_tu_in_cu = ps_best_cu_prms->u2_num_tus_in_cu;
8635     cu_size = ps_best_cu_prms->u1_cu_size;
8636     cu_pos_x = u1_cu_pos_x;
8637     cu_pos_y = u1_cu_pos_y;
8638     pu1_intra_pred_mode = &ps_best_cu_prms->au1_intra_pred_mode[0];
8639     pu4_nbr_flags = &ps_best_cu_prms->au4_nbr_flags[0];
8640     ps_recon_datastore = &ps_best_cu_prms->s_recon_datastore;
8641 
8642     /* get the first TU pointer */
8643     ps_tu_enc_loop = &ps_best_cu_prms->as_tu_enc_loop[0];
8644     /* get the first TU only enc_loop prms pointer */
8645     ps_tu_enc_loop_temp_prms = &ps_best_cu_prms->as_tu_enc_loop_temp_prms[0];
8646     /*modify quant related param in ctxt based on current cu qp*/
8647     if((ps_ctxt->i1_cu_qp_delta_enable))
8648     {
8649         /*recompute quant related param at every cu level*/
8650         ihevce_compute_quant_rel_param(ps_ctxt, i1_cu_qp);
8651 
8652         /* get frame level lambda params */
8653         ihevce_get_cl_cu_lambda_prms(
8654             ps_ctxt, MODULATE_LAMDA_WHEN_SPATIAL_MOD_ON ? i1_cu_qp : ps_ctxt->i4_frame_qp);
8655     }
8656 
8657     ps_best_cu_prms->i8_cu_ssd = 0;
8658     ps_best_cu_prms->u4_cu_open_intra_sad = 0;
8659 
8660     /* For skip case : Set TU_size = CU_size and make cbf = 0
8661     so that same TU loop can be used for all modes */
8662     if(PRED_MODE_SKIP == packed_pred_mode)
8663     {
8664         for(ctr = 0; ctr < num_tu_in_cu; ctr++)
8665         {
8666             ps_tu_enc_loop->s_tu.b1_y_cbf = 0;
8667 
8668             ps_tu_enc_loop_temp_prms->i2_luma_bytes_consumed = 0;
8669 
8670             ps_tu_enc_loop++;
8671             ps_tu_enc_loop_temp_prms++;
8672         }
8673 
8674         /* go back to the first TU pointer */
8675         ps_tu_enc_loop = &ps_best_cu_prms->as_tu_enc_loop[0];
8676         ps_tu_enc_loop_temp_prms = &ps_best_cu_prms->as_tu_enc_loop_temp_prms[0];
8677     }
8678     /**   For inter case, pred calculation is outside the loop     **/
8679     if(PRED_MODE_INTRA != packed_pred_mode)
8680     {
8681         /**------------- Compute pred data if required --------------**/
8682         if((1 == ps_ctxt->s_cu_final_recon_flags.u1_eval_luma_pred_data))
8683         {
8684             nbr_4x4_t *ps_topleft_nbr_4x4;
8685             nbr_4x4_t *ps_left_nbr_4x4;
8686             nbr_4x4_t *ps_top_nbr_4x4;
8687             WORD32 nbr_4x4_left_strd;
8688 
8689             ps_best_inter_cand->pu1_pred_data = pu1_pred;
8690             ps_best_inter_cand->i4_pred_data_stride = pred_strd;
8691 
8692             /* Get the CU nbr information */
8693             ps_topleft_nbr_4x4 = ps_cu_nbr_prms->ps_topleft_nbr_4x4;
8694             ps_left_nbr_4x4 = ps_cu_nbr_prms->ps_left_nbr_4x4;
8695             ps_top_nbr_4x4 = ps_cu_nbr_prms->ps_top_nbr_4x4;
8696             nbr_4x4_left_strd = ps_cu_nbr_prms->nbr_4x4_left_strd;
8697 
8698             /* MVP ,MVD calc and Motion compensation */
8699             rd_opt_cost = ((pf_inter_rdopt_cu_mc_mvp)ps_ctxt->pv_inter_rdopt_cu_mc_mvp)(
8700                 ps_ctxt,
8701                 ps_best_inter_cand,
8702                 u1_cu_size,
8703                 cu_pos_x,
8704                 cu_pos_y,
8705                 ps_left_nbr_4x4,
8706                 ps_top_nbr_4x4,
8707                 ps_topleft_nbr_4x4,
8708                 nbr_4x4_left_strd,
8709                 rd_opt_best_idx);
8710         }
8711 
8712         /** ------ Motion Compensation for Chroma -------- **/
8713         if(1 == ps_ctxt->s_cu_final_recon_flags.u1_eval_chroma_pred_data)
8714         {
8715             UWORD8 *pu1_cur_pred;
8716             pu1_cur_pred = pu1_pred_chrm;
8717 
8718             /* run a loop over all the partitons in cu */
8719             for(ctr = 0; ctr < ps_best_cu_prms->u2_num_pus_in_cu; ctr++)
8720             {
8721                 pu_t *ps_pu;
8722                 WORD32 inter_pu_wd, inter_pu_ht;
8723 
8724                 ps_pu = &ps_best_cu_prms->as_pu_chrm_proc[ctr];
8725 
8726                 /* IF AMP then each partitions can have diff wd ht */
8727                 inter_pu_wd = (ps_pu->b4_wd + 1) << 2; /* cb and cr pixel interleaved */
8728                 inter_pu_ht = ((ps_pu->b4_ht + 1) << 2) >> 1;
8729                 inter_pu_ht <<= u1_is_422;
8730                 /* chroma mc func */
8731                 ihevce_chroma_inter_pred_pu(
8732                     &ps_ctxt->s_mc_ctxt, ps_pu, pu1_cur_pred, pred_chrm_strd);
8733                 if(2 == ps_best_cu_prms->u2_num_pus_in_cu)
8734                 {
8735                     /* 2Nx__ partion case */
8736                     if(inter_pu_wd == ps_best_cu_prms->u1_cu_size)
8737                     {
8738                         pu1_cur_pred += (inter_pu_ht * pred_chrm_strd);
8739                     }
8740                     /* __x2N partion case */
8741                     if(inter_pu_ht == (ps_best_cu_prms->u1_cu_size >> (u1_is_422 == 0)))
8742                     {
8743                         pu1_cur_pred += inter_pu_wd;
8744                     }
8745                 }
8746             }
8747         }
8748     }
8749     pi2_deq_data = &ps_best_cu_prms->pi2_cu_deq_coeffs[0];
8750     pi2_chrm_deq_data =
8751         &ps_best_cu_prms->pi2_cu_deq_coeffs[0] + ps_best_cu_prms->i4_chrm_deq_coeff_strt_idx;
8752     pu1_old_ecd_data = &ps_best_cu_prms->pu1_cu_coeffs[0];
8753     pu1_chrm_old_ecd_data =
8754         &ps_best_cu_prms->pu1_cu_coeffs[0] + ps_best_cu_prms->i4_chrm_cu_coeff_strt_idx;
8755 
8756     /* default value for cu coded flag */
8757     u1_is_cu_coded = 0;
8758 
8759     /* If we are re-computing coeff, set sad to 0 and start accumulating */
8760     /* else use the best cand. sad from RDOPT stage                    */
8761     if(1 == ps_tu_enc_loop_temp_prms->b1_eval_luma_iq_and_coeff_data)
8762     {
        /* init the CU sad, which is accumulated over all TUs */
8764         ps_best_cu_prms->u4_cu_sad = 0;
8765 
8766         /* reset the luma residual bits */
8767         ps_best_cu_prms->u4_cu_luma_res_bits = 0;
8768     }
8769 
8770     if(1 == ps_tu_enc_loop_temp_prms->b1_eval_chroma_iq_and_coeff_data)
8771     {
8772         /* reset the chroma residual bits */
8773         ps_best_cu_prms->u4_cu_chroma_res_bits = 0;
8774     }
8775 
8776     if((1 == ps_tu_enc_loop_temp_prms->b1_eval_luma_iq_and_coeff_data) ||
8777        (1 == ps_tu_enc_loop_temp_prms->b1_eval_chroma_iq_and_coeff_data))
8778     {
8779         /*Header bits have to be reevaluated if luma and chroma reevaluation is done, as
8780         the quantized coefficients might be changed.
8781         We are copying only those states which correspond to the header from the cabac state
8782         of the previous CU, because the header is going to be recomputed for this condition*/
8783         ps_ctxt->s_cu_final_recon_flags.u1_eval_header_data = 1;
8784         memcpy(
8785             &ps_ctxt->au1_rdopt_init_ctxt_models[0],
8786             &ps_ctxt->s_rdopt_entropy_ctxt.au1_init_cabac_ctxt_states[0],
8787             IHEVC_CAB_COEFFX_PREFIX);
8788 
8789         if((1 == ps_tu_enc_loop_temp_prms->b1_eval_luma_iq_and_coeff_data))
8790         {
8791             COPY_CABAC_STATES_FRM_CAB_COEFFX_PREFIX(
8792                 (&ps_ctxt->au1_rdopt_init_ctxt_models[0] + IHEVC_CAB_COEFFX_PREFIX),
8793                 (&ps_ctxt->s_rdopt_entropy_ctxt.au1_init_cabac_ctxt_states[0] +
8794                  IHEVC_CAB_COEFFX_PREFIX),
8795                 (IHEVC_CAB_CTXT_END - IHEVC_CAB_COEFFX_PREFIX));
8796         }
8797         else
8798         {
8799             COPY_CABAC_STATES_FRM_CAB_COEFFX_PREFIX(
8800                 (&ps_ctxt->au1_rdopt_init_ctxt_models[0] + IHEVC_CAB_COEFFX_PREFIX),
8801                 (&ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[rd_opt_best_idx]
8802                       .s_cabac_ctxt.au1_ctxt_models[0] +
8803                  IHEVC_CAB_COEFFX_PREFIX),
8804                 (IHEVC_CAB_CTXT_END - IHEVC_CAB_COEFFX_PREFIX));
8805         }
8806         ps_ctxt->s_rdopt_entropy_ctxt.i4_curr_buf_idx = rd_opt_best_idx;
8807     }
8808     else
8809     {
8810         ps_ctxt->s_cu_final_recon_flags.u1_eval_header_data = 0;
8811     }
8812 
8813     /* Zero cbf tool is disabled for intra CUs */
8814     if(PRED_MODE_INTRA == packed_pred_mode)
8815     {
8816 #if ENABLE_ZERO_CBF_IN_INTRA
8817         ps_ctxt->i4_zcbf_rdo_level = ZCBF_ENABLE;
8818 #else
8819         ps_ctxt->i4_zcbf_rdo_level = NO_ZCBF;
8820 #endif
8821     }
8822     else
8823     {
8824 #if DISABLE_ZERO_ZBF_IN_INTER
8825         ps_ctxt->i4_zcbf_rdo_level = NO_ZCBF;
8826 #else
8827         ps_ctxt->i4_zcbf_rdo_level = ZCBF_ENABLE;
8828 #endif
8829     }
8830 
8831     /** Loop for all tu blocks in current cu and do reconstruction **/
8832     for(ctr = 0; ctr < num_tu_in_cu; ctr++)
8833     {
8834         tu_t *ps_tu;
8835         WORD32 trans_size, num_4x4_in_tu;
8836         WORD32 cbf, zero_rows, zero_cols;
8837         WORD32 cu_pos_x_in_4x4, cu_pos_y_in_4x4;
8838         WORD32 cu_pos_x_in_pix, cu_pos_y_in_pix;
8839         WORD32 luma_pred_mode, chroma_pred_mode = 0;
8840         UWORD8 au1_is_recon_available[2];
8841 
8842         ps_tu = &(ps_tu_enc_loop->s_tu); /* Points to the TU property ctxt */
8843 
8844         u1_compute_spatial_ssd_luma = 0;
8845         u1_compute_spatial_ssd_chroma = 0;
8846 
8847         trans_size = 1 << (ps_tu->b3_size + 2);
8848         num_4x4_in_tu = (trans_size >> 2);
8849         cu_pos_x_in_4x4 = ps_tu->b4_pos_x;
8850         cu_pos_y_in_4x4 = ps_tu->b4_pos_y;
8851 
8852         /* populate the coeffs scan idx */
8853         ps_ctxt->i4_scan_idx = SCAN_DIAG_UPRIGHT;
8854 
8855         /* get the current pos x and pos y in pixels */
8856         cu_pos_x_in_pix = (cu_pos_x_in_4x4 << 2) - (cu_pos_x << 3);
8857         cu_pos_y_in_pix = (cu_pos_y_in_4x4 << 2) - (cu_pos_y << 3);
8858 
8859         /* Update pointers based on the location */
8860         pu1_cur_src = pu1_src + cu_pos_x_in_pix;
8861         pu1_cur_src += (cu_pos_y_in_pix * src_strd);
8862         pu1_cur_pred = pu1_pred + cu_pos_x_in_pix;
8863         pu1_cur_pred += (cu_pos_y_in_pix * pred_strd);
8864 
8865         pu1_cur_luma_recon = pu1_luma_recon + cu_pos_x_in_pix;
8866         pu1_cur_luma_recon += (cu_pos_y_in_pix * recon_luma_strd);
8867 
8868         pi2_cur_deq_data = pi2_deq_data + cu_pos_x_in_pix;
8869         pi2_cur_deq_data += cu_pos_y_in_pix * cu_size;
8870 
8871         pu1_cur_src_chrm = pu1_src_chrm + cu_pos_x_in_pix;
8872         pu1_cur_src_chrm += ((cu_pos_y_in_pix >> 1) * src_chrm_strd) +
8873                             (u1_is_422 * ((cu_pos_y_in_pix >> 1) * src_chrm_strd));
8874 
8875         pu1_cur_pred_chrm = pu1_pred_chrm + cu_pos_x_in_pix;
8876         pu1_cur_pred_chrm += ((cu_pos_y_in_pix >> 1) * pred_chrm_strd) +
8877                              (u1_is_422 * ((cu_pos_y_in_pix >> 1) * pred_chrm_strd));
8878 
8879         pu1_cur_chroma_recon = pu1_chrm_recon + cu_pos_x_in_pix;
8880         pu1_cur_chroma_recon += ((cu_pos_y_in_pix >> 1) * recon_chrma_strd) +
8881                                 (u1_is_422 * ((cu_pos_y_in_pix >> 1) * recon_chrma_strd));
8882 
8883         pi2_cur_deq_data_chrm = pi2_chrm_deq_data + cu_pos_x_in_pix;
8884         pi2_cur_deq_data_chrm +=
8885             ((cu_pos_y_in_pix >> 1) * cu_size) + (u1_is_422 * ((cu_pos_y_in_pix >> 1) * cu_size));
8886 
        /* if transform size is 4x4 then only the first luma 4x4 will have chroma */
8888         chrm_present_flag = 1; /* by default chroma present is set to 1*/
8889 
8890         if(4 == trans_size)
8891         {
8892             /* if tusize is 4x4 then only first luma 4x4 will have chroma*/
8893             if(0 != chrm_ctr)
8894             {
8895                 chrm_present_flag = INTRA_PRED_CHROMA_IDX_NONE;
8896             }
8897 
8898             /* increment the chrm ctr unconditionally */
8899             chrm_ctr++;
8900             /* after ctr reached 4 reset it */
8901             if(4 == chrm_ctr)
8902             {
8903                 chrm_ctr = 0;
8904             }
8905         }
8906 
8907         /**------------- Compute pred data if required --------------**/
8908         if(PRED_MODE_INTRA == packed_pred_mode) /* Inter pred calc. is done outside loop */
8909         {
8910             /* Get the pred mode for scan idx calculation, even if pred is not required */
8911             luma_pred_mode = *pu1_intra_pred_mode;
8912 
8913             if((ps_ctxt->i4_rc_pass == 1) ||
8914                (1 == ps_ctxt->s_cu_final_recon_flags.u1_eval_luma_pred_data))
8915             {
8916                 WORD32 nbr_flags;
8917                 WORD32 luma_pred_func_idx;
8918                 UWORD8 *pu1_left;
8919                 UWORD8 *pu1_top;
8920                 UWORD8 *pu1_top_left;
8921                 WORD32 left_strd;
8922 
8923                 /* left cu boundary */
8924                 if(0 == cu_pos_x_in_pix)
8925                 {
8926                     left_strd = ps_cu_nbr_prms->cu_left_stride;
8927                     pu1_left = ps_cu_nbr_prms->pu1_cu_left + cu_pos_y_in_pix * left_strd;
8928                 }
8929                 else
8930                 {
8931                     pu1_left = pu1_cur_luma_recon - 1;
8932                     left_strd = recon_luma_strd;
8933                 }
8934 
8935                 /* top cu boundary */
8936                 if(0 == cu_pos_y_in_pix)
8937                 {
8938                     pu1_top = ps_cu_nbr_prms->pu1_cu_top + cu_pos_x_in_pix;
8939                 }
8940                 else
8941                 {
8942                     pu1_top = pu1_cur_luma_recon - recon_luma_strd;
8943                 }
8944 
8945                 /* by default top left is set to cu top left */
8946                 pu1_top_left = ps_cu_nbr_prms->pu1_cu_top_left;
8947 
8948                 /* top left based on position */
8949                 if((0 != cu_pos_y_in_pix) && (0 == cu_pos_x_in_pix))
8950                 {
8951                     pu1_top_left = pu1_left - left_strd;
8952                 }
8953                 else if(0 != cu_pos_x_in_pix)
8954                 {
8955                     pu1_top_left = pu1_top - 1;
8956                 }
8957 
8958                 /* get the neighbour availability flags */
8959                 nbr_flags = ihevce_get_nbr_intra(
8960                     &s_nbr,
8961                     ps_ctxt->pu1_ctb_nbr_map,
8962                     ps_ctxt->i4_nbr_map_strd,
8963                     cu_pos_x_in_4x4,
8964                     cu_pos_y_in_4x4,
8965                     num_4x4_in_tu);
8966 
8967                 if(1 == ps_ctxt->s_cu_final_recon_flags.u1_eval_luma_pred_data)
8968                 {
8969                     /* copy the nbr flags for chroma reuse */
8970                     if(4 != trans_size)
8971                     {
8972                         *pu4_nbr_flags = nbr_flags;
8973                     }
8974                     else if(1 == chrm_present_flag)
8975                     {
8976                         /* compute the avail flags assuming luma trans is 8x8 */
8977                         /* get the neighbour availability flags */
8978                         *pu4_nbr_flags = ihevce_get_nbr_intra_mxn_tu(
8979                             ps_ctxt->pu1_ctb_nbr_map,
8980                             ps_ctxt->i4_nbr_map_strd,
8981                             cu_pos_x_in_4x4,
8982                             cu_pos_y_in_4x4,
8983                             (num_4x4_in_tu << 1),
8984                             (num_4x4_in_tu << 1));
8985                     }
8986 
8987                     /* call reference array substitution */
8988                     ihevc_intra_pred_luma_ref_substitution_fptr(
8989                         pu1_top_left,
8990                         pu1_top,
8991                         pu1_left,
8992                         left_strd,
8993                         trans_size,
8994                         nbr_flags,
8995                         (UWORD8 *)ps_ctxt->pv_ref_sub_out,
8996                         1);
8997 
8998                     /* call reference filtering */
8999                     ihevc_intra_pred_ref_filtering_fptr(
9000                         (UWORD8 *)ps_ctxt->pv_ref_sub_out,
9001                         trans_size,
9002                         (UWORD8 *)ps_ctxt->pv_ref_filt_out,
9003                         luma_pred_mode,
9004                         ps_ctxt->i1_strong_intra_smoothing_enable_flag);
9005 
9006                     /* use the look up to get the function idx */
9007                     luma_pred_func_idx = g_i4_ip_funcs[luma_pred_mode];
9008 
9009                     /* call the intra prediction function */
9010                     ps_ctxt->apf_lum_ip[luma_pred_func_idx](
9011                         (UWORD8 *)ps_ctxt->pv_ref_filt_out,
9012                         1,
9013                         pu1_cur_pred,
9014                         pred_strd,
9015                         trans_size,
9016                         luma_pred_mode);
9017                 }
9018             }
9019             else if(
9020                 (1 == chrm_present_flag) &&
9021                 (1 == ps_ctxt->s_cu_final_recon_flags.u1_eval_chroma_pred_data))
9022             {
9023                 WORD32 temp_num_4x4_in_tu = num_4x4_in_tu;
9024 
9025                 if(4 == trans_size) /* compute the avail flags assuming luma trans is 8x8 */
9026                 {
9027                     temp_num_4x4_in_tu = num_4x4_in_tu << 1;
9028                 }
9029 
9030                 *pu4_nbr_flags = ihevce_get_nbr_intra_mxn_tu(
9031                     ps_ctxt->pu1_ctb_nbr_map,
9032                     ps_ctxt->i4_nbr_map_strd,
9033                     cu_pos_x_in_4x4,
9034                     cu_pos_y_in_4x4,
9035                     temp_num_4x4_in_tu,
9036                     temp_num_4x4_in_tu);
9037             }
9038 
9039             /* Get the pred mode for scan idx calculation, even if pred is not required */
9040             chroma_pred_mode = ps_best_cu_prms->u1_chroma_intra_pred_actual_mode;
9041         }
9042 
9043         if(1 == ps_tu_enc_loop_temp_prms->b1_eval_luma_iq_and_coeff_data)
9044         {
9045             WORD32 temp_bits;
9046             LWORD64 temp_cost;
9047             UWORD32 u4_tu_sad;
9048             WORD32 perform_sbh, perform_rdoq;
9049 
9050             if(PRED_MODE_INTRA == packed_pred_mode)
9051             {
                /* for luma 4x4 and 8x8 transforms the scan is chosen based on intra pred mode */
9053                 if(trans_size < 16)
9054                 {
9055                     /* for modes from 22 upto 30 horizontal scan is used */
9056                     if((luma_pred_mode > 21) && (luma_pred_mode < 31))
9057                     {
9058                         ps_ctxt->i4_scan_idx = SCAN_HORZ;
9059                     }
                    /* for modes from 6 upto 14 vertical scan is used */
9061                     else if((luma_pred_mode > 5) && (luma_pred_mode < 15))
9062                     {
9063                         ps_ctxt->i4_scan_idx = SCAN_VERT;
9064                     }
9065                 }
9066             }
9067 
9068             /* RDOPT copy States :  TU init (best until prev TU) to current */
9069             COPY_CABAC_STATES_FRM_CAB_COEFFX_PREFIX(
9070                 &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[rd_opt_best_idx]
9071                         .s_cabac_ctxt.au1_ctxt_models[0] +
9072                     IHEVC_CAB_COEFFX_PREFIX,
9073                 &ps_ctxt->au1_rdopt_init_ctxt_models[0] + IHEVC_CAB_COEFFX_PREFIX,
9074                 IHEVC_CAB_CTXT_END - IHEVC_CAB_COEFFX_PREFIX);
9075 
9076             if(ps_prms->u1_recompute_sbh_and_rdoq)
9077             {
9078                 perform_sbh = (ps_ctxt->i4_sbh_level != NO_SBH);
9079                 perform_rdoq = (ps_ctxt->i4_rdoq_level != NO_RDOQ);
9080             }
9081             else
9082             {
9083                 /* RDOQ will change the coefficients. If coefficients are changed, we will have to do sbh again*/
9084                 perform_sbh = ps_ctxt->s_rdoq_sbh_ctxt.i4_perform_best_cand_sbh;
9085                 /* To do SBH we need the quant and iquant data. This would mean we need to do quantization again, which would mean
9086                 we would have to do RDOQ again.*/
9087                 perform_rdoq = ps_ctxt->s_rdoq_sbh_ctxt.i4_perform_best_cand_rdoq;
9088             }
9089 
9090 #if DISABLE_RDOQ_INTRA
9091             if(PRED_MODE_INTRA == packed_pred_mode)
9092             {
9093                 perform_rdoq = 0;
9094             }
9095 #endif
            /* If BEST candidate RDOQ is enabled, either no coef level rdoq or CU level rdoq has to be enabled
            so that all candidates and the best candidate are quantized with the same rounding factor */
9098             if(1 == perform_rdoq)
9099             {
9100                 ASSERT(ps_ctxt->i4_quant_rounding_level != TU_LEVEL_QUANT_ROUNDING);
9101             }
9102 
9103             cbf = ihevce_t_q_iq_ssd_scan_fxn(
9104                 ps_ctxt,
9105                 pu1_cur_pred,
9106                 pred_strd,
9107                 pu1_cur_src,
9108                 src_strd,
9109                 pi2_cur_deq_data,
9110                 cu_size, /*deq_data stride is cu_size*/
9111                 pu1_cur_luma_recon,
9112                 recon_luma_strd,
9113                 pu1_final_ecd_data,
9114                 pu1_csbf_buf,
9115                 csbf_strd,
9116                 trans_size,
9117                 packed_pred_mode,
9118                 &temp_cost,
9119                 &num_bytes,
9120                 &temp_bits,
9121                 &u4_tu_sad,
9122                 &zero_cols,
9123                 &zero_rows,
9124                 &au1_is_recon_available[0],
9125                 perform_rdoq,  //(BEST_CAND_RDOQ == ps_ctxt->i4_rdoq_level),
9126                 perform_sbh,
9127 #if USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
9128                 !ps_ctxt->u1_is_refPic ? ALPHA_FOR_NOISE_TERM_IN_RDOPT
9129                                        : ((100 - ALPHA_DISCOUNT_IN_REF_PICS_IN_RDOPT) *
9130                                           (double)ALPHA_FOR_NOISE_TERM_IN_RDOPT) /
9131                                              100.0,
9132                 ps_prms->u1_is_cu_noisy,
9133 #endif
9134                 u1_compute_spatial_ssd_luma ? SPATIAL_DOMAIN_SSD : FREQUENCY_DOMAIN_SSD,
9135                 1 /*early cbf*/
9136             );  //(BEST_CAND_SBH == ps_ctxt->i4_sbh_level));
9137 
9138             /* Accumulate luma residual bits */
9139             ps_best_cu_prms->u4_cu_luma_res_bits += temp_bits;
9140 
9141             /* RDOPT copy States :  New updated after curr TU to TU init */
9142             if(0 != cbf)
9143             {
9144                 /* update to new state only if CBF is non zero */
9145                 COPY_CABAC_STATES_FRM_CAB_COEFFX_PREFIX(
9146                     &ps_ctxt->au1_rdopt_init_ctxt_models[0] + IHEVC_CAB_COEFFX_PREFIX,
9147                     &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[rd_opt_best_idx]
9148                             .s_cabac_ctxt.au1_ctxt_models[0] +
9149                         IHEVC_CAB_COEFFX_PREFIX,
9150                     IHEVC_CAB_CTXT_END - IHEVC_CAB_COEFFX_PREFIX);
9151             }
9152 
9153             /* accumulate the TU sad into cu sad */
9154             ps_best_cu_prms->u4_cu_sad += u4_tu_sad;
9155             ps_tu->b1_y_cbf = cbf;
9156             ps_tu_enc_loop_temp_prms->i2_luma_bytes_consumed = num_bytes;
9157 
9158             /* If somebody updates cbf (RDOQ or SBH), update in nbr str. for BS */
9159             if((ps_prms->u1_will_cabac_state_change) && (!ps_prms->u1_is_first_pass))
9160             {
9161                 WORD32 num_4x4_in_cu = u1_cu_size >> 2;
9162                 nbr_4x4_t *ps_cur_nbr_4x4 = &ps_ctxt->as_cu_nbr[rd_opt_best_idx][0];
9163                 ps_cur_nbr_4x4 = (ps_cur_nbr_4x4 + (cu_pos_x_in_pix >> 2));
9164                 ps_cur_nbr_4x4 += ((cu_pos_y_in_pix >> 2) * num_4x4_in_cu);
                /* replicate the nbr 4x4 structure for all 4x4 blocks in the current TU */
9166                 ps_cur_nbr_4x4->b1_y_cbf = cbf;
9167                 /*copy the cu qp. This will be overwritten by qp calculated based on skip flag at final stage of cu mode decide*/
9168                 ps_cur_nbr_4x4->b8_qp = ps_ctxt->i4_cu_qp;
9169                 /* Qp and cbf are stored for the all 4x4 in TU */
9170                 {
9171                     WORD32 i, j;
9172                     nbr_4x4_t *ps_tmp_4x4;
9173                     ps_tmp_4x4 = ps_cur_nbr_4x4;
9174 
9175                     for(i = 0; i < num_4x4_in_tu; i++)
9176                     {
9177                         for(j = 0; j < num_4x4_in_tu; j++)
9178                         {
9179                             ps_tmp_4x4[j].b8_qp = ps_ctxt->i4_cu_qp;
9180                             ps_tmp_4x4[j].b1_y_cbf = cbf;
9181                         }
9182                         /* row level update*/
9183                         ps_tmp_4x4 += num_4x4_in_cu;
9184                     }
9185                 }
9186             }
9187         }
9188         else
9189         {
9190             zero_cols = ps_tu_enc_loop_temp_prms->u4_luma_zero_col;
9191             zero_rows = ps_tu_enc_loop_temp_prms->u4_luma_zero_row;
9192 
9193             if(ps_prms->u1_will_cabac_state_change)
9194             {
9195                 num_bytes = ps_tu_enc_loop_temp_prms->i2_luma_bytes_consumed;
9196             }
9197             else
9198             {
9199                 num_bytes = 0;
9200             }
9201 
9202             /* copy luma ecd data to final buffer */
9203             memcpy(pu1_final_ecd_data, pu1_old_ecd_data, num_bytes);
9204 
9205             pu1_old_ecd_data += num_bytes;
9206 
9207             au1_is_recon_available[0] = 0;
9208         }
9209 
9210         /**-------- Compute Recon data (Do IT & Recon) : Luma  -----------**/
9211         if(ps_ctxt->s_cu_final_recon_flags.u1_eval_recon_data &&
9212            (!u1_compute_spatial_ssd_luma ||
9213             (!au1_is_recon_available[0] && u1_compute_spatial_ssd_luma)))
9214         {
9215             if(!ps_recon_datastore->u1_is_lumaRecon_available ||
9216                (ps_recon_datastore->u1_is_lumaRecon_available &&
9217                 (UCHAR_MAX == ps_recon_datastore->au1_bufId_with_winning_LumaRecon[ctr])))
9218             {
9219                 ihevce_it_recon_fxn(
9220                     ps_ctxt,
9221                     pi2_cur_deq_data,
9222                     cu_size,
9223                     pu1_cur_pred,
9224                     pred_strd,
9225                     pu1_cur_luma_recon,
9226                     recon_luma_strd,
9227                     pu1_final_ecd_data,
9228                     trans_size,
9229                     packed_pred_mode,
9230                     ps_tu->b1_y_cbf,
9231                     zero_cols,
9232                     zero_rows);
9233             }
9234             else if(
9235                 ps_recon_datastore->u1_is_lumaRecon_available &&
9236                 (UCHAR_MAX != ps_recon_datastore->au1_bufId_with_winning_LumaRecon[ctr]))
9237             {
9238                 UWORD8 *pu1_recon_src =
9239                     ((UWORD8 *)ps_recon_datastore->apv_luma_recon_bufs
9240                          [ps_recon_datastore->au1_bufId_with_winning_LumaRecon[ctr]]) +
9241                     cu_pos_x_in_pix + cu_pos_y_in_pix * ps_recon_datastore->i4_lumaRecon_stride;
9242 
9243                 ps_ctxt->s_cmn_opt_func.pf_copy_2d(
9244                     pu1_cur_luma_recon,
9245                     recon_luma_strd,
9246                     pu1_recon_src,
9247                     ps_recon_datastore->i4_lumaRecon_stride,
9248                     trans_size,
9249                     trans_size);
9250             }
9251         }
9252 
9253         if(ps_prms->u1_will_cabac_state_change)
9254         {
9255             ps_tu_enc_loop->i4_luma_coeff_offset = total_bytes;
9256         }
9257 
9258         pu1_final_ecd_data += num_bytes;
9259         /* update total bytes consumed */
9260         total_bytes += num_bytes;
9261 
9262         u1_is_cu_coded |= ps_tu->b1_y_cbf;
9263 
9264         /***************** Compute T,Q,IQ,IT & Recon for Chroma ********************/
9265         if(1 == chrm_present_flag)
9266         {
9267             pu1_cur_src_chrm = pu1_src_chrm + cu_pos_x_in_pix;
9268             pu1_cur_src_chrm += ((cu_pos_y_in_pix >> 1) * src_chrm_strd) +
9269                                 (u1_is_422 * ((cu_pos_y_in_pix >> 1) * src_chrm_strd));
9270 
9271             pu1_cur_pred_chrm = pu1_pred_chrm + cu_pos_x_in_pix;
9272             pu1_cur_pred_chrm += ((cu_pos_y_in_pix >> 1) * pred_chrm_strd) +
9273                                  (u1_is_422 * ((cu_pos_y_in_pix >> 1) * pred_chrm_strd));
9274 
9275             pu1_cur_chroma_recon = pu1_chrm_recon + cu_pos_x_in_pix;
9276             pu1_cur_chroma_recon += ((cu_pos_y_in_pix >> 1) * recon_chrma_strd) +
9277                                     (u1_is_422 * ((cu_pos_y_in_pix >> 1) * recon_chrma_strd));
9278 
9279             pi2_cur_deq_data_chrm = pi2_chrm_deq_data + cu_pos_x_in_pix;
9280             pi2_cur_deq_data_chrm += ((cu_pos_y_in_pix >> 1) * cu_size) +
9281                                      (u1_is_422 * ((cu_pos_y_in_pix >> 1) * cu_size));
9282 
9283             if(INCLUDE_CHROMA_DURING_TU_RECURSION &&
9284                (ps_ctxt->i4_quality_preset == IHEVCE_QUALITY_P0) &&
9285                (PRED_MODE_INTRA != packed_pred_mode))
9286             {
9287                 WORD32 i4_num_bytes;
9288                 UWORD8 *pu1_chroma_pred;
9289                 UWORD8 *pu1_chroma_recon;
9290                 WORD16 *pi2_chroma_deq;
9291                 UWORD32 u4_zero_col;
9292                 UWORD32 u4_zero_row;
9293 
9294                 for(i4_subtu_idx = 0; i4_subtu_idx < u1_num_subtus; i4_subtu_idx++)
9295                 {
9296                     WORD32 chroma_trans_size = MAX(4, trans_size >> 1);
9297                     WORD32 i4_subtu_pos_x = cu_pos_x_in_pix;
9298                     WORD32 i4_subtu_pos_y = cu_pos_y_in_pix + (i4_subtu_idx * chroma_trans_size);
9299 
9300                     if(0 == u1_is_422)
9301                     {
9302                         i4_subtu_pos_y >>= 1;
9303                     }
9304 
9305                     pu1_chroma_pred =
9306                         pu1_cur_pred_chrm + (i4_subtu_idx * chroma_trans_size * pred_chrm_strd);
9307                     pu1_chroma_recon = pu1_cur_chroma_recon +
9308                                        (i4_subtu_idx * chroma_trans_size * recon_chrma_strd);
9309                     pi2_chroma_deq =
9310                         pi2_cur_deq_data_chrm + (i4_subtu_idx * chroma_trans_size * cu_size);
9311 
9312                     u4_zero_col = ps_tu_enc_loop_temp_prms->au4_cb_zero_col[i4_subtu_idx];
9313                     u4_zero_row = ps_tu_enc_loop_temp_prms->au4_cb_zero_row[i4_subtu_idx];
9314 
9315                     if(ps_prms->u1_will_cabac_state_change)
9316                     {
9317                         i4_num_bytes =
9318                             ps_tu_enc_loop_temp_prms->ai2_cb_bytes_consumed[i4_subtu_idx];
9319                     }
9320                     else
9321                     {
9322                         i4_num_bytes = 0;
9323                     }
9324 
9325                     memcpy(pu1_final_ecd_data, pu1_old_ecd_data, i4_num_bytes);
9326 
9327                     pu1_old_ecd_data += i4_num_bytes;
9328 
9329                     au1_is_recon_available[U_PLANE] = 0;
9330 
9331                     if(ps_ctxt->s_cu_final_recon_flags.u1_eval_recon_data &&
9332                        (!u1_compute_spatial_ssd_chroma ||
9333                         (!au1_is_recon_available[U_PLANE] && u1_compute_spatial_ssd_chroma)))
9334                     {
9335                         if(!ps_recon_datastore->au1_is_chromaRecon_available[0] ||
9336                            (ps_recon_datastore->au1_is_chromaRecon_available[0] &&
9337                             (UCHAR_MAX ==
9338                              ps_recon_datastore
9339                                  ->au1_bufId_with_winning_ChromaRecon[U_PLANE][ctr][i4_subtu_idx])))
9340                         {
9341                             ihevce_chroma_it_recon_fxn(
9342                                 ps_ctxt,
9343                                 pi2_chroma_deq,
9344                                 cu_size,
9345                                 pu1_chroma_pred,
9346                                 pred_chrm_strd,
9347                                 pu1_chroma_recon,
9348                                 recon_chrma_strd,
9349                                 pu1_final_ecd_data,
9350                                 chroma_trans_size,
9351                                 (i4_subtu_idx == 0) ? ps_tu->b1_cb_cbf : ps_tu->b1_cb_cbf_subtu1,
9352                                 u4_zero_col,
9353                                 u4_zero_row,
9354                                 U_PLANE);
9355                         }
9356                         else if(
9357                             ps_recon_datastore->au1_is_chromaRecon_available[0] &&
9358                             (UCHAR_MAX !=
9359                              ps_recon_datastore
9360                                  ->au1_bufId_with_winning_ChromaRecon[U_PLANE][ctr][i4_subtu_idx]))
9361                         {
9362                             UWORD8 *pu1_recon_src =
9363                                 ((UWORD8 *)ps_recon_datastore->apv_chroma_recon_bufs
9364                                      [ps_recon_datastore->au1_bufId_with_winning_ChromaRecon
9365                                           [U_PLANE][ctr][i4_subtu_idx]]) +
9366                                 i4_subtu_pos_x +
9367                                 i4_subtu_pos_y * ps_recon_datastore->i4_chromaRecon_stride;
9368 
9369                             ps_ctxt->s_cmn_opt_func.pf_chroma_interleave_2d_copy(
9370                                 pu1_recon_src,
9371                                 ps_recon_datastore->i4_lumaRecon_stride,
9372                                 pu1_chroma_recon,
9373                                 recon_chrma_strd,
9374                                 chroma_trans_size,
9375                                 chroma_trans_size,
9376                                 U_PLANE);
9377                         }
9378                     }
9379 
9380                     u1_is_cu_coded |=
9381                         ((1 == i4_subtu_idx) ? ps_tu->b1_cb_cbf_subtu1 : ps_tu->b1_cb_cbf);
9382 
9383                     pu1_final_ecd_data += i4_num_bytes;
9384                     total_bytes += i4_num_bytes;
9385                 }
9386 
9387                 for(i4_subtu_idx = 0; i4_subtu_idx < u1_num_subtus; i4_subtu_idx++)
9388                 {
9389                     WORD32 chroma_trans_size = MAX(4, trans_size >> 1);
9390                     WORD32 i4_subtu_pos_x = cu_pos_x_in_pix;
9391                     WORD32 i4_subtu_pos_y = cu_pos_y_in_pix + (i4_subtu_idx * chroma_trans_size);
9392 
9393                     if(0 == u1_is_422)
9394                     {
9395                         i4_subtu_pos_y >>= 1;
9396                     }
9397 
9398                     pu1_chroma_pred =
9399                         pu1_cur_pred_chrm + (i4_subtu_idx * chroma_trans_size * pred_chrm_strd);
9400                     pu1_chroma_recon = pu1_cur_chroma_recon +
9401                                        (i4_subtu_idx * chroma_trans_size * recon_chrma_strd);
9402                     pi2_chroma_deq = pi2_cur_deq_data_chrm +
9403                                      (i4_subtu_idx * chroma_trans_size * cu_size) +
9404                                      chroma_trans_size;
9405 
9406                     u4_zero_col = ps_tu_enc_loop_temp_prms->au4_cr_zero_col[i4_subtu_idx];
9407                     u4_zero_row = ps_tu_enc_loop_temp_prms->au4_cr_zero_row[i4_subtu_idx];
9408 
9409                     if(ps_prms->u1_will_cabac_state_change)
9410                     {
9411                         i4_num_bytes =
9412                             ps_tu_enc_loop_temp_prms->ai2_cr_bytes_consumed[i4_subtu_idx];
9413                     }
9414                     else
9415                     {
9416                         i4_num_bytes = 0;
9417                     }
9418 
9419                     memcpy(pu1_final_ecd_data, pu1_old_ecd_data, i4_num_bytes);
9420 
9421                     pu1_old_ecd_data += i4_num_bytes;
9422 
9423                     au1_is_recon_available[V_PLANE] = 0;
9424 
9425                     if(ps_ctxt->s_cu_final_recon_flags.u1_eval_recon_data &&
9426                        (!u1_compute_spatial_ssd_chroma ||
9427                         (!au1_is_recon_available[V_PLANE] && u1_compute_spatial_ssd_chroma)))
9428                     {
9429                         if(!ps_recon_datastore->au1_is_chromaRecon_available[0] ||
9430                            (ps_recon_datastore->au1_is_chromaRecon_available[0] &&
9431                             (UCHAR_MAX ==
9432                              ps_recon_datastore
9433                                  ->au1_bufId_with_winning_ChromaRecon[V_PLANE][ctr][i4_subtu_idx])))
9434                         {
9435                             ihevce_chroma_it_recon_fxn(
9436                                 ps_ctxt,
9437                                 pi2_chroma_deq,
9438                                 cu_size,
9439                                 pu1_chroma_pred,
9440                                 pred_chrm_strd,
9441                                 pu1_chroma_recon,
9442                                 recon_chrma_strd,
9443                                 pu1_final_ecd_data,
9444                                 chroma_trans_size,
9445                                 (i4_subtu_idx == 0) ? ps_tu->b1_cr_cbf : ps_tu->b1_cr_cbf_subtu1,
9446                                 u4_zero_col,
9447                                 u4_zero_row,
9448                                 V_PLANE);
9449                         }
9450                         else if(
9451                             ps_recon_datastore->au1_is_chromaRecon_available[0] &&
9452                             (UCHAR_MAX !=
9453                              ps_recon_datastore
9454                                  ->au1_bufId_with_winning_ChromaRecon[V_PLANE][ctr][i4_subtu_idx]))
9455                         {
9456                             UWORD8 *pu1_recon_src =
9457                                 ((UWORD8 *)ps_recon_datastore->apv_chroma_recon_bufs
9458                                      [ps_recon_datastore->au1_bufId_with_winning_ChromaRecon
9459                                           [V_PLANE][ctr][i4_subtu_idx]]) +
9460                                 i4_subtu_pos_x +
9461                                 i4_subtu_pos_y * ps_recon_datastore->i4_chromaRecon_stride;
9462 
9463                             ps_ctxt->s_cmn_opt_func.pf_chroma_interleave_2d_copy(
9464                                 pu1_recon_src,
9465                                 ps_recon_datastore->i4_lumaRecon_stride,
9466                                 pu1_chroma_recon,
9467                                 recon_chrma_strd,
9468                                 chroma_trans_size,
9469                                 chroma_trans_size,
9470                                 V_PLANE);
9471                         }
9472                     }
9473 
9474                     u1_is_cu_coded |=
9475                         ((1 == i4_subtu_idx) ? ps_tu->b1_cr_cbf_subtu1 : ps_tu->b1_cr_cbf);
9476 
9477                     pu1_final_ecd_data += i4_num_bytes;
9478                     total_bytes += i4_num_bytes;
9479                 }
9480             }
9481             else
9482             {
9483                 WORD32 cb_zero_col, cb_zero_row, cr_zero_col, cr_zero_row;
9484 
9485                 for(i4_subtu_idx = 0; i4_subtu_idx < u1_num_subtus; i4_subtu_idx++)
9486                 {
9487                     WORD32 cb_cbf, cr_cbf;
9488                     WORD32 cb_num_bytes, cr_num_bytes;
9489 
9490                     WORD32 chroma_trans_size = MAX(4, trans_size >> 1);
9491 
9492                     WORD32 i4_subtu_pos_x = cu_pos_x_in_pix;
9493                     WORD32 i4_subtu_pos_y = cu_pos_y_in_pix + (i4_subtu_idx * chroma_trans_size);
9494 
9495                     if(0 == u1_is_422)
9496                     {
9497                         i4_subtu_pos_y >>= 1;
9498                     }
9499 
9500                     pu1_cur_src_chrm += (i4_subtu_idx * chroma_trans_size * src_chrm_strd);
9501                     pu1_cur_pred_chrm += (i4_subtu_idx * chroma_trans_size * pred_chrm_strd);
9502                     pu1_cur_chroma_recon += (i4_subtu_idx * chroma_trans_size * recon_chrma_strd);
9503                     pi2_cur_deq_data_chrm += (i4_subtu_idx * chroma_trans_size * cu_size);
9504 
9505                     if((PRED_MODE_INTRA == packed_pred_mode) &&
9506                        (1 == ps_ctxt->s_cu_final_recon_flags.u1_eval_chroma_pred_data))
9507                     {
9508                         WORD32 nbr_flags, left_strd_chrm, chrm_pred_func_idx;
9509                         UWORD8 *pu1_left_chrm;
9510                         UWORD8 *pu1_top_chrm;
9511                         UWORD8 *pu1_top_left_chrm;
9512 
9513                         nbr_flags = ihevce_get_intra_chroma_tu_nbr(
9514                             *pu4_nbr_flags, i4_subtu_idx, chroma_trans_size, u1_is_422);
9515 
9516                         /* left cu boundary */
9517                         if(0 == i4_subtu_pos_x)
9518                         {
9519                             left_strd_chrm = ps_chrm_cu_buf_prms->i4_cu_left_stride;
9520                             pu1_left_chrm =
9521                                 ps_chrm_cu_buf_prms->pu1_cu_left + i4_subtu_pos_y * left_strd_chrm;
9522                         }
9523                         else
9524                         {
9525                             pu1_left_chrm = pu1_cur_chroma_recon - 2;
9526                             left_strd_chrm = recon_chrma_strd;
9527                         }
9528 
9529                         /* top cu boundary */
9530                         if(0 == i4_subtu_pos_y)
9531                         {
9532                             pu1_top_chrm = ps_chrm_cu_buf_prms->pu1_cu_top + i4_subtu_pos_x;
9533                         }
9534                         else
9535                         {
9536                             pu1_top_chrm = pu1_cur_chroma_recon - recon_chrma_strd;
9537                         }
9538 
9539                         /* by default top left is set to cu top left */
9540                         pu1_top_left_chrm = ps_chrm_cu_buf_prms->pu1_cu_top_left;
9541 
9542                         /* top left based on position */
9543                         if((0 != i4_subtu_pos_y) && (0 == i4_subtu_pos_x))
9544                         {
9545                             pu1_top_left_chrm = pu1_left_chrm - left_strd_chrm;
9546                         }
9547                         else if(0 != i4_subtu_pos_x)
9548                         {
9549                             pu1_top_left_chrm = pu1_top_chrm - 2;
9550                         }
9551 
9552                         /* call the chroma reference array substitution */
9553                         ihevc_intra_pred_chroma_ref_substitution_fptr(
9554                             pu1_top_left_chrm,
9555                             pu1_top_chrm,
9556                             pu1_left_chrm,
9557                             left_strd_chrm,
9558                             chroma_trans_size,
9559                             nbr_flags,
9560                             (UWORD8 *)ps_ctxt->pv_ref_sub_out,
9561                             1);
9562 
9563                         /* use the look up to get the function idx */
9564                         chrm_pred_func_idx = g_i4_ip_funcs[chroma_pred_mode];
9565 
9566                         /* call the intra prediction function */
9567                         ps_ctxt->apf_chrm_ip[chrm_pred_func_idx](
9568                             (UWORD8 *)ps_ctxt->pv_ref_sub_out,
9569                             1,
9570                             pu1_cur_pred_chrm,
9571                             pred_chrm_strd,
9572                             chroma_trans_size,
9573                             chroma_pred_mode);
9574                     }
9575 
9576                     /**---------- Compute iq&coeff data if required : Chroma ------------**/
9577                     if(1 == ps_tu_enc_loop_temp_prms->b1_eval_chroma_iq_and_coeff_data)
9578                     {
9579                         WORD32 perform_sbh, perform_rdoq, temp_bits;
9580 
9581                         if(ps_prms->u1_recompute_sbh_and_rdoq)
9582                         {
9583                             perform_sbh = (ps_ctxt->i4_sbh_level != NO_SBH);
9584                             perform_rdoq = (ps_ctxt->i4_rdoq_level != NO_RDOQ);
9585                         }
9586                         else
9587                         {
9588                             /* RDOQ will change the coefficients. If coefficients are changed, we will have to do sbh again*/
9589                             perform_sbh = ps_ctxt->s_rdoq_sbh_ctxt.i4_perform_best_cand_sbh;
9590                             /* To do SBH we need the quant and iquant data. This would mean we need to do quantization again, which would mean
9591                         we would have to do RDOQ again.*/
9592                             perform_rdoq = ps_ctxt->s_rdoq_sbh_ctxt.i4_perform_best_cand_rdoq;
9593                         }
9594 
9595                         /* populate the coeffs scan idx */
9596                         ps_ctxt->i4_scan_idx = SCAN_DIAG_UPRIGHT;
9597 
9598                         if(PRED_MODE_INTRA == packed_pred_mode)
9599                         {
                            /* for 4x4 transforms the scan is chosen based on the intra pred mode */
9601                             if(4 == chroma_trans_size)
9602                             {
9603                                 /* for modes from 22 upto 30 horizontal scan is used */
9604                                 if((chroma_pred_mode > 21) && (chroma_pred_mode < 31))
9605                                 {
9606                                     ps_ctxt->i4_scan_idx = SCAN_HORZ;
9607                                 }
                                /* for modes from 6 upto 14 vertical scan is used */
9609                                 else if((chroma_pred_mode > 5) && (chroma_pred_mode < 15))
9610                                 {
9611                                     ps_ctxt->i4_scan_idx = SCAN_VERT;
9612                                 }
9613                             }
9614                         }
9615 
9616 #if DISABLE_RDOQ_INTRA
9617                         if(PRED_MODE_INTRA == packed_pred_mode)
9618                         {
9619                             perform_rdoq = 0;
9620                         }
9621 #endif
9622 
9623                         /* RDOPT copy States :  TU init (best until prev TU) to current */
9624                         COPY_CABAC_STATES_FRM_CAB_COEFFX_PREFIX(
9625                             &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[rd_opt_best_idx]
9626                                     .s_cabac_ctxt.au1_ctxt_models[0] +
9627                                 IHEVC_CAB_COEFFX_PREFIX,
9628                             &ps_ctxt->au1_rdopt_init_ctxt_models[0] + IHEVC_CAB_COEFFX_PREFIX,
9629                             IHEVC_CAB_CTXT_END - IHEVC_CAB_COEFFX_PREFIX);
9630 
9631                         ASSERT(rd_opt_best_idx == ps_ctxt->s_rdopt_entropy_ctxt.i4_curr_buf_idx);
                        /* If BEST candidate RDOQ is enabled, either no coef level rdoq or CU level rdoq has to be enabled
                    so that all candidates and best candidate are quantized with same rounding factor */
9634                         if(1 == perform_rdoq)
9635                         {
9636                             ASSERT(ps_ctxt->i4_quant_rounding_level != TU_LEVEL_QUANT_ROUNDING);
9637                         }
9638 
9639                         if(!ps_best_cu_prms->u1_skip_flag ||
9640                            !ps_ctxt->s_chroma_rdopt_ctxt.u1_eval_chrm_rdopt)
9641                         {
9642                             /* Cb */
9643                             cb_cbf = ihevce_chroma_t_q_iq_ssd_scan_fxn(
9644                                 ps_ctxt,
9645                                 pu1_cur_pred_chrm,
9646                                 pred_chrm_strd,
9647                                 pu1_cur_src_chrm,
9648                                 src_chrm_strd,
9649                                 pi2_cur_deq_data_chrm,
9650                                 cu_size,
9651                                 pu1_chrm_recon,
9652                                 recon_chrma_strd,
9653                                 pu1_final_ecd_data,
9654                                 pu1_csbf_buf,
9655                                 csbf_strd,
9656                                 chroma_trans_size,
9657                                 ps_ctxt->i4_scan_idx,
9658                                 (PRED_MODE_INTRA == packed_pred_mode),
9659                                 &cb_num_bytes,
9660                                 &temp_bits,
9661                                 &cb_zero_col,
9662                                 &cb_zero_row,
9663                                 &au1_is_recon_available[U_PLANE],
9664                                 perform_sbh,
9665                                 perform_rdoq,
9666                                 &i8_ssd,
9667 #if USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
9668                                 !ps_ctxt->u1_is_refPic
9669                                     ? ALPHA_FOR_NOISE_TERM_IN_RDOPT
9670                                     : ((100 - ALPHA_DISCOUNT_IN_REF_PICS_IN_RDOPT) *
9671                                        (double)ALPHA_FOR_NOISE_TERM_IN_RDOPT) /
9672                                           100.0,
9673                                 ps_prms->u1_is_cu_noisy,
9674 #endif
9675                                 ps_best_cu_prms->u1_skip_flag &&
9676                                     ps_ctxt->s_chroma_rdopt_ctxt.u1_eval_chrm_rdopt,
9677                                 u1_compute_spatial_ssd_chroma ? SPATIAL_DOMAIN_SSD
9678                                                               : FREQUENCY_DOMAIN_SSD,
9679                                 U_PLANE);
9680                         }
9681                         else
9682                         {
9683                             cb_cbf = 0;
9684                             temp_bits = 0;
9685                             cb_num_bytes = 0;
9686                             au1_is_recon_available[U_PLANE] = 0;
9687                             cb_zero_col = 0;
9688                             cb_zero_row = 0;
9689                         }
9690 
9691                         /* Accumulate chroma residual bits */
9692                         ps_best_cu_prms->u4_cu_chroma_res_bits += temp_bits;
9693 
9694                         /* RDOPT copy States :  New updated after curr TU to TU init */
9695                         if(0 != cb_cbf)
9696                         {
9697                             COPY_CABAC_STATES_FRM_CAB_COEFFX_PREFIX(
9698                                 &ps_ctxt->au1_rdopt_init_ctxt_models[0] + IHEVC_CAB_COEFFX_PREFIX,
9699                                 &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[rd_opt_best_idx]
9700                                         .s_cabac_ctxt.au1_ctxt_models[0] +
9701                                     IHEVC_CAB_COEFFX_PREFIX,
9702                                 IHEVC_CAB_CTXT_END - IHEVC_CAB_COEFFX_PREFIX);
9703                         }
9704                         /* RDOPT copy States :  Restoring back the Cb init state to Cr */
9705                         else
9706                         {
9707                             COPY_CABAC_STATES_FRM_CAB_COEFFX_PREFIX(
9708                                 &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[rd_opt_best_idx]
9709                                         .s_cabac_ctxt.au1_ctxt_models[0] +
9710                                     IHEVC_CAB_COEFFX_PREFIX,
9711                                 &ps_ctxt->au1_rdopt_init_ctxt_models[0] + IHEVC_CAB_COEFFX_PREFIX,
9712                                 IHEVC_CAB_CTXT_END - IHEVC_CAB_COEFFX_PREFIX);
9713                         }
9714 
9715                         if(!ps_best_cu_prms->u1_skip_flag ||
9716                            !ps_ctxt->s_chroma_rdopt_ctxt.u1_eval_chrm_rdopt)
9717                         {
9718                             /* Cr */
9719                             cr_cbf = ihevce_chroma_t_q_iq_ssd_scan_fxn(
9720                                 ps_ctxt,
9721                                 pu1_cur_pred_chrm,
9722                                 pred_chrm_strd,
9723                                 pu1_cur_src_chrm,
9724                                 src_chrm_strd,
9725                                 pi2_cur_deq_data_chrm + chroma_trans_size,
9726                                 cu_size,
9727                                 pu1_chrm_recon,
9728                                 recon_chrma_strd,
9729                                 pu1_final_ecd_data + cb_num_bytes,
9730                                 pu1_csbf_buf,
9731                                 csbf_strd,
9732                                 chroma_trans_size,
9733                                 ps_ctxt->i4_scan_idx,
9734                                 (PRED_MODE_INTRA == packed_pred_mode),
9735                                 &cr_num_bytes,
9736                                 &temp_bits,
9737                                 &cr_zero_col,
9738                                 &cr_zero_row,
9739                                 &au1_is_recon_available[V_PLANE],
9740                                 perform_sbh,
9741                                 perform_rdoq,
9742                                 &i8_ssd,
9743 #if USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
9744                                 !ps_ctxt->u1_is_refPic
9745                                     ? ALPHA_FOR_NOISE_TERM_IN_RDOPT
9746                                     : ((100 - ALPHA_DISCOUNT_IN_REF_PICS_IN_RDOPT) *
9747                                        (double)ALPHA_FOR_NOISE_TERM_IN_RDOPT) /
9748                                           100.0,
9749                                 ps_prms->u1_is_cu_noisy,
9750 #endif
9751                                 ps_best_cu_prms->u1_skip_flag &&
9752                                     ps_ctxt->s_chroma_rdopt_ctxt.u1_eval_chrm_rdopt,
9753                                 u1_compute_spatial_ssd_chroma ? SPATIAL_DOMAIN_SSD
9754                                                               : FREQUENCY_DOMAIN_SSD,
9755                                 V_PLANE);
9756                         }
9757                         else
9758                         {
9759                             cr_cbf = 0;
9760                             temp_bits = 0;
9761                             cr_num_bytes = 0;
9762                             au1_is_recon_available[V_PLANE] = 0;
9763                             cr_zero_col = 0;
9764                             cr_zero_row = 0;
9765                         }
9766 
9767                         /* Accumulate chroma residual bits */
9768                         ps_best_cu_prms->u4_cu_chroma_res_bits += temp_bits;
9769 
9770                         /* RDOPT copy States :  New updated after curr TU to TU init */
9771                         if(0 != cr_cbf)
9772                         {
9773                             COPY_CABAC_STATES_FRM_CAB_COEFFX_PREFIX(
9774                                 &ps_ctxt->au1_rdopt_init_ctxt_models[0] + IHEVC_CAB_COEFFX_PREFIX,
9775                                 &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[rd_opt_best_idx]
9776                                         .s_cabac_ctxt.au1_ctxt_models[0] +
9777                                     IHEVC_CAB_COEFFX_PREFIX,
9778                                 IHEVC_CAB_CTXT_END - IHEVC_CAB_COEFFX_PREFIX);
9779                         }
9780 
9781                         if(0 == i4_subtu_idx)
9782                         {
9783                             ps_tu->b1_cb_cbf = cb_cbf;
9784                             ps_tu->b1_cr_cbf = cr_cbf;
9785                         }
9786                         else
9787                         {
9788                             ps_tu->b1_cb_cbf_subtu1 = cb_cbf;
9789                             ps_tu->b1_cr_cbf_subtu1 = cr_cbf;
9790                         }
9791                     }
9792                     else
9793                     {
9794                         cb_zero_col = ps_tu_enc_loop_temp_prms->au4_cb_zero_col[i4_subtu_idx];
9795                         cb_zero_row = ps_tu_enc_loop_temp_prms->au4_cb_zero_row[i4_subtu_idx];
9796                         cr_zero_col = ps_tu_enc_loop_temp_prms->au4_cr_zero_col[i4_subtu_idx];
9797                         cr_zero_row = ps_tu_enc_loop_temp_prms->au4_cr_zero_row[i4_subtu_idx];
9798 
9799                         if(ps_prms->u1_will_cabac_state_change)
9800                         {
9801                             cb_num_bytes =
9802                                 ps_tu_enc_loop_temp_prms->ai2_cb_bytes_consumed[i4_subtu_idx];
9803                         }
9804                         else
9805                         {
9806                             cb_num_bytes = 0;
9807                         }
9808 
9809                         if(ps_prms->u1_will_cabac_state_change)
9810                         {
9811                             cr_num_bytes =
9812                                 ps_tu_enc_loop_temp_prms->ai2_cr_bytes_consumed[i4_subtu_idx];
9813                         }
9814                         else
9815                         {
9816                             cr_num_bytes = 0;
9817                         }
9818 
9819                         /* copy cb ecd data to final buffer */
9820                         memcpy(pu1_final_ecd_data, pu1_chrm_old_ecd_data, cb_num_bytes);
9821 
9822                         pu1_chrm_old_ecd_data += cb_num_bytes;
9823 
                        /* copy cr ecd data to final buffer (placed right after the cb data) */
9825                         memcpy(
9826                             (pu1_final_ecd_data + cb_num_bytes),
9827                             pu1_chrm_old_ecd_data,
9828                             cr_num_bytes);
9829 
9830                         pu1_chrm_old_ecd_data += cr_num_bytes;
9831 
9832                         au1_is_recon_available[U_PLANE] = 0;
9833                         au1_is_recon_available[V_PLANE] = 0;
9834                     }
9835 
9836                     /**-------- Compute Recon data (Do IT & Recon) : Chroma  -----------**/
9837                     if(ps_ctxt->s_cu_final_recon_flags.u1_eval_recon_data &&
9838                        (!u1_compute_spatial_ssd_chroma ||
9839                         (!au1_is_recon_available[U_PLANE] && u1_compute_spatial_ssd_chroma)))
9840                     {
9841                         if(!ps_recon_datastore->au1_is_chromaRecon_available[0] ||
9842                            (ps_recon_datastore->au1_is_chromaRecon_available[0] &&
9843                             (UCHAR_MAX ==
9844                              ps_recon_datastore
9845                                  ->au1_bufId_with_winning_ChromaRecon[U_PLANE][ctr][i4_subtu_idx])))
9846                         {
9847                             ihevce_chroma_it_recon_fxn(
9848                                 ps_ctxt,
9849                                 pi2_cur_deq_data_chrm,
9850                                 cu_size,
9851                                 pu1_cur_pred_chrm,
9852                                 pred_chrm_strd,
9853                                 pu1_cur_chroma_recon,
9854                                 recon_chrma_strd,
9855                                 pu1_final_ecd_data,
9856                                 chroma_trans_size,
9857                                 (i4_subtu_idx == 0) ? ps_tu->b1_cb_cbf : ps_tu->b1_cb_cbf_subtu1,
9858                                 cb_zero_col,
9859                                 cb_zero_row,
9860                                 U_PLANE);
9861                         }
9862                         else if(
9863                             ps_recon_datastore->au1_is_chromaRecon_available[0] &&
9864                             (UCHAR_MAX !=
9865                              ps_recon_datastore
9866                                  ->au1_bufId_with_winning_ChromaRecon[U_PLANE][ctr][i4_subtu_idx]))
9867                         {
9868                             UWORD8 *pu1_recon_src =
9869                                 ((UWORD8 *)ps_recon_datastore->apv_chroma_recon_bufs
9870                                      [ps_recon_datastore->au1_bufId_with_winning_ChromaRecon
9871                                           [U_PLANE][ctr][i4_subtu_idx]]) +
9872                                 i4_subtu_pos_x +
9873                                 i4_subtu_pos_y * ps_recon_datastore->i4_chromaRecon_stride;
9874 
9875                             ps_ctxt->s_cmn_opt_func.pf_chroma_interleave_2d_copy(
9876                                 pu1_recon_src,
9877                                 ps_recon_datastore->i4_lumaRecon_stride,
9878                                 pu1_cur_chroma_recon,
9879                                 recon_chrma_strd,
9880                                 chroma_trans_size,
9881                                 chroma_trans_size,
9882                                 U_PLANE);
9883                         }
9884                     }
9885 
9886                     u1_is_cu_coded |=
9887                         ((1 == i4_subtu_idx) ? ps_tu->b1_cb_cbf_subtu1 : ps_tu->b1_cb_cbf);
9888 
9889                     if(ps_prms->u1_will_cabac_state_change)
9890                     {
9891                         ps_tu_enc_loop->ai4_cb_coeff_offset[i4_subtu_idx] = total_bytes;
9892                     }
9893 
9894                     pu1_final_ecd_data += cb_num_bytes;
9895                     /* update total bytes consumed */
9896                     total_bytes += cb_num_bytes;
9897 
9898                     if(ps_ctxt->s_cu_final_recon_flags.u1_eval_recon_data &&
9899                        (!u1_compute_spatial_ssd_chroma ||
9900                         (!au1_is_recon_available[V_PLANE] && u1_compute_spatial_ssd_chroma)))
9901                     {
9902                         if(!ps_recon_datastore->au1_is_chromaRecon_available[0] ||
9903                            (ps_recon_datastore->au1_is_chromaRecon_available[0] &&
9904                             (UCHAR_MAX ==
9905                              ps_recon_datastore
9906                                  ->au1_bufId_with_winning_ChromaRecon[V_PLANE][ctr][i4_subtu_idx])))
9907                         {
9908                             ihevce_chroma_it_recon_fxn(
9909                                 ps_ctxt,
9910                                 pi2_cur_deq_data_chrm + chroma_trans_size,
9911                                 cu_size,
9912                                 pu1_cur_pred_chrm,
9913                                 pred_chrm_strd,
9914                                 pu1_cur_chroma_recon,
9915                                 recon_chrma_strd,
9916                                 pu1_final_ecd_data,
9917                                 chroma_trans_size,
9918                                 (i4_subtu_idx == 0) ? ps_tu->b1_cr_cbf : ps_tu->b1_cr_cbf_subtu1,
9919                                 cr_zero_col,
9920                                 cr_zero_row,
9921                                 V_PLANE);
9922                         }
9923                         else if(
9924                             ps_recon_datastore->au1_is_chromaRecon_available[0] &&
9925                             (UCHAR_MAX !=
9926                              ps_recon_datastore
9927                                  ->au1_bufId_with_winning_ChromaRecon[V_PLANE][ctr][i4_subtu_idx]))
9928                         {
9929                             UWORD8 *pu1_recon_src =
9930                                 ((UWORD8 *)ps_recon_datastore->apv_chroma_recon_bufs
9931                                      [ps_recon_datastore->au1_bufId_with_winning_ChromaRecon
9932                                           [V_PLANE][ctr][i4_subtu_idx]]) +
9933                                 i4_subtu_pos_x +
9934                                 i4_subtu_pos_y * ps_recon_datastore->i4_chromaRecon_stride;
9935 
9936                             ps_ctxt->s_cmn_opt_func.pf_chroma_interleave_2d_copy(
9937                                 pu1_recon_src,
9938                                 ps_recon_datastore->i4_lumaRecon_stride,
9939                                 pu1_cur_chroma_recon,
9940                                 recon_chrma_strd,
9941                                 chroma_trans_size,
9942                                 chroma_trans_size,
9943                                 V_PLANE);
9944                         }
9945                     }
9946 
9947                     u1_is_cu_coded |=
9948                         ((1 == i4_subtu_idx) ? ps_tu->b1_cr_cbf_subtu1 : ps_tu->b1_cr_cbf);
9949 
9950                     if(ps_prms->u1_will_cabac_state_change)
9951                     {
9952                         ps_tu_enc_loop->ai4_cr_coeff_offset[i4_subtu_idx] = total_bytes;
9953                     }
9954 
9955                     pu1_final_ecd_data += cr_num_bytes;
9956                     /* update total bytes consumed */
9957                     total_bytes += cr_num_bytes;
9958                 }
9959             }
9960         }
9961         else
9962         {
9963             ps_tu_enc_loop->ai4_cb_coeff_offset[0] = total_bytes;
9964             ps_tu_enc_loop->ai4_cr_coeff_offset[0] = total_bytes;
9965             ps_tu_enc_loop->ai4_cb_coeff_offset[1] = total_bytes;
9966             ps_tu_enc_loop->ai4_cr_coeff_offset[1] = total_bytes;
9967             ps_tu->b1_cb_cbf = 0;
9968             ps_tu->b1_cr_cbf = 0;
9969             ps_tu->b1_cb_cbf_subtu1 = 0;
9970             ps_tu->b1_cr_cbf_subtu1 = 0;
9971         }
9972 
9973         /* Update to next TU */
9974         ps_tu_enc_loop++;
9975         ps_tu_enc_loop_temp_prms++;
9976 
9977         pu4_nbr_flags++;
9978         pu1_intra_pred_mode++;
9979 
        /* Do not set the nbr map for last tu in cu */
9981         if((num_tu_in_cu - 1) != ctr)
9982         {
9983             /* set the neighbour map to 1 */
9984             ihevce_set_nbr_map(
9985                 ps_ctxt->pu1_ctb_nbr_map,
9986                 ps_ctxt->i4_nbr_map_strd,
9987                 cu_pos_x_in_4x4,
9988                 cu_pos_y_in_4x4,
9989                 (trans_size >> 2),
9990                 1);
9991         }
9992     }
9993 
9994     if(ps_prms->u1_will_cabac_state_change)
9995     {
9996         ps_best_cu_prms->u1_is_cu_coded = u1_is_cu_coded;
9997 
9998         /* Modify skip flag, if luma is skipped & Chroma is coded */
9999         if((1 == u1_is_cu_coded) && (PRED_MODE_SKIP == packed_pred_mode))
10000         {
10001             ps_best_cu_prms->u1_skip_flag = 0;
10002         }
10003     }
10004 
10005     /* during chroma evaluation if skip decision was over written     */
10006     /* then the current skip candidate is set to a non skip candidate */
10007     if(PRED_MODE_INTRA != packed_pred_mode)
10008     {
10009         ps_best_inter_cand->b1_skip_flag = ps_best_cu_prms->u1_skip_flag;
10010     }
10011 
10012     /**------------- Compute header data if required --------------**/
10013     if(1 == ps_ctxt->s_cu_final_recon_flags.u1_eval_header_data)
10014     {
10015         WORD32 cbf_bits;
10016         WORD32 cu_bits;
10017         WORD32 unit_4x4_size = cu_size >> 2;
10018 
10019         /*Restoring the running reference into the best rdopt_ctxt cabac states which will then
10020         be copied as the base reference for the next cu
10021         Assumption : We are ensuring that the u1_eval_header_data flag is set to 1 only if either
10022         luma and chroma are being reevaluated*/
10023         COPY_CABAC_STATES(
10024             &ps_ctxt->s_rdopt_entropy_ctxt.as_cu_entropy_ctxt[rd_opt_best_idx]
10025                  .s_cabac_ctxt.au1_ctxt_models[0],
10026             &ps_ctxt->au1_rdopt_init_ctxt_models[0],
10027             IHEVC_CAB_CTXT_END);
10028 
10029         /* get the neighbour availability flags for current cu  */
10030         ihevce_get_only_nbr_flag(
10031             &s_nbr,
10032             ps_ctxt->pu1_ctb_nbr_map,
10033             ps_ctxt->i4_nbr_map_strd,
10034             (cu_pos_x << 1),
10035             (cu_pos_y << 1),
10036             unit_4x4_size,
10037             unit_4x4_size);
10038 
10039         cu_bits = ihevce_entropy_rdo_encode_cu(
10040             &ps_ctxt->s_rdopt_entropy_ctxt,
10041             ps_best_cu_prms,
10042             cu_pos_x,
10043             cu_pos_y,
10044             cu_size,
10045             ps_ctxt->u1_disable_intra_eval ? !DISABLE_TOP_SYNC && s_nbr.u1_top_avail
10046                                            : s_nbr.u1_top_avail,
10047             s_nbr.u1_left_avail,
10048             (pu1_final_ecd_data - total_bytes),
10049             &cbf_bits);
10050 
10051         /* cbf bits are excluded from header bits, instead considered as texture bits */
10052         ps_best_cu_prms->u4_cu_hdr_bits = cu_bits - cbf_bits;
10053         ps_best_cu_prms->u4_cu_cbf_bits = cbf_bits;
10054     }
10055 
10056     if(ps_prms->u1_will_cabac_state_change)
10057     {
10058         ps_best_cu_prms->i4_num_bytes_ecd_data = total_bytes;
10059     }
10060 }
10061 
10062 /*!
10063 ******************************************************************************
10064 * \if Function name : ihevce_set_eval_flags \endif
10065 *
10066 * \brief
10067 *    Function which decides which eval flags have to be set based on present
10068 *    and RDOQ conditions
10069 *
10070 * \param[in] ps_ctxt : encoder ctxt pointer
10071 * \param[in] enc_loop_cu_final_prms_t : pointer to final cu params
10072 *
10073 * \return
10074 *    None
10075 *
10076 * \author
10077 *  Ittiam
10078 *
10079 *****************************************************************************
10080 */
ihevce_set_eval_flags(ihevce_enc_loop_ctxt_t * ps_ctxt,enc_loop_cu_final_prms_t * ps_enc_loop_bestprms)10081 void ihevce_set_eval_flags(
10082     ihevce_enc_loop_ctxt_t *ps_ctxt, enc_loop_cu_final_prms_t *ps_enc_loop_bestprms)
10083 {
10084     WORD32 count = 0;
10085 
10086     ps_ctxt->s_cu_final_recon_flags.u1_eval_luma_pred_data = 0;
10087 
10088     ps_ctxt->s_cu_final_recon_flags.u1_eval_chroma_pred_data =
10089         !ps_ctxt->s_chroma_rdopt_ctxt.u1_eval_chrm_rdopt;
10090 
10091     if(ps_ctxt->u1_disable_intra_eval && (!(ps_ctxt->i4_deblk_pad_hpel_cur_pic & 0x1)))
10092     {
10093         ps_ctxt->s_cu_final_recon_flags.u1_eval_recon_data = 0;
10094     }
10095     else
10096     {
10097         ps_ctxt->s_cu_final_recon_flags.u1_eval_recon_data = 1;
10098     }
10099 
10100     if((1 == ps_ctxt->s_rdoq_sbh_ctxt.i4_perform_best_cand_rdoq) ||
10101        (1 == ps_ctxt->s_rdoq_sbh_ctxt.i4_perform_best_cand_sbh))
10102     {
10103         /* When rdoq is enabled only for the best candidate, in case of in Intra nTU
10104         RDOQ might have altered the coeffs of the neighbour CU. As a result, the pred
10105         for the current CU will change. Therefore, we need to reevaluate the pred data*/
10106         if((ps_enc_loop_bestprms->u2_num_tus_in_cu > 1) &&
10107            (ps_enc_loop_bestprms->u1_intra_flag == 1))
10108         {
10109             ps_ctxt->s_cu_final_recon_flags.u1_eval_luma_pred_data = 1;
10110             ps_ctxt->s_cu_final_recon_flags.u1_eval_chroma_pred_data = 1;
10111         }
10112         if(ps_enc_loop_bestprms->u1_skip_flag == 1)
10113         {
10114             for(count = 0; count < ps_enc_loop_bestprms->u2_num_tus_in_cu; count++)
10115             {
10116                 ps_enc_loop_bestprms->as_tu_enc_loop_temp_prms[count]
10117                     .b1_eval_luma_iq_and_coeff_data = 0;
10118                 ps_enc_loop_bestprms->as_tu_enc_loop_temp_prms[count]
10119                     .b1_eval_chroma_iq_and_coeff_data = 0;
10120             }
10121         }
10122         else
10123         {
10124             for(count = 0; count < ps_enc_loop_bestprms->u2_num_tus_in_cu; count++)
10125             {
10126                 ps_enc_loop_bestprms->as_tu_enc_loop_temp_prms[count]
10127                     .b1_eval_luma_iq_and_coeff_data = 1;
10128                 ps_enc_loop_bestprms->as_tu_enc_loop_temp_prms[count]
10129                     .b1_eval_chroma_iq_and_coeff_data = 1;
10130             }
10131         }
10132     }
10133     else
10134     {
10135         switch(ps_ctxt->i4_quality_preset)
10136         {
10137         case IHEVCE_QUALITY_P0:
10138         case IHEVCE_QUALITY_P2:
10139         case IHEVCE_QUALITY_P3:
10140         {
10141             for(count = 0; count < ps_enc_loop_bestprms->u2_num_tus_in_cu; count++)
10142             {
10143                 ps_enc_loop_bestprms->as_tu_enc_loop_temp_prms[count]
10144                     .b1_eval_luma_iq_and_coeff_data = 0;
10145                 ps_enc_loop_bestprms->as_tu_enc_loop_temp_prms[count]
10146                     .b1_eval_chroma_iq_and_coeff_data =
10147                     !ps_ctxt->s_chroma_rdopt_ctxt.u1_eval_chrm_rdopt;
10148             }
10149 
10150             break;
10151         }
10152         case IHEVCE_QUALITY_P4:
10153         case IHEVCE_QUALITY_P5:
10154         {
10155             for(count = 0; count < ps_enc_loop_bestprms->u2_num_tus_in_cu; count++)
10156             {
10157                 ps_enc_loop_bestprms->as_tu_enc_loop_temp_prms[count]
10158                     .b1_eval_luma_iq_and_coeff_data = 0;
10159                 ps_enc_loop_bestprms->as_tu_enc_loop_temp_prms[count]
10160                     .b1_eval_chroma_iq_and_coeff_data =
10161                     !ps_ctxt->s_chroma_rdopt_ctxt.u1_eval_chrm_rdopt;
10162             }
10163 
10164             break;
10165         }
10166         case IHEVCE_QUALITY_P6:
10167         {
10168             for(count = 0; count < ps_enc_loop_bestprms->u2_num_tus_in_cu; count++)
10169             {
10170                 ps_enc_loop_bestprms->as_tu_enc_loop_temp_prms[count]
10171                     .b1_eval_luma_iq_and_coeff_data = 0;
10172 #if !ENABLE_CHROMA_TRACKING_OF_LUMA_CBF_IN_XS25
10173                 ps_enc_loop_bestprms->as_tu_enc_loop_temp_prms[count]
10174                     .b1_eval_chroma_iq_and_coeff_data =
10175                     !ps_ctxt->s_chroma_rdopt_ctxt.u1_eval_chrm_rdopt;
10176 #else
10177                 if((ps_ctxt->i1_slice_type == BSLICE) && (ps_ctxt->i4_temporal_layer_id > 1) &&
10178                    (ps_enc_loop_bestprms->as_tu_enc_loop[count].s_tu.b3_size >= 2))
10179                 {
10180                     ps_enc_loop_bestprms->as_tu_enc_loop_temp_prms[count]
10181                         .b1_eval_chroma_iq_and_coeff_data =
10182                         ps_enc_loop_bestprms->as_tu_enc_loop[count].s_tu.b1_y_cbf;
10183                 }
10184                 else
10185                 {
10186                     ps_enc_loop_bestprms->as_tu_enc_loop_temp_prms[count]
10187                         .b1_eval_chroma_iq_and_coeff_data =
10188                         !ps_ctxt->s_chroma_rdopt_ctxt.u1_eval_chrm_rdopt;
10189                 }
10190 #endif
10191             }
10192 
10193             break;
10194         }
10195         default:
10196         {
10197             break;
10198         }
10199         }
10200     }
10201 
10202     /* Not recomputing Luma pred-data and header data for any preset now */
10203     ps_ctxt->s_cu_final_recon_flags.u1_eval_header_data = 1;
10204 }
10205 
10206 /**
10207 ******************************************************************************
10208 *
10209 *  @brief Shrink's TU tree of inter CUs by merging redundnant child nodes
10210 *         (not coded children) into a parent node(not coded).
10211 *
10212 *  @par   Description
10213 *         This is required post RDO evaluation as TU decisions are
10214 *         pre-determined(pre RDO) based on recursive SATD,
10215 *         while the quad children TU's can be skipped during RDO
10216 *
10217 *         The shrink process is applied iteratively till there are no
10218 *         more modes to shrink
10219 *
10220 *  @param[inout]   ps_tu_enc_loop
10221 *       pointer to tu enc loop params of inter cu
10222 *
10223 *  @param[inout]   ps_tu_enc_loop_temp_prms
10224 *       pointer to temp tu enc loop params of inter cu
10225 *
10226 *  @param[in]   num_tu_in_cu
10227 *       number of tus in cu
10228 *
10229 *  @return      modified number of tus in cu
10230 *
10231 ******************************************************************************
10232 */
ihevce_shrink_inter_tu_tree(tu_enc_loop_out_t * ps_tu_enc_loop,tu_enc_loop_temp_prms_t * ps_tu_enc_loop_temp_prms,recon_datastore_t * ps_recon_datastore,WORD32 num_tu_in_cu,UWORD8 u1_is_422)10233 WORD32 ihevce_shrink_inter_tu_tree(
10234     tu_enc_loop_out_t *ps_tu_enc_loop,
10235     tu_enc_loop_temp_prms_t *ps_tu_enc_loop_temp_prms,
10236     recon_datastore_t *ps_recon_datastore,
10237     WORD32 num_tu_in_cu,
10238     UWORD8 u1_is_422)
10239 {
10240     WORD32 recurse = 1;
10241     WORD32 ctr;
10242 
10243     /* ------------- Quadtree TU Split Transform flag optimization ------------  */
10244     /* Post RDO, if all 4 child nodes are not coded the overheads of split TU    */
10245     /* flags and cbf flags are saved by merging to parent node and marking       */
10246     /* parent TU as not coded                                                    */
10247     /*                                                                           */
10248     /*                               ParentTUSplit=1                             */
10249     /*                                      |                                    */
10250     /*       ---------------------------------------------------------           */
10251     /*       |C0(Not coded) | C1(Not coded) | C2(Not coded) | C3(Not coded)      */
10252     /*                                     ||                                    */
10253     /*                                     \/                                    */
10254     /*                                                                           */
10255     /*                              ParentTUSplit=0 (Not Coded)                  */
10256     /*                                                                           */
10257     /* ------------- Quadtree TU Split Transform flag optimization ------------  */
10258     while((num_tu_in_cu > 4) && recurse)
10259     {
10260         recurse = 0;
10261 
10262         /* Validate inter CU */
10263         //ASSERT(ps_tu_enc_loop[0].s_tu.s_tu.b1_intra_flag == 0); /*b1_intra_flag no longer a member of tu structure */
10264 
10265         /* loop for all tu blocks in current cu */
10266         for(ctr = 0; ctr < num_tu_in_cu;)
10267         {
10268             /* Get current tu posx, posy and size */
10269             WORD32 curr_pos_x = ps_tu_enc_loop[ctr].s_tu.b4_pos_x << 2;
10270             WORD32 curr_pos_y = ps_tu_enc_loop[ctr].s_tu.b4_pos_y << 2;
10271             /* +1 is for parents size */
10272             WORD32 parent_tu_size = 1 << (ps_tu_enc_loop[ctr].s_tu.b3_size + 2 + 1);
10273 
10274             /* eval merge if leaf nodes reached i.e all child tus are of same size and first tu pos is same as parent pos */
10275             WORD32 eval_merge = ((curr_pos_x & (parent_tu_size - 1)) == 0);
10276             eval_merge &= ((curr_pos_y & (parent_tu_size - 1)) == 0);
10277 
10278             /* As TUs are published in encode order (Z SCAN),                      */
10279             /* Four consecutive TUS of same size implies we have hit leaf nodes.   */
10280             if(((ps_tu_enc_loop[ctr].s_tu.b3_size) == (ps_tu_enc_loop[ctr + 1].s_tu.b3_size)) &&
10281                ((ps_tu_enc_loop[ctr].s_tu.b3_size) == (ps_tu_enc_loop[ctr + 2].s_tu.b3_size)) &&
10282                ((ps_tu_enc_loop[ctr].s_tu.b3_size) == (ps_tu_enc_loop[ctr + 3].s_tu.b3_size)) &&
10283                eval_merge)
10284             {
10285                 WORD32 merge_parent = 1;
10286 
10287                 /* If any leaf noded is coded, it cannot be merged to parent */
10288                 if((ps_tu_enc_loop[ctr].s_tu.b1_y_cbf) || (ps_tu_enc_loop[ctr].s_tu.b1_cb_cbf) ||
10289                    (ps_tu_enc_loop[ctr].s_tu.b1_cr_cbf) ||
10290 
10291                    (ps_tu_enc_loop[ctr + 1].s_tu.b1_y_cbf) ||
10292                    (ps_tu_enc_loop[ctr + 1].s_tu.b1_cb_cbf) ||
10293                    (ps_tu_enc_loop[ctr + 1].s_tu.b1_cr_cbf) ||
10294 
10295                    (ps_tu_enc_loop[ctr + 2].s_tu.b1_y_cbf) ||
10296                    (ps_tu_enc_loop[ctr + 2].s_tu.b1_cb_cbf) ||
10297                    (ps_tu_enc_loop[ctr + 2].s_tu.b1_cr_cbf) ||
10298 
10299                    (ps_tu_enc_loop[ctr + 3].s_tu.b1_y_cbf) ||
10300                    (ps_tu_enc_loop[ctr + 3].s_tu.b1_cb_cbf) ||
10301                    (ps_tu_enc_loop[ctr + 3].s_tu.b1_cr_cbf))
10302                 {
10303                     merge_parent = 0;
10304                 }
10305 
10306                 if(u1_is_422)
10307                 {
10308                     if((ps_tu_enc_loop[ctr].s_tu.b1_cb_cbf_subtu1) ||
10309                        (ps_tu_enc_loop[ctr].s_tu.b1_cr_cbf_subtu1) ||
10310 
10311                        (ps_tu_enc_loop[ctr + 1].s_tu.b1_cb_cbf_subtu1) ||
10312                        (ps_tu_enc_loop[ctr + 1].s_tu.b1_cr_cbf_subtu1) ||
10313 
10314                        (ps_tu_enc_loop[ctr + 2].s_tu.b1_cb_cbf_subtu1) ||
10315                        (ps_tu_enc_loop[ctr + 2].s_tu.b1_cr_cbf_subtu1) ||
10316 
10317                        (ps_tu_enc_loop[ctr + 3].s_tu.b1_cb_cbf_subtu1) ||
10318                        (ps_tu_enc_loop[ctr + 3].s_tu.b1_cr_cbf_subtu1))
10319                     {
10320                         merge_parent = 0;
10321                     }
10322                 }
10323 
10324                 if(merge_parent)
10325                 {
10326                     /* Merge all the children (ctr,ctr+1,ctr+2,ctr+3) to parent (ctr) */
10327 
10328                     if(ps_recon_datastore->u1_is_lumaRecon_available)
10329                     {
10330                         ps_recon_datastore->au1_bufId_with_winning_LumaRecon[ctr] = UCHAR_MAX;
10331 
10332                         memmove(
10333                             &ps_recon_datastore->au1_bufId_with_winning_LumaRecon[ctr + 1],
10334                             &ps_recon_datastore->au1_bufId_with_winning_LumaRecon[ctr + 4],
10335                             (num_tu_in_cu - ctr - 4) * sizeof(UWORD8));
10336                     }
10337 
10338                     if(ps_recon_datastore->au1_is_chromaRecon_available[0])
10339                     {
10340                         ps_recon_datastore->au1_bufId_with_winning_ChromaRecon[U_PLANE][ctr][0] =
10341                             UCHAR_MAX;
10342                         ps_recon_datastore->au1_bufId_with_winning_ChromaRecon[V_PLANE][ctr][0] =
10343                             UCHAR_MAX;
10344 
10345                         memmove(
10346                             &ps_recon_datastore
10347                                  ->au1_bufId_with_winning_ChromaRecon[U_PLANE][ctr + 1][0],
10348                             &ps_recon_datastore
10349                                  ->au1_bufId_with_winning_ChromaRecon[U_PLANE][ctr + 4][0],
10350                             (num_tu_in_cu - ctr - 4) * sizeof(UWORD8));
10351 
10352                         memmove(
10353                             &ps_recon_datastore
10354                                  ->au1_bufId_with_winning_ChromaRecon[V_PLANE][ctr + 1][0],
10355                             &ps_recon_datastore
10356                                  ->au1_bufId_with_winning_ChromaRecon[V_PLANE][ctr + 4][0],
10357                             (num_tu_in_cu - ctr - 4) * sizeof(UWORD8));
10358 
10359                         if(u1_is_422)
10360                         {
10361                             ps_recon_datastore->au1_bufId_with_winning_ChromaRecon[U_PLANE][ctr][1] =
10362                                 UCHAR_MAX;
10363                             ps_recon_datastore->au1_bufId_with_winning_ChromaRecon[V_PLANE][ctr][1] =
10364                                 UCHAR_MAX;
10365 
10366                             memmove(
10367                                 &ps_recon_datastore
10368                                      ->au1_bufId_with_winning_ChromaRecon[U_PLANE][ctr + 1][1],
10369                                 &ps_recon_datastore
10370                                      ->au1_bufId_with_winning_ChromaRecon[U_PLANE][ctr + 4][1],
10371                                 (num_tu_in_cu - ctr - 4) * sizeof(UWORD8));
10372 
10373                             memmove(
10374                                 &ps_recon_datastore
10375                                      ->au1_bufId_with_winning_ChromaRecon[V_PLANE][ctr + 1][1],
10376                                 &ps_recon_datastore
10377                                      ->au1_bufId_with_winning_ChromaRecon[V_PLANE][ctr + 4][1],
10378                                 (num_tu_in_cu - ctr - 4) * sizeof(UWORD8));
10379                         }
10380                     }
10381 
10382                     /* Parent node size is one more than that of child */
10383                     ps_tu_enc_loop[ctr].s_tu.b3_size++;
10384 
10385                     ctr++;
10386 
10387                     /* move the subsequent TUs to next element */
10388                     ASSERT(num_tu_in_cu >= (ctr + 3));
10389                     memmove(
10390                         (void *)(ps_tu_enc_loop + ctr),
10391                         (void *)(ps_tu_enc_loop + ctr + 3),
10392                         (num_tu_in_cu - ctr - 3) * sizeof(tu_enc_loop_out_t));
10393 
10394                     /* Also memmove the temp TU params */
10395                     memmove(
10396                         (void *)(ps_tu_enc_loop_temp_prms + ctr),
10397                         (void *)(ps_tu_enc_loop_temp_prms + ctr + 3),
10398                         (num_tu_in_cu - ctr - 3) * sizeof(tu_enc_loop_temp_prms_t));
10399 
10400                     /* Number of TUs in CU are now less by 3 */
10401                     num_tu_in_cu -= 3;
10402 
10403                     /* Recurse again as new parent also be can be merged later */
10404                     recurse = 1;
10405                 }
10406                 else
10407                 {
10408                     /* Go to next set of leaf nodes */
10409                     ctr += 4;
10410                 }
10411             }
10412             else
10413             {
10414                 ctr++;
10415             }
10416         }
10417     }
10418 
10419     /* return the modified num TUs*/
10420     ASSERT(num_tu_in_cu > 0);
10421     return (num_tu_in_cu);
10422 }
10423 
ihevce_intra_mode_nxn_hash_updater(UWORD8 * pu1_mode_array,UWORD8 * pu1_hash_table,UWORD8 u1_num_ipe_modes)10424 UWORD8 ihevce_intra_mode_nxn_hash_updater(
10425     UWORD8 *pu1_mode_array, UWORD8 *pu1_hash_table, UWORD8 u1_num_ipe_modes)
10426 {
10427     WORD32 i;
10428     WORD32 i4_mode;
10429 
10430     for(i = 0; i < MAX_INTRA_CU_CANDIDATES; i++)
10431     {
10432         if(pu1_mode_array[i] < 35)
10433         {
10434             if(pu1_mode_array[i] != 0)
10435             {
10436                 i4_mode = pu1_mode_array[i] - 1;
10437 
10438                 if(!pu1_hash_table[i4_mode])
10439                 {
10440                     pu1_hash_table[i4_mode] = 1;
10441                     pu1_mode_array[u1_num_ipe_modes] = i4_mode;
10442                     u1_num_ipe_modes++;
10443                 }
10444             }
10445 
10446             if(pu1_mode_array[i] != 34)
10447             {
10448                 i4_mode = pu1_mode_array[i] + 1;
10449 
10450                 if((!pu1_hash_table[i4_mode]))
10451                 {
10452                     pu1_hash_table[i4_mode] = 1;
10453                     pu1_mode_array[u1_num_ipe_modes] = i4_mode;
10454                     u1_num_ipe_modes++;
10455                 }
10456             }
10457         }
10458     }
10459 
10460     if(!pu1_hash_table[INTRA_PLANAR])
10461     {
10462         pu1_hash_table[INTRA_PLANAR] = 1;
10463         pu1_mode_array[u1_num_ipe_modes] = INTRA_PLANAR;
10464         u1_num_ipe_modes++;
10465     }
10466 
10467     if(!pu1_hash_table[INTRA_DC])
10468     {
10469         pu1_hash_table[INTRA_DC] = 1;
10470         pu1_mode_array[u1_num_ipe_modes] = INTRA_DC;
10471         u1_num_ipe_modes++;
10472     }
10473 
10474     return u1_num_ipe_modes;
10475 }
10476 
#if ENABLE_TU_TREE_DETERMINATION_IN_RDOPT
/*!
******************************************************************************
* \if Function name : ihevce_determine_tu_tree_distribution \endif
*
* \brief
*    Calls the recursive TU SATD function matching the CU size on the
*    prediction of an inter candidate and returns its result
*
* \par Description
*    The error params handed to the SATD function point to the candidate's
*    ai4_tu_split_flag array (presumably filled by the callee - its
*    implementation is not part of this function). For a 64x64 CU the max
*    transform depth passed on is limited to 1. After the call, split flag
*    0 is forced to 1 if the max transform depth is 0, the partition size
*    is non zero and the CU is not 64x64.
*
* \param[in,out] ps_cu_data : inter candidate (pred data, split flags)
* \param[in] ps_func_selector : function selector passed to the SATD function
* \param[in] pi2_scratch_mem : working memory for the SATD function
* \param[in] pu1_inp : pointer to input data
* \param[in] i4_inp_stride : input stride
* \param[in] i4_lambda : lambda passed to the SATD function
* \param[in] u1_lambda_q_shift : lambda q shift passed to the SATD function
* \param[in] u1_cu_size : CU size (8, 16, 32 or 64)
* \param[in] u1_max_tr_depth : max transform depth
*
* \return
*    Value returned by the SATD function
*
*****************************************************************************
*/
WORD32 ihevce_determine_tu_tree_distribution(
    cu_inter_cand_t *ps_cu_data,
    me_func_selector_t *ps_func_selector,
    WORD16 *pi2_scratch_mem,
    UWORD8 *pu1_inp,
    WORD32 i4_inp_stride,
    WORD32 i4_lambda,
    UWORD8 u1_lambda_q_shift,
    UWORD8 u1_cu_size,
    UWORD8 u1_max_tr_depth)
{
    PF_SAD_FXN_TU_REC apf_tu_rec_satd[4];
    err_prms_t s_err_prms;
    WORD32 i4_satd;

    /* SATD function per CU size */
    apf_tu_rec_satd[CU_8x8] = hme_evalsatd_pt_pu_8x8_tu_rec;
    apf_tu_rec_satd[CU_16x16] = hme_evalsatd_pt_pu_16x16_tu_rec;
    apf_tu_rec_satd[CU_32x32] = hme_evalsatd_pt_pu_32x32_tu_rec;
    apf_tu_rec_satd[CU_64x64] = hme_evalsatd_pt_pu_64x64_tu_rec;

    /* Input and reference (prediction) buffers */
    s_err_prms.pu1_inp = pu1_inp;
    s_err_prms.i4_inp_stride = i4_inp_stride;
    s_err_prms.pu1_ref = ps_cu_data->pu1_pred_data;
    s_err_prms.i4_ref_stride = ps_cu_data->i4_pred_data_stride;

    /* Output and working buffers */
    s_err_prms.pi4_sad_grid = &i4_satd;
    s_err_prms.pi4_tu_split_flags = ps_cu_data->ai4_tu_split_flag;
    s_err_prms.pu1_wkg_mem = (UWORD8 *)pi2_scratch_mem;

    /* Transform depth is limited to 1 for a 64x64 CU */
    s_err_prms.u1_max_tr_depth =
        (64 == u1_cu_size) ? (MIN(1, u1_max_tr_depth)) : u1_max_tr_depth;

    /* hme_get_range(size) - 4 is used as the index of the CU size */
    i4_satd = apf_tu_rec_satd[hme_get_range(u1_cu_size) - 4](
        &s_err_prms, i4_lambda, u1_lambda_q_shift, 0, ps_func_selector);

    if((u1_cu_size != 64) && (0 == u1_max_tr_depth) && (0 != ps_cu_data->b3_part_size))
    {
        ps_cu_data->ai4_tu_split_flag[0] = 1;
    }

    return i4_satd;
}
#endif
10528 
/*!
******************************************************************************
* \if Function name : ihevce_populate_nbr_4x4_with_pu_data \endif
*
* \brief
*    Updates the first nbr 4x4 entry with the data of an inter PU and
*    replicates that entry over (b4_wd + 1) x (b4_ht + 1) entries
*
* \par Description
*    In the first entry the intra flag is cleared and the L0 / L1 pred flags
*    and the mv are taken from the PU. All other fields of the first entry
*    keep their current value and are replicated as they are.
*
* \param[in,out] ps_nbr_4x4 : pointer to the first (top left) nbr 4x4 entry
* \param[in] ps_pu : pointer to the PU
* \param[in] i4_nbr_buf_stride : number of entries between two rows
*
* \return
*    None
*
*****************************************************************************
*/
void ihevce_populate_nbr_4x4_with_pu_data(
    nbr_4x4_t *ps_nbr_4x4, pu_t *ps_pu, WORD32 i4_nbr_buf_stride)
{
    nbr_4x4_t s_filled_nbr;
    WORD32 row, col;

    /* PU dimensions are stored as (number of 4x4 units - 1) */
    const WORD32 num_rows = ps_pu->b4_ht + 1;
    const WORD32 num_cols = ps_pu->b4_wd + 1;

    /* Update the top left entry with the PU data */
    ps_nbr_4x4->mv = ps_pu->mv;
    ps_nbr_4x4->b1_pred_l1_flag = (ps_pu->b2_pred_mode > PRED_L0);
    ps_nbr_4x4->b1_pred_l0_flag = !(ps_pu->b2_pred_mode & 1);
    ps_nbr_4x4->b1_intra_flag = 0;

    /* Replicate it over the area covered by the PU */
    s_filled_nbr = *ps_nbr_4x4;

    for(row = 0; row < num_rows; row++)
    {
        nbr_4x4_t *ps_row = ps_nbr_4x4 + (row * i4_nbr_buf_stride);

        for(col = 0; col < num_cols; col++)
        {
            ps_row[col] = s_filled_nbr;
        }
    }
}
10554 
/*!
******************************************************************************
* \if Function name : ihevce_call_luma_inter_pred_rdopt_pass1 \endif
*
* \brief
*    Calls the luma inter prediction for every partition of an inter
*    candidate, writing into the candidate's pu1_pred_data_scr buffer
*
* \par Description
*    A 2Nx2N candidate has one partition, every other partition type has
*    two. Prediction is done for all partitions, including skip and merge
*    ones. After the first of two partitions the destination pointer is
*    moved down by the partition height if the partition is as wide as the
*    CU, and right by the partition width if it is as high as the CU.
*
* \param[in] ps_ctxt : enc loop ctxt, its s_mc_ctxt is passed to inter pred
* \param[in] ps_inter_cand : inter candidate (PUs, pred buffer and stride)
* \param[in] cu_size : CU size in pixels
*
* \return
*    None
*
*****************************************************************************
*/
void ihevce_call_luma_inter_pred_rdopt_pass1(
    ihevce_enc_loop_ctxt_t *ps_ctxt, cu_inter_cand_t *ps_inter_cand, WORD32 cu_size)
{
    UWORD8 *pu1_pred = ps_inter_cand->pu1_pred_data_scr;
    WORD32 pred_stride = ps_inter_cand->i4_pred_data_stride;
    WORD32 num_cu_part = (SIZE_2Nx2N != ps_inter_cand->b3_part_size) + 1;
    WORD32 ctr;

    for(ctr = 0; ctr < num_cu_part; ctr++)
    {
        pu_t *ps_pu = &ps_inter_cand->as_inter_pu[ctr];

        ihevce_luma_inter_pred_pu(&ps_ctxt->s_mc_ctxt, ps_pu, pu1_pred, pred_stride, 1);

        /* Move to the start of the second partition */
        if((2 == num_cu_part) && (0 == ctr))
        {
            /* In case of AMP each partition can have diff wd ht */
            WORD32 inter_pu_wd = (ps_pu->b4_wd + 1) << 2;
            WORD32 inter_pu_ht = (ps_pu->b4_ht + 1) << 2;

            /* 2Nx__ partition case */
            if(inter_pu_wd == cu_size)
            {
                pu1_pred += (inter_pu_ht * pred_stride);
            }

            /* __x2N partition case */
            if(inter_pu_ht == cu_size)
            {
                pu1_pred += inter_pu_wd;
            }
        }
    }
}
10596 
ihevce_it_recon_ssd(ihevce_enc_loop_ctxt_t * ps_ctxt,UWORD8 * pu1_src,WORD32 i4_src_strd,UWORD8 * pu1_pred,WORD32 i4_pred_strd,WORD16 * pi2_deq_data,WORD32 i4_deq_data_strd,UWORD8 * pu1_recon,WORD32 i4_recon_stride,UWORD8 * pu1_ecd_data,UWORD8 u1_trans_size,UWORD8 u1_pred_mode,WORD32 i4_cbf,WORD32 i4_zero_col,WORD32 i4_zero_row,CHROMA_PLANE_ID_T e_chroma_plane)10597 LWORD64 ihevce_it_recon_ssd(
10598     ihevce_enc_loop_ctxt_t *ps_ctxt,
10599     UWORD8 *pu1_src,
10600     WORD32 i4_src_strd,
10601     UWORD8 *pu1_pred,
10602     WORD32 i4_pred_strd,
10603     WORD16 *pi2_deq_data,
10604     WORD32 i4_deq_data_strd,
10605     UWORD8 *pu1_recon,
10606     WORD32 i4_recon_stride,
10607     UWORD8 *pu1_ecd_data,
10608     UWORD8 u1_trans_size,
10609     UWORD8 u1_pred_mode,
10610     WORD32 i4_cbf,
10611     WORD32 i4_zero_col,
10612     WORD32 i4_zero_row,
10613     CHROMA_PLANE_ID_T e_chroma_plane)
10614 {
10615     if(NULL_PLANE == e_chroma_plane)
10616     {
10617         ihevce_it_recon_fxn(
10618             ps_ctxt,
10619             pi2_deq_data,
10620             i4_deq_data_strd,
10621             pu1_pred,
10622             i4_pred_strd,
10623             pu1_recon,
10624             i4_recon_stride,
10625             pu1_ecd_data,
10626             u1_trans_size,
10627             u1_pred_mode,
10628             i4_cbf,
10629             i4_zero_col,
10630             i4_zero_row);
10631 
10632         return ps_ctxt->s_cmn_opt_func.pf_ssd_calculator(
10633             pu1_recon, pu1_src, i4_recon_stride, i4_src_strd, u1_trans_size, u1_trans_size);
10634     }
10635     else
10636     {
10637         ihevce_chroma_it_recon_fxn(
10638             ps_ctxt,
10639             pi2_deq_data,
10640             i4_deq_data_strd,
10641             pu1_pred,
10642             i4_pred_strd,
10643             pu1_recon,
10644             i4_recon_stride,
10645             pu1_ecd_data,
10646             u1_trans_size,
10647             i4_cbf,
10648             i4_zero_col,
10649             i4_zero_row,
10650             e_chroma_plane);
10651 
10652         return ps_ctxt->s_cmn_opt_func.pf_chroma_interleave_ssd_calculator(
10653             pu1_recon + (e_chroma_plane == V_PLANE),
10654             pu1_src + (e_chroma_plane == V_PLANE),
10655             i4_recon_stride,
10656             i4_src_strd,
10657             u1_trans_size,
10658             u1_trans_size);
10659     }
10660 }
10661 
10662 /*!
10663 ******************************************************************************
* \if Function name : ihevce_chroma_t_q_iq_ssd_scan_fxn \endif
10665 *
10666 * \brief
10667 *    Transform unit level (Chroma) enc_loop function
10668 *
10669 * \param[in] ps_ctxt    enc_loop module ctxt pointer
10670 * \param[in] pu1_pred       pointer to predicted data buffer
10671 * \param[in] pred_strd      predicted buffer stride
10672 * \param[in] pu1_src    pointer to source data buffer
10673 * \param[in] src_strd   source buffer stride
10674 * \param[in] pi2_deq_data   pointer to store iq data
10675 * \param[in] deq_data_strd  iq data buffer stride
10676 * \param[out] pu1_ecd_data  pointer coeff output buffer (input to ent cod)
10677 * \param[out] pu1_csbf_buf  pointer to store the csbf for all 4x4 in a current
10678 *                           block
10679 * \param[out] csbf_strd     csbf buffer stride
10680 * \param[in] trans_size     transform size (4, 8, 16)
10681 * \param[in] intra_flag     0:Inter/Skip 1:Intra
* \param[out] pi4_coeff_off pointer to store the number of bytes produced in
*                           coeff buffer for the current TU in RDopt Mode
10685 * \param[out] pi4_zero_col  pointer to store the zero_col info for the TU
10686 * \param[out] pi4_zero_row  pointer to store the zero_row info for the TU
10687 *
10688 * \return
10689 *    CBF of the current block
10690 *
10691 * \author
10692 *  Ittiam
10693 *
10694 *****************************************************************************
10695 */
ihevce_chroma_t_q_iq_ssd_scan_fxn(ihevce_enc_loop_ctxt_t * ps_ctxt,UWORD8 * pu1_pred,WORD32 pred_strd,UWORD8 * pu1_src,WORD32 src_strd,WORD16 * pi2_deq_data,WORD32 deq_data_strd,UWORD8 * pu1_recon,WORD32 i4_recon_stride,UWORD8 * pu1_ecd_data,UWORD8 * pu1_csbf_buf,WORD32 csbf_strd,WORD32 trans_size,WORD32 i4_scan_idx,WORD32 intra_flag,WORD32 * pi4_coeff_off,WORD32 * pi4_tu_bits,WORD32 * pi4_zero_col,WORD32 * pi4_zero_row,UWORD8 * pu1_is_recon_available,WORD32 i4_perform_sbh,WORD32 i4_perform_rdoq,LWORD64 * pi8_cost,WORD32 i4_alpha_stim_multiplier,UWORD8 u1_is_cu_noisy,UWORD8 u1_is_skip,SSD_TYPE_T e_ssd_type,CHROMA_PLANE_ID_T e_chroma_plane)10696 WORD32 ihevce_chroma_t_q_iq_ssd_scan_fxn(
10697     ihevce_enc_loop_ctxt_t *ps_ctxt,
10698     UWORD8 *pu1_pred,
10699     WORD32 pred_strd,
10700     UWORD8 *pu1_src,
10701     WORD32 src_strd,
10702     WORD16 *pi2_deq_data,
10703     WORD32 deq_data_strd,
10704     UWORD8 *pu1_recon,
10705     WORD32 i4_recon_stride,
10706     UWORD8 *pu1_ecd_data,
10707     UWORD8 *pu1_csbf_buf,
10708     WORD32 csbf_strd,
10709     WORD32 trans_size,
10710     WORD32 i4_scan_idx,
10711     WORD32 intra_flag,
10712     WORD32 *pi4_coeff_off,
10713     WORD32 *pi4_tu_bits,
10714     WORD32 *pi4_zero_col,
10715     WORD32 *pi4_zero_row,
10716     UWORD8 *pu1_is_recon_available,
10717     WORD32 i4_perform_sbh,
10718     WORD32 i4_perform_rdoq,
10719     LWORD64 *pi8_cost,
10720 #if USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
10721     WORD32 i4_alpha_stim_multiplier,
10722     UWORD8 u1_is_cu_noisy,
10723 #endif
10724     UWORD8 u1_is_skip,
10725     SSD_TYPE_T e_ssd_type,
10726     CHROMA_PLANE_ID_T e_chroma_plane)
10727 {
10728     WORD32 trans_idx, cbf, u4_blk_sad;
10729     WORD16 *pi2_quant_coeffs;
10730     WORD16 *pi2_trans_values;
10731     WORD32 quant_scale_mat_offset;
10732     WORD32 *pi4_trans_scratch;
10733     WORD32 *pi4_subBlock2csbfId_map = NULL;
10734 
10735 #if PROHIBIT_INTRA_QUANT_ROUNDING_FACTOR_TO_DROP_BELOW_1BY3
10736     WORD32 ai4_quant_rounding_factors[3][MAX_TU_SIZE * MAX_TU_SIZE], i;
10737 #endif
10738 
10739     rdoq_sbh_ctxt_t *ps_rdoq_sbh_ctxt = &ps_ctxt->s_rdoq_sbh_ctxt;
10740 
10741     WORD32 i4_perform_zcbf = (ps_ctxt->i4_zcbf_rdo_level == ZCBF_ENABLE) ||
10742                              (!intra_flag && ENABLE_INTER_ZCU_COST);
10743     WORD32 i4_perform_coeff_level_rdoq =
10744         (ps_ctxt->i4_quant_rounding_level != FIXED_QUANT_ROUNDING) &&
10745         (ps_ctxt->i4_chroma_quant_rounding_level == CHROMA_QUANT_ROUNDING);
10746 
10747     ASSERT((e_chroma_plane == U_PLANE) || (e_chroma_plane == V_PLANE));
10748     ASSERT(csbf_strd == MAX_TU_IN_CTB_ROW);
10749 
10750     *pi4_coeff_off = 0;
10751     *pi4_tu_bits = 0;
10752     pu1_is_recon_available[0] = 0;
10753 
10754     pi4_trans_scratch = (WORD32 *)&ps_ctxt->ai2_scratch[0];
10755     pi2_quant_coeffs = &ps_ctxt->ai2_scratch[0];
10756     pi2_trans_values = &ps_ctxt->ai2_scratch[0] + (MAX_TRANS_SIZE * 2);
10757 
10758     if(2 == trans_size)
10759     {
10760         trans_size = 4;
10761     }
10762 
10763     /* translate the transform size to index */
10764     trans_idx = trans_size >> 2;
10765 
10766     if(16 == trans_size)
10767     {
10768         trans_idx = 3;
10769     }
10770 
10771     if(u1_is_skip)
10772     {
10773         pi8_cost[0] = ps_ctxt->s_cmn_opt_func.pf_chroma_interleave_ssd_calculator(
10774             pu1_pred + e_chroma_plane,
10775             pu1_src + e_chroma_plane,
10776             pred_strd,
10777             src_strd,
10778             trans_size,
10779             trans_size);
10780 
10781         if(e_ssd_type == SPATIAL_DOMAIN_SSD)
10782         {
10783             /* buffer copy fromp pred to recon */
10784             ps_ctxt->s_cmn_opt_func.pf_chroma_interleave_2d_copy(
10785                 pu1_pred,
10786                 pred_strd,
10787                 pu1_recon,
10788                 i4_recon_stride,
10789                 trans_size,
10790                 trans_size,
10791                 e_chroma_plane);
10792 
10793             pu1_is_recon_available[0] = 1;
10794         }
10795 
10796 #if USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
10797         if(u1_is_cu_noisy && i4_alpha_stim_multiplier)
10798         {
10799             pi8_cost[0] = ihevce_inject_stim_into_distortion(
10800                 pu1_src,
10801                 src_strd,
10802                 pu1_pred,
10803                 pred_strd,
10804                 pi8_cost[0],
10805                 i4_alpha_stim_multiplier,
10806                 trans_size,
10807                 0,
10808                 ps_ctxt->u1_enable_psyRDOPT,
10809                 e_chroma_plane);
10810         }
10811 #endif
10812 
10813 #if ENABLE_INTER_ZCU_COST
10814 #if !WEIGH_CHROMA_COST
10815         /* cbf = 0, accumulate cu not coded cost */
10816         ps_ctxt->i8_cu_not_coded_cost += pi8_cost[0];
10817 #else
10818         ps_ctxt->i8_cu_not_coded_cost += (pi8_cost[0] * ps_ctxt->u4_chroma_cost_weighing_factor +
10819                                           (1 << (CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT - 1))) >>
10820                                          CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT;
10821 #endif
10822 #endif
10823 
10824         return 0;
10825     }
10826 
10827     if(intra_flag == 1)
10828     {
10829         quant_scale_mat_offset = 0;
10830 
10831 #if PROHIBIT_INTRA_QUANT_ROUNDING_FACTOR_TO_DROP_BELOW_1BY3
10832         ai4_quant_rounding_factors[0][0] =
10833             MAX(ps_ctxt->i4_quant_rnd_factor[intra_flag], (1 << QUANT_ROUND_FACTOR_Q) / 3);
10834 
10835         for(i = 0; i < trans_size * trans_size; i++)
10836         {
10837             ai4_quant_rounding_factors[1][i] =
10838                 MAX(ps_ctxt->pi4_quant_round_factor_cr_cu_ctb_0_1[trans_size >> 3][i],
10839                     (1 << QUANT_ROUND_FACTOR_Q) / 3);
10840             ai4_quant_rounding_factors[2][i] =
10841                 MAX(ps_ctxt->pi4_quant_round_factor_cr_cu_ctb_1_2[trans_size >> 3][i],
10842                     (1 << QUANT_ROUND_FACTOR_Q) / 3);
10843         }
10844 #endif
10845     }
10846     else
10847     {
10848         quant_scale_mat_offset = NUM_TRANS_TYPES;
10849     }
10850 
10851     switch(trans_size)
10852     {
10853     case 4:
10854     {
10855         pi4_subBlock2csbfId_map = gai4_subBlock2csbfId_map4x4TU;
10856 
10857         break;
10858     }
10859     case 8:
10860     {
10861         pi4_subBlock2csbfId_map = gai4_subBlock2csbfId_map8x8TU;
10862 
10863         break;
10864     }
10865     case 16:
10866     {
10867         pi4_subBlock2csbfId_map = gai4_subBlock2csbfId_map16x16TU;
10868 
10869         break;
10870     }
10871     case 32:
10872     {
10873         pi4_subBlock2csbfId_map = gai4_subBlock2csbfId_map32x32TU;
10874 
10875         break;
10876     }
10877     }
10878 
10879     /* ---------- call residue and transform block ------- */
10880     u4_blk_sad = ps_ctxt->apf_chrm_resd_trns[trans_idx - 1](
10881         pu1_src + (e_chroma_plane == V_PLANE),
10882         pu1_pred + (e_chroma_plane == V_PLANE),
10883         pi4_trans_scratch,
10884         pi2_trans_values,
10885         src_strd,
10886         pred_strd,
10887         ((trans_size << 16) + 1)); /* dst strd and chroma flag are packed together */
10888     (void)u4_blk_sad;
10889     /* -------- calculate SSD calculation in Transform Domain ------ */
10890 
10891     cbf = ps_ctxt->apf_quant_iquant_ssd
10892               [i4_perform_coeff_level_rdoq + (e_ssd_type != FREQUENCY_DOMAIN_SSD) * 2]
10893 
10894           (pi2_trans_values,
10895            ps_ctxt->api2_rescal_mat[trans_idx + quant_scale_mat_offset],
10896            pi2_quant_coeffs,
10897            pi2_deq_data,
10898            trans_size,
10899            ps_ctxt->i4_chrm_cu_qp_div6,
10900            ps_ctxt->i4_chrm_cu_qp_mod6,
10901 #if !PROHIBIT_INTRA_QUANT_ROUNDING_FACTOR_TO_DROP_BELOW_1BY3
10902            ps_ctxt->i4_quant_rnd_factor[intra_flag],
10903            ps_ctxt->pi4_quant_round_factor_cr_cu_ctb_0_1[trans_size >> 3],
10904            ps_ctxt->pi4_quant_round_factor_cr_cu_ctb_1_2[trans_size >> 3],
10905 #else
10906            intra_flag ? ai4_quant_rounding_factors[0][0] : ps_ctxt->i4_quant_rnd_factor[intra_flag],
10907            intra_flag ? ai4_quant_rounding_factors[1]
10908                       : ps_ctxt->pi4_quant_round_factor_cr_cu_ctb_0_1[trans_size >> 3],
10909            intra_flag ? ai4_quant_rounding_factors[2]
10910                       : ps_ctxt->pi4_quant_round_factor_cr_cu_ctb_1_2[trans_size >> 3],
10911 #endif
10912            trans_size,
10913            trans_size,
10914            deq_data_strd,
10915            pu1_csbf_buf,
10916            csbf_strd,
10917            pi4_zero_col,
10918            pi4_zero_row,
10919            ps_ctxt->api2_scal_mat[trans_idx + quant_scale_mat_offset],
10920            pi8_cost);
10921 
10922     if(e_ssd_type != FREQUENCY_DOMAIN_SSD)
10923     {
10924         pi8_cost[0] = UINT_MAX;
10925     }
10926 
10927     if(0 != cbf)
10928     {
10929         if(i4_perform_sbh || i4_perform_rdoq)
10930         {
10931             ps_rdoq_sbh_ctxt->i4_iq_data_strd = deq_data_strd;
10932             ps_rdoq_sbh_ctxt->i4_q_data_strd = trans_size;
10933 
10934             ps_rdoq_sbh_ctxt->i4_qp_div = ps_ctxt->i4_chrm_cu_qp_div6;
10935             ps_rdoq_sbh_ctxt->i2_qp_rem = ps_ctxt->i4_chrm_cu_qp_mod6;
10936             ps_rdoq_sbh_ctxt->i4_scan_idx = i4_scan_idx;
10937             ps_rdoq_sbh_ctxt->i8_ssd_cost = *pi8_cost;
10938             ps_rdoq_sbh_ctxt->i4_trans_size = trans_size;
10939 
10940             ps_rdoq_sbh_ctxt->pi2_dequant_coeff =
10941                 ps_ctxt->api2_scal_mat[trans_idx + quant_scale_mat_offset];
10942             ps_rdoq_sbh_ctxt->pi2_iquant_coeffs = pi2_deq_data;
10943             ps_rdoq_sbh_ctxt->pi2_quant_coeffs = pi2_quant_coeffs;
10944             ps_rdoq_sbh_ctxt->pi2_trans_values = pi2_trans_values;
10945             ps_rdoq_sbh_ctxt->pu1_csbf_buf = pu1_csbf_buf;
10946             ps_rdoq_sbh_ctxt->pi4_subBlock2csbfId_map = pi4_subBlock2csbfId_map;
10947 
10948             if((!i4_perform_rdoq))
10949             {
10950                 ihevce_sign_data_hiding(ps_rdoq_sbh_ctxt);
10951 
10952                 pi8_cost[0] = ps_rdoq_sbh_ctxt->i8_ssd_cost;
10953             }
10954         }
10955 
10956         /* ------- call coeffs scan function ------- */
10957         *pi4_coeff_off = ps_ctxt->s_cmn_opt_func.pf_scan_coeffs(
10958             pi2_quant_coeffs,
10959             pi4_subBlock2csbfId_map,
10960             i4_scan_idx,
10961             trans_size,
10962             pu1_ecd_data,
10963             pu1_csbf_buf,
10964             csbf_strd);
10965     }
10966 
10967     /*  Normalize Cost. Note : trans_idx, not (trans_idx-1) */
10968     pi8_cost[0] >>= ga_trans_shift[trans_idx];
10969 
10970 #if RDOPT_ZERO_CBF_ENABLE
10971     if((0 != cbf))
10972     {
10973         WORD32 tu_bits;
10974         LWORD64 zero_cbf_cost_u, curr_cb_cod_cost;
10975 
10976         zero_cbf_cost_u = 0;
10977 
10978         /*Populating the feilds of rdoq_ctxt structure*/
10979         if(i4_perform_rdoq)
10980         {
10981             //memset(ps_rdoq_sbh_ctxt,0,sizeof(rdoq_sbh_ctxt_t));
10982             /* transform size to log2transform size */
10983             GETRANGE(ps_rdoq_sbh_ctxt->i4_log2_trans_size, trans_size);
10984             ps_rdoq_sbh_ctxt->i4_log2_trans_size -= 1;
10985 
10986             ps_rdoq_sbh_ctxt->i8_cl_ssd_lambda_qf = ps_ctxt->i8_cl_ssd_lambda_chroma_qf;
10987             ps_rdoq_sbh_ctxt->i4_is_luma = 0;
10988             ps_rdoq_sbh_ctxt->i4_shift_val_ssd_in_td = ga_trans_shift[trans_idx];
10989             ps_rdoq_sbh_ctxt->i4_round_val_ssd_in_td =
10990                 (1 << (ps_rdoq_sbh_ctxt->i4_shift_val_ssd_in_td - 1));
10991             ps_rdoq_sbh_ctxt->i1_tu_is_coded = 0;
10992             ps_rdoq_sbh_ctxt->pi4_zero_col = pi4_zero_col;
10993             ps_rdoq_sbh_ctxt->pi4_zero_row = pi4_zero_row;
10994         }
10995         else if(i4_perform_zcbf)
10996         {
10997             /* cost of zero cbf encoding */
10998             zero_cbf_cost_u =
10999 
11000                 ps_ctxt->s_cmn_opt_func.pf_chroma_interleave_ssd_calculator(
11001                     pu1_pred + (e_chroma_plane == V_PLANE),
11002                     pu1_src + (e_chroma_plane == V_PLANE),
11003                     pred_strd,
11004                     src_strd,
11005                     trans_size,
11006                     trans_size);
11007         }
11008 
11009         /************************************************************************/
11010         /* call the entropy rdo encode to get the bit estimate for current tu   */
11011         /* note that tu includes only residual coding bits and does not include */
11012         /* tu split, cbf and qp delta encoding bits for a TU                    */
11013         /************************************************************************/
11014         if(i4_perform_rdoq)
11015         {
11016             tu_bits = ihevce_entropy_rdo_encode_tu_rdoq(
11017                 &ps_ctxt->s_rdopt_entropy_ctxt,
11018                 pu1_ecd_data,
11019                 trans_size,
11020                 0,
11021                 ps_rdoq_sbh_ctxt,
11022                 pi8_cost,
11023                 &zero_cbf_cost_u,
11024                 0);
11025             //Currently, we are not accounting for sign bit in RDOPT bits calculation when RDOQ is turned on
11026 
11027             if(ps_rdoq_sbh_ctxt->i1_tu_is_coded == 0)
11028             {
11029                 cbf = 0;
11030 
11031                 /* num bytes is set to 0 */
11032                 *pi4_coeff_off = 0;
11033             }
11034 
11035             (*pi4_tu_bits) += tu_bits;
11036 
11037             if((i4_perform_sbh) && (0 != cbf))
11038             {
11039                 ps_rdoq_sbh_ctxt->i8_ssd_cost = pi8_cost[0];
11040 
11041                 ihevce_sign_data_hiding(ps_rdoq_sbh_ctxt);
11042 
11043                 pi8_cost[0] = ps_rdoq_sbh_ctxt->i8_ssd_cost;
11044             }
11045 
11046             /*Add round value before normalizing*/
11047             pi8_cost[0] += ps_rdoq_sbh_ctxt->i4_round_val_ssd_in_td;
11048             pi8_cost[0] >>= ga_trans_shift[trans_idx];
11049 
11050             if(ps_rdoq_sbh_ctxt->i1_tu_is_coded == 1)
11051             {
11052                 *pi4_coeff_off = ps_ctxt->s_cmn_opt_func.pf_scan_coeffs(
11053                     pi2_quant_coeffs,
11054                     pi4_subBlock2csbfId_map,
11055                     i4_scan_idx,
11056                     trans_size,
11057                     pu1_ecd_data,
11058                     ps_rdoq_sbh_ctxt->pu1_csbf_buf,
11059                     csbf_strd);
11060             }
11061         }
11062         else
11063         {
11064             /************************************************************************/
11065             /* call the entropy rdo encode to get the bit estimate for current tu   */
11066             /* note that tu includes only residual coding bits and does not include */
11067             /* tu split, cbf and qp delta encoding bits for a TU                    */
11068             /************************************************************************/
11069             tu_bits = ihevce_entropy_rdo_encode_tu(
11070                 &ps_ctxt->s_rdopt_entropy_ctxt, pu1_ecd_data, trans_size, 0, i4_perform_sbh);
11071 
11072             (*pi4_tu_bits) += tu_bits;
11073         }
11074 
11075         if(e_ssd_type == SPATIAL_DOMAIN_SSD)
11076         {
11077             pi8_cost[0] = ihevce_it_recon_ssd(
11078                 ps_ctxt,
11079                 pu1_src,
11080                 src_strd,
11081                 pu1_pred,
11082                 pred_strd,
11083                 pi2_deq_data,
11084                 deq_data_strd,
11085                 pu1_recon,
11086                 i4_recon_stride,
11087                 pu1_ecd_data,
11088                 trans_size,
11089                 PRED_MODE_INTRA,
11090                 cbf,
11091                 pi4_zero_col[0],
11092                 pi4_zero_row[0],
11093                 e_chroma_plane);
11094 
11095             pu1_is_recon_available[0] = 1;
11096         }
11097 
11098 #if USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
11099         if(u1_is_cu_noisy && (e_ssd_type == SPATIAL_DOMAIN_SSD) && i4_alpha_stim_multiplier)
11100         {
11101             pi8_cost[0] = ihevce_inject_stim_into_distortion(
11102                 pu1_src,
11103                 src_strd,
11104                 pu1_recon,
11105                 i4_recon_stride,
11106                 pi8_cost[0],
11107                 i4_alpha_stim_multiplier,
11108                 trans_size,
11109                 0,
11110                 ps_ctxt->u1_enable_psyRDOPT,
11111                 e_chroma_plane);
11112         }
11113         else if(u1_is_cu_noisy && (e_ssd_type == FREQUENCY_DOMAIN_SSD) && i4_alpha_stim_multiplier)
11114         {
11115             pi8_cost[0] = ihevce_inject_stim_into_distortion(
11116                 pu1_src,
11117                 src_strd,
11118                 pu1_pred,
11119                 pred_strd,
11120                 pi8_cost[0],
11121                 i4_alpha_stim_multiplier,
11122                 trans_size,
11123                 0,
11124                 ps_ctxt->u1_enable_psyRDOPT,
11125                 e_chroma_plane);
11126         }
11127 #endif
11128 
11129         curr_cb_cod_cost = pi8_cost[0];
11130 
11131         /* add the SSD cost to bits estimate given by ECD */
11132         curr_cb_cod_cost +=
11133             COMPUTE_RATE_COST_CLIP30(tu_bits, ps_ctxt->i8_cl_ssd_lambda_chroma_qf, LAMBDA_Q_SHIFT);
11134 
11135         if(i4_perform_zcbf)
11136         {
11137 #if USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
11138             if(u1_is_cu_noisy && i4_alpha_stim_multiplier)
11139             {
11140                 zero_cbf_cost_u = ihevce_inject_stim_into_distortion(
11141                     pu1_src,
11142                     src_strd,
11143                     pu1_pred,
11144                     pred_strd,
11145                     zero_cbf_cost_u,
11146                     !ps_ctxt->u1_is_refPic ? ALPHA_FOR_ZERO_CODING_DECISIONS
11147                                            : ((100 - ALPHA_DISCOUNT_IN_REF_PICS_IN_RDOPT) *
11148                                               (double)ALPHA_FOR_ZERO_CODING_DECISIONS) /
11149                                                  100.0,
11150                     trans_size,
11151                     0,
11152                     ps_ctxt->u1_enable_psyRDOPT,
11153                     e_chroma_plane);
11154             }
11155 #endif
11156             /* force the tu as zero cbf if zero_cbf_cost is lower */
11157             if(zero_cbf_cost_u < curr_cb_cod_cost)
11158             {
11159                 *pi4_coeff_off = 0;
11160                 cbf = 0;
11161                 (*pi4_tu_bits) = 0;
11162                 pi8_cost[0] = zero_cbf_cost_u;
11163 
11164                 pu1_is_recon_available[0] = 0;
11165 
11166                 if(e_ssd_type == SPATIAL_DOMAIN_SSD)
11167                 {
11168                     ps_ctxt->s_cmn_opt_func.pf_chroma_interleave_2d_copy(
11169                         pu1_pred,
11170                         pred_strd,
11171                         pu1_recon,
11172                         i4_recon_stride,
11173                         trans_size,
11174                         trans_size,
11175                         e_chroma_plane);
11176 
11177                     pu1_is_recon_available[0] = 1;
11178                 }
11179             }
11180 
11181 #if ENABLE_INTER_ZCU_COST
11182             if(!intra_flag)
11183             {
11184 #if !WEIGH_CHROMA_COST
11185                 ps_ctxt->i8_cu_not_coded_cost += zero_cbf_cost_u;
11186 #else
11187                 ps_ctxt->i8_cu_not_coded_cost += (LWORD64)(
11188                     (zero_cbf_cost_u * ps_ctxt->u4_chroma_cost_weighing_factor +
11189                      (1 << (CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT - 1))) >>
11190                     CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT);
11191 #endif
11192             }
11193 #endif
11194         }
11195     }
11196     else
11197     {
11198         if(e_ssd_type == SPATIAL_DOMAIN_SSD)
11199         {
11200             pi8_cost[0] = ihevce_it_recon_ssd(
11201                 ps_ctxt,
11202                 pu1_src,
11203                 src_strd,
11204                 pu1_pred,
11205                 pred_strd,
11206                 pi2_deq_data,
11207                 deq_data_strd,
11208                 pu1_recon,
11209                 i4_recon_stride,
11210                 pu1_ecd_data,
11211                 trans_size,
11212                 PRED_MODE_INTRA,
11213                 cbf,
11214                 pi4_zero_col[0],
11215                 pi4_zero_row[0],
11216                 e_chroma_plane);
11217 
11218             pu1_is_recon_available[0] = 1;
11219         }
11220 
11221 #if USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
11222         if(u1_is_cu_noisy && (e_ssd_type == SPATIAL_DOMAIN_SSD) && i4_alpha_stim_multiplier)
11223         {
11224             pi8_cost[0] = ihevce_inject_stim_into_distortion(
11225                 pu1_src,
11226                 src_strd,
11227                 pu1_recon,
11228                 i4_recon_stride,
11229                 pi8_cost[0],
11230                 !ps_ctxt->u1_is_refPic ? ALPHA_FOR_ZERO_CODING_DECISIONS
11231                                        : ((100 - ALPHA_DISCOUNT_IN_REF_PICS_IN_RDOPT) *
11232                                           (double)ALPHA_FOR_ZERO_CODING_DECISIONS) /
11233                                              100.0,
11234                 trans_size,
11235                 0,
11236                 ps_ctxt->u1_enable_psyRDOPT,
11237                 e_chroma_plane);
11238         }
11239         else if(u1_is_cu_noisy && (e_ssd_type == FREQUENCY_DOMAIN_SSD) && i4_alpha_stim_multiplier)
11240         {
11241             pi8_cost[0] = ihevce_inject_stim_into_distortion(
11242                 pu1_src,
11243                 src_strd,
11244                 pu1_pred,
11245                 pred_strd,
11246                 pi8_cost[0],
11247                 !ps_ctxt->u1_is_refPic ? ALPHA_FOR_ZERO_CODING_DECISIONS
11248                                        : ((100 - ALPHA_DISCOUNT_IN_REF_PICS_IN_RDOPT) *
11249                                           (double)ALPHA_FOR_ZERO_CODING_DECISIONS) /
11250                                              100.0,
11251                 trans_size,
11252                 0,
11253                 ps_ctxt->u1_enable_psyRDOPT,
11254                 e_chroma_plane);
11255         }
11256 #endif
11257 
11258 #if ENABLE_INTER_ZCU_COST
11259         if(!intra_flag)
11260         {
11261 #if !WEIGH_CHROMA_COST
11262             /* cbf = 0, accumulate cu not coded cost */
11263             ps_ctxt->i8_cu_not_coded_cost += pi8_cost[0];
11264 #else
11265             /* cbf = 0, accumulate cu not coded cost */
11266 
11267             ps_ctxt->i8_cu_not_coded_cost += (LWORD64)(
11268                 (pi8_cost[0] * ps_ctxt->u4_chroma_cost_weighing_factor +
11269                  (1 << (CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT - 1))) >>
11270                 CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT);
11271 #endif
11272         }
11273 #endif
11274     }
11275 #endif /* RDOPT_ZERO_CBF_ENABLE */
11276 
11277     return (cbf);
11278 }
11279