1 /******************************************************************************
2  *
3  * Copyright (C) 2018 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 
21 /*!
22 ******************************************************************************
23 * \file ihevce_recur_bracketing.c
24 *
25 * \brief
26 *    This file contains interface functions of recursive bracketing
27 *    module
28 * \date
29 *    12/02/2012
30 *
31 * \author
32 *    Ittiam
33 *
34 * List of Functions
35 *
36 *
37 ******************************************************************************
38 */
39 
40 /*****************************************************************************/
41 /* File Includes                                                             */
42 /*****************************************************************************/
43 /* System include files */
44 #include <stdio.h>
45 #include <string.h>
46 #include <stdlib.h>
47 #include <assert.h>
48 #include <stdarg.h>
49 #include <math.h>
50 
51 /* User include files */
52 #include "ihevc_typedefs.h"
53 #include "itt_video_api.h"
54 #include "ihevce_api.h"
55 
56 #include "rc_cntrl_param.h"
57 #include "rc_frame_info_collector.h"
58 #include "rc_look_ahead_params.h"
59 
60 #include "ihevc_defs.h"
61 #include "ihevc_structs.h"
62 #include "ihevc_platform_macros.h"
63 #include "ihevc_deblk.h"
64 #include "ihevc_itrans_recon.h"
65 #include "ihevc_chroma_itrans_recon.h"
66 #include "ihevc_chroma_intra_pred.h"
67 #include "ihevc_intra_pred.h"
68 #include "ihevc_inter_pred.h"
69 #include "ihevc_mem_fns.h"
70 #include "ihevc_padding.h"
71 #include "ihevc_weighted_pred.h"
72 #include "ihevc_sao.h"
73 #include "ihevc_resi_trans.h"
74 #include "ihevc_quant_iquant_ssd.h"
75 #include "ihevc_cabac_tables.h"
76 
77 #include "ihevce_defs.h"
78 #include "ihevce_lap_enc_structs.h"
79 #include "ihevce_multi_thrd_structs.h"
80 #include "ihevce_me_common_defs.h"
81 #include "ihevce_had_satd.h"
82 #include "ihevce_error_codes.h"
83 #include "ihevce_bitstream.h"
84 #include "ihevce_cabac.h"
85 #include "ihevce_rdoq_macros.h"
86 #include "ihevce_function_selector.h"
87 #include "ihevce_enc_structs.h"
88 #include "ihevce_entropy_structs.h"
89 #include "ihevce_cmn_utils_instr_set_router.h"
90 #include "ihevce_enc_loop_structs.h"
91 #include "ihevce_ipe_instr_set_router.h"
92 #include "ihevce_ipe_structs.h"
93 #include "ihevce_ipe_pass.h"
94 #include "ihevce_recur_bracketing.h"
95 #include "ihevce_nbr_avail.h"
96 #include "ihevc_common_tables.h"
97 #include "ihevce_decomp_pre_intra_structs.h"
98 #include "ihevce_decomp_pre_intra_pass.h"
99 
100 #include "cast_types.h"
101 #include "osal.h"
102 #include "osal_defaults.h"
103 
104 /*****************************************************************************/
105 /* Constant Macros                                                           */
106 /*****************************************************************************/
/* Not referenced in the visible part of this file; presumably a compile-time
 * switch for L1/L2 intra-prediction debug code - TODO confirm against users */
#define IP_DBG_L1_l2 0
/* Not referenced in the visible part of this file; presumably a cost bias
 * applied when comparing child CUs against their parent - TODO confirm */
#define CHILD_BIAS 12
109 
110 /*****************************************************************************/
111 /* Globals                                                                   */
112 /*****************************************************************************/
/* Luma intra-prediction function table (defined elsewhere). In this file it */
/* is always indexed as g_apf_lum_ip[g_i4_ip_funcs[mode]]                    */
extern pf_intra_pred g_apf_lum_ip[10];

/* Per-intra-mode index into g_apf_lum_ip (defined elsewhere) */
extern WORD32 g_i4_ip_funcs[MAX_NUM_IP_MODES];
116 
117 UWORD8 gau1_cu_pos_x[64] = { 0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7,
118                              6, 7, 4, 5, 4, 5, 6, 7, 6, 7, 0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1,
119                              2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 4, 5, 4, 5, 6, 7, 6, 7 };
120 
121 UWORD8 gau1_cu_pos_y[64] = { 0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3, 0, 0, 1, 1, 0, 0,
122                              1, 1, 2, 2, 3, 3, 2, 2, 3, 3, 4, 4, 5, 5, 4, 4, 5, 5, 6, 6, 7, 7,
123                              6, 6, 7, 7, 4, 4, 5, 5, 4, 4, 5, 5, 6, 6, 7, 7, 6, 6, 7, 7 };
124 
/* Clears bit number `bit` of the lvalue `x` in place and yields the new value. */
/* Both arguments are parenthesised so that expression arguments (for example   */
/* `idx & 3`) are not re-associated with the shift. `x` is evaluated twice, so  */
/* it must not have side effects.                                               */
#define RESET_BIT(x, bit) ((x) = (x) & ~((WORD32)1 << (bit)))
126 
127 /*****************************************************************************/
128 /* Function Definitions                                                      */
129 /*****************************************************************************/
130 
131 /*!
132 ******************************************************************************
133 * \if Function name : ihevce_update_cand_list \endif
134 *
135 * \brief
136 *    Final Candidate list population, nbr flag andd nbr mode update function
137 *
138 * \param[in] ps_row_cu : pointer to cu analyse struct
139 * \param[in] ps_cu_node : pointer to cu node info buffer
140 * \param[in] ps_ed_blk_l1 : pointer to level 1 and 2 decision buffer
141 * \param[in] pu1_cand_mode_list  : pointer to candidate list buffer
142 *
143 * \return
144 *    None
145 *
146 * \author
147 *  Ittiam
148 *
149 *****************************************************************************
150 */
ihevce_update_cand_list(ihevce_ipe_cu_tree_t * ps_cu_node,ihevce_ed_blk_t * ps_ed_blk_l1,ihevce_ipe_ctxt_t * ps_ctxt)151 void ihevce_update_cand_list(
152     ihevce_ipe_cu_tree_t *ps_cu_node, ihevce_ed_blk_t *ps_ed_blk_l1, ihevce_ipe_ctxt_t *ps_ctxt)
153 {
154     WORD32 row, col, x, y, size;
155 
156     /* Candidate mode Update */
157     (void)ps_ed_blk_l1;
158     /* Update CTB mode map for the finalised CU */
159     x = ((ps_cu_node->u2_x0 << 3) >> 2) + 1;
160     y = ((ps_cu_node->u2_y0 << 3) >> 2) + 1;
161     size = ps_cu_node->u1_cu_size >> 2;
162     for(row = y; row < (y + size); row++)
163     {
164         for(col = x; col < (x + size); col++)
165         {
166             ps_ctxt->au1_ctb_mode_map[row][col] = ps_cu_node->best_mode;
167         }
168     }
169     return;
170 }
171 
172 /*!
173 ******************************************************************************
174 * \if Function name : ihevce_intra_populate_mode_bits_cost_bracketing \endif
175 *
176 * \brief
177 *    Mpm indx calc function based on left and top available modes
178 *
179 * \param[in] top_intra_mode : Top available intra mode
180 * \param[in] left_intra_mode : Left available intra mode
181 * \param[in] available_top : Top availability flag
182 * \param[in] available_left : Left availability flag
183 * \param[in] cu_pos_y : cu position wrt to CTB
184 * \param[in] mode_bits_cost : pointer to mode bits buffer
185 * \param[in] lambda : Lambda value (SAD/SATD)
186 * \param[in] cand_mode_list  : pointer to candidate list buffer
187 *
188 * \return
189 *    None
190 *
191 * \author
192 *  Ittiam
193 *
194 *****************************************************************************
195 */
ihevce_intra_populate_mode_bits_cost_bracketing(WORD32 top_intra_mode,WORD32 left_intra_mode,WORD32 available_top,WORD32 available_left,WORD32 cu_pos_y,UWORD16 * mode_bits_cost,UWORD16 * mode_bits,WORD32 lambda,WORD32 * cand_mode_list)196 void ihevce_intra_populate_mode_bits_cost_bracketing(
197     WORD32 top_intra_mode,
198     WORD32 left_intra_mode,
199     WORD32 available_top,
200     WORD32 available_left,
201     WORD32 cu_pos_y,
202     UWORD16 *mode_bits_cost,
203     UWORD16 *mode_bits,
204     WORD32 lambda,
205     WORD32 *cand_mode_list)
206 {
207     /* local variables */
208     WORD32 i;
209     WORD32 cand_intra_pred_mode_left, cand_intra_pred_mode_top;
210 
211     UWORD16 one_bits_cost =
212         COMPUTE_RATE_COST_CLIP30(4, lambda, (LAMBDA_Q_SHIFT + 1));  //1.5 * lambda
213     UWORD16 two_bits_cost =
214         COMPUTE_RATE_COST_CLIP30(6, lambda, (LAMBDA_Q_SHIFT + 1));  //2.5 * lambda
215     UWORD16 five_bits_cost =
216         COMPUTE_RATE_COST_CLIP30(12, lambda, (LAMBDA_Q_SHIFT + 1));  //5.5 * lambda
217 
218     for(i = 0; i < 35; i++)
219     {
220         mode_bits_cost[i] = five_bits_cost;
221         mode_bits[i] = 5;
222     }
223 
224     /* EIID: set availability flag to zero if modes are invalid.
225        Required since some CU's might be skipped (though available)
226        and their modes will be set to 255 (-1)*/
227     if(35 < top_intra_mode || 0 > top_intra_mode)
228         available_top = 0;
229     if(35 < left_intra_mode || 0 > left_intra_mode)
230         available_left = 0;
231 
232     /* Calculate cand_intra_pred_mode_N as per sec. 8.4.2 in JCTVC-J1003_d7 */
233     /* N = top */
234     if(0 == available_top)
235     {
236         cand_intra_pred_mode_top = INTRA_DC;
237     }
238     /* for neighbour != INTRA, setting DC is done outside */
239     else if(0 == cu_pos_y) /* It's on the CTB boundary */
240     {
241         cand_intra_pred_mode_top = INTRA_DC;
242     }
243     else
244     {
245         cand_intra_pred_mode_top = top_intra_mode;
246     }
247 
248     /* N = left */
249     if(0 == available_left)
250     {
251         cand_intra_pred_mode_left = INTRA_DC;
252         //cand_intra_pred_mode_left = cand_intra_pred_mode_top;
253     }
254     /* for neighbour != INTRA, setting DC is done outside */
255     else
256     {
257         cand_intra_pred_mode_left = left_intra_mode;
258     }
259 
260     /* Calculate cand_mode_list as per sec. 8.4.2 in JCTVC-J1003_d7 */
261     if(cand_intra_pred_mode_left == cand_intra_pred_mode_top)
262     {
263         if(cand_intra_pred_mode_left < 2)
264         {
265             cand_mode_list[0] = INTRA_PLANAR;
266             cand_mode_list[1] = INTRA_DC;
267             cand_mode_list[2] = INTRA_ANGULAR(26); /* angular 26 = Vertical */
268         }
269         else
270         {
271             cand_mode_list[0] = cand_intra_pred_mode_left;
272             cand_mode_list[1] = 2 + ((cand_intra_pred_mode_left + 29) % 32);
273             cand_mode_list[2] = 2 + ((cand_intra_pred_mode_left - 2 + 1) % 32);
274         }
275     }
276     else
277     {
278         if(0 == available_left)
279         {
280             cand_mode_list[0] = cand_intra_pred_mode_top;
281             cand_mode_list[1] = cand_intra_pred_mode_left;
282         }
283         else
284         {
285             cand_mode_list[0] = cand_intra_pred_mode_left;
286             cand_mode_list[1] = cand_intra_pred_mode_top;
287         }
288         if((cand_intra_pred_mode_left != INTRA_PLANAR) &&
289            (cand_intra_pred_mode_top != INTRA_PLANAR))
290         {
291             cand_mode_list[2] = INTRA_PLANAR;
292         }
293         else if((cand_intra_pred_mode_left != INTRA_DC) && (cand_intra_pred_mode_top != INTRA_DC))
294         {
295             cand_mode_list[2] = INTRA_DC;
296         }
297         else
298         {
299             cand_mode_list[2] = INTRA_ANGULAR(26);
300         }
301     }
302     mode_bits_cost[cand_mode_list[0]] = one_bits_cost;
303     mode_bits_cost[cand_mode_list[1]] = two_bits_cost;
304     mode_bits_cost[cand_mode_list[2]] = two_bits_cost;
305 
306     mode_bits[cand_mode_list[0]] = 2;
307     mode_bits[cand_mode_list[1]] = 3;
308     mode_bits[cand_mode_list[2]] = 3;
309 }
310 
311 /*!
312 ******************************************************************************
313 * \if Function name : ihevce_pu_calc_4x4_blk \endif
314 *
315 * \brief
316 *    4x4 pu (8x8 CU) mode decision using step 8421 method
317 *
318 * \param[in] ps_cu_node : pointer to cu node info buffer
319 * \param[in] pu1_src : pointer to src pixels
320 * \param[in] src_stride : frm source stride
321 * \param[in] ref : pointer to reference pixels for prediction
322 * \param[in] cand_mode_list  : pointer to candidate list buffer
323 * \param[in] best_costs_4x4  : pointer to 3 best cost buffer
324 * \param[in] best_modes_4x4  : pointer to 3 best mode buffer
325 *
326 * \return
327 *    None
328 *
329 * \author
330 *  Ittiam
331 *
332 *****************************************************************************
333 */
ihevce_pu_calc_4x4_blk(ihevce_ipe_ctxt_t * ps_ctxt,ihevce_ipe_cu_tree_t * ps_cu_node,UWORD8 * pu1_src,WORD32 src_stride,UWORD8 * ref,UWORD16 * mode_bits_cost,WORD32 * best_costs_4x4,UWORD8 * best_modes_4x4,func_selector_t * ps_func_selector)334 void ihevce_pu_calc_4x4_blk(
335     ihevce_ipe_ctxt_t *ps_ctxt,
336     ihevce_ipe_cu_tree_t *ps_cu_node,
337     UWORD8 *pu1_src,
338     WORD32 src_stride,
339     UWORD8 *ref,
340     UWORD16 *mode_bits_cost,
341     WORD32 *best_costs_4x4,
342     UWORD8 *best_modes_4x4,
343     func_selector_t *ps_func_selector)
344 {
345     WORD16 *pi2_trans_tmp = ps_ctxt->pi2_trans_tmp;
346     WORD16 *pi2_trans_out = ps_ctxt->pi2_trans_out;
347     UWORD8 u1_use_satd = ps_ctxt->u1_use_satd;
348     UWORD8 u1_level_1_refine_on = ps_ctxt->u1_level_1_refine_on;
349 
350     WORD32 i, j = 0, i_end;
351     UWORD8 mode, best_amode = 255;
352     UWORD8 pred[16];
353 
354     UWORD16 sad;
355     WORD32 sad_cost = 0;
356     WORD32 best_asad_cost = 0xFFFFF;
357     WORD32 temp;
358     UWORD8 modes_to_eval[5];
359     WORD32 costs_4x4[5];
360     UWORD8 modes_4x4[5] = { 0, 1, 2, 3, 4 };
361 
362     /* LO resolution hence low resolution disable */
363     WORD32 u1_low_resol = 0;
364     UWORD8 au1_best_modes[1] = { 0 };
365     WORD32 ai4_best_sad_costs[1] = { 0 };
366 
367     WORD16 *pi2_tmp = &pi2_trans_tmp[0];
368 
369     ihevce_ipe_optimised_function_list_t *ps_ipe_optimised_function_list =
370         &ps_ctxt->s_ipe_optimised_function_list;
371 
372     //apf_resd_trns[0] = &ihevc_resi_trans_4x4_ttype1;
373     //apf_resd_trns[0] = &ihevc_HAD_4x4_8bit;
374 
375     for(i = 0; i < 5; i++)
376     {
377         costs_4x4[i] = MAX_INTRA_COST_IPE;
378     }
379 
380     ps_ipe_optimised_function_list->pf_ed_4x4_find_best_modes(
381         pu1_src,
382         src_stride,
383         ref,
384         mode_bits_cost,
385         au1_best_modes,
386         ai4_best_sad_costs,
387         u1_low_resol,
388         ps_ipe_optimised_function_list->pf_4x4_sad_computer);
389 
390     best_amode = au1_best_modes[0];
391     best_asad_cost = ai4_best_sad_costs[0];
392 
393     ASSERT(best_amode != 255);
394     /* Around best level 4 angular mode, search for best level 2 mode */
395     modes_to_eval[0] = best_amode - 2;
396     modes_to_eval[1] = best_amode + 2;
397     i = 0;
398     i_end = 2;
399     if(best_amode == 2)
400         i = 1;
401     else if(best_amode == 34)
402         i_end = 1;
403     for(; i < i_end; i++)
404     {
405         mode = modes_to_eval[i];
406 
407         g_apf_lum_ip[g_i4_ip_funcs[mode]](&ref[0], 0, &pred[0], 4, 4, mode);
408 
409         sad = ps_ipe_optimised_function_list->pf_4x4_sad_computer(pu1_src, &pred[0], src_stride, 4);
410 
411         sad_cost = sad;
412         sad_cost += mode_bits_cost[mode];
413 
414         if(sad_cost < best_asad_cost)
415         {
416             best_amode = mode;
417             best_asad_cost = sad_cost;
418         }
419     }
420 
421     /* Around best level 2 angular mode, search for best level 1 mode */
422     /* Also evaluate for non-angular mode */
423 
424     i = 0;
425     /*Level 1 refinement is disabled for ES preset */
426     if(1 == u1_level_1_refine_on)
427     {
428         if(best_amode != 2)
429             modes_to_eval[i++] = best_amode - 1;
430         modes_to_eval[i++] = best_amode;
431     }
432 
433     modes_to_eval[i++] = 0;
434     modes_to_eval[i++] = 1;
435 
436     if(1 == u1_level_1_refine_on)
437     {
438         if(best_amode != 34)
439             modes_to_eval[i++] = best_amode + 1;
440     }
441     i_end = i;
442     i = 0;
443 
444     for(; i < i_end; i++)
445     {
446         mode = modes_to_eval[i];
447 
448         g_apf_lum_ip[g_i4_ip_funcs[mode]](&ref[0], 0, &pred[0], 4, 4, mode);
449 
450         /* Hard coding to use SATD */
451         if(u1_use_satd)
452         {
453             ps_func_selector->ihevc_resi_trans_4x4_ttype1_fptr(
454                 pu1_src, &pred[0], (WORD32 *)pi2_tmp, pi2_trans_out, src_stride, 4, 4, NULL_PLANE);
455 
456             sad = ihevce_ipe_pass_satd(pi2_trans_out, 4, 4);
457         }
458         else
459         {
460             sad = ps_ipe_optimised_function_list->pf_4x4_sad_computer(
461                 pu1_src, &pred[0], src_stride, 4);
462         }
463         sad_cost = sad;
464         sad_cost += mode_bits_cost[mode];
465 
466         costs_4x4[i] = sad_cost;
467     }
468 
469     /* Arrange the reference array in ascending order */
470     for(i = 0; i < (i_end - 1); i++)
471     {
472         for(j = i + 1; j < i_end; j++)
473         {
474             if(costs_4x4[i] > costs_4x4[j])
475             {
476                 temp = costs_4x4[i];
477                 costs_4x4[i] = costs_4x4[j];
478                 costs_4x4[j] = temp;
479 
480                 temp = modes_4x4[i];
481                 modes_4x4[i] = modes_4x4[j];
482                 modes_4x4[j] = temp;
483             }
484         }
485     }
486     for(i = 0; i < 3; i++)
487     {
488         best_costs_4x4[i] = costs_4x4[i];
489         best_modes_4x4[i] = modes_to_eval[modes_4x4[i]];
490     }
491 
492     {
493         ps_cu_node->best_mode = best_modes_4x4[0];
494         ps_cu_node->best_cost = best_costs_4x4[0];
495         ps_cu_node->best_satd = best_costs_4x4[0] - mode_bits_cost[ps_cu_node->best_mode];
496     }
497 }
498 
499 /*!
500 ******************************************************************************
501 * \if Function name : ihevce_pu_calc_8x8_blk \endif
502 *
503 * \brief
504 *    4x4 pu (8x8 CU) mode decision loop using step 8421 method
505 *
506 * \param[in] ps_curr_src : pointer to src pixels struct
507 * \param[in] ps_ctxt : pointer to IPE context struct
508 * \param[in] ps_cu_node : pointer to cu node info buffer
509 *
510 * \return
511 *    None
512 *
513 * \author
514 *  Ittiam
515 *
516 *****************************************************************************
517 */
ihevce_pu_calc_8x8_blk(iv_enc_yuv_buf_t * ps_curr_src,ihevce_ipe_ctxt_t * ps_ctxt,ihevce_ipe_cu_tree_t * ps_cu_node,func_selector_t * ps_func_selector)518 void ihevce_pu_calc_8x8_blk(
519     iv_enc_yuv_buf_t *ps_curr_src,
520     ihevce_ipe_ctxt_t *ps_ctxt,
521     ihevce_ipe_cu_tree_t *ps_cu_node,
522     func_selector_t *ps_func_selector)
523 {
524     WORD32 i, j;
525     WORD32 nbr_flags;
526     nbr_avail_flags_t s_nbr;
527     WORD32 trans_size = ps_cu_node->ps_parent->u1_cu_size >> 1;
528 
529     UWORD8 *pu1_src_4x4;
530     WORD32 xA, xB, yA, yB;
531     //WORD32 x, y, size;
532     WORD32 top_intra_mode;
533     WORD32 left_intra_mode;
534     //    WORD8 *top_intra_mode_ptr;
535     //  WORD8 *left_intra_mode_ptr;
536     UWORD8 *pu1_orig;
537     WORD32 src_strd = ps_curr_src->i4_y_strd;
538 
539     WORD32 cu_pos_x = ps_cu_node->ps_parent->u2_x0 << 1;
540     WORD32 cu_pos_y = ps_cu_node->ps_parent->u2_y0 << 1;
541     ihevc_intra_pred_luma_ref_substitution_ft *ihevc_intra_pred_luma_ref_substitution_fptr;
542 
543     ihevc_intra_pred_luma_ref_substitution_fptr =
544         ps_ctxt->ps_func_selector->ihevc_intra_pred_luma_ref_substitution_fptr;
545 
546     pu1_orig = (UWORD8 *)(ps_curr_src->pv_y_buf) +
547                ((ps_cu_node->ps_parent->u2_y0 << 3) * src_strd) +
548                (ps_cu_node->ps_parent->u2_x0 << 3);
549     for(i = 0; i < 2; i++)
550     {
551         for(j = 0; j < 2; j++)
552         {
553             WORD32 cand_mode_list[3];
554             pu1_src_4x4 = pu1_orig + (i * trans_size * src_strd) + (j * trans_size);
555             /* get the neighbour availability flags */
556             nbr_flags = ihevce_get_nbr_intra(
557                 &s_nbr,
558                 ps_ctxt->pu1_ctb_nbr_map,
559                 ps_ctxt->i4_nbr_map_strd,
560                 cu_pos_x + ((j) * (trans_size >> 2)),
561                 cu_pos_y + ((i) * (trans_size >> 2)),
562                 trans_size >> 2);
563 
564             /* call the function which populates sad cost for all the modes */
565             xA = ((ps_cu_node->ps_parent->u2_x0 << 3) >> 2) + j;
566             yA = ((ps_cu_node->ps_parent->u2_y0 << 3) >> 2) + 1 + i;
567             xB = xA + 1;
568             yB = yA - 1;
569             left_intra_mode = ps_ctxt->au1_ctb_mode_map[yA][xA];
570             top_intra_mode = ps_ctxt->au1_ctb_mode_map[yB][xB];
571 
572             ihevce_intra_populate_mode_bits_cost_bracketing(
573                 top_intra_mode,
574                 left_intra_mode,
575                 s_nbr.u1_top_avail,
576                 s_nbr.u1_left_avail,
577                 ps_cu_node->ps_parent->u2_y0,
578                 &ps_ctxt->au2_mode_bits_cost_8x8pu[i * 2 + j][0],
579                 &ps_ctxt->au2_mode_bits_8x8_pu[0],
580                 ps_ctxt->i4_ol_sad_lambda,
581                 cand_mode_list);
582 
583             /* call the function which populates ref data for intra predicion */
584             ihevc_intra_pred_luma_ref_substitution_fptr(
585                 pu1_src_4x4 - src_strd - 1,
586                 pu1_src_4x4 - src_strd,
587                 pu1_src_4x4 - 1,
588                 src_strd,
589                 4,
590                 nbr_flags,
591                 &ps_ctxt->au1_ref_8x8pu[i * 2 + j][0],
592                 0);
593 
594             ihevce_pu_calc_4x4_blk(
595                 ps_ctxt,
596                 ps_cu_node->ps_sub_cu[(i * 2) + j],
597                 pu1_src_4x4,
598                 src_strd,
599                 &ps_ctxt->au1_ref_8x8pu[i * 2 + j][0],
600                 &ps_ctxt->au2_mode_bits_cost_8x8pu[i * 2 + j][0],
601                 &ps_cu_node->ps_sub_cu[(i * 2) + j]->au4_best_cost_1tu[0],
602                 &ps_cu_node->ps_sub_cu[(i * 2) + j]->au1_best_mode_1tu[0],
603                 ps_func_selector);
604 
605             /*&au4_cost_4x4[i*2 + j][0],
606                 &au1_modes_4x4[i*2 + j][0]);*/ //TTODO : mode will change for the four partition
607 
608             ihevce_set_nbr_map(
609                 ps_ctxt->pu1_ctb_nbr_map,
610                 ps_ctxt->i4_nbr_map_strd,
611                 cu_pos_x + ((j) * (trans_size >> 2)),
612                 cu_pos_y + ((i) * (trans_size >> 2)),
613                 (trans_size >> 2),
614                 1);
615 
616             xA = ((ps_cu_node->ps_parent->u2_x0 << 3) >> 2) + 1 + j;
617             yA = ((ps_cu_node->ps_parent->u2_y0 << 3) >> 2) + 1 + i;
618             ps_ctxt->au1_ctb_mode_map[yA][xA] = ps_cu_node->ps_sub_cu[i * 2 + j]->best_mode;
619             ps_cu_node->ps_sub_cu[i * 2 + j]->u2_mode_bits_cost =
620                 ps_ctxt->au2_mode_bits_8x8_pu[ps_cu_node->ps_sub_cu[i * 2 + j]->best_mode];
621         }
622     }
623 }
624 
625 /*!
626 ******************************************************************************
627 * \if Function name : ihevce_bracketing_analysis \endif
628 *
629 * \brief
630 *    Interface function that evaluates MAX cu and MAX - 1 cu, with MAX cu size
631 *    info decided coarse resolution mode decision. Compares the SATD/SAD cost btwn
632 *    2 CUS and determines the actual CU size and best 3 modes to be given to rdopt
633 *
634 * \param[in] ps_ctxt : pointer to IPE context struct
635 * \param[in] ps_cu_node : pointer to cu node info buffer
636 * \param[in] ps_curr_src : pointer to src pixels struct
637 * \param[in] ps_ctb_out : pointer to ip ctb out struct
638 * \param[in] ps_row_cu : pointer to cu analyse struct
639 * \param[in] ps_ed_l1_ctb : pointer to level 1 early deci struct
640 * \param[in] ps_ed_l2_ctb : pointer to level 2 early deci struct
641 * \param[in] ps_l0_ipe_out_ctb : pointer to ipe_l0_ctb_analyse_for_me_t struct
642 *
643 * \return
644 *    None
645 *
646 * \author
647 *  Ittiam
648 *
649 *****************************************************************************
650 */
ihevce_bracketing_analysis(ihevce_ipe_ctxt_t * ps_ctxt,ihevce_ipe_cu_tree_t * ps_cu_node,iv_enc_yuv_buf_t * ps_curr_src,ctb_analyse_t * ps_ctb_out,ihevce_ed_blk_t * ps_ed_l1_ctb,ihevce_ed_blk_t * ps_ed_l2_ctb,ihevce_ed_ctb_l1_t * ps_ed_ctb_l1,ipe_l0_ctb_analyse_for_me_t * ps_l0_ipe_out_ctb)651 void ihevce_bracketing_analysis(
652     ihevce_ipe_ctxt_t *ps_ctxt,
653     ihevce_ipe_cu_tree_t *ps_cu_node,
654     iv_enc_yuv_buf_t *ps_curr_src,
655     ctb_analyse_t *ps_ctb_out,
656     //cu_analyse_t         *ps_row_cu,
657     ihevce_ed_blk_t *ps_ed_l1_ctb,
658     ihevce_ed_blk_t *ps_ed_l2_ctb,
659     ihevce_ed_ctb_l1_t *ps_ed_ctb_l1,
660     ipe_l0_ctb_analyse_for_me_t *ps_l0_ipe_out_ctb)
661 {
662     WORD32 cu_pos_x = 0;
663     WORD32 cu_pos_y = 0;
664 
665     UWORD8 u1_curr_ctb_wdt = ps_cu_node->u1_width;
666     UWORD8 u1_curr_ctb_hgt = ps_cu_node->u1_height;
667     WORD32 num_8x8_blks_x = (u1_curr_ctb_wdt >> 3);
668     WORD32 num_8x8_blks_y = (u1_curr_ctb_hgt >> 3);
669 
670     ihevce_ed_blk_t *ps_ed_blk_l1 = ps_ed_l1_ctb;
671     ihevce_ed_blk_t *ps_ed_blk_l2 = ps_ed_l2_ctb;
672 
673     WORD32 i;
674     WORD32 cand_mode_list[3];
675     //cu_analyse_t *ps_curr_cu = ps_row_cu;
676     WORD32 blk_cnt = 0;
677     WORD32 j = 0;
678     WORD32 merge_32x32_l1, merge_32x32_l2;
679 
680     WORD32 i4_skip_intra_eval_32x32_l1;
681     //EIID: flag indicating number of 16x16 blocks to be skipped for intra evaluation within 32x32 block
682 
683     WORD32 parent_cost = 0;
684     WORD32 child_cost[4] = { 0 };
685     WORD32 child_cost_least = 0;
686     WORD32 child_satd[4] = { 0 };
687     WORD32 x, y, size;
688     WORD32 merge_64x64 = 1;
689     UWORD8 au1_best_32x32_modes[4];
690     WORD32 au4_best_32x32_cost[4];
691     WORD32 parent_best_mode;
692     UWORD8 best_mode;
693 
694     WORD32 i4_quality_preset = ps_ctxt->i4_quality_preset;
695     /* flag to control 1CU-4TU modes based on quality preset                */
696     /* if set 1CU-4TU are explicity evaluated else 1CU-1TU modes are copied */
697     WORD32 i4_enable_1cu_4tu = (i4_quality_preset == IHEVCE_QUALITY_P2) ||
698                                (i4_quality_preset == IHEVCE_QUALITY_P0);
699 
700     /* flag to control 4CU-16TU mode based on quality preset                */
701     /* if set 4CU-16TU are explicity evaluated else 4CU-4TU modes are copied*/
702     WORD32 i4_enable_4cu_16tu = (i4_quality_preset == IHEVCE_QUALITY_P2) ||
703                                 (i4_quality_preset == IHEVCE_QUALITY_P0);
704 
705     WORD32 i4_mod_factor_num, i4_mod_factor_den = QP_MOD_FACTOR_DEN;  //2;
706     float f_strength;
707     /* Accumalte satd */
708     LWORD64 i8_frame_acc_satd_cost = 0, i8_frame_acc_satd_by_modqp_q10 = 0;
709     WORD32 i4_ctb_acc_satd = 0;
710 
711     /* Accumalate Mode bits cost */
712     LWORD64 i8_frame_acc_mode_bits_cost = 0;
713 
714     /* Step2 is bypassed for parent, uses children modes*/
715     WORD32 step2_bypass = 1;
716 
717     if(1 == ps_ctxt->u1_disable_child_cu_decide)
718         step2_bypass = 0;
719 
720     ps_cu_node->ps_parent = ps_ctxt->ps_ipe_cu_tree;
721     for(i = 0; i < 4; i++)
722     {
723         ps_cu_node->ps_sub_cu[i] = ps_ctxt->ps_ipe_cu_tree + 1 + i;
724     }
725 
726     /* Loop for all 8x8 block in a CTB */
727     ps_ctb_out->u4_cu_split_flags = 0x1;
728 
729     /* Initialize intra 64x64, 32x32 and 16x16 costs to max value */
730     for(i = 0; i < (MAX_CU_IN_CTB >> 4); i++)
731     {
732         ps_l0_ipe_out_ctb->ai4_best32x32_intra_cost[i] = MAX_INTRA_COST_IPE;
733     }
734 
735     for(i = 0; i < (MAX_CU_IN_CTB >> 2); i++)
736     {
737         ps_l0_ipe_out_ctb->ai4_best16x16_intra_cost[i] = MAX_INTRA_COST_IPE;
738     }
739 
740     for(i = 0; i < (MAX_CU_IN_CTB); i++)
741     {
742         ps_l0_ipe_out_ctb->ai4_best8x8_intra_cost[i] = MAX_INTRA_COST_IPE;
743     }
744 
745     ps_l0_ipe_out_ctb->i4_best64x64_intra_cost = MAX_INTRA_COST_IPE;
746 
747     /* by default 64x64 modes are set to default values DC and Planar */
748     ps_l0_ipe_out_ctb->au1_best_modes_32x32_tu[0] = 0;
749     ps_l0_ipe_out_ctb->au1_best_modes_32x32_tu[1] = 1;
750     ps_l0_ipe_out_ctb->au1_best_modes_32x32_tu[2] = 255;
751 
752     /* by default 64x4 split is set to 1 */
753     ps_l0_ipe_out_ctb->u1_split_flag = 1;
754 
755     /* Modulation factor calculated based on spatial variance instead of hardcoded val*/
756     i4_mod_factor_num = ps_ctxt->ai4_mod_factor_derived_by_variance[1];  //16;
757 
758     f_strength = ps_ctxt->f_strength;
759 
760     /* ------------------------------------------------ */
761     /* populate the early decisions done by L1 analysis */
762     /* ------------------------------------------------ */
763     for(i = 0; i < (MAX_CU_IN_CTB >> 2); i++)
764     {
765         ps_l0_ipe_out_ctb->ai4_best_sad_8x8_l1_ipe[i] = ps_ed_ctb_l1->i4_best_sad_8x8_l1_ipe[i];
766         ps_l0_ipe_out_ctb->ai4_best_sad_cost_8x8_l1_ipe[i] = ps_ed_ctb_l1->i4_best_sad_cost_8x8_l1_ipe[i];
767         ps_l0_ipe_out_ctb->ai4_best_sad_8x8_l1_me[i] = ps_ed_ctb_l1->i4_best_sad_8x8_l1_me[i];
768         ps_l0_ipe_out_ctb->ai4_best_sad_cost_8x8_l1_me[i] = ps_ed_ctb_l1->i4_best_sad_cost_8x8_l1_me[i];
769     }
770 
771     /* Init CTB level accumalated SATD and MPM bits */
772     ps_l0_ipe_out_ctb->i4_ctb_acc_satd = 0;
773     ps_l0_ipe_out_ctb->i4_ctb_acc_mpm_bits = 0;
774 
775     /* ------------------------------------------------ */
776     /* Loop over all the blocks in current CTB          */
777     /* ------------------------------------------------ */
778     {
779         /* 64 8x8 blocks should be encountered for the do,while loop to exit */
780         do
781         {
782             intra32_analyse_t *ps_intra32_analyse;
783             intra16_analyse_t *ps_intra16_analyse;
784             WORD32 *pi4_intra_32_cost;
785             WORD32 *pi4_intra_16_cost;
786             WORD32 *pi4_intra_8_cost;
787             WORD32 merge_16x16_l1;
788 
789             /* Given the blk_cnt, get the CU's top-left 8x8 block's x and y positions within the CTB */
790             cu_pos_x = gau1_cu_pos_x[blk_cnt];
791             cu_pos_y = gau1_cu_pos_y[blk_cnt];
792 
793             /* default value for 32x32 best mode - blk_cnt increases by 16 for each 32x32 */
794             au1_best_32x32_modes[blk_cnt >> 4] = 255;
795 
796             /* get the corresponding intra 32 analyse pointer  use (blk_cnt / 16) */
797             /* blk cnt is in terms of 8x8 units so a 32x32 will have 16 8x8 units */
798             ps_intra32_analyse = &ps_l0_ipe_out_ctb->as_intra32_analyse[blk_cnt >> 4];
799 
800             /* get the corresponding intra 16 analyse pointer use (blk_cnt & 0xF / 4)*/
801             /* blk cnt is in terms of 8x8 units so a 16x16 will have 4 8x8 units */
802             ps_intra16_analyse = &ps_intra32_analyse->as_intra16_analyse[(blk_cnt & 0xF) >> 2];
803 
804             /* Line below assumes min_cu_size of 8 - checks whether CU starts are within picture */
805             if((cu_pos_x < num_8x8_blks_x) && (cu_pos_y < num_8x8_blks_y))
806             {
807                 /* Reset to zero for every cu decision */
808                 merge_32x32_l1 = 0;
809 
810                 child_cost_least = 0;
811 
812                 /* At L2, each 4x4 corresponds to 16x16 at L0. Every 4 16x16 stores a merge_success flag */
813                 ps_ed_blk_l2 = ps_ed_l2_ctb + (blk_cnt >> 2);
814 
815                 pi4_intra_32_cost = &ps_l0_ipe_out_ctb->ai4_best32x32_intra_cost[blk_cnt >> 4];
816 
817                 /* by default 32x32 modes are set to default values DC and Planar */
818                 ps_intra32_analyse->au1_best_modes_32x32_tu[0] = 0;
819                 ps_intra32_analyse->au1_best_modes_32x32_tu[1] = 1;
820                 ps_intra32_analyse->au1_best_modes_32x32_tu[2] = 255;
821 
822                 /* By default 32x32 split is set to 1 */
823                 ps_intra32_analyse->b1_split_flag = 1;
824 
825                 ps_intra32_analyse->au1_best_modes_16x16_tu[0] = 0;
826                 ps_intra32_analyse->au1_best_modes_16x16_tu[1] = 1;
827                 ps_intra32_analyse->au1_best_modes_16x16_tu[2] = 255;
828 
829                 /* 16x16 cost & 8x8 cost are stored in Raster scan order */
830                 /* stride of 16x16 buffer is MAX_CU_IN_CTB_ROW >> 1      */
831                 /* stride of 8x8 buffer is MAX_CU_IN_CTB_ROW             */
832                 {
833                     WORD32 pos_x_8x8, pos_y_8x8;
834 
835                     pos_x_8x8 = gau1_cu_pos_x[blk_cnt];
836                     pos_y_8x8 = gau1_cu_pos_y[blk_cnt];
837 
838                     pi4_intra_16_cost = &ps_l0_ipe_out_ctb->ai4_best16x16_intra_cost[0];
839 
840                     pi4_intra_16_cost +=
841                         ((pos_x_8x8 >> 1) + ((pos_y_8x8 >> 1) * (MAX_CU_IN_CTB_ROW >> 1)));
842 
843                     pi4_intra_8_cost = &ps_l0_ipe_out_ctb->ai4_best8x8_intra_cost[0];
844 
845                     pi4_intra_8_cost += (pos_x_8x8 + (pos_y_8x8 * MAX_CU_IN_CTB_ROW));
846                 }
847 
848                 merge_32x32_l1 = 0;
849                 merge_32x32_l2 = 0;
850                 i4_skip_intra_eval_32x32_l1 = 0;
851 
852                 /* Enable 16x16 merge iff sufficient 8x8 blocks remain in the current CTB */
853                 merge_16x16_l1 = 0;
854                 if(((num_8x8_blks_x - cu_pos_x) >= 2) && ((num_8x8_blks_y - cu_pos_y) >= 2))
855                 {
856 #if !ENABLE_UNIFORM_CU_SIZE_8x8
857                     merge_16x16_l1 = ps_ed_blk_l1->merge_success;
858 #else
859                     merge_16x16_l1 = 0;
860 #endif
861                 }
862 
863                 /* Enable 32x32 merge iff sufficient 8x8 blocks remain in the current CTB */
864                 if(((num_8x8_blks_x - cu_pos_x) >= 4) && ((num_8x8_blks_y - cu_pos_y) >= 4))
865                 {
866                     /* Check 4 flags of L1(8x8) say merge */
867                     for(i = 0; i < 4; i++)
868                     {
869                         merge_32x32_l1 += (ps_ed_blk_l1 + (i * 4))->merge_success;
870 
871                         //EIID: num 16x16 blocks for which inter_intra flag says eval only inter, i.e. skip intra eval
872                         i4_skip_intra_eval_32x32_l1 +=
873                             ((ps_ed_blk_l1 + (i * 4))->intra_or_inter == 2) ? 1 : 0;
874                     }
875 
876 #if !ENABLE_UNIFORM_CU_SIZE_8x8
877                     /* Check 1 flag from L2(16x16) say merge */
878                     merge_32x32_l2 = ps_ed_blk_l2->merge_success;
879 #else
880                     merge_32x32_l1 = 0;
881                     merge_32x32_l2 = 0;
882 #endif
883                 }
884 
885 #if DISABLE_L2_IPE_IN_PB_L1_IN_B
886                 if((i4_quality_preset == IHEVCE_QUALITY_P6) && (ps_ctxt->i4_slice_type != ISLICE))
887                 {
888                     merge_32x32_l2 = 0;
889                     ps_ed_blk_l2->merge_success = 0;
890                 }
891 #endif
892 
893                 ps_intra32_analyse->b1_valid_cu = 1;
894 
895                 /* If Merge success from all 4 L1 and L2, max CU size 32x32 is chosen */
896                 /* EIID: if all blocks to be skipped then skip entire 32x32 for intra eval,
897                 if no blocks to be skipped then eval entire 32x32,
898                 else break the merge and go to 16x16 level eval */
899                 if((merge_32x32_l1 == 4) && merge_32x32_l2 &&
900                    ((i4_skip_intra_eval_32x32_l1 == 0) ||
901                     (i4_skip_intra_eval_32x32_l1 == 4))  //comment this line to disable break-merge
902                 )
903                 {
904 #if IP_DBG_L1_l2
905                     /* Populate params for 32x32 block analysis */
906                     ps_cu_node->ps_parent->best_cost = MAX_INTRA_COST_IPE;
907 
908                     ps_cu_node->ps_parent->u1_cu_size = 32;
909                     ps_cu_node->ps_parent->u2_x0 = gau1_cu_pos_x[blk_cnt]; /* Populate properly */
910                     ps_cu_node->ps_parent->u2_y0 = gau1_cu_pos_y[blk_cnt]; /* Populate properly */
911                     ps_cu_node->ps_parent->best_mode = ps_ed_blk_l2->best_merge_mode;
912                     /* CU size 32x32 and fill the final cu params */
913 
914                     ihevce_update_cand_list(ps_cu_node->ps_parent, ps_ed_blk_l1, ps_ctxt);
915 
916                     /* Increment pointers */
917                     ps_ed_blk_l1 += 16;
918                     blk_cnt += 16;
919                     ps_row_cu++;
920                     merge_64x64 &= 1;
921 #else
922 
923                     /* EIID: dont evaluate if all 4 blocks at L1 said inter is winning*/
924                     if(4 == i4_skip_intra_eval_32x32_l1 && (ps_ctxt->i4_slice_type != ISLICE))
925                     {
926                         WORD32 i4_local_ctr1, i4_local_ctr2;
927 
928                         ps_cu_node->ps_parent->best_cost = MAX_INTRA_COST_IPE;
929 
930                         ps_cu_node->ps_parent->u1_cu_size = 32;
931                         ps_cu_node->ps_parent->u2_x0 =
932                             gau1_cu_pos_x[blk_cnt]; /* Populate properly */
933                         ps_cu_node->ps_parent->u2_y0 =
934                             gau1_cu_pos_y[blk_cnt]; /* Populate properly */
935                         ps_cu_node->ps_parent->best_mode =
936                             INTRA_DC;  //ps_ed_blk_l2->best_merge_mode;
937                         /* CU size 32x32 and fill the final cu params */
938 
939                         /* fill all 3 candidate modes (1tu and 4tu) with INTRA_DC */
940                         ps_cu_node->ps_parent->au1_best_mode_1tu[0] = INTRA_DC;
941                         ps_cu_node->ps_parent->au1_best_mode_1tu[1] =
942                             INTRA_DC;  //for safery. Since update_cand_list will set num_modes as 3
943                         ps_cu_node->ps_parent->au1_best_mode_1tu[2] = INTRA_DC;
944 
945                         ps_cu_node->ps_parent->au1_best_mode_4tu[0] = INTRA_DC;
946                         ps_cu_node->ps_parent->au1_best_mode_4tu[1] = INTRA_DC;
947                         ps_cu_node->ps_parent->au1_best_mode_4tu[2] = INTRA_DC;
948 
949                         ihevce_update_cand_list(ps_cu_node->ps_parent, ps_ed_blk_l1, ps_ctxt);
950 
951                         //ps_row_cu->s_cu_intra_cand.b6_num_intra_cands = 0;
952                         //ps_row_cu->u1_num_intra_rdopt_cands = 0;
953 
954                         ps_intra32_analyse->b1_valid_cu = 0;
955                         ps_intra32_analyse->b1_split_flag = 0;
956                         ps_intra32_analyse->b1_merge_flag = 0;
957                         /*memset (&ps_intra32_analyse->au1_best_modes_32x32_tu,
958                         255,
959                         NUM_BEST_MODES);
960                         memset (&ps_intra32_analyse->au1_best_modes_16x16_tu,
961                         255,
962                         NUM_BEST_MODES);*/
963                         //set only first mode since if it's 255. it wont go ahead
964                         ps_intra32_analyse->au1_best_modes_32x32_tu[0] = 255;
965                         ps_intra32_analyse->au1_best_modes_16x16_tu[0] = 255;
966 
967                         *pi4_intra_32_cost = MAX_INTRA_COST_IPE;
968 
969                         /*since ME will start evaluating from bottom up, set the lower
970                         cu size data invalid */
971                         for(i4_local_ctr1 = 0; i4_local_ctr1 < 4; i4_local_ctr1++)
972                         {
973                             WORD32 *pi4_intra_8_cost_curr16;
974 
975                             ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
976                                 .au1_best_modes_16x16_tu[0] = 255;
977                             ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
978                                 .au1_best_modes_8x8_tu[0] = 255;
979                             ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1].b1_merge_flag = 0;
980                             ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1].b1_valid_cu = 0;
981                             ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1].b1_split_flag = 0;
982 
983                             pi4_intra_16_cost
984                                 [(i4_local_ctr1 & 1) + ((MAX_CU_IN_CTB_ROW >> 1) *
985                                                         (i4_local_ctr1 >> 1))] = MAX_INTRA_COST_IPE;
986 
987                             pi4_intra_8_cost_curr16 = pi4_intra_8_cost + ((i4_local_ctr1 & 1) << 1);
988                             pi4_intra_8_cost_curr16 +=
989                                 ((i4_local_ctr1 >> 1) << 1) * MAX_CU_IN_CTB_ROW;
990 
991                             for(i4_local_ctr2 = 0; i4_local_ctr2 < 4; i4_local_ctr2++)
992                             {
993                                 ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
994                                     .as_intra8_analyse[i4_local_ctr2]
995                                     .au1_4x4_best_modes[0][0] = 255;
996                                 ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
997                                     .as_intra8_analyse[i4_local_ctr2]
998                                     .au1_4x4_best_modes[1][0] = 255;
999                                 ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
1000                                     .as_intra8_analyse[i4_local_ctr2]
1001                                     .au1_4x4_best_modes[2][0] = 255;
1002                                 ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
1003                                     .as_intra8_analyse[i4_local_ctr2]
1004                                     .au1_4x4_best_modes[3][0] = 255;
1005                                 ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
1006                                     .as_intra8_analyse[i4_local_ctr2]
1007                                     .au1_best_modes_8x8_tu[0] = 255;
1008                                 ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
1009                                     .as_intra8_analyse[i4_local_ctr2]
1010                                     .au1_best_modes_4x4_tu[0] = 255;
1011                                 ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
1012                                     .as_intra8_analyse[i4_local_ctr2]
1013                                     .b1_valid_cu = 0;
1014 
1015                                 pi4_intra_8_cost_curr16
1016                                     [(i4_local_ctr2 & 1) +
1017                                      (MAX_CU_IN_CTB_ROW * (i4_local_ctr2 >> 1))] =
1018                                         MAX_INTRA_COST_IPE;
1019                             }
1020                         }
1021 
1022                         /* set neighbours even if intra is not evaluated, since source is always available. */
1023                         ihevce_set_nbr_map(
1024                             ps_ctxt->pu1_ctb_nbr_map,
1025                             ps_ctxt->i4_nbr_map_strd,
1026                             ps_cu_node->ps_parent->u2_x0 << 1,
1027                             ps_cu_node->ps_parent->u2_y0 << 1,
1028                             (ps_cu_node->ps_parent->u1_cu_size >> 2),
1029                             1);
1030 
1031                         /* cost accumulation of best cu size candidate */
1032                         /*i8_frame_acc_satd_cost += parent_cost;*/
1033 
1034                         /* Mode bits cost accumulation for best cu size and cu mode */
1035                         /*i8_frame_acc_mode_bits_cost += ps_cu_node->ps_parent->u2_mode_bits_cost;*/
1036 
1037                         /*satd/mod_qp accumulation of best cu */
1038                         /*i8_frame_acc_satd_by_modqp_q10 += ((LWORD64)ps_cu_node->ps_parent->best_satd << (SATD_BY_ACT_Q_FAC + QSCALE_Q_FAC_3))/i4_q_scale_q3_mod;*/
1039 
1040                         /* Increment pointers */
1041                         ps_ed_blk_l1 += 16;
1042                         blk_cnt += 16;
1043                         //ps_row_cu++;
1044                         merge_64x64 = 0;
1045 
1046                         /* increment for stat purpose only. Increment is valid only on single thread */
1047                         ps_ctxt->u4_num_16x16_skips_at_L0_IPE += 4;
1048                     }
1049                     else
1050                     {
1051                         /* Re-evaluation of 4 16x16 blocks at 8x8 prediction level */
1052                         //memcpy(ps_ctxt->ai1_ctb_mode_map_temp, ps_ctxt->ai1_ctb_mode_map, sizeof(ps_ctxt->ai1_ctb_mode_map));
1053 
1054                         if((ps_ctxt->i4_quality_preset == IHEVCE_QUALITY_P6) &&
1055                            (ps_ctxt->i4_slice_type == PSLICE))
1056                         {
1057                             ps_ctxt->u1_disable_child_cu_decide = 1;
1058                             step2_bypass = 0;
1059                         }
1060 
1061                         /* Based on the flag, Child modes decision can be disabled*/
1062                         if(0 == ps_ctxt->u1_disable_child_cu_decide)
1063                         {
1064                             for(j = 0; j < 4; j++)
1065                             {
1066                                 ps_cu_node->ps_sub_cu[j]->u2_x0 =
1067                                     gau1_cu_pos_x[blk_cnt + (j * 4)]; /* Populate properly */
1068                                 ps_cu_node->ps_sub_cu[j]->u2_y0 =
1069                                     gau1_cu_pos_y[blk_cnt + (j * 4)]; /* Populate properly */
1070                                 ps_cu_node->ps_sub_cu[j]->u1_cu_size = 16;
1071 
1072                                 {
1073                                     WORD32 best_ang_mode =
1074                                         (ps_ed_blk_l1 + (j * 4))->best_merge_mode;
1075 
1076                                     if(best_ang_mode < 2)
1077                                         best_ang_mode = 26;
1078 
1079                                     ihevce_mode_eval_filtering(
1080                                         ps_cu_node->ps_sub_cu[j],
1081                                         ps_cu_node,
1082                                         ps_ctxt,
1083                                         ps_curr_src,
1084                                         best_ang_mode,
1085                                         &ps_cu_node->ps_sub_cu[j]->au4_best_cost_1tu[0],
1086                                         &ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[0],
1087                                         !step2_bypass,
1088                                         1);
1089 
1090                                     if(i4_enable_4cu_16tu)
1091                                     {
1092                                         ihevce_mode_eval_filtering(
1093                                             ps_cu_node->ps_sub_cu[j],
1094                                             ps_cu_node,
1095                                             ps_ctxt,
1096                                             ps_curr_src,
1097                                             best_ang_mode,
1098                                             &ps_cu_node->ps_sub_cu[j]->au4_best_cost_4tu[0],
1099                                             &ps_cu_node->ps_sub_cu[j]->au1_best_mode_4tu[0],
1100                                             !step2_bypass,
1101                                             0);
1102                                     }
1103                                     else
1104                                     {
1105                                         /* 4TU not evaluated :  4tu modes set same as 1tu modes */
1106                                         memcpy(
1107                                             &ps_cu_node->ps_sub_cu[j]->au1_best_mode_4tu[0],
1108                                             &ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[0],
1109                                             NUM_BEST_MODES);
1110 
1111                                         /* 4TU not evaluated : currently 4tu cost set same as 1tu cost */
1112                                         memcpy(
1113                                             &ps_cu_node->ps_sub_cu[j]->au4_best_cost_4tu[0],
1114                                             &ps_cu_node->ps_sub_cu[j]->au4_best_cost_1tu[0],
1115                                             NUM_BEST_MODES * sizeof(WORD32));
1116                                     }
1117 
1118                                     child_cost[j] =
1119                                         MIN(ps_cu_node->ps_sub_cu[j]->au4_best_cost_4tu[0],
1120                                             ps_cu_node->ps_sub_cu[j]->au4_best_cost_1tu[0]);
1121 
1122                                     /* Child cost is sum of costs at 16x16 level  */
1123                                     child_cost_least += child_cost[j];
1124 
1125                                     /* Select the best mode to be populated as top and left nbr depending on the
1126                                     4tu and 1tu cost */
1127                                     if(ps_cu_node->ps_sub_cu[j]->au4_best_cost_4tu[0] >
1128                                        ps_cu_node->ps_sub_cu[j]->au4_best_cost_1tu[0])
1129                                     {
1130                                         ps_cu_node->ps_sub_cu[j]->best_mode =
1131                                             ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[0];
1132                                     }
1133                                     else
1134                                     {
1135                                         ps_cu_node->ps_sub_cu[j]->best_mode =
1136                                             ps_cu_node->ps_sub_cu[j]->au1_best_mode_4tu[0];
1137                                     }
1138 
1139                                     { /* Update the CTB nodes only for MAX - 1 CU nodes */
1140                                         WORD32 xA, yA, row, col;
1141                                         xA = ((ps_cu_node->ps_sub_cu[j]->u2_x0 << 3) >> 2) + 1;
1142                                         yA = ((ps_cu_node->ps_sub_cu[j]->u2_y0 << 3) >> 2) + 1;
1143                                         size = ps_cu_node->ps_sub_cu[j]->u1_cu_size >> 2;
1144                                         for(row = yA; row < (yA + size); row++)
1145                                         {
1146                                             for(col = xA; col < (xA + size); col++)
1147                                             {
1148                                                 ps_ctxt->au1_ctb_mode_map[row][col] =
1149                                                     ps_cu_node->ps_sub_cu[j]->best_mode;
1150                                             }
1151                                         }
1152                                     }
1153                                 }
1154 
1155                                 /*Child SATD cost*/
1156                                 child_satd[j] = ps_cu_node->ps_sub_cu[j]->best_satd;
1157 
1158                                 /* store the child 16x16 costs */
1159                                 pi4_intra_16_cost[(j & 1) + ((MAX_CU_IN_CTB_ROW >> 1) * (j >> 1))] =
1160                                     child_cost[j];
1161 
1162                                 /* set the CU valid flag */
1163                                 ps_intra16_analyse[j].b1_valid_cu = 1;
1164 
1165                                 /* All 16x16 merge is valid, if Cu 32x32 is chosen */
1166                                 /* To be reset, if CU 64x64 is chosen */
1167                                 ps_intra16_analyse[j].b1_merge_flag = 1;
1168 
1169                                 /* storing the modes to intra 16 analyse */
1170                                 /* store the best 16x16 modes 8x8 tu */
1171                                 memcpy(
1172                                     &ps_intra16_analyse[j].au1_best_modes_8x8_tu[0],
1173                                     &ps_cu_node->ps_sub_cu[j]->au1_best_mode_4tu[0],
1174                                     sizeof(UWORD8) * (NUM_BEST_MODES));
1175                                 ps_intra16_analyse[j].au1_best_modes_8x8_tu[NUM_BEST_MODES] = 255;
1176 
1177                                 /* store the best 16x16 modes 16x16 tu */
1178                                 memcpy(
1179                                     &ps_intra16_analyse[j].au1_best_modes_16x16_tu[0],
1180                                     &ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[0],
1181                                     sizeof(UWORD8) * (NUM_BEST_MODES));
1182                                 ps_intra16_analyse[j].au1_best_modes_16x16_tu[NUM_BEST_MODES] = 255;
1183 
1184                                 /* divide the 16x16 costs (pro rating) to 4 8x8 costs */
1185                                 /* store the same 16x16 modes as 4 8x8 child modes    */
1186                                 {
1187                                     WORD32 idx_8x8;
1188                                     WORD32 *pi4_intra_8_cost_curr16;
1189                                     intra8_analyse_t *ps_intra8_analyse;
1190 
1191                                     pi4_intra_8_cost_curr16 = pi4_intra_8_cost + ((j & 1) << 1);
1192                                     pi4_intra_8_cost_curr16 += ((j >> 1) << 1) * MAX_CU_IN_CTB_ROW;
1193 
1194                                     for(idx_8x8 = 0; idx_8x8 < 4; idx_8x8++)
1195                                     {
1196                                         pi4_intra_8_cost_curr16
1197                                             [(idx_8x8 & 1) + (MAX_CU_IN_CTB_ROW * (idx_8x8 >> 1))] =
1198                                                 (child_cost[j] + 3) >> 2;
1199 
1200                                         ps_intra8_analyse =
1201                                             &ps_intra16_analyse[j].as_intra8_analyse[idx_8x8];
1202 
1203                                         ps_intra8_analyse->b1_enable_nxn = 0;
1204                                         ps_intra8_analyse->b1_valid_cu = 1;
1205 
1206                                         /* store the best 8x8 modes 8x8 tu */
1207                                         memcpy(
1208                                             &ps_intra8_analyse->au1_best_modes_8x8_tu[0],
1209                                             &ps_intra16_analyse[j].au1_best_modes_8x8_tu[0],
1210                                             sizeof(UWORD8) * (NUM_BEST_MODES + 1));
1211 
1212                                         /* store the best 8x8 modes 4x4 tu */
1213                                         memcpy(
1214                                             &ps_intra8_analyse->au1_best_modes_4x4_tu[0],
1215                                             &ps_intra16_analyse[j].au1_best_modes_8x8_tu[0],
1216                                             sizeof(UWORD8) * (NUM_BEST_MODES + 1));
1217 
1218                                         /* NXN modes not evaluated hence filled with 255 */
1219                                         memset(
1220                                             &ps_intra8_analyse->au1_4x4_best_modes[0][0],
1221                                             255,
1222                                             sizeof(UWORD8) * 4 * (NUM_BEST_MODES + 1));
1223                                     }
1224                                 }
1225                             }
1226 
1227                             ihevce_set_nbr_map(
1228                                 ps_ctxt->pu1_ctb_nbr_map,
1229                                 ps_ctxt->i4_nbr_map_strd,
1230                                 ps_cu_node->ps_sub_cu[0]->u2_x0 << 1,
1231                                 ps_cu_node->ps_sub_cu[0]->u2_y0 << 1,
1232                                 (ps_cu_node->ps_sub_cu[0]->u1_cu_size >> 1),
1233                                 0);
1234                         }
1235 #if 1  //DISBLE_CHILD_CU_EVAL_L0_IPE //1
1236                         else
1237                         {
1238                             for(j = 0; j < 4; j++)
1239                             {
1240                                 WORD32 idx_8x8;
1241                                 intra8_analyse_t *ps_intra8_analyse;
1242                                 ps_intra16_analyse[j].au1_best_modes_8x8_tu[0] = 255;
1243                                 ps_intra16_analyse[j].au1_best_modes_16x16_tu[0] = 255;
1244 
1245                                 ps_intra16_analyse[j].b1_valid_cu = 0;
1246 
1247                                 for(idx_8x8 = 0; idx_8x8 < 4; idx_8x8++)
1248                                 {
1249                                     ps_intra8_analyse =
1250                                         &ps_intra16_analyse[j].as_intra8_analyse[idx_8x8];
1251 
1252                                     ps_intra8_analyse->au1_best_modes_8x8_tu[0] = 255;
1253                                     ps_intra8_analyse->au1_best_modes_4x4_tu[0] = 255;
1254 
1255                                     ps_intra8_analyse->b1_enable_nxn = 0;
1256                                     ps_intra8_analyse->b1_valid_cu = 0;
1257 
1258                                     /* NXN modes not evaluated hence filled with 255 */
1259                                     memset(
1260                                         &ps_intra8_analyse->au1_4x4_best_modes[0][0],
1261                                         255,
1262                                         sizeof(UWORD8) * 4 * (NUM_BEST_MODES + 1));
1263                                 }
1264                             }
1265 
1266                             child_cost_least = MAX_INTRA_COST_IPE;
1267                         }
1268 #endif
1269 
1270                         /* Populate params for 32x32 block analysis */
1271 
1272                         ps_cu_node->ps_parent->u1_cu_size = 32;
1273                         ps_cu_node->ps_parent->u2_x0 =
1274                             gau1_cu_pos_x[blk_cnt]; /* Populate properly */
1275                         ps_cu_node->ps_parent->u2_y0 =
1276                             gau1_cu_pos_y[blk_cnt]; /* Populate properly */
1277 
1278                         /* Re-evaluation for 32x32 parent block at 16x16 prediction level */
1279                         //memcpy(ps_ctxt->ai1_ctb_mode_map_temp, ps_ctxt->ai1_ctb_mode_map, sizeof(ps_ctxt->ai1_ctb_mode_map));
1280 
1281                         {
1282                             /* Eval for TUSize = CuSize */
1283                             ihevce_mode_eval_filtering(
1284                                 ps_cu_node->ps_parent,
1285                                 ps_cu_node,
1286                                 ps_ctxt,
1287                                 ps_curr_src,
1288                                 26,
1289                                 &ps_cu_node->ps_parent->au4_best_cost_1tu[0],
1290                                 &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
1291                                 step2_bypass,
1292                                 1);
1293 
1294                             if(i4_enable_1cu_4tu)
1295                             {
1296                                 /* Eval for TUSize = CuSize/2 */
1297                                 ihevce_mode_eval_filtering(
1298                                     ps_cu_node->ps_parent,
1299                                     ps_cu_node,
1300                                     ps_ctxt,
1301                                     ps_curr_src,
1302                                     26,
1303                                     &ps_cu_node->ps_parent->au4_best_cost_4tu[0],
1304                                     &ps_cu_node->ps_parent->au1_best_mode_4tu[0],
1305                                     step2_bypass,
1306                                     0);
1307                             }
1308                             else
1309                             {
1310                                 /* 4TU not evaluated :  4tu modes set same as 1tu modes */
1311                                 memcpy(
1312                                     &ps_cu_node->ps_parent->au1_best_mode_4tu[0],
1313                                     &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
1314                                     NUM_BEST_MODES);
1315 
1316                                 /* 4TU not evaluated : currently 4tu cost set same as 1tu cost */
1317                                 memcpy(
1318                                     &ps_cu_node->ps_parent->au4_best_cost_4tu[0],
1319                                     &ps_cu_node->ps_parent->au4_best_cost_1tu[0],
1320                                     NUM_BEST_MODES * sizeof(WORD32));
1321                             }
1322                         }
1323 
1324                         ps_ctxt->u1_disable_child_cu_decide = 0;
1325                         step2_bypass = 1;
1326 
1327                         /* Update parent cost */
1328                         parent_cost =
1329                             MIN(ps_cu_node->ps_parent->au4_best_cost_4tu[0],
1330                                 ps_cu_node->ps_parent->au4_best_cost_1tu[0]);
1331 
1332                         /* Select the best mode to be populated as top and left nbr depending on the
1333                         4tu and 1tu cost */
1334                         if(ps_cu_node->ps_parent->au4_best_cost_4tu[0] >
1335                            ps_cu_node->ps_parent->au4_best_cost_1tu[0])
1336                         {
1337                             ps_cu_node->ps_parent->best_mode =
1338                                 ps_cu_node->ps_parent->au1_best_mode_1tu[0];
1339                         }
1340                         else
1341                         {
1342                             ps_cu_node->ps_parent->best_mode =
1343                                 ps_cu_node->ps_parent->au1_best_mode_4tu[0];
1344                         }
1345 
1346                         /* store the 32x32 cost */
1347                         *pi4_intra_32_cost = parent_cost;
1348 
1349                         /* set the CU valid flag */
1350                         ps_intra32_analyse->b1_valid_cu = 1;
1351 
1352                         ps_intra32_analyse->b1_merge_flag = 1;
1353 
1354                         /* storing the modes to intra 32 analyse */
1355                         {
1356                             /* store the best 32x32 modes 16x16 tu */
1357                             memcpy(
1358                                 &ps_intra32_analyse->au1_best_modes_16x16_tu[0],
1359                                 &ps_cu_node->ps_parent->au1_best_mode_4tu[0],
1360                                 sizeof(UWORD8) * (NUM_BEST_MODES));
1361                             ps_intra32_analyse->au1_best_modes_16x16_tu[NUM_BEST_MODES] = 255;
1362 
1363                             /* store the best 32x32 modes 32x32 tu */
1364                             memcpy(
1365                                 &ps_intra32_analyse->au1_best_modes_32x32_tu[0],
1366                                 &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
1367                                 sizeof(UWORD8) * (NUM_BEST_MODES));
1368                             ps_intra32_analyse->au1_best_modes_32x32_tu[NUM_BEST_MODES] = 255;
1369                         }
1370                         parent_best_mode = ps_cu_node->ps_parent->best_mode;
1371                         if((parent_cost <=
1372                             child_cost_least + (ps_ctxt->i4_ol_satd_lambda * CHILD_BIAS >>
1373                                                 LAMBDA_Q_SHIFT)))  //|| identical_modes)
1374                         {
1375                             WORD32 i4_q_scale_q3_mod;
1376                             UWORD8 u1_cu_possible_qp;
1377                             WORD32 i4_act_factor;
1378 
1379                             /* CU size 32x32 and fill the final cu params */
1380 
1381                             ihevce_update_cand_list(ps_cu_node->ps_parent, ps_ed_blk_l1, ps_ctxt);
1382 
1383                             if((IHEVCE_QUALITY_P3 > i4_quality_preset))
1384                             {
1385                                 for(i = 0; i < 4; i++)
1386                                 {
1387                                     intra8_analyse_t *ps_intra8_analyse;
1388                                     ps_intra8_analyse = &ps_intra16_analyse->as_intra8_analyse[i];
1389                                     for(j = 0; j < 4; j++)
1390                                     {
1391                                         /* Populate best 3 nxn modes */
1392                                         ps_intra8_analyse->au1_4x4_best_modes[j][0] =
1393                                             ps_cu_node->ps_sub_cu[i]->au1_best_mode_4tu[0];
1394                                         ps_intra8_analyse->au1_4x4_best_modes[j][1] =
1395                                             ps_cu_node->ps_sub_cu[i]
1396                                                 ->au1_best_mode_4tu[1];  //(ps_ed + 1)->best_mode;
1397                                         ps_intra8_analyse->au1_4x4_best_modes[j][2] =
1398                                             ps_cu_node->ps_sub_cu[i]
1399                                                 ->au1_best_mode_4tu[2];  //(ps_ed + 2)->best_mode;
1400                                         ps_intra8_analyse->au1_4x4_best_modes[j][3] = 255;
1401                                     }
1402                                 }
1403                             }
1404                             /* store the 32x32 non split flag */
1405                             ps_intra32_analyse->b1_split_flag = 0;
1406                             ps_intra32_analyse->as_intra16_analyse[0].b1_split_flag = 0;
1407                             ps_intra32_analyse->as_intra16_analyse[1].b1_split_flag = 0;
1408                             ps_intra32_analyse->as_intra16_analyse[2].b1_split_flag = 0;
1409                             ps_intra32_analyse->as_intra16_analyse[3].b1_split_flag = 0;
1410 
1411                             au1_best_32x32_modes[blk_cnt >> 4] =
1412                                 ps_cu_node->ps_parent->au1_best_mode_1tu[0];
1413 
1414                             au4_best_32x32_cost[blk_cnt >> 4] =
1415                                 ps_cu_node->ps_parent->au4_best_cost_1tu[0];
1416                             /*As 32*32 has won, pick L2 8x8 qp which maps
1417                             to L0 32x32 Qp*/
1418                             ASSERT(((blk_cnt >> 4) & 3) == (blk_cnt >> 4));
1419                             ASSERT(ps_ed_ctb_l1->i4_16x16_satd[blk_cnt >> 4][0] != -2);
1420                             u1_cu_possible_qp = ihevce_cu_level_qp_mod(
1421                                 ps_ctxt->i4_qscale,
1422                                 ps_ed_ctb_l1->i4_16x16_satd[blk_cnt >> 4][0],
1423                                 ps_ctxt->ld_curr_frame_16x16_log_avg[0],
1424                                 f_strength,
1425                                 &i4_act_factor,
1426                                 &i4_q_scale_q3_mod,
1427                                 ps_ctxt->ps_rc_quant_ctxt);
1428                             /* cost accumulation of best cu size candidate */
1429                             i8_frame_acc_satd_cost += parent_cost;
1430 
1431                             /* satd and mpm bits accumulation of best cu size candidate */
1432                             i4_ctb_acc_satd += ps_cu_node->ps_parent->best_satd;
1433 
1434                             /* Mode bits cost accumulation for best cu size and cu mode */
1435                             i8_frame_acc_mode_bits_cost += ps_cu_node->ps_parent->u2_mode_bits_cost;
1436 
1437                             /*satd/mod_qp accumulation of best cu */
1438                             i8_frame_acc_satd_by_modqp_q10 +=
1439                                 ((LWORD64)ps_cu_node->ps_parent->best_satd
1440                                  << (SATD_BY_ACT_Q_FAC + QSCALE_Q_FAC_3)) /
1441                                 i4_q_scale_q3_mod;
1442 
1443                             /* Increment pointers */
1444                             ps_ed_blk_l1 += 16;
1445                             blk_cnt += 16;
1446                             //ps_row_cu++;
1447                             merge_64x64 &= 1;
1448                         }
1449                         else
1450                         {
1451                             /* store the 32x32 split flag */
1452                             ps_intra32_analyse->b1_split_flag = 1;
1453 
1454                             /* CU size 16x16 and fill the final cu params for all 4 blocks */
1455                             for(j = 0; j < 4; j++)
1456                             {
1457                                 WORD32 i4_q_scale_q3_mod;
1458                                 UWORD8 u1_cu_possible_qp;
1459                                 WORD32 i4_act_factor;
1460 
1461                                 /* Set CU split flag */
1462                                 ASSERT(blk_cnt % 4 == 0);
1463 
1464                                 ihevce_update_cand_list(
1465                                     ps_cu_node->ps_sub_cu[j], ps_ed_blk_l1, ps_ctxt);
1466 
1467                                 /* store the 16x16 non split flag  */
1468                                 ps_intra16_analyse[j].b1_split_flag = 0;
1469 
1470                                 ASSERT(((blk_cnt >> 2) & 0xF) == (blk_cnt >> 2));
1471                                 ASSERT(ps_ed_ctb_l1->i4_8x8_satd[blk_cnt >> 2][0] != -2);
1472                                 /*As 16*16 has won, pick L1 8x8 qp which maps
1473                                 to L0 16x16 Qp*/
1474                                 u1_cu_possible_qp = ihevce_cu_level_qp_mod(
1475                                     ps_ctxt->i4_qscale,
1476                                     ps_ed_ctb_l1->i4_8x8_satd[blk_cnt >> 2][0],
1477                                     ps_ctxt->ld_curr_frame_8x8_log_avg[0],
1478                                     f_strength,
1479                                     &i4_act_factor,
1480                                     &i4_q_scale_q3_mod,
1481                                     ps_ctxt->ps_rc_quant_ctxt);
1482 
1483                                 /*accum satd/qp for all child block*/
1484                                 i8_frame_acc_satd_by_modqp_q10 +=
1485                                     ((LWORD64)child_satd[j]
1486                                      << (SATD_BY_ACT_Q_FAC + QSCALE_Q_FAC_3)) /
1487                                     i4_q_scale_q3_mod;
1488 
1489                                 /* Accumulate mode bits for all child blocks */
1490                                 i8_frame_acc_mode_bits_cost +=
1491                                     ps_cu_node->ps_sub_cu[j]->u2_mode_bits_cost;
1492 
1493                                 /* satd and mpm bits accumulation of best cu size candidate */
1494                                 i4_ctb_acc_satd += child_satd[j];
1495 
1496                                 /* Increment pointers */
1497                                 //ps_row_cu++;
1498                                 ps_ed_blk_l1 += 4;
1499                                 blk_cnt += 4;
1500                             }
1501 
1502                             /* cost accumulation of best cu size candidate */
1503                             i8_frame_acc_satd_cost += child_cost_least;
1504 
1505                             /* 64x64 merge is not possible */
1506                             merge_64x64 = 0;
1507                         }
1508 
1509                         //ps_ed_blk_l2 += 4;
1510 
1511                     }  //end of EIID's else
1512 #endif
1513                 }
1514                 /* If Merge success for L1 max CU size 16x16 is chosen */
1515                 else if(merge_16x16_l1)
1516                 {
1517 #if IP_DBG_L1_l2
1518                     ps_cu_node->ps_parent->u1_cu_size = 16;
1519                     ps_cu_node->ps_parent->u2_x0 = gau1_cu_pos_x[blk_cnt]; /* Populate properly */
1520                     ps_cu_node->ps_parent->u2_y0 = gau1_cu_pos_y[blk_cnt]; /* Populate properly */
1521                     ps_cu_node->ps_parent->best_mode = ps_ed_blk_l1->best_merge_mode;
1522                     ihevce_update_cand_list(ps_cu_node->ps_parent, ps_ed_blk_l1, ps_ctxt);
1523 
1524                     blk_cnt += 4;
1525                     ps_ed_blk_l1 += 4;
1526                     ps_row_cu++;
1527                     merge_64x64 = 0;
1528 #else
1529 
1530                     /*EIID: evaluate only if L1 early-inter-intra decision is not favouring inter*/
1531                     /* enabled for every non-I slice: the check below is i4_slice_type != ISLICE */
1532                     if(ps_ed_blk_l1->intra_or_inter == 2 && (ps_ctxt->i4_slice_type != ISLICE))
1533                     {
1534                         WORD32 i4_q_scale_q3_mod, i4_local_ctr;
1535                         WORD8 i1_cu_possible_qp;
1536                         WORD32 i4_act_factor;
1537                         /* make cost infinity. */
1538                         /* make modes invalid */
1539                         /* update loop variables */
1540                         /* set other output variables */
1541                         /* dont set neighbour flag so that next blocks wont access this cu */
1542                         /* what happens to ctb_mode_map?? */
1543 
1544                         ps_cu_node->ps_parent->u1_cu_size = 16;
1545                         ps_cu_node->ps_parent->u2_x0 =
1546                             gau1_cu_pos_x[blk_cnt]; /* Populate properly */
1547                         ps_cu_node->ps_parent->u2_y0 =
1548                             gau1_cu_pos_y[blk_cnt]; /* Populate properly */
1549                         ps_cu_node->ps_parent->best_mode =
1550                             INTRA_DC;  //ps_ed_blk_l1->best_merge_mode;
1551 
1552                         /* fill in the first modes as invalid */
1553 
1554                         ps_cu_node->ps_parent->au1_best_mode_1tu[0] = INTRA_DC;
1555                         ps_cu_node->ps_parent->au1_best_mode_1tu[1] =
1556                             INTRA_DC;  //for safery. Since update_cand_list will set num_modes as 3
1557                         ps_cu_node->ps_parent->au1_best_mode_1tu[2] = INTRA_DC;
1558 
1559                         ps_cu_node->ps_parent->au1_best_mode_4tu[0] = INTRA_DC;
1560                         ps_cu_node->ps_parent->au1_best_mode_4tu[1] = INTRA_DC;
1561                         ps_cu_node->ps_parent->au1_best_mode_4tu[2] = INTRA_DC;
1562 
1563                         ihevce_update_cand_list(ps_cu_node->ps_parent, ps_ed_blk_l1, ps_ctxt);
1564 
1565                         //ps_row_cu->s_cu_intra_cand.b6_num_intra_cands = 0;
1566                         //ps_row_cu->u1_num_intra_rdopt_cands = 0;
1567 
1568                         ps_intra32_analyse->b1_split_flag = 1;
1569                         ps_intra32_analyse->b1_merge_flag = 0;
1570 
1571                         ps_intra16_analyse->b1_valid_cu = 0;
1572                         ps_intra16_analyse->b1_split_flag = 0;
1573                         ps_intra16_analyse->b1_merge_flag = 1;
1574                         //memset (&ps_intra16_analyse->au1_best_modes_16x16_tu,
1575                         //  255,
1576                         //  NUM_BEST_MODES);
1577                         //memset (&ps_intra16_analyse->au1_best_modes_8x8_tu,
1578                         //  255,
1579                         //  NUM_BEST_MODES);
1580                         //set only first mode since if it's 255. it wont go ahead
1581                         ps_intra16_analyse->au1_best_modes_16x16_tu[0] = 255;
1582                         ps_intra16_analyse->au1_best_modes_8x8_tu[0] = 255;
1583                         *pi4_intra_16_cost = MAX_INTRA_COST_IPE;
1584 
1585                         /*since ME will start evaluating from bottom up, set the lower
1586                         cu size data invalid */
1587                         for(i4_local_ctr = 0; i4_local_ctr < 4; i4_local_ctr++)
1588                         {
1589                             ps_intra16_analyse->as_intra8_analyse[i4_local_ctr]
1590                                 .au1_4x4_best_modes[0][0] = 255;
1591                             ps_intra16_analyse->as_intra8_analyse[i4_local_ctr]
1592                                 .au1_4x4_best_modes[1][0] = 255;
1593                             ps_intra16_analyse->as_intra8_analyse[i4_local_ctr]
1594                                 .au1_4x4_best_modes[2][0] = 255;
1595                             ps_intra16_analyse->as_intra8_analyse[i4_local_ctr]
1596                                 .au1_4x4_best_modes[3][0] = 255;
1597                             ps_intra16_analyse->as_intra8_analyse[i4_local_ctr]
1598                                 .au1_best_modes_8x8_tu[0] = 255;
1599                             ps_intra16_analyse->as_intra8_analyse[i4_local_ctr]
1600                                 .au1_best_modes_4x4_tu[0] = 255;
1601 
1602                             pi4_intra_8_cost
1603                                 [(i4_local_ctr & 1) + (MAX_CU_IN_CTB_ROW * (i4_local_ctr >> 1))] =
1604                                     MAX_INTRA_COST_IPE;
1605                         }
1606 
1607                         /* set neighbours even if intra is not evaluated, since source is always available. */
1608                         ihevce_set_nbr_map(
1609                             ps_ctxt->pu1_ctb_nbr_map,
1610                             ps_ctxt->i4_nbr_map_strd,
1611                             ps_cu_node->ps_parent->u2_x0 << 1,
1612                             ps_cu_node->ps_parent->u2_y0 << 1,
1613                             (ps_cu_node->ps_parent->u1_cu_size >> 2),
1614                             1);
1615 
1616                         //what happens to RC variables??
1617                         /* run only constant Qp */
1618                         ASSERT(((blk_cnt >> 2) & 0xF) == (blk_cnt >> 2));
1619                         ASSERT(ps_ed_ctb_l1->i4_8x8_satd[blk_cnt >> 2][0] != -2);
1620                         i1_cu_possible_qp = ihevce_cu_level_qp_mod(
1621                             ps_ctxt->i4_qscale,
1622                             ps_ed_ctb_l1->i4_8x8_satd[blk_cnt >> 2][0],
1623                             ps_ctxt->ld_curr_frame_8x8_log_avg[0],
1624                             f_strength,
1625                             &i4_act_factor,
1626                             &i4_q_scale_q3_mod,
1627                             ps_ctxt->ps_rc_quant_ctxt);
1628 
1629                         /* cost accumulation of best cu size candidate */
1630                         i8_frame_acc_satd_cost += 0;  //parent_cost;  //incorrect accumulation
1631 
1632                         /*satd/mod_qp accumulation of best cu */
1633                         i8_frame_acc_satd_by_modqp_q10 += 0;  //incorrect accumulation
1634                         //((LWORD64)ps_cu_node->ps_parent->best_satd << SATD_BY_ACT_Q_FAC)/i4_q_scale_q3_mod;
1635 
1636                         /* Accumulate mode bits for all child blocks */
1637                         i8_frame_acc_mode_bits_cost +=
1638                             0;  //ps_cu_node->ps_parent->u2_mode_bits_cost;
1639                         //incorrect accumulation
1640 
1641                         blk_cnt += 4;
1642                         ps_ed_blk_l1 += 4;
1643                         //ps_row_cu++;
1644                         merge_64x64 = 0;
1645 
1646                         /* increment for stat purpose only. Increment is valid only on single thread */
1647                         ps_ctxt->u4_num_16x16_skips_at_L0_IPE += 1;
1648                     }
1649                     else
1650                     {
1651                         /* 64x64 merge is not possible */
1652                         merge_64x64 = 0;
1653 
1654                         /* set the 32x32 split flag to 1 */
1655                         ps_intra32_analyse->b1_split_flag = 1;
1656 
1657                         ps_intra32_analyse->b1_merge_flag = 0;
1658 
1659                         ps_intra16_analyse->b1_merge_flag = 1;
1660 
1661                         if((ps_ctxt->i4_quality_preset == IHEVCE_QUALITY_P6) &&
1662                            (ps_ctxt->i4_slice_type == PSLICE))
1663                         {
1664                             ps_ctxt->u1_disable_child_cu_decide = 1;
1665                             step2_bypass = 0;
1666                         }
1667                         //memcpy(ps_ctxt->ai1_ctb_mode_map_temp, ps_ctxt->ai1_ctb_mode_map, sizeof(ps_ctxt->ai1_ctb_mode_map));
1668                         /* Based on the flag, Child modes decision can be disabled*/
1669                         if(0 == ps_ctxt->u1_disable_child_cu_decide)
1670                         {
1671                             for(j = 0; j < 4; j++)
1672                             {
1673                                 intra8_analyse_t *ps_intra8_analyse;
1674                                 WORD32 best_ang_mode = (ps_ed_blk_l1 + j)->best_mode;
1675 
1676                                 if(best_ang_mode < 2)
1677                                     best_ang_mode = 26;
1678 
1679                                 //ps_cu_node->ps_sub_cu[j]->best_cost = MAX_INTRA_COST_IPE;
1680                                 //ps_cu_node->ps_sub_cu[j]->best_mode = (ps_ed_blk_l1 + j)->best_mode;
1681 
1682                                 ps_cu_node->ps_sub_cu[j]->u2_x0 =
1683                                     gau1_cu_pos_x[blk_cnt + j]; /* Populate properly */
1684                                 ps_cu_node->ps_sub_cu[j]->u2_y0 =
1685                                     gau1_cu_pos_y[blk_cnt + j]; /* Populate properly */
1686                                 ps_cu_node->ps_sub_cu[j]->u1_cu_size = 8;
1687 
1688                                 ihevce_mode_eval_filtering(
1689                                     ps_cu_node->ps_sub_cu[j],
1690                                     ps_cu_node,
1691                                     ps_ctxt,
1692                                     ps_curr_src,
1693                                     best_ang_mode,
1694                                     &ps_cu_node->ps_sub_cu[j]->au4_best_cost_1tu[0],
1695                                     &ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[0],
1696                                     !step2_bypass,
1697                                     1);
1698 
1699                                 if(i4_enable_4cu_16tu)
1700                                 {
1701                                     ihevce_mode_eval_filtering(
1702                                         ps_cu_node->ps_sub_cu[j],
1703                                         ps_cu_node,
1704                                         ps_ctxt,
1705                                         ps_curr_src,
1706                                         best_ang_mode,
1707                                         &ps_cu_node->ps_sub_cu[j]->au4_best_cost_4tu[0],
1708                                         &ps_cu_node->ps_sub_cu[j]->au1_best_mode_4tu[0],
1709                                         !step2_bypass,
1710                                         0);
1711                                 }
1712                                 else
1713                                 {
1714                                     /* 4TU not evaluated :  4tu modes set same as 1tu modes */
1715                                     memcpy(
1716                                         &ps_cu_node->ps_sub_cu[j]->au1_best_mode_4tu[0],
1717                                         &ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[0],
1718                                         NUM_BEST_MODES);
1719 
1720                                     /* 4TU not evaluated : currently 4tu cost set same as 1tu cost */
1721                                     memcpy(
1722                                         &ps_cu_node->ps_sub_cu[j]->au4_best_cost_4tu[0],
1723                                         &ps_cu_node->ps_sub_cu[j]->au4_best_cost_1tu[0],
1724                                         NUM_BEST_MODES * sizeof(WORD32));
1725                                 }
1726 
1727                                 child_cost[j] =
1728                                     MIN(ps_cu_node->ps_sub_cu[j]->au4_best_cost_4tu[0],
1729                                         ps_cu_node->ps_sub_cu[j]->au4_best_cost_1tu[0]);
1730 
1731                                 child_cost_least += child_cost[j];
1732 
1733                                 /* Select the best mode to be populated as top and left nbr depending on the
1734                                 4tu and 1tu cost */
1735                                 if(ps_cu_node->ps_sub_cu[j]->au4_best_cost_4tu[0] >
1736                                    ps_cu_node->ps_sub_cu[j]->au4_best_cost_1tu[0])
1737                                 {
1738                                     ps_cu_node->ps_sub_cu[j]->best_mode =
1739                                         ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[0];
1740                                 }
1741                                 else
1742                                 {
1743                                     ps_cu_node->ps_sub_cu[j]->best_mode =
1744                                         ps_cu_node->ps_sub_cu[j]->au1_best_mode_4tu[0];
1745                                 }
1746                                 { /* Update the CTB nodes only for MAX - 1 CU nodes */
1747                                     WORD32 xA, yA, row, col;
1748                                     xA = ((ps_cu_node->ps_sub_cu[j]->u2_x0 << 3) >> 2) + 1;
1749                                     yA = ((ps_cu_node->ps_sub_cu[j]->u2_y0 << 3) >> 2) + 1;
1750                                     size = ps_cu_node->ps_sub_cu[j]->u1_cu_size >> 2;
1751                                     for(row = yA; row < (yA + size); row++)
1752                                     {
1753                                         for(col = xA; col < (xA + size); col++)
1754                                         {
1755                                             ps_ctxt->au1_ctb_mode_map[row][col] =
1756                                                 ps_cu_node->ps_sub_cu[j]->best_mode;
1757                                         }
1758                                     }
1759                                 }
1760 
1761                                 /*collect individual child satd for final SATD/qp accum*/
1762                                 child_satd[j] = ps_cu_node->ps_sub_cu[j]->best_satd;
1763 
1764                                 ps_intra8_analyse = &ps_intra16_analyse->as_intra8_analyse[j];
1765 
1766                                 /* store the child 8x8 costs */
1767                                 pi4_intra_8_cost[(j & 1) + (MAX_CU_IN_CTB_ROW * (j >> 1))] =
1768                                     child_cost[j];
1769 
1770                                 /* set the CU valid flag */
1771                                 ps_intra8_analyse->b1_valid_cu = 1;
1772                                 ps_intra8_analyse->b1_enable_nxn = 0;
1773 
1774                                 /* storing the modes to intra8  analyse */
1775 
1776                                 /* store the best 8x8 modes 8x8 tu */
1777                                 memcpy(
1778                                     &ps_intra8_analyse->au1_best_modes_8x8_tu[0],
1779                                     &ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[0],
1780                                     sizeof(UWORD8) * (NUM_BEST_MODES));
1781                                 ps_intra8_analyse->au1_best_modes_8x8_tu[NUM_BEST_MODES] = 255;
1782 
1783                                 /* store the best 8x8 modes 4x4 tu */
1784                                 memcpy(
1785                                     &ps_intra8_analyse->au1_best_modes_4x4_tu[0],
1786                                     &ps_cu_node->ps_sub_cu[j]->au1_best_mode_4tu[0],
1787                                     sizeof(UWORD8) * (NUM_BEST_MODES));
1788                                 ps_intra8_analyse->au1_best_modes_4x4_tu[NUM_BEST_MODES] = 255;
1789 
1790                                 /* NXN modes not evaluated hence set to 255 */
1791                                 memset(
1792                                     &ps_intra8_analyse->au1_4x4_best_modes[0][0],
1793                                     255,
1794                                     sizeof(UWORD8) * 4 * (NUM_BEST_MODES + 1));
1795                             }
1796 
1797                             ihevce_set_nbr_map(
1798                                 ps_ctxt->pu1_ctb_nbr_map,
1799                                 ps_ctxt->i4_nbr_map_strd,
1800                                 ps_cu_node->ps_sub_cu[0]->u2_x0 << 1,
1801                                 ps_cu_node->ps_sub_cu[0]->u2_y0 << 1,
1802                                 (ps_cu_node->ps_sub_cu[0]->u1_cu_size >> 1),
1803                                 0);
1804                         }
1805 #if 1  //DISBLE_CHILD_CU_EVAL_L0_IPE //1
1806                         else
1807                         {
1808                             for(j = 0; j < 4; j++)
1809                             {
1810                                 intra8_analyse_t *ps_intra8_analyse;
1811                                 ps_intra8_analyse = &ps_intra16_analyse->as_intra8_analyse[j];
1812                                 ps_intra8_analyse->au1_best_modes_8x8_tu[0] = 255;
1813                                 ps_intra8_analyse->au1_best_modes_4x4_tu[0] = 255;
1814                                 /* NXN modes not evaluated hence set to 255 */
1815                                 memset(
1816                                     &ps_intra8_analyse->au1_4x4_best_modes[0][0],
1817                                     255,
1818                                     sizeof(UWORD8) * 4 * (NUM_BEST_MODES + 1));
1819 
1820                                 ps_intra8_analyse->b1_valid_cu = 0;
1821                                 ps_intra8_analyse->b1_enable_nxn = 0;
1822                             }
1823                             child_cost_least = MAX_INTRA_COST_IPE;
1824                         }
1825 #endif
1826                         //ps_cu_node->ps_parent->best_mode = ps_ed_blk_l1->best_mode;
1827                         //ps_cu_node->ps_parent->best_cost = MAX_INTRA_COST_IPE;
1828 
1829                         ps_cu_node->ps_parent->u1_cu_size = 16;
1830                         ps_cu_node->ps_parent->u2_x0 =
1831                             gau1_cu_pos_x[blk_cnt]; /* Populate properly */
1832                         ps_cu_node->ps_parent->u2_y0 =
1833                             gau1_cu_pos_y[blk_cnt]; /* Populate properly */
1834 
1835                         //memcpy(ps_ctxt->ai1_ctb_mode_map_temp, ps_ctxt->ai1_ctb_mode_map, sizeof(ps_ctxt->ai1_ctb_mode_map));
1836 
1837                         /* Eval for TUSize = CuSize */
1838                         ihevce_mode_eval_filtering(
1839                             ps_cu_node->ps_parent,
1840                             ps_cu_node,
1841                             ps_ctxt,
1842                             ps_curr_src,
1843                             26,
1844                             &ps_cu_node->ps_parent->au4_best_cost_1tu[0],
1845                             &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
1846                             step2_bypass,
1847                             1);
1848 
1849                         if(i4_enable_1cu_4tu)
1850                         {
1851                             /* Eval for TUSize = CuSize/2 */
1852                             ihevce_mode_eval_filtering(
1853                                 ps_cu_node->ps_parent,
1854                                 ps_cu_node,
1855                                 ps_ctxt,
1856                                 ps_curr_src,
1857                                 26,
1858                                 &ps_cu_node->ps_parent->au4_best_cost_4tu[0],
1859                                 &ps_cu_node->ps_parent->au1_best_mode_4tu[0],
1860                                 step2_bypass,
1861                                 0);
1862                         }
1863                         else
1864                         {
1865                             /* 4TU not evaluated :  4tu modes set same as 1tu modes */
1866                             memcpy(
1867                                 &ps_cu_node->ps_parent->au1_best_mode_4tu[0],
1868                                 &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
1869                                 NUM_BEST_MODES);
1870 
1871                             /* 4TU not evaluated : currently 4tu cost set same as 1tu cost */
1872                             memcpy(
1873                                 &ps_cu_node->ps_parent->au4_best_cost_4tu[0],
1874                                 &ps_cu_node->ps_parent->au4_best_cost_1tu[0],
1875                                 NUM_BEST_MODES * sizeof(WORD32));
1876                         }
1877 
1878                         ps_ctxt->u1_disable_child_cu_decide = 0;
1879                         step2_bypass = 1;
1880 
1881                         /* Update parent cost */
1882                         parent_cost =
1883                             MIN(ps_cu_node->ps_parent->au4_best_cost_4tu[0],
1884                                 ps_cu_node->ps_parent->au4_best_cost_1tu[0]);
1885 
1886                         /* Select the best mode to be populated as top and left nbr depending on the
1887                         4tu and 1tu cost */
1888                         if(ps_cu_node->ps_parent->au4_best_cost_4tu[0] >
1889                            ps_cu_node->ps_parent->au4_best_cost_1tu[0])
1890                         {
1891                             ps_cu_node->ps_parent->best_mode =
1892                                 ps_cu_node->ps_parent->au1_best_mode_1tu[0];
1893                         }
1894                         else
1895                         {
1896                             ps_cu_node->ps_parent->best_mode =
1897                                 ps_cu_node->ps_parent->au1_best_mode_4tu[0];
1898                         }
1899 
1900                         /* store the 16x16 cost */
1901                         *pi4_intra_16_cost = parent_cost;
1902 
1903                         /* accumulate the 32x32 cost */
1904                         if(MAX_INTRA_COST_IPE == *pi4_intra_32_cost)
1905                         {
1906                             *pi4_intra_32_cost = parent_cost;
1907                         }
1908                         else
1909                         {
1910                             *pi4_intra_32_cost += parent_cost;
1911                         }
1912 
1913                         /* set the CU valid flag */
1914                         ps_intra16_analyse->b1_valid_cu = 1;
1915 
1916                         /* storing the modes to intra 16 analyse */
1917                         {
1918                             /* store the best 16x16 modes 16x16 tu */
1919                             memcpy(
1920                                 &ps_intra16_analyse->au1_best_modes_16x16_tu[0],
1921                                 &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
1922                                 sizeof(UWORD8) * NUM_BEST_MODES);
1923                             ps_intra16_analyse->au1_best_modes_16x16_tu[NUM_BEST_MODES] = 255;
1924 
1925                             /* store the best 16x16 modes 8x8 tu */
1926                             memcpy(
1927                                 &ps_intra16_analyse->au1_best_modes_8x8_tu[0],
1928                                 &ps_cu_node->ps_parent->au1_best_mode_4tu[0],
1929                                 sizeof(UWORD8) * NUM_BEST_MODES);
1930                             ps_intra16_analyse->au1_best_modes_8x8_tu[NUM_BEST_MODES] = 255;
1931                         }
1932 
1933                         parent_best_mode = ps_cu_node->ps_parent->best_mode;
1934                         if(parent_cost <=
1935                            child_cost_least + (ps_ctxt->i4_ol_satd_lambda * CHILD_BIAS >>
1936                                                LAMBDA_Q_SHIFT))  //|| identical_modes)
1937                         {
1938                             WORD32 i4_q_scale_q3_mod;
1939                             WORD8 i1_cu_possible_qp;
1940                             WORD32 i4_act_factor;
1941                             //choose parent CU
1942 
1943                             ihevce_update_cand_list(ps_cu_node->ps_parent, ps_ed_blk_l1, ps_ctxt);
1944 
1945                             /* set the 16x16 non split flag */
1946                             ps_intra16_analyse->b1_split_flag = 0;
1947 
1948                             /*As 16*16 has won, pick L1 8x8 qp which maps
1949                             to L0 16x16 Qp*/
1950                             ASSERT(((blk_cnt >> 4) & 3) == (blk_cnt >> 4));
1951                             ASSERT(ps_ed_ctb_l1->i4_16x16_satd[blk_cnt >> 4][0] != -2);
1952                             i1_cu_possible_qp = ihevce_cu_level_qp_mod(
1953                                 ps_ctxt->i4_qscale,
1954                                 ps_ed_ctb_l1->i4_16x16_satd[blk_cnt >> 4][0],
1955                                 ps_ctxt->ld_curr_frame_8x8_log_avg[0],
1956                                 f_strength,
1957                                 &i4_act_factor,
1958                                 &i4_q_scale_q3_mod,
1959                                 ps_ctxt->ps_rc_quant_ctxt);
1960 
1961                             /* cost accumulation of best CU size candidate */
1962                             i8_frame_acc_satd_cost += parent_cost;
1963 
1964                             /* satd and mpm bits accumulation of best CU size candidate */
1965                             i4_ctb_acc_satd += ps_cu_node->ps_parent->best_satd;
1966 
1967                             /*satd/mod_qp accumulation of best cu */
1968                             i8_frame_acc_satd_by_modqp_q10 +=
1969                                 ((LWORD64)ps_cu_node->ps_parent->best_satd
1970                                  << (SATD_BY_ACT_Q_FAC + QSCALE_Q_FAC_3)) /
1971                                 i4_q_scale_q3_mod;
1972 
1973                             /* Accumulate mode bits of the parent CU */
1974                             i8_frame_acc_mode_bits_cost += ps_cu_node->ps_parent->u2_mode_bits_cost;
1975 
1976                             blk_cnt += 4;
1977                             ps_ed_blk_l1 += 4;
1978                             //ps_row_cu++;
1979                         }
1980                         else
1981                         {
1982                             //choose child CU
1983                             WORD8 i1_cu_possible_qp;
1984                             WORD32 i4_act_factor;
1985                             WORD32 i4_q_scale_q3_mod;
1986 
1987                             ASSERT(((blk_cnt >> 2) & 0xF) == (blk_cnt >> 2));
1988                             ASSERT(ps_ed_ctb_l1->i4_8x8_satd[blk_cnt >> 2][1] != -2);
1989                             i1_cu_possible_qp = ihevce_cu_level_qp_mod(
1990                                 ps_ctxt->i4_qscale,
1991                                 ps_ed_ctb_l1->i4_8x8_satd[blk_cnt >> 2][1],
1992                                 ps_ctxt->ld_curr_frame_8x8_log_avg[1],
1993                                 f_strength,
1994                                 &i4_act_factor,
1995                                 &i4_q_scale_q3_mod,
1996                                 ps_ctxt->ps_rc_quant_ctxt);
1997 
1998                             /* set the 16x16 split flag */
1999                             ps_intra16_analyse->b1_split_flag = 1;
2000 
2001                             for(j = 0; j < 4; j++)
2002                             {
2003                                 ihevce_update_cand_list(
2004                                     ps_cu_node->ps_sub_cu[j], ps_ed_blk_l1, ps_ctxt);
2005 
2006                                 if((IHEVCE_QUALITY_P3 > i4_quality_preset))
2007                                 {
2008                                     WORD32 k;
2009                                     intra8_analyse_t *ps_intra8_analyse;
2010                                     ps_intra8_analyse = &ps_intra16_analyse->as_intra8_analyse[j];
2011 
2012                                     for(k = 0; k < 4; k++)
2013                                     {
2014                                         /* Populate best 3 nxn modes */
2015                                         ps_intra8_analyse->au1_4x4_best_modes[k][0] =
2016                                             ps_cu_node->ps_sub_cu[j]->au1_best_mode_4tu[0];
2017                                         ps_intra8_analyse->au1_4x4_best_modes[k][1] =
2018                                             ps_cu_node->ps_sub_cu[j]
2019                                                 ->au1_best_mode_4tu[1];  //(ps_ed + 1)->best_mode;
2020                                         ps_intra8_analyse->au1_4x4_best_modes[k][2] =
2021                                             ps_cu_node->ps_sub_cu[j]
2022                                                 ->au1_best_mode_4tu[2];  //(ps_ed + 2)->best_mode;
2023                                         ps_intra8_analyse->au1_4x4_best_modes[k][3] = 255;
2024                                     }
2025                                 }
2026                                 /*accum satd/qp for all child block*/
2027                                 i8_frame_acc_satd_by_modqp_q10 +=
2028                                     ((LWORD64)child_satd[j]
2029                                      << (SATD_BY_ACT_Q_FAC + QSCALE_Q_FAC_3)) /
2030                                     i4_q_scale_q3_mod;
2031 
2032                                 /* Accumulate mode bits for all child blocks */
2033                                 i8_frame_acc_mode_bits_cost +=
2034                                     ps_cu_node->ps_sub_cu[j]->u2_mode_bits_cost;
2035 
2036                                 /* satd and mpm bits accumulation of best CU size candidate */
2037                                 i4_ctb_acc_satd += child_satd[j];
2038 
2039                                 blk_cnt += 1;
2040                                 ps_ed_blk_l1 += 1;
2041                                 //ps_row_cu++;
2042                             }
2043 
2044                             /* cost accumulation of best CU size candidate */
2045                             i8_frame_acc_satd_cost += child_cost_least;
2046                         }
2047 
2048                     }  //else of EIID
2049 #endif
2050                 }  // if(merge_16x16_l1)
2051                 /* MAX CU SIZE 8x8 */
2052                 else
2053                 {
2054 #if IP_DBG_L1_l2
2055                     for(i = 0; i < 4; i++)
2056                     {
2057                         ps_cu_node->ps_parent->u1_cu_size = 8;
2058                         ps_cu_node->ps_parent->u2_x0 =
2059                             gau1_cu_pos_x[blk_cnt]; /* Populate properly */
2060                         ps_cu_node->ps_parent->u2_y0 =
2061                             gau1_cu_pos_y[blk_cnt]; /* Populate properly */
2062                         ps_cu_node->ps_parent->best_mode = ps_ed_blk_l1->best_mode;
2063 
2064                         ihevce_update_cand_list(ps_cu_node->ps_parent, ps_ed_blk_l1, ps_ctxt);
2065                         blk_cnt++;
2066                         ps_ed_blk_l1++;
2067                         ps_row_cu++;
2068                         merge_64x64 = 0;
2069                     }
2070 #else
2071 
2072                     /* EIID: Skip all four 8x8 blocks if the L1 decision says skip intra */
2073                     if(ps_ed_blk_l1->intra_or_inter == 2 && (ps_ctxt->i4_slice_type != ISLICE))
2074                     {
2075                         WORD32 i4_q_scale_q3_mod;
2076                         WORD8 i1_cu_possible_qp;
2077                         WORD32 i4_act_factor;
2078 
2079                         merge_64x64 = 0;
2080 
2081                         ps_intra32_analyse->b1_merge_flag = 0;
2082 
2083                         ps_intra16_analyse->au1_best_modes_8x8_tu[0] = 255;
2084                         ps_intra16_analyse->au1_best_modes_8x8_tu[1] = 255;
2085                         ps_intra16_analyse->au1_best_modes_8x8_tu[2] = 255;
2086 
2087                         ps_intra16_analyse->au1_best_modes_16x16_tu[0] = 255;
2088                         ps_intra16_analyse->au1_best_modes_16x16_tu[1] = 255;
2089                         ps_intra16_analyse->au1_best_modes_16x16_tu[2] = 255;
2090                         ps_intra16_analyse->b1_split_flag = 1;
2091                         ps_intra16_analyse->b1_valid_cu = 0;
2092                         ps_intra16_analyse->b1_merge_flag = 0;
2093 
2094                         for(i = 0; i < 4; i++)
2095                         {
2096                             intra8_analyse_t *ps_intra8_analyse;
2097                             WORD32 ctr_sub_cu;
2098 
2099                             cu_pos_x = gau1_cu_pos_x[blk_cnt];
2100                             cu_pos_y = gau1_cu_pos_y[blk_cnt];
2101 
2102                             if((cu_pos_x < num_8x8_blks_x) && (cu_pos_y < num_8x8_blks_y))
2103                             {
2104                                 ps_intra8_analyse = &ps_intra16_analyse->as_intra8_analyse[i];
2105 
2106                                 ps_intra8_analyse->b1_valid_cu = 0;
2107                                 ps_intra8_analyse->b1_enable_nxn = 0;
2108                                 ps_intra8_analyse->au1_4x4_best_modes[0][0] = 255;
2109                                 ps_intra8_analyse->au1_4x4_best_modes[1][0] = 255;
2110                                 ps_intra8_analyse->au1_4x4_best_modes[2][0] = 255;
2111                                 ps_intra8_analyse->au1_4x4_best_modes[3][0] = 255;
2112                                 ps_intra8_analyse->au1_best_modes_4x4_tu[0] = 255;
2113                                 ps_intra8_analyse->au1_best_modes_8x8_tu[0] = 255;
2114 
2115                                 ps_cu_node->ps_parent->u1_cu_size = 8;
2116                                 ps_cu_node->ps_parent->u2_x0 =
2117                                     gau1_cu_pos_x[blk_cnt]; /* Populate properly */
2118                                 ps_cu_node->ps_parent->u2_y0 =
2119                                     gau1_cu_pos_y[blk_cnt]; /* Populate properly */
2120                                 ps_cu_node->ps_parent->best_mode =
2121                                     INTRA_DC;  //ps_ed_blk_l1->best_mode;
2122 
2123                                 /* fill in the first modes as invalid */
2124 
2125                                 ps_cu_node->ps_parent->au1_best_mode_1tu[0] = INTRA_DC;
2126                                 ps_cu_node->ps_parent->au1_best_mode_1tu[1] =
2127                                     INTRA_DC;  //for safery. Since update_cand_list will set num_modes as 3
2128                                 ps_cu_node->ps_parent->au1_best_mode_1tu[2] = INTRA_DC;
2129 
2130                                 ps_cu_node->ps_parent->au1_best_mode_4tu[0] = INTRA_DC;
2131                                 ps_cu_node->ps_parent->au1_best_mode_4tu[1] = INTRA_DC;
2132                                 ps_cu_node->ps_parent->au1_best_mode_4tu[2] = INTRA_DC;
2133 
2134                                 ihevce_update_cand_list(
2135                                     ps_cu_node->ps_parent, ps_ed_blk_l1, ps_ctxt);
2136 
2137                                 //ps_row_cu->s_cu_intra_cand.b6_num_intra_cands = 0;
2138                                 //ps_row_cu->u1_num_intra_rdopt_cands = 0;
2139 
2140                                 for(ctr_sub_cu = 0; ctr_sub_cu < 4; ctr_sub_cu++)
2141                                 {
2142                                     ps_cu_node->ps_sub_cu[ctr_sub_cu]->au1_best_mode_1tu[0] =
2143                                         INTRA_DC;
2144                                     ps_cu_node->ps_sub_cu[ctr_sub_cu]->au1_best_mode_4tu[0] =
2145                                         INTRA_DC;
2146                                     ps_cu_node->ps_sub_cu[ctr_sub_cu]->au4_best_cost_1tu[0] =
2147                                         MAX_INTRA_COST_IPE;
2148 
2149                                     ps_cu_node->ps_sub_cu[ctr_sub_cu]->au4_best_cost_4tu[0] =
2150                                         MAX_INTRA_COST_IPE;
2151                                     ps_cu_node->ps_sub_cu[ctr_sub_cu]->best_cost =
2152                                         MAX_INTRA_COST_IPE;
2153                                 }
2154 
2155                                 pi4_intra_8_cost[(i & 1) + (MAX_CU_IN_CTB_ROW * (i >> 1))] =
2156                                     MAX_INTRA_COST_IPE;
2157 
2158                                 ASSERT(((blk_cnt >> 2) & 0xF) == (blk_cnt >> 2));
2159                                 ASSERT(ps_ed_ctb_l1->i4_8x8_satd[(blk_cnt >> 2)][1] != -2);
2160                                 i1_cu_possible_qp = ihevce_cu_level_qp_mod(
2161                                     ps_ctxt->i4_qscale,
2162                                     ps_ed_ctb_l1->i4_8x8_satd[(blk_cnt >> 2)][1],
2163                                     ps_ctxt->ld_curr_frame_8x8_log_avg[1],
2164                                     f_strength,
2165                                     &i4_act_factor,
2166                                     &i4_q_scale_q3_mod,
2167                                     ps_ctxt->ps_rc_quant_ctxt);
2168 
2169                                 /* set neighbours even if intra is not evaluated, since source is always available. */
2170                                 ihevce_set_nbr_map(
2171                                     ps_ctxt->pu1_ctb_nbr_map,
2172                                     ps_ctxt->i4_nbr_map_strd,
2173                                     ps_cu_node->ps_parent->u2_x0 << 1,
2174                                     ps_cu_node->ps_parent->u2_y0 << 1,
2175                                     (ps_cu_node->ps_parent->u1_cu_size >> 2),
2176                                     1);
2177 
2178                                 //ps_row_cu++;
2179                             }
2180                             blk_cnt++;
2181                             ps_ed_blk_l1++;
2182                         }
2183                     }
2184                     else
2185                     {
2186                         //cu_intra_cand_t *ps_cu_intra_cand;
2187                         WORD8 i1_cu_possible_qp;
2188                         WORD32 i4_act_factor;
2189                         WORD32 i4_q_scale_q3_mod;
2190 
2191                         ASSERT(((blk_cnt >> 2) & 0xF) == (blk_cnt >> 2));
2192                         ASSERT(ps_ed_ctb_l1->i4_8x8_satd[(blk_cnt >> 2)][1] != -2);
2193                         i1_cu_possible_qp = ihevce_cu_level_qp_mod(
2194                             ps_ctxt->i4_qscale,
2195                             ps_ed_ctb_l1->i4_8x8_satd[(blk_cnt >> 2)][1],
2196                             ps_ctxt->ld_curr_frame_8x8_log_avg[1],
2197                             f_strength,
2198                             &i4_act_factor,
2199                             &i4_q_scale_q3_mod,
2200                             ps_ctxt->ps_rc_quant_ctxt);
2201 
2202                         /* 64x64 merge is not possible */
2203                         merge_64x64 = 0;
2204 
2205                         ps_intra32_analyse->b1_merge_flag = 0;
2206 
2207                         ps_intra16_analyse->b1_merge_flag = 0;
2208 
2209                         /* by default 16x16 modes are set to default values DC and Planar */
2210                         ps_intra16_analyse->au1_best_modes_8x8_tu[0] = 0;
2211                         ps_intra16_analyse->au1_best_modes_8x8_tu[1] = 1;
2212                         ps_intra16_analyse->au1_best_modes_8x8_tu[2] = 255;
2213 
2214                         ps_intra16_analyse->au1_best_modes_16x16_tu[0] = 0;
2215                         ps_intra16_analyse->au1_best_modes_16x16_tu[1] = 1;
2216                         ps_intra16_analyse->au1_best_modes_16x16_tu[2] = 255;
2217                         ps_intra16_analyse->b1_split_flag = 1;
2218                         ps_intra16_analyse->b1_valid_cu = 1;
2219 
2220                         for(i = 0; i < 4; i++)
2221                         {
2222                             intra8_analyse_t *ps_intra8_analyse;
2223                             cu_pos_x = gau1_cu_pos_x[blk_cnt];
2224                             cu_pos_y = gau1_cu_pos_y[blk_cnt];
2225                             if((cu_pos_x < num_8x8_blks_x) && (cu_pos_y < num_8x8_blks_y))
2226                             {
2227                                 //ps_cu_intra_cand = &ps_row_cu->s_cu_intra_cand;
2228                                 //ps_cu_node->ps_parent->best_cost = MAX_INTRA_COST_IPE;
2229 
2230                                 //ps_cu_node->ps_parent->best_mode = ps_ed_blk_l1->best_mode;
2231 
2232                                 child_cost_least = 0;
2233 
2234                                 ps_intra8_analyse = &ps_intra16_analyse->as_intra8_analyse[i];
2235                                 ps_cu_node->ps_parent->u1_cu_size = 8;
2236                                 ps_cu_node->ps_parent->u2_x0 =
2237                                     gau1_cu_pos_x[blk_cnt]; /* Populate properly */
2238                                 ps_cu_node->ps_parent->u2_y0 =
2239                                     gau1_cu_pos_y[blk_cnt]; /* Populate properly */
2240 
2241                                 //memcpy(ps_ctxt->ai1_ctb_mode_map_temp, ps_ctxt->ai1_ctb_mode_map, sizeof(ps_ctxt->ai1_ctb_mode_map));
2242 
2243                                 /*EARLY DECISION 8x8 block */
2244                                 ihevce_pu_calc_8x8_blk(
2245                                     ps_curr_src, ps_ctxt, ps_cu_node, ps_ctxt->ps_func_selector);
2246                                 for(j = 0; j < 4; j++)
2247                                 {
2248                                     child_cost_least += ps_cu_node->ps_sub_cu[j]->best_cost;
2249                                     child_satd[j] = ps_cu_node->ps_sub_cu[j]->best_satd;
2250                                 }
2251 
2252                                 /* Based on the flag, CU = 4TU modes decision can be disabled, CU = 4PU is retained */
2253                                 if(0 == ps_ctxt->u1_disable_child_cu_decide)
2254                                 {
2255                                     ihevce_set_nbr_map(
2256                                         ps_ctxt->pu1_ctb_nbr_map,
2257                                         ps_ctxt->i4_nbr_map_strd,
2258                                         ps_cu_node->ps_parent->u2_x0 << 1,
2259                                         ps_cu_node->ps_parent->u2_y0 << 1,
2260                                         (ps_cu_node->ps_parent->u1_cu_size >> 2),
2261                                         0);
2262 
2263                                     //memcpy(ps_ctxt->ai1_ctb_mode_map_temp, ps_ctxt->ai1_ctb_mode_map, sizeof(ps_ctxt->ai1_ctb_mode_map));
2264 
2265                                     /* Eval for TUSize = CuSize */
2266                                     ihevce_mode_eval_filtering(
2267                                         ps_cu_node->ps_parent,
2268                                         ps_cu_node,
2269                                         ps_ctxt,
2270                                         ps_curr_src,
2271                                         26,
2272                                         &ps_cu_node->ps_parent->au4_best_cost_1tu[0],
2273                                         &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
2274                                         step2_bypass,
2275                                         1);
2276 
2277                                     if(i4_enable_1cu_4tu)
2278                                     {
2279                                         /* Eval for TUSize = CuSize/2 */
2280                                         ihevce_mode_eval_filtering(
2281                                             ps_cu_node->ps_parent,
2282                                             ps_cu_node,
2283                                             ps_ctxt,
2284                                             ps_curr_src,
2285                                             26,
2286                                             &ps_cu_node->ps_parent->au4_best_cost_4tu[0],
2287                                             &ps_cu_node->ps_parent->au1_best_mode_4tu[0],
2288                                             step2_bypass,
2289                                             0);
2290                                     }
2291                                     else
2292                                     {
2293                                         /* 4TU not evaluated :  4tu modes set same as 1tu modes */
2294                                         memcpy(
2295                                             &ps_cu_node->ps_parent->au1_best_mode_4tu[0],
2296                                             &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
2297                                             NUM_BEST_MODES);
2298 
2299                                         /* 4TU not evaluated : currently 4tu cost set same as 1tu cost */
2300                                         memcpy(
2301                                             &ps_cu_node->ps_parent->au4_best_cost_4tu[0],
2302                                             &ps_cu_node->ps_parent->au4_best_cost_1tu[0],
2303                                             NUM_BEST_MODES * sizeof(WORD32));
2304                                     }
2305 
2306                                     /* Update parent cost */
2307                                     parent_cost =
2308                                         MIN(ps_cu_node->ps_parent->au4_best_cost_4tu[0],
2309                                             ps_cu_node->ps_parent->au4_best_cost_1tu[0]);
2310 
2311                                     /* Select the best mode to be populated as top and left nbr depending on the
2312                             4tu and 1tu cost */
2313                                     if(ps_cu_node->ps_parent->au4_best_cost_4tu[0] >
2314                                        ps_cu_node->ps_parent->au4_best_cost_1tu[0])
2315                                     {
2316                                         ps_cu_node->ps_parent->best_mode =
2317                                             ps_cu_node->ps_parent->au1_best_mode_1tu[0];
2318                                     }
2319                                     else
2320                                     {
2321                                         ps_cu_node->ps_parent->best_mode =
2322                                             ps_cu_node->ps_parent->au1_best_mode_4tu[0];
2323                                     }
2324                                 }
2325 
2326                                 /* set the CU valid flag */
2327                                 ps_intra8_analyse->b1_valid_cu = 1;
2328                                 ps_intra8_analyse->b1_enable_nxn = 0;
2329 
2330                                 /* storing the modes to intra 8 analyse */
2331 
2332                                 /* store the best 8x8 modes 8x8 tu */
2333                                 memcpy(
2334                                     &ps_intra8_analyse->au1_best_modes_8x8_tu[0],
2335                                     &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
2336                                     sizeof(UWORD8) * (NUM_BEST_MODES));
2337                                 ps_intra8_analyse->au1_best_modes_8x8_tu[NUM_BEST_MODES] = 255;
2338 
2339                                 /* store the best 8x8 modes 4x4 tu */
2340                                 memcpy(
2341                                     &ps_intra8_analyse->au1_best_modes_4x4_tu[0],
2342                                     &ps_cu_node->ps_parent->au1_best_mode_4tu[0],
2343                                     sizeof(UWORD8) * (NUM_BEST_MODES));
2344                                 ps_intra8_analyse->au1_best_modes_4x4_tu[NUM_BEST_MODES] = 255;
2345 
2346                                 /*As 8*8 has won, pick L1 4x4 qp which is equal to
2347                                 L1 8x8 Qp*/
2348                                 //ps_row_cu->u1_cu_possible_qp[0] = u1_cu_possible_qp;
2349                                 //ps_row_cu->i4_act_factor[0][1] = i4_act_factor;
2350 
2351                                 parent_best_mode = ps_cu_node->ps_parent->best_mode;
2352                                 if(parent_cost <=
2353                                    child_cost_least +
2354                                        (ps_ctxt->i4_ol_satd_lambda * CHILD_BIAS >> LAMBDA_Q_SHIFT))
2355                                 {
2356                                     /*CU = 4TU */
2357                                     ihevce_update_cand_list(
2358                                         ps_cu_node->ps_parent, ps_ed_blk_l1, ps_ctxt);
2359 
2360                                     /* store the child 8x8 costs */
2361                                     pi4_intra_8_cost[(i & 1) + (MAX_CU_IN_CTB_ROW * (i >> 1))] =
2362                                         parent_cost;
2363 
2364                                     /* cost accumulation of best CU size candidate */
2365                                     i8_frame_acc_satd_cost += parent_cost;
2366 
2367                                     /*satd/mod_qp accumulation of best cu */
2368                                     i8_frame_acc_satd_by_modqp_q10 +=
2369                                         ((LWORD64)ps_cu_node->ps_parent->best_satd
2370                                          << (SATD_BY_ACT_Q_FAC + QSCALE_Q_FAC_3)) /
2371                                         i4_q_scale_q3_mod;
2372 
2373                                     /* Accumulate mode bits of the parent CU */
2374                                     i8_frame_acc_mode_bits_cost +=
2375                                         ps_cu_node->ps_parent->u2_mode_bits_cost;
2376 
2377                                     /* satd and mpm bits accumulation of best CU size candidate */
2378                                     i4_ctb_acc_satd += ps_cu_node->ps_parent->best_satd;
2379 
2380                                     /* accumulate the 16x16 cost*/
2381                                     if(MAX_INTRA_COST_IPE == *pi4_intra_16_cost)
2382                                     {
2383                                         *pi4_intra_16_cost = parent_cost;
2384                                     }
2385                                     else
2386                                     {
2387                                         *pi4_intra_16_cost += parent_cost;
2388                                     }
2389 
2390                                     /* accumulate the 32x32 cost*/
2391                                     if(MAX_INTRA_COST_IPE == *pi4_intra_32_cost)
2392                                     {
2393                                         *pi4_intra_32_cost = parent_cost;
2394                                     }
2395                                     else
2396                                     {
2397                                         *pi4_intra_32_cost += parent_cost;
2398                                     }
2399                                 }
2400                                 else
2401                                 {
2402                                     /*CU = 4PU*/
2403                                     //ps_row_cu->b3_cu_pos_x = (UWORD8) ps_cu_node->ps_parent->u2_x0;
2404                                     //ps_row_cu->b3_cu_pos_y = (UWORD8) ps_cu_node->ps_parent->u2_y0;
2405                                     //ps_row_cu->u1_cu_size  = ps_cu_node->ps_parent->u1_cu_size;
2406 
2407                                     /* store the child 8x8 cost as the summed cost of the 4x4 PUs */
2408                                     pi4_intra_8_cost[(i & 1) + (MAX_CU_IN_CTB_ROW * (i >> 1))] =
2409                                         (child_cost_least);
2410 
2411                                     /* accumulate the 16x16 cost*/
2412                                     if(MAX_INTRA_COST_IPE == *pi4_intra_16_cost)
2413                                     {
2414                                         *pi4_intra_16_cost = child_cost_least;
2415                                     }
2416                                     else
2417                                     {
2418                                         *pi4_intra_16_cost += child_cost_least;
2419                                     }
2420 
2421                                     /* cost accumulation of best CU size candidate */
2422                                     i8_frame_acc_satd_cost += child_cost_least;
2423 
2424                                     for(j = 0; j < 4; j++)
2425                                     {
2426                                         /* satd/qp accumulation */
2427                                         i8_frame_acc_satd_by_modqp_q10 +=
2428                                             ((LWORD64)child_satd[j]
2429                                              << (SATD_BY_ACT_Q_FAC + QSCALE_Q_FAC_3)) /
2430                                             i4_q_scale_q3_mod;
2431 
2432                                         /* Accumulate mode bits for all child blocks */
2433                                         i8_frame_acc_mode_bits_cost +=
2434                                             ps_cu_node->ps_sub_cu[j]->u2_mode_bits_cost;
2435 
2436                                         /* satd and mpm bits accumulation of best CU size candidate */
2437                                         i4_ctb_acc_satd += child_satd[j];
2438                                     }
2439 
2440                                     /* accumulate the 32x32 cost*/
2441                                     if(MAX_INTRA_COST_IPE == *pi4_intra_32_cost)
2442                                     {
2443                                         *pi4_intra_32_cost = child_cost_least;
2444                                     }
2445                                     else
2446                                     {
2447                                         *pi4_intra_32_cost += child_cost_least;
2448                                     }
2449 
2450                                     ps_intra8_analyse->b1_enable_nxn = 1;
2451 
2452                                     /* Insert the best 8x8 modes unconditionally */
2453 
2454                                     x = ((ps_cu_node->u2_x0 << 3) >> 2) + 1;
2455                                     y = ((ps_cu_node->u2_y0 << 3) >> 2) + 1;
2456                                     size = ps_cu_node->u1_cu_size >> 2;
2457 
2458                                     ps_ctxt->au1_ctb_mode_map[y][x] =
2459                                         ps_cu_node->ps_sub_cu[0]->best_mode;
2460                                     ps_ctxt->au1_ctb_mode_map[y][x + 1] =
2461                                         ps_cu_node->ps_sub_cu[1]->best_mode;
2462                                     ps_ctxt->au1_ctb_mode_map[y + 1][x] =
2463                                         ps_cu_node->ps_sub_cu[2]->best_mode;
2464                                     ps_ctxt->au1_ctb_mode_map[y + 1][x + 1] =
2465                                         ps_cu_node->ps_sub_cu[3]->best_mode;
2466                                 }
2467                                 /* NXN mode population */
2468                                 for(j = 0; j < 4; j++)
2469                                 {
2470                                     cand_mode_list[0] =
2471                                         ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[0];
2472                                     cand_mode_list[1] =
2473                                         ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[1];
2474                                     cand_mode_list[2] =
2475                                         ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[2];
2476 
2477                                     if(1)
2478                                     {
2479                                         /* Populate best 3 nxn modes */
2480                                         ps_intra8_analyse->au1_4x4_best_modes[j][0] =
2481                                             cand_mode_list[0];
2482                                         ps_intra8_analyse->au1_4x4_best_modes[j][1] =
2483                                             cand_mode_list[1];  //(ps_ed + 1)->best_mode;
2484                                         ps_intra8_analyse->au1_4x4_best_modes[j][2] =
2485                                             cand_mode_list[2];  //(ps_ed + 2)->best_mode;
2486                                         ps_intra8_analyse->au1_4x4_best_modes[j][3] = 255;
2487 
2488                                         //memcpy(ps_intra8_analyse->au1_4x4_best_modes[j], ps_row_cu->s_cu_intra_cand.au1_intra_luma_modes_nxn[j], 4);
2489                                     }
2490                                     /* For HQ, all 35 modes to be used for RDOPT, removed from here for memory clean-up */
2491 
2492                                     else /* IHEVCE_QUALITY_P0 == i4_quality_preset */
2493                                     {
2494                                         /* To indicate to enc loop that NXN is enabled in HIGH QUALITY fior CU 8x8*/
2495                                         ps_intra8_analyse->au1_4x4_best_modes[j][0] = 0;
2496                                     }
2497 
2498                                     ps_intra8_analyse
2499                                         ->au1_4x4_best_modes[j][MAX_INTRA_CU_CANDIDATES] = 255;
2500                                 }
2501 
2502                                 //ps_row_cu++;
2503                             }
2504                             else
2505                             {
2506                                 /* For Incomplete CTB, 16x16 is not valid */
2507                                 ps_intra16_analyse->b1_valid_cu = 0;
2508                             }
2509                             blk_cnt++;
2510                             ps_ed_blk_l1++;
2511                         }
2512                         //ps_ed_blk_l2 ++;
2513                     }  //else of EIID
2514 #endif
2515                 }
2516             }
2517             else
2518             {
2519                 /* For incomplete CTB, init valid CU to 0 */
2520                 ps_ed_blk_l1++;
2521                 ps_intra32_analyse->b1_valid_cu = 0;
2522                 ps_intra16_analyse[0].b1_valid_cu = 0;
2523                 blk_cnt++;
2524                 merge_64x64 = 0;
2525             }
2526         } while(blk_cnt != MAX_CTB_SIZE);
2527         /* if 64x64 merge is possible then check for 32x32 having same best modes */
2528         if(1 == merge_64x64)
2529         {
2530             WORD32 act_mode = au1_best_32x32_modes[0];
2531 
2532             ps_ed_blk_l2 = ps_ed_l2_ctb;
2533             best_mode = ps_ed_blk_l2->best_mode;
2534             merge_64x64 =
2535                 ((act_mode == au1_best_32x32_modes[0]) + (act_mode == au1_best_32x32_modes[1]) +
2536                      (act_mode == au1_best_32x32_modes[2]) +
2537                      (act_mode == au1_best_32x32_modes[3]) ==
2538                  4);
2539             if(merge_64x64 == 1)
2540                 best_mode = au1_best_32x32_modes[0];
2541             else
2542                 best_mode = ps_ed_blk_l2->best_mode;
2543             /* All 32x32 costs are accumalated to 64x64 cost */
2544             ps_l0_ipe_out_ctb->i4_best64x64_intra_cost = 0;
2545             for(i = 0; i < 4; i++)
2546             {
2547                 ps_l0_ipe_out_ctb->i4_best64x64_intra_cost +=
2548                     ps_l0_ipe_out_ctb->ai4_best32x32_intra_cost[i];
2549             }
2550 
2551             /* If all modes of 32x32 block is not same */
2552             if(0 == merge_64x64)
2553             {
2554                 /*Compute CHILD cost for 32x32 */
2555                 WORD32 child_cost_64x64 = au4_best_32x32_cost[0] + au4_best_32x32_cost[1] +
2556                                           au4_best_32x32_cost[2] + au4_best_32x32_cost[3];
2557                 WORD32 cost = MAX_INTRA_COST_IPE;
2558 
2559                 WORD32 best_mode_temp = 0;
2560                 /*Compute 64x64 cost for each mode of 32x32*/
2561                 for(i = 0; i < 4; i++)
2562                 {
2563                     WORD32 mode = au1_best_32x32_modes[i];
2564                     if(mode < 2)
2565                         mode = 26;
2566                     ps_cu_node->ps_parent->u1_cu_size = 64;
2567                     ps_cu_node->ps_parent->u2_x0 = gau1_cu_pos_x[0]; /* Populate properly */
2568                     ps_cu_node->ps_parent->u2_y0 = gau1_cu_pos_y[0]; /* Populate properly */
2569 
2570                     ihevce_set_nbr_map(
2571                         ps_ctxt->pu1_ctb_nbr_map,
2572                         ps_ctxt->i4_nbr_map_strd,
2573                         (ps_cu_node->ps_parent->u2_x0 << 1),
2574                         (ps_cu_node->ps_parent->u2_y0 << 1),
2575                         (ps_cu_node->ps_parent->u1_cu_size >> 2),
2576                         0);
2577 
2578                     ihevce_mode_eval_filtering(
2579                         ps_cu_node->ps_parent,
2580                         ps_cu_node,
2581                         ps_ctxt,
2582                         ps_curr_src,
2583                         mode,
2584                         &ps_cu_node->ps_parent->au4_best_cost_1tu[0],
2585                         &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
2586                         !step2_bypass,
2587                         0);
2588 
2589                     parent_cost = ps_cu_node->ps_parent->best_cost;
2590                     if(cost > parent_cost)
2591                     {
2592                         cost = parent_cost;
2593                         best_mode_temp = ps_cu_node->ps_parent->best_mode;
2594                     }
2595                 }
2596                 if(cost < child_cost_64x64)
2597                 {
2598                     merge_64x64 = 1;
2599                     best_mode = best_mode_temp;
2600 
2601                     /* Update 64x64 cost if CU 64x64 is chosen  */
2602                     ps_l0_ipe_out_ctb->i4_best64x64_intra_cost = cost;
2603 
2604                     /* Accumalate the least cost for CU 64x64 */
2605                     i8_frame_acc_satd_cost = cost;
2606                     i8_frame_acc_mode_bits_cost = ps_cu_node->ps_parent->u2_mode_bits_cost;
2607 
2608                     /* satd and mpm bits accumalation of best cu size candiate */
2609                     i4_ctb_acc_satd = ps_cu_node->ps_parent->best_satd;
2610                 }
2611             }
2612         }
2613 
2614         if(merge_64x64)
2615         {
2616             WORD32 i, j;
2617             intra32_analyse_t *ps_intra32_analyse;
2618             intra16_analyse_t *ps_intra16_analyse;
2619             WORD32 row, col;
2620             WORD32 i4_q_scale_q3_mod;
2621             WORD8 i1_cu_possible_qp;
2622             WORD32 i4_act_factor;
2623             //ps_row_cu = ps_curr_cu;
2624             ps_ctb_out->u4_cu_split_flags = 0x0;
2625             ps_ed_blk_l1 = ps_ed_l1_ctb;
2626             ps_ed_blk_l2 = ps_ed_l2_ctb;
2627 
2628             ps_l0_ipe_out_ctb->u1_split_flag = 0;
2629 
2630             /* If CU size of 64x64 is chosen, disbale all the 16x16 flag*/
2631             for(i = 0; i < 4; i++)
2632             {
2633                 /* get the corresponding intra 32 analyse pointer  use (blk_cnt / 16) */
2634                 /* blk cnt is in terms of 8x8 units so a 32x32 will have 16 8x8 units */
2635                 ps_intra32_analyse = &ps_l0_ipe_out_ctb->as_intra32_analyse[i];
2636 
2637                 for(j = 0; j < 4; j++)
2638                 {
2639                     /* get the corresponding intra 16 analyse pointer use (blk_cnt & 0xF / 4)*/
2640                     /* blk cnt is in terms of 8x8 units so a 16x16 will have 4 8x8 units */
2641                     ps_intra16_analyse = &ps_intra32_analyse->as_intra16_analyse[j];
2642                     ps_intra16_analyse->b1_merge_flag = 0;
2643                 }
2644             }
2645 
2646             /* CU size 64x64 and fill the final cu params */
2647             //ps_row_cu->b3_cu_pos_x = gau1_cu_pos_x[0];
2648             //ps_row_cu->b3_cu_pos_y = gau1_cu_pos_y[0];
2649             //ps_row_cu->u1_cu_size  = 64;
2650 
2651             /* Candidate mode Update */
2652             cand_mode_list[0] = best_mode;
2653             if(cand_mode_list[0] > 1)
2654             {
2655                 if(cand_mode_list[0] == 2)
2656                 {
2657                     cand_mode_list[1] = 34;
2658                     cand_mode_list[2] = 3;
2659                 }
2660                 else if(cand_mode_list[0] == 34)
2661                 {
2662                     cand_mode_list[1] = 2;
2663                     cand_mode_list[2] = 33;
2664                 }
2665                 else
2666                 {
2667                     cand_mode_list[1] = cand_mode_list[0] - 1;
2668                     cand_mode_list[2] = cand_mode_list[0] + 1;
2669                 }
2670                 //cand_mode_list[1] = ps_ed_blk_l1->nang_attr.best_mode;
2671                 //cand_mode_list[2] = ps_ed_blk_l1->ang_attr.best_mode;
2672             }
2673             else
2674             {
2675                 cand_mode_list[0] = 0;
2676                 cand_mode_list[1] = 1;
2677                 cand_mode_list[2] = 26;
2678                 //cand_mode_list[2] = ps_ed_blk_l1->nang_attr.best_mode;
2679             }
2680 
2681             /* All 32x32 costs are accumalated to 64x64 cost */
2682             ps_l0_ipe_out_ctb->i4_best64x64_intra_cost = 0;
2683             for(i = 0; i < 4; i++)
2684             {
2685                 ps_l0_ipe_out_ctb->i4_best64x64_intra_cost +=
2686                     ps_l0_ipe_out_ctb->ai4_best32x32_intra_cost[i];
2687             }
2688             /* by default 64x64 modes are set to default values DC and Planar */
2689             ps_l0_ipe_out_ctb->au1_best_modes_32x32_tu[0] = cand_mode_list[0];
2690             ps_l0_ipe_out_ctb->au1_best_modes_32x32_tu[1] = cand_mode_list[1];
2691             ps_l0_ipe_out_ctb->au1_best_modes_32x32_tu[2] = cand_mode_list[2];
2692             ps_l0_ipe_out_ctb->au1_best_modes_32x32_tu[3] = 255;
2693 
2694             /* Update CTB mode map for the finalised CU */
2695             x = ((ps_cu_node->u2_x0 << 3) >> 2) + 1;
2696             y = ((ps_cu_node->u2_y0 << 3) >> 2) + 1;
2697             size = ps_cu_node->u1_cu_size >> 2;
2698 
2699             for(row = y; row < (y + size); row++)
2700             {
2701                 for(col = x; col < (x + size); col++)
2702                 {
2703                     ps_ctxt->au1_ctb_mode_map[row][col] = best_mode;
2704                 }
2705             }
2706 
2707             ihevce_set_nbr_map(
2708                 ps_ctxt->pu1_ctb_nbr_map,
2709                 ps_ctxt->i4_nbr_map_strd,
2710                 (ps_cu_node->u2_x0 << 1),
2711                 (ps_cu_node->u2_y0 << 1),
2712                 (ps_cu_node->u1_cu_size >> 2),
2713                 1);
2714 
2715             /*As 64*64 has won, pick L1 32x32 qp*/
2716             //ASSERT(((blk_cnt>>6) & 0xF) == (blk_cnt>>6));
2717             //ASSERT((blk_cnt>>6) == 0);
2718             ASSERT(ps_ed_ctb_l1->i4_32x32_satd[0][0] != -2);
2719             i1_cu_possible_qp = ihevce_cu_level_qp_mod(
2720                 ps_ctxt->i4_qscale,
2721                 ps_ed_ctb_l1->i4_32x32_satd[0][0],
2722                 ps_ctxt->ld_curr_frame_32x32_log_avg[0],
2723                 f_strength,
2724                 &i4_act_factor,
2725                 &i4_q_scale_q3_mod,
2726                 ps_ctxt->ps_rc_quant_ctxt);
2727 
2728             i8_frame_acc_satd_by_modqp_q10 =
2729                 (i8_frame_acc_satd_cost << (SATD_BY_ACT_Q_FAC + QSCALE_Q_FAC_3)) /
2730                 i4_q_scale_q3_mod;
2731             /* Increment pointers */
2732             ps_ed_blk_l1 += 64;
2733             ps_ed_blk_l2 += 16;
2734             //ps_row_cu++;
2735         }
2736     }
2737 
2738     //ps_ctb_out->u1_num_cus_in_ctb = (UWORD8)(ps_row_cu - ps_curr_cu);
2739 
2740     {
2741         WORD32 i4_i, i4_j;
2742         WORD32 dummy;
2743         WORD8 i1_cu_qp;
2744         (void)i1_cu_qp;
2745         /*MAM_VAR_L1*/
2746         for(i4_j = 0; i4_j < 2; i4_j++)
2747         {
2748             i4_mod_factor_num = ps_ctxt->ai4_mod_factor_derived_by_variance[i4_j];
2749             f_strength = ps_ctxt->f_strength;
2750 
2751             //i4_mod_factor_num = 4;
2752 
2753             ps_ed_blk_l1 = ps_ed_l1_ctb;
2754             ps_ed_blk_l2 = ps_ed_l2_ctb;
2755             //ps_row_cu = ps_curr_cu;
2756 
2757             /*Valid only for complete CTB */
2758             if((64 == u1_curr_ctb_wdt) && (64 == u1_curr_ctb_hgt))
2759             {
2760                 ASSERT(ps_ed_ctb_l1->i4_32x32_satd[0][0] != -2);
2761                 ASSERT(ps_ed_ctb_l1->i4_32x32_satd[0][1] != -2);
2762                 ASSERT(ps_ed_ctb_l1->i4_32x32_satd[0][2] != -2);
2763                 ASSERT(ps_ed_ctb_l1->i4_32x32_satd[0][3] != -2);
2764 
2765                 i1_cu_qp = ihevce_cu_level_qp_mod(
2766                     ps_ctxt->i4_qscale,
2767                     ps_ed_ctb_l1->i4_32x32_satd[0][0],
2768                     ps_ctxt->ld_curr_frame_32x32_log_avg[0],
2769                     f_strength,
2770                     &ps_l0_ipe_out_ctb->i4_64x64_act_factor[0][i4_j],
2771                     &dummy,
2772                     ps_ctxt->ps_rc_quant_ctxt);
2773 
2774                 i1_cu_qp = ihevce_cu_level_qp_mod(
2775                     ps_ctxt->i4_qscale,
2776                     ps_ed_ctb_l1->i4_32x32_satd[0][1],
2777                     ps_ctxt->ld_curr_frame_32x32_log_avg[1],
2778                     f_strength,
2779                     &ps_l0_ipe_out_ctb->i4_64x64_act_factor[1][i4_j],
2780                     &dummy,
2781                     ps_ctxt->ps_rc_quant_ctxt);
2782                 i1_cu_qp = ihevce_cu_level_qp_mod(
2783                     ps_ctxt->i4_qscale,
2784                     ps_ed_ctb_l1->i4_32x32_satd[0][2],
2785                     ps_ctxt->ld_curr_frame_32x32_log_avg[2],
2786                     f_strength,
2787                     &ps_l0_ipe_out_ctb->i4_64x64_act_factor[2][i4_j],
2788                     &dummy,
2789                     ps_ctxt->ps_rc_quant_ctxt);
2790 
2791                 i1_cu_qp = ihevce_cu_level_qp_mod(
2792                     ps_ctxt->i4_qscale,
2793                     ps_ed_ctb_l1->i4_32x32_satd[0][3],
2794                     2.0 + ps_ctxt->ld_curr_frame_16x16_log_avg[0],
2795                     f_strength,
2796                     &ps_l0_ipe_out_ctb->i4_64x64_act_factor[3][i4_j],
2797                     &dummy,
2798                     ps_ctxt->ps_rc_quant_ctxt);
2799 
2800                 ASSERT(ps_l0_ipe_out_ctb->i4_64x64_act_factor[3][i4_j] > 0);
2801             }
2802             else
2803             {
2804                 ps_l0_ipe_out_ctb->i4_64x64_act_factor[0][i4_j] = 1024;
2805                 ps_l0_ipe_out_ctb->i4_64x64_act_factor[1][i4_j] = 1024;
2806                 ps_l0_ipe_out_ctb->i4_64x64_act_factor[2][i4_j] = 1024;
2807                 ps_l0_ipe_out_ctb->i4_64x64_act_factor[3][i4_j] = 1024;
2808             }
2809 
2810             /*Store the 8x8 Qps from L2 (in raster order) as output of intra prediction
2811             for the usage by ME*/
2812 
2813             {
2814                 WORD32 pos_x_32, pos_y_32, pos;
2815                 //WORD32 i4_incomplete_ctb_val_8;
2816                 pos_x_32 = u1_curr_ctb_wdt / 16;
2817                 pos_y_32 = u1_curr_ctb_hgt / 16;
2818 
2819                 pos = (pos_x_32 < pos_y_32) ? pos_x_32 : pos_y_32;
2820 
2821                 for(i4_i = 0; i4_i < 4; i4_i++)
2822                 {
2823                     if(i4_i < pos)
2824                     {
2825                         ASSERT(ps_ed_ctb_l1->i4_16x16_satd[i4_i][0] != -2);
2826                         ASSERT(ps_ed_ctb_l1->i4_16x16_satd[i4_i][1] != -2);
2827                         ASSERT(ps_ed_ctb_l1->i4_16x16_satd[i4_i][2] != -2);
2828                         i1_cu_qp = ihevce_cu_level_qp_mod(
2829                             ps_ctxt->i4_qscale,
2830                             ps_ed_ctb_l1->i4_16x16_satd[i4_i][0],
2831                             ps_ctxt->ld_curr_frame_16x16_log_avg[0],
2832                             f_strength,
2833                             &ps_l0_ipe_out_ctb->i4_32x32_act_factor[i4_i][0][i4_j],
2834                             &dummy,
2835                             ps_ctxt->ps_rc_quant_ctxt);
2836                         i1_cu_qp = ihevce_cu_level_qp_mod(
2837                             ps_ctxt->i4_qscale,
2838                             ps_ed_ctb_l1->i4_16x16_satd[i4_i][1],
2839                             ps_ctxt->ld_curr_frame_16x16_log_avg[1],
2840                             f_strength,
2841                             &ps_l0_ipe_out_ctb->i4_32x32_act_factor[i4_i][1][i4_j],
2842                             &dummy,
2843                             ps_ctxt->ps_rc_quant_ctxt);
2844                         i1_cu_qp = ihevce_cu_level_qp_mod(
2845                             ps_ctxt->i4_qscale,
2846                             ps_ed_ctb_l1->i4_16x16_satd[i4_i][2],
2847                             ps_ctxt->ld_curr_frame_16x16_log_avg[2],
2848                             f_strength,
2849                             &ps_l0_ipe_out_ctb->i4_32x32_act_factor[i4_i][2][i4_j],
2850                             &dummy,
2851                             ps_ctxt->ps_rc_quant_ctxt);
2852                     }
2853                     else
2854                     {
2855                         /*For incomplete CTB */
2856                         ps_l0_ipe_out_ctb->i4_32x32_act_factor[i4_i][0][i4_j] = 1024;
2857                         ps_l0_ipe_out_ctb->i4_32x32_act_factor[i4_i][1][i4_j] = 1024;
2858                         ps_l0_ipe_out_ctb->i4_32x32_act_factor[i4_i][2][i4_j] = 1024;
2859                     }
2860                 }
2861             }
2862 
2863             /*Store the 8x8 Qps from L1 (in raster order) as output of intra prediction
2864             for the usage by ME*/
2865             {
2866                 WORD32 pos_x_16, pos_y_16, pos;
2867                 //WORD32 i4_incomplete_ctb_val_8;
2868                 pos_x_16 = u1_curr_ctb_wdt / 4;
2869                 pos_y_16 = u1_curr_ctb_hgt / 4;
2870 
2871                 pos = (pos_x_16 < pos_y_16) ? pos_x_16 : pos_y_16;
2872                 for(i4_i = 0; i4_i < 16; i4_i++)
2873                 {
2874                     if(i4_i < pos)
2875                     {
2876                         ASSERT(ps_ed_ctb_l1->i4_8x8_satd[i4_i][0] != -2);
2877                         ASSERT(ps_ed_ctb_l1->i4_8x8_satd[i4_i][1] != -2);
2878                         i1_cu_qp = ihevce_cu_level_qp_mod(
2879                             ps_ctxt->i4_qscale,
2880                             ps_ed_ctb_l1->i4_8x8_satd[i4_i][0],
2881                             ps_ctxt->ld_curr_frame_8x8_log_avg[0],
2882                             f_strength,
2883                             &ps_l0_ipe_out_ctb->i4_16x16_act_factor[i4_i][0][i4_j],
2884                             &dummy,
2885                             ps_ctxt->ps_rc_quant_ctxt);
2886                         i1_cu_qp = ihevce_cu_level_qp_mod(
2887                             ps_ctxt->i4_qscale,
2888                             ps_ed_ctb_l1->i4_8x8_satd[i4_i][1],
2889                             ps_ctxt->ld_curr_frame_8x8_log_avg[1],
2890                             f_strength,
2891                             &ps_l0_ipe_out_ctb->i4_16x16_act_factor[i4_i][1][i4_j],
2892                             &dummy,
2893                             ps_ctxt->ps_rc_quant_ctxt);
2894                     }
2895                     else
2896                     {
2897                         /*For incomplete CTB */
2898                         ps_l0_ipe_out_ctb->i4_16x16_act_factor[i4_i][0][i4_j] = 1024;
2899                         ps_l0_ipe_out_ctb->i4_16x16_act_factor[i4_i][1][i4_j] = 1024;
2900                     }
2901                 }
2902             }
2903         }  //for loop
2904 
2905         /* Accumalate the cost of ctb to the total cost */
2906         ps_ctxt->i8_frame_acc_satd_cost += i8_frame_acc_satd_cost;
2907         ps_ctxt->i8_frame_acc_satd_by_modqp_q10 += i8_frame_acc_satd_by_modqp_q10;
2908 
2909         ps_ctxt->i8_frame_acc_mode_bits_cost += i8_frame_acc_mode_bits_cost;
2910 
2911         /* satd and mpm bits accumalation of best cu size candiate for the ctb */
2912         ps_l0_ipe_out_ctb->i4_ctb_acc_satd = i4_ctb_acc_satd;
2913         ps_l0_ipe_out_ctb->i4_ctb_acc_mpm_bits = i8_frame_acc_mode_bits_cost;
2914 
2915         ps_ctxt->i8_frame_acc_satd += i4_ctb_acc_satd;
2916     }
2917 
2918     {
2919         WORD32 ctr_8x8;
2920         for(ctr_8x8 = 0; ctr_8x8 < (MAX_CU_IN_CTB >> 2); ctr_8x8++)
2921         {
2922             /*Accumalate activity factor for Intra and Inter*/
2923             if(ps_l0_ipe_out_ctb->ai4_best_sad_cost_8x8_l1_ipe[ctr_8x8] <
2924                ps_ed_ctb_l1->i4_sad_me_for_ref[ctr_8x8])
2925             {
2926                 ps_l0_ipe_out_ctb->ai4_8x8_act_factor[ctr_8x8] =
2927                     ps_l0_ipe_out_ctb->i4_16x16_act_factor[ctr_8x8][1][0];
2928             }
2929             else
2930             {
2931                 ps_l0_ipe_out_ctb->ai4_8x8_act_factor[ctr_8x8] =
2932                     ps_l0_ipe_out_ctb->i4_16x16_act_factor[ctr_8x8][1][0];
2933             }
2934 
2935             /*Accumalate activity factor at frame level*/
2936             ps_ctxt->i8_frame_acc_act_factor += ps_l0_ipe_out_ctb->ai4_8x8_act_factor[ctr_8x8];
2937         }
2938     }
2939     return;
2940 }
2941 
/*!
******************************************************************************
* \if Function name : ihevce_nxn_sad_computer \endif
*
* \brief
*    Computes the sum of absolute differences (SAD) between two square
*    blocks of UWORD8 samples, each trans_size x trans_size in extent
*
* \param[in] pu1_inp : pointer to the first row of the input block
* \param[in] i4_inp_stride : offset, in samples, between consecutive input rows
* \param[in] pu1_ref : pointer to the first row of the reference block
* \param[in] i4_ref_stride : offset, in samples, between consecutive reference rows
* \param[in] trans_size : width and height of the block, in samples
*
* \return
*    Accumulated SAD of the block; 0 when trans_size is not positive
*
*****************************************************************************
*/
WORD32 ihevce_nxn_sad_computer(
    UWORD8 *pu1_inp, WORD32 i4_inp_stride, UWORD8 *pu1_ref, WORD32 i4_ref_stride, WORD32 trans_size)
{
    WORD32 acc_sad = 0;
    WORD32 row;

    for(row = 0; row < trans_size; row++)
    {
        /* Start of the current row in each buffer */
        UWORD8 *pu1_inp_row = pu1_inp + (row * i4_inp_stride);
        UWORD8 *pu1_ref_row = pu1_ref + (row * i4_ref_stride);
        WORD32 col;

        for(col = 0; col < trans_size; col++)
        {
            /* Widen before subtracting so the difference can go negative */
            WORD32 diff = (WORD32)pu1_inp_row[col] - (WORD32)pu1_ref_row[col];

            acc_sad += (diff < 0) ? -diff : diff;
        }
    }

    return acc_sad;
}
2963 
2964 /*!
2965 ******************************************************************************
2966 * \if Function name : ihevce_mode_eval_filtering \endif
2967 *
2968 * \brief
*    Evaluates the best 3 modes for the given CU size, using probable modes from
*    the early decision structure, the MPM candidates, and the DC and planar modes
2971 *
2972 * \param[in] ps_cu_node : pointer to MAX cu node info buffer
2973 * \param[in] ps_child_cu_node : pointer to (MAX - 1) cu node info buffer
2974 * \param[in] ps_ctxt : pointer to IPE context struct
2975 * \param[in] ps_curr_src : pointer to src pixels struct
2976 * \param[in] best_amode : best angular mode from l1 layer or
2977                             from (MAX - 1) CU mode
2978 * \param[in] best_costs_4x4  : pointer to 3 best cost buffer
2979 * \param[in] best_modes_4x4  : pointer to 3 best mode buffer
* \param[in] step2_bypass : if 0, (MAX - 1) CU is evaluated
*                           if 1, (MAX CU) suggested is evaluated
2982 * \param[in] tu_eq_cu     : indicates if tu size is same as cu or cu/2
2983 *
2984 * \return
2985 *    None
2986 *
2987 * \author
2988 *  Ittiam
2989 *
2990 *****************************************************************************
2991 */
ihevce_mode_eval_filtering(ihevce_ipe_cu_tree_t * ps_cu_node,ihevce_ipe_cu_tree_t * ps_child_cu_node,ihevce_ipe_ctxt_t * ps_ctxt,iv_enc_yuv_buf_t * ps_curr_src,WORD32 best_amode,WORD32 * best_costs_4x4,UWORD8 * best_modes_4x4,WORD32 step2_bypass,WORD32 tu_eq_cu)2992 void ihevce_mode_eval_filtering(
2993     ihevce_ipe_cu_tree_t *ps_cu_node,
2994     ihevce_ipe_cu_tree_t *ps_child_cu_node,
2995     ihevce_ipe_ctxt_t *ps_ctxt,
2996     iv_enc_yuv_buf_t *ps_curr_src,
2997     WORD32 best_amode,
2998     WORD32 *best_costs_4x4,
2999     UWORD8 *best_modes_4x4,
3000     WORD32 step2_bypass,
3001     WORD32 tu_eq_cu)
3002 {
3003     UWORD8 *pu1_origin, *pu1_orig;
3004     WORD32 src_strd = ps_curr_src->i4_y_strd;
3005     WORD32 nbr_flags;
3006     nbr_avail_flags_t s_nbr;
3007     WORD32 trans_size = tu_eq_cu ? ps_cu_node->u1_cu_size : ps_cu_node->u1_cu_size >> 1;
3008     WORD32 num_tu_in_x = tu_eq_cu ? 1 : 2;
3009     WORD32 num_tu_in_y = tu_eq_cu ? 1 : 2;
3010     UWORD8 mode;
3011 
3012     WORD32 cost_ang_mode = MAX_INTRA_COST_IPE;
3013     WORD32 filter_flag;
3014     WORD32 cost_amode_step2[7] = { 0 };
3015     /*WORD32 best_sad[5];  // NOTE_A01: Not getting consumed at present */
3016     WORD32 sad = 0;
3017     WORD32 cu_pos_x, cu_pos_y;
3018     WORD32 temp;
3019     WORD32 i = 0, j, k, i_end, z;
3020     //WORD32 row, col, size;
3021     UWORD8 *pu1_ref;
3022     WORD32 xA, yA, xB, yB;
3023     WORD32 top_intra_mode;
3024     WORD32 left_intra_mode;
3025     UWORD8 *pu1_ref_orig = &ps_ctxt->au1_ref_samples[0];
3026     UWORD8 *pu1_ref_filt = &ps_ctxt->au1_filt_ref_samples[0];
3027 
3028     UWORD8 modes_4x4[5] = { 0, 1, 2, 3, 4 };
3029     WORD32 count;
3030 
3031     pf_ipe_res_trans_had apf_resd_trns_had[4];
3032 
3033     WORD32 cand_mode_satd_list[3];
3034     ihevc_intra_pred_luma_ref_substitution_ft *ihevc_intra_pred_luma_ref_substitution_fptr;
3035 
3036     ihevc_intra_pred_luma_ref_substitution_fptr =
3037         ps_ctxt->ps_func_selector->ihevc_intra_pred_luma_ref_substitution_fptr;
3038 
3039     apf_resd_trns_had[0] = ps_ctxt->s_cmn_opt_func.pf_HAD_4x4_8bit;
3040     apf_resd_trns_had[1] = ps_ctxt->s_cmn_opt_func.pf_HAD_8x8_8bit;
3041     apf_resd_trns_had[2] = ps_ctxt->s_cmn_opt_func.pf_HAD_16x16_8bit;
3042     apf_resd_trns_had[3] = ps_ctxt->s_cmn_opt_func.pf_HAD_32x32_8bit;
3043 
3044     /* initialize modes_to_eval as zero */
3045     memset(&ps_ctxt->au1_modes_to_eval, 0, MAX_NUM_IP_MODES);
3046 
3047     /* Compute the Parent Cost */
3048 
3049     /* Pointer to top-left of the CU - y0,x0 in 8x8 granularity */
3050     pu1_orig = (UWORD8 *)(ps_curr_src->pv_y_buf) + ((ps_cu_node->u2_y0 << 3) * src_strd) +
3051                (ps_cu_node->u2_x0 << 3);
3052 
3053     /* Get position of CU within CTB at 4x4 granularity */
3054     cu_pos_x = ps_cu_node->u2_x0 << 1;
3055     cu_pos_y = ps_cu_node->u2_y0 << 1;
3056 
3057     /* get the neighbour availability flags */
3058     ihevce_get_only_nbr_flag(
3059         &s_nbr,
3060         ps_ctxt->pu1_ctb_nbr_map,
3061         ps_ctxt->i4_nbr_map_strd,
3062         cu_pos_x,
3063         cu_pos_y,
3064         trans_size >> 2,
3065         trans_size >> 2);
3066 
3067     /* Traverse for all 4 child blocks in the parent block */
3068     xA = (ps_cu_node->u2_x0 << 3) >> 2;
3069     yA = ((ps_cu_node->u2_y0 << 3) >> 2) + 1;
3070     xB = xA + 1;
3071     yB = yA - 1;
3072     left_intra_mode = ps_ctxt->au1_ctb_mode_map[yA][xA];
3073     top_intra_mode = ps_ctxt->au1_ctb_mode_map[yB][xB];
3074     /* call the function which populates sad cost for all the modes */
3075 
3076     ihevce_intra_populate_mode_bits_cost_bracketing(
3077         top_intra_mode,
3078         left_intra_mode,
3079         s_nbr.u1_top_avail,
3080         s_nbr.u1_left_avail,
3081         ps_cu_node->u2_y0,
3082         &ps_ctxt->au2_mode_bits_satd_cost[0],
3083         &ps_ctxt->au2_mode_bits_satd[0],
3084         ps_ctxt->i4_ol_satd_lambda,
3085         cand_mode_satd_list);
3086 
3087     for(k = 0; k < num_tu_in_y; k++)
3088     {
3089         for(j = 0; j < num_tu_in_x; j++)
3090         {
3091             /* get the neighbour availability flags */
3092             nbr_flags = ihevce_get_nbr_intra(
3093                 &s_nbr,
3094                 ps_ctxt->pu1_ctb_nbr_map,
3095                 ps_ctxt->i4_nbr_map_strd,
3096                 cu_pos_x + ((j) * (trans_size >> 2)),
3097                 cu_pos_y + ((k) * (trans_size >> 2)),
3098                 trans_size >> 2);
3099 
3100             pu1_origin = pu1_orig + (k * trans_size * src_strd) + (j * trans_size);
3101 
3102             /* Create reference samples array */
3103             ihevc_intra_pred_luma_ref_substitution_fptr(
3104                 pu1_origin - src_strd - 1,
3105                 pu1_origin - src_strd,
3106                 pu1_origin - 1,
3107                 src_strd,
3108                 trans_size,
3109                 nbr_flags,
3110                 pu1_ref_orig,
3111                 0);
3112 
3113             /* Perform reference samples filtering */
3114             ihevce_intra_pred_ref_filtering(pu1_ref_orig, trans_size, pu1_ref_filt);
3115 
3116             ihevce_set_nbr_map(
3117                 ps_ctxt->pu1_ctb_nbr_map,
3118                 ps_ctxt->i4_nbr_map_strd,
3119                 cu_pos_x + ((j) * (trans_size >> 2)),
3120                 cu_pos_y + ((k) * (trans_size >> 2)),
3121                 (trans_size >> 2),
3122                 1);
3123 
3124             pu1_ref_orig += (4 * MAX_CTB_SIZE + 1);
3125             pu1_ref_filt += (4 * MAX_CTB_SIZE + 1);
3126         }
3127     }
3128 
3129     /* Revaluation for angular mode */
3130     //if(ps_ed_blk->ang_attr.mode_present == 1)
3131     //if(((best_amode & 0x1) != 1))
3132 
3133     {
3134         WORD32 u1_trans_idx = trans_size >> 3;
3135         if(trans_size == 32)
3136             u1_trans_idx = 3;
3137         //best_amode = ps_ed_blk->ang_attr.best_mode;
3138 
3139         i = 0;
3140         if(!step2_bypass)
3141         {
3142             /* Around best level 4 angular mode, search for best level 2 mode */
3143             ASSERT((best_amode >= 2) && (best_amode <= 34));
3144 
3145             if(ps_ctxt->i4_quality_preset <= IHEVCE_QUALITY_P3)
3146             {
3147                 if(best_amode >= 4)
3148                     ps_ctxt->au1_modes_to_eval_temp[i++] = best_amode - 2;
3149             }
3150 
3151             ps_ctxt->au1_modes_to_eval_temp[i++] = best_amode;
3152 
3153             if(ps_ctxt->i4_quality_preset <= IHEVCE_QUALITY_P3)
3154             {
3155                 if(best_amode <= 32)
3156                     ps_ctxt->au1_modes_to_eval_temp[i++] = best_amode + 2;
3157             }
3158         }
3159         else
3160         {
3161             ps_ctxt->au1_modes_to_eval_temp[i++] = ps_child_cu_node->ps_sub_cu[0]->best_mode;
3162             ps_ctxt->au1_modes_to_eval_temp[i++] = ps_child_cu_node->ps_sub_cu[1]->best_mode;
3163             ps_ctxt->au1_modes_to_eval_temp[i++] = ps_child_cu_node->ps_sub_cu[2]->best_mode;
3164             ps_ctxt->au1_modes_to_eval_temp[i++] = ps_child_cu_node->ps_sub_cu[3]->best_mode;
3165         }
3166 
3167         /* Add the left and top MPM modes for computation*/
3168 
3169         ps_ctxt->au1_modes_to_eval_temp[i++] = cand_mode_satd_list[0];
3170         ps_ctxt->au1_modes_to_eval_temp[i++] = cand_mode_satd_list[1];
3171 
3172         i_end = i;
3173         count = 0;
3174 
3175         /*Remove duplicate modes from modes_to_eval_temp[] */
3176         for(j = 0; j < i_end; j++)
3177         {
3178             for(k = 0; k < count; k++)
3179             {
3180                 if(ps_ctxt->au1_modes_to_eval_temp[j] == ps_ctxt->au1_modes_to_eval[k])
3181                     break;
3182             }
3183             if((k == count) && (ps_ctxt->au1_modes_to_eval_temp[j] > 1))
3184             {
3185                 ps_ctxt->au1_modes_to_eval[count] = ps_ctxt->au1_modes_to_eval_temp[j];
3186                 count++;
3187             }
3188         }
3189         i_end = count;
3190         if(count == 0)
3191         {
3192             ps_ctxt->au1_modes_to_eval[0] = 26;
3193             i_end = 1;
3194         }
3195 
3196         for(i = 0; i < i_end; i++)
3197         {
3198             pu1_ref_orig = &ps_ctxt->au1_ref_samples[0];
3199             pu1_ref_filt = &ps_ctxt->au1_filt_ref_samples[0];
3200 
3201             mode = ps_ctxt->au1_modes_to_eval[i];
3202             ASSERT((mode >= 2) && (mode <= 34));
3203             cost_amode_step2[i] = ps_ctxt->au2_mode_bits_satd_cost[mode];
3204             filter_flag = gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(trans_size) - 2));
3205 
3206             for(k = 0; k < num_tu_in_y; k++)
3207             {
3208                 for(j = 0; j < num_tu_in_x; j++)
3209                 {
3210                     pu1_origin = pu1_orig + (k * trans_size * src_strd) + (j * trans_size);
3211 
3212                     if(0 == filter_flag)
3213                         pu1_ref = pu1_ref_orig;
3214                     else
3215                         pu1_ref = pu1_ref_filt;
3216 
3217                     g_apf_lum_ip[g_i4_ip_funcs[mode]](
3218                         pu1_ref, 0, &ps_ctxt->au1_pred_samples[0], trans_size, trans_size, mode);
3219 
3220                     if(ps_ctxt->u1_use_satd)
3221                     {
3222                         sad = apf_resd_trns_had[u1_trans_idx](
3223                             pu1_origin,
3224                             ps_curr_src->i4_y_strd,
3225                             &ps_ctxt->au1_pred_samples[0],
3226                             trans_size,
3227                             NULL,
3228                             0
3229 
3230                         );
3231                     }
3232                     else
3233                     {
3234                         sad = ps_ctxt->s_ipe_optimised_function_list.pf_nxn_sad_computer(
3235                             pu1_origin,
3236                             ps_curr_src->i4_y_strd,
3237                             &ps_ctxt->au1_pred_samples[0],
3238                             trans_size,
3239                             trans_size);
3240                     }
3241 
3242                     cost_amode_step2[i] += sad;
3243 
3244                     pu1_ref_orig += (4 * MAX_CTB_SIZE + 1);
3245                     pu1_ref_filt += (4 * MAX_CTB_SIZE + 1);
3246                 }
3247             }
3248         }
3249         best_amode = ps_ctxt->au1_modes_to_eval[0];
3250         /*Init cost indx */
3251         cost_ang_mode = MAX_INTRA_COST_IPE;  //cost_amode_step2[0];
3252         for(z = 0; z < i_end; z++)
3253         {
            /* Track the least cost in cost_ang_mode and the corresponding mode in best_amode; on equal cost the smaller mode number is kept */
3255             if(cost_ang_mode >= cost_amode_step2[z])
3256             {
3257                 if(cost_ang_mode == cost_amode_step2[z])
3258                 {
3259                     if(best_amode > ps_ctxt->au1_modes_to_eval[z])
3260                         best_amode = ps_ctxt->au1_modes_to_eval[z];
3261                 }
3262                 else
3263                 {
3264                     best_amode = ps_ctxt->au1_modes_to_eval[z];
3265                 }
3266                 cost_ang_mode = cost_amode_step2[z];
3267             }
3268         }
3269 
3270         /*Modify mode bits for the angular modes */
3271     }
3272 
3273     {
3274         /* Step - I modification */
3275         ASSERT((best_amode >= 2) && (best_amode <= 34));
3276         i_end = 0;
3277         z = 0;
3278 
3279         /* Around best level 3 angular mode, search for best level 1 mode */
3280         ps_ctxt->au1_modes_to_eval[i_end++] = 0;
3281         ps_ctxt->au1_modes_to_eval[i_end++] = 1;
3282 
3283         if(best_amode != 2)
3284             ps_ctxt->au1_modes_to_eval[i_end++] = best_amode - 1;
3285 
3286         ps_ctxt->au1_modes_to_eval[i_end++] = best_amode;
3287 
3288         if(best_amode != 34)
3289             ps_ctxt->au1_modes_to_eval[i_end++] = best_amode + 1;
3290 
3291         /* Inserting step_2's best mode at last to avoid
3292         recalculation of it's SATD cost */
3293 
3294         //ps_ctxt->au1_modes_to_eval[i_end] = best_amode; //Bugfix: HSAD compared with SAD
3295         //cost_amode_step2[i_end] = cost_ang_mode;
3296 
3297         /*best_sad[i_end] = cost_ang_mode
3298                 - mode_bits_satd_cost[best_amode]; //See NOTE_A01 above */
3299 
3300         cost_ang_mode = MAX_INTRA_COST_IPE; /* Init cost */
3301 
3302         for(i = 0; i < i_end; i++)
3303         {
3304             WORD32 u1_trans_idx = trans_size >> 3;
3305             if(trans_size == 32)
3306                 u1_trans_idx = 3;
3307             pu1_ref_orig = &ps_ctxt->au1_ref_samples[0];
3308             pu1_ref_filt = &ps_ctxt->au1_filt_ref_samples[0];
3309 
3310             /*best_sad[i] = 0; //See NOTE_A01 above */
3311             mode = ps_ctxt->au1_modes_to_eval[i];
3312             cost_amode_step2[i] = ps_ctxt->au2_mode_bits_satd_cost[mode];
3313             filter_flag = gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(trans_size) - 2));
3314 
3315             for(k = 0; k < num_tu_in_y; k++)
3316             {
3317                 for(j = 0; j < num_tu_in_x; j++)
3318                 {
3319                     pu1_origin = pu1_orig + (k * trans_size * src_strd) + (j * trans_size);
3320 
3321                     if(0 == filter_flag)
3322                         pu1_ref = pu1_ref_orig;
3323                     else
3324                         pu1_ref = pu1_ref_filt;
3325 
3326                     g_apf_lum_ip[g_i4_ip_funcs[mode]](
3327                         pu1_ref, 0, &ps_ctxt->au1_pred_samples[0], trans_size, trans_size, mode);
3328 
3329                     //if(trans_size != 4)
3330                     {
3331                         sad = apf_resd_trns_had[u1_trans_idx](
3332                             pu1_origin,
3333                             ps_curr_src->i4_y_strd,
3334                             &ps_ctxt->au1_pred_samples[0],
3335                             trans_size,
3336                             NULL,
3337                             0);
3338                     }
3339 
                    /* Accumulating SATD, even though the variable is named sad */
3341                     cost_amode_step2[i] += sad;
3342                     /*best_sad[i] +=sad; //See NOTE_A01 above */
3343                     pu1_ref_orig += (4 * MAX_CTB_SIZE + 1);
3344                     pu1_ref_filt += (4 * MAX_CTB_SIZE + 1);
3345                 }
3346             }
3347         }
3348         /* Updating i_end for the step_2's inserted mode*/
3349         //        i_end++;
3350 
        /* Sort cost_amode_step2[] in ascending order, applying the same swaps to modes_4x4[] */
3352 
3353         for(i = 0; i < (i_end - 1); i++)
3354         {
3355             for(j = i + 1; j < i_end; j++)
3356             {
3357                 if(cost_amode_step2[i] > cost_amode_step2[j])
3358                 {
3359                     temp = cost_amode_step2[i];
3360                     cost_amode_step2[i] = cost_amode_step2[j];
3361                     cost_amode_step2[j] = temp;
3362 
3363                     temp = modes_4x4[i];
3364                     modes_4x4[i] = modes_4x4[j];
3365                     modes_4x4[j] = temp;
3366                 }
3367             }
3368         }
3369 
3370         /* Least cost of all 3 angles are stored in cost_amode_step2[0] and corr. mode*/
3371         best_amode = ps_ctxt->au1_modes_to_eval[modes_4x4[0]];
3372         cost_ang_mode = cost_amode_step2[0];
3373         ps_cu_node->best_satd = cost_ang_mode - ps_ctxt->au2_mode_bits_satd_cost[best_amode];
3374         ps_cu_node->best_cost = cost_amode_step2[0];
3375         ps_cu_node->best_mode = ps_ctxt->au1_modes_to_eval[modes_4x4[0]];
3376         ps_cu_node->best_satd =
3377             ps_cu_node->best_cost - ps_ctxt->au2_mode_bits_satd_cost[ps_cu_node->best_mode];
3378 
        /* Accumulate best mode bits cost for RC */
3380         ps_cu_node->u2_mode_bits_cost = ps_ctxt->au2_mode_bits_satd[ps_cu_node->best_mode];
3381 
3382         /* Store the best three candidates */
3383         for(i = 0; i < 3; i++)
3384         {
3385             best_costs_4x4[i] = cost_amode_step2[i];
3386             best_modes_4x4[i] = ps_ctxt->au1_modes_to_eval[modes_4x4[i]];
3387         }
3388     }
3389 
3390     return;
3391 }
3392