1 /******************************************************************************
2  *
3  * Copyright (C) 2018 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 
21 /*!
22 ******************************************************************************
23 * \file ihevce_decomp_pre_intra_pass.c
24 *
25 * \brief
26 *    This file contains definitions related to frame decomposition done during
27 *    pre intra processing
28 *
29 * \date
30 *    19/02/2013
31 *
32 * \author
33 *    Ittiam
34 *
35 * List of Functions
36 *    ihevce_intra_populate_mode_bits_cost()
37 *    ihevce_8x8_sad_computer()
38 *    ihevce_4x4_sad_computer()
39 *    ihevce_ed_4x4_find_best_modes()
40 *    ihevce_ed_calc_4x4_blk()
41 *    ihevce_ed_calc_8x8_blk()
42 *    ihevce_ed_calc_incomplete_ctb()
43 *    ihevce_cu_level_qp_mod()
44 *    ihevce_ed_calc_ctb()
45 *    ihevce_ed_frame_init()
46 *    ihevce_scale_by_2()
47 *    ihevce_decomp_pre_intra_process_row()
48 *    ihevce_decomp_pre_intra_process()
49 *    ihevce_decomp_pre_intra_get_num_mem_recs()
50 *    ihevce_decomp_pre_intra_get_mem_recs()
51 *    ihevce_decomp_pre_intra_init()
52 *    ihevce_decomp_pre_intra_frame_init()
53 *    ihevce_merge_sort()
54 *    ihevce_decomp_pre_intra_curr_frame_pre_intra_deinit()
55 *
56 ******************************************************************************
57 */
58 
59 /*****************************************************************************/
60 /* File Includes                                                             */
61 /*****************************************************************************/
62 /* System include files */
63 #include <stdio.h>
64 #include <string.h>
65 #include <stdlib.h>
66 #include <assert.h>
67 #include <stdarg.h>
68 #include <math.h>
69 #include <limits.h>
70 
71 /* User include files */
72 #include "ihevc_typedefs.h"
73 #include "itt_video_api.h"
74 #include "ihevce_api.h"
75 
76 #include "rc_cntrl_param.h"
77 #include "rc_frame_info_collector.h"
78 #include "rc_look_ahead_params.h"
79 
80 #include "ihevc_defs.h"
81 #include "ihevc_debug.h"
82 #include "ihevc_structs.h"
83 #include "ihevc_platform_macros.h"
84 #include "ihevc_deblk.h"
85 #include "ihevc_itrans_recon.h"
86 #include "ihevc_chroma_itrans_recon.h"
87 #include "ihevc_chroma_intra_pred.h"
88 #include "ihevc_intra_pred.h"
89 #include "ihevc_inter_pred.h"
90 #include "ihevc_mem_fns.h"
91 #include "ihevc_padding.h"
92 #include "ihevc_weighted_pred.h"
93 #include "ihevc_sao.h"
94 #include "ihevc_resi_trans.h"
95 #include "ihevc_quant_iquant_ssd.h"
96 #include "ihevc_cabac_tables.h"
97 
98 #include "ihevce_defs.h"
99 #include "ihevce_hle_interface.h"
100 #include "ihevce_lap_enc_structs.h"
101 #include "ihevce_multi_thrd_structs.h"
102 #include "ihevce_multi_thrd_funcs.h"
103 #include "ihevce_me_common_defs.h"
104 #include "ihevce_had_satd.h"
105 #include "ihevce_error_codes.h"
106 #include "ihevce_bitstream.h"
107 #include "ihevce_cabac.h"
108 #include "ihevce_rdoq_macros.h"
109 #include "ihevce_function_selector.h"
110 #include "ihevce_enc_structs.h"
111 #include "ihevce_entropy_structs.h"
112 #include "ihevce_cmn_utils_instr_set_router.h"
113 #include "ihevce_ipe_instr_set_router.h"
114 #include "ihevce_decomp_pre_intra_structs.h"
115 #include "ihevce_decomp_pre_intra_pass.h"
116 #include "ihevce_enc_loop_structs.h"
117 #include "hme_datatype.h"
118 #include "hme_interface.h"
119 #include "hme_common_defs.h"
120 #include "ihevce_global_tables.h"
121 
122 /*****************************************************************************/
123 /* Typedefs                                                                  */
124 /*****************************************************************************/
/* Function-pointer type for a void routine taking the argument list below.
 * No assignment to, or call through, this type appears in the part of the
 * file visible to this review.
 * NOTE(review): going by the name it is presumably the common signature of
 * the per-CTB early-decision routines named in the file header
 * (ihevce_ed_calc_ctb / ihevce_ed_calc_incomplete_ctb) - confirm. */
125 typedef void (*pf_ed_calc_ctb)(
126     ihevce_ed_ctxt_t *ps_ed_ctxt,
127     ihevce_ed_blk_t *ps_ed_ctb,
128     ihevce_ed_ctb_l1_t *ps_ed_ctb_l1,
129     UWORD8 *pu1_src,
130     WORD32 src_stride,
131     WORD32 num_4x4_blks_x,
132     WORD32 num_4x4_blks_y,
133     WORD32 *nbr_flags,
134     WORD32 i4_layer_id,
135     WORD32 row_block_no,
136     WORD32 col_block_no,
137     ihevce_ipe_optimised_function_list_t *ps_ipe_optimised_function_list,
138     ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list);
139 
140 /*****************************************************************************/
141 /* Constant Macros                                                           */
142 /*****************************************************************************/
/* Threshold and scale constants for the pre-intra pass. None of them is
 * referenced in the part of this file visible to this review, so their exact
 * role cannot be stated from here.
 * NOTE(review): the names suggest SATD noise-floor / variance based
 * decisions - confirm against the call sites before changing a value. */
143 #define SATD_NOISE_FLOOR_THRESHOLD 16
144 #define MINIMUM_VARIANCE 15
145 #define SCALE_FACTOR_VARIANCE 20
146 #define SCALE_FACTOR_VARIANCE_8x8 60
147 #define MIN_SATD_THRSHLD 0
148 #define MAX_SATD_THRSHLD 64
149 #define SUB_NOISE_THRSHLD 0
150 #define MIN_BLKS 2
151 
152 /*****************************************************************************/
153 /* Global variables                                                          */
154 /*****************************************************************************/
155 
156 /**
157 *****************************************************************************
158 * @brief  list of pointers to luma intra pred functions
159 *****************************************************************************
160 */
/* The mode-evaluation routines in this file call through this table as
 * g_apf_lum_ip[g_i4_ip_funcs[mode]](...). It has no initialiser here; the
 * code that fills it in is not in the visible part of the file. */
161 pf_intra_pred g_apf_lum_ip[NUM_IP_FUNCS];
162 
163 /*****************************************************************************/
164 /* Function Definitions                                                      */
165 /*****************************************************************************/
166 
167 /*!
168 ******************************************************************************
169 * \if Function name : ihevce_intra_populate_mode_bits_cost \endif
170 *
171 * \brief: look-up table of cost of signalling an intra mode in the
172 *  bitstream
173 *
174 *****************************************************************************
175 */
ihevce_intra_populate_mode_bits_cost(WORD32 top_intra_mode,WORD32 left_intra_mode,WORD32 available_top,WORD32 available_left,WORD32 cu_pos_y,UWORD16 * mode_bits_cost,WORD32 lambda)176 void ihevce_intra_populate_mode_bits_cost(
177     WORD32 top_intra_mode,
178     WORD32 left_intra_mode,
179     WORD32 available_top,
180     WORD32 available_left,
181     WORD32 cu_pos_y,
182     UWORD16 *mode_bits_cost,
183     WORD32 lambda)
184 {
185     WORD32 i;
186     // 5.5 * lambda
187     UWORD16 five_bits_cost = COMPUTE_RATE_COST_CLIP30(11, lambda, (LAMBDA_Q_SHIFT + 1));
188 
189     (void)top_intra_mode;
190     (void)left_intra_mode;
191     (void)available_top;
192     (void)available_left;
193     (void)cu_pos_y;
194     for(i = 0; i < NUM_MODES; i++)
195     {
196         mode_bits_cost[i] = five_bits_cost;
197     }
198 }
199 
200 /*!
201 ******************************************************************************
202 * \if Function name : ihevce_8x8_sad_computer \endif
203 *
204 * \brief: compute sad between 2 8x8 blocks
205 *
206 *****************************************************************************
207 */
208 UWORD16
ihevce_8x8_sad_computer(UWORD8 * pu1_src,UWORD8 * pu1_pred,WORD32 src_strd,WORD32 pred_strd)209     ihevce_8x8_sad_computer(UWORD8 *pu1_src, UWORD8 *pu1_pred, WORD32 src_strd, WORD32 pred_strd)
210 {
211     UWORD16 sad = 0;
212     WORD32 i, j;
213 
214     for(i = 0; i < 8; i++)
215     {
216         for(j = 0; j < 8; j++)
217         {
218             sad += ABS(*pu1_src - *pu1_pred);
219             pu1_src++;
220             pu1_pred++;
221         }
222         pu1_src = pu1_src + (src_strd - 8);
223         pu1_pred = pu1_pred + (pred_strd - 8);
224     }
225 
226     return sad;
227 }
228 
229 /*!
230 ******************************************************************************
231 * \if Function name : ihevce_4x4_sad_computer \endif
232 *
233 * \brief: compute sad between 2 4x4 blocks
234 *
235 *****************************************************************************
236 */
237 UWORD16
ihevce_4x4_sad_computer(UWORD8 * pu1_src,UWORD8 * pu1_pred,WORD32 src_strd,WORD32 pred_strd)238     ihevce_4x4_sad_computer(UWORD8 *pu1_src, UWORD8 *pu1_pred, WORD32 src_strd, WORD32 pred_strd)
239 {
240     UWORD16 sad = 0;
241     WORD32 i, j;
242 
243     for(i = 0; i < 4; i++)
244     {
245         for(j = 0; j < 4; j++)
246         {
247             sad += ABS(*pu1_src - *pu1_pred);
248             pu1_src++;
249             pu1_pred++;
250         }
251         pu1_src = pu1_src + (src_strd - 4);
252         pu1_pred = pu1_pred + (pred_strd - 4);
253     }
254 
255     return sad;
256 }
257 
258 /*!
259 ******************************************************************************
260 * \if Function name : ihevce_ed_4x4_find_best_modes \endif
261 *
262 * \brief: evaluate input 4x4 block for pre-selected list of angular and normal
263 *  intra modes and return best sad, cost
264 *
265 *****************************************************************************
266 */
ihevce_ed_4x4_find_best_modes(UWORD8 * pu1_src,WORD32 src_stride,UWORD8 * ref,UWORD16 * mode_bits_cost,UWORD8 * pu1_best_modes,WORD32 * pu1_best_sad_costs,WORD32 u1_low_resol,FT_SAD_COMPUTER * pf_4x4_sad_computer)267 void ihevce_ed_4x4_find_best_modes(
268     UWORD8 *pu1_src,
269     WORD32 src_stride,
270     UWORD8 *ref,
271     UWORD16 *mode_bits_cost,
272     UWORD8 *pu1_best_modes,
273     WORD32 *pu1_best_sad_costs,
274     WORD32 u1_low_resol,
275     FT_SAD_COMPUTER *pf_4x4_sad_computer)
276 {
277     WORD32 i;
278     UWORD8 mode = 0, best_amode = 0, best_nmode = 0;
279     UWORD8 pred[16];
280     WORD32 sad = 0;
281     WORD32 sad_cost = 0;
282     WORD32 best_asad_cost = 0xFFFFF;
283     WORD32 best_nsad_cost = 0xFFFFF;
284 
285     /* If lower layers, l1 or l2, all the 11 modes are evaluated */
286     /* If L0 layer, all modes excluding DC and Planar are evaluated */
287     if(1 == u1_low_resol)
288         i = 0;
289     else
290         i = 2;
291 
292     /* Find the best non-angular and angular mode till level 4 */
293     for(; i < 11; i++)
294     {
295         mode = gau1_modes_to_eval[i];
296         g_apf_lum_ip[g_i4_ip_funcs[mode]](&ref[0], 0, &pred[0], 4, 4, mode);
297         sad = pf_4x4_sad_computer(pu1_src, &pred[0], src_stride, 4);
298         sad_cost = sad;
299         sad_cost += mode_bits_cost[mode];
300         if(mode < 2)
301         {
302             if(sad_cost < best_nsad_cost)
303             {
304                 best_nmode = mode;
305                 best_nsad_cost = sad_cost;
306             }
307         }
308         else
309         {
310             if(sad_cost < best_asad_cost)
311             {
312                 best_amode = mode;
313                 best_asad_cost = sad_cost;
314             }
315         }
316     }
317 
318     pu1_best_modes[0] = best_amode;
319     pu1_best_sad_costs[0] = best_asad_cost;
320 
321     /* Accumalate the best non-angular mode and cost for the l1 and l2 layers */
322     if(1 == u1_low_resol)
323     {
324         pu1_best_modes[1] = best_nmode;
325         pu1_best_sad_costs[1] = best_nsad_cost;
326     }
327 }
328 
329 /*!
330 ******************************************************************************
331 * \if Function name : ihevce_ed_calc_4x4_blk \endif
332 *
333 * \brief: evaluate input 4x4 block for all intra modes and return best sad &
334 *  cost
335 *
336 *****************************************************************************
337 */
ihevce_ed_calc_4x4_blk(ihevce_ed_blk_t * ps_ed,UWORD8 * pu1_src,WORD32 src_stride,UWORD8 * ref,UWORD16 * mode_bits_cost,WORD32 * sad_ptr,WORD32 * pi4_best_satd,WORD32 i4_quality_preset,WORD32 * pi4_best_sad_cost,ihevce_ipe_optimised_function_list_t * ps_ipe_optimised_function_list)338 static void ihevce_ed_calc_4x4_blk(
339     ihevce_ed_blk_t *ps_ed,
340     UWORD8 *pu1_src,
341     WORD32 src_stride,
342     UWORD8 *ref,
343     UWORD16 *mode_bits_cost,
344     WORD32 *sad_ptr,
345     WORD32 *pi4_best_satd,
346     WORD32 i4_quality_preset,
347     WORD32 *pi4_best_sad_cost,
348     ihevce_ipe_optimised_function_list_t *ps_ipe_optimised_function_list)
349 {
350     WORD32 i, i_end;
351     UWORD8 mode, best_amode, best_nmode;
352     UWORD8 pred[16];
353 
354     UWORD16 sad;
355     WORD32 sad_cost = 0;
356     WORD32 best_asad_cost = 0xFFFFF;
357     WORD32 best_nsad_cost = 0xFFFFF;
358 
359     UWORD8 au1_best_modes[2];
360     WORD32 ai4_best_sad_costs[2];
361 
362     /* L1/L2 resolution hence low resolution enable */
363     WORD32 u1_low_resol = 1;
364 
365     UWORD8 modes_to_eval[2];
366 
367     /* The *pi4_best_satd will be consumed only if current
368     layer has odd number of 4x4 blocks in either x or y
369     direction. But the function hme_derive_num_layers() makes
370     sure that every layer has width and height such that each one
371     is a multiple of 16. Which makes pi4_best_satd useless. Hence
372     feel free to remove pi4_best_satd. Concluded on 29th Aug13 */
373     *pi4_best_satd = -1;
374     ps_ipe_optimised_function_list->pf_ed_4x4_find_best_modes(
375         pu1_src,
376         src_stride,
377         ref,
378         mode_bits_cost,
379         au1_best_modes,
380         ai4_best_sad_costs,
381         u1_low_resol,
382         ps_ipe_optimised_function_list->pf_4x4_sad_computer);
383 
384     best_nmode = au1_best_modes[1];
385     best_amode = au1_best_modes[0];
386     best_nsad_cost = ai4_best_sad_costs[1];
387     best_asad_cost = ai4_best_sad_costs[0];
388 
389     /* Updation of pi4_best_satd here needed iff the mode given by
390     ihevce_ed_4x4_find_best_modes() comes out to be
391     the best mode at the end of the function */
392     *pi4_best_satd = best_asad_cost - mode_bits_cost[best_amode];
393 
394     /* Around best level 4 angular mode, search for best level 2 mode */
395     modes_to_eval[0] = best_amode - 2;
396     modes_to_eval[1] = best_amode + 2;
397     i = 0;
398     i_end = 2;
399     if(best_amode == 2)
400         i = 1;
401     else if(best_amode == 34)
402         i_end = 1;
403     for(; i < i_end; i++)
404     {
405         mode = modes_to_eval[i];
406         g_apf_lum_ip[g_i4_ip_funcs[mode]](&ref[0], 0, &pred[0], 4, 4, mode);
407         sad = ps_ipe_optimised_function_list->pf_4x4_sad_computer(pu1_src, &pred[0], src_stride, 4);
408         sad_cost = sad;
409         sad_cost += mode_bits_cost[mode];
410         if(sad_cost < best_asad_cost)
411         {
412             best_amode = mode;
413             best_asad_cost = sad_cost;
414             *pi4_best_satd = sad;
415         }
416         sad_ptr[mode] = sad;
417     }
418 
419     /*To be done : Add a flag here instead of preset condn*/
420     if((i4_quality_preset < IHEVCE_QUALITY_P4))
421     {
422         /* Around best level 2 angular mode, search for best level 1 mode */
423         modes_to_eval[0] = best_amode - 1;
424         modes_to_eval[1] = best_amode + 1;
425         i = 0;
426         i_end = 2;
427         if(best_amode == 2)
428             i = 1;
429         else if(best_amode == 34)
430             i_end = 1;
431         for(; i < i_end; i++)
432         {
433             mode = modes_to_eval[i];
434             g_apf_lum_ip[g_i4_ip_funcs[mode]](&ref[0], 0, &pred[0], 4, 4, mode);
435             sad = ps_ipe_optimised_function_list->pf_4x4_sad_computer(
436                 pu1_src, &pred[0], src_stride, 4);
437             sad_cost = sad;
438             sad_cost += mode_bits_cost[mode];
439             if(sad_cost < best_asad_cost)
440             {
441                 best_amode = mode;
442                 best_asad_cost = sad_cost;
443                 *pi4_best_satd = sad;
444             }
445             sad_ptr[mode] = sad;
446         }
447     }
448 
449     if(best_asad_cost < best_nsad_cost)
450     {
451         ps_ed->best_mode = best_amode;
452         *pi4_best_sad_cost = best_asad_cost;
453     }
454     else
455     {
456         ps_ed->best_mode = best_nmode;
457         *pi4_best_sad_cost = best_nsad_cost;
458     }
459     ps_ed->intra_or_inter = 0;
460     ps_ed->merge_success = 0;
461 }
462 
463 /*!
464 ******************************************************************************
465 * \if Function name : ihevce_ed_calc_8x8_blk \endif
466 *
467 * \brief: evaluate input 8x8 block for intra modes basing on the intra mode
468 *  decisions made at 4x4 level. This function also makes a decision whether
469 *  to split blk in to 4x4 partitions or not.
470 *
471 *****************************************************************************
472 */
ihevce_ed_calc_8x8_blk(ihevce_ed_ctxt_t * ps_ed_ctxt,ihevce_ed_blk_t * ps_ed_8x8,UWORD8 * pu1_src,WORD32 src_stride,WORD32 * nbr_flags_ptr,WORD32 * top_intra_mode_ptr,WORD32 * left_intra_mode_ptr,WORD32 cu_pos_y,WORD32 lambda,WORD32 * sad_ptr_8x8,WORD32 * pi4_best_satd,WORD32 i4_layer_id,WORD32 i4_quality_preset,WORD32 i4_slice_type,WORD32 * pi4_best_sad_cost_8x8_l1_ipe,WORD32 * pi4_best_sad_8x8_l1_ipe,WORD32 * pi4_sum_4x4_satd,WORD32 * pi4_min_4x4_satd,ihevce_ipe_optimised_function_list_t * ps_ipe_optimised_function_list,ihevce_cmn_opt_func_t * ps_cmn_utils_optimised_function_list)473 static void ihevce_ed_calc_8x8_blk(
474     ihevce_ed_ctxt_t *ps_ed_ctxt,
475     ihevce_ed_blk_t *ps_ed_8x8,
476     UWORD8 *pu1_src,
477     WORD32 src_stride,
478     WORD32 *nbr_flags_ptr,
479     WORD32 *top_intra_mode_ptr,
480     WORD32 *left_intra_mode_ptr,
481     WORD32 cu_pos_y,
482     WORD32 lambda,
483     WORD32 *sad_ptr_8x8,
484     WORD32 *pi4_best_satd,
485     WORD32 i4_layer_id,
486     WORD32 i4_quality_preset,
487     WORD32 i4_slice_type,
488     WORD32 *pi4_best_sad_cost_8x8_l1_ipe,
489     WORD32 *pi4_best_sad_8x8_l1_ipe,
490     WORD32 *pi4_sum_4x4_satd,
491     WORD32 *pi4_min_4x4_satd,
492     ihevce_ipe_optimised_function_list_t *ps_ipe_optimised_function_list,
493     ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list)
494 {
495     WORD32 i, j;
496     WORD32 nbr_flags, nbr_flags_TR;
497     UWORD8 *pu1_src_4x4;
498     WORD32 top_available;
499     WORD32 left_available;
500     ihevce_ed_blk_t *ps_ed_4x4 = ps_ed_8x8;
501     WORD32 top_intra_mode;
502     WORD32 left_intra_mode;
503     WORD32 next_left_intra_mode;
504     WORD32 *sad_ptr = sad_ptr_8x8;
505     UWORD8 *pu1_src_arr[4];
506     WORD32 i4_4x4_best_sad_cost[4];
507     func_selector_t *ps_func_selector = ps_ed_ctxt->ps_func_selector;
508     ihevc_intra_pred_luma_ref_substitution_ft *pf_intra_pred_luma_ref_substitution =
509         ps_func_selector->ihevc_intra_pred_luma_ref_substitution_fptr;
510 
511     (void)i4_slice_type;
512 
513     /* Compute ref samples for 8x8 merge block */
514     nbr_flags = nbr_flags_ptr[0];
515     nbr_flags_TR = nbr_flags_ptr[1];
516 
517     if(CHECK_TR_AVAILABLE(nbr_flags_TR))
518     {
519         SET_TR_AVAILABLE(nbr_flags);
520     }
521     else
522     {
523         SET_TR_UNAVAILABLE(nbr_flags);
524     }
525 
526     if(CHECK_BL_AVAILABLE(nbr_flags))
527     {
528         SET_BL_AVAILABLE(nbr_flags);
529     }
530     else
531     {
532         SET_BL_UNAVAILABLE(nbr_flags);
533     }
534 
535     /* call the function which populates ref data for intra predicion */
536     pf_intra_pred_luma_ref_substitution(
537         pu1_src - src_stride - 1,
538         pu1_src - src_stride,
539         pu1_src - 1,
540         src_stride,
541         8,
542         nbr_flags,
543         &ps_ed_ctxt->au1_ref_8x8[0][0],
544         0);
545 
546     for(i = 0; i < 2; i++)
547     {
548         pu1_src_4x4 = pu1_src + i * 4 * src_stride;
549         cu_pos_y += i * 4;
550         next_left_intra_mode = left_intra_mode_ptr[i];
551         for(j = 0; j < 2; j++)
552         {
553             WORD32 i4_best_satd;
554             pu1_src_arr[i * 2 + j] = pu1_src_4x4;
555             nbr_flags = nbr_flags_ptr[i * 8 + j];
556             top_intra_mode = top_intra_mode_ptr[j];
557             left_intra_mode = next_left_intra_mode;
558             /* call the function which populates ref data for intra predicion */
559             pf_intra_pred_luma_ref_substitution(
560                 pu1_src_4x4 - src_stride - 1,
561                 pu1_src_4x4 - src_stride,
562                 pu1_src_4x4 - 1,
563                 src_stride,
564                 4,
565                 nbr_flags,
566                 &ps_ed_ctxt->au1_ref_full_ctb[i * 2 + j][0],
567                 0);
568 
569             top_available = CHECK_T_AVAILABLE(nbr_flags);
570             left_available = CHECK_L_AVAILABLE(nbr_flags);
571             /* call the function which populates sad cost for all the modes */
572             ihevce_intra_populate_mode_bits_cost(
573                 top_intra_mode,
574                 left_intra_mode,
575                 top_available,
576                 left_available,
577                 cu_pos_y,
578                 &ps_ed_ctxt->au2_mode_bits_cost_full_ctb[i * 2 + j][0],
579                 lambda);
580             ihevce_ed_calc_4x4_blk(
581                 ps_ed_4x4,
582                 pu1_src_4x4,
583                 src_stride,
584                 &ps_ed_ctxt->au1_ref_full_ctb[i * 2 + j][0],
585                 &ps_ed_ctxt->au2_mode_bits_cost_full_ctb[i * 2 + j][0],
586                 sad_ptr,
587                 &i4_best_satd,
588                 i4_quality_preset,
589                 &i4_4x4_best_sad_cost[i * 2 + j],
590                 ps_ipe_optimised_function_list);
591 
592             top_intra_mode_ptr[j] = ps_ed_4x4->best_mode;
593             next_left_intra_mode = ps_ed_4x4->best_mode;
594             pu1_src_4x4 += 4;
595             ps_ed_4x4 += 1;
596             sad_ptr += NUM_MODES;
597         }
598         left_intra_mode_ptr[i] = next_left_intra_mode;
599     }
600 
601     /* 8x8 merge */
602     {
603         UWORD8 modes_to_eval[6];
604         WORD32 sad;
605         UWORD8 pred[16];
606         UWORD8 pred_8x8[64] = { 0 };
607         WORD32 merge_success;
608         UWORD8 mode;
609 
610         ps_ed_4x4 = ps_ed_8x8;
611         mode = (ps_ed_4x4)->best_mode;
612 
613         *pi4_best_satd = -1;
614 
615         merge_success =
616             ((((ps_ed_4x4)->best_mode == (ps_ed_4x4 + 1)->best_mode) +
617               ((ps_ed_4x4)->best_mode == (ps_ed_4x4 + 2)->best_mode) +
618               ((ps_ed_4x4)->best_mode == (ps_ed_4x4 + 3)->best_mode)) == 3);
619 
620         {
621             WORD32 i4_satd;
622             //UWORD16 au2_4x4_sad_cost_array[4];/*SAD of 4x4 blocks*/
623             UWORD16 u2_sum_best_4x4_sad_cost; /*Sum of 4x4 sad costs*/
624             UWORD16 u2_sum_best_4x4_satd_cost; /*Sum of 4x4 satd costs*/
625             UWORD8 u1_best_8x8_mode; /*8x8 mode.*/
626             UWORD16 u2_best_8x8_cost; /*8x8 Cost. Can store SATD/SAD cost*/
627             WORD32 i4_best_8x8_sad_satd; /* SATD/SAD value of 8x8 block*/
628             UWORD16 au2_8x8_costs[6] = { 0 }; /*Cost of 8x8 block for 6 modes*/
629             UWORD8 u1_cond_4x4_satd; /*condition if 4x4 SATD needs to be done*/
630             UWORD8 u1_cond_8x8_satd; /*condition if 8x8 SATD needs to be done*/
631             UWORD8 u1_good_quality;
632             WORD32 i4_merge_success_stage2;
633 
634             /*Initiallization*/
635             *pi4_best_satd = 0;
636             u2_best_8x8_cost = (UWORD16)(-1) /*max value*/;
637             u2_sum_best_4x4_sad_cost = 0;
638             *pi4_sum_4x4_satd = -1;
639             *pi4_min_4x4_satd = 0x7FFFFFFF;
640             i4_best_8x8_sad_satd = 0;
641             u2_sum_best_4x4_satd_cost = 0;
642             u1_best_8x8_mode = ps_ed_4x4->best_mode;
643 
644             /*We thought of "replacing" SATDs by SADs for 4x4 vs 8x8 decision
645             for speed improvement, but it gave opposite results. Setting
646             good_quality to 1 in order to throw away the idea of "replacing".*/
647             u1_good_quality = 1;
648             //u1_good_quality = ((i4_quality_preset != IHEVCE_QUALITY_P5)
649             //  && (i4_quality_preset != IHEVCE_QUALITY_P4));
650 
651             /*Needed to disable some processing based on speed preset*/
652             i4_merge_success_stage2 = 0;
653 
654             /*Store SAD cost of 4x4 blocks */
655             for(i = 0; i < 4; i++)
656             {
657                 //au2_4x4_sad_cost_array[i] = (ps_ed_4x4 + i)->best_sad_cost;
658                 u2_sum_best_4x4_sad_cost +=
659                     i4_4x4_best_sad_cost[i];  //(ps_ed_4x4 + i)->best_sad_cost;
660                 modes_to_eval[i] = (ps_ed_4x4 + i)->best_mode;
661                 /*NOTE_01: i4_4x4_satd is not used anywhere at present.
662                 Setting it to zero to avoid ASSERT failure */
663                 /*Now taken care of incomplete CTB*/
664                 //(ps_ed_4x4 + i)->i4_4x4_satd = 0;
665             }
666 
667             /*Calculate SATD/SAd for 4x4 blocks*/
668             /*For (layer_2 && high_speed): No need to get 4x4 SATDs bcoz
669             it won't have any impact on quality but speed will improve.*/
670             u1_cond_4x4_satd = ((1 == i4_layer_id) || (u1_good_quality && (!merge_success)));
671 
672             if(u1_cond_4x4_satd)
673             {
674                 *pi4_sum_4x4_satd = 0;
675                 /*FYI: 1. Level 2 doesn't need the SATD.
676                 2. The 4x4 vs. 8x8 decision for high_speed will
677                 happen based on SAD. */
678                 /*Get SATD for 4x4 blocks */
679                 for(i = 0; i < 4; i++)
680                 {
681                     mode = modes_to_eval[i];
682                     g_apf_lum_ip[g_i4_ip_funcs[mode]](
683                         &ps_ed_ctxt->au1_ref_full_ctb[i][0], 0, &pred[0], 4, 4, mode);
684 
685                     i4_satd = ps_cmn_utils_optimised_function_list->pf_HAD_4x4_8bit(
686                         pu1_src_arr[i], src_stride, &pred[0], 4, NULL, 0);
687 
688                     {
689                         /*Save 4x4x satd in ed blk struct */
690                         (ps_ed_4x4 + i)->i4_4x4_satd = i4_satd;
691                     }
692 
693                     /*(ps_ed_4x4 + i)->i4_4x4_satd = i4_satd; // See NOTE_01*/
694                     u2_sum_best_4x4_satd_cost +=
695                         ((UWORD16)i4_satd + ps_ed_ctxt->au2_mode_bits_cost_full_ctb[i][mode]);
696                     *pi4_best_satd += i4_satd;
697                 }
698             }
699             /* Not being used in current code */
700             else /* (Level_2 && extreme_speed) */
701             {
702                 /******DONT ENTER HERE AT aNY COST***************************/
703                 /* Transistor killers lie ahead!!!!!!! */
704                 /*This else part is not getting executed as of now*/
705                 if(2 != i4_layer_id)
706                     ASSERT(0);
707                 /*Update values by SAD_cost_array */
708                 for(i = 0; i < 4; i++)
709                 {
710                     mode = modes_to_eval[i];
711                     //u2_sum_best_4x4_satd_cost += au2_4x4_sad_cost_array[i];
712                     //sad = (WORD32)((ps_ed_4x4 + i)->best_sad_cost - ps_ed_ctxt->au2_mode_bits_cost_full_ctb[i][mode]);
713                     sad = (WORD32)(
714                         i4_4x4_best_sad_cost[i] - ps_ed_ctxt->au2_mode_bits_cost_full_ctb[i][mode]);
715                     *pi4_sum_4x4_satd += sad;
716                     /*(ps_ed_4x4 + i)->i4_4x4_satd = sad;// See NOTE_01*/
717                     *pi4_best_satd += sad;
718 
719                     if(*pi4_min_4x4_satd > sad)
720                         *pi4_min_4x4_satd = sad;
721                 }
722             }
723             if(!merge_success) /*If the modes are not identical*/
724             {
725                 UWORD8 i1_start; /* no of modes to evaluate */
726                 UWORD8 ai1_modes[6];
727 
728                 /* Prepare 6 candidates for 8x8 block. Two are DC and planar */
729                 ai1_modes[4] = 0;
730                 ai1_modes[5] = 1;
731                 i1_start = 4;
732 
733                 /*Assign along with removing duplicates rest 4 candidates. */
734                 for(i = 3; i >= 0; i--)
735                 {
736                     WORD8 i1_fresh_mode_flag = 1;
737                     mode = modes_to_eval[i];
738                     /*Check if duplicate already exists in ai1_modes*/
739                     for(j = i1_start; j < 6; j++)
740                     {
741                         if(mode == ai1_modes[j])
742                             i1_fresh_mode_flag = 0;
743                     }
744                     if(i1_fresh_mode_flag)
745                     {
746                         i1_start--;
747                         ai1_modes[i1_start] = mode;
748                     }
749                 }
750 
751                 /*Calculate SATD/SAD of 8x8 block for all modes*/
752                 /*If (u1_good_quality == 0) then SATD gets replaced by SAD*/
753                 if(u1_good_quality && (i4_quality_preset <= IHEVCE_QUALITY_P4))
754                 {
755                     //7.5 * lambda to incorporate transfrom flags
756                     u2_sum_best_4x4_satd_cost +=
757                         (COMPUTE_RATE_COST_CLIP30(12, lambda, (LAMBDA_Q_SHIFT + 1)));
758 
759                     /*Loop over all modes for calculating SATD*/
760                     for(i = i1_start; i < 6; i++)
761                     {
762                         mode = ai1_modes[i];
763                         g_apf_lum_ip[g_i4_ip_funcs[mode]](
764                             &ps_ed_ctxt->au1_ref_8x8[0][0], 0, &pred_8x8[0], 8, 8, mode);
765 
766                         i4_satd = ps_cmn_utils_optimised_function_list->pf_HAD_8x8_8bit(
767                             pu1_src_arr[0], src_stride, &pred_8x8[0], 8, NULL, 0);
768 
769                         au2_8x8_costs[i] =
770                             ((UWORD16)i4_satd + ps_ed_ctxt->au2_mode_bits_cost_full_ctb[0][mode]);
771 
772                         /*Update data correspoinding to least 8x8 cost */
773                         if(au2_8x8_costs[i] <= u2_best_8x8_cost)
774                         {
775                             u2_best_8x8_cost = au2_8x8_costs[i];
776                             i4_best_8x8_sad_satd = i4_satd;
777                             u1_best_8x8_mode = mode;
778                         }
779                     }
780                     /*8x8 vs 4x4 decision based on SATD values*/
781                     if((u2_best_8x8_cost <= u2_sum_best_4x4_satd_cost) || (u2_best_8x8_cost <= 300))
782                     {
783                         i4_merge_success_stage2 = 1;
784                     }
785 
786                     /* EIID: Early inter-intra decision */
787                     /* Find the SAD based cost for 8x8 block for best mode */
788                     if(/*(ISLICE != i4_slice_type) && */ (1 == i4_layer_id))
789                     {
790                         UWORD8 i4_best_8x8_mode = u1_best_8x8_mode;
791                         WORD32 i4_best_8x8_sad_curr;
792 
793                         g_apf_lum_ip[g_i4_ip_funcs[i4_best_8x8_mode]](
794                             &ps_ed_ctxt->au1_ref_8x8[0][0], 0, &pred_8x8[0], 8, 8, i4_best_8x8_mode);
795 
796                         i4_best_8x8_sad_curr = ps_ipe_optimised_function_list->pf_8x8_sad_computer(
797                             pu1_src_arr[0], &pred_8x8[0], src_stride, 8);
798 
799                         //register best sad in the context
800                         //ps_ed_8x8->i4_best_sad_8x8_l1_ipe = i4_best_8x8_sad_curr;
801 
802                         //register the best cost in the context
803                         //[0]th index is used since all 4 blocks are having same cost right now
804                         //also it doesnt depends on mode. It only depends on the lambda
805 
806                         *pi4_best_sad_cost_8x8_l1_ipe =
807                             i4_best_8x8_sad_curr +
808                             ps_ed_ctxt->au2_mode_bits_cost_full_ctb[0][i4_best_8x8_mode];
809                         *pi4_best_sad_8x8_l1_ipe = i4_best_8x8_sad_curr;
810                     }
811                 }
812                 else /*If high_speed or extreme speed*/
813                 {
814                     //7.5 * lambda to incorporate transfrom flags
815                     u2_sum_best_4x4_sad_cost +=
816                         (COMPUTE_RATE_COST_CLIP30(12, lambda, (LAMBDA_Q_SHIFT + 1)));
817 
818                     /*Loop over all modes for calculating SAD*/
819                     for(i = i1_start; i < 6; i++)
820                     {
821                         mode = ai1_modes[i];
822                         g_apf_lum_ip[g_i4_ip_funcs[mode]](
823                             &ps_ed_ctxt->au1_ref_8x8[0][0], 0, &pred_8x8[0], 8, 8, mode);
824 
825                         sad = ps_ipe_optimised_function_list->pf_8x8_sad_computer(
826                             pu1_src_arr[0], &pred_8x8[0], src_stride, 8);
827 
828                         au2_8x8_costs[i] +=
829                             ((UWORD16)sad + ps_ed_ctxt->au2_mode_bits_cost_full_ctb[0][mode]);
830 
831                         /*Find the data correspoinding to least cost */
832                         if(au2_8x8_costs[i] <= u2_best_8x8_cost)
833                         {
834                             u2_best_8x8_cost = au2_8x8_costs[i];
835                             i4_best_8x8_sad_satd = sad;
836                             u1_best_8x8_mode = mode;
837                         }
838                     }
839                     /*8x8 vs 4x4 decision based on SAD values*/
840                     if((u2_best_8x8_cost <= u2_sum_best_4x4_sad_cost) || (u2_best_8x8_cost <= 300))
841                     {
842                         i4_merge_success_stage2 = 1;
843                     }
844 
845                     /* EIID: Early inter-intra decision */
846                     /* Find the SAD based cost for 8x8 block for best mode */
847                     if(/*(ISLICE != i4_slice_type) && */ (1 == i4_layer_id))
848                     {
849                         //UWORD8 i4_best_8x8_mode = u1_best_8x8_mode;
850                         WORD32 i4_best_8x8_sad_cost_curr = u2_best_8x8_cost;
851 
852                         //register best sad in the context
853                         //ps_ed_8x8->i4_best_sad_8x8_l1_ipe = i4_best_8x8_sad_curr;
854 
855                         //register the best cost in the context
856                         *pi4_best_sad_cost_8x8_l1_ipe = i4_best_8x8_sad_cost_curr;
857                         *pi4_best_sad_8x8_l1_ipe =
858                             i4_best_8x8_sad_satd;  //i4_best_8x8_sad_cost_curr;
859                     }
860                 }
861             }
862 
863             /***** Modes for 4x4 and 8x8 are decided before this point ****/
864             if(merge_success || i4_merge_success_stage2)
865             {
866                 /*FYI: 1. 8x8 SATD is not needed if merge is failed.
867                 2. For layer_2: SATD won't be calculated for 8x8. So
868                 the best_8x8_cost is SAD-cost. */
869 
870                 /* Store the 8x8 level data in the first 4x4 block*/
871                 ps_ed_4x4->merge_success = 1;
872                 ps_ed_4x4->best_merge_mode = u1_best_8x8_mode;
873                 /* ps_ed_4x4->best_merge_sad_cost = u2_best_8x8_cost;
874                 This data is not getting consumed anywhere at present */
875 
876                 top_intra_mode_ptr[0] = u1_best_8x8_mode;
877                 top_intra_mode_ptr[1] = u1_best_8x8_mode;
878                 left_intra_mode_ptr[0] = u1_best_8x8_mode;
879                 left_intra_mode_ptr[1] = u1_best_8x8_mode;
880 
881                 /*If it is layer_1 and high_speed*/
882                 u1_cond_8x8_satd =
883                     ((1 == i4_layer_id) &&
884                      (merge_success || ((!u1_good_quality) && i4_merge_success_stage2)));
885                 if(u1_cond_8x8_satd)
886                 {
887                     mode = u1_best_8x8_mode;
888                     g_apf_lum_ip[g_i4_ip_funcs[mode]](
889                         &ps_ed_ctxt->au1_ref_8x8[0][0], 0, &pred_8x8[0], 8, 8, mode);
890 
891                     if(i4_quality_preset > IHEVCE_QUALITY_P3)
892                     {
893                         i4_satd = ps_ipe_optimised_function_list->pf_8x8_sad_computer(
894                             pu1_src_arr[0], &pred_8x8[0], src_stride, 8);
895                     }
896                     else
897                     {
898                         i4_satd = ps_cmn_utils_optimised_function_list->pf_HAD_8x8_8bit(
899                             pu1_src_arr[0], src_stride, &pred_8x8[0], 8, NULL, 0);
900                     }
901                     /* u2_best_8x8_cost = ((UWORD16)i4_satd + mode_bits_cost[0][mode]);
902                     This data is not getting consumed at present */
903                     i4_best_8x8_sad_satd = i4_satd;
904                 }
905                 *pi4_best_satd = i4_best_8x8_sad_satd;
906 
907                 /* EIID: Early inter-intra decision */
908                 /* Find the SAD based cost for 8x8 block for best mode */
909                 if(/*(ISLICE != i4_slice_type) && */ (1 == i4_layer_id))
910                 {
911                     UWORD8 i4_best_8x8_mode = u1_best_8x8_mode;
912                     WORD32 i4_best_8x8_sad_curr;
913 
914                     g_apf_lum_ip[g_i4_ip_funcs[i4_best_8x8_mode]](
915                         &ps_ed_ctxt->au1_ref_8x8[0][0], 0, &pred_8x8[0], 8, 8, i4_best_8x8_mode);
916 
917                     i4_best_8x8_sad_curr = ps_ipe_optimised_function_list->pf_8x8_sad_computer(
918                         pu1_src_arr[0], &pred_8x8[0], src_stride, 8);
919                     //register best sad in the context
920                     //ps_ed_8x8->i4_best_sad_8x8_l1_ipe = i4_best_8x8_sad_curr;
921 
922                     //register the best cost in the context
923                     //[0]th index is used since all 4 blocks are having same cost right now
924                     //also it doesnt depends on mode. It only depends on the lambda
925 
926                     *pi4_best_sad_cost_8x8_l1_ipe =
927                         i4_best_8x8_sad_curr +
928                         ps_ed_ctxt->au2_mode_bits_cost_full_ctb[0][i4_best_8x8_mode];
929                     *pi4_best_sad_8x8_l1_ipe = i4_best_8x8_sad_curr;
930 
931                 }  // EIID ends
932 
933             }  //if(merge_success || i4_merge_success_stage2)
934         }
935     }
936 }
937 
938 /*!
939 ******************************************************************************
940 * \if Function name : ihevce_ed_calc_incomplete_ctb \endif
941 *
942 * \brief: performs L1 8x8 and 4x4 intra mode analysis
943 *
944 *****************************************************************************
945 */
ihevce_ed_calc_incomplete_ctb(ihevce_ed_ctxt_t * ps_ed_ctxt,ihevce_ed_blk_t * ps_ed_ctb,ihevce_ed_ctb_l1_t * ps_ed_ctb_l1,UWORD8 * pu1_src,WORD32 src_stride,WORD32 num_4x4_blks_x,WORD32 num_4x4_blks_y,WORD32 * nbr_flags,WORD32 i4_layer_id,WORD32 i4_row_block_no,WORD32 i4_col_block_no,ihevce_ipe_optimised_function_list_t * ps_ipe_optimised_function_list,ihevce_cmn_opt_func_t * ps_cmn_utils_optimised_function_list)946 void ihevce_ed_calc_incomplete_ctb(
947     ihevce_ed_ctxt_t *ps_ed_ctxt,
948     ihevce_ed_blk_t *ps_ed_ctb,
949     ihevce_ed_ctb_l1_t *ps_ed_ctb_l1,
950     UWORD8 *pu1_src,
951     WORD32 src_stride,
952     WORD32 num_4x4_blks_x,
953     WORD32 num_4x4_blks_y,
954     WORD32 *nbr_flags,
955     WORD32 i4_layer_id,
956     WORD32 i4_row_block_no,
957     WORD32 i4_col_block_no,
958     ihevce_ipe_optimised_function_list_t *ps_ipe_optimised_function_list,
959     ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list)
960 {
961     WORD32 i, j, k;
962     WORD32 z_scan_idx = 0;
963     WORD32 z_scan_act_idx = 0;
964     ihevc_intra_pred_luma_ref_substitution_ft *pf_intra_pred_luma_ref_substitution =
965         ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_ref_substitution_fptr;
966 
967     //UWORD8 ref[18];
968     //WORD32 top_intra_modes[20];
969     WORD32 *sad_ptr = &ps_ed_ctxt->sad[0];
970     WORD32 lambda = ps_ed_ctxt->lambda;
971     //UWORD16 mode_bits_cost[NUM_MODES];
972 
973     UWORD8 *pu1_src_8x8;
974     ihevce_ed_blk_t *ps_ed_8x8, *ps_ed_4x4;
975     WORD32 *top_intra_mode_ptr;
976     WORD32 *left_intra_mode_ptr = ps_ed_ctxt->left_ctb_intra_modes;
977     WORD32 *nbr_flags_ptr;
978     WORD32 top_intra_mode;
979     WORD32 left_intra_mode;
980     WORD32 next_left_intra_mode;
981     WORD32 nbr_flag = 0;
982     WORD32 top_available;
983     WORD32 left_available;
984     UWORD8 *pu1_src_4x4;
985     WORD32 left_over_4x4_blks;
986     WORD32 i4_incomplete_sum_4x4_satd = 0;
987     WORD32 i4_incomplete_min_4x4_satd = 0x7FFFFFFF;
988     WORD32 i4_best_sad_cost_8x8_l1_ipe, i4_best_sad_8x8_l1_ipe, i4_sum_4x4_satd, i4_min_4x4_satd;
989 
990     (void)i4_row_block_no;
991     (void)i4_col_block_no;
992     /*Find the modulated qp of 16*16 at L2 from 8*8 SATDs in L2
993     THis is used as 64*64 Qp in L0*/
994     /*For Incomplete CTB, init all SATD to -1 and then popualate for the complete 8x8 blocks (CU 16 in L0)*/
995     /* Not populated for 4x4 blocks (CU 8 in L0), can be done */
996     /*Also, not 32x32 satd is not populated, as it would correspong to CU 64 and it is not an incomplete CTB */
997     if(i4_layer_id == 1)
998     {
999         WORD32 i4_i;
1000 
1001         for(i4_i = 0; i4_i < 64; i4_i++)
1002         {
1003             (ps_ed_ctb + i4_i)->i4_4x4_satd = -1;
1004             (ps_ed_ctb + i4_i)->i4_4x4_cur_satd = -1;
1005         }
1006 
1007         for(i4_i = 0; i4_i < 16; i4_i++)
1008         {
1009             ps_ed_ctb_l1->i4_sum_4x4_satd[i4_i] = -2;
1010             ps_ed_ctb_l1->i4_min_4x4_satd[i4_i] = 0x7FFFFFFF;
1011             ps_ed_ctb_l1->i4_8x8_satd[i4_i][0] = -2;
1012             ps_ed_ctb_l1->i4_8x8_satd[i4_i][1] = -2;
1013         }
1014 
1015         for(i4_i = 0; i4_i < 4; i4_i++)
1016         {
1017             ps_ed_ctb_l1->i4_16x16_satd[i4_i][0] = -2;
1018             ps_ed_ctb_l1->i4_16x16_satd[i4_i][1] = -2;
1019             ps_ed_ctb_l1->i4_16x16_satd[i4_i][2] = -2;
1020         }
1021         ps_ed_ctb_l1->i4_32x32_satd[0][0] = -2;
1022         ps_ed_ctb_l1->i4_32x32_satd[0][1] = -2;
1023         ps_ed_ctb_l1->i4_32x32_satd[0][2] = -2;
1024 
1025         ps_ed_ctb_l1->i4_32x32_satd[0][3] = -2;
1026 
1027         for(i4_i = 0; i4_i < 16; i4_i++)
1028         {
1029             ps_ed_ctb_l1->i4_best_satd_8x8[i4_i] = -1;
1030             ps_ed_ctb_l1->i4_best_sad_cost_8x8_l1_ipe[i4_i] = -1;
1031             ps_ed_ctb_l1->i4_best_sad_8x8_l1_ipe[i4_i] = -1;
1032             ps_ed_ctb_l1->i4_best_sad_cost_8x8_l1_me[i4_i] = -1;
1033             ps_ed_ctb_l1->i4_sad_cost_me_for_ref[i4_i] = -1;
1034             ps_ed_ctb_l1->i4_sad_me_for_ref[i4_i] = -1;
1035             ps_ed_ctb_l1->i4_best_sad_8x8_l1_me[i4_i] = -1;
1036 
1037             ps_ed_ctb_l1->i4_best_sad_8x8_l1_me_for_decide[i4_i] = -1;
1038         }
1039     }
1040     /*
1041     * src scan happens in raster scan order. ps_ed update happens in z-scan order.
1042     */
1043     for(i = 0; i < num_4x4_blks_x; i++)
1044     {
1045         ps_ed_ctxt->ai4_top_intra_modes_ic_ctb[i] = INTRA_DC;
1046     }
1047     next_left_intra_mode = left_intra_mode_ptr[0];
1048     for(i = 0; i < num_4x4_blks_y / 2; i++)
1049     {
1050         pu1_src_8x8 = pu1_src + i * 2 * 4 * src_stride;
1051         top_intra_mode_ptr = &ps_ed_ctxt->ai4_top_intra_modes_ic_ctb[0];
1052         nbr_flags_ptr = &nbr_flags[0] + 2 * 8 * i;
1053 
1054         for(j = 0; j < num_4x4_blks_x / 2; j++)
1055         {
1056             WORD32 i4_best_satd;
1057             // Multiply i by 16 since the
1058             // matrix is prepared for ctb_size = 64
1059             z_scan_idx = gau1_ctb_raster_to_zscan[i * 2 * 16 + j * 2];
1060             z_scan_act_idx = gau1_ctb_raster_to_zscan[i * 16 + j];
1061             ASSERT(z_scan_act_idx <= 15);
1062             ps_ed_8x8 = ps_ed_ctb + z_scan_idx;
1063 
1064             ihevce_ed_calc_8x8_blk(
1065                 ps_ed_ctxt,
1066                 ps_ed_8x8,
1067                 pu1_src_8x8,
1068                 src_stride,
1069                 nbr_flags_ptr,
1070                 top_intra_mode_ptr,
1071                 left_intra_mode_ptr,
1072                 i * 8,
1073                 lambda,
1074                 sad_ptr + z_scan_idx * NUM_MODES,
1075                 &i4_best_satd,
1076                 i4_layer_id,
1077                 ps_ed_ctxt->i4_quality_preset,
1078                 ps_ed_ctxt->i4_slice_type,
1079                 &i4_best_sad_cost_8x8_l1_ipe,
1080                 &i4_best_sad_8x8_l1_ipe,
1081                 &i4_sum_4x4_satd,
1082                 &i4_min_4x4_satd,
1083                 ps_ipe_optimised_function_list,
1084                 ps_cmn_utils_optimised_function_list);
1085 
1086             ASSERT(i4_best_satd >= 0);
1087             if(i4_layer_id == 1)
1088             {
1089                 ps_ed_ctb_l1->i4_best_sad_cost_8x8_l1_ipe[z_scan_act_idx] =
1090                     i4_best_sad_cost_8x8_l1_ipe;
1091                 ps_ed_ctb_l1->i4_best_sad_8x8_l1_ipe[z_scan_act_idx] = i4_best_sad_8x8_l1_ipe;
1092                 ps_ed_ctb_l1->i4_best_satd_8x8[z_scan_act_idx] = i4_best_satd;
1093                 ps_ed_ctxt->i8_sum_best_satd += i4_best_satd;
1094                 ps_ed_ctxt->i8_sum_sq_best_satd += (i4_best_satd * i4_best_satd);
1095                 //ps_ed_ctb_l1->i4_sum_4x4_satd[z_scan_act_idx] = i4_sum_4x4_satd;
1096                 //ps_ed_ctb_l1->i4_min_4x4_satd[z_scan_act_idx] = i4_min_4x4_satd;
1097             }
1098 
1099             pu1_src_8x8 += 8;
1100             //ps_ed_8x8  += 4;
1101             top_intra_mode_ptr += 2;
1102             nbr_flags_ptr += 2;
1103         }
1104 
1105         next_left_intra_mode = left_intra_mode_ptr[0];
1106         left_over_4x4_blks = (num_4x4_blks_x - (2 * (num_4x4_blks_x / 2)));
1107         left_over_4x4_blks = left_over_4x4_blks * 2;
1108 
1109         pu1_src_4x4 = pu1_src_8x8;
1110 
1111         i4_incomplete_sum_4x4_satd = 0;
1112         i4_incomplete_min_4x4_satd = 0x7FFFFFFF;
1113 
1114         /* For leftover right 4x4 blks (num_4x4_blks_x - 2 *(num_4x4_blks_x/2))*/
1115         for(k = 0; k < left_over_4x4_blks; k++)
1116         {
1117             WORD32 i4_best_satd;
1118             WORD32 i4_dummy_sad_cost;
1119             // Multiply i by 16 since the
1120             // matrix is prepared for ctb_size = 64
1121             ASSERT(left_over_4x4_blks == 2);
1122             z_scan_idx = gau1_ctb_raster_to_zscan[i * 2 * 16 + k * 16 + j * 2];
1123             ps_ed_4x4 = ps_ed_ctb + z_scan_idx;
1124 
1125             top_intra_mode = ps_ed_ctxt->ai4_top_intra_modes_ic_ctb[j];
1126             left_intra_mode = next_left_intra_mode;
1127 
1128             nbr_flag = nbr_flags[i * 2 * 8 + k * 8 + j * 2];
1129 
1130             /* call the function which populates ref data for intra predicion */
1131             pf_intra_pred_luma_ref_substitution(
1132                 pu1_src_4x4 - src_stride - 1,
1133                 pu1_src_4x4 - src_stride,
1134                 pu1_src_4x4 - 1,
1135                 src_stride,
1136                 4,
1137                 nbr_flag,
1138                 &ps_ed_ctxt->au1_ref_ic_ctb[0],
1139                 0);
1140 
1141             top_available = CHECK_T_AVAILABLE(nbr_flag);
1142             left_available = CHECK_L_AVAILABLE(nbr_flag);
1143             /* call the function which populates sad cost for all the modes */
1144             ihevce_intra_populate_mode_bits_cost(
1145                 top_intra_mode,
1146                 left_intra_mode,
1147                 top_available,
1148                 left_available,
1149                 i * 4,
1150                 &ps_ed_ctxt->au2_mode_bits_cost_ic_ctb[0],
1151                 lambda);
1152 
1153             ihevce_ed_calc_4x4_blk(
1154                 ps_ed_4x4,
1155                 pu1_src_4x4,
1156                 src_stride,
1157                 &ps_ed_ctxt->au1_ref_ic_ctb[0],
1158                 &ps_ed_ctxt->au2_mode_bits_cost_ic_ctb[0],
1159                 sad_ptr + z_scan_idx * NUM_MODES,
1160                 &i4_best_satd,
1161                 ps_ed_ctxt->i4_quality_preset,
1162                 &i4_dummy_sad_cost,
1163                 ps_ipe_optimised_function_list);
1164 
1165             ASSERT(i4_best_satd >= 0);
1166             if(i4_layer_id == 1)  //Can we ignore this check?
1167             {
1168                 z_scan_act_idx = gau1_ctb_raster_to_zscan[i * 16 + j];
1169                 /*Note : The satd population is not populated for last 4*4 block in incomplete CTB */
1170                 /* Which corresponds to CU 8 in L0 */
1171 
1172                 /*MAM_VAR_L1 */
1173                 i4_incomplete_sum_4x4_satd = i4_incomplete_sum_4x4_satd + i4_best_satd;
1174                 if(i4_incomplete_min_4x4_satd >= i4_best_satd)
1175                     i4_incomplete_min_4x4_satd = i4_best_satd;
1176                 ps_ed_ctxt->i8_sum_best_satd += i4_best_satd;
1177                 ps_ed_ctxt->i8_sum_sq_best_satd += (i4_best_satd * i4_best_satd);
1178                 if((k & 1) == 0)
1179                 {
1180                     ps_ed_ctb_l1->i4_best_satd_8x8[z_scan_act_idx] = 0;
1181                 }
1182                 ps_ed_ctb_l1->i4_best_satd_8x8[z_scan_act_idx] += i4_best_satd;
1183             }
1184 
1185             ps_ed_ctxt->ai4_top_intra_modes_ic_ctb[j * 2] = ps_ed_4x4->best_mode;
1186             next_left_intra_mode = ps_ed_4x4->best_mode;
1187             pu1_src_4x4 += src_stride;
1188             left_intra_mode_ptr[k] = next_left_intra_mode;
1189         }
1190         left_intra_mode_ptr += 2;
1191     }
1192 
1193     if(num_4x4_blks_y & 1)
1194     {
1195         /* For leftover bottom 4x4 blks. (num_4x4_blks_x) */
1196         pu1_src_4x4 = pu1_src + i * 2 * 4 * src_stride;
1197         //memset(&ps_ed_ctb_l1->i4_best_satd_8x8[i][0],0,4*sizeof(WORD32));
1198         for(j = 0; j < num_4x4_blks_x; j++)
1199         {
1200             WORD32 i4_best_satd;
1201             WORD32 i4_dummy_sad_cost;
1202             // Multiply i by 16 since the
1203             // matrix is prepared for ctb_size = 64
1204             z_scan_idx = gau1_ctb_raster_to_zscan[i * 2 * 16 + j];
1205             ps_ed_4x4 = ps_ed_ctb + z_scan_idx;
1206 
1207             if((j & 1) == 0)
1208             {
1209                 i4_incomplete_sum_4x4_satd = 0;
1210                 i4_incomplete_min_4x4_satd = 0x7FFFFFFF;
1211             }
1212 
1213             top_intra_mode = ps_ed_ctxt->ai4_top_intra_modes_ic_ctb[j];
1214             left_intra_mode = next_left_intra_mode;
1215 
1216             nbr_flag = nbr_flags[i * 2 * 8 + j];
1217 
1218             /* call the function which populates ref data for intra predicion */
1219             pf_intra_pred_luma_ref_substitution(
1220                 pu1_src_4x4 - src_stride - 1,
1221                 pu1_src_4x4 - src_stride,
1222                 pu1_src_4x4 - 1,
1223                 src_stride,
1224                 4,
1225                 nbr_flag,
1226                 &ps_ed_ctxt->au1_ref_ic_ctb[0],
1227                 0);
1228 
1229             top_available = CHECK_T_AVAILABLE(nbr_flag);
1230             left_available = CHECK_L_AVAILABLE(nbr_flag);
1231             /* call the function which populates sad cost for all the modes */
1232             ihevce_intra_populate_mode_bits_cost(
1233                 top_intra_mode,
1234                 left_intra_mode,
1235                 top_available,
1236                 left_available,
1237                 i * 4,
1238                 &ps_ed_ctxt->au2_mode_bits_cost_ic_ctb[0],
1239                 lambda);
1240 
1241             ihevce_ed_calc_4x4_blk(
1242                 ps_ed_4x4,
1243                 pu1_src_4x4,
1244                 src_stride,
1245                 &ps_ed_ctxt->au1_ref_ic_ctb[0],
1246                 &ps_ed_ctxt->au2_mode_bits_cost_ic_ctb[0],
1247                 sad_ptr + z_scan_idx * NUM_MODES,
1248                 &i4_best_satd,
1249                 ps_ed_ctxt->i4_quality_preset,
1250                 &i4_dummy_sad_cost,
1251                 ps_ipe_optimised_function_list);
1252 
1253             /*Note : The satd population is not populated for last 4*4 block in incomplete CTB */
1254             /* Which corresponds to CU 8 in L0 */
1255 
1256             /*MAM_VAR_L1 */
1257             ASSERT(i4_best_satd >= 0);
1258             if(i4_layer_id == 1)  //Can we ignore this check?
1259             {
1260                 z_scan_act_idx = gau1_ctb_raster_to_zscan[i * 16 + (j >> 1)];
1261                 if((j & 1) == 0)
1262                 {
1263                     ps_ed_ctb_l1->i4_best_satd_8x8[z_scan_act_idx] = 0;
1264                 }
1265                 ps_ed_ctb_l1->i4_best_satd_8x8[z_scan_act_idx] += i4_best_satd;
1266                 ps_ed_ctxt->i8_sum_best_satd += i4_best_satd;
1267                 ps_ed_ctxt->i8_sum_sq_best_satd += (i4_best_satd * i4_best_satd);
1268                 i4_incomplete_sum_4x4_satd = i4_incomplete_sum_4x4_satd + i4_best_satd;
1269                 if(i4_incomplete_min_4x4_satd >= i4_best_satd)
1270                     i4_incomplete_min_4x4_satd = i4_best_satd;
1271             }
1272 
1273             ps_ed_ctxt->ai4_top_intra_modes_ic_ctb[j] = ps_ed_4x4->best_mode;
1274             next_left_intra_mode = ps_ed_4x4->best_mode;
1275             pu1_src_4x4 += 4;
1276         }
1277     }
1278     left_intra_mode_ptr[0] = next_left_intra_mode;
1279 }
1280 
1281 /*!
1282 ******************************************************************************
1283 * \if Function name : ihevce_cu_level_qp_mod \endif
1284 *
1285 * \brief: Performs CU level QP modulation
1286 *
1287 *****************************************************************************
1288 */
ihevce_cu_level_qp_mod(WORD32 i4_qscale,WORD32 i4_satd,long double ld_curr_frame_log_avg_act,float f_mod_strength,WORD32 * pi4_act_factor,WORD32 * pi4_q_scale_mod,rc_quant_t * ps_rc_quant_ctxt)1289 WORD32 ihevce_cu_level_qp_mod(
1290     WORD32 i4_qscale,
1291     WORD32 i4_satd,
1292     long double ld_curr_frame_log_avg_act,
1293     float f_mod_strength,
1294     WORD32 *pi4_act_factor,
1295     WORD32 *pi4_q_scale_mod,
1296     rc_quant_t *ps_rc_quant_ctxt)
1297 {
1298     WORD32 i4_temp_qscale;
1299     WORD32 i4_temp_qp;
1300 
1301     if(i4_satd != -1)
1302     {
1303         WORD32 i4_loc_satd = i4_satd;
1304         if(i4_loc_satd < 1)
1305         {
1306             i4_loc_satd = 1;
1307         }
1308         if((WORD32)ld_curr_frame_log_avg_act == 0)
1309         {
1310             *pi4_act_factor = (1 << (QP_LEVEL_MOD_ACT_FACTOR));
1311         }
1312         else
1313         {
1314             UWORD32 u4_log2_sq_cur_satd;
1315             ULWORD64 u8_sq_cur_satd;
1316             WORD32 qp_offset;
1317 
1318             ASSERT(USE_SQRT_AVG_OF_SATD_SQR);
1319             u8_sq_cur_satd = (i4_loc_satd * i4_loc_satd);
1320             GET_POS_MSB_64(u4_log2_sq_cur_satd, u8_sq_cur_satd);
1321             if(ABS((
1322                    long double)(((1 << u4_log2_sq_cur_satd) * POW_2_TO_1_BY_4) - ((long double)u8_sq_cur_satd))) >
1323                ABS((
1324                    long double)(((1 << u4_log2_sq_cur_satd) * POW_2_TO_3_BY_4) - ((long double)u8_sq_cur_satd))))
1325             {
1326                 u4_log2_sq_cur_satd += 1;
1327             }
1328             qp_offset = (WORD32)(
1329                 f_mod_strength *
1330                 (float)((long double)u4_log2_sq_cur_satd - ld_curr_frame_log_avg_act));
1331             qp_offset = CLIP3(qp_offset, MIN_QP_MOD_OFFSET, MAX_QP_MOD_OFFSET);
1332             *pi4_act_factor = (WORD32)(
1333                 gad_look_up_activity[qp_offset + ABS(MIN_QP_MOD_OFFSET)] *
1334                 (1 << QP_LEVEL_MOD_ACT_FACTOR));
1335         }
1336 
1337         ASSERT(*pi4_act_factor > 0);
1338         i4_temp_qscale = ((i4_qscale * (*pi4_act_factor)) + (1 << (QP_LEVEL_MOD_ACT_FACTOR - 1))) >>
1339                          QP_LEVEL_MOD_ACT_FACTOR;
1340     }
1341     else
1342     {
1343         i4_temp_qscale = i4_qscale;
1344         *pi4_act_factor = (1 << QP_LEVEL_MOD_ACT_FACTOR);
1345     }
1346     ASSERT(*pi4_act_factor > 0);
1347 
1348     if(i4_temp_qscale > ps_rc_quant_ctxt->i2_max_qscale)
1349     {
1350         i4_temp_qscale = ps_rc_quant_ctxt->i2_max_qscale;
1351     }
1352     else if(i4_temp_qscale < ps_rc_quant_ctxt->i2_min_qscale)
1353     {
1354         i4_temp_qscale = ps_rc_quant_ctxt->i2_min_qscale;
1355     }
1356     /*store q scale for stat gen for I frame model*/
1357     /*Here activity factor is not modified as the cu qp would be clipped in rd-opt stage*/
1358     *pi4_q_scale_mod = i4_temp_qscale;
1359     i4_temp_qp = ps_rc_quant_ctxt->pi4_qscale_to_qp[i4_temp_qscale];
1360     if(i4_temp_qp > ps_rc_quant_ctxt->i2_max_qp)
1361     {
1362         i4_temp_qp = ps_rc_quant_ctxt->i2_max_qp;
1363     }
1364     else if(i4_temp_qp < ps_rc_quant_ctxt->i2_min_qp)
1365     {
1366         i4_temp_qp = ps_rc_quant_ctxt->i2_min_qp;
1367     }
1368     return (i4_temp_qp);
1369 }
1370 
1371 /*!
1372 ******************************************************************************
1373 * \if Function name : ihevce_ed_calc_ctb \endif
1374 *
1375 * \brief: performs L1 8x8 and 4x4 intra mode analysis
1376 *
1377 *****************************************************************************
1378 */
ihevce_ed_calc_ctb(ihevce_ed_ctxt_t * ps_ed_ctxt,ihevce_ed_blk_t * ps_ed_ctb,ihevce_ed_ctb_l1_t * ps_ed_ctb_l1,UWORD8 * pu1_src,WORD32 src_stride,WORD32 num_4x4_blks_x,WORD32 num_4x4_blks_y,WORD32 * nbr_flags,WORD32 i4_layer_id,WORD32 i4_row_block_no,WORD32 i4_col_block_no,ihevce_ipe_optimised_function_list_t * ps_ipe_optimised_function_list,ihevce_cmn_opt_func_t * ps_cmn_utils_optimised_function_list)1379 void ihevce_ed_calc_ctb(
1380     ihevce_ed_ctxt_t *ps_ed_ctxt,
1381     ihevce_ed_blk_t *ps_ed_ctb,
1382     ihevce_ed_ctb_l1_t *ps_ed_ctb_l1,
1383     UWORD8 *pu1_src,
1384     WORD32 src_stride,
1385     WORD32 num_4x4_blks_x,
1386     WORD32 num_4x4_blks_y,
1387     WORD32 *nbr_flags,
1388     WORD32 i4_layer_id,
1389     WORD32 i4_row_block_no,
1390     WORD32 i4_col_block_no,
1391     ihevce_ipe_optimised_function_list_t *ps_ipe_optimised_function_list,
1392     ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list)
1393 {
1394     WORD32 i, j;
1395     WORD32 z_scan_idx = 0;
1396     WORD32 z_scan_act_idx = 0;
1397     ihevce_ed_blk_t *ps_ed_8x8;
1398     UWORD8 *pu1_src_8x8;
1399 
1400     WORD32 top_intra_modes[20];
1401     WORD32 *top_intra_mode_ptr;
1402     WORD32 *left_intra_mode_ptr = ps_ed_ctxt->left_ctb_intra_modes;
1403 
1404     WORD32 *sad_ptr = &ps_ed_ctxt->sad[0];
1405     WORD32 lambda = ps_ed_ctxt->lambda;
1406     WORD32 *nbr_flags_ptr;
1407     WORD32 i4_best_sad_cost_8x8_l1_ipe, i4_best_sad_8x8_l1_ipe, i4_sum_4x4_satd, i4_min_4x4_satd;
1408 
1409     (void)num_4x4_blks_y;
1410     (void)i4_row_block_no;
1411     (void)i4_col_block_no;
1412     ASSERT(num_4x4_blks_x % 2 == 0);
1413     ASSERT(num_4x4_blks_y % 2 == 0);
1414     ASSERT((num_4x4_blks_x == 4) || (num_4x4_blks_x == 8));
1415     ASSERT((num_4x4_blks_y == 4) || (num_4x4_blks_y == 8));
1416 
1417     if(i4_layer_id == 1)
1418     {
1419         WORD32 i4_i;
1420 
1421         for(i4_i = 0; i4_i < 64; i4_i++)
1422         {
1423             (ps_ed_ctb + i4_i)->i4_4x4_satd = -1;
1424             (ps_ed_ctb + i4_i)->i4_4x4_cur_satd = -1;
1425         }
1426 
1427         for(i4_i = 0; i4_i < 16; i4_i++)
1428         {
1429             ps_ed_ctb_l1->i4_sum_4x4_satd[i4_i] = -2;
1430             ps_ed_ctb_l1->i4_min_4x4_satd[i4_i] = 0x7FFFFFFF;
1431             ps_ed_ctb_l1->i4_8x8_satd[i4_i][0] = -2;
1432             ps_ed_ctb_l1->i4_8x8_satd[i4_i][1] = -2;
1433         }
1434 
1435         for(i4_i = 0; i4_i < 4; i4_i++)
1436         {
1437             ps_ed_ctb_l1->i4_16x16_satd[i4_i][0] = -2;
1438             ps_ed_ctb_l1->i4_16x16_satd[i4_i][1] = -2;
1439             ps_ed_ctb_l1->i4_16x16_satd[i4_i][2] = -2;
1440         }
1441         ps_ed_ctb_l1->i4_32x32_satd[0][0] = -2;
1442         ps_ed_ctb_l1->i4_32x32_satd[0][1] = -2;
1443         ps_ed_ctb_l1->i4_32x32_satd[0][2] = -2;
1444         ps_ed_ctb_l1->i4_32x32_satd[0][3] = -2;
1445         for(i4_i = 0; i4_i < 16; i4_i++)
1446         {
1447             ps_ed_ctb_l1->i4_best_sad_cost_8x8_l1_me[i4_i] = -2;
1448             ps_ed_ctb_l1->i4_sad_cost_me_for_ref[i4_i] = -2;
1449             ps_ed_ctb_l1->i4_sad_me_for_ref[i4_i] = -2;
1450             ps_ed_ctb_l1->i4_best_sad_8x8_l1_me[i4_i] = -2;
1451 
1452             ps_ed_ctb_l1->i4_best_sad_8x8_l1_me_for_decide[i4_i] = -2;
1453 
1454             ps_ed_ctb_l1->i4_best_satd_8x8[i4_i] = -2;
1455             ps_ed_ctb_l1->i4_best_sad_cost_8x8_l1_ipe[i4_i] = -2;
1456             ps_ed_ctb_l1->i4_best_sad_8x8_l1_ipe[i4_i] = -2;
1457         }
1458     }
1459     /*
1460     * src scan happens in raster scan order. ps_ed update happens in z-scan order.
1461     */
1462     for(i = 0; i < num_4x4_blks_x; i++)
1463     {
1464         top_intra_modes[i] = INTRA_DC;
1465     }
1466     for(i = 0; i < num_4x4_blks_x / 2; i++)
1467     {
1468         pu1_src_8x8 = pu1_src + i * 2 * 4 * src_stride;
1469         top_intra_mode_ptr = &top_intra_modes[0];
1470         nbr_flags_ptr = &nbr_flags[0] + 2 * 8 * i;
1471 
1472         for(j = 0; j < num_4x4_blks_x / 2; j++)
1473         {
1474             WORD32 i4_best_satd;
1475             ASSERT(i <= 3);
1476             ASSERT(j <= 3);
1477 
1478             // Multiply i by 16 since the
1479             // matrix is prepared for ctb_size = 64
1480             z_scan_idx = gau1_ctb_raster_to_zscan[i * 2 * 16 + j * 2];
1481             z_scan_act_idx = gau1_ctb_raster_to_zscan[i * 16 + j];
1482             ASSERT(z_scan_act_idx <= 15);
1483 
1484             ps_ed_8x8 = ps_ed_ctb + z_scan_idx;
1485 
1486             ihevce_ed_calc_8x8_blk(
1487                 ps_ed_ctxt,
1488                 ps_ed_8x8,
1489                 pu1_src_8x8,
1490                 src_stride,
1491                 nbr_flags_ptr,
1492                 top_intra_mode_ptr,
1493                 left_intra_mode_ptr,
1494                 i * 8,
1495                 lambda,
1496                 sad_ptr + z_scan_idx * NUM_MODES,
1497                 &i4_best_satd,
1498                 i4_layer_id,
1499                 ps_ed_ctxt->i4_quality_preset,
1500                 ps_ed_ctxt->i4_slice_type,
1501                 &i4_best_sad_cost_8x8_l1_ipe,
1502                 &i4_best_sad_8x8_l1_ipe,
1503                 &i4_sum_4x4_satd,
1504                 &i4_min_4x4_satd,
1505                 ps_ipe_optimised_function_list,
1506                 ps_cmn_utils_optimised_function_list);
1507 
1508             if(i4_layer_id == 1)
1509             {
1510                 ps_ed_ctb_l1->i4_best_sad_cost_8x8_l1_ipe[z_scan_act_idx] =
1511                     i4_best_sad_cost_8x8_l1_ipe;
1512                 ps_ed_ctb_l1->i4_best_sad_8x8_l1_ipe[z_scan_act_idx] = i4_best_sad_8x8_l1_ipe;
1513                 ps_ed_ctb_l1->i4_best_satd_8x8[z_scan_act_idx] = i4_best_satd;
1514                 ps_ed_ctxt->i8_sum_best_satd += i4_best_satd;
1515                 ps_ed_ctxt->i8_sum_sq_best_satd += (i4_best_satd * i4_best_satd);
1516                 //ps_ed_ctb_l1->i4_sum_4x4_satd[z_scan_act_idx] = i4_sum_4x4_satd;
1517                 //ps_ed_ctb_l1->i4_min_4x4_satd[z_scan_act_idx] = i4_min_4x4_satd;
1518             }
1519 
1520             pu1_src_8x8 += 8;
1521             //ps_ed_8x8  += 4;
1522             top_intra_mode_ptr += 2;
1523             nbr_flags_ptr += 2;
1524         }
1525         left_intra_mode_ptr += 2;
1526     }
1527 }
1528 
1529 /*!
1530 ******************************************************************************
1531 * \if Function name : ihevce_ed_frame_init \endif
1532 *
1533 * \brief: Initialize frame context for early decision
1534 *
1535 *****************************************************************************
1536 */
ihevce_ed_frame_init(void * pv_ed_ctxt,WORD32 i4_layer_no)1537 void ihevce_ed_frame_init(void *pv_ed_ctxt, WORD32 i4_layer_no)
1538 {
1539     ihevce_ed_ctxt_t *ps_ed_ctxt = (ihevce_ed_ctxt_t *)pv_ed_ctxt;
1540 
1541     g_apf_lum_ip[IP_FUNC_MODE_0] = ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_planar_fptr;
1542     g_apf_lum_ip[IP_FUNC_MODE_1] = ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_dc_fptr;
1543     g_apf_lum_ip[IP_FUNC_MODE_2] = ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_mode2_fptr;
1544     g_apf_lum_ip[IP_FUNC_MODE_3TO9] =
1545         ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_mode_3_to_9_fptr;
1546     g_apf_lum_ip[IP_FUNC_MODE_10] = ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_horz_fptr;
1547     g_apf_lum_ip[IP_FUNC_MODE_11TO17] =
1548         ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_mode_11_to_17_fptr;
1549     g_apf_lum_ip[IP_FUNC_MODE_18_34] =
1550         ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_mode_18_34_fptr;
1551     g_apf_lum_ip[IP_FUNC_MODE_19TO25] =
1552         ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_mode_19_to_25_fptr;
1553     g_apf_lum_ip[IP_FUNC_MODE_26] = ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_ver_fptr;
1554     g_apf_lum_ip[IP_FUNC_MODE_27TO33] =
1555         ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_mode_27_to_33_fptr;
1556 
1557     if(i4_layer_no == 1)
1558     {
1559         ps_ed_ctxt->i8_sum_best_satd = 0;
1560         ps_ed_ctxt->i8_sum_sq_best_satd = 0;
1561     }
1562 }
1563 
/**
********************************************************************************
*
*  @brief  downscales by 2 in the horizontal and vertical directions, creating
*          an output of size wd/2 * ht/2
*
*  @note   The parameters listed here are those of ihevce_scale_by_2(), which
*          is defined after the scaling filter helper that follows.
*
*  @param[in]  pu1_src : source pointer
*  @param[in]  src_stride : source stride
*  @param[out] pu1_dst : destination pointer. Start of a row.
*  @param[in]  dst_stride : destination stride
*  @param[in]  wd : width
*  @param[in]  ht : height
*  @param[in]  pu1_wkg_mem : working memory (at least of size CEIL16(wd) * ht)
*  @param[in]  ht_offset : height offset of the block to be scaled
*  @param[in]  block_ht : height of the block to be scaled
*  @param[in]  wd_offset : width offset of the block to be scaled
*  @param[in]  block_wd : width of the block to be scaled
*
*  @return void
*
*  @remarks Assumes block_ht is a multiple of 2. LANCZOS_SCALER
*
********************************************************************************
*/
/**
********************************************************************************
*
*  @brief  Separable 2:1 down-scaling filter for a wd x ht block.
*
*  A symmetric 7-tap kernel { -18, 0, 80, 132, 80, 0, -18 } (Q8, rounded) is
*  applied first horizontally, keeping every second column, and then
*  vertically, keeping every second row. Each pass clips its result with
*  CLIP_U8.
*
*  @param[in]  pu1_src     : top-left sample of the block to be scaled. The
*                            horizontal pass reads rows -3 .. ht + 1 and, for
*                            every even column j < wd, samples j - 3 .. j + 3
*  @param[in]  src_strd    : source stride
*  @param[out] pu1_scrtch  : scratch buffer for the horizontally filtered rows;
*                            ht + 5 rows of scrtch_strd bytes are written
*  @param[in]  scrtch_strd : scratch stride
*  @param[out] pu1_dst     : destination; ht / 2 rows of wd / 2 samples
*  @param[in]  dst_strd    : destination stride
*  @param[in]  ht          : block height
*  @param[in]  wd          : block width
*
*  @return void
*
********************************************************************************
*/
void ihevce_scaling_filter_mxn(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_scrtch,
    WORD32 scrtch_strd,
    UWORD8 *pu1_dst,
    WORD32 dst_strd,
    WORD32 ht,
    WORD32 wd)
{
#define FILT_TAP_Q 8
#define N_TAPS 7
    const WORD16 ai2_taps[N_TAPS] = { -18, 0, 80, 132, 80, 0, -18 };
    const WORD32 rnd = 1 << (FILT_TAP_Q - 1);
    const WORD32 half_wd = wd >> 1;
    UWORD8 *pu1_in_row;
    UWORD8 *pu1_mid_row;
    WORD32 row, col;
    WORD32 acc;

    /* Pass 1: horizontal filter + column decimation into the scratch buffer. */
    /* Three extra rows above and two below are produced for the vertical pass */
    pu1_in_row = pu1_src - 3 * src_strd;
    pu1_mid_row = pu1_scrtch;
    for(row = -3; row < ht + 2; row++)
    {
        for(col = 0; col < wd; col += 2)
        {
            const UWORD8 *pu1_c = pu1_in_row + col;

            acc = rnd;
            acc += ai2_taps[3] * pu1_c[0];
            acc += ai2_taps[2] * (pu1_c[-1] + pu1_c[1]);
            acc += ai2_taps[1] * (pu1_c[2] + pu1_c[-2]);
            acc += ai2_taps[0] * (pu1_c[3] + pu1_c[-3]);
            acc >>= FILT_TAP_Q;

            pu1_mid_row[col >> 1] = CLIP_U8(acc);
        }
        pu1_in_row += src_strd;
        pu1_mid_row += scrtch_strd;
    }

    /* Pass 2: vertical filter + row decimation from scratch into destination. */
    /* Scratch row 3 corresponds to source row 0 */
    pu1_mid_row = pu1_scrtch + 3 * scrtch_strd;
    for(row = 0; row < ht; row += 2)
    {
        for(col = 0; col < half_wd; col++)
        {
            const UWORD8 *pu1_c = pu1_mid_row + col;

            acc = rnd;
            acc += ai2_taps[3] * pu1_c[0];
            acc += ai2_taps[2] * (pu1_c[scrtch_strd] + pu1_c[-scrtch_strd]);
            acc += ai2_taps[1] * (pu1_c[2 * scrtch_strd] + pu1_c[-2 * scrtch_strd]);
            acc += ai2_taps[0] * (pu1_c[3 * scrtch_strd] + pu1_c[-3 * scrtch_strd]);
            acc >>= FILT_TAP_Q;

            pu1_dst[col] = CLIP_U8(acc);
        }
        pu1_dst += dst_strd;
        pu1_mid_row += (scrtch_strd << 1);
    }
}
1643 
/**
********************************************************************************
*
*  @brief  Down-scales one block of a plane by 2 in both directions and pads
*          the destination at the picture edges.
*
*  The block starts at (wd_offset, ht_offset) in the source. When it lies
*  within 3 samples of any picture edge, the block (plus whatever 3-sample
*  margin exists in the source) is copied into a local buffer and the missing
*  margin is replicated from the nearest row / column, so the 7-tap filter can
*  run unchanged. For a block touching the right / bottom edge, block_wd /
*  block_ht is replaced by the remainder wd % block_wd / ht % block_ht when
*  that remainder is non-zero.
*
*  The filter is pf_scaling_filter_mxn when the (possibly reduced) block width
*  is a multiple of 16, otherwise ihevce_scaling_filter_mxn.
*
*  After filtering, the destination is padded:
*   - 16 columns to the left when wd_offset is 0
*   - when the block ends at the picture width:
*       - 16 + CEIL16(wd / 2) - wd / 2 + 4 columns to the right
*       - 16 rows above when ht_offset is 0
*       - 16 + CEIL16(ht / 2) - ht / 2 + 4 rows below when the block also
*         ends at the picture height
*
*  @param[in]  pu1_src     : source plane pointer
*  @param[in]  src_strd    : source stride
*  @param[out] pu1_dst     : destination pointer. Starting of a row; output is
*                            written at pu1_dst + wd_offset / 2
*  @param[in]  dst_strd    : destination stride
*  @param[in]  wd          : picture width (asserted to be even)
*  @param[in]  ht          : picture height (asserted to be even)
*  @param[in]  pu1_wkg_mem : scratch memory handed to the filter, used with a
*                            stride of block_wd / 2
*  @param[in]  ht_offset   : row offset of the block in the source
*  @param[in]  block_ht    : block height (asserted <= MAX_CTB_SIZE)
*  @param[in]  wd_offset   : column offset of the block in the source
*  @param[in]  block_wd    : block width (asserted <= MAX_CTB_SIZE)
*  @param[in]  pf_copy_2d  : 2D copy routine used to fill the local buffer
*  @param[in]  pf_scaling_filter_mxn : filter used for widths that are a
*                            multiple of 16
*
*  @return void
*
********************************************************************************
*/
void ihevce_scale_by_2(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_dst,
    WORD32 dst_strd,
    WORD32 wd,
    WORD32 ht,
    UWORD8 *pu1_wkg_mem,
    WORD32 ht_offset,
    WORD32 block_ht,
    WORD32 wd_offset,
    WORD32 block_wd,
    FT_COPY_2D *pf_copy_2d,
    FT_SCALING_FILTER_BY_2 *pf_scaling_filter_mxn)
{
#define N_TAPS 7
#define MAX_BLK_SZ (MAX_CTB_SIZE + ((N_TAPS >> 1) << 1))
    /* Local copy of the block with a 3-sample margin on every side */
    UWORD8 au1_pad_buf[MAX_BLK_SZ * MAX_BLK_SZ];
    const UWORD32 pad_strd = MAX_BLK_SZ;

    FT_SCALING_FILTER_BY_2 *pf_filter;
    UWORD8 *pu1_filt_in;
    UWORD8 *pu1_row;
    WORD32 filt_in_strd;
    WORD32 at_left, at_top, at_right, at_bottom;
    WORD32 half_blk_ht;
    WORD32 edge_pad;
    WORD32 k;

    assert((wd & 1) == 0);
    assert((ht & 1) == 0);
    assert(block_wd <= MAX_CTB_SIZE);
    assert(block_ht <= MAX_CTB_SIZE);

    /* Which picture edges is this block too close to for the filter taps? */
    at_left = (wd_offset < (N_TAPS >> 1));
    at_top = (ht_offset < (N_TAPS >> 1));
    at_right = ((wd_offset + block_wd) > (wd - (N_TAPS >> 1)));
    at_bottom = ((ht_offset + block_ht) > (ht - (N_TAPS >> 1)));

    /* Partial blocks at the right / bottom edge */
    if(at_right && (wd % block_wd != 0))
    {
        block_wd = (wd % block_wd);
    }
    if(at_bottom && (ht % block_ht != 0))
    {
        block_ht = (ht % block_ht);
    }

    if(at_left || at_right || at_bottom || at_top)
    {
        /* Margin that really exists in the source on each side (0 or 3) */
        const WORD32 lead_cols = at_left ? 0 : 3;
        const WORD32 lead_rows = at_top ? 0 : 3;
        const WORD32 trail_cols = at_right ? 0 : 3;
        const WORD32 trail_rows = at_bottom ? 0 : 3;
        UWORD8 *pu1_from = pu1_src + wd_offset + ht_offset * src_strd;
        UWORD8 *pu1_to = au1_pad_buf + pad_strd * (3 - lead_rows) + (3 - lead_cols);

        pu1_from -= (lead_cols + src_strd * lead_rows);

        pf_copy_2d(
            pu1_to,
            pad_strd,
            pu1_from,
            src_strd,
            block_wd + lead_cols + trail_cols,
            block_ht + lead_rows + trail_rows);

        /* The block itself always sits at (3, 3) inside the local buffer */
        pu1_filt_in = au1_pad_buf + pad_strd * 3 + 3;
        filt_in_strd = pad_strd;
    }
    else
    {
        /* Interior block: filter straight from the source */
        pu1_filt_in = pu1_src + wd_offset + ht_offset * src_strd;
        filt_in_strd = src_strd;
    }

    /* Replicate the first block row into the three margin rows above it */
    if(at_top)
    {
        UWORD8 *pu1_first = au1_pad_buf + pad_strd * 3;

        for(k = 2; k >= 0; k--)
        {
            memcpy(au1_pad_buf + pad_strd * k, pu1_first, block_wd + 6);
        }
    }

    /* Replicate the last block row into the three margin rows below it */
    if(at_bottom)
    {
        UWORD8 *pu1_last = au1_pad_buf + pad_strd * 3 + (block_ht - 1) * pad_strd;

        for(k = 1; k <= 3; k++)
        {
            memcpy(pu1_last + pad_strd * k, pu1_last, block_wd + 6);
        }
    }

    /* Replicate the first block column into the three margin columns */
    if(at_left)
    {
        pu1_row = au1_pad_buf;
        for(k = 0; k < block_ht + 6; k++)
        {
            memset(pu1_row, pu1_row[3], 3);
            pu1_row += pad_strd;
        }
    }

    /* Replicate the last block column into the three margin columns */
    if(at_right)
    {
        pu1_row = au1_pad_buf + 3 + block_wd;
        for(k = 0; k < block_ht + 6; k++)
        {
            memset(pu1_row, pu1_row[-1], 3);
            pu1_row += pad_strd;
        }
    }

    /* Widths that are a multiple of 16 go to the supplied filter */
    pf_filter = ihevce_scaling_filter_mxn;
    if(block_wd % 16 == 0)
    {
        pf_filter = pf_scaling_filter_mxn;
    }
    pf_filter(
        pu1_filt_in,
        filt_in_strd,
        pu1_wkg_mem,
        block_wd >> 1,
        pu1_dst + (wd_offset >> 1),
        dst_strd,
        block_ht,
        block_wd);

    half_blk_ht = block_ht >> 1;

    /* Left padding of 16 for 1st block of every row */
    if(wd_offset == 0)
    {
        pu1_row = pu1_dst;
        for(k = 0; k < half_blk_ht; k++)
        {
            memset(pu1_row - 16, pu1_row[0], 16);
            pu1_row += dst_strd;
        }
    }

    /* Everything below is done only once the last block of the row is scaled */
    if(wd != wd_offset + block_wd)
    {
        return;
    }

    /* Right padding: replicate the last output column */
    edge_pad = 16 + CEIL16((wd >> 1)) - (wd >> 1) + 4;
    pu1_row = pu1_dst + (wd >> 1) - 1;
    for(k = 0; k < half_blk_ht; k++)
    {
        memset(pu1_row + 1, pu1_row[0], edge_pad);
        pu1_row += dst_strd;
    }

    /* Top padding of 16 rows, first block row only. Whole strides are copied */
    /* so the left / right padding written above is replicated as well */
    if(ht_offset == 0)
    {
        pu1_row = pu1_dst - 16;
        for(k = 1; k <= 16; k++)
        {
            memcpy(pu1_row - (k * dst_strd), pu1_row, dst_strd);
        }
    }

    /* Bottom padding, only when the end of the frame has been reached */
    if(ht - ht_offset - block_ht == 0)
    {
        edge_pad = 16 + CEIL16((ht >> 1)) - (ht >> 1) + 4;
        pu1_row = pu1_dst + ((half_blk_ht - 1) * dst_strd) - 16;
        for(k = 1; k <= edge_pad; k++)
        {
            memcpy(pu1_row + (k * dst_strd), pu1_row, dst_strd);
        }
    }
}
1834 
1835 /*!
1836 ******************************************************************************
1837 * \if Function name : ihevce_decomp_pre_intra_process_row \endif
1838 *
1839 * \brief
1840 *    Row level function which down scales a given row by 2 in horz and
1841 *    vertical direction creates output of size wd/2 * ht/2.
1842 *
1843 *  @param[in]  pu1_src : soource pointer
1844 *  @param[in]  src_stride : source stride
1845 *  @param[out] pu1_dst : desitnation pointer
1846 *  @param[in]  dst_stride : destination stride
1847 *  @param[in]  layer_wd : layer width
1848 *  @param[in]  layer_ht : layer height
1849 *  @param[in]  ht_offset : height offset of the block to be scaled
1850 *  @param[in]  block_ht : height of the block to be scaled
1851 *  @param[in]  wd_offset : width offset of the block to be scaled
1852 *  @param[in]  block_wd : width of the block to be scaled
1853 *  @param[in]  num_col_blks : number of col blks in that row
1854 *
1855 * \return None
1856 *
1857 *  @NOTE : When decompositionis done from L1 to L2 pre intra analysis is
1858 *          done on L1
1859 *
1860 *****************************************************************************
1861 */
ihevce_decomp_pre_intra_process_row(UWORD8 * pu1_src,WORD32 src_stride,UWORD8 * pu1_dst_decomp,WORD32 dst_stride,WORD32 layer_wd,WORD32 layer_ht,UWORD8 * pu1_wkg_mem,WORD32 ht_offset,WORD32 block_ht,WORD32 block_wd,WORD32 i4_cu_aligned_pic_wd,WORD32 i4_cu_aligned_pic_ht,WORD32 num_col_blks,WORD32 layer_no,ihevce_ed_ctxt_t * ps_ed_ctxt,ihevce_ed_blk_t * ps_ed_row,ihevce_ed_ctb_l1_t * ps_ed_ctb_l1_row,ihevce_8x8_L0_satd_t * ps_layer0_cur_satd,ihevce_8x8_L0_mean_t * ps_layer0_cur_mean,WORD32 num_4x4_blks_ctb_y,WORD32 num_4x4_blks_last_ctb_x,WORD32 skip_decomp,WORD32 skip_pre_intra,WORD32 row_block_no,WORD32 i4_enable_noise_detection,ctb_analyse_t * ps_ctb_analyse,ihevce_ipe_optimised_function_list_t * ps_ipe_optimised_function_list,ihevce_cmn_opt_func_t * ps_cmn_utils_optimised_function_list)1862 void ihevce_decomp_pre_intra_process_row(
1863     UWORD8 *pu1_src,
1864     WORD32 src_stride,
1865     UWORD8 *pu1_dst_decomp,
1866     WORD32 dst_stride,
1867     WORD32 layer_wd,
1868     WORD32 layer_ht,
1869     UWORD8 *pu1_wkg_mem,
1870     WORD32 ht_offset,
1871     WORD32 block_ht,
1872     WORD32 block_wd,
1873     WORD32 i4_cu_aligned_pic_wd,
1874     WORD32 i4_cu_aligned_pic_ht,
1875     WORD32 num_col_blks,
1876     WORD32 layer_no,
1877     ihevce_ed_ctxt_t *ps_ed_ctxt,
1878     ihevce_ed_blk_t *ps_ed_row,
1879     ihevce_ed_ctb_l1_t *ps_ed_ctb_l1_row,
1880     ihevce_8x8_L0_satd_t *ps_layer0_cur_satd,
1881     ihevce_8x8_L0_mean_t *ps_layer0_cur_mean,
1882     WORD32 num_4x4_blks_ctb_y,
1883     WORD32 num_4x4_blks_last_ctb_x,
1884     WORD32 skip_decomp,
1885     WORD32 skip_pre_intra,
1886     WORD32 row_block_no,
1887     WORD32 i4_enable_noise_detection,
1888     ctb_analyse_t *ps_ctb_analyse,
1889     ihevce_ipe_optimised_function_list_t *ps_ipe_optimised_function_list,
1890     ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list)
1891 {
1892     WORD32 col_block_no;
1893 
1894     //ihevce_ed_ctxt_t *ps_ed_ctxt = (ihevce_ed_ctxt_t *)pv_ed_ctxt;
1895     UWORD8 *pu1_src_pre_intra = pu1_src + (ht_offset * src_stride);
1896     WORD32 num_4x4_blks_in_ctb = block_wd >> 2;
1897     //WORD32 nbr_flags[64];
1898     WORD32 *nbr_flags_ptr = &ps_ed_ctxt->ai4_nbr_flags[0];
1899     WORD32 src_inc_pre_intra = num_4x4_blks_in_ctb * 4;
1900     WORD32 inc_ctb = 0;
1901     ihevce_ed_blk_t *ps_ed_ctb = ps_ed_row;
1902     ihevce_ed_ctb_l1_t *ps_ed_ctb_l1 = ps_ed_ctb_l1_row;
1903     WORD32 i, j;
1904     WORD32 do_pre_intra_analysis;
1905     pf_ed_calc_ctb ed_calc_ctb;
1906     ctb_analyse_t *ps_ctb_analyse_curr;
1907 
1908     (void)i4_cu_aligned_pic_wd;
1909     (void)i4_cu_aligned_pic_ht;
1910     (void)ps_layer0_cur_satd;
1911     (void)ps_layer0_cur_mean;
1912     (void)i4_enable_noise_detection;
1913     /*increment the struct pointer to point to the first CTB of the current row. */
1914     ps_ctb_analyse_curr = ps_ctb_analyse + row_block_no * num_col_blks;
1915 
1916     //if((num_4x4_blks_ctb_x == num_4x4_blks_ctb_y) && (num_4x4_blks_in_ctb == num_4x4_blks_ctb_x) )
1917     if(num_4x4_blks_in_ctb == num_4x4_blks_ctb_y)
1918     {
1919         ed_calc_ctb = ihevce_ed_calc_ctb;
1920     }
1921     else
1922     {
1923         ed_calc_ctb = ihevce_ed_calc_incomplete_ctb;
1924     }
1925 
1926     inc_ctb = num_4x4_blks_in_ctb * num_4x4_blks_in_ctb;
1927 
1928     do_pre_intra_analysis = ((layer_no == 1) || (layer_no == 2)) && (!skip_pre_intra);
1929 
1930     /*
1931     * For optimal pre intra analysis first block is processed outside
1932     * the loop.
1933     */
1934     if(!skip_decomp)
1935     {
1936         ihevce_scale_by_2(
1937             pu1_src,
1938             src_stride,
1939             pu1_dst_decomp,
1940             dst_stride,
1941             layer_wd,
1942             layer_ht,
1943             pu1_wkg_mem,
1944             ht_offset,
1945             block_ht,
1946             block_wd * 0,
1947             block_wd,
1948             ps_cmn_utils_optimised_function_list->pf_copy_2d,
1949             ps_ipe_optimised_function_list->pf_scaling_filter_mxn);
1950         /* Disable noise detection */
1951         ps_ctb_analyse_curr->s_ctb_noise_params.i4_noise_present = 0;
1952 
1953         memset(
1954             ps_ctb_analyse_curr->s_ctb_noise_params.au1_is_8x8Blk_noisy,
1955             0,
1956             sizeof(ps_ctb_analyse_curr->s_ctb_noise_params.au1_is_8x8Blk_noisy));
1957     }
1958 
1959     /*
1960     * Pre intra analysis for the first ctb.
1961     * To analyse any given CTB we need to set the availability flags of the
1962     * following neighbouring CTB: BL,L,TL,T,TR.
1963     */
1964     if(do_pre_intra_analysis)
1965     {
1966         /*
1967         * At the beginning of ctb row set left intra modes to default value.
1968         */
1969         for(j = 0; j < num_4x4_blks_ctb_y; j++)
1970         {
1971             ps_ed_ctxt->left_ctb_intra_modes[j] = INTRA_DC;
1972         }
1973 
1974         /*
1975         * Copy the neighbor flags for a general ctb (ctb inside the frame; not any corners).
1976         * The table gau4_nbr_flags_8x8_4x4blks generated for 16x16 4x4 blocks(ctb_size = 64).
1977         * But the same table holds good for other 4x4 blocks 2d arrays(eg 8x8 4x4 blks,4x4 4x4blks).
1978         * But the flags must be accessed with stride of 16 since the table has been generated for
1979         * ctb_size = 64. For odd 4x4 2d arrays(eg 3x3 4x4 blks) the flags needs modification.
1980         * The flags also need modification for corner ctbs.
1981         */
1982         memcpy(
1983             ps_ed_ctxt->ai4_nbr_flags,
1984             gau4_nbr_flags_8x8_4x4blks,
1985             sizeof(gau4_nbr_flags_8x8_4x4blks));
1986 
1987         /*
1988         * Since this is the fist ctb in the ctb row, set left flags unavailable for 1st CTB col
1989         */
1990         for(j = 0; j < num_4x4_blks_ctb_y; j++)
1991         {
1992             SET_L_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j * 8]);
1993             SET_BL_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j * 8]);
1994             SET_TL_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j * 8]);
1995         }
1996         /*
1997         * If this is the fist ctb row, set top flags unavailable.
1998         */
1999         if(ht_offset == 0)
2000         {
2001             for(j = 0; j < num_4x4_blks_in_ctb; j++)
2002             {
2003                 SET_T_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j]);
2004                 SET_TR_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j]);
2005                 SET_TL_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j]);
2006             }
2007         }
2008 
2009         /* If this is last ctb row,set BL as not available. */
2010         if(ht_offset + block_ht >= layer_ht)
2011         {
2012             for(j = 0; j < num_4x4_blks_in_ctb; j++)
2013             {
2014                 SET_BL_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[(num_4x4_blks_ctb_y - 1) * 8 + j]);
2015             }
2016         }
2017         col_block_no = 0;
2018         /* Call intra analysis for the ctb */
2019         ed_calc_ctb(
2020             ps_ed_ctxt,
2021             ps_ed_ctb,
2022             ps_ed_ctb_l1,
2023             pu1_src_pre_intra,
2024             src_stride,
2025             num_4x4_blks_in_ctb,
2026             num_4x4_blks_ctb_y,
2027             nbr_flags_ptr,
2028             layer_no,
2029             row_block_no,
2030             col_block_no,
2031             ps_ipe_optimised_function_list,
2032             ps_cmn_utils_optimised_function_list
2033 
2034         );
2035 
2036         pu1_src_pre_intra += src_inc_pre_intra;
2037         ps_ed_ctb += inc_ctb;
2038         ps_ed_ctb_l1 += 1;
2039         /*
2040         * For the rest of the ctbs, set left flags available.
2041         */
2042         for(j = 0; j < num_4x4_blks_ctb_y; j++)
2043         {
2044             SET_L_AVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j * 8]);
2045         }
2046         for(j = 0; j < num_4x4_blks_ctb_y - 1; j++)
2047         {
2048             SET_BL_AVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j * 8]);
2049             SET_TL_AVAILABLE(ps_ed_ctxt->ai4_nbr_flags[(j + 1) * 8]);
2050         }
2051         if(ht_offset != 0)
2052         {
2053             SET_TL_AVAILABLE(ps_ed_ctxt->ai4_nbr_flags[0]);
2054         }
2055     }
2056 
2057     /* The first ctb is processed before the loop.
2058     * The last one is processed after the loop.
2059     */
2060     for(col_block_no = 1; col_block_no < num_col_blks - 1; col_block_no++)
2061     {
2062         if(!skip_decomp)
2063         {
2064             ihevce_scale_by_2(
2065                 pu1_src,
2066                 src_stride,
2067                 pu1_dst_decomp,
2068                 dst_stride,
2069                 layer_wd,
2070                 layer_ht,
2071                 pu1_wkg_mem,
2072                 ht_offset,
2073                 block_ht,
2074                 block_wd * col_block_no,
2075                 block_wd,
2076                 ps_cmn_utils_optimised_function_list->pf_copy_2d,
2077                 ps_ipe_optimised_function_list->pf_scaling_filter_mxn);
2078             /* Disable noise detection */
2079             memset(
2080                 ps_ctb_analyse_curr->s_ctb_noise_params.au1_is_8x8Blk_noisy,
2081                 0,
2082                 sizeof(ps_ctb_analyse_curr->s_ctb_noise_params.au1_is_8x8Blk_noisy));
2083 
2084             ps_ctb_analyse_curr->s_ctb_noise_params.i4_noise_present = 0;
2085         }
2086 
2087         if(do_pre_intra_analysis)
2088         {
2089             ed_calc_ctb(
2090                 ps_ed_ctxt,
2091                 ps_ed_ctb,
2092                 ps_ed_ctb_l1,
2093                 pu1_src_pre_intra,
2094                 src_stride,
2095                 num_4x4_blks_in_ctb,
2096                 num_4x4_blks_ctb_y,
2097                 nbr_flags_ptr,
2098                 layer_no,
2099                 row_block_no,
2100                 col_block_no,
2101                 ps_ipe_optimised_function_list,
2102                 ps_cmn_utils_optimised_function_list);
2103             pu1_src_pre_intra += src_inc_pre_intra;
2104             ps_ed_ctb += inc_ctb;
2105             ps_ed_ctb_l1 += 1;
2106         }
2107     }
2108 
2109     /* Last ctb in row */
2110     if((!skip_decomp) && (col_block_no == (num_col_blks - 1)))
2111     {
2112         ihevce_scale_by_2(
2113             pu1_src,
2114             src_stride,
2115             pu1_dst_decomp,
2116             dst_stride,
2117             layer_wd,
2118             layer_ht,
2119             pu1_wkg_mem,
2120             ht_offset,
2121             block_ht,
2122             block_wd * col_block_no,
2123             block_wd,
2124             ps_cmn_utils_optimised_function_list->pf_copy_2d,
2125             ps_ipe_optimised_function_list->pf_scaling_filter_mxn);
2126         {
2127             /* Disable noise detection */
2128             memset(
2129                 ps_ctb_analyse_curr->s_ctb_noise_params.au1_is_8x8Blk_noisy,
2130                 0,
2131                 sizeof(ps_ctb_analyse_curr->s_ctb_noise_params.au1_is_8x8Blk_noisy));
2132 
2133             ps_ctb_analyse_curr->s_ctb_noise_params.i4_noise_present = 0;
2134         }
2135     }
2136 
2137     if(do_pre_intra_analysis && (col_block_no == (num_col_blks - 1)))
2138     {
2139         /*
2140         * The last ctb can be complete or incomplete. The complete
2141         * ctb is handled in the if and incomplete is handled in the
2142         * else case
2143         */
2144         //if(num_4x4_blks_last_ctb == num_4x4_blks_in_ctb)
2145         if((num_4x4_blks_last_ctb_x == num_4x4_blks_ctb_y) &&
2146            (num_4x4_blks_in_ctb == num_4x4_blks_last_ctb_x))
2147         {
2148             /* Last ctb so set top right not available */
2149             SET_TR_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[num_4x4_blks_in_ctb - 1]);
2150 
2151             ed_calc_ctb(
2152                 ps_ed_ctxt,
2153                 ps_ed_ctb,
2154                 ps_ed_ctb_l1,
2155                 pu1_src_pre_intra,
2156                 src_stride,
2157                 num_4x4_blks_in_ctb,
2158                 num_4x4_blks_in_ctb,
2159                 nbr_flags_ptr,
2160                 layer_no,
2161                 row_block_no,
2162                 col_block_no,
2163                 ps_ipe_optimised_function_list,
2164                 ps_cmn_utils_optimised_function_list);
2165             pu1_src_pre_intra += src_inc_pre_intra;
2166             ps_ed_ctb += inc_ctb;
2167             ps_ed_ctb_l1 += 1;
2168         }
2169         else
2170         {
2171             /* Last ctb so set top right not available */
2172             for(i = 0; i < num_4x4_blks_ctb_y; i++)
2173             {
2174                 SET_TR_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[i * 8 + num_4x4_blks_in_ctb - 1]);
2175             }
2176 
2177             ihevce_ed_calc_incomplete_ctb(
2178                 ps_ed_ctxt,
2179                 ps_ed_ctb,
2180                 ps_ed_ctb_l1,
2181                 pu1_src_pre_intra,
2182                 src_stride,
2183                 num_4x4_blks_last_ctb_x,
2184                 num_4x4_blks_ctb_y,
2185                 nbr_flags_ptr,
2186                 layer_no,
2187                 row_block_no,
2188                 col_block_no,
2189                 ps_ipe_optimised_function_list,
2190                 ps_cmn_utils_optimised_function_list);
2191         }
2192     }
2193 }
2194 
2195 /*!
2196 ******************************************************************************
2197 * \if Function name : ihevce_decomp_pre_intra_process \endif
2198 *
2199 * \brief
2200 *    Frame level function to decompose given layer L0 into coarser layers
2201 *
2202 * \param[in] pv_ctxt : pointer to master context of decomp_pre_intra module
2203 * \param[in] ps_inp  : pointer to input yuv buffer (frame buffer)
2204 * \param[in] pv_multi_thrd_ctxt : pointer to multithread context
2205 * \param[out] thrd_id : thread id
2206 *
2207 * \return
2208 *    None
2209 *
2210 * \author
2211 *  Ittiam
2212 *
2213 *****************************************************************************
2214 */
ihevce_decomp_pre_intra_process(void * pv_ctxt,ihevce_lap_output_params_t * ps_lap_out_prms,frm_ctb_ctxt_t * ps_frm_ctb_prms,void * pv_multi_thrd_ctxt,WORD32 thrd_id,WORD32 i4_ping_pong,ihevce_8x8_L0_satd_t * ps_layer0_cur_satd,ihevce_8x8_L0_mean_t * ps_layer0_cur_mean)2215 void ihevce_decomp_pre_intra_process(
2216     void *pv_ctxt,
2217     ihevce_lap_output_params_t *ps_lap_out_prms,
2218     frm_ctb_ctxt_t *ps_frm_ctb_prms,
2219     void *pv_multi_thrd_ctxt,
2220     WORD32 thrd_id,
2221     WORD32 i4_ping_pong,
2222     ihevce_8x8_L0_satd_t *ps_layer0_cur_satd,
2223     ihevce_8x8_L0_mean_t *ps_layer0_cur_mean)
2224 {
2225     WORD32 i4_layer_no;
2226     WORD32 i4_num_layers;
2227     WORD32 end_of_layer;
2228     UWORD8 *pu1_src, *pu1_dst;
2229     WORD32 src_stride, dst_stride;
2230     WORD32 i4_layer_wd, i4_layer_ht;
2231     WORD32 ht_offset, block_ht;
2232     WORD32 row_block_no, num_row_blocks;
2233     UWORD8 *pu1_wkg_mem;
2234     WORD32 block_wd;
2235     WORD32 num_col_blks;
2236     WORD32 skip_decomp, skip_pre_intra;
2237     WORD32 i4_cu_aligned_pic_wd, i4_cu_aligned_pic_ht;
2238     ihevce_decomp_pre_intra_master_ctxt_t *ps_master_ctxt =
2239         (ihevce_decomp_pre_intra_master_ctxt_t *)pv_ctxt;
2240 
2241     ihevce_decomp_pre_intra_ctxt_t *ps_ctxt =
2242         ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[thrd_id];
2243     multi_thrd_ctxt_t *ps_multi_thrd = (multi_thrd_ctxt_t *)pv_multi_thrd_ctxt;
2244 
2245     ihevce_ed_ctxt_t *ps_ed_ctxt;
2246     ihevce_ed_blk_t *ps_ed;
2247     ihevce_ed_ctb_l1_t *ps_ed_ctb_l1;
2248     WORD32 inc_ctb = 0;
2249     WORD32 num_4x4_blks_lyr;
2250 
2251     i4_num_layers = ps_ctxt->i4_num_layers;
2252 
2253     ASSERT(i4_num_layers >= 3);
2254 
2255     /*
2256      * Always force minimum layers as 4 so that we would have both l1 and l2
2257      * pre intra analysis
2258      */
2259     if(i4_num_layers == 3)
2260     {
2261         i4_num_layers = 4;
2262     }
2263 
2264     ps_ctxt->as_layers[0].pu1_inp = (UWORD8 *)ps_lap_out_prms->s_input_buf.pv_y_buf;
2265     ps_ctxt->as_layers[0].i4_inp_stride = ps_lap_out_prms->s_input_buf.i4_y_strd;
2266     ps_ctxt->as_layers[0].i4_actual_wd = ps_lap_out_prms->s_input_buf.i4_y_wd;
2267     ps_ctxt->as_layers[0].i4_actual_ht = ps_lap_out_prms->s_input_buf.i4_y_ht;
2268 
2269     /* ------------ Loop over all the layers --------------- */
2270     /* This loop does only decomp for all layers by picking jobs from job queue */
2271     /* Decomp for all layers will completed with this for loop */
2272     for(i4_layer_no = 0; i4_layer_no < (i4_num_layers - 1); i4_layer_no++)
2273     {
2274         WORD32 idx = 0;
2275         src_stride = ps_ctxt->as_layers[i4_layer_no].i4_inp_stride;
2276         pu1_src = ps_ctxt->as_layers[i4_layer_no].pu1_inp;
2277         i4_layer_wd = ps_ctxt->as_layers[i4_layer_no].i4_actual_wd;
2278         i4_layer_ht = ps_ctxt->as_layers[i4_layer_no].i4_actual_ht;
2279         pu1_dst = ps_ctxt->as_layers[i4_layer_no + 1].pu1_inp;
2280         dst_stride = ps_ctxt->as_layers[i4_layer_no + 1].i4_inp_stride;
2281         block_wd = ps_ctxt->as_layers[i4_layer_no].i4_decomp_blk_wd;
2282         block_ht = ps_ctxt->as_layers[i4_layer_no].i4_decomp_blk_ht;
2283         num_col_blks = ps_ctxt->as_layers[i4_layer_no].i4_num_col_blks;
2284         num_row_blocks = ps_ctxt->as_layers[i4_layer_no].i4_num_row_blks;
2285         i4_cu_aligned_pic_wd = ps_frm_ctb_prms->i4_cu_aligned_pic_wd;
2286         i4_cu_aligned_pic_ht = ps_frm_ctb_prms->i4_cu_aligned_pic_ht;
2287 
2288         /* register ed_ctxt buffer pointer */
2289         //pv_ed_ctxt =  &ps_ctxt->as_layers[i4_layer_no].s_early_decision;
2290         //ps_ed_ctxt = (ihevce_ed_ctxt_t *)pv_ed_ctxt;
2291         //ps_ed = ps_ed_ctxt->ps_ed;
2292 
2293         //pv_ed_ctxt = &ps_ctxt->ps_ed_ctxt;
2294         ps_ed_ctxt = ps_ctxt->ps_ed_ctxt;
2295 
2296         /* initialize ed_ctxt here */
2297         /* init is moved here since now allocation is happening for only one instance
2298         is allocated. for each layer it is re-used */
2299         ps_ed_ctxt->lambda = ps_ctxt->ai4_lambda[i4_layer_no];
2300         ps_ed_ctxt->i4_slice_type = ps_ctxt->i4_slice_type;
2301         ps_ed_ctxt->level = ps_ctxt->i4_codec_level;
2302         if(0 == i4_layer_no)
2303         {
2304             ps_ed_ctxt->ps_ed_pic = NULL;
2305             ps_ed_ctxt->ps_ed = NULL;
2306             ps_ed_ctxt->ps_ed_ctb_l1_pic = NULL;
2307             ps_ed_ctxt->ps_ed_ctb_l1 = NULL;
2308         }
2309         else if(1 == i4_layer_no)
2310         {
2311             ps_ed_ctxt->ps_ed_pic = ps_ctxt->ps_layer1_buf;
2312             ps_ed_ctxt->ps_ed = ps_ctxt->ps_layer1_buf;
2313             ps_ed_ctxt->ps_ed_ctb_l1_pic = ps_ctxt->ps_ed_ctb_l1;
2314             ps_ed_ctxt->ps_ed_ctb_l1 = ps_ctxt->ps_ed_ctb_l1;
2315             ps_ctxt->ps_layer0_cur_satd = NULL;
2316             ps_ctxt->ps_layer0_cur_mean = NULL;
2317         }
2318         else if(2 == i4_layer_no)
2319         {
2320             ps_ed_ctxt->ps_ed_pic = ps_ctxt->ps_layer2_buf;
2321             ps_ed_ctxt->ps_ed = ps_ctxt->ps_layer2_buf;
2322             ps_ed_ctxt->ps_ed_ctb_l1_pic = NULL;
2323             ps_ed_ctxt->ps_ed_ctb_l1 = NULL;
2324             ps_ctxt->ps_layer0_cur_satd = NULL;
2325             ps_ctxt->ps_layer0_cur_mean = NULL;
2326         }
2327 
2328         /*Calculate the number of 4x4 blocks in a CTB in that layer*/
2329         /*Divide block_wd by 4. 4 to get no of 4x4 blks*/
2330         num_4x4_blks_lyr = block_wd >> 2;
2331         inc_ctb = num_4x4_blks_lyr * num_4x4_blks_lyr;
2332 
2333         ps_ed = ps_ed_ctxt->ps_ed;
2334         ps_ed_ctb_l1 = ps_ed_ctxt->ps_ed_ctb_l1;
2335 
2336         end_of_layer = 0;
2337         skip_decomp = 0;
2338         skip_pre_intra = 1;
2339         //if( i4_layer_no >= ps_ctxt->i4_num_layers)
2340         if(i4_layer_no >= (ps_ctxt->i4_num_layers - 1))
2341         {
2342             skip_decomp = 1;
2343         }
2344         /* ------------ Loop over all the CTB rows --------------- */
2345         while(0 == end_of_layer)
2346         {
2347             job_queue_t *ps_pre_enc_job;
2348             WORD32 num_4x4_blks_ctb_y = 0;
2349             WORD32 num_4x4_blks_last_ctb_x = 0;
2350 
2351             /* Get the current row from the job queue */
2352             ps_pre_enc_job = (job_queue_t *)ihevce_pre_enc_grp_get_next_job(
2353                 pv_multi_thrd_ctxt, (DECOMP_JOB_LYR0 + i4_layer_no), 1, i4_ping_pong);
2354 
2355             pu1_wkg_mem = ps_ctxt->pu1_wkg_mem;
2356 
2357             /* If all rows are done, set the end of layer flag to 1, */
2358             if(NULL == ps_pre_enc_job)
2359             {
2360                 end_of_layer = 1;
2361             }
2362             else
2363             {
2364                 /* Obtain the current row's details from the job */
2365                 row_block_no = ps_pre_enc_job->s_job_info.s_decomp_job_info.i4_vert_unit_row_no;
2366                 ps_ctxt->as_layers[i4_layer_no].ai4_curr_row_no[idx] = row_block_no;
2367                 ht_offset = row_block_no * block_ht;
2368 
2369                 if(row_block_no < (num_row_blocks))
2370                 {
2371                     pu1_dst = ps_ctxt->as_layers[i4_layer_no + 1].pu1_inp +
2372                               ((block_ht >> 1) * dst_stride * row_block_no);
2373 
2374                     /*L0 8x8 curr satd for qp mod*/
2375                     if(i4_layer_no == 0)
2376                     {
2377                         ps_ctxt->ps_layer0_cur_satd =
2378                             ps_layer0_cur_satd + (row_block_no * num_col_blks /*num ctbs*/ *
2379                                                   (block_wd >> 3) * (block_ht >> 3));
2380                         ps_ctxt->ps_layer0_cur_mean =
2381                             ps_layer0_cur_mean + (row_block_no * num_col_blks /*num ctbs*/ *
2382                                                   (block_wd >> 3) * (block_ht >> 3));
2383                     }
2384 
2385                     /* call the row level processing function */
2386                     ihevce_decomp_pre_intra_process_row(
2387                         pu1_src,
2388                         src_stride,
2389                         pu1_dst,
2390                         dst_stride,
2391                         i4_layer_wd,
2392                         i4_layer_ht,
2393                         pu1_wkg_mem,
2394                         ht_offset,
2395                         block_ht,
2396                         block_wd,
2397                         i4_cu_aligned_pic_wd,
2398                         i4_cu_aligned_pic_ht,
2399                         num_col_blks,
2400                         i4_layer_no,
2401                         ps_ed_ctxt,
2402                         ps_ed,
2403                         ps_ed_ctb_l1,
2404                         ps_ctxt->ps_layer0_cur_satd,
2405                         ps_ctxt->ps_layer0_cur_mean,
2406                         num_4x4_blks_ctb_y,
2407                         num_4x4_blks_last_ctb_x,
2408                         skip_decomp,
2409                         skip_pre_intra,
2410                         row_block_no,
2411                         ps_ctxt->i4_enable_noise_detection,
2412                         ps_ctxt->ps_ctb_analyse,
2413                         &ps_ctxt->s_ipe_optimised_function_list,
2414                         &ps_ctxt->s_cmn_opt_func);
2415 
2416                     /*When decompositionis done from L1 to L2
2417                     pre intra analysis is done on L1*/
2418                     if(i4_layer_no == 1 || i4_layer_no == 2)
2419                     {
2420                         // ps_ed   = ps_ed_ctxt->ps_ed +
2421                         //          (row_block_no * inc_ctb * (num_col_blks));
2422                     }
2423                 }
2424                 idx++;
2425                 /* set the output dependency */
2426                 ihevce_pre_enc_grp_job_set_out_dep(
2427                     pv_multi_thrd_ctxt, ps_pre_enc_job, i4_ping_pong);
2428             }
2429         }
2430         ps_ctxt->as_layers[i4_layer_no].i4_num_rows_processed = idx;
2431 
2432         ihevce_ed_frame_init(ps_ed_ctxt, i4_layer_no);
2433 
2434         if((1 == i4_layer_no) && (IHEVCE_QUALITY_P6 == ps_ctxt->i4_quality_preset))
2435         {
2436             WORD32 vert_ctr, ctb_ctr, i;
2437             WORD32 ctb_ctr_blks = ps_ctxt->as_layers[1].i4_num_col_blks;
2438             WORD32 vert_ctr_blks = ps_ctxt->as_layers[1].i4_num_row_blks;
2439 
2440             if((ps_ctxt->i4_quality_preset == IHEVCE_QUALITY_P6) &&
2441                (ps_lap_out_prms->i4_temporal_lyr_id > TEMPORAL_LAYER_DISABLE))
2442             {
2443                 for(vert_ctr = 0; vert_ctr < vert_ctr_blks; vert_ctr++)
2444                 {
2445                     ihevce_ed_ctb_l1_t *ps_ed_ctb_row_l1 =
2446                         ps_ctxt->ps_ed_ctb_l1 + vert_ctr * ps_frm_ctb_prms->i4_num_ctbs_horz;
2447 
2448                     for(ctb_ctr = 0; ctb_ctr < ctb_ctr_blks; ctb_ctr++)
2449                     {
2450                         ihevce_ed_ctb_l1_t *ps_ed_ctb_curr_l1 = ps_ed_ctb_row_l1 + ctb_ctr;
2451                         for(i = 0; i < 16; i++)
2452                         {
2453                             ps_ed_ctb_curr_l1->i4_best_sad_cost_8x8_l1_ipe[i] = 0x7fffffff;
2454                             ps_ed_ctb_curr_l1->i4_best_sad_8x8_l1_ipe[i] = 0x7fffffff;
2455                         }
2456                     }
2457                 }
2458             }
2459         }
2460 
2461 #if DISABLE_L2_IPE_IN_PB_L1_IN_B
2462         if(((2 == i4_layer_no) && (ps_lap_out_prms->i4_pic_type == IV_I_FRAME ||
2463                                    ps_lap_out_prms->i4_pic_type == IV_IDR_FRAME)) ||
2464            ((1 == i4_layer_no) &&
2465             (ps_lap_out_prms->i4_temporal_lyr_id <= TEMPORAL_LAYER_DISABLE)) ||
2466            ((IHEVCE_QUALITY_P6 != ps_ctxt->i4_quality_preset) && (0 != i4_layer_no)))
2467 #else
2468         if((0 != i4_layer_no) &&
2469            (1 != ((IHEVCE_QUALITY_P6 == ps_ctxt->i4_quality_preset) &&
2470                   (ps_lap_out_prms->i4_temporal_lyr_id > TEMPORAL_LAYER_DISABLE))))
2471 #endif
2472         {
2473             WORD32 i4_num_rows = ps_ctxt->as_layers[i4_layer_no].i4_num_rows_processed;
2474 
2475             src_stride = ps_ctxt->as_layers[i4_layer_no].i4_inp_stride;
2476             pu1_src = ps_ctxt->as_layers[i4_layer_no].pu1_inp;
2477             i4_layer_wd = ps_ctxt->as_layers[i4_layer_no].i4_actual_wd;
2478             i4_layer_ht = ps_ctxt->as_layers[i4_layer_no].i4_actual_ht;
2479             pu1_dst = ps_ctxt->as_layers[i4_layer_no + 1].pu1_inp;
2480             dst_stride = ps_ctxt->as_layers[i4_layer_no + 1].i4_inp_stride;
2481             block_wd = ps_ctxt->as_layers[i4_layer_no].i4_decomp_blk_wd;
2482             block_ht = ps_ctxt->as_layers[i4_layer_no].i4_decomp_blk_ht;
2483             num_col_blks = ps_ctxt->as_layers[i4_layer_no].i4_num_col_blks;
2484             num_row_blocks = ps_ctxt->as_layers[i4_layer_no].i4_num_row_blks;
2485             i4_cu_aligned_pic_wd = ps_frm_ctb_prms->i4_cu_aligned_pic_wd;
2486             i4_cu_aligned_pic_ht = ps_frm_ctb_prms->i4_cu_aligned_pic_ht;
2487 
2488             /* register ed_ctxt buffer pointer */
2489             ps_ed_ctxt = ps_ctxt->ps_ed_ctxt;
2490 
2491             /* initialize ed_ctxt here */
2492             /* init is moved here since now allocation is happening for only one instance
2493             is allocated. for each layer it is re-used */
2494             ps_ed_ctxt->lambda = ps_ctxt->ai4_lambda[i4_layer_no];
2495             ps_ed_ctxt->i4_slice_type = ps_ctxt->i4_slice_type;
2496             ps_ed_ctxt->level = ps_ctxt->i4_codec_level;
2497             if(1 == i4_layer_no)
2498             {
2499                 ps_ed_ctxt->ps_ed_pic = ps_ctxt->ps_layer1_buf;
2500                 ps_ed_ctxt->ps_ed = ps_ctxt->ps_layer1_buf;
2501                 ps_ed_ctxt->ps_ed_ctb_l1_pic = ps_ctxt->ps_ed_ctb_l1;
2502                 ps_ed_ctxt->ps_ed_ctb_l1 = ps_ctxt->ps_ed_ctb_l1;
2503                 ps_ctxt->ps_layer0_cur_satd = NULL;
2504                 ps_ctxt->ps_layer0_cur_mean = NULL;
2505             }
2506             else if(2 == i4_layer_no)
2507             {
2508                 ps_ed_ctxt->ps_ed_pic = ps_ctxt->ps_layer2_buf;
2509                 ps_ed_ctxt->ps_ed = ps_ctxt->ps_layer2_buf;
2510                 ps_ed_ctxt->ps_ed_ctb_l1_pic = NULL;
2511                 ps_ed_ctxt->ps_ed_ctb_l1 = NULL;
2512                 ps_ctxt->ps_layer0_cur_satd = NULL;
2513                 ps_ctxt->ps_layer0_cur_mean = NULL;
2514             }
2515 
2516             /*Calculate the number of 4x4 blocks in a CTB in that layer*/
2517             /*Divide block_wd by 4. 4 to get no of 4x4 blks*/
2518             num_4x4_blks_lyr = block_wd >> 2;
2519             inc_ctb = num_4x4_blks_lyr * num_4x4_blks_lyr;
2520 
2521             ps_ed = ps_ed_ctxt->ps_ed;
2522             ps_ed_ctb_l1 = ps_ed_ctxt->ps_ed_ctb_l1;
2523             skip_decomp = 1;
2524             skip_pre_intra = 0;
2525             for(idx = 0; idx < i4_num_rows; idx++)
2526             {
2527                 WORD32 num_4x4_blks_ctb_y = 0;
2528                 WORD32 num_4x4_blks_last_ctb_x = 0;
2529 
2530                 pu1_wkg_mem = ps_ctxt->pu1_wkg_mem;
2531 
2532                 {
2533                     /* Obtain the current row's details from the job */
2534                     row_block_no = ps_ctxt->as_layers[i4_layer_no].ai4_curr_row_no[idx];
2535                     ht_offset = row_block_no * block_ht;
2536 
2537                     if(row_block_no < (num_row_blocks))
2538                     {
2539                         pu1_dst = ps_ctxt->as_layers[i4_layer_no + 1].pu1_inp +
2540                                   ((block_ht >> 1) * dst_stride * row_block_no);
2541 
2542                         if(i4_layer_no == 1 || i4_layer_no == 2)
2543                         {
2544                             ps_ed = ps_ed_ctxt->ps_ed + (row_block_no * inc_ctb * (num_col_blks));
2545                             ps_ed_ctb_l1 = ps_ed_ctxt->ps_ed_ctb_l1 + (row_block_no * num_col_blks);
2546 
2547                             ps_ed_ctxt->i4_quality_preset = ps_ctxt->i4_quality_preset;
2548                             num_4x4_blks_ctb_y = block_ht >> 2;
2549                             num_4x4_blks_last_ctb_x = block_wd >> 2;
2550 
2551                             if(row_block_no == num_row_blocks - 1)
2552                             {
2553                                 if(i4_layer_ht % block_ht)
2554                                 {
2555                                     num_4x4_blks_ctb_y = ((i4_layer_ht % block_ht) + 3) >> 2;
2556                                 }
2557                             }
2558 
2559                             if(i4_layer_wd % block_wd)
2560                             {
2561                                 num_4x4_blks_last_ctb_x = ((i4_layer_wd % block_wd) + 3) >> 2;
2562                             }
2563                         }
2564 
2565                         /* call the row level processing function */
2566                         ihevce_decomp_pre_intra_process_row(
2567                             pu1_src,
2568                             src_stride,
2569                             pu1_dst,
2570                             dst_stride,
2571                             i4_layer_wd,
2572                             i4_layer_ht,
2573                             pu1_wkg_mem,
2574                             ht_offset,
2575                             block_ht,
2576                             block_wd,
2577                             i4_cu_aligned_pic_wd,
2578                             i4_cu_aligned_pic_ht,
2579                             num_col_blks,
2580                             i4_layer_no,
2581                             ps_ed_ctxt,
2582                             ps_ed,
2583                             ps_ed_ctb_l1,
2584                             ps_ctxt->ps_layer0_cur_satd,
2585                             ps_ctxt->ps_layer0_cur_mean,
2586                             num_4x4_blks_ctb_y,
2587                             num_4x4_blks_last_ctb_x,
2588                             skip_decomp,
2589                             skip_pre_intra,
2590                             row_block_no,
2591                             0,
2592                             NULL,
2593                             &ps_ctxt->s_ipe_optimised_function_list,
2594                             &ps_ctxt->s_cmn_opt_func);
2595                     }
2596                 }
2597                 if(1 == i4_layer_no)
2598                 {
2599                     ps_multi_thrd->aai4_l1_pre_intra_done[i4_ping_pong][row_block_no] = 1;
2600                 }
2601             }
2602             for(idx = 0; idx < MAX_NUM_CTB_ROWS_FRM; idx++)
2603             {
2604                 ps_ctxt->as_layers[i4_layer_no].ai4_curr_row_no[idx] = -1;
2605             }
2606             ps_ctxt->as_layers[i4_layer_no].i4_num_rows_processed = 0;
2607         }
2608 
2609 #if DISABLE_L2_IPE_IN_PB_L1_IN_B
2610         if((IHEVCE_QUALITY_P6 == ps_ctxt->i4_quality_preset) &&
2611            (((i4_layer_no == 2) && (ps_lap_out_prms->i4_pic_type == ISLICE)) ||
2612             ((i4_layer_no == 1) && (ps_lap_out_prms->i4_temporal_lyr_id > TEMPORAL_LAYER_DISABLE))))
2613         {
2614             WORD32 i4_num_rows = ps_ctxt->as_layers[i4_layer_no].i4_num_rows_processed;
2615             if(1 == i4_layer_no)
2616             {
2617                 for(idx = 0; idx < i4_num_rows; idx++)
2618                 {
2619                     row_block_no = ps_ctxt->as_layers[i4_layer_no].ai4_curr_row_no[idx];
2620 
2621                     {
2622                         ps_multi_thrd->aai4_l1_pre_intra_done[i4_ping_pong][row_block_no] = 1;
2623                     }
2624                 }
2625             }
2626             for(idx = 0; idx < MAX_NUM_CTB_ROWS_FRM; idx++)
2627             {
2628                 ps_ctxt->as_layers[i4_layer_no].ai4_curr_row_no[idx] = -1;
2629             }
2630             ps_ctxt->as_layers[i4_layer_no].i4_num_rows_processed = 0;
2631         }
2632 #else
2633         if((i4_layer_no != 0) && ((IHEVCE_QUALITY_P6 == ps_ctxt->i4_quality_preset) &&
2634                                   (ps_lap_out_prms->i4_temporal_lyr_id > TEMPORAL_LAYER_DISABLE)))
2635         {
2636             WORD32 i4_num_rows = ps_ctxt->as_layers[i4_layer_no].i4_num_rows_processed;
2637             for(idx = 0; idx < i4_num_rows; idx++)
2638             {
2639                 row_block_no = ps_ctxt->as_layers[i4_layer_no].ai4_curr_row_no[idx];
2640                 if(1 == i4_layer_no)
2641                 {
2642                     ps_multi_thrd->aai4_l1_pre_intra_done[i4_ping_pong][row_block_no] = 1;
2643                 }
2644             }
2645             for(idx = 0; idx < MAX_NUM_CTB_ROWS_FRM; idx++)
2646             {
2647                 ps_ctxt->as_layers[i4_layer_no].ai4_curr_row_no[idx] = -1;
2648             }
2649             ps_ctxt->as_layers[i4_layer_no].i4_num_rows_processed = 0;
2650         }
2651 #endif
2652     }
2653 }
2654 
2655 /*!
2656 ************************************************************************
2657 * \brief
2658 *    return number of records used by decomp pre intra
2659 *
2660 ************************************************************************
2661 */
ihevce_decomp_pre_intra_get_num_mem_recs(void)2662 WORD32 ihevce_decomp_pre_intra_get_num_mem_recs(void)
2663 {
2664     return (NUM_DECOMP_PRE_INTRA_MEM_RECS);
2665 }
2666 
2667 /*!
2668 ************************************************************************
2669 * @brief
2670 *    return each record attributes of  decomp pre intra
2671 ************************************************************************
2672 */
ihevce_decomp_pre_intra_get_mem_recs(iv_mem_rec_t * ps_mem_tab,WORD32 i4_num_proc_thrds,WORD32 i4_mem_space)2673 WORD32 ihevce_decomp_pre_intra_get_mem_recs(
2674     iv_mem_rec_t *ps_mem_tab, WORD32 i4_num_proc_thrds, WORD32 i4_mem_space)
2675 {
2676     /* memories should be requested assuming worst case requirememnts */
2677 
2678     /* Module context structure */
2679     ps_mem_tab[DECOMP_PRE_INTRA_CTXT].i4_mem_size = sizeof(ihevce_decomp_pre_intra_master_ctxt_t);
2680     ps_mem_tab[DECOMP_PRE_INTRA_CTXT].e_mem_type = (IV_MEM_TYPE_T)i4_mem_space;
2681     ps_mem_tab[DECOMP_PRE_INTRA_CTXT].i4_mem_alignment = 8;
2682 
2683     /* Thread context structure */
2684     ps_mem_tab[DECOMP_PRE_INTRA_THRDS_CTXT].i4_mem_size =
2685         i4_num_proc_thrds * sizeof(ihevce_decomp_pre_intra_ctxt_t);
2686     ps_mem_tab[DECOMP_PRE_INTRA_THRDS_CTXT].e_mem_type = (IV_MEM_TYPE_T)i4_mem_space;
2687     ps_mem_tab[DECOMP_PRE_INTRA_THRDS_CTXT].i4_mem_alignment = 8;
2688 
2689     /* early decision context structure */
2690     ps_mem_tab[DECOMP_PRE_INTRA_ED_CTXT].i4_mem_size = i4_num_proc_thrds * sizeof(ihevce_ed_ctxt_t);
2691     ps_mem_tab[DECOMP_PRE_INTRA_ED_CTXT].e_mem_type = (IV_MEM_TYPE_T)i4_mem_space;
2692     ps_mem_tab[DECOMP_PRE_INTRA_ED_CTXT].i4_mem_alignment = 8;
2693 
2694     return (NUM_DECOMP_PRE_INTRA_MEM_RECS);
2695 }
2696 
2697 /*!
2698 ************************************************************************
2699 * @brief
2700 *    Init decomp pre intra context
2701 ************************************************************************
2702 */
ihevce_decomp_pre_intra_init(iv_mem_rec_t * ps_mem_tab,ihevce_static_cfg_params_t * ps_init_prms,WORD32 i4_num_proc_thrds,func_selector_t * ps_func_selector,WORD32 i4_resolution_id,UWORD8 u1_is_popcnt_available)2703 void *ihevce_decomp_pre_intra_init(
2704     iv_mem_rec_t *ps_mem_tab,
2705     ihevce_static_cfg_params_t *ps_init_prms,
2706     WORD32 i4_num_proc_thrds,
2707     func_selector_t *ps_func_selector,
2708     WORD32 i4_resolution_id,
2709     UWORD8 u1_is_popcnt_available)
2710 {
2711     ihevce_decomp_pre_intra_master_ctxt_t *ps_master_ctxt;
2712     ihevce_decomp_pre_intra_ctxt_t *ps_ctxt;
2713     WORD32 thread_no;
2714     WORD32 n_tot_layers;
2715     WORD32 count;
2716     WORD32 a_wd[MAX_NUM_HME_LAYERS], a_ht[MAX_NUM_HME_LAYERS], layer_no;
2717     WORD32 a_disp_wd[MAX_NUM_LAYERS], a_disp_ht[MAX_NUM_LAYERS];
2718     ihevce_ed_ctxt_t *ps_ed_ctxt;
2719     WORD32 min_cu_size;
2720 
2721     /* get the min cu size from config params */
2722     min_cu_size = ps_init_prms->s_config_prms.i4_min_log2_cu_size;
2723 
2724     min_cu_size = 1 << min_cu_size;
2725 
2726     /* Get the height and width of each layer */
2727     *a_wd = ps_init_prms->s_tgt_lyr_prms.as_tgt_params[i4_resolution_id].i4_width +
2728             SET_CTB_ALIGN(
2729                 ps_init_prms->s_tgt_lyr_prms.as_tgt_params[i4_resolution_id].i4_width, min_cu_size);
2730     *a_ht =
2731         ps_init_prms->s_tgt_lyr_prms.as_tgt_params[i4_resolution_id].i4_height +
2732         SET_CTB_ALIGN(
2733             ps_init_prms->s_tgt_lyr_prms.as_tgt_params[i4_resolution_id].i4_height, min_cu_size);
2734 
2735     n_tot_layers = hme_derive_num_layers(1, a_wd, a_ht, a_disp_wd, a_disp_ht);
2736 
2737     /* Decomp state structure */
2738     ps_master_ctxt =
2739         (ihevce_decomp_pre_intra_master_ctxt_t *)ps_mem_tab[DECOMP_PRE_INTRA_CTXT].pv_base;
2740     ps_master_ctxt->i4_num_proc_thrds = i4_num_proc_thrds;
2741 
2742     ps_ctxt = (ihevce_decomp_pre_intra_ctxt_t *)ps_mem_tab[DECOMP_PRE_INTRA_THRDS_CTXT].pv_base;
2743     ps_ed_ctxt = (ihevce_ed_ctxt_t *)ps_mem_tab[DECOMP_PRE_INTRA_ED_CTXT].pv_base;
2744 
2745     for(thread_no = 0; thread_no < ps_master_ctxt->i4_num_proc_thrds; thread_no++)
2746     {
2747         ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[thread_no] = ps_ctxt;
2748 
2749         ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[thread_no]->i4_num_layers = n_tot_layers;
2750 
2751         ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[thread_no]->pu1_wkg_mem =
2752             &ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[thread_no]->au1_wkg_mem[0];
2753 
2754         ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[thread_no]->ps_ed_ctxt = ps_ed_ctxt;
2755 
2756         for(layer_no = 0; layer_no < n_tot_layers; layer_no++)
2757         {
2758             WORD32 max_ctb_size;
2759             WORD32 decomp_blk_ht, decomp_blk_wd;
2760 
2761             ps_ctxt->as_layers[layer_no].i4_actual_wd = a_wd[layer_no];
2762             ps_ctxt->as_layers[layer_no].i4_actual_ht = a_ht[layer_no];
2763             ps_ctxt->as_layers[layer_no].i4_inp_stride = 0;
2764             ps_ctxt->as_layers[layer_no].pu1_inp = NULL;
2765             ps_ctxt->as_layers[layer_no].i4_num_rows_processed = 0;
2766 
2767             for(count = 0; count < MAX_NUM_CTB_ROWS_FRM; count++)
2768             {
2769                 ps_ctxt->as_layers[layer_no].ai4_curr_row_no[count] = -1;
2770             }
2771             if(0 == layer_no)
2772             {
2773                 ps_ctxt->as_layers[layer_no].i4_padded_ht = a_ht[layer_no];
2774                 ps_ctxt->as_layers[layer_no].i4_padded_wd = a_wd[layer_no];
2775             }
2776             else
2777             {
2778                 ps_ctxt->as_layers[layer_no].i4_padded_ht = a_ht[layer_no] + 32 + 4;
2779                 ps_ctxt->as_layers[layer_no].i4_padded_wd = a_wd[layer_no] + 32 + 4;
2780             }
2781 
2782             /** If CTB size= 64.decomp_blk_wd = 64 for L0, 32 for L1 , 16 for L2, 8 for L3 */
2783             max_ctb_size = 1 << ps_init_prms->s_config_prms.i4_max_log2_cu_size;
2784 
2785             ps_ctxt->as_layers[layer_no].i4_decomp_blk_ht = max_ctb_size >> layer_no;
2786             ps_ctxt->as_layers[layer_no].i4_decomp_blk_wd = max_ctb_size >> layer_no;
2787 
2788             decomp_blk_ht = ps_ctxt->as_layers[layer_no].i4_decomp_blk_ht;
2789             decomp_blk_wd = ps_ctxt->as_layers[layer_no].i4_decomp_blk_wd;
2790 
2791             ps_ctxt->as_layers[layer_no].i4_num_row_blks =
2792                 ((a_ht[layer_no] + (decomp_blk_ht - 1)) / decomp_blk_ht);
2793 
2794             ps_ctxt->as_layers[layer_no].i4_num_col_blks =
2795                 ((a_wd[layer_no] + (decomp_blk_wd - 1)) / decomp_blk_wd);
2796         }
2797         ps_ed_ctxt->ps_func_selector = ps_func_selector;
2798 
2799         ps_ctxt->i4_quality_preset =
2800             ps_init_prms->s_tgt_lyr_prms.as_tgt_params[i4_resolution_id].i4_quality_preset;
2801 
2802         if(ps_ctxt->i4_quality_preset == IHEVCE_QUALITY_P7)
2803         {
2804             ps_ctxt->i4_quality_preset = IHEVCE_QUALITY_P6;
2805         }
2806 
2807         if(ps_init_prms->s_coding_tools_prms.i4_vqet &
2808            (1 << BITPOS_IN_VQ_TOGGLE_FOR_CONTROL_TOGGLER))
2809         {
2810             if(ps_init_prms->s_coding_tools_prms.i4_vqet &
2811                (1 << BITPOS_IN_VQ_TOGGLE_FOR_ENABLING_NOISE_PRESERVATION))
2812             {
2813                 ps_ctxt->i4_enable_noise_detection = 1;
2814             }
2815             else
2816             {
2817                 ps_ctxt->i4_enable_noise_detection = 0;
2818             }
2819         }
2820         else
2821         {
2822             ps_ctxt->i4_enable_noise_detection = 0;
2823         }
2824 
2825         ihevce_cmn_utils_instr_set_router(
2826             &ps_ctxt->s_cmn_opt_func, u1_is_popcnt_available, ps_init_prms->e_arch_type);
2827 
2828         ihevce_ipe_instr_set_router(
2829             &ps_ctxt->s_ipe_optimised_function_list, ps_init_prms->e_arch_type);
2830 
2831         ps_ctxt++;
2832         ps_ed_ctxt++;
2833     }
2834     /* return the handle to caller */
2835     return ((void *)ps_master_ctxt);
2836 }
2837 
2838 /*!
2839 ******************************************************************************
2840 * \if Function name : ihevce_decomp_pre_intra_frame_init \endif
2841 *
2842 * \brief
2843 *    Frame Intialization for Decomp intra pre analysis.
2844 *
2845 * \param[in] pv_ctxt : pointer to module ctxt
2846 * \param[in] ppu1_decomp_lyr_bufs : pointer to array of layer buffer pointers
2847 * \param[in] pi4_lyr_buf_stride : pointer to array of layer buffer strides
2848 *
2849 * \return
2850 *    None
2851 *
2852 * \author
2853 *  Ittiam
2854 *
2855 *****************************************************************************
2856 */
ihevce_decomp_pre_intra_frame_init(void * pv_ctxt,UWORD8 ** ppu1_decomp_lyr_bufs,WORD32 * pi4_lyr_buf_stride,ihevce_ed_blk_t * ps_layer1_buf,ihevce_ed_blk_t * ps_layer2_buf,ihevce_ed_ctb_l1_t * ps_ed_ctb_l1,WORD32 i4_ol_sad_lambda_qf,WORD32 i4_slice_type,ctb_analyse_t * ps_ctb_analyse)2857 void ihevce_decomp_pre_intra_frame_init(
2858     void *pv_ctxt,
2859     UWORD8 **ppu1_decomp_lyr_bufs,
2860     WORD32 *pi4_lyr_buf_stride,
2861     ihevce_ed_blk_t *ps_layer1_buf,
2862     ihevce_ed_blk_t *ps_layer2_buf,
2863     ihevce_ed_ctb_l1_t *ps_ed_ctb_l1,
2864     WORD32 i4_ol_sad_lambda_qf,
2865     WORD32 i4_slice_type,
2866     ctb_analyse_t *ps_ctb_analyse)
2867 {
2868     ihevce_decomp_pre_intra_master_ctxt_t *ps_master_ctxt;
2869     ihevce_decomp_pre_intra_ctxt_t *ps_ctxt;
2870     WORD32 thread_no;
2871 
2872     /* Decomp state structure */
2873     ps_master_ctxt = (ihevce_decomp_pre_intra_master_ctxt_t *)pv_ctxt;
2874 
2875     for(thread_no = 0; thread_no < ps_master_ctxt->i4_num_proc_thrds; thread_no++)
2876     {
2877         WORD32 layer_no;
2878 
2879         ps_ctxt = ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[thread_no];
2880 
2881         /* L0 layer (actual input) is registered in process call */
2882         for(layer_no = 1; layer_no < ps_ctxt->i4_num_layers; layer_no++)
2883         {
2884             ps_ctxt->as_layers[layer_no].i4_inp_stride = pi4_lyr_buf_stride[layer_no - 1];
2885             ps_ctxt->as_layers[layer_no].pu1_inp = ppu1_decomp_lyr_bufs[layer_no - 1];
2886 
2887             /*Populating the buffer pointers for layer1 and layer2 buffers to store the
2888             structure for each 4x4 block after pre intra analysis on their respective laeyrs*/
2889 
2890             if(layer_no == 1)
2891             {
2892                 WORD32 sad_lambda_l1 = (3 * i4_ol_sad_lambda_qf >> 2);
2893                 WORD32 temp = 1 << LAMBDA_Q_SHIFT;
2894                 WORD32 lambda = ((temp) > sad_lambda_l1) ? temp : sad_lambda_l1;
2895                 //ps_ctxt->as_layers[1].s_early_decision.ps_ed_pic = ps_layer1_buf;
2896                 //ps_ctxt->as_layers[1].s_early_decision.ps_ed = ps_layer1_buf;
2897                 ps_ctxt->ps_layer1_buf = ps_layer1_buf;
2898                 ps_ctxt->ps_ed_ctb_l1 = ps_ed_ctb_l1;
2899                 ps_ctxt->ai4_lambda[layer_no] = lambda;
2900                 ps_ctxt->i4_codec_level = 0;
2901                 ps_ctxt->i4_slice_type = i4_slice_type;
2902             }
2903             else if(layer_no == 2)
2904             {
2905                 WORD32 sad_lambda_l2 = i4_ol_sad_lambda_qf >> 1;
2906                 WORD32 temp = 1 << LAMBDA_Q_SHIFT;
2907                 WORD32 lambda = ((temp) > sad_lambda_l2) ? temp : sad_lambda_l2;
2908 
2909                 //ps_ctxt->as_layers[2].s_early_decision.ps_ed_pic = ps_layer2_buf;
2910                 //ps_ctxt->as_layers[2].s_early_decision.ps_ed = ps_layer2_buf;
2911                 ps_ctxt->ps_layer2_buf = ps_layer2_buf;
2912                 //ihevce_ed_frame_init(ps_ctxt->ps_ed_ctxt);
2913                 ps_ctxt->ai4_lambda[layer_no] = lambda;
2914                 ps_ctxt->i4_codec_level = 0;
2915                 ps_ctxt->i4_slice_type = i4_slice_type;
2916             }
2917             else
2918             {
2919                 //ps_ctxt->as_layers[0].s_early_decision.ps_ed_pic = NULL;
2920                 //ps_ctxt->as_layers[0].s_early_decision.ps_ed = NULL;
2921                 //ps_ctxt->ps_layer1_buf = NULL;
2922                 ps_ctxt->ai4_lambda[layer_no] = -1;
2923                 ps_ctxt->i4_codec_level = 0;
2924                 ps_ctxt->i4_slice_type = i4_slice_type;
2925             }
2926         }
2927 
2928         /* make the ps_ctb_analyse refernce as a part of the private context */
2929         ps_ctxt->ps_ctb_analyse = ps_ctb_analyse;
2930     }
2931 }
2932 
2933 /**
2934 *******************************************************************************
2935 *
2936 * @brief
2937 *     Merge Sort function.
2938 *
2939 * @par Description:
2940 *     This function sorts the data in the input array in ascending
2941 *     order using merge sort algorithm. Intermediate data obtained in
2942 *     merge sort are stored in output 2-D array.
2943 *
2944 * @param[in]
2945 *   pi4_input_val  :   Input 1-D array
2946 *   aai4_output_val:   Output 2-D array containing elements sorted in sets of
2947 *                      4,16,64 etc.
2948 *   i4_length      : length of the array
*   i4_ip_sort_level: Input sort level. Specifies the level up to which the array is
*                     already sorted. It should be 1 if the array is unsorted and 4 if
*                     the array is sorted in sets of 4.
*   i4_op_sort_level: Output sort level. Specifies the level up to which sorting is
*                     required. If it equals the length of the array, the whole array is sorted.
2954 *
2955 * @returns
2956 *
2957 * @remarks
2958 *  None
2959 *
2960 *******************************************************************************
2961 */
ihevce_merge_sort(WORD32 * pi4_input_val,WORD32 aai4_output_val[][64],WORD32 i4_length,WORD32 i4_ip_sort_level,WORD32 i4_op_sort_level)2962 void ihevce_merge_sort(
2963     WORD32 *pi4_input_val,
2964     WORD32 aai4_output_val[][64],
2965     WORD32 i4_length,
2966     WORD32 i4_ip_sort_level,
2967     WORD32 i4_op_sort_level)
2968 {
2969     WORD32 i, j, k;
2970     WORD32 count, level;
2971     WORD32 temp[64];
2972     WORD32 *pi4_temp_buf_cpy;
2973     WORD32 *pi4_temp = &temp[0];
2974     WORD32 calc_level;
2975 
2976     pi4_temp_buf_cpy = pi4_temp;
2977 
2978     GETRANGE(calc_level, i4_op_sort_level / i4_ip_sort_level);
2979 
2980     calc_level = calc_level - 1;
2981 
2982     /*** This function is written under the assumption that we need only intermediate values of
2983     sort in the range of 4,16,64 etc. ***/
2984     ASSERT((calc_level % 2) == 0);
2985 
2986     /** One iteration of this for loop does 1 sets of sort and produces one intermediate value in 2 iterations **/
2987     for(level = 0; level < calc_level; level++)
2988     {
2989         /** Merges adjacent sets of elements based on current sort level **/
2990         for(count = 0; count < i4_length; (count = count + (i4_ip_sort_level * 2)))
2991         {
2992             i = 0;
2993             j = 0;
2994             if(pi4_input_val[i4_ip_sort_level - 1] < pi4_input_val[i4_ip_sort_level])
2995             {
2996                 /*** Condition for early exit ***/
2997                 memcpy(&pi4_temp[0], pi4_input_val, sizeof(WORD32) * i4_ip_sort_level * 2);
2998             }
2999             else
3000             {
3001                 for(k = 0; k < (i4_ip_sort_level * 2); k++)
3002                 {
3003                     if((i < i4_ip_sort_level) && (j < i4_ip_sort_level))
3004                     {
3005                         if(pi4_input_val[i] > pi4_input_val[j + i4_ip_sort_level])
3006                         {
3007                             /** copy to output array **/
3008                             pi4_temp[k] = pi4_input_val[j + i4_ip_sort_level];
3009                             j++;
3010                         }
3011                         else
3012                         {
3013                             /** copy to output array **/
3014                             pi4_temp[k] = pi4_input_val[i];
3015                             i++;
3016                         }
3017                     }
3018                     else if(i == i4_ip_sort_level)
3019                     {
3020                         /** copy the remaining data to output array **/
3021                         pi4_temp[k] = pi4_input_val[j + i4_ip_sort_level];
3022                         j++;
3023                     }
3024                     else
3025                     {
3026                         /** copy the remaining data to output array **/
3027                         pi4_temp[k] = pi4_input_val[i];
3028                         i++;
3029                     }
3030                 }
3031             }
3032             pi4_input_val += (i4_ip_sort_level * 2);
3033             pi4_temp += (i4_ip_sort_level * 2);
3034         }
3035         pi4_input_val = pi4_temp - i4_length;
3036 
3037         if(level % 2)
3038         {
3039             /** Assign a temp address for storing next sort level output as we will not need this data as output **/
3040             pi4_temp = pi4_temp_buf_cpy;
3041         }
3042         else
3043         {
3044             /** Assign address for storing the intermediate data into output 2-D array **/
3045             pi4_temp = aai4_output_val[level / 2];
3046         }
3047         i4_ip_sort_level *= 2;
3048     }
3049 }
3050 
/*!
******************************************************************************
* \if Function name : ihevce_decomp_pre_intra_curr_frame_pre_intra_deinit \endif
*
* \brief
*    Frame level post processing of the layer 1 4x4 SATD data.
*
* \par Description:
*    All the work is done only when i4_is_last_thread is 1; otherwise the
*    function returns without touching anything.
*    Step 1: a histogram of the valid (not -1) 4x4 SATDs below
*            MAX_SATD_THRSHLD is built over ps_curr_out->ps_layer1_buf and an
*            average noise SATD is derived from its lowest bins. The result is
*            stored in ps_curr_out->i4_avg_noise_thrshld_4x4.
*    Step 2: for every CTB the 4x4 SATDs are adjusted by the noise value
*            (subtracted or used as a floor, depending on SUB_NOISE_THRSHLD)
*            and written back, the 8x8 / 16x16 / 32x32 activity entries of
*            ps_curr_out->ps_ed_ctb_l1 are populated and frame level sums,
*            sums of squares and block counts are accumulated.
*    Step 3: the frame level accumulators are copied into ps_curr_out.
*
* \param[in] pv_pre_intra_ctxt : pointer to the decomp pre intra master context.
*                                Layer 1 geometry is taken from thread context 0
* \param[in,out] ps_curr_out   : current frame pre-enc output; layer 1 buffers
*                                are read/updated and frame statistics written
* \param[in] i4_is_last_thread : processing is done only if this is 1
* \param[in] ps_frm_ctb_prms   : i4_num_ctbs_horz is used as the row stride of
*                                ps_curr_out->ps_ed_ctb_l1
* \param[in] i4_temporal_lyr_id        : unused
* \param[in] i4_enable_noise_detection : unused
*
* \return
*    None
*
* \author
*  Ittiam
*
*****************************************************************************
*/
void ihevce_decomp_pre_intra_curr_frame_pre_intra_deinit(
    void *pv_pre_intra_ctxt,
    pre_enc_me_ctxt_t *ps_curr_out,
    WORD32 i4_is_last_thread,
    frm_ctb_ctxt_t *ps_frm_ctb_prms,
    WORD32 i4_temporal_lyr_id,
    WORD32 i4_enable_noise_detection)
{
    ihevce_decomp_pre_intra_master_ctxt_t *ps_pre_intra_master_ctxt =
        (ihevce_decomp_pre_intra_master_ctxt_t *)pv_pre_intra_ctxt;
    /* Layer dimensions are read from the context of thread 0 */
    ihevce_decomp_pre_intra_ctxt_t *ps_pre_intra_ctxt =
        ps_pre_intra_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[0];

    WORD32 i4_k;
    WORD32 ctb_ctr, vert_ctr;

    /* Frame level accumulators. Index 0 : based on sum of SATDs,              */
    /* index 1 (and 2 for 16x16/32x32) : based on the sorted-array selections  */
    WORD32 ai4_curr_frame_8x8_sum_act[2] = { 0, 0 };
    LWORD64 ai8_curr_frame_8x8_sum_act_sqr[2] = { 0, 0 };
    WORD32 ai4_curr_frame_8x8_sum_blks[2] = { 0, 0 };
    ULWORD64 u8_curr_frame_8x8_sum_act_sqr = 0;

    LWORD64 ai8_curr_frame_16x16_sum_act_sqr[3] = { 0, 0, 0 };
    WORD32 ai4_curr_frame_16x16_sum_act[3] = { 0, 0, 0 };
    WORD32 ai4_curr_frame_16x16_sum_blks[3] = { 0, 0, 0 };

    LWORD64 ai8_curr_frame_32x32_sum_act_sqr[3] = { 0, 0, 0 };
    WORD32 ai4_curr_frame_32x32_sum_act[3] = { 0, 0, 0 };
    WORD32 ai4_curr_frame_32x32_sum_blks[3] = { 0, 0, 0 };

    (void)i4_temporal_lyr_id;
    (void)i4_enable_noise_detection;

    if(i4_is_last_thread == 1)
    {
        WORD32 i4_slice_type = ps_curr_out->s_slice_hdr.i1_slice_type;
        WORD32 ctb_ctr_blks = ps_pre_intra_ctxt->as_layers[1].i4_num_col_blks;
        WORD32 vert_ctr_blks = ps_pre_intra_ctxt->as_layers[1].i4_num_row_blks;
        ihevce_ed_ctb_l1_t *ps_ed_ctb_pic_l1 = ps_curr_out->ps_ed_ctb_l1;
        WORD32 block_wd = ps_pre_intra_ctxt->as_layers[1].i4_decomp_blk_wd;
        /* Number of 4x4 blocks in one decomp block (CTB) of layer 1 */
        WORD32 inc_ctb = ((block_wd >> 2) * (block_wd >> 2));
        ihevce_ed_blk_t *ps_ed_blk_l1 = ps_curr_out->ps_layer1_buf;
        ihevce_ed_blk_t *ps_ed;
        WORD32 i, j;
        WORD32 i4_avg_noise_satd;
        WORD32 k;
        WORD32 i4_layer_wd = ps_pre_intra_ctxt->as_layers[1].i4_actual_wd;
        WORD32 i4_layer_ht = ps_pre_intra_ctxt->as_layers[1].i4_actual_ht;

        /* Calculate min noise threshold */
        /* Min noise threshold is the average of the lowest SATD values of the frame; */
        /* the number of blocks averaged is MIN_BLKS percent of (wd/2 * ht/2)         */
        WORD32 i4_min_blk = ((MIN_BLKS * (i4_layer_wd >> 1) * (i4_layer_ht >> 1)) / 100);
        WORD32 ai4_noise_thr_hstrgm[MAX_SATD_THRSHLD];
        memset(&ai4_noise_thr_hstrgm[0], 0, (sizeof(WORD32) * MAX_SATD_THRSHLD));
        ASSERT(!(USE_CUR_L0_SATD && USE_CUR_SATD));

        /* Pass 1 : histogram of all valid 4x4 SATDs that are below MAX_SATD_THRSHLD */
        for(vert_ctr = 0; vert_ctr < vert_ctr_blks; vert_ctr++)
        {
            ps_ed = ps_ed_blk_l1 + (vert_ctr * inc_ctb * (ctb_ctr_blks));
            for(ctb_ctr = 0; ctb_ctr < ctb_ctr_blks; ctb_ctr++)
            {
                /* i : 16x16 index, j : 8x8 index, k : 4x4 index inside the CTB */
                for(i = 0; i < 4; i++)
                {
                    for(j = 0; j < 4; j++)
                    {
                        for(k = 0; k < 4; k++)
                        {
                            /* -1 marks a 4x4 block without SATD (incomplete ctb) */
                            if(-1 != (ps_ed + j * 4 + i * 16 + k)->i4_4x4_satd)
                            {
                                WORD32 i4_satd_lim;
                                i4_satd_lim = (ps_ed + j * 4 + i * 16 + k)->i4_4x4_satd;
                                /* Histogram creation for Noise threshold */
                                if(i4_satd_lim < MAX_SATD_THRSHLD)
                                {
                                    ai4_noise_thr_hstrgm[i4_satd_lim]++;
                                }
                            }
                        }
                    }
                }
                ps_ed += inc_ctb;
            }
        }

        /* Average the lowest bins until more than i4_min_blk blocks are covered */
        {
            WORD32 i4_total_blks = 0;
            LWORD64 i8_acc_satd = 0;
            for(i = MIN_SATD_THRSHLD; i < MAX_SATD_THRSHLD; i++)
            {
                i4_total_blks += ai4_noise_thr_hstrgm[i];
                /* multiply in 64 bit so that the product cannot wrap before accumulation */
                i8_acc_satd += ((LWORD64)i * ai4_noise_thr_hstrgm[i]);

                if(i4_total_blks > i4_min_blk)
                    break;
            }
            /* Not enough low SATD blocks (or none at all, which would otherwise */
            /* lead to a division by zero): fall back to the fixed noise floor   */
            if((i4_total_blks < i4_min_blk) || (0 == i4_total_blks))
            {
                i4_avg_noise_satd = SATD_NOISE_FLOOR_THRESHOLD;
            }
            else
            {
                /* rounded average; divide in 64 bit and narrow only the quotient */
                i4_avg_noise_satd =
                    (WORD32)((i8_acc_satd + (i4_total_blks >> 1)) / i4_total_blks);
            }
        }

        ps_curr_out->i4_avg_noise_thrshld_4x4 = i4_avg_noise_satd;

        /* Pass 2 : noise compensation and activity population for every CTB */
        for(vert_ctr = 0; vert_ctr < vert_ctr_blks; vert_ctr++)
        {
            ihevce_ed_ctb_l1_t *ps_ed_ctb_row_l1 =
                ps_ed_ctb_pic_l1 + vert_ctr * ps_frm_ctb_prms->i4_num_ctbs_horz;
            ps_ed = ps_ed_blk_l1 + (vert_ctr * inc_ctb * (ctb_ctr_blks));

            for(ctb_ctr = 0; ctb_ctr < ctb_ctr_blks; ctb_ctr++)
            {
                /*sum of (sum of L1_4x4 @ L1_8x8) @ L1_16x16 level */
                WORD32 ai4_sum_sum_4x4_satd_16x16[4] = { 0, 0, 0, 0 };
                /*min of (sum of L1_4x4 @ L1_8x8) @ L1_16x16 level */
                WORD32 ai4_min_sum_4x4_satd_16x16[4] = {
                    MAX_32BIT_VAL, MAX_32BIT_VAL, MAX_32BIT_VAL, MAX_32BIT_VAL
                };
                /*min of (min of L1_4x4 @ L1_8x8) @ L1_16x16 level */
                WORD32 ai4_min_min_4x4_satd_16x16[4] = {
                    MAX_32BIT_VAL, MAX_32BIT_VAL, MAX_32BIT_VAL, MAX_32BIT_VAL
                };
                WORD32 i4_sum_4x4_satd, i4_min_4x4_satd;
                ihevce_ed_ctb_l1_t *ps_ed_ctb_curr_l1 = ps_ed_ctb_row_l1 + ctb_ctr;

                WORD32 is_min_block_uncompensated_in_l32x32 = 0;

                /*min of L1_4x4 @ L1_8x8*/
                WORD32 ai4_min_satd_ctb[MAX_CTB_SIZE];
                /*** This 2-D array will contain 4x4 satds sorted in ascending order in sets of 4,16,64 ***/
                /*** For example : '5 10 2 7 6 12 3 1' array input will return '2 5 7 10 1 3 6 12' if sorted in sets of 4 ***/
                WORD32 aai4_min_4_16_64_satd[3][MAX_CTB_SIZE];

                /*sum of L1_4x4 @ L1_8x8*/
                WORD32 ai4_sum_satd_ctb[MAX_CTB_SIZE >> 2];
                /*** This 2-D array will contain 8x8 satd sums sorted in ascending order in sets of 4,16***/
                WORD32 aai4_sum_4_16_satd_ctb[2][MAX_CTB_SIZE];

                /*L1_32x32 = L0_64x64
                so in L1_32x32 there are 64 L1_4x4blocks*/
                for(i = 0; i < MAX_CTB_SIZE; i++)
                {
                    ai4_min_satd_ctb[i] = -1;
                }
                for(j = 0; j < 3; j++)
                {
                    for(i = 0; i < MAX_CTB_SIZE; i++)
                    {
                        aai4_min_4_16_64_satd[j][i] = -1;
                    }
                }
                /*L1_32x32 = L0_64x64
                so in L1_32x32 there are 16 L1_8x8blocks*/
                for(i = 0; i < (MAX_CTB_SIZE >> 2); i++)
                {
                    ai4_sum_satd_ctb[i] = -1;
                }
                for(j = 0; j < 2; j++)
                {
                    for(i = 0; i < (MAX_CTB_SIZE >> 2); i++)
                    {
                        aai4_sum_4_16_satd_ctb[j][i] = -1;
                    }
                }

                /*Populate sum min 4x4 activity */
                /*loop over the L1_16x16 blocks of the L1_32x32 block*/
                for(i = 0; i < 4; i++)
                {
                    /*loop over the L1_8x8 blocks of the L1_16x16 block*/
                    for(j = 0; j < 4; j++)
                    {
                        WORD32 i4_sum_satd_dumyy = 0;
                        WORD32 i4_num_satd_blks = 0;
                        /*loop over the L1_4x4 blocks of the L1_8x8 block*/
                        for(k = 0; k < 4; k++)
                        {
                            WORD32 i4_satd_lim;
                            i4_satd_lim = (ps_ed + j * 4 + i * 16 + k)->i4_4x4_satd;

                            /*complete ctb will not have i4_4x4_satd = -1*/
                            if(-1 != i4_satd_lim)
                            {
#if SUB_NOISE_THRSHLD
                                /* remove the noise component, clip at zero */
                                i4_satd_lim = i4_satd_lim - i4_avg_noise_satd;
                                if(i4_satd_lim < 0)
                                {
                                    i4_satd_lim = 0;
                                }
#else
                                /* use the noise value as the lower bound */
                                if(i4_satd_lim < i4_avg_noise_satd)
                                {
                                    i4_satd_lim = i4_avg_noise_satd;
                                }
#endif
                                i4_num_satd_blks++;
                                /*populate 4x4 data to calculate modulation index */
                                (ps_ed + j * 4 + i * 16 + k)->i4_4x4_satd = i4_satd_lim;

                                i4_sum_satd_dumyy += i4_satd_lim;
                                ai4_min_satd_ctb[j * 4 + i * 16 + k] = i4_satd_lim;
                            }
                        }
                        if(i4_num_satd_blks != 0)
                        {
                            /*make the sum of satd always for 4 blocks even it is incomplete ctb */
                            i4_sum_satd_dumyy = i4_sum_satd_dumyy * 4 / i4_num_satd_blks;
                        }
                        else
                        {
                            /* no valid 4x4 block inside this 8x8 block */
                            i4_sum_satd_dumyy = -1;
                        }
                        /*sum of L1_4x4 @ L1_8x8block level*/
                        ai4_sum_satd_ctb[j + i * 4] = i4_sum_satd_dumyy;
                        /*store sum of 4x4 @ L1_8x8block level*/
                        ps_ed_ctb_curr_l1->i4_sum_4x4_satd[i * 4 + j] = i4_sum_satd_dumyy;
                    }
                }
                {
                    WORD32 i4_array_length = sizeof(ai4_min_satd_ctb) / sizeof(WORD32);

                    /*** This function will sort 64 elements in array ai4_min_satd_ctb in ascending order to ***/
                    /*** 3 arrays in sets of 4,16,64 into the 2-D array   aai4_min_4_16_64_satd              ***/
                    ihevce_merge_sort(
                        &ai4_min_satd_ctb[0], aai4_min_4_16_64_satd, i4_array_length, 1, 64);

                    i4_array_length = sizeof(ai4_sum_satd_ctb) / sizeof(WORD32);

                    /*** This function will sort 16 elements in array ai4_sum_satd_ctb in ascending order to ***/
                    /*** 2 arrays in sets of 4,16 into the 2-D array   aai4_sum_4_16_satd_ctb                ***/
                    ihevce_merge_sort(
                        &ai4_sum_satd_ctb[0], aai4_sum_4_16_satd_ctb, i4_array_length, 1, 16);
                }

                /*Populate avg satd to calculate MI and activity factors*/
                for(i = 0; i < 4; i++)
                {
                    WORD32 is_min_block_uncompensated_in_l116x16 = 0;
                    ps_ed_ctb_curr_l1->i4_16x16_satd[i][0] = -1;
                    ps_ed_ctb_curr_l1->i4_16x16_satd[i][1] = -1;
                    ps_ed_ctb_curr_l1->i4_16x16_satd[i][2] = -1;

                    for(j = 0; j < 4; j++)
                    {
                        /* entry picked from the 4 sorted 4x4 SATDs of this 8x8 block */
                        ps_ed_ctb_curr_l1->i4_min_4x4_satd[i * 4 + j] =
                            aai4_min_4_16_64_satd[0][i * 16 + j * 4 + MEDIAN_CU_TU];
                        /*Accumulate the sum of 8*8 activities in the current layer (16*16 CU in L0)*/
                        i4_sum_4x4_satd = ps_ed_ctb_curr_l1->i4_sum_4x4_satd[i * 4 + j];
                        i4_min_4x4_satd = ps_ed_ctb_curr_l1->i4_min_4x4_satd[i * 4 + j];
                        ps_ed_ctb_curr_l1->i4_8x8_satd[i * 4 + j][0] = -1;
                        ps_ed_ctb_curr_l1->i4_8x8_satd[i * 4 + j][1] = -1;
                        ASSERT(-2 != i4_sum_4x4_satd);

                        if((-1 != i4_sum_4x4_satd))
                        {
                            WORD32 not_skipped = 1;

                            /* not_skipped is fixed to 1, so this is taken for every valid block */
                            if((i4_slice_type == ISLICE) || (1 == not_skipped))
                            {
                                /* squares are evaluated in 64 bit: a 32 bit product */
                                /* can overflow for large activity values            */
                                LWORD64 i8_sum_act_sqr =
                                    (LWORD64)i4_sum_4x4_satd * i4_sum_4x4_satd;
                                LWORD64 i8_min_act_sqr =
                                    (LWORD64)i4_min_4x4_satd * i4_min_4x4_satd;

                                is_min_block_uncompensated_in_l116x16 = 1;
                                is_min_block_uncompensated_in_l32x32 = 1;

                                u8_curr_frame_8x8_sum_act_sqr += (ULWORD64)i8_sum_act_sqr;

                                ai4_curr_frame_8x8_sum_act[0] += i4_sum_4x4_satd;
                                ai8_curr_frame_8x8_sum_act_sqr[0] += i8_sum_act_sqr;
                                ai4_curr_frame_8x8_sum_blks[0] += 1;
                                ai4_curr_frame_8x8_sum_act[1] += i4_min_4x4_satd;
                                ai8_curr_frame_8x8_sum_act_sqr[1] += i8_min_act_sqr;
                                ai4_curr_frame_8x8_sum_blks[1] += 1;
                            }

                            ps_ed_ctb_curr_l1->i4_8x8_satd[i * 4 + j][0] = i4_sum_4x4_satd;
                            ps_ed_ctb_curr_l1->i4_8x8_satd[i * 4 + j][1] = i4_min_4x4_satd;
                        }
                        else
                        {
                            /* an invalid 8x8 block invalidates the enclosing 16x16 block */
                            ai4_sum_sum_4x4_satd_16x16[i] = MAX_32BIT_VAL;
                            ai4_min_sum_4x4_satd_16x16[i] = MAX_32BIT_VAL;
                            ai4_min_min_4x4_satd_16x16[i] = MAX_32BIT_VAL;
                        }
                    }

                    {
                        /* entries picked from the sorted 8x8 sums / sorted 4x4 SATDs */
                        /* belonging to this 16x16 block                              */
                        ai4_min_sum_4x4_satd_16x16[i] =
                            aai4_sum_4_16_satd_ctb[0][i * 4 + MEDIAN_CU_TU];
                        ai4_min_min_4x4_satd_16x16[i] =
                            aai4_min_4_16_64_satd[1][i * 16 + MEDIAN_CU_TU_BY_2];

                        if(ai4_sum_sum_4x4_satd_16x16[i] != MAX_32BIT_VAL)
                        {
                            ai4_sum_sum_4x4_satd_16x16[i] = 0;
                            for(j = 0; j < 4; j++)
                            {
                                ai4_sum_sum_4x4_satd_16x16[i] +=
                                    ps_ed_ctb_curr_l1->i4_sum_4x4_satd[i * 4 + j];
                            }
                            ps_ed_ctb_curr_l1->i4_16x16_satd[i][0] = ai4_sum_sum_4x4_satd_16x16[i];
                            ps_ed_ctb_curr_l1->i4_16x16_satd[i][1] = ai4_min_sum_4x4_satd_16x16[i];
                            ps_ed_ctb_curr_l1->i4_16x16_satd[i][2] = ai4_min_min_4x4_satd_16x16[i];
                        }
                    }
                    if(1 == is_min_block_uncompensated_in_l116x16)
                    {
                        if(MAX_32BIT_VAL != ai4_sum_sum_4x4_satd_16x16[i])
                        {
                            ai4_curr_frame_16x16_sum_act[0] += ai4_sum_sum_4x4_satd_16x16[i];
                            ai8_curr_frame_16x16_sum_act_sqr[0] +=
                                ((LWORD64)ai4_sum_sum_4x4_satd_16x16[i] *
                                 ai4_sum_sum_4x4_satd_16x16[i]);
                            ai4_curr_frame_16x16_sum_blks[0] += 1;
                        }
                        if(MAX_32BIT_VAL != ai4_min_sum_4x4_satd_16x16[i])
                        {
                            ai4_curr_frame_16x16_sum_act[1] += ai4_min_sum_4x4_satd_16x16[i];
                            ai8_curr_frame_16x16_sum_act_sqr[1] +=
                                ((LWORD64)ai4_min_sum_4x4_satd_16x16[i] *
                                 ai4_min_sum_4x4_satd_16x16[i]);
                            ai4_curr_frame_16x16_sum_blks[1] += 1;
                            ai4_curr_frame_16x16_sum_act[2] += ai4_min_min_4x4_satd_16x16[i];
                            ai8_curr_frame_16x16_sum_act_sqr[2] +=
                                ((LWORD64)ai4_min_min_4x4_satd_16x16[i] *
                                 ai4_min_min_4x4_satd_16x16[i]);
                            ai4_curr_frame_16x16_sum_blks[2] += 1;
                        }
                    }
                }
                /*32x32*/
                {
                    ps_ed_ctb_curr_l1->i4_32x32_satd[0][0] = -1;
                    ps_ed_ctb_curr_l1->i4_32x32_satd[0][1] = -1;
                    ps_ed_ctb_curr_l1->i4_32x32_satd[0][2] = -1;
                    ps_ed_ctb_curr_l1->i4_32x32_satd[0][3] = -1;

                    /* populated when at least one of the four 16x16 blocks is valid */
                    if((MAX_32BIT_VAL != ai4_sum_sum_4x4_satd_16x16[0]) ||
                       (MAX_32BIT_VAL != ai4_sum_sum_4x4_satd_16x16[2]) ||
                       (MAX_32BIT_VAL != ai4_sum_sum_4x4_satd_16x16[1]) ||
                       (MAX_32BIT_VAL != ai4_sum_sum_4x4_satd_16x16[3]))
                    {
                        {
                            WORD32 aai4_min_sum_sum_4x4_satd_16x16[1][64];
                            WORD32 i4_array_length =
                                sizeof(ai4_sum_sum_4x4_satd_16x16) / sizeof(WORD32);
                            /*** Sort 4 elements in ascending order ***/
                            ihevce_merge_sort(
                                &ai4_sum_sum_4x4_satd_16x16[0],
                                aai4_min_sum_sum_4x4_satd_16x16,
                                i4_array_length,
                                1,
                                4);

                            ps_ed_ctb_curr_l1->i4_32x32_satd[0][0] =
                                aai4_min_sum_sum_4x4_satd_16x16[0][MEDIAN_CU_TU];
                        }

                        ps_ed_ctb_curr_l1->i4_32x32_satd[0][1] =
                            aai4_sum_4_16_satd_ctb[1][MEDIAN_CU_TU_BY_2];

                        ps_ed_ctb_curr_l1->i4_32x32_satd[0][2] =
                            aai4_min_4_16_64_satd[2][MEDIAN_CU_TU_BY_4];

                        /*Sum of all 32x32 activity */
                        ps_ed_ctb_curr_l1->i4_32x32_satd[0][3] = 0;
                        for(j = 0; j < 4; j++)
                        {
                            if(MAX_32BIT_VAL != ai4_sum_sum_4x4_satd_16x16[j])
                                ps_ed_ctb_curr_l1->i4_32x32_satd[0][3] +=
                                    ai4_sum_sum_4x4_satd_16x16[j];
                        }

                        if(1 == is_min_block_uncompensated_in_l32x32)
                        {
                            /*Accumulate the sum of 32*32 activities in the current layer (64*64 CU in L0)*/
                            if(MAX_32BIT_VAL != ps_ed_ctb_curr_l1->i4_32x32_satd[0][0])
                            {
                                ai4_curr_frame_32x32_sum_act[0] +=
                                    ps_ed_ctb_curr_l1->i4_32x32_satd[0][0];
                                ai8_curr_frame_32x32_sum_act_sqr[0] +=
                                    ((LWORD64)ps_ed_ctb_curr_l1->i4_32x32_satd[0][0] *
                                     ps_ed_ctb_curr_l1->i4_32x32_satd[0][0]);
                                ai4_curr_frame_32x32_sum_blks[0] += 1;
                            }

                            if(MAX_32BIT_VAL != ps_ed_ctb_curr_l1->i4_32x32_satd[0][1])
                            {
                                ai4_curr_frame_32x32_sum_act[1] +=
                                    ps_ed_ctb_curr_l1->i4_32x32_satd[0][1];
                                ai8_curr_frame_32x32_sum_act_sqr[1] +=
                                    ((LWORD64)ps_ed_ctb_curr_l1->i4_32x32_satd[0][1] *
                                     ps_ed_ctb_curr_l1->i4_32x32_satd[0][1]);
                                ai4_curr_frame_32x32_sum_blks[1] += 1;
                            }

                            if(MAX_32BIT_VAL != ps_ed_ctb_curr_l1->i4_32x32_satd[0][2])
                            {
                                ai4_curr_frame_32x32_sum_act[2] +=
                                    ps_ed_ctb_curr_l1->i4_32x32_satd[0][2];
                                ai8_curr_frame_32x32_sum_act_sqr[2] +=
                                    ((LWORD64)ps_ed_ctb_curr_l1->i4_32x32_satd[0][2] *
                                     ps_ed_ctb_curr_l1->i4_32x32_satd[0][2]);
                                ai4_curr_frame_32x32_sum_blks[2] += 1;
                            }
                        }
                    }
                }
                /*Increment ctb count*/
                ps_ed += inc_ctb;
            }
        }

        /* Spatial Variation and modulation index calculated for the frame */
        {
            /* 8x8 has two statistics; 16x16 and 32x32 have a third one stored below */
            for(i4_k = 0; i4_k < 2; i4_k++)
            {
                /*8x8*/
#if USE_SQRT_AVG_OF_SATD_SQR
                ps_curr_out->i8_curr_frame_8x8_sum_act[i4_k] = ai8_curr_frame_8x8_sum_act_sqr[i4_k];
#else
                ps_curr_out->i8_curr_frame_8x8_sum_act[i4_k] = ai4_curr_frame_8x8_sum_act[i4_k];
#endif
                ps_curr_out->i4_curr_frame_8x8_sum_act_for_strength[i4_k] =
                    ai4_curr_frame_8x8_sum_act[i4_k];
                ps_curr_out->i4_curr_frame_8x8_num_blks[i4_k] = ai4_curr_frame_8x8_sum_blks[i4_k];

                /*16x16*/
#if USE_SQRT_AVG_OF_SATD_SQR
                ps_curr_out->i8_curr_frame_16x16_sum_act[i4_k] =
                    ai8_curr_frame_16x16_sum_act_sqr[i4_k];
#else
                ps_curr_out->i8_curr_frame_16x16_sum_act[i4_k] = ai4_curr_frame_16x16_sum_act[i4_k];
#endif
                ps_curr_out->i4_curr_frame_16x16_num_blks[i4_k] =
                    ai4_curr_frame_16x16_sum_blks[i4_k];

                /*32x32*/
#if USE_SQRT_AVG_OF_SATD_SQR
                ps_curr_out->i8_curr_frame_32x32_sum_act[i4_k] =
                    ai8_curr_frame_32x32_sum_act_sqr[i4_k];
#else
                ps_curr_out->i8_curr_frame_32x32_sum_act[i4_k] = ai4_curr_frame_32x32_sum_act[i4_k];
#endif
                ps_curr_out->i4_curr_frame_32x32_num_blks[i4_k] =
                    ai4_curr_frame_32x32_sum_blks[i4_k];
            }

            /* single frame level value, independent of the statistic index */
            ps_curr_out->u8_curr_frame_8x8_sum_act_sqr = u8_curr_frame_8x8_sum_act_sqr;

            /*16x16*/
#if USE_SQRT_AVG_OF_SATD_SQR
            ps_curr_out->i8_curr_frame_16x16_sum_act[2] = ai8_curr_frame_16x16_sum_act_sqr[2];
#else
            ps_curr_out->i8_curr_frame_16x16_sum_act[2] = ai4_curr_frame_16x16_sum_act[2];
#endif

            ps_curr_out->i4_curr_frame_16x16_num_blks[2] = ai4_curr_frame_16x16_sum_blks[2];

            /*32x32*/
#if USE_SQRT_AVG_OF_SATD_SQR
            ps_curr_out->i8_curr_frame_32x32_sum_act[2] = ai8_curr_frame_32x32_sum_act_sqr[2];
#else
            ps_curr_out->i8_curr_frame_32x32_sum_act[2] = ai4_curr_frame_32x32_sum_act[2];
#endif
            ps_curr_out->i4_curr_frame_32x32_num_blks[2] = ai4_curr_frame_32x32_sum_blks[2];
        }
    }
}
3536 
3537 /*!
3538 ******************************************************************************
3539 * \if Function name : ihevce_decomp_pre_intra_get_frame_satd \endif
3540 *
3541 * \brief
3542 *    Number of memory records are returned for enc_loop module
3543 *
3544 *
3545 * \return
3546 *    None
3547 *
3548 * \author
3549 *  Ittiam
3550 *
3551 *****************************************************************************
3552 */
ihevce_decomp_pre_intra_get_frame_satd(void * pv_ctxt,WORD32 * i4_width,WORD32 * i4_hieght)3553 LWORD64 ihevce_decomp_pre_intra_get_frame_satd(void *pv_ctxt, WORD32 *i4_width, WORD32 *i4_hieght)
3554 {
3555     ihevce_decomp_pre_intra_master_ctxt_t *ps_master_ctxt =
3556         (ihevce_decomp_pre_intra_master_ctxt_t *)pv_ctxt;
3557     WORD32 i4_i;
3558     LWORD64 i8_tot_satd = 0;
3559 
3560     /*accumulate SATD acorss all thread. note that every thread will enter this function,
3561     hence it must be guranteed that all thread must have completed preintra pass by now*/
3562     for(i4_i = 0; i4_i < ps_master_ctxt->i4_num_proc_thrds; i4_i++)
3563     {
3564         ihevce_decomp_pre_intra_ctxt_t *ps_ctxt =
3565             ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[i4_i];
3566 
3567         //i8_tot_satd += ps_ctxt->as_layers[1].s_early_decision.i8_sum_best_satd;
3568         i8_tot_satd += ps_ctxt->ps_ed_ctxt->i8_sum_best_satd;
3569 
3570         *i4_width = ps_ctxt->as_layers[1].i4_actual_wd;
3571         *i4_hieght = ps_ctxt->as_layers[1].i4_actual_ht;
3572     }
3573 
3574     return i8_tot_satd;
3575 }
3576 
ihevce_decomp_pre_intra_get_frame_satd_squared(void * pv_ctxt,WORD32 * i4_width,WORD32 * i4_hieght)3577 LWORD64 ihevce_decomp_pre_intra_get_frame_satd_squared(
3578     void *pv_ctxt, WORD32 *i4_width, WORD32 *i4_hieght)
3579 {
3580     ihevce_decomp_pre_intra_master_ctxt_t *ps_master_ctxt =
3581         (ihevce_decomp_pre_intra_master_ctxt_t *)pv_ctxt;
3582     WORD32 i4_i;
3583     LWORD64 i8_tot_satd = 0;
3584 
3585     /*accumulate SATD acorss all thread. note that every thread will enter this function,
3586     hence it must be guranteed that all thread must have completed preintra pass by now*/
3587     for(i4_i = 0; i4_i < ps_master_ctxt->i4_num_proc_thrds; i4_i++)
3588     {
3589         ihevce_decomp_pre_intra_ctxt_t *ps_ctxt =
3590             ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[i4_i];
3591 
3592         //i8_tot_satd += ps_ctxt->as_layers[1].s_early_decision.i8_sum_best_satd;
3593         i8_tot_satd += (ps_ctxt->ps_ed_ctxt->i8_sum_sq_best_satd);
3594 
3595         *i4_width = ps_ctxt->as_layers[1].i4_actual_wd;
3596         *i4_hieght = ps_ctxt->as_layers[1].i4_actual_ht;
3597     }
3598 
3599     return i8_tot_satd;
3600 }
3601