1 /******************************************************************************
2  *
3  * Copyright (C) 2018 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 
21 /*!
22 ******************************************************************************
23 * \file ihevce_decomp_pre_intra_pass.c
24 *
25 * \brief
26 *    This file contains definitions related to frame decomposition done during
27 *    pre intra processing
28 *
29 * \date
30 *    19/02/2013
31 *
32 * \author
33 *    Ittiam
34 *
35 * List of Functions
36 *    ihevce_intra_populate_mode_bits_cost()
37 *    ihevce_8x8_sad_computer()
38 *    ihevce_4x4_sad_computer()
39 *    ihevce_ed_4x4_find_best_modes()
40 *    ihevce_ed_calc_4x4_blk()
41 *    ihevce_ed_calc_8x8_blk()
42 *    ihevce_ed_calc_incomplete_ctb()
43 *    ihevce_cu_level_qp_mod()
44 *    ihevce_ed_calc_ctb()
45 *    ihevce_ed_frame_init()
46 *    ihevce_scale_by_2()
47 *    ihevce_decomp_pre_intra_process_row()
48 *    ihevce_decomp_pre_intra_process()
49 *    ihevce_decomp_pre_intra_get_num_mem_recs()
50 *    ihevce_decomp_pre_intra_get_mem_recs()
51 *    ihevce_decomp_pre_intra_init()
52 *    ihevce_decomp_pre_intra_frame_init()
53 *    ihevce_merge_sort()
54 *    ihevce_decomp_pre_intra_curr_frame_pre_intra_deinit()
55 *
56 ******************************************************************************
57 */
58 
59 /*****************************************************************************/
60 /* File Includes                                                             */
61 /*****************************************************************************/
62 /* System include files */
63 #include <stdio.h>
64 #include <string.h>
65 #include <stdlib.h>
66 #include <assert.h>
67 #include <stdarg.h>
68 #include <stdint.h>
69 #include <math.h>
70 #include <limits.h>
71 
72 /* User include files */
73 #include "ihevc_typedefs.h"
74 #include "itt_video_api.h"
75 #include "ihevce_api.h"
76 
77 #include "rc_cntrl_param.h"
78 #include "rc_frame_info_collector.h"
79 #include "rc_look_ahead_params.h"
80 
81 #include "ihevc_defs.h"
82 #include "ihevc_debug.h"
83 #include "ihevc_structs.h"
84 #include "ihevc_platform_macros.h"
85 #include "ihevc_deblk.h"
86 #include "ihevc_itrans_recon.h"
87 #include "ihevc_chroma_itrans_recon.h"
88 #include "ihevc_chroma_intra_pred.h"
89 #include "ihevc_intra_pred.h"
90 #include "ihevc_inter_pred.h"
91 #include "ihevc_mem_fns.h"
92 #include "ihevc_padding.h"
93 #include "ihevc_weighted_pred.h"
94 #include "ihevc_sao.h"
95 #include "ihevc_resi_trans.h"
96 #include "ihevc_quant_iquant_ssd.h"
97 #include "ihevc_cabac_tables.h"
98 
99 #include "ihevce_defs.h"
100 #include "ihevce_hle_interface.h"
101 #include "ihevce_lap_enc_structs.h"
102 #include "ihevce_multi_thrd_structs.h"
103 #include "ihevce_multi_thrd_funcs.h"
104 #include "ihevce_me_common_defs.h"
105 #include "ihevce_had_satd.h"
106 #include "ihevce_error_codes.h"
107 #include "ihevce_bitstream.h"
108 #include "ihevce_cabac.h"
109 #include "ihevce_rdoq_macros.h"
110 #include "ihevce_function_selector.h"
111 #include "ihevce_enc_structs.h"
112 #include "ihevce_entropy_structs.h"
113 #include "ihevce_cmn_utils_instr_set_router.h"
114 #include "ihevce_ipe_instr_set_router.h"
115 #include "ihevce_decomp_pre_intra_structs.h"
116 #include "ihevce_decomp_pre_intra_pass.h"
117 #include "ihevce_enc_loop_structs.h"
118 #include "hme_datatype.h"
119 #include "hme_interface.h"
120 #include "hme_common_defs.h"
121 #include "ihevce_global_tables.h"
122 
123 /*****************************************************************************/
124 /* Global variables                                                          */
125 /*****************************************************************************/
126 
/**
*****************************************************************************
* @brief subset of intra modes to be evaluated during pre enc intra process.
*  Entries 0 and 1 hold modes 0 and 1, which ihevce_ed_4x4_find_best_modes()
*  tracks as the non-angular candidates (mode < 2) and skips altogether
*  unless its low resolution flag is set. The remaining nine entries are the
*  angular candidates spaced four apart (2, 6, ... 34), with 26 listed first.
*****************************************************************************
*/
static const UWORD8 gau1_modes_to_eval[11] = { 0, 1, 26, 2, 6, 10, 14, 18, 22, 30, 34 };

/**
*****************************************************************************
* @brief  list of pointers to luma intra pred functions. Callers in this
*  file select an entry with g_i4_ip_funcs[mode].
*****************************************************************************
*/
pf_intra_pred g_apf_lum_ip[NUM_IP_FUNCS];
140 
141 /*****************************************************************************/
142 /* Function Definitions                                                      */
143 /*****************************************************************************/
144 
145 /*!
146 ******************************************************************************
147 * \if Function name : ihevce_intra_populate_mode_bits_cost \endif
148 *
149 * \brief: look-up table of cost of signalling an intra mode in the
150 *  bitstream
151 *
152 *****************************************************************************
153 */
ihevce_intra_populate_mode_bits_cost(UWORD16 * mode_bits_cost,WORD32 lambda)154 static void ihevce_intra_populate_mode_bits_cost(UWORD16 *mode_bits_cost, WORD32 lambda)
155 {
156     WORD32 i;
157     // 5.5 * lambda
158     UWORD16 five_bits_cost = COMPUTE_RATE_COST_CLIP30(11, lambda, (LAMBDA_Q_SHIFT + 1));
159 
160     for(i = 0; i < NUM_MODES; i++)
161     {
162         mode_bits_cost[i] = five_bits_cost;
163     }
164 }
165 
166 /*!
167 ******************************************************************************
168 * \if Function name : ihevce_8x8_sad_computer \endif
169 *
170 * \brief: compute sad between 2 8x8 blocks
171 *
172 *****************************************************************************
173 */
ihevce_8x8_sad_computer(UWORD8 * src,UWORD8 * pred,WORD32 src_strd,WORD32 pred_strd)174 UWORD16 ihevce_8x8_sad_computer(UWORD8 *src, UWORD8 *pred, WORD32 src_strd, WORD32 pred_strd)
175 {
176     UWORD16 sad = 0;
177     WORD32 i, j;
178 
179     for(i = 0; i < 8; i++)
180     {
181         for(j = 0; j < 8; j++)
182         {
183             sad += ABS(src[j] - pred[j]);
184         }
185         src += src_strd;
186         pred += pred_strd;
187     }
188 
189     return sad;
190 }
191 
192 /*!
193 ******************************************************************************
194 * \if Function name : ihevce_4x4_sad_computer \endif
195 *
196 * \brief: compute sad between 2 4x4 blocks
197 *
198 *****************************************************************************
199 */
ihevce_4x4_sad_computer(UWORD8 * src,UWORD8 * pred,WORD32 src_strd,WORD32 pred_strd)200 UWORD16 ihevce_4x4_sad_computer(UWORD8 *src, UWORD8 *pred, WORD32 src_strd, WORD32 pred_strd)
201 {
202     UWORD16 sad = 0;
203     WORD32 i, j;
204 
205     for(i = 0; i < 4; i++)
206     {
207         for(j = 0; j < 4; j++)
208         {
209             sad += ABS(src[j] - pred[j]);
210         }
211         src += src_strd;
212         pred += pred_strd;
213     }
214 
215     return sad;
216 }
217 
218 /*!
219 ******************************************************************************
220 * \if Function name : ihevce_ed_4x4_find_best_modes \endif
221 *
222 * \brief: evaluate input 4x4 block for pre-selected list intra modes and
223 * return best sad, cost
224 *
225 *****************************************************************************
226 */
ihevce_ed_4x4_find_best_modes(UWORD8 * pu1_src,WORD32 src_stride,UWORD8 * ref,UWORD16 * mode_bits_cost,UWORD8 * pu1_best_modes,WORD32 * pu1_best_sad_costs,WORD32 u1_low_resol,FT_SAD_COMPUTER * pf_4x4_sad_computer)227 void ihevce_ed_4x4_find_best_modes(
228     UWORD8 *pu1_src,
229     WORD32 src_stride,
230     UWORD8 *ref,
231     UWORD16 *mode_bits_cost,
232     UWORD8 *pu1_best_modes,
233     WORD32 *pu1_best_sad_costs,
234     WORD32 u1_low_resol,
235     FT_SAD_COMPUTER *pf_4x4_sad_computer)
236 {
237     WORD32 i;
238     UWORD8 mode = 0, best_amode = 0, best_nmode = 0;
239     UWORD8 pred[16];
240     WORD32 sad = 0;
241     WORD32 sad_cost = 0;
242     WORD32 best_asad_cost = 0xFFFFF;
243     WORD32 best_nsad_cost = 0xFFFFF;
244 
245     /* If lower layers, l1 or l2, all the 11 modes are evaluated */
246     /* If L0 layer, all modes excluding DC and Planar are evaluated */
247     if(1 == u1_low_resol)
248         i = 0;
249     else
250         i = 2;
251 
252     /* Find the best non-angular and angular mode till level 4 */
253     for(; i < 11; i++)
254     {
255         mode = gau1_modes_to_eval[i];
256         g_apf_lum_ip[g_i4_ip_funcs[mode]](&ref[0], 0, &pred[0], 4, 4, mode);
257         sad = pf_4x4_sad_computer(pu1_src, pred, src_stride, 4);
258         sad_cost = sad + mode_bits_cost[mode];
259         if(mode < 2)
260         {
261             if(sad_cost < best_nsad_cost)
262             {
263                 best_nmode = mode;
264                 best_nsad_cost = sad_cost;
265             }
266         }
267         else
268         {
269             if(sad_cost < best_asad_cost)
270             {
271                 best_amode = mode;
272                 best_asad_cost = sad_cost;
273             }
274         }
275     }
276 
277     pu1_best_modes[0] = best_amode;
278     pu1_best_sad_costs[0] = best_asad_cost;
279 
280     if(1 == u1_low_resol)
281     {
282         pu1_best_modes[1] = best_nmode;
283         pu1_best_sad_costs[1] = best_nsad_cost;
284     }
285 }
286 
287 /*!
288 ******************************************************************************
289 * \if Function name : ihevce_ed_calc_4x4_blk \endif
290 *
291 * \brief: evaluate input 4x4 block for all intra modes and return best sad &
292 *  cost
293 *
294 *****************************************************************************
295 */
ihevce_ed_calc_4x4_blk(ihevce_ed_blk_t * ps_ed,UWORD8 * pu1_src,WORD32 src_stride,UWORD8 * ref,UWORD16 * mode_bits_cost,WORD32 * pi4_best_satd,WORD32 i4_quality_preset,WORD32 * pi4_best_sad_cost,ihevce_ipe_optimised_function_list_t * ps_ipe_optimised_function_list)296 static void ihevce_ed_calc_4x4_blk(
297     ihevce_ed_blk_t *ps_ed,
298     UWORD8 *pu1_src,
299     WORD32 src_stride,
300     UWORD8 *ref,
301     UWORD16 *mode_bits_cost,
302     WORD32 *pi4_best_satd,
303     WORD32 i4_quality_preset,
304     WORD32 *pi4_best_sad_cost,
305     ihevce_ipe_optimised_function_list_t *ps_ipe_optimised_function_list)
306 {
307     WORD32 i, i_end;
308     UWORD8 mode, best_amode, best_nmode;
309     UWORD8 pred[16];
310     UWORD16 sad;
311     WORD32 sad_cost = 0;
312     WORD32 best_asad_cost = 0xFFFFF;
313     WORD32 best_nsad_cost = 0xFFFFF;
314     UWORD8 au1_best_modes[2];
315     WORD32 ai4_best_sad_costs[2];
316     /* L1/L2 resolution hence low resolution enable */
317     const WORD32 u1_low_resol = 1;
318     UWORD8 modes_to_eval[2];
319 
320     ps_ipe_optimised_function_list->pf_ed_4x4_find_best_modes(
321         pu1_src,
322         src_stride,
323         ref,
324         mode_bits_cost,
325         au1_best_modes,
326         ai4_best_sad_costs,
327         u1_low_resol,
328         ps_ipe_optimised_function_list->pf_4x4_sad_computer);
329 
330     best_nmode = au1_best_modes[1];
331     best_amode = au1_best_modes[0];
332     best_nsad_cost = ai4_best_sad_costs[1];
333     best_asad_cost = ai4_best_sad_costs[0];
334     *pi4_best_satd = best_asad_cost - mode_bits_cost[best_amode];
335 
336     /* Around best level 4 angular mode, search for best level 2 mode */
337     modes_to_eval[0] = best_amode - 2;
338     modes_to_eval[1] = best_amode + 2;
339     i = 0;
340     i_end = 2;
341     if(best_amode == 2)
342         i = 1;
343     else if(best_amode == 34)
344         i_end = 1;
345     for(; i < i_end; i++)
346     {
347         mode = modes_to_eval[i];
348         g_apf_lum_ip[g_i4_ip_funcs[mode]](&ref[0], 0, &pred[0], 4, 4, mode);
349         sad = ps_ipe_optimised_function_list->pf_4x4_sad_computer(pu1_src, pred, src_stride, 4);
350         sad_cost = sad + mode_bits_cost[mode];
351         if(sad_cost < best_asad_cost)
352         {
353             best_amode = mode;
354             best_asad_cost = sad_cost;
355             *pi4_best_satd = sad;
356         }
357     }
358 
359     if(i4_quality_preset < IHEVCE_QUALITY_P4)
360     {
361         /* Around best level 2 angular mode, search for best level 1 mode */
362         modes_to_eval[0] = best_amode - 1;
363         modes_to_eval[1] = best_amode + 1;
364         i = 0;
365         i_end = 2;
366         if(best_amode == 2)
367             i = 1;
368         else if(best_amode == 34)
369             i_end = 1;
370         for(; i < i_end; i++)
371         {
372             mode = modes_to_eval[i];
373             g_apf_lum_ip[g_i4_ip_funcs[mode]](&ref[0], 0, &pred[0], 4, 4, mode);
374             sad = ps_ipe_optimised_function_list->pf_4x4_sad_computer(pu1_src, pred, src_stride, 4);
375             sad_cost = sad + mode_bits_cost[mode];
376             if(sad_cost < best_asad_cost)
377             {
378                 best_amode = mode;
379                 best_asad_cost = sad_cost;
380                 *pi4_best_satd = sad;
381             }
382         }
383     }
384 
385     if(best_asad_cost < best_nsad_cost)
386     {
387         ps_ed->best_mode = best_amode;
388         *pi4_best_sad_cost = best_asad_cost;
389     }
390     else
391     {
392         ps_ed->best_mode = best_nmode;
393         *pi4_best_sad_cost = best_nsad_cost;
394     }
395     ps_ed->intra_or_inter = 0;
396     ps_ed->merge_success = 0;
397 }
398 
399 /*!
400 ******************************************************************************
401 * \if Function name : ihevce_ed_calc_8x8_blk \endif
402 *
403 * \brief: evaluate input 8x8 block for intra modes basing on the intra mode
404 *  decisions made at 4x4 level. This function also makes a decision whether
405 *  to split blk in to 4x4 partitions or not.
406 *
407 *****************************************************************************
408 */
ihevce_ed_calc_8x8_blk(ihevce_ed_ctxt_t * ps_ed_ctxt,ihevce_ed_blk_t * ps_ed_8x8,UWORD8 * pu1_src,WORD32 src_stride,WORD32 * nbr_flags_ptr,WORD32 lambda,WORD32 * pi4_best_satd,WORD32 i4_layer_id,WORD32 i4_quality_preset,WORD32 * pi4_best_sad_cost_8x8_l1_ipe,WORD32 * pi4_best_sad_8x8_l1_ipe,ihevce_ipe_optimised_function_list_t * ps_ipe_optimised_function_list,ihevce_cmn_opt_func_t * ps_cmn_utils_optimised_function_list)409 static void ihevce_ed_calc_8x8_blk(
410     ihevce_ed_ctxt_t *ps_ed_ctxt,
411     ihevce_ed_blk_t *ps_ed_8x8,
412     UWORD8 *pu1_src,
413     WORD32 src_stride,
414     WORD32 *nbr_flags_ptr,
415     WORD32 lambda,
416     WORD32 *pi4_best_satd,
417     WORD32 i4_layer_id,
418     WORD32 i4_quality_preset,
419     WORD32 *pi4_best_sad_cost_8x8_l1_ipe,
420     WORD32 *pi4_best_sad_8x8_l1_ipe,
421     ihevce_ipe_optimised_function_list_t *ps_ipe_optimised_function_list,
422     ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list)
423 {
424     ihevce_ed_blk_t *ps_ed_4x4 = ps_ed_8x8;
425     UWORD8 *pu1_src_arr[4];
426     WORD32 ai4_4x4_best_sad_cost[4];
427     WORD32 nbr_flags_c, nbr_flags_r;
428     UWORD8 *pu1_src_4x4;
429     WORD32 i, j;
430     func_selector_t *ps_func_selector = ps_ed_ctxt->ps_func_selector;
431     ihevc_intra_pred_luma_ref_substitution_ft *pf_intra_pred_luma_ref_substitution =
432         ps_func_selector->ihevc_intra_pred_luma_ref_substitution_fptr;
433 
434     /* linearize ref samples for ipe of 8x8 block */
435     nbr_flags_c = nbr_flags_ptr[0];
436     nbr_flags_r = nbr_flags_ptr[1];
437     if(CHECK_TR_AVAILABLE(nbr_flags_r))
438     {
439         SET_TR_AVAILABLE(nbr_flags_c);
440     }
441     else
442     {
443         SET_TR_UNAVAILABLE(nbr_flags_c);
444     }
445 
446     pf_intra_pred_luma_ref_substitution(
447         pu1_src - src_stride - 1,
448         pu1_src - src_stride,
449         pu1_src - 1,
450         src_stride,
451         8,
452         nbr_flags_c,
453         &ps_ed_ctxt->au1_ref_8x8[0][0],
454         0);
455 
456     for(i = 0; i < 2; i++)
457     {
458         pu1_src_4x4 = pu1_src + i * 4 * src_stride;
459         for(j = 0; j < 2; j++)
460         {
461             WORD32 i4_best_satd;
462 
463             pu1_src_arr[i * 2 + j] = pu1_src_4x4;
464             nbr_flags_c = nbr_flags_ptr[i * 8 + j];
465 
466             /* linearize ref samples for ipe of 4x4 block */
467             pf_intra_pred_luma_ref_substitution(
468                 pu1_src_4x4 - src_stride - 1,
469                 pu1_src_4x4 - src_stride,
470                 pu1_src_4x4 - 1,
471                 src_stride,
472                 4,
473                 nbr_flags_c,
474                 &ps_ed_ctxt->au1_ref_full_ctb[i * 2 + j][0],
475                 0);
476 
477             /* populates mode bits cost */
478             ihevce_intra_populate_mode_bits_cost(
479                 &ps_ed_ctxt->au2_mode_bits_cost_full_ctb[i * 2 + j][0], lambda);
480 
481             ihevce_ed_calc_4x4_blk(
482                 ps_ed_4x4,
483                 pu1_src_4x4,
484                 src_stride,
485                 &ps_ed_ctxt->au1_ref_full_ctb[i * 2 + j][0],
486                 &ps_ed_ctxt->au2_mode_bits_cost_full_ctb[i * 2 + j][0],
487                 &i4_best_satd,
488                 i4_quality_preset,
489                 &ai4_4x4_best_sad_cost[i * 2 + j],
490                 ps_ipe_optimised_function_list);
491 
492             pu1_src_4x4 += 4;
493             ps_ed_4x4 += 1;
494         }
495     }
496 
497     /* 8x8 merge */
498     {
499         UWORD8 pred[64];
500         WORD32 merge_success;
501         WORD32 sad, satd, cost;
502         UWORD16 u2_sum_best_4x4_sad_cost = 0;
503         UWORD16 u2_sum_best_4x4_satd_cost = 0;
504         WORD32 i4_best_8x8_sad, i4_best_8x8_satd = 0;
505         UWORD16 u2_best_8x8_cost = (UWORD16)(-1);
506         UWORD8 u1_best_8x8_mode;
507         UWORD8 modes_to_eval[6];
508         UWORD8 u1_cond_4x4_satd;
509         UWORD8 mode;
510 
511         /* init */
512         ps_ed_4x4 = ps_ed_8x8;
513         u1_best_8x8_mode = mode = ps_ed_4x4[0].best_mode;
514         merge_success =
515             (((ps_ed_4x4[0].best_mode == ps_ed_4x4[1].best_mode) +
516               (ps_ed_4x4[0].best_mode == ps_ed_4x4[2].best_mode) +
517               (ps_ed_4x4[0].best_mode == ps_ed_4x4[3].best_mode)) == 3);
518         *pi4_best_satd = 0;
519 
520         for(i = 0; i < 4; i++)
521         {
522             u2_sum_best_4x4_sad_cost += ai4_4x4_best_sad_cost[i];
523             modes_to_eval[i] = ps_ed_4x4[i].best_mode;
524         }
525 
526         u1_cond_4x4_satd = ((1 == i4_layer_id) || (!merge_success && i4_quality_preset < IHEVCE_QUALITY_P4));
527         if(u1_cond_4x4_satd)
528         {
529             /* Get SATD for 4x4 blocks */
530             for(i = 0; i < 4; i++)
531             {
532                 mode = modes_to_eval[i];
533                 g_apf_lum_ip[g_i4_ip_funcs[mode]](
534                     &ps_ed_ctxt->au1_ref_full_ctb[i][0], 0, &pred[0], 4, 4, mode);
535 
536                 satd = ps_cmn_utils_optimised_function_list->pf_HAD_4x4_8bit(
537                     pu1_src_arr[i], src_stride, &pred[0], 4, NULL, 0);
538 
539                 (ps_ed_4x4 + i)->i4_4x4_satd = satd;
540 
541                 u2_sum_best_4x4_satd_cost +=
542                     (satd + ps_ed_ctxt->au2_mode_bits_cost_full_ctb[i][mode]);
543                 *pi4_best_satd += satd;
544             }
545         }
546 
547         if(!merge_success)
548         {
549             UWORD8 i1_start; /* no of modes to evaluate */
550             UWORD8 ai1_modes[6];
551             WORD32 i4_merge_success_stage2 = 0;
552 
553             /* Prepare 6 candidates for 8x8 block. Two are DC and planar */
554             ai1_modes[4] = 0;
555             ai1_modes[5] = 1;
556             i1_start = 4;
557 
558             /* Assign along with removing duplicates rest 4 candidates. */
559             for(i = 3; i >= 0; i--)
560             {
561                 WORD8 i1_fresh_mode_flag = 1;
562 
563                 mode = modes_to_eval[i];
564                 /* Check if duplicate already exists in ai1_modes */
565                 for(j = i1_start; j < 6; j++)
566                 {
567                     if(mode == ai1_modes[j])
568                         i1_fresh_mode_flag = 0;
569                 }
570                 if(i1_fresh_mode_flag)
571                 {
572                     i1_start--;
573                     ai1_modes[i1_start] = mode;
574                 }
575             }
576 
577             if(i4_quality_preset < IHEVCE_QUALITY_P4)
578             {
579                 // 7.5 * lambda to incorporate transform flags
580                 u2_sum_best_4x4_satd_cost +=
581                     (COMPUTE_RATE_COST_CLIP30(12, lambda, (LAMBDA_Q_SHIFT + 1)));
582 
583                 /* loop over all modes for calculating SATD */
584                 for(i = i1_start; i < 6; i++)
585                 {
586                     mode = ai1_modes[i];
587                     g_apf_lum_ip[g_i4_ip_funcs[mode]](
588                         &ps_ed_ctxt->au1_ref_8x8[0][0], 0, &pred[0], 8, 8, mode);
589 
590                     satd = ps_cmn_utils_optimised_function_list->pf_HAD_8x8_8bit(
591                         pu1_src_arr[0], src_stride, &pred[0], 8, NULL, 0);
592 
593                     cost = satd + ps_ed_ctxt->au2_mode_bits_cost_full_ctb[0][mode];
594 
595                     /* Update data corresponding to least 8x8 cost */
596                     if(cost <= u2_best_8x8_cost)
597                     {
598                         u2_best_8x8_cost = cost;
599                         i4_best_8x8_satd = satd;
600                         u1_best_8x8_mode = mode;
601                     }
602                 }
603 
604                 /* 8x8 vs 4x4 decision based on SATD values */
605                 if((u2_best_8x8_cost <= u2_sum_best_4x4_satd_cost) || (u2_best_8x8_cost <= 300))
606                 {
607                     i4_merge_success_stage2 = 1;
608                 }
609 
610                 /* Find the SAD based cost for 8x8 block for best mode */
611                 if(1 == i4_layer_id)
612                 {
613                     UWORD8 i4_best_8x8_mode = u1_best_8x8_mode;
614                     WORD32 i4_best_8x8_sad_curr;
615 
616                     g_apf_lum_ip[g_i4_ip_funcs[i4_best_8x8_mode]](
617                         &ps_ed_ctxt->au1_ref_8x8[0][0], 0, &pred[0], 8, 8, i4_best_8x8_mode);
618 
619                     i4_best_8x8_sad_curr = ps_ipe_optimised_function_list->pf_8x8_sad_computer(
620                         pu1_src_arr[0], &pred[0], src_stride, 8);
621 
622                     *pi4_best_sad_cost_8x8_l1_ipe =
623                         i4_best_8x8_sad_curr +
624                         ps_ed_ctxt->au2_mode_bits_cost_full_ctb[0][i4_best_8x8_mode];
625                     *pi4_best_sad_8x8_l1_ipe = i4_best_8x8_sad_curr;
626                 }
627             }
628             else /*If high_speed or extreme speed*/
629             {
630                 // 7.5 * lambda to incorporate transform flags
631                 u2_sum_best_4x4_sad_cost +=
632                     (COMPUTE_RATE_COST_CLIP30(12, lambda, (LAMBDA_Q_SHIFT + 1)));
633 
634                 /*Loop over all modes for calculating SAD*/
635                 for(i = i1_start; i < 6; i++)
636                 {
637                     mode = ai1_modes[i];
638                     g_apf_lum_ip[g_i4_ip_funcs[mode]](
639                         &ps_ed_ctxt->au1_ref_8x8[0][0], 0, &pred[0], 8, 8, mode);
640 
641                     sad = ps_ipe_optimised_function_list->pf_8x8_sad_computer(
642                         pu1_src_arr[0], &pred[0], src_stride, 8);
643 
644                     cost = sad + ps_ed_ctxt->au2_mode_bits_cost_full_ctb[0][mode];
645 
646                     /*Find the data correspoinding to least cost */
647                     if(cost <= u2_best_8x8_cost)
648                     {
649                         u2_best_8x8_cost = cost;
650                         i4_best_8x8_sad = sad;
651                         u1_best_8x8_mode = mode;
652                     }
653                 }
654 
655                 /* 8x8 vs 4x4 decision based on SAD values */
656                 if((u2_best_8x8_cost <= u2_sum_best_4x4_sad_cost) || (u2_best_8x8_cost <= 300))
657                 {
658                     i4_merge_success_stage2 = 1;
659                     if(1 == i4_layer_id)
660                     {
661                         g_apf_lum_ip[g_i4_ip_funcs[u1_best_8x8_mode]](
662                             &ps_ed_ctxt->au1_ref_8x8[0][0], 0, &pred[0], 8, 8, u1_best_8x8_mode);
663                         i4_best_8x8_satd = ps_cmn_utils_optimised_function_list->pf_HAD_8x8_8bit(
664                             pu1_src_arr[0], src_stride, &pred[0], 8, NULL, 0);
665                     }
666                 }
667 
668                 if(1 == i4_layer_id)
669                 {
670                     *pi4_best_sad_cost_8x8_l1_ipe = u2_best_8x8_cost;
671                     *pi4_best_sad_8x8_l1_ipe = i4_best_8x8_sad;
672                 }
673             }
674             if(i4_merge_success_stage2)
675             {
676                 ps_ed_4x4->merge_success = 1;
677                 ps_ed_4x4->best_merge_mode = u1_best_8x8_mode;
678                 *pi4_best_satd = i4_best_8x8_satd;
679             }
680         }
681         else
682         {
683             ps_ed_4x4->merge_success = 1;
684             ps_ed_4x4->best_merge_mode = u1_best_8x8_mode;
685 
686             if(1 == i4_layer_id)
687             {
688                 mode = u1_best_8x8_mode;
689                 g_apf_lum_ip[g_i4_ip_funcs[mode]](
690                     &ps_ed_ctxt->au1_ref_8x8[0][0], 0, &pred[0], 8, 8, mode);
691 
692                 i4_best_8x8_sad = ps_ipe_optimised_function_list->pf_8x8_sad_computer(
693                     pu1_src_arr[0], &pred[0], src_stride, 8);
694 
695                 *pi4_best_sad_cost_8x8_l1_ipe =
696                     i4_best_8x8_sad + ps_ed_ctxt->au2_mode_bits_cost_full_ctb[0][mode];
697                 *pi4_best_sad_8x8_l1_ipe = i4_best_8x8_sad;
698 
699                 i4_best_8x8_satd = ps_cmn_utils_optimised_function_list->pf_HAD_8x8_8bit(
700                     pu1_src_arr[0], src_stride, &pred[0], 8, NULL, 0);
701             }
702             *pi4_best_satd = i4_best_8x8_satd;
703         }
704     }
705 }
706 
707 /*!
708 ******************************************************************************
709 * \if Function name : ihevce_ed_calc_ctb \endif
710 *
711 * \brief: performs L1/L2 8x8 and 4x4 intra mode analysis
712 *
713 *****************************************************************************
714 */
ihevce_ed_calc_ctb(ihevce_ed_ctxt_t * ps_ed_ctxt,ihevce_ed_blk_t * ps_ed_ctb,ihevce_ed_ctb_l1_t * ps_ed_ctb_l1,UWORD8 * pu1_src,WORD32 src_stride,WORD32 num_4x4_blks_x,WORD32 num_4x4_blks_y,WORD32 * nbr_flags,WORD32 i4_layer_id,ihevce_ipe_optimised_function_list_t * ps_ipe_optimised_function_list,ihevce_cmn_opt_func_t * ps_cmn_utils_optimised_function_list)715 void ihevce_ed_calc_ctb(
716     ihevce_ed_ctxt_t *ps_ed_ctxt,
717     ihevce_ed_blk_t *ps_ed_ctb,
718     ihevce_ed_ctb_l1_t *ps_ed_ctb_l1,
719     UWORD8 *pu1_src,
720     WORD32 src_stride,
721     WORD32 num_4x4_blks_x,
722     WORD32 num_4x4_blks_y,
723     WORD32 *nbr_flags,
724     WORD32 i4_layer_id,
725     ihevce_ipe_optimised_function_list_t *ps_ipe_optimised_function_list,
726     ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list)
727 {
728     ihevce_ed_blk_t *ps_ed_8x8;
729     UWORD8 *pu1_src_8x8;
730     WORD32 *nbr_flags_ptr;
731     WORD32 lambda = ps_ed_ctxt->lambda;
732     WORD32 i, j;
733     WORD32 z_scan_idx = 0;
734     WORD32 z_scan_act_idx = 0;
735 
736     if(i4_layer_id == 1)
737     {
738         WORD32 i4_i;
739 
740         for(i4_i = 0; i4_i < 64; i4_i++)
741         {
742             (ps_ed_ctb + i4_i)->i4_4x4_satd = -1;
743         }
744 
745         for(i4_i = 0; i4_i < 16; i4_i++)
746         {
747             ps_ed_ctb_l1->i4_sum_4x4_satd[i4_i] = -2;
748             ps_ed_ctb_l1->i4_min_4x4_satd[i4_i] = 0x7FFFFFFF;
749             ps_ed_ctb_l1->i4_8x8_satd[i4_i][0] = -2;
750             ps_ed_ctb_l1->i4_8x8_satd[i4_i][1] = -2;
751         }
752 
753         for(i4_i = 0; i4_i < 4; i4_i++)
754         {
755             ps_ed_ctb_l1->i4_16x16_satd[i4_i][0] = -2;
756             ps_ed_ctb_l1->i4_16x16_satd[i4_i][1] = -2;
757             ps_ed_ctb_l1->i4_16x16_satd[i4_i][2] = -2;
758         }
759         ps_ed_ctb_l1->i4_32x32_satd[0][0] = -2;
760         ps_ed_ctb_l1->i4_32x32_satd[0][1] = -2;
761         ps_ed_ctb_l1->i4_32x32_satd[0][2] = -2;
762         ps_ed_ctb_l1->i4_32x32_satd[0][3] = -2;
763 
764         for(i4_i = 0; i4_i < 16; i4_i++)
765         {
766             ps_ed_ctb_l1->i4_best_sad_cost_8x8_l1_me[i4_i] = -1;
767             ps_ed_ctb_l1->i4_sad_cost_me_for_ref[i4_i] = -1;
768             ps_ed_ctb_l1->i4_sad_me_for_ref[i4_i] = -1;
769             ps_ed_ctb_l1->i4_best_sad_8x8_l1_me[i4_i] = -1;
770 
771             ps_ed_ctb_l1->i4_best_sad_8x8_l1_me_for_decide[i4_i] = -1;
772 
773             ps_ed_ctb_l1->i4_best_satd_8x8[i4_i] = -1;
774             ps_ed_ctb_l1->i4_best_sad_cost_8x8_l1_ipe[i4_i] = -1;
775             ps_ed_ctb_l1->i4_best_sad_8x8_l1_ipe[i4_i] = -1;
776         }
777     }
778 
779     ASSERT((num_4x4_blks_x & 1) == 0);
780     ASSERT((num_4x4_blks_y & 1) == 0);
781     for(i = 0; i < num_4x4_blks_y / 2; i++)
782     {
783         pu1_src_8x8 = pu1_src + i * 2 * 4 * src_stride;
784         nbr_flags_ptr = &nbr_flags[0] + 2 * 8 * i;
785 
786         for(j = 0; j < num_4x4_blks_x / 2; j++)
787         {
788             WORD32 i4_best_satd;
789             WORD32 i4_best_sad_cost_8x8_l1_ipe;
790             WORD32 i4_best_sad_8x8_l1_ipe;
791 
792             z_scan_idx = gau1_ctb_raster_to_zscan[i * 2 * 16 + j * 2];
793             z_scan_act_idx = gau1_ctb_raster_to_zscan[i * 16 + j];
794             ASSERT(z_scan_act_idx <= 15);
795 
796             ps_ed_8x8 = ps_ed_ctb + z_scan_idx;
797             ihevce_ed_calc_8x8_blk(
798                 ps_ed_ctxt,
799                 ps_ed_8x8,
800                 pu1_src_8x8,
801                 src_stride,
802                 nbr_flags_ptr,
803                 lambda,
804                 &i4_best_satd,
805                 i4_layer_id,
806                 ps_ed_ctxt->i4_quality_preset,
807                 &i4_best_sad_cost_8x8_l1_ipe,
808                 &i4_best_sad_8x8_l1_ipe,
809                 ps_ipe_optimised_function_list,
810                 ps_cmn_utils_optimised_function_list);
811             ASSERT(i4_best_satd >= 0);
812 
813             if(i4_layer_id == 1)
814             {
815                 ps_ed_ctb_l1->i4_best_sad_cost_8x8_l1_ipe[z_scan_act_idx] =
816                     i4_best_sad_cost_8x8_l1_ipe;
817                 ps_ed_ctb_l1->i4_best_sad_8x8_l1_ipe[z_scan_act_idx] = i4_best_sad_8x8_l1_ipe;
818                 ps_ed_ctb_l1->i4_best_satd_8x8[z_scan_act_idx] = i4_best_satd;
819                 ps_ed_ctxt->i8_sum_best_satd += i4_best_satd;
820                 ps_ed_ctxt->i8_sum_sq_best_satd += (i4_best_satd * i4_best_satd);
821             }
822             pu1_src_8x8 += 8;
823             nbr_flags_ptr += 2;
824         }
825     }
826 }
827 
/*
 * Cheap approximation of log2(val).
 *
 * The biased exponent field (bits 23..30 of the float's bit pattern) supplies
 * the integer part; the exponent is then forced to 127 so that the remaining
 * value lies in [1, 2), and a quadratic in that value supplies the fraction.
 * NOTE(review): relies on float being a 32-bit IEEE-754 single - confirm for
 * every target this is built for.
 */
float fast_log2(float val)
{
    union
    {
        float val;
        int32_t x;
    } pun;
    const int32_t exp_field = 255 << 23;
    float mantissa;
    float result;

    pun.val = val;

    /* integer part: biased exponent minus 128 (the polynomial adds ~1 back) */
    result = (float)(((pun.x >> 23) & 255) - 128);

    /* rewrite the exponent to 127, leaving sign and mantissa untouched */
    pun.x &= ~exp_field;
    pun.x += 127 << 23;
    mantissa = pun.val;

    result += ((-1.0f / 3) * mantissa + 2) * mantissa - 2.0f / 3;
    return result;
}
838 
839 /*!
840 ******************************************************************************
841 * \if Function name : ihevce_cu_level_qp_mod \endif
842 *
843 * \brief: Performs CU level QP modulation
844 *
845 *****************************************************************************
846 */
ihevce_cu_level_qp_mod(WORD32 frm_qscale,WORD32 cu_satd,long double frm_avg_activity,float f_mod_strength,WORD32 * pi4_act_factor,WORD32 * pi4_q_scale_mod,rc_quant_t * rc_quant_ctxt)847 WORD32 ihevce_cu_level_qp_mod(
848     WORD32 frm_qscale,
849     WORD32 cu_satd,
850     long double frm_avg_activity,
851     float f_mod_strength,
852     WORD32 *pi4_act_factor,
853     WORD32 *pi4_q_scale_mod,
854     rc_quant_t *rc_quant_ctxt)
855 {
856     WORD32 cu_qscale;
857     WORD32 cu_qp;
858 
859     *pi4_act_factor = (1 << QP_LEVEL_MOD_ACT_FACTOR);
860     if(cu_satd != -1 && (WORD32)frm_avg_activity != 0)
861     {
862         ULWORD64 sq_cur_satd = (cu_satd * cu_satd);
863         float log2_sq_cur_satd = fast_log2(1 + sq_cur_satd);
864         WORD32 qp_offset = f_mod_strength * (log2_sq_cur_satd - frm_avg_activity);
865 
866         ASSERT(USE_SQRT_AVG_OF_SATD_SQR);
867         qp_offset = CLIP3(qp_offset, MIN_QP_MOD_OFFSET, MAX_QP_MOD_OFFSET);
868         *pi4_act_factor *= gad_look_up_activity[qp_offset + ABS(MIN_QP_MOD_OFFSET)];
869         ASSERT(*pi4_act_factor > 0);
870         cu_qscale = ((frm_qscale * (*pi4_act_factor)) + (1 << (QP_LEVEL_MOD_ACT_FACTOR - 1)));
871         cu_qscale >>= QP_LEVEL_MOD_ACT_FACTOR;
872     }
873     else
874     {
875         cu_qscale = frm_qscale;
876     }
877     cu_qscale = CLIP3(cu_qscale, rc_quant_ctxt->i2_min_qscale, rc_quant_ctxt->i2_max_qscale);
878     cu_qp = rc_quant_ctxt->pi4_qscale_to_qp[cu_qscale];
879     cu_qp = CLIP3(cu_qp, rc_quant_ctxt->i2_min_qp, rc_quant_ctxt->i2_max_qp);
880     *pi4_q_scale_mod = cu_qscale;
881 
882     return (cu_qp);
883 }
884 
885 /*!
886 ******************************************************************************
887 * \if Function name : ihevce_ed_frame_init \endif
888 *
889 * \brief: Initialize frame context for early decision
890 *
891 *****************************************************************************
892 */
ihevce_ed_frame_init(void * pv_ed_ctxt,WORD32 i4_layer_no)893 void ihevce_ed_frame_init(void *pv_ed_ctxt, WORD32 i4_layer_no)
894 {
895     ihevce_ed_ctxt_t *ps_ed_ctxt = (ihevce_ed_ctxt_t *)pv_ed_ctxt;
896 
897     g_apf_lum_ip[IP_FUNC_MODE_0] = ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_planar_fptr;
898     g_apf_lum_ip[IP_FUNC_MODE_1] = ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_dc_fptr;
899     g_apf_lum_ip[IP_FUNC_MODE_2] = ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_mode2_fptr;
900     g_apf_lum_ip[IP_FUNC_MODE_3TO9] =
901         ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_mode_3_to_9_fptr;
902     g_apf_lum_ip[IP_FUNC_MODE_10] = ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_horz_fptr;
903     g_apf_lum_ip[IP_FUNC_MODE_11TO17] =
904         ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_mode_11_to_17_fptr;
905     g_apf_lum_ip[IP_FUNC_MODE_18_34] =
906         ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_mode_18_34_fptr;
907     g_apf_lum_ip[IP_FUNC_MODE_19TO25] =
908         ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_mode_19_to_25_fptr;
909     g_apf_lum_ip[IP_FUNC_MODE_26] = ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_ver_fptr;
910     g_apf_lum_ip[IP_FUNC_MODE_27TO33] =
911         ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_mode_27_to_33_fptr;
912 
913     if(i4_layer_no == 1)
914     {
915         ps_ed_ctxt->i8_sum_best_satd = 0;
916         ps_ed_ctxt->i8_sum_sq_best_satd = 0;
917     }
918 }
919 
/**
********************************************************************************
*
*  @brief  ihevce_scale_by_2: downscales by 2 in the horizontal and vertical
*          directions, creating an output of size wd/2 * ht/2
*
*  @param[in]  pu1_src : source pointer
*  @param[in]  src_strd : source stride
*  @param[out] pu1_dst : destination pointer. Start of a row.
*  @param[in]  dst_strd : destination stride
*  @param[in]  wd : width
*  @param[in]  ht : height
*  @param[in]  pu1_wkg_mem : working memory (at least of size CEIL16(wd) * ht)
*  @param[in]  ht_offset : height offset of the block to be scaled
*  @param[in]  block_ht : height of the block to be scaled
*  @param[in]  wd_offset : width offset of the block to be scaled
*  @param[in]  block_wd : width of the block to be scaled
*  @param[in]  pf_copy_2d : 2D copy function used for boundary blocks
*  @param[in]  pf_scaling_filter_mxn : scaling filter used when block_wd is a
*              multiple of 16
*
*  @return void
*
*  @remarks Assumes block_ht is a multiple of 2. LANCZOS_SCALER
*
*  @note This header describes ihevce_scale_by_2(), which is defined after
*        ihevce_scaling_filter_mxn() below.
*
********************************************************************************
*/
/**
********************************************************************************
*
*  @brief  Separable 7 tap filter that decimates a block by 2 in both
*          directions. A horizontal pass writes every second column of rows
*          -3 .. ht + 1 into the scratch buffer; a vertical pass then produces
*          every second row from the scratch buffer.
*
*  @param[in]  pu1_src : top left sample of the block. Three samples to the
*              left and right of each filtered column, three rows above the
*              block and two rows below it are read as well
*  @param[in]  src_strd : source stride
*  @param[out] pu1_scrtch : scratch buffer for the horizontally filtered rows
*              (ht + 5 rows are written)
*  @param[in]  scrtch_strd : scratch buffer stride
*  @param[out] pu1_dst : destination pointer
*  @param[in]  dst_strd : destination stride
*  @param[in]  ht : height of the input block
*  @param[in]  wd : width of the input block
*
*  @return void
*
********************************************************************************
*/
void ihevce_scaling_filter_mxn(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_scrtch,
    WORD32 scrtch_strd,
    UWORD8 *pu1_dst,
    WORD32 dst_strd,
    WORD32 ht,
    WORD32 wd)
{
#define FILT_TAP_Q 8
#define N_TAPS 7
    /* symmetric taps, only indices 0..3 are referenced below */
    const WORD16 i4_ftaps[N_TAPS] = { -18, 0, 80, 132, 80, 0, -18 };
    const WORD32 rnd = (1 << (FILT_TAP_Q - 1));
    const WORD32 out_wd = (wd >> 1);
    UWORD8 *pu1_in_row;
    UWORD8 *pu1_mid_row;
    WORD32 row, col;

    /* pass 1 : horizontal filtering + decimation, including the extra rows
     * that the vertical taps will need */
    pu1_in_row = pu1_src - 3 * src_strd;
    pu1_mid_row = pu1_scrtch;
    for(row = -3; row < ht + 2; row++)
    {
        for(col = 0; col < wd; col += 2)
        {
            const UWORD8 *pu1_c = pu1_in_row + col;
            WORD32 acc = rnd;

            acc += i4_ftaps[3] * pu1_c[0];
            acc += i4_ftaps[2] * (pu1_c[-1] + pu1_c[1]);
            acc += i4_ftaps[1] * (pu1_c[2] + pu1_c[-2]);
            acc += i4_ftaps[0] * (pu1_c[3] + pu1_c[-3]);
            acc >>= FILT_TAP_Q;
            pu1_mid_row[col >> 1] = CLIP_U8(acc);
        }
        pu1_in_row += src_strd;
        pu1_mid_row += scrtch_strd;
    }

    /* pass 2 : vertical filtering + decimation; scratch row 3 corresponds to
     * source row 0 */
    pu1_mid_row = pu1_scrtch + 3 * scrtch_strd;
    for(row = 0; row < ht; row += 2)
    {
        for(col = 0; col < out_wd; col++)
        {
            const UWORD8 *pu1_c = pu1_mid_row + col;
            WORD32 acc = rnd;

            acc += i4_ftaps[3] * pu1_c[0];
            acc += i4_ftaps[2] * (pu1_c[scrtch_strd] + pu1_c[-scrtch_strd]);
            acc += i4_ftaps[1] * (pu1_c[2 * scrtch_strd] + pu1_c[-2 * scrtch_strd]);
            acc += i4_ftaps[0] * (pu1_c[3 * scrtch_strd] + pu1_c[-3 * scrtch_strd]);
            acc >>= FILT_TAP_Q;
            pu1_dst[col] = CLIP_U8(acc);
        }
        pu1_mid_row += (scrtch_strd << 1);
        pu1_dst += dst_strd;
    }
}
999 
/**
********************************************************************************
*
*  @brief  Downscales one block of a layer by 2 in both directions and pads
*          the destination at the frame edges.
*
*  Blocks that lie within N_TAPS/2 samples of a frame edge are first copied
*  into a local buffer whose missing neighbours are filled by replicating the
*  edge row/column; all other blocks are filtered directly from the source.
*
*  @param[in]  pu1_src : source pointer; wd_offset and ht_offset are applied
*              to it inside this function
*  @param[in]  src_strd : source stride
*  @param[out] pu1_dst : destination pointer for this block row; only the
*              horizontal offset (wd_offset / 2) is applied inside
*  @param[in]  dst_strd : destination stride
*  @param[in]  wd : layer width (asserted to be even)
*  @param[in]  ht : layer height (asserted to be even)
*  @param[in]  pu1_wkg_mem : scratch memory handed to the scaling filter
*  @param[in]  ht_offset : vertical offset of the block within the layer
*  @param[in]  block_ht : block height (asserted to be <= MAX_CTB_SIZE)
*  @param[in]  wd_offset : horizontal offset of the block within the layer
*  @param[in]  block_wd : block width (asserted to be <= MAX_CTB_SIZE)
*  @param[in]  pf_copy_2d : 2D copy routine used to fill the local buffer
*  @param[in]  pf_scaling_filter_mxn : filter used when the (possibly reduced)
*              block width is a multiple of 16
*
*  @return void
*
********************************************************************************
*/
void ihevce_scale_by_2(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_dst,
    WORD32 dst_strd,
    WORD32 wd,
    WORD32 ht,
    UWORD8 *pu1_wkg_mem,
    WORD32 ht_offset,
    WORD32 block_ht,
    WORD32 wd_offset,
    WORD32 block_wd,
    FT_COPY_2D *pf_copy_2d,
    FT_SCALING_FILTER_BY_2 *pf_scaling_filter_mxn)
{
#define N_TAPS 7
#define MAX_BLK_SZ (MAX_CTB_SIZE + ((N_TAPS >> 1) << 1))
    /* local copy of a boundary block plus a border of N_TAPS/2 on each side */
    UWORD8 au1_cpy[MAX_BLK_SZ * MAX_BLK_SZ];
    UWORD32 cpy_strd = MAX_BLK_SZ;
    /* position of the block's top left sample inside au1_cpy */
    UWORD8 *pu1_cpy = au1_cpy + cpy_strd * (N_TAPS >> 1) + (N_TAPS >> 1);

    UWORD8 *pu1_in, *pu1_out;
    WORD32 in_strd, wkg_mem_strd;

    WORD32 row_start, row_end;
    WORD32 col_start, col_end;
    WORD32 i, fun_select;
    WORD32 ht_tmp, wd_tmp;
    FT_SCALING_FILTER_BY_2 *ihevce_scaling_filters[2];

    assert((wd & 1) == 0);
    assert((ht & 1) == 0);
    assert(block_wd <= MAX_CTB_SIZE);
    assert(block_ht <= MAX_CTB_SIZE);

    /* function pointers for filtering different dimensions */
    /* index 0: filter defined in this file, index 1: filter passed by the caller */
    ihevce_scaling_filters[0] = ihevce_scaling_filter_mxn;
    ihevce_scaling_filters[1] = pf_scaling_filter_mxn;

    /* handle boundary blks */
    /* a flag is 1 when fewer than N_TAPS/2 samples exist on that side of the block */
    col_start = (wd_offset < (N_TAPS >> 1)) ? 1 : 0;
    row_start = (ht_offset < (N_TAPS >> 1)) ? 1 : 0;
    col_end = ((wd_offset + block_wd) > (wd - (N_TAPS >> 1))) ? 1 : 0;
    row_end = ((ht_offset + block_ht) > (ht - (N_TAPS >> 1))) ? 1 : 0;
    /* last block of a row / column may be partial: shrink it to the remainder */
    if(col_end && (wd % block_wd != 0))
    {
        block_wd = (wd % block_wd);
    }
    if(row_end && (ht % block_ht != 0))
    {
        block_ht = (ht % block_ht);
    }

    /* boundary blks needs to be padded, copy src to tmp buffer */
    if(col_start || col_end || row_end || row_start)
    {
        UWORD8 *pu1_src_tmp = pu1_src + wd_offset + ht_offset * src_strd;

        /* on sides that are not a frame edge the 3 neighbouring rows/columns
         * exist in the source, so start the copy 3 samples earlier and copy
         * 3 samples more; the block itself always lands at
         * au1_cpy + 3 * cpy_strd + 3 */
        pu1_cpy -= (3 * (1 - col_start) + cpy_strd * 3 * (1 - row_start));
        pu1_src_tmp -= (3 * (1 - col_start) + src_strd * 3 * (1 - row_start));
        ht_tmp = block_ht + 3 * (1 - row_start) + 3 * (1 - row_end);
        wd_tmp = block_wd + 3 * (1 - col_start) + 3 * (1 - col_end);
        pf_copy_2d(pu1_cpy, cpy_strd, pu1_src_tmp, src_strd, wd_tmp, ht_tmp);
        pu1_in = au1_cpy + cpy_strd * 3 + 3;
        in_strd = cpy_strd;
    }
    else
    {
        /* interior block: all neighbours exist, filter straight from the source */
        pu1_in = pu1_src + wd_offset + ht_offset * src_strd;
        in_strd = src_strd;
    }

    /*top padding*/
    /* replicate the first block row (full buffer width of block_wd + 6) into
     * the 3 rows above it */
    if(row_start)
    {
        UWORD8 *pu1_cpy_tmp = au1_cpy + cpy_strd * 3;

        pu1_cpy = au1_cpy + cpy_strd * (3 - 1);
        memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6);
        pu1_cpy -= cpy_strd;
        memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6);
        pu1_cpy -= cpy_strd;
        memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6);
    }

    /*bottom padding*/
    /* replicate the last block row into the 3 rows below it */
    if(row_end)
    {
        UWORD8 *pu1_cpy_tmp = au1_cpy + cpy_strd * 3 + (block_ht - 1) * cpy_strd;

        pu1_cpy = pu1_cpy_tmp + cpy_strd;
        memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6);
        pu1_cpy += cpy_strd;
        memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6);
        pu1_cpy += cpy_strd;
        memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6);
    }

    /*left padding*/
    /* done after the row padding and over all block_ht + 6 rows, so the
     * border columns of the padded rows get filled as well */
    if(col_start)
    {
        UWORD8 *pu1_cpy_tmp = au1_cpy + 3;

        pu1_cpy = au1_cpy;
        for(i = 0; i < block_ht + 6; i++)
        {
            pu1_cpy[0] = pu1_cpy[1] = pu1_cpy[2] = pu1_cpy_tmp[0];
            pu1_cpy += cpy_strd;
            pu1_cpy_tmp += cpy_strd;
        }
    }

    /*right padding*/
    /* replicate the last block column into the 3 columns to its right */
    if(col_end)
    {
        UWORD8 *pu1_cpy_tmp = au1_cpy + 3 + block_wd - 1;

        pu1_cpy = au1_cpy + 3 + block_wd;
        for(i = 0; i < block_ht + 6; i++)
        {
            pu1_cpy[0] = pu1_cpy[1] = pu1_cpy[2] = pu1_cpy_tmp[0];
            pu1_cpy += cpy_strd;
            pu1_cpy_tmp += cpy_strd;
        }
    }

    /* scratch rows hold the horizontally decimated block, hence block_wd / 2 */
    wkg_mem_strd = block_wd >> 1;
    pu1_out = pu1_dst + (wd_offset >> 1);
    /* caller supplied filter only for widths that are a multiple of 16 */
    fun_select = (block_wd % 16 == 0);
    ihevce_scaling_filters[fun_select](
        pu1_in, in_strd, pu1_wkg_mem, wkg_mem_strd, pu1_out, dst_strd, block_ht, block_wd);

    /* Left padding of 16 for 1st block of every row */
    if(wd_offset == 0)
    {
        UWORD8 u1_val;
        WORD32 pad_wd = 16;
        WORD32 pad_ht = block_ht >> 1;
        UWORD8 *dst = pu1_dst;

        /* replicate the first output sample of each row into the 16 bytes before it */
        for(i = 0; i < pad_ht; i++)
        {
            u1_val = dst[0];
            memset(&dst[-pad_wd], u1_val, pad_wd);
            dst += dst_strd;
        }
    }

    /* everything below runs only for the last block of a block row */
    if(wd == wd_offset + block_wd)
    {
        /* Right padding of (16 + (CEIL16(wd/2))-wd/2) for last block of every row */
        /* Right padding is done only after processing of last block of that row is done*/
        /* note: the width computed below additionally includes 4 bytes */
        UWORD8 u1_val;
        WORD32 pad_wd = 16 + CEIL16((wd >> 1)) - (wd >> 1) + 4;
        WORD32 pad_ht = block_ht >> 1;
        UWORD8 *dst = pu1_dst + (wd >> 1) - 1;

        for(i = 0; i < pad_ht; i++)
        {
            u1_val = dst[0];
            memset(&dst[1], u1_val, pad_wd);
            dst += dst_strd;
        }

        if(ht_offset == 0)
        {
            /* Top padding of 16 is done for 1st row only after we reach end of that row */
            /* each copy is dst_strd bytes long and starts at the left padding
             * (pu1_dst - 16) of the first output row */
            pad_wd = dst_strd;
            pad_ht = 16;
            dst = pu1_dst - 16;
            for(i = 1; i <= pad_ht; i++)
            {
                memcpy(dst - (i * dst_strd), dst, pad_wd);
            }
        }

        /* Bottom padding of (16 + (CEIL16(ht/2)) - ht/2) is done only if we have
         reached end of frame */
        /* note: the row count computed below additionally includes 4 rows */
        if(ht - ht_offset - block_ht == 0)
        {
            pad_wd = dst_strd;
            pad_ht = 16 + CEIL16((ht >> 1)) - (ht >> 1) + 4;
            /* source of the copies is the last output row of this block row */
            dst = pu1_dst + (((block_ht >> 1) - 1) * dst_strd) - 16;
            for(i = 1; i <= pad_ht; i++)
                memcpy(dst + (i * dst_strd), dst, pad_wd);
        }
    }
}
1188 
1189 /*!
1190 ******************************************************************************
1191 * \if Function name : ihevce_decomp_pre_intra_process_row \endif
1192 *
1193 * \brief
1194 *  Row level function which down scales a given row by 2 in horz and vertical
1195 *  direction creates output of size wd/2 * ht/2. When decomposition is done
1196 *  from L1 to L2 pre intra analysis is done on L1
1197 *
1198 *****************************************************************************
1199 */
ihevce_decomp_pre_intra_process_row(UWORD8 * pu1_src,WORD32 src_stride,UWORD8 * pu1_dst_decomp,WORD32 dst_stride,WORD32 layer_wd,WORD32 layer_ht,UWORD8 * pu1_wkg_mem,WORD32 ht_offset,WORD32 block_ht,WORD32 block_wd,WORD32 num_col_blks,WORD32 layer_no,ihevce_ed_ctxt_t * ps_ed_ctxt,ihevce_ed_blk_t * ps_ed_row,ihevce_ed_ctb_l1_t * ps_ed_ctb_l1_row,WORD32 num_4x4_blks_ctb_y,WORD32 num_4x4_blks_last_ctb_x,WORD32 skip_decomp,WORD32 skip_pre_intra,WORD32 row_block_no,ctb_analyse_t * ps_ctb_analyse,ihevce_ipe_optimised_function_list_t * ps_ipe_optimised_function_list,ihevce_cmn_opt_func_t * ps_cmn_utils_optimised_function_list)1200 void ihevce_decomp_pre_intra_process_row(
1201     UWORD8 *pu1_src,
1202     WORD32 src_stride,
1203     UWORD8 *pu1_dst_decomp,
1204     WORD32 dst_stride,
1205     WORD32 layer_wd,
1206     WORD32 layer_ht,
1207     UWORD8 *pu1_wkg_mem,
1208     WORD32 ht_offset,
1209     WORD32 block_ht,
1210     WORD32 block_wd,
1211     WORD32 num_col_blks,
1212     WORD32 layer_no,
1213     ihevce_ed_ctxt_t *ps_ed_ctxt,
1214     ihevce_ed_blk_t *ps_ed_row,
1215     ihevce_ed_ctb_l1_t *ps_ed_ctb_l1_row,
1216     WORD32 num_4x4_blks_ctb_y,
1217     WORD32 num_4x4_blks_last_ctb_x,
1218     WORD32 skip_decomp,
1219     WORD32 skip_pre_intra,
1220     WORD32 row_block_no,
1221     ctb_analyse_t *ps_ctb_analyse,
1222     ihevce_ipe_optimised_function_list_t *ps_ipe_optimised_function_list,
1223     ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list)
1224 {
1225     WORD32 do_pre_intra_analysis = ((layer_no == 1) || (layer_no == 2)) && (!skip_pre_intra);
1226     WORD32 col_block_no;
1227     WORD32 i, j;
1228 
1229     if(!skip_decomp)
1230     {
1231         ctb_analyse_t *ps_ctb_analyse_curr = ps_ctb_analyse + row_block_no * num_col_blks;
1232 
1233         for(col_block_no = 0; col_block_no < num_col_blks; col_block_no++)
1234         {
1235             ihevce_scale_by_2(
1236                 pu1_src,
1237                 src_stride,
1238                 pu1_dst_decomp,
1239                 dst_stride,
1240                 layer_wd,
1241                 layer_ht,
1242                 pu1_wkg_mem,
1243                 ht_offset,
1244                 block_ht,
1245                 block_wd * col_block_no,
1246                 block_wd,
1247                 ps_cmn_utils_optimised_function_list->pf_copy_2d,
1248                 ps_ipe_optimised_function_list->pf_scaling_filter_mxn);
1249 
1250             /* Disable noise detection */
1251             memset(
1252                 ps_ctb_analyse_curr->s_ctb_noise_params.au1_is_8x8Blk_noisy,
1253                 0,
1254                 sizeof(ps_ctb_analyse_curr->s_ctb_noise_params.au1_is_8x8Blk_noisy));
1255 
1256             ps_ctb_analyse_curr->s_ctb_noise_params.i4_noise_present = 0;
1257 
1258             ps_ctb_analyse_curr++;
1259         }
1260     }
1261 
1262     if(do_pre_intra_analysis)
1263     {
1264         ihevce_ed_blk_t *ps_ed_ctb = ps_ed_row;
1265         ihevce_ed_ctb_l1_t *ps_ed_ctb_l1 = ps_ed_ctb_l1_row;
1266         WORD32 *nbr_flags_ptr = &ps_ed_ctxt->ai4_nbr_flags[0];
1267         UWORD8 *pu1_src_pre_intra = pu1_src + (ht_offset * src_stride);
1268         WORD32 num_4x4_blks_in_ctb = block_wd >> 2;
1269         WORD32 src_inc_pre_intra = num_4x4_blks_in_ctb * 4;
1270         WORD32 inc_ctb = num_4x4_blks_in_ctb * num_4x4_blks_in_ctb;
1271 
1272         /* To analyse any given CTB we need to set the availability flags of the
1273          * following neighbouring CTB: BL,L,TL,T,TR */
1274         /* copy the neighbor flags for a general ctb (ctb inside the frame); not any corners */
1275         memcpy(
1276             ps_ed_ctxt->ai4_nbr_flags,
1277             gau4_nbr_flags_8x8_4x4blks,
1278             sizeof(gau4_nbr_flags_8x8_4x4blks));
1279 
1280         /* set top flags unavailable for first ctb row */
1281         if(ht_offset == 0)
1282         {
1283             for(j = 0; j < num_4x4_blks_in_ctb; j++)
1284             {
1285                 SET_T_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j]);
1286                 SET_TR_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j]);
1287                 SET_TL_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j]);
1288             }
1289         }
1290 
1291         /* set bottom left flags as not available for last row */
1292         if(ht_offset + block_ht >= layer_ht)
1293         {
1294             for(j = 0; j < num_4x4_blks_in_ctb; j++)
1295             {
1296                 SET_BL_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[(num_4x4_blks_ctb_y - 1) * 8 + j]);
1297             }
1298         }
1299 
1300         /* set left flags unavailable for 1st ctb col */
1301         for(j = 0; j < num_4x4_blks_ctb_y; j++)
1302         {
1303             SET_L_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j * 8]);
1304             SET_BL_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j * 8]);
1305             SET_TL_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j * 8]);
1306         }
1307 
1308         for(col_block_no = 0; col_block_no < num_col_blks; col_block_no++)
1309         {
1310             if(col_block_no == 1)
1311             {
1312                 /* For the rest of the ctbs, set left flags available */
1313                 for(j = 0; j < num_4x4_blks_ctb_y; j++)
1314                 {
1315                     SET_L_AVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j * 8]);
1316                 }
1317                 for(j = 0; j < num_4x4_blks_ctb_y - 1; j++)
1318                 {
1319                     SET_BL_AVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j * 8]);
1320                     SET_TL_AVAILABLE(ps_ed_ctxt->ai4_nbr_flags[(j + 1) * 8]);
1321                 }
1322                 if(ht_offset != 0)
1323                 {
1324                     SET_TL_AVAILABLE(ps_ed_ctxt->ai4_nbr_flags[0]);
1325                 }
1326             }
1327 
1328             if(col_block_no == num_col_blks - 1)
1329             {
1330                 /* set top right flags unavailable for last ctb col */
1331                 for(i = 0; i < num_4x4_blks_ctb_y; i++)
1332                 {
1333                     SET_TR_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[i * 8 + num_4x4_blks_last_ctb_x - 1]);
1334                 }
1335             }
1336 
1337             /* Call intra analysis for the ctb */
1338             ihevce_ed_calc_ctb(
1339                 ps_ed_ctxt,
1340                 ps_ed_ctb,
1341                 ps_ed_ctb_l1,
1342                 pu1_src_pre_intra,
1343                 src_stride,
1344                 (col_block_no == num_col_blks - 1) ? num_4x4_blks_last_ctb_x : num_4x4_blks_in_ctb,
1345                 num_4x4_blks_ctb_y,
1346                 nbr_flags_ptr,
1347                 layer_no,
1348                 ps_ipe_optimised_function_list,
1349                 ps_cmn_utils_optimised_function_list);
1350             pu1_src_pre_intra += src_inc_pre_intra;
1351             ps_ed_ctb += inc_ctb;
1352             ps_ed_ctb_l1 += 1;
1353         }
1354     }
1355 }
1356 
1357 /*!
1358 ******************************************************************************
1359 * \if Function name : ihevce_decomp_pre_intra_process \endif
1360 *
1361 * \brief
1362 *  Frame level function to decompose given layer L0 into coarser layers and
1363 *  perform intra analysis on layers below L0
1364 *
1365 *****************************************************************************
1366 */
ihevce_decomp_pre_intra_process(void * pv_ctxt,ihevce_lap_output_params_t * ps_lap_out_prms,frm_ctb_ctxt_t * ps_frm_ctb_prms,void * pv_multi_thrd_ctxt,WORD32 thrd_id,WORD32 i4_ping_pong)1367 void ihevce_decomp_pre_intra_process(
1368     void *pv_ctxt,
1369     ihevce_lap_output_params_t *ps_lap_out_prms,
1370     frm_ctb_ctxt_t *ps_frm_ctb_prms,
1371     void *pv_multi_thrd_ctxt,
1372     WORD32 thrd_id,
1373     WORD32 i4_ping_pong)
1374 {
1375     ihevce_decomp_pre_intra_master_ctxt_t *ps_master_ctxt = pv_ctxt;
1376     ihevce_decomp_pre_intra_ctxt_t *ps_ctxt = ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[thrd_id];
1377     multi_thrd_ctxt_t *ps_multi_thrd = (multi_thrd_ctxt_t *)pv_multi_thrd_ctxt;
1378     WORD32 i4_num_layers = ps_ctxt->i4_num_layers;
1379     UWORD8 *pu1_wkg_mem = ps_ctxt->au1_wkg_mem;
1380     ihevce_ed_ctxt_t *ps_ed_ctxt = ps_ctxt->ps_ed_ctxt;
1381     ihevce_ed_ctb_l1_t *ps_ed_ctb_l1 = ps_ed_ctxt->ps_ed_ctb_l1;
1382     ihevce_ed_blk_t *ps_ed;
1383     WORD32 i4_layer_no;
1384     WORD32 end_of_layer;
1385     UWORD8 *pu1_src, *pu1_dst;
1386     WORD32 src_stride, dst_stride;
1387     WORD32 i4_layer_wd, i4_layer_ht;
1388     WORD32 ht_offset, block_ht, row_block_no, num_row_blocks;
1389     WORD32 block_wd, num_col_blks;
1390     WORD32 skip_decomp, skip_pre_intra;
1391     WORD32 inc_ctb;
1392 
1393     ASSERT(i4_num_layers >= 3);
1394     ps_ctxt->as_layers[0].pu1_inp = (UWORD8 *)ps_lap_out_prms->s_input_buf.pv_y_buf;
1395     ps_ctxt->as_layers[0].i4_inp_stride = ps_lap_out_prms->s_input_buf.i4_y_strd;
1396     ps_ctxt->as_layers[0].i4_actual_wd = ps_lap_out_prms->s_input_buf.i4_y_wd;
1397     ps_ctxt->as_layers[0].i4_actual_ht = ps_lap_out_prms->s_input_buf.i4_y_ht;
1398 
1399     /* This loop does decomp & intra by picking jobs from job queue */
1400     for(i4_layer_no = 0; i4_layer_no < i4_num_layers; i4_layer_no++)
1401     {
1402         WORD32 idx = 0;
1403 
1404         src_stride = ps_ctxt->as_layers[i4_layer_no].i4_inp_stride;
1405         pu1_src = ps_ctxt->as_layers[i4_layer_no].pu1_inp;
1406         i4_layer_wd = ps_ctxt->as_layers[i4_layer_no].i4_actual_wd;
1407         i4_layer_ht = ps_ctxt->as_layers[i4_layer_no].i4_actual_ht;
1408         pu1_dst = ps_ctxt->as_layers[i4_layer_no + 1].pu1_inp;
1409         dst_stride = ps_ctxt->as_layers[i4_layer_no + 1].i4_inp_stride;
1410         block_wd = ps_ctxt->as_layers[i4_layer_no].i4_decomp_blk_wd;
1411         block_ht = ps_ctxt->as_layers[i4_layer_no].i4_decomp_blk_ht;
1412         num_col_blks = ps_ctxt->as_layers[i4_layer_no].i4_num_col_blks;
1413         num_row_blocks = ps_ctxt->as_layers[i4_layer_no].i4_num_row_blks;
1414         inc_ctb = (block_wd >> 2) * (block_wd >> 2);
1415         end_of_layer = 0;
1416         skip_pre_intra = 1;
1417         skip_decomp = 0;
1418         if(i4_layer_no >= (ps_ctxt->i4_num_layers - 1))
1419         {
1420             skip_decomp = 1;
1421         }
1422 
1423         /* ------------ Loop over all the CTB rows & perform Decomp --------------- */
1424         while(0 == end_of_layer)
1425         {
1426             job_queue_t *ps_pre_enc_job;
1427             WORD32 num_4x4_blks_ctb_y = 0, num_4x4_blks_last_ctb_x = 0;
1428 
1429             /* Get the current row from the job queue */
1430             ps_pre_enc_job = (job_queue_t *)ihevce_pre_enc_grp_get_next_job(
1431                 pv_multi_thrd_ctxt, (DECOMP_JOB_LYR0 + i4_layer_no), 1, i4_ping_pong);
1432 
1433             /* If all rows are done, set the end of layer flag to 1, */
1434             if(NULL == ps_pre_enc_job)
1435             {
1436                 end_of_layer = 1;
1437             }
1438             else
1439             {
1440                 /* Obtain the current row's details from the job */
1441                 row_block_no = ps_pre_enc_job->s_job_info.s_decomp_job_info.i4_vert_unit_row_no;
1442                 ps_ctxt->as_layers[i4_layer_no].ai4_curr_row_no[idx] = row_block_no;
1443                 ht_offset = row_block_no * block_ht;
1444 
1445                 if(row_block_no < (num_row_blocks))
1446                 {
1447                     pu1_dst = ps_ctxt->as_layers[i4_layer_no + 1].pu1_inp +
1448                               ((block_ht >> 1) * dst_stride * row_block_no);
1449 
1450                     /* call the row level processing function */
1451                     ihevce_decomp_pre_intra_process_row(
1452                         pu1_src,
1453                         src_stride,
1454                         pu1_dst,
1455                         dst_stride,
1456                         i4_layer_wd,
1457                         i4_layer_ht,
1458                         pu1_wkg_mem,
1459                         ht_offset,
1460                         block_ht,
1461                         block_wd,
1462                         num_col_blks,
1463                         i4_layer_no,
1464                         ps_ed_ctxt,
1465                         ps_ed,
1466                         ps_ed_ctb_l1,
1467                         num_4x4_blks_ctb_y,
1468                         num_4x4_blks_last_ctb_x,
1469                         skip_decomp,
1470                         skip_pre_intra,
1471                         row_block_no,
1472                         ps_ctxt->ps_ctb_analyse,
1473                         &ps_ctxt->s_ipe_optimised_function_list,
1474                         &ps_ctxt->s_cmn_opt_func);
1475                 }
1476                 idx++;
1477                 /* set the output dependency */
1478                 ihevce_pre_enc_grp_job_set_out_dep(
1479                     pv_multi_thrd_ctxt, ps_pre_enc_job, i4_ping_pong);
1480             }
1481         }
1482         ps_ctxt->as_layers[i4_layer_no].i4_num_rows_processed = idx;
1483 
1484         /* ------------ For the same rows perform preintra if required --------------- */
1485         ihevce_ed_frame_init(ps_ed_ctxt, i4_layer_no);
1486 
1487         if((1 == i4_layer_no) && (IHEVCE_QUALITY_P6 == ps_ctxt->i4_quality_preset))
1488         {
1489             WORD32 vert_ctr, ctb_ctr, i;
1490             WORD32 ctb_ctr_blks = ps_ctxt->as_layers[1].i4_num_col_blks;
1491             WORD32 vert_ctr_blks = ps_ctxt->as_layers[1].i4_num_row_blks;
1492 
1493             if((ps_ctxt->i4_quality_preset == IHEVCE_QUALITY_P6) &&
1494                (ps_lap_out_prms->i4_temporal_lyr_id > TEMPORAL_LAYER_DISABLE))
1495             {
1496                 for(vert_ctr = 0; vert_ctr < vert_ctr_blks; vert_ctr++)
1497                 {
1498                     ihevce_ed_ctb_l1_t *ps_ed_ctb_row_l1 =
1499                         ps_ctxt->ps_ed_ctb_l1 + vert_ctr * ps_frm_ctb_prms->i4_num_ctbs_horz;
1500 
1501                     for(ctb_ctr = 0; ctb_ctr < ctb_ctr_blks; ctb_ctr++)
1502                     {
1503                         ihevce_ed_ctb_l1_t *ps_ed_ctb_curr_l1 = ps_ed_ctb_row_l1 + ctb_ctr;
1504 
1505                         for(i = 0; i < 16; i++)
1506                         {
1507                             ps_ed_ctb_curr_l1->i4_best_sad_cost_8x8_l1_ipe[i] = 0x7fffffff;
1508                             ps_ed_ctb_curr_l1->i4_best_sad_8x8_l1_ipe[i] = 0x7fffffff;
1509                         }
1510                     }
1511                 }
1512             }
1513         }
1514 
1515 #if DISABLE_L2_IPE_IN_PB_L1_IN_B
1516         if(((2 == i4_layer_no) && (ps_lap_out_prms->i4_pic_type == IV_I_FRAME ||
1517                                    ps_lap_out_prms->i4_pic_type == IV_IDR_FRAME)) ||
1518            ((1 == i4_layer_no) &&
1519             (ps_lap_out_prms->i4_temporal_lyr_id <= TEMPORAL_LAYER_DISABLE)) ||
1520            ((IHEVCE_QUALITY_P6 != ps_ctxt->i4_quality_preset) && (0 != i4_layer_no)))
1521 #else
1522         if((0 != i4_layer_no) &&
1523            (1 != ((IHEVCE_QUALITY_P6 == ps_ctxt->i4_quality_preset) &&
1524                   (ps_lap_out_prms->i4_temporal_lyr_id > TEMPORAL_LAYER_DISABLE))))
1525 #endif
1526         {
1527             WORD32 i4_num_rows = ps_ctxt->as_layers[i4_layer_no].i4_num_rows_processed;
1528 
1529             ps_ed_ctxt->lambda = ps_ctxt->ai4_lambda[i4_layer_no];
1530             if(0 == i4_layer_no)
1531             {
1532                 ps_ed_ctxt->ps_ed_pic = NULL;
1533                 ps_ed_ctxt->ps_ed = NULL;
1534                 ps_ed_ctxt->ps_ed_ctb_l1_pic = NULL;
1535                 ps_ed_ctxt->ps_ed_ctb_l1 = NULL;
1536             }
1537             else if(1 == i4_layer_no)
1538             {
1539                 ps_ed_ctxt->ps_ed_pic = ps_ctxt->ps_layer1_buf;
1540                 ps_ed_ctxt->ps_ed = ps_ctxt->ps_layer1_buf;
1541                 ps_ed_ctxt->ps_ed_ctb_l1_pic = ps_ctxt->ps_ed_ctb_l1;
1542                 ps_ed_ctxt->ps_ed_ctb_l1 = ps_ctxt->ps_ed_ctb_l1;
1543             }
1544             else if(2 == i4_layer_no)
1545             {
1546                 ps_ed_ctxt->ps_ed_pic = ps_ctxt->ps_layer2_buf;
1547                 ps_ed_ctxt->ps_ed = ps_ctxt->ps_layer2_buf;
1548                 ps_ed_ctxt->ps_ed_ctb_l1_pic = NULL;
1549                 ps_ed_ctxt->ps_ed_ctb_l1 = NULL;
1550             }
1551 
1552             skip_decomp = 1;
1553             skip_pre_intra = 0;
1554 
1555             for(idx = 0; idx < i4_num_rows; idx++)
1556             {
1557                 WORD32 num_4x4_blks_ctb_y = 0, num_4x4_blks_last_ctb_x = 0;
1558 
1559                 /* Obtain the current row's details from the job */
1560                 row_block_no = ps_ctxt->as_layers[i4_layer_no].ai4_curr_row_no[idx];
1561                 ht_offset = row_block_no * block_ht;
1562 
1563                 if(row_block_no < (num_row_blocks))
1564                 {
1565                     pu1_dst = ps_ctxt->as_layers[i4_layer_no + 1].pu1_inp +
1566                               ((block_ht >> 1) * dst_stride * row_block_no);
1567 
1568                     if(i4_layer_no == 1 || i4_layer_no == 2)
1569                     {
1570                         ps_ed = ps_ed_ctxt->ps_ed + (row_block_no * inc_ctb * (num_col_blks));
1571                         ps_ed_ctb_l1 = ps_ed_ctxt->ps_ed_ctb_l1 + (row_block_no * num_col_blks);
1572                         ps_ed_ctxt->i4_quality_preset = ps_ctxt->i4_quality_preset;
1573                         num_4x4_blks_last_ctb_x = block_wd >> 2;
1574                         num_4x4_blks_ctb_y = block_ht >> 2;
1575                         if(row_block_no == num_row_blocks - 1)
1576                         {
1577                             if(i4_layer_ht % block_ht)
1578                             {
1579                                 num_4x4_blks_ctb_y = ((i4_layer_ht % block_ht) + 3) >> 2;
1580                             }
1581                         }
1582                         if(i4_layer_wd % block_wd)
1583                         {
1584                             num_4x4_blks_last_ctb_x = ((i4_layer_wd % block_wd) + 3) >> 2;
1585                         }
1586                     }
1587 
1588                     /* call the row level processing function */
1589                     ihevce_decomp_pre_intra_process_row(
1590                         pu1_src,
1591                         src_stride,
1592                         pu1_dst,
1593                         dst_stride,
1594                         i4_layer_wd,
1595                         i4_layer_ht,
1596                         pu1_wkg_mem,
1597                         ht_offset,
1598                         block_ht,
1599                         block_wd,
1600                         num_col_blks,
1601                         i4_layer_no,
1602                         ps_ed_ctxt,
1603                         ps_ed,
1604                         ps_ed_ctb_l1,
1605                         num_4x4_blks_ctb_y,
1606                         num_4x4_blks_last_ctb_x,
1607                         skip_decomp,
1608                         skip_pre_intra,
1609                         row_block_no,
1610                         NULL,
1611                         &ps_ctxt->s_ipe_optimised_function_list,
1612                         &ps_ctxt->s_cmn_opt_func);
1613                 }
1614 
1615                 if(1 == i4_layer_no)
1616                 {
1617                     ps_multi_thrd->aai4_l1_pre_intra_done[i4_ping_pong][row_block_no] = 1;
1618                 }
1619             }
1620             for(idx = 0; idx < MAX_NUM_CTB_ROWS_FRM; idx++)
1621             {
1622                 ps_ctxt->as_layers[i4_layer_no].ai4_curr_row_no[idx] = -1;
1623             }
1624             ps_ctxt->as_layers[i4_layer_no].i4_num_rows_processed = 0;
1625         }
1626 
1627 #if DISABLE_L2_IPE_IN_PB_L1_IN_B
1628         if((IHEVCE_QUALITY_P6 == ps_ctxt->i4_quality_preset) &&
1629            (((i4_layer_no == 2) && (ps_lap_out_prms->i4_pic_type == ISLICE)) ||
1630             ((i4_layer_no == 1) && (ps_lap_out_prms->i4_temporal_lyr_id > TEMPORAL_LAYER_DISABLE))))
1631         {
1632             WORD32 i4_num_rows = ps_ctxt->as_layers[i4_layer_no].i4_num_rows_processed;
1633             if(1 == i4_layer_no)
1634             {
1635                 for(idx = 0; idx < i4_num_rows; idx++)
1636                 {
1637                     row_block_no = ps_ctxt->as_layers[i4_layer_no].ai4_curr_row_no[idx];
1638 
1639                     {
1640                         ps_multi_thrd->aai4_l1_pre_intra_done[i4_ping_pong][row_block_no] = 1;
1641                     }
1642                 }
1643             }
1644             for(idx = 0; idx < MAX_NUM_CTB_ROWS_FRM; idx++)
1645             {
1646                 ps_ctxt->as_layers[i4_layer_no].ai4_curr_row_no[idx] = -1;
1647             }
1648             ps_ctxt->as_layers[i4_layer_no].i4_num_rows_processed = 0;
1649         }
1650 #else
1651         if((i4_layer_no != 0) && ((IHEVCE_QUALITY_P6 == ps_ctxt->i4_quality_preset) &&
1652                                   (ps_lap_out_prms->i4_temporal_lyr_id > TEMPORAL_LAYER_DISABLE)))
1653         {
1654             WORD32 i4_num_rows = ps_ctxt->as_layers[i4_layer_no].i4_num_rows_processed;
1655             for(idx = 0; idx < i4_num_rows; idx++)
1656             {
1657                 row_block_no = ps_ctxt->as_layers[i4_layer_no].ai4_curr_row_no[idx];
1658                 if(1 == i4_layer_no)
1659                 {
1660                     ps_multi_thrd->aai4_l1_pre_intra_done[i4_ping_pong][row_block_no] = 1;
1661                 }
1662             }
1663             for(idx = 0; idx < MAX_NUM_CTB_ROWS_FRM; idx++)
1664             {
1665                 ps_ctxt->as_layers[i4_layer_no].ai4_curr_row_no[idx] = -1;
1666             }
1667             ps_ctxt->as_layers[i4_layer_no].i4_num_rows_processed = 0;
1668         }
1669 #endif
1670     }
1671 }
1672 
1673 /*!
1674 ************************************************************************
1675 * \brief
1676 *    return number of records used by decomp pre intra
1677 *
1678 ************************************************************************
1679 */
ihevce_decomp_pre_intra_get_num_mem_recs(void)1680 WORD32 ihevce_decomp_pre_intra_get_num_mem_recs(void)
1681 {
1682     return (NUM_DECOMP_PRE_INTRA_MEM_RECS);
1683 }
1684 
1685 /*!
1686 ************************************************************************
1687 * @brief
1688 *    return each record attributes of  decomp pre intra
1689 ************************************************************************
1690 */
ihevce_decomp_pre_intra_get_mem_recs(iv_mem_rec_t * ps_mem_tab,WORD32 i4_num_proc_thrds,WORD32 i4_mem_space)1691 WORD32 ihevce_decomp_pre_intra_get_mem_recs(
1692     iv_mem_rec_t *ps_mem_tab, WORD32 i4_num_proc_thrds, WORD32 i4_mem_space)
1693 {
1694     /* memories should be requested assuming worst case requirememnts */
1695 
1696     /* Module context structure */
1697     ps_mem_tab[DECOMP_PRE_INTRA_CTXT].i4_mem_size = sizeof(ihevce_decomp_pre_intra_master_ctxt_t);
1698     ps_mem_tab[DECOMP_PRE_INTRA_CTXT].e_mem_type = (IV_MEM_TYPE_T)i4_mem_space;
1699     ps_mem_tab[DECOMP_PRE_INTRA_CTXT].i4_mem_alignment = 8;
1700 
1701     /* Thread context structure */
1702     ps_mem_tab[DECOMP_PRE_INTRA_THRDS_CTXT].i4_mem_size =
1703         i4_num_proc_thrds * sizeof(ihevce_decomp_pre_intra_ctxt_t);
1704     ps_mem_tab[DECOMP_PRE_INTRA_THRDS_CTXT].e_mem_type = (IV_MEM_TYPE_T)i4_mem_space;
1705     ps_mem_tab[DECOMP_PRE_INTRA_THRDS_CTXT].i4_mem_alignment = 8;
1706 
1707     /* early decision context structure */
1708     ps_mem_tab[DECOMP_PRE_INTRA_ED_CTXT].i4_mem_size = i4_num_proc_thrds * sizeof(ihevce_ed_ctxt_t);
1709     ps_mem_tab[DECOMP_PRE_INTRA_ED_CTXT].e_mem_type = (IV_MEM_TYPE_T)i4_mem_space;
1710     ps_mem_tab[DECOMP_PRE_INTRA_ED_CTXT].i4_mem_alignment = 8;
1711 
1712     return (NUM_DECOMP_PRE_INTRA_MEM_RECS);
1713 }
1714 
1715 /*!
1716 ************************************************************************
1717 * @brief
1718 *    Init decomp pre intra context
1719 ************************************************************************
1720 */
ihevce_decomp_pre_intra_init(iv_mem_rec_t * ps_mem_tab,ihevce_static_cfg_params_t * ps_init_prms,WORD32 i4_num_proc_thrds,func_selector_t * ps_func_selector,WORD32 i4_resolution_id,UWORD8 u1_is_popcnt_available)1721 void *ihevce_decomp_pre_intra_init(
1722     iv_mem_rec_t *ps_mem_tab,
1723     ihevce_static_cfg_params_t *ps_init_prms,
1724     WORD32 i4_num_proc_thrds,
1725     func_selector_t *ps_func_selector,
1726     WORD32 i4_resolution_id,
1727     UWORD8 u1_is_popcnt_available)
1728 {
1729     ihevce_decomp_pre_intra_master_ctxt_t *ps_mstr_ctxt = ps_mem_tab[DECOMP_PRE_INTRA_CTXT].pv_base;
1730     ihevce_decomp_pre_intra_ctxt_t *ps_ctxt = ps_mem_tab[DECOMP_PRE_INTRA_THRDS_CTXT].pv_base;
1731     ihevce_ed_ctxt_t *ps_ed_ctxt = ps_mem_tab[DECOMP_PRE_INTRA_ED_CTXT].pv_base;
1732     ihevce_tgt_params_t *ps_tgt_prms = &ps_init_prms->s_tgt_lyr_prms.as_tgt_params[i4_resolution_id];
1733     WORD32 min_cu_size = 1 << ps_init_prms->s_config_prms.i4_min_log2_cu_size;
1734     WORD32 a_wd[MAX_NUM_HME_LAYERS], a_ht[MAX_NUM_HME_LAYERS];
1735     WORD32 a_disp_wd[MAX_NUM_LAYERS], a_disp_ht[MAX_NUM_LAYERS];
1736     WORD32 n_tot_layers;
1737     WORD32 i, j, k;
1738 
1739     /* Get the height and width of each layer */
1740     *a_wd = ps_tgt_prms->i4_width + SET_CTB_ALIGN(ps_tgt_prms->i4_width, min_cu_size);
1741     *a_ht = ps_tgt_prms->i4_height + SET_CTB_ALIGN(ps_tgt_prms->i4_height, min_cu_size);
1742     n_tot_layers = hme_derive_num_layers(1, a_wd, a_ht, a_disp_wd, a_disp_ht);
1743     ps_mstr_ctxt->i4_num_proc_thrds = i4_num_proc_thrds;
1744     for(i = 0; i < ps_mstr_ctxt->i4_num_proc_thrds; i++)
1745     {
1746         ps_mstr_ctxt->aps_decomp_pre_intra_thrd_ctxt[i] = ps_ctxt;
1747         ps_ctxt->i4_num_layers = n_tot_layers;
1748         ps_ctxt->ps_ed_ctxt = ps_ed_ctxt;
1749         for(j = 0; j < n_tot_layers; j++)
1750         {
1751             /** If CTB size= 64, decomp_blk_wd = 64 for L0, 32 for L1 , 16 for L2, 8 for L3 */
1752             WORD32 max_ctb_size = 1 << ps_init_prms->s_config_prms.i4_max_log2_cu_size;
1753             WORD32 decomp_blk_wd = max_ctb_size >> j;
1754             WORD32 decomp_blk_ht = max_ctb_size >> j;
1755 
1756             ps_ctxt->as_layers[j].i4_actual_wd = a_wd[j];
1757             ps_ctxt->as_layers[j].i4_actual_ht = a_ht[j];
1758             if(0 == j)
1759             {
1760                 ps_ctxt->as_layers[j].i4_padded_ht = a_ht[j];
1761                 ps_ctxt->as_layers[j].i4_padded_wd = a_wd[j];
1762             }
1763             else
1764             {
1765                 ps_ctxt->as_layers[j].i4_padded_ht = a_ht[j] + 32 + 4;
1766                 ps_ctxt->as_layers[j].i4_padded_wd = a_wd[j] + 32 + 4;
1767             }
1768             ps_ctxt->as_layers[j].pu1_inp = NULL;
1769             ps_ctxt->as_layers[j].i4_inp_stride = 0;
1770             ps_ctxt->as_layers[j].i4_decomp_blk_ht = decomp_blk_ht;
1771             ps_ctxt->as_layers[j].i4_decomp_blk_wd = decomp_blk_wd;
1772             ps_ctxt->as_layers[j].i4_num_row_blks = ((a_ht[j] + (decomp_blk_ht - 1)) / decomp_blk_ht);
1773             ps_ctxt->as_layers[j].i4_num_col_blks = ((a_wd[j] + (decomp_blk_wd - 1)) / decomp_blk_wd);
1774             for(k = 0; k < MAX_NUM_CTB_ROWS_FRM; k++)
1775             {
1776                 ps_ctxt->as_layers[j].ai4_curr_row_no[k] = -1;
1777             }
1778             ps_ctxt->as_layers[j].i4_num_rows_processed = 0;
1779         }
1780         ps_ctxt->i4_quality_preset = ps_tgt_prms->i4_quality_preset;
1781         if(ps_ctxt->i4_quality_preset == IHEVCE_QUALITY_P7)
1782         {
1783             ps_ctxt->i4_quality_preset = IHEVCE_QUALITY_P6;
1784         }
1785         if(ps_init_prms->s_coding_tools_prms.i4_vqet &
1786            (1 << BITPOS_IN_VQ_TOGGLE_FOR_CONTROL_TOGGLER))
1787         {
1788             if(ps_init_prms->s_coding_tools_prms.i4_vqet &
1789                (1 << BITPOS_IN_VQ_TOGGLE_FOR_ENABLING_NOISE_PRESERVATION))
1790             {
1791                 ps_ctxt->i4_enable_noise_detection = 1;
1792             }
1793             else
1794             {
1795                 ps_ctxt->i4_enable_noise_detection = 0;
1796             }
1797         }
1798         else
1799         {
1800             ps_ctxt->i4_enable_noise_detection = 0;
1801         }
1802         ihevce_cmn_utils_instr_set_router(
1803             &ps_ctxt->s_cmn_opt_func, u1_is_popcnt_available, ps_init_prms->e_arch_type);
1804         ihevce_ipe_instr_set_router(
1805             &ps_ctxt->s_ipe_optimised_function_list, ps_init_prms->e_arch_type);
1806 
1807         ps_ed_ctxt->ps_func_selector = ps_func_selector;
1808 
1809         ps_ctxt++;
1810         ps_ed_ctxt++;
1811     }
1812     /* return the handle to caller */
1813     return ((void *)ps_mstr_ctxt);
1814 }
1815 
1816 /*!
1817 ************************************************************************
1818 * @brief
1819 *    Init decomp pre intra layer buffers
1820 ************************************************************************
1821 */
ihevce_decomp_pre_intra_frame_init(void * pv_ctxt,UWORD8 ** ppu1_decomp_lyr_bufs,WORD32 * pi4_lyr_buf_stride,ihevce_ed_blk_t * ps_layer1_buf,ihevce_ed_blk_t * ps_layer2_buf,ihevce_ed_ctb_l1_t * ps_ed_ctb_l1,WORD32 i4_ol_sad_lambda_qf,ctb_analyse_t * ps_ctb_analyse)1822 void ihevce_decomp_pre_intra_frame_init(
1823     void *pv_ctxt,
1824     UWORD8 **ppu1_decomp_lyr_bufs,
1825     WORD32 *pi4_lyr_buf_stride,
1826     ihevce_ed_blk_t *ps_layer1_buf,
1827     ihevce_ed_blk_t *ps_layer2_buf,
1828     ihevce_ed_ctb_l1_t *ps_ed_ctb_l1,
1829     WORD32 i4_ol_sad_lambda_qf,
1830     ctb_analyse_t *ps_ctb_analyse)
1831 {
1832     ihevce_decomp_pre_intra_master_ctxt_t *ps_master_ctxt = pv_ctxt;
1833     ihevce_decomp_pre_intra_ctxt_t *ps_ctxt;
1834     WORD32 i, j;
1835 
1836     for(i = 0; i < ps_master_ctxt->i4_num_proc_thrds; i++)
1837     {
1838         ps_ctxt = ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[i];
1839 
1840         /* L0 layer (actual input) is registered in process call */
1841         for(j = 1; j < ps_ctxt->i4_num_layers; j++)
1842         {
1843             ps_ctxt->as_layers[j].i4_inp_stride = pi4_lyr_buf_stride[j - 1];
1844             ps_ctxt->as_layers[j].pu1_inp = ppu1_decomp_lyr_bufs[j - 1];
1845 
1846             /* Populating the buffer pointers for layer1 and layer2 buffers to store the
1847             structure for each 4x4 block after pre intra analysis on their respective layers */
1848             if(j == 1)
1849             {
1850                 WORD32 sad_lambda_l1 = (3 * i4_ol_sad_lambda_qf >> 2);
1851                 WORD32 temp = 1 << LAMBDA_Q_SHIFT;
1852                 WORD32 lambda = ((temp) > sad_lambda_l1) ? temp : sad_lambda_l1;
1853 
1854                 ps_ctxt->ps_layer1_buf = ps_layer1_buf;
1855                 ps_ctxt->ps_ed_ctb_l1 = ps_ed_ctb_l1;
1856                 ps_ctxt->ai4_lambda[j] = lambda;
1857             }
1858             else if(j == 2)
1859             {
1860                 WORD32 sad_lambda_l2 = i4_ol_sad_lambda_qf >> 1;
1861                 WORD32 temp = 1 << LAMBDA_Q_SHIFT;
1862                 WORD32 lambda = ((temp) > sad_lambda_l2) ? temp : sad_lambda_l2;
1863 
1864                 ps_ctxt->ps_layer2_buf = ps_layer2_buf;
1865                 ps_ctxt->ai4_lambda[j] = lambda;
1866             }
1867             else
1868             {
1869                 ps_ctxt->ai4_lambda[j] = -1;
1870             }
1871         }
1872 
1873         /* make the ps_ctb_analyse refernce as a part of the private context */
1874         ps_ctxt->ps_ctb_analyse = ps_ctb_analyse;
1875     }
1876 }
1877 
1878 /**
1879 *******************************************************************************
1880 *
1881 * @brief Merge Sort function.
1882 *
1883 * @par Description:
1884 *     This function sorts the data in the input array in ascending
1885 *     order using merge sort algorithm. Intermediate data obtained in
1886 *     merge sort are stored in output 2-D array.
1887 *
1888 * @param[in]
1889 *   pi4_input_val  :   Input 1-D array
1890 *   aai4_output_val:   Output 2-D array containing elements sorted in sets of
1891 *                      4,16,64 etc.
1892 *   i4_length      : length of the array
1893 *   i4_ip_sort_level: Input sort level. Specifies the level upto which array is sorted.
1894 *                     It should be 1 if the array is unsorted. Should be 4 if array is sorted
1895 *                     in sets of 4.
1896 *   i4_op_sort_level: Output sort level. Specify the level upto which sorting is required.
1897 *                     If it is given as length of array it sorts for whole array.
1898 *
1899 *******************************************************************************
1900 */
ihevce_merge_sort(WORD32 * pi4_input_val,WORD32 aai4_output_val[][64],WORD32 i4_length,WORD32 i4_ip_sort_level,WORD32 i4_op_sort_level)1901 void ihevce_merge_sort(
1902     WORD32 *pi4_input_val,
1903     WORD32 aai4_output_val[][64],
1904     WORD32 i4_length,
1905     WORD32 i4_ip_sort_level,
1906     WORD32 i4_op_sort_level)
1907 {
1908     WORD32 i, j, k;
1909     WORD32 count, level;
1910     WORD32 temp[64];
1911     WORD32 *pi4_temp_buf_cpy;
1912     WORD32 *pi4_temp = &temp[0];
1913     WORD32 calc_level;
1914 
1915     pi4_temp_buf_cpy = pi4_temp;
1916 
1917     GETRANGE(calc_level, i4_op_sort_level / i4_ip_sort_level);
1918 
1919     calc_level = calc_level - 1;
1920 
1921     /*** This function is written under the assumption that we need only intermediate values of
1922     sort in the range of 4,16,64 etc. ***/
1923     ASSERT((calc_level % 2) == 0);
1924 
1925     /** One iteration of this for loop does 1 sets of sort and produces one intermediate value in 2 iterations **/
1926     for(level = 0; level < calc_level; level++)
1927     {
1928         /** Merges adjacent sets of elements based on current sort level **/
1929         for(count = 0; count < i4_length; (count = count + (i4_ip_sort_level * 2)))
1930         {
1931             i = 0;
1932             j = 0;
1933             if(pi4_input_val[i4_ip_sort_level - 1] < pi4_input_val[i4_ip_sort_level])
1934             {
1935                 /*** Condition for early exit ***/
1936                 memcpy(&pi4_temp[0], pi4_input_val, sizeof(WORD32) * i4_ip_sort_level * 2);
1937             }
1938             else
1939             {
1940                 for(k = 0; k < (i4_ip_sort_level * 2); k++)
1941                 {
1942                     if((i < i4_ip_sort_level) && (j < i4_ip_sort_level))
1943                     {
1944                         if(pi4_input_val[i] > pi4_input_val[j + i4_ip_sort_level])
1945                         {
1946                             /** copy to output array **/
1947                             pi4_temp[k] = pi4_input_val[j + i4_ip_sort_level];
1948                             j++;
1949                         }
1950                         else
1951                         {
1952                             /** copy to output array **/
1953                             pi4_temp[k] = pi4_input_val[i];
1954                             i++;
1955                         }
1956                     }
1957                     else if(i == i4_ip_sort_level)
1958                     {
1959                         /** copy the remaining data to output array **/
1960                         pi4_temp[k] = pi4_input_val[j + i4_ip_sort_level];
1961                         j++;
1962                     }
1963                     else
1964                     {
1965                         /** copy the remaining data to output array **/
1966                         pi4_temp[k] = pi4_input_val[i];
1967                         i++;
1968                     }
1969                 }
1970             }
1971             pi4_input_val += (i4_ip_sort_level * 2);
1972             pi4_temp += (i4_ip_sort_level * 2);
1973         }
1974         pi4_input_val = pi4_temp - i4_length;
1975 
1976         if(level % 2)
1977         {
1978             /** Assign a temp address for storing next sort level output as we will not need this data as output **/
1979             pi4_temp = pi4_temp_buf_cpy;
1980         }
1981         else
1982         {
1983             /** Assign address for storing the intermediate data into output 2-D array **/
1984             pi4_temp = aai4_output_val[level / 2];
1985         }
1986         i4_ip_sort_level *= 2;
1987     }
1988 }
1989 
1990 /*!
1991 ************************************************************************
1992 * @brief
1993 *   Calculate the average activities at 16*16 (8*8 in L1) and 32*32
1994 *   (8*8 in L2) block sizes. As this function accumulates activities
1995 *   across blocks of a frame, this needs to be called by only one thread
1996 *   and only after ensuring the processing of entire frame is done
1997 ************************************************************************
1998 */
ihevce_decomp_pre_intra_curr_frame_pre_intra_deinit(void * pv_pre_intra_ctxt,pre_enc_me_ctxt_t * ps_curr_out,frm_ctb_ctxt_t * ps_frm_ctb_prms)1999 void ihevce_decomp_pre_intra_curr_frame_pre_intra_deinit(
2000     void *pv_pre_intra_ctxt,
2001     pre_enc_me_ctxt_t *ps_curr_out,
2002     frm_ctb_ctxt_t *ps_frm_ctb_prms)
2003 {
2004     ihevce_decomp_pre_intra_master_ctxt_t *ps_master_ctxt = pv_pre_intra_ctxt;
2005     ihevce_decomp_pre_intra_ctxt_t *ps_ctxt = ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[0];
2006 
2007     ULWORD64 u8_frame_8x8_sum_act_sqr = 0;
2008     LWORD64 ai8_frame_8x8_sum_act_sqr[2] = { 0, 0 };
2009     WORD32 ai4_frame_8x8_sum_act[2] = { 0, 0 };
2010     WORD32 ai4_frame_8x8_sum_blks[2] = { 0, 0 };
2011 
2012     LWORD64 ai8_frame_16x16_sum_act_sqr[3] = { 0, 0, 0 };
2013     WORD32 ai4_frame_16x16_sum_act[3] = { 0, 0, 0 };
2014     WORD32 ai4_frame_16x16_sum_blks[3] = { 0, 0, 0 };
2015 
2016     LWORD64 ai8_frame_32x32_sum_act_sqr[3] = { 0, 0, 0 };
2017     WORD32 ai4_frame_32x32_sum_act[3] = { 0, 0, 0 };
2018     WORD32 ai4_frame_32x32_sum_blks[3] = { 0, 0, 0 };
2019 
2020     ihevce_ed_ctb_l1_t *ps_ed_ctb_pic_l1 = ps_curr_out->ps_ed_ctb_l1;
2021     ihevce_ed_blk_t *ps_ed_blk_l1 = ps_curr_out->ps_layer1_buf;
2022     WORD32 ctb_wd = ps_ctxt->as_layers[1].i4_decomp_blk_wd;
2023     WORD32 h_ctb_cnt = ps_ctxt->as_layers[1].i4_num_col_blks;
2024     WORD32 v_ctb_cnt = ps_ctxt->as_layers[1].i4_num_row_blks;
2025     WORD32 sub_blk_cnt = ((ctb_wd >> 2) * (ctb_wd >> 2));
2026     WORD32 i4_avg_noise_satd;
2027     WORD32 ctb_ctr, vert_ctr;
2028     WORD32 i, j, k;
2029 
2030     {
2031         /* Calculate min noise threshold */
2032         /* Min noise threshold is calculated by taking average of lowest 1% satd val in
2033          * the complete 4x4 frame satds */
2034 #define MAX_SATD 64
2035 #define SATD_NOISE_FLOOR_THRESHOLD 16
2036 #define MIN_BLKS 2
2037         WORD32 i4_layer_wd = ps_ctxt->as_layers[1].i4_actual_wd;
2038         WORD32 i4_layer_ht = ps_ctxt->as_layers[1].i4_actual_ht;
2039         WORD32 i4_min_blk = ((MIN_BLKS * (i4_layer_wd >> 1) * (i4_layer_ht >> 1)) / 100);
2040         WORD32 i4_total_blks = 0;
2041         WORD32 satd_hist[MAX_SATD];
2042         LWORD64 i8_acc_satd = 0;
2043 
2044         memset(satd_hist, 0, sizeof(satd_hist));
2045         for(i = 0; i < sub_blk_cnt * h_ctb_cnt * v_ctb_cnt; i++)
2046         {
2047             if(ps_ed_blk_l1[i].i4_4x4_satd >= 0 && ps_ed_blk_l1[i].i4_4x4_satd < MAX_SATD)
2048             {
2049                 satd_hist[ps_ed_blk_l1[i].i4_4x4_satd]++;
2050             }
2051         }
2052         for(i = 0; i < MAX_SATD && i4_total_blks <= i4_min_blk; i++)
2053         {
2054             i4_total_blks += satd_hist[i];
2055             i8_acc_satd += (i * satd_hist[i]);
2056         }
2057         if(i4_total_blks < i4_min_blk)
2058         {
2059             i4_avg_noise_satd = SATD_NOISE_FLOOR_THRESHOLD;
2060         }
2061         else
2062         {
2063             i4_avg_noise_satd = (WORD32)(i8_acc_satd + (i4_total_blks >> 1)) / i4_total_blks;
2064         }
2065         ps_curr_out->i4_avg_noise_thrshld_4x4 = i4_avg_noise_satd;
2066     }
2067 
2068     for(vert_ctr = 0; vert_ctr < v_ctb_cnt; vert_ctr++)
2069     {
2070         ihevce_ed_ctb_l1_t *ps_ed_ctb_row_l1 =
2071             ps_ed_ctb_pic_l1 + vert_ctr * ps_frm_ctb_prms->i4_num_ctbs_horz;
2072         ihevce_ed_blk_t *ps_ed = ps_ed_blk_l1 + (vert_ctr * sub_blk_cnt * h_ctb_cnt);
2073 
2074         for(ctb_ctr = 0; ctb_ctr < h_ctb_cnt; ctb_ctr++, ps_ed += sub_blk_cnt)
2075         {
2076             ihevce_ed_ctb_l1_t *ps_ed_ctb_curr_l1 = ps_ed_ctb_row_l1 + ctb_ctr;
2077             WORD8 b8_satd_eval[4];
2078             WORD32 ai4_satd_4x4[64];
2079             WORD32 ai4_satd_8x8[16];  // derived from accumulating 4x4 satds
2080             WORD32 ai4_satd_16x16[4] = { 0 };  // derived from accumulating 8x8 satds
2081             WORD32 i4_satd_32x32 = 0;  // derived from accumulating 8x8 satds
2082             /* This 2-D array will contain 4x4 satds sorted in ascending order in sets
2083              * of 4, 16, 64  For example : '5 10 2 7 6 12 3 1' array input will return
2084              * '2 5 7 10 1 3 6 12' if sorted in sets of 4 */
2085             WORD32 aai4_sort_4_16_64_satd[3][64];
2086             /* This 2-D array will contain 8x8 satds sorted in ascending order in sets of
2087              * 4, 16***/
2088             WORD32 aai4_sort_4_16_satd[2][64];
2089 
2090             memset(b8_satd_eval, 1, sizeof(b8_satd_eval));
2091             for(i = 0; i < 4; i++)
2092             {
2093                 ihevce_ed_blk_t *ps_ed_b32 = &ps_ed[i * 16];
2094 
2095                 for(j = 0; j < 4; j++)
2096                 {
2097                     ihevce_ed_blk_t *ps_ed_b16 = &ps_ed_b32[j * 4];
2098                     WORD32 satd_sum = 0;
2099                     WORD32 blk_cnt = 0;
2100 
2101                     for(k = 0; k < 4; k++)
2102                     {
2103                         ihevce_ed_blk_t *ps_ed_b4 = &ps_ed_b16[k];
2104 
2105                         if(-1 != ps_ed_b4->i4_4x4_satd)
2106                         {
2107 #define SUB_NOISE_THRSHLD 0
2108 #if SUB_NOISE_THRSHLD
2109                             ps_ed_b4->i4_4x4_satd = ps_ed_b4->i4_4x4_satd - i4_avg_noise_satd;
2110                             if(ps_ed_b4->i4_4x4_satd < 0)
2111                             {
2112                                 ps_ed_b4->i4_4x4_satd = 0;
2113                             }
2114 #else
2115                             if(ps_ed_b4->i4_4x4_satd < i4_avg_noise_satd)
2116                             {
2117                                 ps_ed_b4->i4_4x4_satd = i4_avg_noise_satd;
2118                             }
2119 #endif
2120                             blk_cnt++;
2121                             satd_sum += ps_ed_b4->i4_4x4_satd;
2122                         }
2123                         ai4_satd_4x4[i * 16 + j * 4 + k] = ps_ed_b4->i4_4x4_satd;
2124                     }
2125                     ASSERT(blk_cnt == 0 || blk_cnt == 4);
2126                     if(blk_cnt == 0)
2127                     {
2128                         satd_sum = -1;
2129                     }
2130                     ai4_satd_8x8[i * 4 + j] = satd_sum;
2131                     ai4_satd_16x16[i] += satd_sum;
2132                     i4_satd_32x32 += satd_sum;
2133                     ps_ed_ctb_curr_l1->i4_sum_4x4_satd[i * 4 + j] = satd_sum;
2134                 }
2135             }
2136 
2137             {
2138                 /* This function will sort 64 elements in array ai4_satd_4x4 in ascending order
2139                  *  to 3 arrays in sets of 4, 16, 64 into the 2-D array aai4_min_4_16_64_satd */
2140                 WORD32 array_length = sizeof(ai4_satd_4x4) / sizeof(WORD32);
2141                 ihevce_merge_sort(
2142                     &ai4_satd_4x4[0], aai4_sort_4_16_64_satd, array_length, 1, 64);
2143 
2144                 /* This function will sort 64 elements in array ai4_satd_8x8 in ascending order
2145                  *  to 2 arrays in sets of 4, 16 into the 2-D array aai4_sum_4_16_satd_ctb */
2146                 array_length = sizeof(ai4_satd_8x8) / sizeof(WORD32);
2147                 ihevce_merge_sort(
2148                     &ai4_satd_8x8[0], aai4_sort_4_16_satd, array_length, 1, 16);
2149             }
2150 
2151             /* Populate avg satd to calculate modulation index and activity factors */
2152             /* 16x16 */
2153             for(i = 0; i < 4; i++)
2154             {
2155                 for(j = 0; j < 4; j++)
2156                 {
2157                     WORD32 satd_sum = ps_ed_ctb_curr_l1->i4_sum_4x4_satd[i * 4 + j];
2158                     WORD32 satd_min = aai4_sort_4_16_64_satd[0][i * 16 + j * 4 + MEDIAN_CU_TU];
2159 
2160                     ASSERT(-2 != satd_sum);
2161                     ps_ed_ctb_curr_l1->i4_min_4x4_satd[i * 4 + j] = satd_min;
2162 
2163                     if(-1 != satd_sum)
2164                     {
2165                         ps_ed_ctb_curr_l1->i4_8x8_satd[i * 4 + j][0] = satd_sum;
2166                         ps_ed_ctb_curr_l1->i4_8x8_satd[i * 4 + j][1] = satd_min;
2167 
2168                         u8_frame_8x8_sum_act_sqr += (satd_sum * satd_sum);
2169                         ai4_frame_8x8_sum_act[0] += satd_sum;
2170                         ai8_frame_8x8_sum_act_sqr[0] += (satd_sum * satd_sum);
2171                         ai4_frame_8x8_sum_blks[0] += 1;
2172                         ai4_frame_8x8_sum_act[1] += satd_min;
2173                         ai8_frame_8x8_sum_act_sqr[1] += (satd_min * satd_min);
2174                         ai4_frame_8x8_sum_blks[1] += 1;
2175                     }
2176                     else
2177                     {
2178                         ps_ed_ctb_curr_l1->i4_8x8_satd[i * 4 + j][0] = -1;
2179                         ps_ed_ctb_curr_l1->i4_8x8_satd[i * 4 + j][1] = -1;
2180                         b8_satd_eval[i] = 0;
2181                     }
2182                 }
2183 
2184                 if(b8_satd_eval[i])
2185                 {
2186                     ps_ed_ctb_curr_l1->i4_16x16_satd[i][0] = ai4_satd_16x16[i];
2187                     ps_ed_ctb_curr_l1->i4_16x16_satd[i][1] = aai4_sort_4_16_satd[0][i * 4 + MEDIAN_CU_TU];
2188                     ps_ed_ctb_curr_l1->i4_16x16_satd[i][2] = aai4_sort_4_16_64_satd[1][i * 16 + MEDIAN_CU_TU_BY_2];
2189 
2190                     for(k = 0; k < 3; k++)
2191                     {
2192                         WORD32 satd = ps_ed_ctb_curr_l1->i4_16x16_satd[i][k];
2193 
2194                         ai4_frame_16x16_sum_act[k] += satd;
2195                         ai8_frame_16x16_sum_act_sqr[k] += (satd * satd);
2196                         ai4_frame_16x16_sum_blks[k] += 1;
2197                     }
2198                 }
2199                 else
2200                 {
2201                     ps_ed_ctb_curr_l1->i4_16x16_satd[i][0] = -1;
2202                     ps_ed_ctb_curr_l1->i4_16x16_satd[i][1] = -1;
2203                     ps_ed_ctb_curr_l1->i4_16x16_satd[i][2] = -1;
2204                 }
2205             }
2206 
2207             /*32x32*/
2208             if(b8_satd_eval[0] && b8_satd_eval[1] && b8_satd_eval[2] && b8_satd_eval[3])
2209             {
2210                 WORD32 aai4_sort_4_satd[1][64];
2211                 WORD32 array_length = sizeof(ai4_satd_16x16) / sizeof(WORD32);
2212                 WORD32 satd;
2213 
2214                 /* Sort 4 elements in ascending order */
2215                 ihevce_merge_sort(ai4_satd_16x16, aai4_sort_4_satd, array_length, 1, 4);
2216 
2217                 ps_ed_ctb_curr_l1->i4_32x32_satd[0][0] = aai4_sort_4_satd[0][MEDIAN_CU_TU];
2218                 ps_ed_ctb_curr_l1->i4_32x32_satd[0][1] = aai4_sort_4_16_satd[1][MEDIAN_CU_TU_BY_2];
2219                 ps_ed_ctb_curr_l1->i4_32x32_satd[0][2] = aai4_sort_4_16_64_satd[2][MEDIAN_CU_TU_BY_4];
2220                 ps_ed_ctb_curr_l1->i4_32x32_satd[0][3] = i4_satd_32x32;
2221 
2222                 for(k = 0; k < 3; k++)
2223                 {
2224                     WORD32 satd = ps_ed_ctb_curr_l1->i4_32x32_satd[0][k];
2225 
2226                     ai4_frame_32x32_sum_act[k] += satd;
2227                     ai8_frame_32x32_sum_act_sqr[k] += (satd * satd);
2228                     ai4_frame_32x32_sum_blks[k] += 1;
2229                 }
2230             }
2231             else
2232             {
2233                 ps_ed_ctb_curr_l1->i4_32x32_satd[0][0] = -1;
2234                 ps_ed_ctb_curr_l1->i4_32x32_satd[0][1] = -1;
2235                 ps_ed_ctb_curr_l1->i4_32x32_satd[0][2] = -1;
2236                 ps_ed_ctb_curr_l1->i4_32x32_satd[0][3] = -1;
2237             }
2238         }
2239     }
2240 
2241     for(i = 0; i < 2; i++)
2242     {
2243         /*8x8*/
2244 #if USE_SQRT_AVG_OF_SATD_SQR
2245         ps_curr_out->i8_curr_frame_8x8_sum_act[i] = ai8_frame_8x8_sum_act_sqr[i];
2246 #else
2247         ps_curr_out->i8_curr_frame_8x8_sum_act[i] = ai4_frame_8x8_sum_act[i];
2248 #endif
2249         ps_curr_out->i4_curr_frame_8x8_sum_act_for_strength[i] = ai4_frame_8x8_sum_act[i];
2250         ps_curr_out->i4_curr_frame_8x8_num_blks[i] = ai4_frame_8x8_sum_blks[i];
2251         ps_curr_out->u8_curr_frame_8x8_sum_act_sqr = u8_frame_8x8_sum_act_sqr;
2252 
2253         /*16x16*/
2254 #if USE_SQRT_AVG_OF_SATD_SQR
2255         ps_curr_out->i8_curr_frame_16x16_sum_act[i] = ai8_frame_16x16_sum_act_sqr[i];
2256 #else
2257         ps_curr_out->i8_curr_frame_16x16_sum_act[i] = ai4_frame_16x16_sum_act[i];
2258 #endif
2259         ps_curr_out->i4_curr_frame_16x16_num_blks[i] = ai4_frame_16x16_sum_blks[i];
2260 
2261         /*32x32*/
2262 #if USE_SQRT_AVG_OF_SATD_SQR
2263         ps_curr_out->i8_curr_frame_32x32_sum_act[i] = ai8_frame_32x32_sum_act_sqr[i];
2264 #else
2265         ps_curr_out->i8_curr_frame_32x32_sum_act[i] = ai4_frame_32x32_sum_act[i];
2266 #endif
2267         ps_curr_out->i4_curr_frame_32x32_num_blks[i] = ai4_frame_32x32_sum_blks[i];
2268     }
2269 
2270     /*16x16*/
2271 #if USE_SQRT_AVG_OF_SATD_SQR
2272     ps_curr_out->i8_curr_frame_16x16_sum_act[2] = ai8_frame_16x16_sum_act_sqr[2];
2273 #else
2274     ps_curr_out->i8_curr_frame_16x16_sum_act[2] = ai4_frame_16x16_sum_act[2];
2275 #endif
2276     ps_curr_out->i4_curr_frame_16x16_num_blks[2] = ai4_frame_16x16_sum_blks[2];
2277 
2278     /*32x32*/
2279 #if USE_SQRT_AVG_OF_SATD_SQR
2280     ps_curr_out->i8_curr_frame_32x32_sum_act[2] = ai8_frame_32x32_sum_act_sqr[2];
2281 #else
2282     ps_curr_out->i8_curr_frame_32x32_sum_act[2] = ai4_frame_32x32_sum_act[2];
2283 #endif
2284     ps_curr_out->i4_curr_frame_32x32_num_blks[2] = ai4_frame_32x32_sum_blks[2];
2285 }
2286 
2287 /*!
2288 ************************************************************************
2289 * @brief
2290 *  accumulate L1 intra satd across all threads.
2291 *  Note: call to this function has to be made after all threads have
2292 *  finished preintra processing
2293 *
2294 ************************************************************************
2295 */
ihevce_decomp_pre_intra_get_frame_satd(void * pv_ctxt,WORD32 * wd,WORD32 * ht)2296 LWORD64 ihevce_decomp_pre_intra_get_frame_satd(void *pv_ctxt, WORD32 *wd, WORD32 *ht)
2297 {
2298     ihevce_decomp_pre_intra_master_ctxt_t *ps_master_ctxt = pv_ctxt;
2299     ihevce_decomp_pre_intra_ctxt_t *ps_ctxt = ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[0];
2300     LWORD64 satd_sum = ps_ctxt->ps_ed_ctxt->i8_sum_best_satd;
2301     WORD32 i;
2302 
2303     *wd = ps_ctxt->as_layers[1].i4_actual_wd;
2304     *ht = ps_ctxt->as_layers[1].i4_actual_ht;
2305     for(i = 1; i < ps_master_ctxt->i4_num_proc_thrds; i++)
2306     {
2307         ps_ctxt = ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[i];
2308         satd_sum += ps_ctxt->ps_ed_ctxt->i8_sum_best_satd;
2309     }
2310 
2311     return satd_sum;
2312 }
2313 
ihevce_decomp_pre_intra_get_frame_satd_squared(void * pv_ctxt,WORD32 * wd,WORD32 * ht)2314 LWORD64 ihevce_decomp_pre_intra_get_frame_satd_squared(void *pv_ctxt, WORD32 *wd, WORD32 *ht)
2315 {
2316     ihevce_decomp_pre_intra_master_ctxt_t *ps_master_ctxt = pv_ctxt;
2317     ihevce_decomp_pre_intra_ctxt_t *ps_ctxt = ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[0];
2318     LWORD64 satd_sum = ps_ctxt->ps_ed_ctxt->i8_sum_sq_best_satd;
2319     WORD32 i;
2320 
2321     *wd = ps_ctxt->as_layers[1].i4_actual_wd;
2322     *ht = ps_ctxt->as_layers[1].i4_actual_ht;
2323     for(i = 1; i < ps_master_ctxt->i4_num_proc_thrds; i++)
2324     {
2325         ps_ctxt = ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[i];
2326         satd_sum += ps_ctxt->ps_ed_ctxt->i8_sum_sq_best_satd;
2327     }
2328 
2329     return satd_sum;
2330 }
2331