libhevc/encoder/ihevce_enc_loop_structs.h

/******************************************************************************
 *
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/*!
******************************************************************************
* \file ihevce_enc_loop_structs.h
*
* \brief
*    This file contains structures of enc_loop pass
*
* \date
*    18/09/2012
*
* \author
*    Ittiam
*
******************************************************************************
*/

#ifndef _IHEVCE_ENC_LOOP_STRUCTS_H_
#define _IHEVCE_ENC_LOOP_STRUCTS_H_

#include "ihevc_macros.h"

/* Lookup table declared here and defined in another translation unit.
 * Within this header it is read by COMPUTE_MERGE_IDX_COST, indexed with
 * (context model ^ bin); the value read is accumulated as a bit count. */
extern UWORD16 gau2_ihevce_cabac_bin_to_bits[64 * 2];

/*****************************************************************************/
/* Constant Macros                                                           */
/*****************************************************************************/
/** @brief Number of transform types: 4x4 DST, 4x4, 8x8, 16x16, 32x32 */
#define NUM_TRANS_TYPES 5
/* Numeric values used for the planar and DC intra modes */
#define INTRA_PLANAR 0
#define INTRA_DC 1
#define NUM_POSSIBLE_TU_SIZES_CHR_INTRA_SATD 2
#define MAX_TU_IN_TU_EQ_DIV_2 4
#define MAX_MVP_LIST_CAND 2
/* Initial "worst" cost; INITIALISE_MERGE_RESULT_STRUCT stores it in i4_tot_cost. */
/* NOTE(review): appears to be one hex digit shorter than 0x7fffffff - confirm intentional */
#define MAX_COST 0x7ffffff
/* NOTE(review): appears to have fewer hex digits than 0x7fffffffffffffff - confirm intentional */
#define MAX_COST_64 0x7ffffffffffffff
#define NUM_32CU_AND_64CU_IN_CTB 5 /* 4 - 32x32 + 1 64x64*/
#define PING_PONG 2
#define MAX_SAO_RD_CAND 10
#define SCRATCH_BUF_STRIDE 80

/*****************************************************************************/
/* Function Macros                                                           */
/*****************************************************************************/
/** @brief Identity mapping: expands to its argument x unchanged */
#define INTRA_ANGULAR(x) (x)

/** @brief max 30bit value */
#define MAX30 ((1 << 30) - 1)

/**
 * @brief clip a value to a maximum of 30 bits (value is assumed to be unsigned)
 * @note the argument x is evaluated twice; do not pass expressions with side effects
 */
#define CLIP30(x) ((x) > MAX30 ? MAX30 : (x))

/**
 * @brief compute (rate * lambda) >> qshift and clip the result to 30 bits
 *
 * @param r      rate term
 * @param l      lambda term
 * @param qshift right shift applied to the product
 *
 * The rate is widened to ULWORD64 BEFORE the multiplication so that the
 * product is formed in 64 bits. Casting the finished product instead would
 * let a 32-bit multiplication overflow before the widening takes place.
 * The result is cast to WORD32 after clipping to MAX30.
 */
#define COMPUTE_RATE_COST_CLIP30(r, l, qshift)                                                     \
    ((WORD32)CLIP30(((ULWORD64)(r) * (l)) >> (qshift)))

/**
 * @brief computes ((((inp - off) << shift) * wt) + (1 << 14)) >> 15
 *
 * @param inp   input sample
 * @param wt    multiplier applied after the offset removal and shift
 *              (presumably a Q15 inverse weight, given the rounding by
 *              1 << 14 and the final shift by 15 - confirm with callers)
 * @param off   offset subtracted from the input
 * @param shift left shift applied before the multiplication
 *
 * Every parameter is parenthesised so that an expression argument
 * (for example wt passed as "a + b") keeps its intended precedence.
 */
#define IHEVCE_INV_WT_PRED(inp, wt, off, shift)                                                    \
    (((((inp) - (off)) << (shift)) * (wt) + (1 << 14)) >> 15)

/**
 * @brief fills an inter PU that predicts from a single reference list
 *
 * Position is stored in units of 4 pixels, width and height in units of
 * 4 pixels minus one. The intra flag is cleared and pred_lx is stored as the
 * prediction mode. When pred_lx is zero the L0 reference index and motion
 * vector are written and the L1 reference index is set to -1; otherwise the
 * L1 fields are written and the L0 reference index is set to -1.
 */
#define POPULATE_PU_STRUCT(ps_pu, mvx, mvy, offset_x, offset_y, wd, ht, ref_idx, pred_lx)          \
    {                                                                                              \
        (ps_pu)->b4_pos_x = (offset_x) >> 2;                                                       \
        (ps_pu)->b4_pos_y = (offset_y) >> 2;                                                       \
        (ps_pu)->b4_wd = ((wd) >> 2) - 1;                                                          \
        (ps_pu)->b4_ht = ((ht) >> 2) - 1;                                                          \
        (ps_pu)->b1_intra_flag = 0;                                                                \
        (ps_pu)->b2_pred_mode = (pred_lx);                                                         \
        if(!(pred_lx))                                                                             \
        {                                                                                          \
            /* list 0 carries the motion data, list 1 is marked unused */                          \
            (ps_pu)->mv.i1_l1_ref_idx = -1;                                                        \
            (ps_pu)->mv.i1_l0_ref_idx = (ref_idx);                                                 \
            (ps_pu)->mv.s_l0_mv.i2_mvx = (mvx);                                                    \
            (ps_pu)->mv.s_l0_mv.i2_mvy = (mvy);                                                    \
        }                                                                                          \
        else                                                                                       \
        {                                                                                          \
            /* list 1 carries the motion data, list 0 is marked unused */                          \
            (ps_pu)->mv.i1_l0_ref_idx = -1;                                                        \
            (ps_pu)->mv.i1_l1_ref_idx = (ref_idx);                                                 \
            (ps_pu)->mv.s_l1_mv.i2_mvx = (mvx);                                                    \
            (ps_pu)->mv.s_l1_mv.i2_mvy = (mvy);                                                    \
        }                                                                                          \
    }

/**
 * @brief derives an integer quantiser step from a frame QP
 *
 * The step is q_steps[qp % 6] scaled by 2 ^ (qp / 6); the product is
 * truncated when it is converted to WORD32 and stored in frame_qstep.
 * frame_qp is evaluated twice, so it must not have side effects.
 */
#define GET_FRAME_QSTEP_FROM_QP(frame_qp, frame_qstep)                                             \
    {                                                                                              \
        /* fractional step for each of the six QP residues */                                      \
        const double q_steps[6] = { 0.625, 0.703, 0.79, 0.889, 1.0, 1.125 };                       \
                                                                                                   \
        frame_qstep = (WORD32)(q_steps[(frame_qp) % 6] * (1 << ((frame_qp) / 6)));                 \
    }

/**
 * @brief resets a merge result structure and the result buffers it points to
 *
 * - clears the l0 and l1 per-partition result counters
 * - makes aps_pu_results[list][part] point at pas_pu_results[list][part]
 * - sets every result entry to cost MAX_COST with both reference indices -1
 *
 * The macro declares locals named i, j and k, so the arguments must not
 * refer to caller variables with those names.
 */
#define INITIALISE_MERGE_RESULT_STRUCT(ps_merge_data, pas_pu_results)                              \
    {                                                                                              \
        WORD32 i, j, k;                                                                            \
                                                                                                   \
        /* no results recorded yet for any partition */                                            \
        i = 0;                                                                                     \
        while(i < TOT_NUM_PARTS)                                                                   \
        {                                                                                          \
            (ps_merge_data)->s_pu_results.u1_num_results_per_part_l0[i] = 0;                       \
            (ps_merge_data)->s_pu_results.u1_num_results_per_part_l1[i] = 0;                       \
            i++;                                                                                   \
        }                                                                                          \
                                                                                                   \
        /* hook up the result buffers and invalidate each entry */                                 \
        for(i = 0; i < 2; i++)                                                                     \
        {                                                                                          \
            for(j = 0; j < TOT_NUM_PARTS; j++)                                                     \
            {                                                                                      \
                (ps_merge_data)->s_pu_results.aps_pu_results[i][j] = pas_pu_results[i][j];         \
                                                                                                   \
                k = 0;                                                                             \
                while(k < MAX_NUM_RESULTS_PER_PART_LIST)                                           \
                {                                                                                  \
                    pas_pu_results[i][j][k].i4_tot_cost = MAX_COST;                                \
                    pas_pu_results[i][j][k].pu.mv.i1_l0_ref_idx = -1;                              \
                    pas_pu_results[i][j][k].pu.mv.i1_l1_ref_idx = -1;                              \
                    k++;                                                                           \
                }                                                                                  \
            }                                                                                      \
        }                                                                                          \
    }

/**
 * @brief copies CTB level parameters into a common frame parameter structure
 *
 * The opening parenthesis of the parameter list must directly follow the
 * macro name. With whitespace (or a spliced line break) in between, the
 * preprocessor defines an object-like macro whose replacement text starts
 * with the parenthesised list, and no invocation can compile.
 *
 * cu_size is stored as both the input stride and the prediction stride,
 * ref_stride as the recon stride, and pu1_wkg_mem is set to NULL.
 *
 * @note Several parameters (apu1_wt_inp, i4_ctb_x_off, i4_ctb_y_off,
 *       ppu1_pred, pps_rec_list_l0, pps_rec_list_l1, pu1_non_wt_inp,
 *       wpred_log_wdc) have the same spelling as the structure member they
 *       initialise. The preprocessor therefore also replaces the member
 *       name, so callers must pass identifiers with exactly those names.
 * @note The macro declares locals named i and j.
 */
#define POPULATE_CTB_PARAMS(                                                                       \
    ps_common_frm_prms,                                                                            \
    apu1_wt_inp,                                                                                   \
    i4_ctb_x_off,                                                                                  \
    i4_ctb_y_off,                                                                                  \
    ppu1_pred,                                                                                     \
    cu_size,                                                                                       \
    ref_stride,                                                                                    \
    bidir_enabled,                                                                                 \
    num_refs,                                                                                      \
    pps_rec_list_l0,                                                                               \
    pps_rec_list_l1,                                                                               \
    pu1_non_wt_inp,                                                                                \
    lambda,                                                                                        \
    lambda_q_shift,                                                                                \
    wpred_log_wdc)                                                                                 \
    {                                                                                              \
        WORD32 i, j;                                                                               \
        (ps_common_frm_prms)->i4_bidir_enabled = (bidir_enabled);                                  \
        (ps_common_frm_prms)->i4_ctb_x_off = (i4_ctb_x_off);                                       \
        (ps_common_frm_prms)->i4_ctb_y_off = (i4_ctb_y_off);                                       \
        (ps_common_frm_prms)->i4_inp_stride = (cu_size);                                           \
        (ps_common_frm_prms)->i4_lamda = (lambda);                                                 \
        (ps_common_frm_prms)->i4_pred_stride = (cu_size);                                          \
        (ps_common_frm_prms)->i4_rec_stride = (ref_stride);                                        \
        (ps_common_frm_prms)->pps_rec_list_l0 = (pps_rec_list_l0);                                 \
        (ps_common_frm_prms)->pps_rec_list_l1 = (pps_rec_list_l1);                                 \
        (ps_common_frm_prms)->ppu1_pred = (ppu1_pred);                                             \
        (ps_common_frm_prms)->pu1_non_wt_inp = (pu1_non_wt_inp);                                   \
        (ps_common_frm_prms)->pu1_wkg_mem = NULL;                                                  \
        (ps_common_frm_prms)->u1_lamda_qshift = (lambda_q_shift);                                  \
        (ps_common_frm_prms)->u1_num_ref = (num_refs);                                             \
        (ps_common_frm_prms)->wpred_log_wdc = (wpred_log_wdc);                                     \
        /* NOTE(review): every iteration overwrites the same member, so only */                    \
        /* the last [1][MAX_NUM_REF - 1] entry survives. An indexed          */                    \
        /* destination may have been intended - confirm before relying on it */                    \
        for(i = 0; i < 2; i++)                                                                     \
        {                                                                                          \
            for(j = 0; j < MAX_NUM_REF; j++)                                                       \
            {                                                                                      \
                (ps_common_frm_prms)->apu1_wt_inp = (apu1_wt_inp)[i][j];                           \
            }                                                                                      \
        }                                                                                          \
    }

/**
 * @brief estimates the rate cost of signalling a merge index
 *
 * @param merge_idx_0_model context model used for the first bin
 * @param merge_idx         merge index, must be in [0, max_merge_cand)
 * @param max_merge_cand    number of merge candidates
 * @param lambda            lambda used to convert bits to cost
 * @param cost              lvalue that receives the cost
 *
 * When max_merge_cand is 1 or less no index is coded and cost is set to 0.
 * Otherwise the first bin (merge_idx > 0) is costed through
 * gau2_ihevce_cabac_bin_to_bits, and for max_merge_cand > 2 with a non-zero
 * index, MIN(merge_idx, max_merge_cand - 2) further bins are added at
 * (1 << CABAC_FRAC_BITS_Q) each.
 *
 * All arguments are parenthesised inside the expansion so that expression
 * arguments (for example a model passed as "a | b") keep their meaning.
 * The macro declares locals named cab_bits_q12 and bin.
 */
#define COMPUTE_MERGE_IDX_COST(merge_idx_0_model, merge_idx, max_merge_cand, lambda, cost)         \
    {                                                                                              \
        WORD32 cab_bits_q12 = 0;                                                                   \
                                                                                                   \
        /* sanity checks */                                                                        \
        ASSERT(((merge_idx) >= 0) && ((merge_idx) < (max_merge_cand)));                            \
                                                                                                   \
        /* encode the merge idx only if required */                                                \
        if((max_merge_cand) > 1)                                                                   \
        {                                                                                          \
            WORD32 bin = ((merge_idx) > 0);                                                        \
                                                                                                   \
            /* bits for the context modelled first bin */                                          \
            cab_bits_q12 += gau2_ihevce_cabac_bin_to_bits[(merge_idx_0_model) ^ bin];              \
                                                                                                   \
            /* bits for larger merge idx coded as bypass truncated unary */                        \
            if(((max_merge_cand) > 2) && ((merge_idx) > 0))                                        \
            {                                                                                      \
                cab_bits_q12 += (MIN((merge_idx), ((max_merge_cand) - 2))) << CABAC_FRAC_BITS_Q;   \
            }                                                                                      \
                                                                                                   \
            (cost) = COMPUTE_RATE_COST_CLIP30(                                                     \
                cab_bits_q12, (lambda), (LAMBDA_Q_SHIFT + CABAC_FRAC_BITS_Q));                     \
        }                                                                                          \
        else                                                                                       \
        {                                                                                          \
            (cost) = 0;                                                                            \
        }                                                                                          \
    }

/*****************************************************************************/
/* Typedefs                                                                  */
/*****************************************************************************/

/** \brief pointer to a function of type FT_CALC_HAD_SATD_8BIT (declared elsewhere) */
typedef FT_CALC_HAD_SATD_8BIT *pf_res_trans_luma_had_chroma;

/** \brief function pointer prototype for residue and transform enc_loop
 *  (chroma variant, going by the type name).
 *  Takes source and prediction sample pointers with their strides, a 32-bit
 *  temporary buffer and a 16-bit destination buffer; returns a UWORD32.
 *  NOTE(review): the meaning of the return value and the packing of
 *  dst_strd_chr_flag are defined by the implementations - confirm there. */
typedef UWORD32 (*pf_res_trans_chroma)(
    UWORD8 *pu1_src,
    UWORD8 *pu1_pred,
    WORD32 *pi4_tmp,
    WORD16 *pi2_dst,
    WORD32 src_strd,
    WORD32 pred_strd,
    WORD32 dst_strd_chr_flag);

/** \brief function pointer prototype for quantization and inv Quant for ssd
calc. for all transform sizes.
This variant takes two quant rounding factor tables and reports its cost
through a 64-bit pointer (pi8_cost); it returns a WORD32 whose meaning is
defined by the implementations (not visible in this header). */
typedef WORD32 (*pf_quant_iquant_ssd)(
    WORD16 *pi2_coeffs,
    WORD16 *pi2_quant_coeff,
    WORD16 *pi2_q_dst,
    WORD16 *pi2_iq_dst,
    WORD32 trans_size,
    WORD32 qp_div, /* qpscaled / 6 */
    WORD32 qp_rem, /* qpscaled % 6 */
    WORD32 q_add,
    WORD32 *pi4_quant_round_factor_0_1,
    WORD32 *pi4_quant_round_factor_1_2,
    WORD32 src_strd,
    WORD32 dst_q_strd,
    WORD32 dst_iq_strd,
    UWORD8 *csbf,
    WORD32 csbf_strd,
    WORD32 *zero_col,
    WORD32 *zero_row,
    WORD16 *pi2_dequant_coeff,
    LWORD64 *pi8_cost);

/** \brief function pointer prototype for quantization and inv Quant for ssd
calc. for all transform sizes (in case of RDOQ + SBH).
Compared with pf_quant_iquant_ssd this variant has no rounding factor
tables, reports its cost through a 32-bit pointer (pi4_cost) and takes a
scan index and an RDOQ enable flag. */
typedef WORD32 (*pf_quant_iquant_ssd_sbh)(
    WORD16 *pi2_coeffs,
    WORD16 *pi2_quant_coeff,
    WORD16 *pi2_q_dst,
    WORD16 *pi2_iq_dst,
    WORD32 trans_size,
    WORD32 qp_div, /* qpscaled / 6 */
    WORD32 qp_rem, /* qpscaled % 6 */
    WORD32 q_add,
    WORD32 src_strd,
    WORD32 dst_q_strd,
    WORD32 dst_iq_strd,
    UWORD8 *csbf,
    WORD32 csbf_strd,
    WORD32 *zero_col,
    WORD32 *zero_row,
    WORD16 *pi2_dequant_coeff,
    WORD32 *pi4_cost,
    WORD32 i4_scan_idx,
    WORD32 i4_perform_rdoq);

/** \brief function pointer prototype for inverse transform and recon
for all transform sizes : Luma.
Takes 16-bit source and temporary buffers, 8-bit prediction and destination
buffers with their strides, and zero_cols / zero_rows values; returns nothing. */
typedef void (*pf_it_recon)(
    WORD16 *pi2_src,
    WORD16 *pi2_tmp,
    UWORD8 *pu1_pred,
    UWORD8 *pu1_dst,
    WORD32 src_strd,
    WORD32 pred_strd,
    WORD32 dst_strd,
    WORD32 zero_cols,
    WORD32 zero_rows);

/** \brief function pointer prototype for inverse transform and recon
for all transform sizes : Chroma.
The parameter list is identical to that of pf_it_recon. */
typedef void (*pf_it_recon_chroma)(
    WORD16 *pi2_src,
    WORD16 *pi2_tmp,
    UWORD8 *pu1_pred,
    UWORD8 *pu1_dst,
    WORD32 src_strd,
    WORD32 pred_strd,
    WORD32 dst_strd,
    WORD32 zero_cols,
    WORD32 zero_rows);

/** \brief function pointer prototype for luma sao.
 *  Takes the source block with its stride, pointers to the left, top,
 *  top-left, top-right and bottom-left neighbour data, an availability
 *  array, one offset table and the block width and height. */
typedef void (*pf_sao_luma)(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_src_left,
    UWORD8 *pu1_src_top,
    UWORD8 *pu1_src_top_left,
    UWORD8 *pu1_src_top_right,
    UWORD8 *pu1_src_bot_left,
    UWORD8 *pu1_avail,
    WORD8 *pi1_sao_offset,
    WORD32 wd,
    WORD32 ht);

/** \brief function pointer prototype for chroma sao.
 *  Same parameters as pf_sao_luma except that separate offset tables are
 *  passed for U and V. */
typedef void (*pf_sao_chroma)(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_src_left,
    UWORD8 *pu1_src_top,
    UWORD8 *pu1_src_top_left,
    UWORD8 *pu1_src_top_right,
    UWORD8 *pu1_src_bot_left,
    UWORD8 *pu1_avail,
    WORD8 *pi1_sao_offset_u,
    WORD8 *pi1_sao_offset_v,
    WORD32 wd,
    WORD32 ht);

/*****************************************************************************/
/* Enums                                                                     */
/*****************************************************************************/

/** Indices of intra prediction function groups. Going by the enumerator
names, each entry covers one mode or a range of modes (presumably one
function per entry - confirm against the table that is indexed with it). */
typedef enum
{
    IP_FUNC_MODE_0 = 0,
    IP_FUNC_MODE_1,
    IP_FUNC_MODE_2,
    IP_FUNC_MODE_3TO9,
    IP_FUNC_MODE_10,
    IP_FUNC_MODE_11TO17,
    IP_FUNC_MODE_18_34,
    IP_FUNC_MODE_19TO25,
    IP_FUNC_MODE_26,
    IP_FUNC_MODE_27TO33,

    /* number of entries above; keep as the last enumerator */
    NUM_IP_FUNCS

} IP_FUNCS_T;

/** TU size expressed relative to the CU size */
typedef enum
{
    /* currently only cu and cu/2 modes are supported */
    TU_EQ_CU = 0,
    TU_EQ_CU_DIV2,
    TU_EQ_SUBCU, /* only applicable for NXN mode at mincusize */

    /* support for below modes needs to be added */
    TU_EQ_CU_DIV4,
    TU_EQ_CU_DIV8,
    TU_EQ_CU_DIV16,

    /* number of entries above; keep as the last enumerator */
    NUM_TU_WRT_CU,

} TU_SIZE_WRT_CU_T;

/** Modes in which the core function can be called */
typedef enum
{
    RDOPT_MODE = 0,
    RDOPT_SKIP_MODE = 1,

    /* number of entries above; keep as the last enumerator */
    NUM_CORE_CALL_MODES,

} CORE_FUNC_CALL_MODE_T;

/** Identifiers of the enc_loop memory records. NUM_ENC_LOOP_MEM_RECS is
the record count, so new entries must be inserted before it. */
typedef enum
{
    ENC_LOOP_CTXT = 0,
    ENC_LOOP_THRDS_CTXT,
    ENC_LOOP_SCALE_MAT,
    ENC_LOOP_RESCALE_MAT,
    ENC_LOOP_TOP_LUMA,
    ENC_LOOP_TOP_CHROMA,
    ENC_LOOP_TOP_NBR4X4,
    ENC_LOOP_RC_PARAMS, /* memory to dump rate control parameters by each thread for each bit-rate instance */
    ENC_LOOP_QP_TOP_4X4,
    ENC_LOOP_DEBLOCKING,
    ENC_LOOP_422_CHROMA_INTRA_PRED,
    ENC_LOOP_INTER_PRED,
    ENC_LOOP_CHROMA_PRED_INTRA,
    ENC_LOOP_REF_SUB_OUT,
    ENC_LOOP_REF_FILT_OUT,
    ENC_LOOP_CU_RECUR_LUMA_RECON,
    ENC_LOOP_CU_RECUR_CHROMA_RECON,
    ENC_LOOP_CU_RECUR_LUMA_PRED,
    ENC_LOOP_CU_RECUR_CHROMA_PRED,
    ENC_LOOP_LEFT_LUMA_DATA,
    ENC_LOOP_LEFT_CHROMA_DATA,
    ENC_LOOP_SAO,
    ENC_LOOP_CU_COEFF_DATA,
    ENC_LOOP_CU_RECUR_COEFF_DATA,
    ENC_LOOP_CU_DEQUANT_DATA,
    ENC_LOOP_RECON_DATA_STORE,
    /* should always be the last entry */
    NUM_ENC_LOOP_MEM_RECS

} ENC_LOOP_MEM_TABS_T;

/** This is for assigning the pred buffers for luma (2 ping-pong) and
chroma(1)   */
typedef enum
{
    CU_ME_INTRA_PRED_LUMA_IDX0 = 0,
    CU_ME_INTRA_PRED_LUMA_IDX1,
    CU_ME_INTRA_PRED_CHROMA_IDX,

    /* should be always the last entry */
    NUM_CU_ME_INTRA_PRED_IDX

} CU_ME_INTRA_PRED_IDX_T;

/*****************************************************************************/
/* Structure                                                                 */
/*****************************************************************************/

/**
******************************************************************************
*  @brief     Structure to store TU prms req. for enc_loop only
*
*  The chroma members are arrays of two entries per plane; presumably one
*  entry per chroma sub-TU (confirm against the code that fills them).
******************************************************************************
*/
typedef struct
{
    /** Zero_col info. for the current TU Luma */
    UWORD32 u4_luma_zero_col;
    /** Zero_row info. for the current TU Luma */
    UWORD32 u4_luma_zero_row;

    /** Zero_col info. for the current TU Chroma Cb */
    UWORD32 au4_cb_zero_col[2];
    /** Zero_row info. for the current TU Chroma Cb */
    UWORD32 au4_cb_zero_row[2];
    /** Zero_col info. for the current TU Chroma Cr */
    UWORD32 au4_cr_zero_col[2];
    /** Zero_row info. for the current TU Chroma Cr */
    UWORD32 au4_cr_zero_row[2];

    /** bytes consumed by the luma ecd data */
    WORD16 i2_luma_bytes_consumed;
    /** bytes consumed by the Cb ecd data */
    WORD16 ai2_cb_bytes_consumed[2];
    /** bytes consumed by the Cr ecd data */
    WORD16 ai2_cr_bytes_consumed[2];

    /** flag to re-evaluate IQ and Coeff data of luma in the final_recon
    function. If zero, uses the data from RDOPT cand.                   */
    UWORD16 b1_eval_luma_iq_and_coeff_data : 1;
    /** flag to re-evaluate IQ and Coeff data of chroma in the final_recon
    function. If zero, uses the data from RDOPT cand.                   */
    UWORD16 b1_eval_chroma_iq_and_coeff_data : 1;

    /* TO DO : No support now, need to add. Always compare ZERO_CBF cost */
    /** Luma ZERO_CBF cost is compared with residue coding cost only if this
    flag is enabled */
    UWORD16 b1_eval_luma_zero_cbf_cost : 1;
    /** Chroma ZERO_CBF cost is compared with residue coding cost only if this
    flag is enabled */
    UWORD16 b1_eval_chroma_zero_cbf_cost : 1;

    /** Reserved bits; together with the four 1-bit flags above they fill
    one 16-bit unit (original intent: WORD32 alignment) */
    UWORD16 b12_reserved : 12;

} tu_enc_loop_temp_prms_t;

/**
 * Recon buffers kept for a CU together with per-TU bookkeeping that records
 * which buffer holds the winning recon.
 */
typedef struct recon_datastore_t
{
    /* 2 to store current and best */
    void *apv_luma_recon_bufs[2];

    /* 0 to store cur chroma mode recon */
    /* 1 to store winning independent chroma mode with a single TU's recon */
    /* 2 to store winning independent chroma mode with 4 TUs' recon */
    void *apv_chroma_recon_bufs[3];

    /* The following two arrays are used to store the ID's of the buffers */
    /* where the winning recon is being stored */
    /* For Luma buffers, the permissible values are 0, 1 and UCHAR_MAX */
    /* For Chroma buffers, the permissible values are 0, 1, 2 and UCHAR_MAX */
    /* The value 'UCHAR_MAX' indicates the absence of Recon for that particular TU */
    UWORD8 au1_bufId_with_winning_LumaRecon[MAX_TU_IN_CTB_ROW * MAX_TU_IN_CTB_ROW];

    /* 2 - 2 Chroma planes */
    /* 2 - 2 possible subTU's */
    UWORD8 au1_bufId_with_winning_ChromaRecon[2][MAX_TU_IN_CTB_ROW * MAX_TU_IN_CTB_ROW][2];

    /* stride of the luma recon buffers */
    WORD32 i4_lumaRecon_stride;

    /* stride of the chroma recon buffers */
    WORD32 i4_chromaRecon_stride;

    /* one availability flag per entry of apv_chroma_recon_bufs */
    UWORD8 au1_is_chromaRecon_available[3];

    /* availability flag for the luma recon */
    UWORD8 u1_is_lumaRecon_available;

} recon_datastore_t;

/**
 * Final parameters of a CU produced by the enc_loop: mode decision flags,
 * bit and distortion estimates, PU / TU arrays and pointers to the coefficient
 * buffers used during the RD opt stage.
 */
typedef struct enc_loop_cu_final_prms_t
{
    /** recon buffers and winning-buffer bookkeeping for this CU */
    recon_datastore_t s_recon_datastore;

    /**
    * Cu size of the current cu being processed
    */
    UWORD8 u1_cu_size;
    /**
    * flags to indicate the final cu prediction mode
    */
    UWORD8 u1_intra_flag;

    /**
    * flags to indicate Skip mode for CU
    */
    UWORD8 u1_skip_flag;

    /**
    * number of tu in current cu for a given mode
    * if skip then this value should be 1
    */
    UWORD16 u2_num_tus_in_cu;

    /**
    * number of pu in current cu for a given mode
    * if skip then this value should be 1
    */
    UWORD16 u2_num_pus_in_cu;

    /**
    * total bytes produced in ECD data buffer
    * if skip then this value should be 0
    */
    WORD32 i4_num_bytes_ecd_data;

    /**
    * Partition mode of the best candidate
    * if skip then this value should be SIZE_2Nx2N
    * @sa PART_SIZE_E
    */
    UWORD8 u1_part_mode;

    /**
    * indicates if inter cu has coded coeffs 1: coded, 0: not coded
    * if skip then this value should be ignored
    */
    UWORD8 u1_is_cu_coded;

    /**
    * Chroma pred mode as signalled in bitstream
    */
    UWORD8 u1_chroma_intra_pred_mode;

    /**
    * To store the best chroma mode for TU. Will be same for NxN case.
    * Actual Chroma pred
    */
    UWORD8 u1_chroma_intra_pred_actual_mode;

    /**
    * sad accumulated over all Tus of given CU
    */
    UWORD32 u4_cu_sad;

    /**
    * accumulated over all Tus of given CU
    * NOTE(review): the original comment said "sad"; the field name suggests
    * this is the ssd - confirm
    */
    LWORD64 i8_cu_ssd;

    /**
    * open loop intra sad
    */
    UWORD32 u4_cu_open_intra_sad;

    /**
    * header bits of cu estimated during RDO evaluation.
    * Includes tu splits flags excludes cbf flags
    */
    UWORD32 u4_cu_hdr_bits;
    /**
    * luma residual bits of a cu estimated during RDO evaluation.
    */
    UWORD32 u4_cu_luma_res_bits;

    /**
    * chroma residual bits of a cu estimated during RDO evaluation.
    */
    UWORD32 u4_cu_chroma_res_bits;

    /**
    * cbf bits of a cu estimated during RDO evaluation (considered as part of texture bits later)
    */
    UWORD32 u4_cu_cbf_bits;

    /**
    * array of PU for current CU
    * For Inter PUs this will contain the following
    *   - merge flag
    *   - (MVD and reference indices) or (Merge Index)
    *   - (if Cu is skipped then Merge index for skip
    *      will be in 1st PU entry in array)
    * for intra PU only intra flag will be set to 1
    *
    */
    pu_t as_pu_enc_loop[NUM_PU_PARTS];

    /**
    * array of PU for chroma usage
    * in case of Merge MVs and reference idx of the final candidate
    * used by luma needs to be stored
    * for intra PU this will not be used
    */
    pu_t as_pu_chrm_proc[NUM_PU_PARTS];

    /**
    * array of colocated PU for current CU
    * MV and Ref pic id should be stored in this
    * for intra PU only intra flag will be set to 1
    */
    pu_col_mv_t as_col_pu_enc_loop[NUM_INTER_PU_PARTS];

    /** array to store the intra mode pred related params
    * if nxn mode the all 4 locations will be used
    */
    intra_prev_rem_flags_t as_intra_prev_rem[NUM_PU_PARTS];

    /**
    * array to store TU properties of the each tu in a CU
    */
    tu_enc_loop_out_t as_tu_enc_loop[MAX_TU_IN_CTB_ROW * MAX_TU_IN_CTB_ROW];

    /**
    * array to store TU properties (req. for enc_loop only and not for
    * entropy) of the each tu in a CU
    */
    tu_enc_loop_temp_prms_t as_tu_enc_loop_temp_prms[MAX_TU_IN_CTB_ROW * MAX_TU_IN_CTB_ROW];

    /**
    * Neighbour flags stored for chroma reuse
    */
    UWORD32 au4_nbr_flags[MAX_TU_IN_CTB_ROW * MAX_TU_IN_CTB_ROW];

    /**
    * intra pred modes stored for chroma reuse
    */
    UWORD8 au1_intra_pred_mode[4];

    /**
    * array for storing coeffs during RD opt stage at CU level.
    * Luma and chroma together
    */
    UWORD8 *pu1_cu_coeffs;

    /**
    * Chroma start index.
    * NOTE(review): the original comment described this as the chroma
    * deq_coeffs start point, identical to i4_chrm_deq_coeff_strt_idx; the
    * field name suggests it indexes pu1_cu_coeffs instead - confirm
    */
    WORD32 i4_chrm_cu_coeff_strt_idx;

    /**
    * array for storing dequantized vals. during RD opt stage at CU level
    * Luma and chroma together.
    * Stride is assumed to be cu_size
    * u-v interleaved storing is at TU level
    */
    WORD16 *pi2_cu_deq_coeffs;

    /**
    * Chroma deq_coeffs start point in the cu deq_coeffs buffer
    * (presumably pi2_cu_deq_coeffs; the original comment named
    * ai2_cu_deq_coeffs, which is not a member of this structure).
    */
    WORD32 i4_chrm_deq_coeff_strt_idx;

    /**
    * The total RDOPT cost of the CU for the best mode
    */
    LWORD64 i8_best_rdopt_cost;

    /**
    * The current running RDOPT cost for the current mode
    */
    LWORD64 i8_curr_rdopt_cost;

    /** distortion value; presumably of the best mode, going by the name */
    LWORD64 i8_best_distortion;

} enc_loop_cu_final_prms_t;

/**
 * Chroma buffer pointers and strides for the current CU. Every UWORD8
 * pointer has a UWORD16 counterpart with the same role (presumably for
 * sample depths above 8 bits - confirm).
 */
typedef struct
{
    /** Current Cu chroma recon pointer in pic buffer */
    UWORD8 *pu1_final_recon;

    UWORD16 *pu2_final_recon;

    /** Current Cu chroma source pointer in pic buffer */
    UWORD8 *pu1_curr_src;

    UWORD16 *pu2_curr_src;

    /** Current CU chroma recon buffer stride */
    WORD32 i4_chrm_recon_stride;

    /** Current CU chroma source buffer stride */
    WORD32 i4_chrm_src_stride;

    /** Current Cu chroma Left pointer for intra pred */
    UWORD8 *pu1_cu_left;

    UWORD16 *pu2_cu_left;

    /** Left buffer stride */
    WORD32 i4_cu_left_stride;

    /** Current Cu chroma top pointer for intra pred */
    UWORD8 *pu1_cu_top;

    UWORD16 *pu2_cu_top;

    /** Current Cu chroma top left pointer for intra pred */
    UWORD8 *pu1_cu_top_left;

    UWORD16 *pu2_cu_top_left;

} enc_loop_chrm_cu_buf_prms_t;

/** One intra SATD candidate of a CU */
typedef struct
{
    /** cost of the current satd cand */
    WORD32 i4_cost;

    /** tu size w.r.t to cu of the current satd cand
    * @sa TU_SIZE_WRT_CU_T
    * NOTE(review): the i4_ prefix does not match the WORD8 storage type
    */
    WORD8 i4_tu_depth;

    /**
    *  access valid number of entries in this array based on u1_part_mode
    */
    UWORD8 au1_intra_luma_modes[NUM_PU_PARTS];

    /** @remarks partition mode is 2Nx2N or  NxN  */
    UWORD8 u1_part_mode; /* @sa: PART_SIZE_E */

    /** Flag to indicate whether current candidate needs to be evaluated */
    UWORD8 u1_eval_flag;

} cu_intra_satd_out_t;

/** \brief cu level parameters for SATD / RDOPT function
 *
 *  Most UWORD8 sample pointers have a UWORD16 counterpart with the same role
 *  (presumably for sample depths above 8 bits - confirm).
 */

typedef struct
{
    /** pointer to source luma pointer
    *  pointer will be pointing to CTB start location
    *  At CU level based on the CU position this pointer
    *  has to appropriately incremented
    */
    UWORD8 *pu1_luma_src;

    UWORD16 *pu2_luma_src;

    /** pointer to source chroma pointer
    *  pointer will be pointing to CTB start location
    *  At CU level based on the CU position this pointer
    *  has to appropriately incremented
    */
    UWORD8 *pu1_chrm_src;

    UWORD16 *pu2_chrm_src;

    /** pointer to recon luma pointer
    *  pointer will be pointing to CTB start location
    *  At CU level based on the CU position this pointer
    *  has to appropriately incremented
    */
    UWORD8 *pu1_luma_recon;

    UWORD16 *pu2_luma_recon;

    /** pointer to recon chroma pointer
    *  pointer will be pointing to CTB start location
    *  At CU level based on the CU position this pointer
    *  has to appropriately incremented
    */
    UWORD8 *pu1_chrm_recon;

    UWORD16 *pu2_chrm_recon;

    /* 1st pass parallel dpb buffer pointers similar to the above */
    UWORD8 *pu1_luma_recon_src;

    UWORD16 *pu2_luma_recon_src;

    UWORD8 *pu1_chrm_recon_src;

    UWORD16 *pu2_chrm_recon_src;

    /** Pointer to Subpel Plane Buffer */
    UWORD8 *pu1_sbpel_hxfy;

    /** Pointer to Subpel Plane Buffer */
    UWORD8 *pu1_sbpel_fxhy;

    /** Pointer to Subpel Plane Buffer */
    UWORD8 *pu1_sbpel_hxhy;

    /** Luma source stride */
    WORD32 i4_luma_src_stride;

    /** chroma source stride */
    WORD32 i4_chrm_src_stride;

    /** Luma recon stride */
    WORD32 i4_luma_recon_stride;

    /** chroma recon stride */
    WORD32 i4_chrm_recon_stride;

    /** ctb size */
    WORD32 i4_ctb_size;

    /** current ctb position horz */
    WORD32 i4_ctb_pos;

    /** number of PU finalized for curr CU  */
    WORD32 i4_num_pus_in_cu;

    /** number of bytes consumed for current in ecd data buf */
    WORD32 i4_num_bytes_cons;

    /** flag marking the CU as noisy (how it is derived is not visible here) */
    UWORD8 u1_is_cu_noisy;

    /** pointer to noisy flags; per 8x8 block, going by the name */
    UWORD8 *pu1_is_8x8Blk_noisy;

} enc_loop_cu_prms_t;

/**
******************************************************************************
*  @brief Pad inter pred recon context
*
*  Holds the three subpel plane pointers, the recon and first-pass recon
*  source pointers, their strides, the ctb size and the chroma array type.
******************************************************************************
*/
typedef struct
{
    /** Pointer to Subpel Plane Buffer */
    UWORD8 *pu1_sbpel_hxfy;

    /** Pointer to Subpel Plane Buffer */
    UWORD8 *pu1_sbpel_fxhy;

    /** Pointer to Subpel Plane Buffer */
    UWORD8 *pu1_sbpel_hxhy;

    /** pointer to recon luma pointer
    *  pointer will be pointing to CTB start location
    *  At CU level based on the CU position this pointer
    *  has to appropriately incremented
    */
    UWORD8 *pu1_luma_recon;

    /** pointer to recon chroma pointer
    *  pointer will be pointing to CTB start location
    *  At CU level based on the CU position this pointer
    *  has to appropriately incremented
    */
    UWORD8 *pu1_chrm_recon;

    /* For recon source 1st pass starts */

    UWORD8 *pu1_luma_recon_src;

    /** pointer to recon chroma pointer
    *  pointer will be pointing to CTB start location
    *  At CU level based on the CU position this pointer
    *  has to appropriately incremented
    */
    UWORD8 *pu1_chrm_recon_src;
    /* For recon source 1st pass ends */
    /** Luma recon stride */
    WORD32 i4_luma_recon_stride;

    /** chroma recon stride */
    WORD32 i4_chrm_recon_stride;

    /** ctb size */
    WORD32 i4_ctb_size;

    /* 0 - 400; 1 - 420; 2 - 422; 3 - 444 */
    UWORD8 u1_chroma_array_type;

} pad_interp_recon_frm_t;

/**
******************************************************************************
*  @brief inter prediction (MC) context for enc loop
******************************************************************************
*/
/* IMPORTANT: keep inter_pred_ctxt_t and inter_pred_me_ctxt_t identical */
typedef struct
{
    /** pointer to reference lists: points at array(s) of
    *  (HEVCE_MAX_REF_PICS * 2) recon_pic_buf_t pointers
    */
    recon_pic_buf_t *(*ps_ref_list)[HEVCE_MAX_REF_PICS * 2];

    /** scratch buffer for horizontal interpolation destination;
    *  sized MAX_CTB_SIZE * (MAX_CTB_SIZE + 8), 16-byte aligned
    */
    WORD16 MEM_ALIGN16 ai2_horz_scratch[MAX_CTB_SIZE * (MAX_CTB_SIZE + 8)];

    /** scratch 16 bit buffer for interpolation in l0 direction */
    WORD16 MEM_ALIGN16 ai2_scratch_buf_l0[MAX_CTB_SIZE * MAX_CTB_SIZE];

    /** scratch 16 bit buffer for interpolation in l1 direction */
    WORD16 MEM_ALIGN16 ai2_scratch_buf_l1[MAX_CTB_SIZE * MAX_CTB_SIZE];

    /** Pointer to struct containing function pointers to
    functions in the 'common' library */
    func_selector_t *ps_func_selector;

    /** common denominator (log2) used for luma weights */
    WORD32 i4_log2_luma_wght_denom;

    /** common denominator (log2) used for chroma weights */
    WORD32 i4_log2_chroma_wght_denom;

    /**  offset w.r.t frame start in horz direction (pels) */
    WORD32 i4_ctb_frm_pos_x;

    /**  offset w.r.t frame start in vert direction (pels) */
    WORD32 i4_ctb_frm_pos_y;

    /* Bit Depth of Input */
    WORD32 i4_bit_depth;

    /* 0 - 400; 1 - 420; 2 - 422; 3 - 444 */
    UWORD8 u1_chroma_array_type;

    /** weighted_pred_flag      */
    WORD8 i1_weighted_pred_flag;

    /** weighted_bipred_flag    */
    WORD8 i1_weighted_bipred_flag;

    /** Structure to describe the extra region around the frame due to the
    search range associated with distributed-mode. Entries are top, left,
    right and bottom.
    NOTE(review): the original comment says "extra CTBs" while the member
    name says "pel" - confirm the unit */
    WORD32 ai4_tile_xtra_pel[4];

} inter_pred_ctxt_t;
/* IMPORTANT: keep inter_pred_ctxt_t and inter_pred_me_ctxt_t identical */

/**
* Function pointer type for luma inter prediction of a PU.
* pv_inter_pred_ctxt       : opaque inter pred context
*                            (presumably an inter_pred_ctxt_t - confirm)
* ps_pu                    : PU to be predicted
* pv_dst_buf, dst_stride   : destination buffer and its stride
* i4_flag_inter_pred_source: NOTE(review): presumably selects the recon-source
*                            reference planes - confirm against implementations
* Returns an IV_API_CALL_STATUS_T status code.
*/
typedef IV_API_CALL_STATUS_T (*PF_LUMA_INTER_PRED_PU)(
    void *pv_inter_pred_ctxt,
    pu_t *ps_pu,
    void *pv_dst_buf,
    WORD32 dst_stride,
    WORD32 i4_flag_inter_pred_source);

/**
******************************************************************************
*  @brief  Motion predictor context structure
******************************************************************************
*/
typedef struct
{
    /** pointer to reference lists: points at array(s) of
    *  (HEVCE_MAX_REF_PICS * 2) recon_pic_buf_t pointers
    */
    recon_pic_buf_t *(*ps_ref_list)[HEVCE_MAX_REF_PICS * 2];

    /** pointer to the slice header */
    slice_header_t *ps_slice_hdr;

    /** pointer to SPS */
    sps_t *ps_sps;

    /** CTB x position, in CTB units */
    WORD32 i4_ctb_x;

    /** CTB y position, in CTB units */
    WORD32 i4_ctb_y;

    /** Log2 Parallel Merge Level - 2  */
    WORD32 i4_log2_parallel_merge_level_minus2;

    /* Number of extra CTBs external to the tile due to the search range */
    /* fetched around the tile. Entries are top, left, right and bottom */
    WORD32 ai4_tile_xtra_ctb[4];

} mv_pred_ctxt_t;

/**
******************************************************************************
*  @brief  Deblocking and Boundary strength CTB level structure
******************************************************************************
*/
typedef struct
{
    /** Array to store the packed BS values in horizontal direction;
    *  (MAX_CTB_SIZE >> 3) + 1 words
    */
    UWORD32 au4_horz_bs[(MAX_CTB_SIZE >> 3) + 1];

    /** Array to store the packed BS values in vertical direction;
    *  (MAX_CTB_SIZE >> 3) + 1 words
    */
    UWORD32 au4_vert_bs[(MAX_CTB_SIZE >> 3) + 1];

    /** CTB neighbour availability flags for deblocking:
    *  set when the CTB is not in the first CTB column / row of the frame
    */
    UWORD8 u1_not_first_ctb_col_of_frame;
    UWORD8 u1_not_first_ctb_row_of_frame;

} deblk_bs_ctb_ctxt_t;

/**
******************************************************************************
*  @brief  Deblocking CTB level structure
******************************************************************************
*/
typedef struct
{
    /**
    * BS of the last vertical 4x4 column of previous CTB
    */
    UWORD8 au1_prev_bs[MAX_CTB_SIZE >> 3];

    /**
    * BS of the last vertical 4x4 column of previous CTB
    * NOTE(review): the "_uv" suffix suggests this is the chroma counterpart
    * of au1_prev_bs - confirm
    */
    UWORD8 au1_prev_bs_uv[MAX_CTB_SIZE >> 3];

    /** pointer to top 4x4 ctb nbr structure; for accessing qp  */
    nbr_4x4_t *ps_top_ctb_nbr_4x4;

    /** pointer to left 4x4 ctb nbr structure; for accessing qp */
    nbr_4x4_t *ps_left_ctb_nbr_4x4;

    /** pointer to current 4x4 ctb nbr structure; for accessing qp */
    nbr_4x4_t *ps_cur_ctb_4x4;

    /** horizontal bs: max of 8 such contiguous bs to be computed for 64x64 ctb */
    UWORD32 *pu4_bs_horz;

    /** vertical bs: max of 8 such contiguous bs to be computed for 64x64 ctb */
    UWORD32 *pu4_bs_vert;

    /** ptr to current ctb luma pel in frame */
    UWORD8 *pu1_ctb_y;

    /** 16-bit typed counterpart of pu1_ctb_y
    *  (presumably for bit depths above 8 - confirm)
    */
    UWORD16 *pu2_ctb_y;

    /** ptr to current ctb sp interleaved chroma pel in frame */
    UWORD8 *pu1_ctb_uv;

    /** 16-bit typed counterpart of pu1_ctb_uv
    *  (presumably for bit depths above 8 - confirm)
    */
    UWORD16 *pu2_ctb_uv;

    /** pointer to the function selector structure */
    func_selector_t *ps_func_selector;

    /** left nbr buffer stride in terms of 4x4 units */
    WORD32 i4_left_nbr_4x4_strd;

    /** current  buffer stride in terms of 4x4 units */
    WORD32 i4_cur_4x4_strd;

    /** size in pels 16 / 32 / 64 */
    WORD32 i4_ctb_size;

    /** stride for luma       */
    WORD32 i4_luma_pic_stride;

    /** stride for  chroma */
    WORD32 i4_chroma_pic_stride;

    /** boolean indicating if left ctb edge is to be deblocked or not */
    WORD32 i4_deblock_left_ctb_edge;

    /** boolean indicating if top ctb edge is to be deblocked or not */
    WORD32 i4_deblock_top_ctb_edge;

    /** beta offset index */
    WORD32 i4_beta_offset_div2;

    /** tc offset index */
    WORD32 i4_tc_offset_div2;

    /** chroma cb qp offset index */
    WORD32 i4_cb_qp_indx_offset;

    /** chroma cr qp offset index */
    WORD32 i4_cr_qp_indx_offset;

    /** bit depth */
    WORD32 i4_bit_depth;

    /* 0 - 400; 1 - 420; 2 - 422; 3 - 444 */
    UWORD8 u1_chroma_array_type;

} deblk_ctb_params_t;

/**
******************************************************************************
*  @brief  Stores the BS and Qp of a CTB row. For CTB-row level deblocking
******************************************************************************
*/
typedef struct deblk_ctbrow_prms
{
    /**
    * Refer to ihevce_enc_loop_get_mem_recs() and
    * ihevce_enc_loop_init() for more info
    * regarding memory allocation to each one below.
    */

    /**
    * Stores the vertical boundary strength of a CTB row.
    */
    UWORD32 *pu4_ctb_row_bs_vert;

    /**
    * Storage is same as above. Contains horizontal BS.
    */
    UWORD32 *pu4_ctb_row_bs_horz;

    /**
    * Pointer to the CTB row's Qp storage
    */
    WORD8 *pi1_ctb_row_qp;

    /**
    * Stride of the pi1_ctb_row_qp buffer
    * NOTE(review): the original comment states the unit is WORD32 - confirm.
    * The member is signed (WORD32) despite its "u4_" prefix.
    */
    WORD32 u4_qp_buffer_stride;

    /*
    *   Pointers to the memory which contains the Qp of
    *   top 4x4 neighbour blocks for each CTB row.
    *   This memory is at frame level; one pointer per
    *   MAX_NUM_ENC_LOOP_PARALLEL entry.
    */
    WORD8 *api1_qp_top_4x4_ctb_row[MAX_NUM_ENC_LOOP_PARALLEL];

    /*
    *   Stride of the above memory location.
    *   Values in one stride correspond to one CTB row.
    *   The member is signed (WORD32) despite its "u4_" prefix.
    */
    WORD32 u4_qp_top_4x4_buf_strd;

    /* size of frame level qp buffer (signed despite the "u4_" prefix) */
    WORD32 u4_qp_top_4x4_buf_size;

} deblk_ctbrow_prms_t;

/**
******************************************************************************
*  @brief  Entropy rd opt context for cabac bit estimation and RDO
******************************************************************************
*/
typedef struct rdopt_entropy_ctxt
{
    /**
    * array of entropy contexts used during RD opt stage at CU level:
    * one best and one current is required
    */
    entropy_context_t as_cu_entropy_ctxt[2];

    /**
    * init state of entropy context models during CU RD opt stage,
    * required for saving and restoring the cabac states
    */
    UWORD8 au1_init_cabac_ctxt_states[IHEVC_CAB_CTXT_END];

    /*
    * ptr to top row cu skip flags (1 bit per 8x8 CU)
    */
    UWORD8 *pu1_cu_skip_top_row;

    /**
    * Current entropy ctxt idx
    * (presumably an index into as_cu_entropy_ctxt - confirm)
    */
    WORD32 i4_curr_buf_idx;

} rdopt_entropy_ctxt_t;

/**
******************************************************************************
*  @brief  structure to save predicted data from Inter SATD stage to Inter RD opt stage
******************************************************************************
*/
typedef struct
{
    /* Buffers to store the predicted data after motion compensation for merge
    * and skip candidates.
    * [2] : motion compensation is done for up to 5 merge candidates. The pred
    *       data after mc is stored for the first 2 candidates; from the 3rd
    *       candidate onwards, the buffer holding the data with the higher
    *       SATD cost is overwritten.
    */
    void *apv_pred_data[2];

    /** Stride of the predicted data buffers
    */
    WORD32 i4_pred_data_stride;

} merge_skip_pred_data_t;
/**
******************************************************************************
*  @brief  Structure to hold Rate control related parameters
*          for each bit-rate instance and each thread
******************************************************************************
*/
typedef struct
{
    /**
    * frame level open loop ssd
    */
    LWORD64 i8_frame_open_loop_ssd;

    /**
    * frame level open loop intra sad
    */
    UWORD32 u4_frame_open_loop_intra_sad;
    /**
    * frame level intra sad
    */
    UWORD32 u4_frame_intra_sad;

    /**
    *  frame level sad accumulator
    */
    UWORD32 u4_frame_sad_acc;

    /**
    *  frame level inter sad accumulator
    */
    UWORD32 u4_frame_inter_sad_acc;

    /**
    *  frame level intra sad accumulator
    */
    UWORD32 u4_frame_intra_sad_acc;

    /**
    *  frame level cost accumulator
    */
    LWORD64 i8_frame_cost_acc;

    /**
    *  frame level inter cost accumulator
    */
    LWORD64 i8_frame_inter_cost_acc;

    /**
    *  frame level intra cost accumulator
    */
    LWORD64 i8_frame_intra_cost_acc;

    /**
    * frame level rdopt bits accumulator
    */
    UWORD32 u4_frame_rdopt_bits;

    /**
    * frame level rdopt header bits accumulator
    */
    UWORD32 u4_frame_rdopt_header_bits;

    /* Sum of the Qps of each 8x8 block in CU.
    * An 8x8 block is used because the min CU size possible as per standard is 8
    * 0 corresponds to INTER and 1 corresponds to INTRA
    */
    WORD32 i4_qp_normalized_8x8_cu_sum[2];

    /* Count of the number of 8x8 blocks in each CU type (INTER/INTRA)
    * 0 corresponds to INTER and 1 corresponds to INTRA
    */
    WORD32 i4_8x8_cu_sum[2];

    /* SAD/Qscale accumulated over all CUs. CU size is inherently
    * taken care of in SAD.
    * NOTE(review): the two entries presumably follow the same
    * 0 = INTER / 1 = INTRA convention as above - confirm
    */
    LWORD64 i8_sad_by_qscale[2];

} enc_loop_rc_params_t;
/**
******************************************************************************
*  @brief  CU information structure. This is to store the
*  CU final out after Recursion
******************************************************************************
*/
typedef struct ihevce_enc_cu_node_ctxt_t
{
    /* CU params */
    /** CU X position in terms of min CU (8x8) units */
    UWORD8 b3_cu_pos_x : 3;

    /** CU Y position in terms of min CU (8x8) units */
    UWORD8 b3_cu_pos_y : 3;

    /** reserved bits (2) */
    UWORD8 b2_reserved : 2;

    /** CU size 2N (width or height) in pixels */
    UWORD8 u1_cu_size;

    /**
    * cu level final params for a given mode, stored by value
    * NOTE(review): the original comment described this as an array holding
    * one best and one current entry; the declaration is a single struct
    */
    enc_loop_cu_final_prms_t s_cu_prms;

    /**
    * pointer to cu level final params for a given mode
    * NOTE(review): the original comment described this as an array holding
    * one best and one current entry - confirm what this points at
    */
    enc_loop_cu_final_prms_t *ps_cu_prms;

    /* flag to indicate if current CU is the first
    CU of the Quantisation group */
    UWORD32 b1_first_cu_in_qg : 1;

    /** qp used for the CU
    * @remarks :
    */
    WORD8 i1_cu_qp;

} ihevce_enc_cu_node_ctxt_t;

/**
******************************************************************************
*  @brief  One block merge result node: costs, reference index and mv
******************************************************************************
*/
typedef struct
{
    /** SAD of this node */
    WORD32 i4_sad;

    /** MV cost of this node */
    WORD32 i4_mv_cost;

    /** Total cost of this node
    *  (presumably i4_sad + i4_mv_cost - confirm against the code that fills it)
    */
    WORD32 i4_tot_cost;

    /** Reference index */
    WORD8 i1_ref_idx;

    /** Motion vector */
    mv_t s_mv;

} block_merge_nodes_t;

/**
******************************************************************************
*  @brief  This struct is used for storing output of block merge
******************************************************************************
*/
typedef struct
{
    /* Pointers to the best result node(s), one entry per MAX_NUM_PARTS slot */
    block_merge_nodes_t *aps_best_results[MAX_NUM_PARTS];

    /* Contains the best uni dir for each partition type */
    WORD32 ai4_best_uni_dir[MAX_NUM_PARTS];

    /* Contains the best pred dir for each partition type */
    WORD32 ai4_best_pred_dir[MAX_NUM_PARTS];

    /* Total cost of this result */
    WORD32 i4_tot_cost;

    /* Partition type of this result */
    PART_TYPE_T e_part_type;
} block_merge_results_t;

/**
******************************************************************************
*  @brief  This struct is used for storing output of block merge and also
*          all of the intermediate results required
******************************************************************************
*/
typedef struct
{
    /* Best results, NUM_BEST_ME_OUTPUTS per first-dimension entry.
    * NOTE(review): the meaning of the (3 + 1) first dimension is not visible
    * here; going by the best_cu_cost comment below it is presumably the
    * single-direction entries plus one combined entry - confirm
    */
    block_merge_results_t as_best_results[3 + 1][NUM_BEST_ME_OUTPUTS];

    /* Intermediate result nodes for every partition (TOT_NUM_PARTS),
    * NUM_BEST_ME_OUTPUTS nodes each
    */
    block_merge_nodes_t as_nodes[3][TOT_NUM_PARTS][NUM_BEST_ME_OUTPUTS];

    /* Partition mask */
    WORD32 part_mask;

    /* Number of results per partition */
    WORD32 num_results_per_part;

    /* Number of best results */
    WORD32 num_best_results;

    /**
    * Overall best CU cost. While other entries store CU costs
    * in a single direction, this is the best CU cost where each
    * partition cost is evaluated as best of uni/bi
    */
    WORD32 best_cu_cost;

} block_merge_data_t;
/**
******************************************************************************
*  @brief  CU nbr information structure. This is to store the
*  neighbour information for final reconstruction function
******************************************************************************
*/
typedef struct
{
    /* Pointer to top-left nbr */
    nbr_4x4_t *ps_topleft_nbr_4x4;
    /* Pointer to left nbr */
    nbr_4x4_t *ps_left_nbr_4x4;
    /* Pointer to top nbr */
    nbr_4x4_t *ps_top_nbr_4x4;
    /* stride of left_nbr_4x4 */
    WORD32 nbr_4x4_left_strd;

    /* Pointer to CU top */
    UWORD8 *pu1_cu_top;

    /* 16-bit typed counterpart of pu1_cu_top */
    UWORD16 *pu2_cu_top;

    /* Pointer to CU top-left */
    UWORD8 *pu1_cu_top_left;

    /* 16-bit typed counterpart of pu1_cu_top_left */
    UWORD16 *pu2_cu_top_left;

    /* Pointer to CU left */
    UWORD8 *pu1_cu_left;

    /* 16-bit typed counterpart of pu1_cu_left */
    UWORD16 *pu2_cu_left;

    /* stride of left pointer */
    WORD32 cu_left_stride;
} cu_nbr_prms_t;

/** Structure to save the flags required for the Final mode Reconstruction
function. These flags are set based on quality presets and
the bit-rate we are working on */
typedef struct
{
    /** Flag to indicate whether Luma pred data needs to be recomputed in the
    final_recon function. Now disabled for all modes */
    UWORD8 u1_eval_luma_pred_data;

    /** Flag to indicate whether Chroma pred data needs to be recomputed in the
    final_recon function. Now disabled for MedSpeed only */
    UWORD8 u1_eval_chroma_pred_data;

    /** Flag to indicate whether header data needs to be recomputed in the
    final_recon function. Now disabled for all modes */
    UWORD8 u1_eval_header_data;

    /** NOTE(review): undocumented in the original; by analogy with the flags
    above this presumably indicates whether recon data needs to be recomputed
    in the final_recon function - confirm */
    UWORD8 u1_eval_recon_data;
} cu_final_recon_flags_t;

/**
******************************************************************************
*  @brief  structure to save pred data of ME cand. 1 ping-pong to store the
*  best and current luma cand. 1 buffer to store the best chroma pred
******************************************************************************
*/
typedef struct
{
    /** Pointers to store luma pred data of me/intra cand.(2) and chroma(1) */
    UWORD8 *pu1_pred_data[NUM_CU_ME_INTRA_PRED_IDX];

    /** 16-bit typed counterparts of pu1_pred_data */
    UWORD16 *pu2_pred_data[NUM_CU_ME_INTRA_PRED_IDX];

    /** Strides of the predicted data of me/intra cand.(2) and chroma(1) */
    WORD32 ai4_pred_data_stride[NUM_CU_ME_INTRA_PRED_IDX];
    /** Counter saying how many pointers are assigned */
    WORD32 i4_pointer_count;

} cu_me_intra_pred_prms_t;

/**
******************************************************************************
*  @brief  Chroma intra SATD context structure
*  NOTE(review): several original member comments carried a
*  "[0] : for 1 TU case, [1] : for 4 TU case" note on members that are not
*  indexed that way in this declaration; those notes look stale and have been
*  dropped below. The [2] first dimension of the array members is not explained
*  by the original comments - confirm its meaning.
******************************************************************************
*/
typedef struct
{
    /** Storing the inverse quantized data (cb) for the special modes*/
    WORD16 ai2_iq_data_cb[(MAX_TU_SIZE * MAX_TU_SIZE) << 1];

    /** Storing the inverse quantized data (cr) for the special modes*/
    WORD16 ai2_iq_data_cr[(MAX_TU_SIZE * MAX_TU_SIZE) << 1];

    /** Storing the scan coeffs (cb) for the special modes*/
    UWORD8 au1_scan_coeff_cb[2][(MAX_TU_IN_CTB >> 1) * MAX_SCAN_COEFFS_BYTES_4x4];

    /** Storing the scan coeffs (cr) for the special modes*/
    UWORD8 au1_scan_coeff_cr[2][(MAX_TU_IN_CTB >> 1) * MAX_SCAN_COEFFS_BYTES_4x4];

    /** Max number of bytes filled in scan coeff data (cb) per TU*/
    WORD32 ai4_num_bytes_scan_coeff_cb_per_tu[2][MAX_TU_IN_TU_EQ_DIV_2];

    /** Max number of bytes filled in scan coeff data (cr) per TU*/
    WORD32 ai4_num_bytes_scan_coeff_cr_per_tu[2][MAX_TU_IN_TU_EQ_DIV_2];

    /** Stride of the iq buffer*/
    WORD32 i4_iq_buff_stride;

    /** Storing the pred data
    The predicted data is always interleaved. Therefore the size of the buffer
    pointed to will be (((MAX_TU_SIZE * MAX_TU_SIZE) >> 2) * 2) */
    void *pv_pred_data;

    /** Predicted data stride*/
    WORD32 i4_pred_stride;

    /** Storing the cbfs (cb) for each tu
    For 1 tu case, only the 0th element will be valid*/
    UWORD8 au1_cbf_cb[2][MAX_TU_IN_TU_EQ_DIV_2];

    /** Storing the cbfs (cr) for each tu
    For 1 tu case, only the 0th element will be valid*/
    UWORD8 au1_cbf_cr[2][MAX_TU_IN_TU_EQ_DIV_2];

    /** To store the cabac ctxt model updated by the RDOPT of best chroma mode */
    UWORD8 au1_chrm_satd_updated_ctxt_models[IHEVC_CAB_CTXT_END];

    /** Best SATD chroma mode
    Values : 0(PLANAR), 1(VERT), 2(HOR), 3(DC) chroma mode per each TU */
    UWORD8 u1_best_cr_mode;

    /** Best SATD chroma mode's RDOPT cost */
    LWORD64 i8_chroma_best_rdopt;

    /* Account for coding b3_chroma_intra_pred_mode prefix and suffix bins */
    /* This is done by adding the bits for signalling chroma mode (0-3)    */
    /* and subtracting the bits for chroma mode same as luma mode (4)      */
    LWORD64 i8_cost_to_encode_chroma_mode;

    /** Best SATD chroma mode's tu bits */
    WORD32 i4_chrm_tu_bits;

    /** Storing the zero col values for each TU for cb*/
    WORD32 ai4_zero_col_cb[2][MAX_TU_IN_TU_EQ_DIV_2];

    /** Storing the zero col values for each TU for cr*/
    WORD32 ai4_zero_col_cr[2][MAX_TU_IN_TU_EQ_DIV_2];

    /** Storing the zero row values for each TU for cb*/
    WORD32 ai4_zero_row_cb[2][MAX_TU_IN_TU_EQ_DIV_2];

    /** Storing the zero row values for each TU for cr*/
    WORD32 ai4_zero_row_cr[2][MAX_TU_IN_TU_EQ_DIV_2];
} chroma_intra_satd_ctxt_t;

/**
******************************************************************************
*  @brief  Chroma RDOPT context structure
******************************************************************************
*/
typedef struct
{
    /** Chroma SATD context structure. It is an array of two
    (NUM_POSSIBLE_TU_SIZES_CHR_INTRA_SATD) to account for the TU_EQ_CU candidate
    and the TU_EQ_CU_DIV2 candidate*/
    chroma_intra_satd_ctxt_t as_chr_intra_satd_ctxt[NUM_POSSIBLE_TU_SIZES_CHR_INTRA_SATD];

    /** Flag: Chroma SATD has to be evaluated only for the HIGH QUALITY preset */
    UWORD8 u1_eval_chrm_satd;

    /** Flag: Chroma RDOPT has to be evaluated only for the HIGH QUALITY / MEDIUM SPEED preset */
    UWORD8 u1_eval_chrm_rdopt;

} ihevce_chroma_rdopt_ctxt_t;

/** Block merge output: the inter CU results together with the inter PU results */
typedef struct
{
    /** Inter CU results */
    inter_cu_results_t s_cu_results;

    /** Inter PU results */
    inter_pu_results_t s_pu_results;
} block_merge_output_t;

/**
******************************************************************************
*  @brief  Structure to store the Merge/Skip Cand. for EncLoop
******************************************************************************
*/
typedef struct
{
    /** List of all merge/skip candidates to be evaluated (SATD/RDOPT) for
    * this CU
    */
    cu_inter_cand_t as_cu_inter_merge_skip_cand[MAX_NUM_CU_MERGE_SKIP_CAND];

    /** number of merge candidates
    */
    UWORD8 u1_num_merge_cands;

    /** number of skip candidates
    */
    UWORD8 u1_num_skip_cands;

    /** number of merge/skip candidates
    *  (presumably the combined count of the two above - confirm)
    */
    UWORD8 u1_num_merge_skip_cands;

} cu_inter_merge_skip_t;

/** Structure to store the Mixed mode Cand. for EncLoop */
typedef struct
{
    /** Mixed mode inter candidates (up to MAX_NUM_MIXED_MODE_INTER_RDO_CANDS) */
    cu_inter_cand_t as_cu_data[MAX_NUM_MIXED_MODE_INTER_RDO_CANDS];

    /** Number of mixed mode type0 candidates */
    UWORD8 u1_num_mixed_mode_type0_cands;

    /** Number of mixed mode type1 candidates */
    UWORD8 u1_num_mixed_mode_type1_cands;

} cu_mixed_mode_inter_t;

/** Structure to hold the inter pred buffers used in EncLoop and their usage state */
typedef struct
{
    /* Extra entries because an additional buffer is required for */
    /* storing both cur and best during merge eval */
    /* NOTE(review): the original comment says "+2" but the array is sized */
    /* MAX_NUM_INTER_RDO_CANDS + 4 - confirm which is intended */
    void *apv_inter_pred_data[MAX_NUM_INTER_RDO_CANDS + 4];

    /* Bit field used to determine the indices of free bufs in the */
    /* 'apv_inter_pred_data' buf array */
    UWORD32 u4_is_buf_in_use;

    /* Assumption is that the same stride is used for the */
    /* entire set of buffers above and is equal to the */
    /* CU size */
    WORD32 i4_pred_stride;

} ihevce_inter_pred_buf_data_t;
/** Structure to store the Inter Cand. info in EncLoop */
typedef struct
{
    /** Pointers to the inter candidates (up to MAX_NUM_INTER_RDO_CANDS) */
    cu_inter_cand_t *aps_cu_data[MAX_NUM_INTER_RDO_CANDS];

    /** Cost of each candidate */
    UWORD32 au4_cost[MAX_NUM_INTER_RDO_CANDS];

    /** Pred buffer index of each candidate
    *  (presumably an index into ihevce_inter_pred_buf_data_t's buffer array - confirm)
    */
    UWORD8 au1_pred_buf_idx[MAX_NUM_INTER_RDO_CANDS];

    /** Source variance */
    UWORD32 u4_src_variance;

    /** Index of the worst cost in the cost array (au4_cost) */
    UWORD8 u1_idx_of_worst_cost_in_cost_array;

    /** Index of the worst cost in the pred buf array (au1_pred_buf_idx) */
    UWORD8 u1_idx_of_worst_cost_in_pred_buf_array;

    /** Number of inter candidates */
    UWORD8 u1_num_inter_cands;

} inter_cu_mode_info_t;
/**
* SAO context structure: source/recon buffer pointers and strides, boundary
* and scratch buffers, CTB position info, lambdas, parameter-set pointers,
* SAO parameters/candidates and the SAO function pointers.
*/
typedef struct
{
    /* Frame level base pointers of buffers for each ctb row to store the top pixels
    * and top left pixel for the next ctb row. These buffers are common across all threads
    */
    UWORD8 *apu1_sao_src_frm_top_luma[MAX_NUM_ENC_LOOP_PARALLEL];
    /* Ctb level pointer to buffer to store the top pixels
    * and top left pixel for the next ctb row. These buffers are common across all threads
    */
    UWORD8 *pu1_curr_sao_src_top_luma;
    /* Buffer to store the left boundary before
    * doing sao on current ctb for the next ctb in the current row
    */
    UWORD8 au1_sao_src_left_luma[MAX_CTB_SIZE];
    /* Frame level base pointers of buffers for each ctb row to store the top chroma pixels
    * and top left pixel for the next ctb row. These buffers are common across all threads
    */
    UWORD8 *apu1_sao_src_frm_top_chroma[MAX_NUM_ENC_LOOP_PARALLEL];

    /* Stride of the frame level top chroma buffer */
    WORD32 i4_frm_top_chroma_buf_stride;

    /* Ctb level pointer to buffer to store the top chroma pixels
    * and top left pixel for the next ctb row. These buffers are common across all threads
    */
    UWORD8 *pu1_curr_sao_src_top_chroma;

    /* Scratch buffer to store the left boundary before
    * doing sao on current ctb for the next ctb in the current row
    */
    UWORD8 au1_sao_src_left_chroma[MAX_CTB_SIZE * 2];

    /**
    * Luma recon buffer
    */
    UWORD8 *pu1_frm_luma_recon_buf;
    /**
    * Chroma recon buffer
    */
    UWORD8 *pu1_frm_chroma_recon_buf;
    /**
    * Luma recon buffer for curr ctb
    */
    UWORD8 *pu1_cur_luma_recon_buf;
    /**
    * Chroma recon buffer for curr ctb
    */
    UWORD8 *pu1_cur_chroma_recon_buf;
    /**
    * Luma src buffer
    */
    UWORD8 *pu1_frm_luma_src_buf;
    /**
    * Chroma src buffer
    */
    UWORD8 *pu1_frm_chroma_src_buf;
    /**
    * Luma src (input yuv) buffer for curr ctb
    */
    UWORD8 *pu1_cur_luma_src_buf;
    /**
    * Chroma src buffer for curr ctb
    */
    UWORD8 *pu1_cur_chroma_src_buf;
    /* Left luma scratch buffer required for sao RD optimisation */
    UWORD8 au1_left_luma_scratch[MAX_CTB_SIZE];

    /* Left chroma scratch buffer required for sao RD optimisation */
    /* Min size required = MAX_CTB_SIZE/2 * 2
    * Multiplied by 2 because size required is MAX_CTB_SIZE/2 each for U and V
    */
    UWORD8 au1_left_chroma_scratch[MAX_CTB_SIZE * 2];

    /* Top luma scratch buffer required for sao RD optimisation */
    UWORD8 au1_top_luma_scratch[MAX_CTB_SIZE + 2];  // +1 for top left pixel and +1 for top right

    /* Top chroma scratch buffer required for sao RD optimisation */
    UWORD8 au1_top_chroma_scratch[MAX_CTB_SIZE + 4];  // +2 for top left pixel and +2 for top right

    /* Luma scratch buffer to store the sao'ed output during sao RD optimisation */
    /* One extra row (bottom pixels) is copied to the scratch buf, but the 2d buf copy func
    copies in multiples of 4, hence MAX_CTB_SIZE + 4 */
    UWORD8 au1_sao_luma_scratch[PING_PONG][SCRATCH_BUF_STRIDE * (MAX_CTB_SIZE + 4)];

    /* Chroma scratch buffer to store the sao'ed output during sao RD optimisation */
    /* One extra row (bottom pixels) is copied to the scratch buf, but the 2d buf copy func
    copies in multiples of 4, hence MAX_CTB_SIZE + 4 */
    UWORD8 au1_sao_chroma_scratch[PING_PONG][SCRATCH_BUF_STRIDE * (MAX_CTB_SIZE + 4)];

    /**
    * CTB size
    */
    WORD32 i4_ctb_size;
    /**
    * Luma recon buffer stride
    */
    WORD32 i4_frm_luma_recon_stride;
    /**
    * Chroma recon buffer stride
    */
    WORD32 i4_frm_chroma_recon_stride;
    /**
    * Luma recon buffer stride for curr ctb
    */
    WORD32 i4_cur_luma_recon_stride;
    /**
    * Chroma recon buffer stride for curr ctb
    */
    WORD32 i4_cur_chroma_recon_stride;
    /**
    * Luma src buffer stride
    */
    WORD32 i4_frm_luma_src_stride;
    /**
    * Chroma src buffer stride
    */
    WORD32 i4_frm_chroma_src_stride;

    /* Stride of the frame level top luma buffer */
    WORD32 i4_frm_top_luma_buf_stride;
    /**
    * Luma src buffer stride for curr ctb
    */
    WORD32 i4_cur_luma_src_stride;
    /**
    * Chroma src buffer stride for curr ctb
    */
    WORD32 i4_cur_chroma_src_stride;

    /* Top luma buffer size */
    WORD32 i4_top_luma_buf_size;

    /* Top Chroma buffer size */
    WORD32 i4_top_chroma_buf_size;

    /*** Number of CTB units **/
    WORD32 i4_num_ctb_units;

    /**
    * CTB x pos
    */
    WORD32 i4_ctb_x;
    /**
    * CTB y pos
    */
    WORD32 i4_ctb_y;
    /* SAO block width */
    WORD32 i4_sao_blk_wd;

    /* SAO block height */
    WORD32 i4_sao_blk_ht;

    /* Last ctb row flag */
    WORD32 i4_is_last_ctb_row;

    /* Last ctb col flag */
    WORD32 i4_is_last_ctb_col;

    /* CTB aligned width */
    UWORD32 u4_ctb_aligned_wd;

    /* Number of ctbs in a row */
    UWORD32 u4_num_ctbs_horz;

    /* Number of ctbs in a column */
    UWORD32 u4_num_ctbs_vert;
    /**
    * Closed loop SSD Lambda
    * This is multiplied with bits for RD cost computations in SSD mode
    * This is represented in q format with shift of LAMBDA_Q_SHIFT
    */
    LWORD64 i8_cl_ssd_lambda_qf;

    /**
    * Closed loop SSD Lambda for chroma (chroma qp is different from luma qp)
    * This is multiplied with bits for RD cost computations in SSD mode
    * This is represented in q format with shift of LAMBDA_Q_SHIFT
    */
    LWORD64 i8_cl_ssd_lambda_chroma_qf;
    /**
    * Pointer to current PPS
    */
    pps_t *ps_pps;  //not used currently
    /**
    * Pointer to current SPS
    */
    sps_t *ps_sps;

    /**
    * Pointer to current slice header structure
    */
    slice_header_t *ps_slice_hdr;
    /**
    * Pointer to current frame ctb out array of structures
    */
    ctb_enc_loop_out_t *ps_ctb_out;
    /**
    *  context for cabac bit estimation used during rdopt stage
    */
    rdopt_entropy_ctxt_t *ps_rdopt_entropy_ctxt;
    /**
    * Pointer to sao_enc_t for the current ctb
    */
    sao_enc_t *ps_sao;
    /*
    * Pointers to arrays that store the sao information of the top ctb
    * This is required to decide top merge
    */
    sao_enc_t *aps_frm_top_ctb_sao[MAX_NUM_ENC_LOOP_PARALLEL];

    /*
    * Pointer to structure to store the sao parameters of (x,y)th ctb
    * for top merge of (x,y+1)th ctb
    */
    sao_enc_t *ps_top_ctb_sao;

    /* structure to store the sao parameters of (x,y)th ctb for
    * the left merge of (x+1,y)th ctb
    */
    sao_enc_t s_left_ctb_sao;

    /* Array of structures for SAO RDO candidates */
    sao_enc_t as_sao_rd_cand[MAX_SAO_RD_CAND];

    /** array of function pointers for luma sao */
    pf_sao_luma apf_sao_luma[4];

    /** array of function pointers for chroma sao */
    pf_sao_chroma apf_sao_chroma[4];

    /* Flags to do SAO luma and chroma filtering */
    WORD8 i1_slice_sao_luma_flag;

    WORD8 i1_slice_sao_chroma_flag;

#if DISABLE_SAO_WHEN_NOISY
    /* Members present only when DISABLE_SAO_WHEN_NOISY is enabled: */
    /* pointer to the ctb analyse data and its stride */
    ctb_analyse_t *ps_ctb_data;

    WORD32 i4_ctb_data_stride;
#endif

    /* Pointer to the list of optimised common utility functions */
    ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list;

} sao_ctxt_t;

/**
******************************************************************************
*  @brief  Encode loop module context structure
*
*  Working context of one encode loop instance. It carries a thread id
*  (thrd_id) and groups lambdas and QPs, scaling matrices, function pointer
*  tables, top-row / left-column neighbour buffers, RD-opt scratch storage
*  and sub-pic rate control accumulators.
******************************************************************************
*/
typedef struct
{
#if ENABLE_TU_TREE_DETERMINATION_IN_RDOPT
    void *pv_err_func_selector;
#endif

    /**
    * Quality preset for controlling number of RD opt cand
    * @sa : IHEVCE_QUALITY_CONFIG_T
    */
    WORD32 i4_quality_preset;
    /**
    * NOTE(review): the original comment here was empty. Presumably the
    * rate control pass identifier - confirm against the code that sets it
    */
    WORD32 i4_rc_pass;
    /**
    * Lambda to be multiplied with bits for SATD
    * should be equal to Lambda*Qp
    */
    WORD32 i4_satd_lamda;

    /**
    * Lambda to be multiplied with bits for SAD
    * should be equal to Lambda*Qp
    */
    WORD32 i4_sad_lamda;

    /**
    * Closed loop SSD Lambda
    * This is multiplied with bits for RD cost computations in SSD mode
    * This is represented in q format with shift of LAMBDA_Q_SHIFT
    */
    LWORD64 i8_cl_ssd_lambda_qf;

    /**
    * Closed loop SSD Lambda for chroma (chroma qp is different from luma qp)
    * This is multiplied with bits for RD cost computations in SSD mode
    * This is represented in q format with shift of LAMBDA_Q_SHIFT
    */
    LWORD64 i8_cl_ssd_lambda_chroma_qf;

    /**
    * Ratio of Closed loop SSD Lambda and Closed loop SSD Lambda for chroma
    * This is multiplied with (1 << CHROMA_COST_WEIGHING_FACTOR_Q_SHIFT)
    * to keep the precision of the ratio
    */
    UWORD32 u4_chroma_cost_weighing_factor;
    /**
    * Frame level QP to be used
    */
    WORD32 i4_frame_qp;

    WORD32 i4_frame_mod_qp;

    WORD32 i4_frame_qstep;

    UWORD8 u1_max_tr_depth;

    /**
    * CU level Qp
    */
    WORD32 i4_cu_qp;

    /**
    * CU level Qp / 6
    */
    WORD32 i4_cu_qp_div6;

    /**
    * CU level Qp % 6
    */
    WORD32 i4_cu_qp_mod6;

    /**
    *  CU level QP to be used (chroma, going by the field name)
    */
    WORD32 i4_chrm_cu_qp;

    /**
    * CU level Qp / 6 (chroma, going by the field name)
    */
    WORD32 i4_chrm_cu_qp_div6;

    /**
    * CU level Qp % 6 (chroma, going by the field name)
    */
    WORD32 i4_chrm_cu_qp_mod6;

    /** previous cu qp
    * @remarks : This needs to be remembered to handle skip cases in deblocking.
    */
    WORD32 i4_prev_cu_qp;

    /** chroma qp offset
    * @remarks : Used to calculate chroma qp and other qp related parameter at CU level
    */
    WORD32 i4_chroma_qp_offset;

    /**
    * Buffer Pointer to populate the scale matrix for all transform size
    */
    WORD16 *pi2_scal_mat;

    /**
    * Buffer Pointer to populate the rescale matrix for all transform size
    */
    WORD16 *pi2_rescal_mat;

    /** array of pointer to store the scaling matrices for
    *  all transform sizes and qp % 6 (pre computed)
    */
    WORD16 *api2_scal_mat[NUM_TRANS_TYPES * 2];

    /** array of pointer to store the re-scaling matrices for
    *  all transform sizes and qp % 6 (pre computed)
    */
    WORD16 *api2_rescal_mat[NUM_TRANS_TYPES * 2];

    /** array of function pointers for residual and
    *  forward transform for all transform sizes
    */
    pf_res_trans_luma apf_resd_trns[NUM_TRANS_TYPES];

    /** array of function pointers for residual and
    *  forward HAD transform for all transform sizes
    */
    pf_res_trans_luma_had_chroma apf_chrm_resd_trns_had[NUM_TRANS_TYPES - 2];

    /** array of function pointers for residual and
    *  forward transform for all transform sizes
    *  for chroma
    */
    pf_res_trans_chroma apf_chrm_resd_trns[NUM_TRANS_TYPES - 2];

    /** array of function pointers for quantization and
    *  inv Quant for ssd calc. for all transform sizes
    */
    pf_quant_iquant_ssd apf_quant_iquant_ssd[4];

    /** array of function pointers for inv.transform and
    *  recon for all transform sizes
    */
    pf_it_recon apf_it_recon[NUM_TRANS_TYPES];

    /** array of function pointers for inverse transform
    * and recon for all transform sizes for chroma
    */
    pf_it_recon_chroma apf_chrm_it_recon[NUM_TRANS_TYPES - 2];

    /** array of luma intra prediction function pointers */
    pf_intra_pred apf_lum_ip[NUM_IP_FUNCS];

    /** array of chroma intra prediction function pointers */
    pf_intra_pred apf_chrm_ip[NUM_IP_FUNCS];

    /* - Function pointer to cu_mode_decide function */
    /* - The 'void *' is used since one of the parameters of */
    /* this class of functions is the current structure */
    /* - This function pointer is used to choose the */
    /* appropriate function depending on whether bit_depth is */
    /* chosen as 8 bits or greater */
    /* - This function pointer's type is defined at the end */
    /* of this file */
    void *pv_cu_mode_decide;

    /* Same scheme as described for 'pv_cu_mode_decide' */
    void *pv_inter_rdopt_cu_mc_mvp;

    /* Same scheme as described for 'pv_cu_mode_decide' */
    void *pv_inter_rdopt_cu_ntu;

    /* Same scheme as described for 'pv_cu_mode_decide' */
    void *pv_intra_chroma_pred_mode_selector;

    /* Same scheme as described for 'pv_cu_mode_decide' */
    void *pv_intra_rdopt_cu_ntu;

    /* Same scheme as described for 'pv_cu_mode_decide' */
    void *pv_final_rdopt_mode_prcs;

    /* Same scheme as described for 'pv_cu_mode_decide' */
    void *pv_store_cu_results;

    /* Same scheme as described for 'pv_cu_mode_decide' */
    void *pv_enc_loop_cu_bot_copy;

    /* Same scheme as described for 'pv_cu_mode_decide' */
    void *pv_final_mode_reevaluation_with_modified_cu_qp;

    /* Same scheme as described for 'pv_cu_mode_decide' */
    void *pv_enc_loop_ctb_left_copy;

    /** Quantization rounding factor for inter and intra CUs */
    WORD32 i4_quant_rnd_factor[2];

    /**
    * Frame Buffer Pointer to store the top row luma data.
    * one pixel row in every ctb row
    */
    void *apv_frm_top_row_luma[MAX_NUM_ENC_LOOP_PARALLEL];

    /**
    * One CTB row size of Top row luma data buffer
    */
    WORD32 i4_top_row_luma_stride;

    /**
    * One frm of Top row luma data buffer
    */
    WORD32 i4_frm_top_row_luma_size;

    /**
    * Current luma row bottom data store pointer
    */
    void *pv_bot_row_luma;

    /**
    * Top luma row top data access pointer
    */
    void *pv_top_row_luma;

    /**
    * Frame Buffer Pointer to store the top row chroma data (Cb  Cr pixel interleaved )
    * one pixel row in every ctb row
    */
    void *apv_frm_top_row_chroma[MAX_NUM_ENC_LOOP_PARALLEL];

    /**
    * One CTB row size of Top row chroma data buffer (Cb  Cr pixel interleaved )
    */
    WORD32 i4_top_row_chroma_stride;

    /**
    * One frm size of Top row chroma data buffer (Cb  Cr pixel interleaved )
    */
    WORD32 i4_frm_top_row_chroma_size;

    /**
    * Current chroma row bottom data store pointer
    */
    void *pv_bot_row_chroma;

    /**
    * Top chroma row top data access pointer
    */
    void *pv_top_row_chroma;

    /**
    * Frame Buffer Pointer to store the top row neighbour modes stored at 4x4 level
    * one 4x4 row in every ctb row
    */
    nbr_4x4_t *aps_frm_top_row_nbr[MAX_NUM_ENC_LOOP_PARALLEL];

    /**
    * One CTB row size of Top row nbr 4x4 params buffer
    */
    WORD32 i4_top_row_nbr_stride;

    /**
    * One frm size of Top row nbr 4x4 params buffer
    */
    WORD32 i4_frm_top_row_nbr_size;

    /**
    * Current row nbr prms bottom data store pointer
    */
    nbr_4x4_t *ps_bot_row_nbr;

    /**
    * Top row nbr prms top data access pointer
    */
    nbr_4x4_t *ps_top_row_nbr;

    /**
    * Pointer to (1,1) location in au1_nbr_ctb_map
    */
    UWORD8 *pu1_ctb_nbr_map;

    /**
    * neighbour map buffer stride;
    */
    WORD32 i4_nbr_map_strd;

    /**
    * Array at ctb level to store the neighbour map
    * its size is 25x25 for ctb size of 64x64
    */
    UWORD8 au1_nbr_ctb_map[MAX_PU_IN_CTB_ROW + 1 + 8][MAX_PU_IN_CTB_ROW + 1 + 8];

    /**
    * Pointer to the array storing left ctb data for luma
    * some padding is added to take care of unconditional access
    */
    void *pv_left_luma_data;

    /**
    * Pointer to the array storing left ctb data for chroma
    * (cb and cr pixel interleaved)
    * some padding is added to take care of unconditional access
    */
    void *pv_left_chrm_data;

    /**
    * Array to store the left neighbour modes at 4x4 level
    */
    nbr_4x4_t as_left_col_nbr[MAX_PU_IN_CTB_ROW];

    /**
    * Array to store current CTB pred modes at a 4x4 level
    * used for prediction inside ctb
    */
    nbr_4x4_t as_ctb_nbr_arr[MAX_PU_IN_CTB_ROW * MAX_PU_IN_CTB_ROW];

    /**
    * array for storing csbf during RD opt stage at CU level
    * one best and one current is required
    * NOTE(review): the declaration holds a single buffer, not a best/current
    * pair - confirm whether the sentence above is still accurate
    */
    UWORD8 au1_cu_csbf[MAX_TU_IN_CTB_ROW * MAX_TU_IN_CTB_ROW];

    /**
    * Stride of csbf buffer. will be useful for scanning access
    * if stored in a 2D order. right now set to max tx size >> 4;
    */
    WORD32 i4_cu_csbf_strd;

    /**
    * Array to store pred modes  during SATD and RD opt stage at CU level
    * one best and one current is required
    */
    nbr_4x4_t as_cu_nbr[2][MAX_PU_IN_CTB_ROW * MAX_PU_IN_CTB_ROW];

    /**
    * Pointer to the array storing the output of reference substitution
    * process for intra CUs
    * TOP (32 x 2) + Left (32 x 2) + Top left (1) + Alignment (3)
    */
    void *pv_ref_sub_out;

    /**
    * Pointer to the array storing the filtered reference samples for intra CUs
    * TOP (32 x 2) + Left (32 x 2) + Top left (1) + Alignment (3)
    */
    void *pv_ref_filt_out;

    /**
    * Used for 3 purposes
    *
    * 1. MC Intermediate buffer
    * array for storing intermediate 16-bit value for hxhy subpel
    * generation at CTB level (+ 16) for subpel planes boundary
    * +4 is for horizontal 4pels
    *
    * 2. Temporary scratch buffer for transform and coeffs storage
    * MAX_TRANS_SIZE *2 for trans_scratch(32bit) and MAX_TRANS_SIZE *1 for trans_values
    * The first part i.e. from 0 to MAX_TRANS_SIZE is then reused for storing the quant coeffs
    * Max of both are used
    *
    * 3. MC Intermediate buffer
    * buffer for storing intermediate 16 bit values prior to conversion to 8bit in HBD
    *
    */
    MEM_ALIGN16 WORD16 ai2_scratch[(MAX_CTB_SIZE + 8 + 8) * (MAX_CTB_SIZE + 8 + 8 + 8) * 2];

    /**
    * array for storing cu level final params for a given mode
    * one best and one current is required
    */
    enc_loop_cu_final_prms_t as_cu_prms[2];

    /**
    * Scan index to be used for any given transform
    * this is a scratch variable used to communicate
    * scan idx at every transform level
    */
    WORD32 i4_scan_idx;

    /**
    * Buffer index in ping pong buffers
    * to be used SATD mode evaluations
    */
    WORD32 i4_satd_buf_idx;

    /**
    * Motion Compensation module context structure
    */
    inter_pred_ctxt_t s_mc_ctxt;

    /**
    * MV pred module context structure
    */
    mv_pred_ctxt_t s_mv_pred_ctxt;

    /**
    * Deblock BS ctb structure
    */
    deblk_bs_ctb_ctxt_t s_deblk_bs_prms;

    /**
    * Deblocking ctb structure
    */
    deblk_ctb_params_t s_deblk_prms;

    /**
    * Deblocking structure. For ctb-row level
    */
    deblk_ctbrow_prms_t s_deblk_ctbrow_prms;

    /**
    * Deblocking enable flag
    */
    WORD32 i4_deblock_type;

    /**
    *  context for cabac bit estimation used during rdopt stage
    */
    rdopt_entropy_ctxt_t s_rdopt_entropy_ctxt;

    /**
    * Context models stored for RDopt store and restore purpose
    */
    UWORD8 au1_rdopt_init_ctxt_models[IHEVC_CAB_CTXT_END];

    /**
    * current picture slice type
    */
    WORD8 i1_slice_type;

    /**
    * strong_intra_smoothing_enable_flag
    */
    WORD8 i1_strong_intra_smoothing_enable_flag;

    /** Pointer to Dep Mngr for controlling Top-Right CU dependency */
    void *pv_dep_mngr_enc_loop_cu_top_right;

    /** Pointer to Dep Mngr for controlling Deblocking Top dependency */
    void *pv_dep_mngr_enc_loop_dblk;

    /** pointer to store the cabac states at end of second CTB in current row */
    UWORD8 *pu1_curr_row_cabac_state;

    /** pointer to copy the cabac states at start of first CTB in current row */
    UWORD8 *pu1_top_rt_cabac_state;
    /** flag to indicate rate control mode.
    * @remarks :  To enable CU level qp modulation only when required.
    */
    WORD8 i1_cu_qp_delta_enable;

    /** Entropy sync enable flag
    * NOTE(review): the original summary line here read "flag to indicate
    * rate control mode", which looks copied from the field above
    */
    WORD8 i1_entropy_coding_sync_enabled_flag;

    /** Use SATD or SAD for best merge candidate evaluation */
    WORD32 i4_use_satd_for_merge_eval;

    UWORD8 u1_use_early_cbf_data;

    /** Use SATD or SAD for best CU merge candidate evaluation */
    WORD32 i4_use_satd_for_cu_merge;

    /** Maximum number of merge candidates to be evaluated */
    WORD32 i4_max_merge_candidates;

    /** Flag to indicate whether current picture needs to be deblocked,
    padded and hpel planes need to be generated.
    These are turned off typically in non reference pictures when psnr
    and recon dump is disabled
    */
    WORD32 i4_deblk_pad_hpel_cur_pic;

    /* Array of structures for storing mc predicted data for
    * merge and skip modes
    */
    merge_skip_pred_data_t as_merge_skip_pred_data[MAX_NUM_CU_MERGE_SKIP_CAND];

    /* Sum the Qps of each 8*8 block in CU
    * 8*8 block is considered as Min CU size possible as per standard is 8
    * 0 corresponds to INTER and 1 corresponds to INTRA
    *
    * NOTE(review): the text above does not appear to describe the fields
    * that follow. The arrays below each have MAX_HEVC_QP_12bit + 1 entries
    * and are presumably lambda / weighing factor tables indexed by QP -
    * confirm against the code that fills them
    */
    LWORD64 i8_cl_ssd_lambda_qf_array[MAX_HEVC_QP_12bit + 1];
    UWORD32 au4_chroma_cost_weighing_factor_array[MAX_HEVC_QP_12bit + 1];
    LWORD64 i8_cl_ssd_lambda_chroma_qf_array[MAX_HEVC_QP_12bit + 1];
    WORD32 i4_satd_lamda_array[MAX_HEVC_QP_12bit + 1];
    WORD32 i4_sad_lamda_array[MAX_HEVC_QP_12bit + 1];

    /************************************************************************/
    /* The fields with the string 'type2' in their names are required */
    /* when both 8bit and hbd lambdas are needed. The lambdas corresponding */
    /* to the bit_depth != internal_bit_depth are stored in these fields */
    /************************************************************************/
    LWORD64 i8_cl_ssd_type2_lambda_qf_array[MAX_HEVC_QP_12bit + 1];
    LWORD64 i8_cl_ssd_type2_lambda_chroma_qf_array[MAX_HEVC_QP_12bit + 1];
    WORD32 i4_satd_type2_lamda_array[MAX_HEVC_QP_12bit + 1];
    WORD32 i4_sad_type2_lamda_array[MAX_HEVC_QP_12bit + 1];

    /* Lokesh: Added to find if the CU is the first to be coded in the group */
    WORD32 i4_is_first_cu_qg_coded;

    /* Chroma RDOPT related parameters */
    ihevce_chroma_rdopt_ctxt_t s_chroma_rdopt_ctxt;

    /* Structure to save pred data of ME/Intra cand */
    cu_me_intra_pred_prms_t s_cu_me_intra_pred_prms;

    /* Structure to save the flags required for Final mode Reconstruction
    function. These flags are set based on quality presets and bit-rate
    we are working on */
    cu_final_recon_flags_t s_cu_final_recon_flags;

    /* Parameter selecting at which level RDOQ will be applied:
    0 - RDOQ disabled
    1 - RDOQ enabled during RDOPT for all candidates
    2 - RDOQ enabled only for the final candidate
    NOTE(review): RDOQ_LEVELS_T in this file orders its enumerators as
    NO_RDOQ, BEST_CAND_RDOQ, ALL_CAND_RDOQ (i.e. 1 = final candidate only,
    2 = all candidates), which is the reverse of the list above - confirm
    which one the code follows */
    WORD32 i4_rdoq_level;

    /* Parameter selecting at which level Quant rounding factors are computed:
    FIXED_QUANT_ROUNDING       : Fixed Quant rounding values are used
    NCTB_LEVEL_QUANT_ROUNDING  : NCTB level Computed Quant rounding values are used
    CTB_LEVEL_QUANT_ROUNDING   : CTB level Computed Quant rounding values are used
    CU_LEVEL_QUANT_ROUNDING    : CU level Computed Quant rounding values are used
    TU_LEVEL_QUANT_ROUNDING    : TU level Computed Quant rounding values are used*/
    WORD32 i4_quant_rounding_level;

    /* Parameter selecting at which level Quant rounding factors are computed:
    CHROMA_QUANT_ROUNDING    : Chroma Quant rounding values are used for chroma */
    WORD32 i4_chroma_quant_rounding_level;

    /* Parameter selecting at which level SBH will be applied:
    0 - SBH disabled
    1 - SBH enabled during RDOPT for all candidates
    2 - SBH enabled only for the final candidate
    NOTE(review): SBH_LEVELS_T in this file orders its enumerators as
    NO_SBH, BEST_CAND_SBH, ALL_CAND_SBH (i.e. 1 = final candidate only,
    2 = all candidates), which is the reverse of the list above - confirm
    which one the code follows */
    WORD32 i4_sbh_level;

    /* Parameter selecting at which level ZERO CBF RDO will be applied:
    0 - ZCBF disabled
    1 - ZCBF enabled during RDOPT for all candidates
    2 - ZCBF enabled only for the final candidate
    NOTE(review): ZCBF_LEVELS_T in this file only defines NO_ZCBF and
    ZCBF_ENABLE - confirm whether value 2 is still used
    */
    WORD32 i4_zcbf_rdo_level;

    /*RDOQ-SBH context structure*/
    rdoq_sbh_ctxt_t s_rdoq_sbh_ctxt;

    /** Structure to store the Merge/Skip Cand. for EncLoop */
    cu_inter_merge_skip_t s_cu_inter_merge_skip;
    /** Structure to store the Mixed mode Cand. for EncLoop */
    cu_mixed_mode_inter_t s_mixed_mode_inter_cu;

    ihevce_inter_pred_buf_data_t s_pred_buf_data;

    void *pv_422_chroma_intra_pred_buf;

    WORD32 i4_max_num_inter_rdopt_cands;

    /* Output Struct per each CU during recursions */
    ihevce_enc_cu_node_ctxt_t as_enc_cu_ctxt[MAX_CU_IN_CTB + 1];

    /* Used to store best inter candidate. Used only when */
    /* 'CU modulated QP override' is enabled */
    cu_inter_cand_t as_best_cand[MAX_CU_IN_CTB + 1];

    cu_inter_cand_t *ps_best_cand;

    UWORD8 au1_cu_init_cabac_state_a_priori[MAX_CU_IN_CTB + 1][IHEVC_CAB_CTXT_END];

    /* Pointer to an array of IHEVC_CAB_CTXT_END bytes, i.e. the same type */
    /* as one row of au1_cu_init_cabac_state_a_priori */
    UWORD8 (*pau1_curr_cu_a_priori_cabac_state)[IHEVC_CAB_CTXT_END];

    /* Used to store pred data of each CU in the CTB. */
    /* Used only when 'CU modulated QP override' is enabled */
    void *pv_CTB_pred_luma;

    void *pv_CTB_pred_chroma;

    /**
    * Pointer to array for storing recon during SATD and RD opt stage at CU level
    * one best and one current is required.Luma and chroma together
    */
    void *pv_cu_luma_recon;

    /**
    * Pointer to array for storing recon during SATD and RD opt stage at CU level
    * one best and one current is required.Luma and chroma together
    */
    void *pv_cu_chrma_recon;

    /**
    * Array to store pred modes  during SATD and RD opt stage at CU level
    * one best and one current is required
    * NOTE(review): the declaration holds a single buffer, not a best/current
    * pair - confirm whether the sentence above is still accurate
    */
    nbr_4x4_t as_cu_recur_nbr[MAX_PU_IN_CTB_ROW * MAX_PU_IN_CTB_ROW];

    /**
    * Pointer to Array to store pred modes  during SATD and RD opt stage at CU level
    * one best and one current is required
    */
    nbr_4x4_t *ps_cu_recur_nbr;

    /**
    * Context models stored for CU recursion parent evaluation
    */
    UWORD8 au1_rdopt_recur_ctxt_models[4][IHEVC_CAB_CTXT_END];

    ihevce_enc_cu_node_ctxt_t *ps_enc_out_ctxt;

    /**
    * array for storing coeffs during RD opt stage at CU level
    * one best and one current is required. Luma and chroma together
    * (the fixed-size array is commented out; a pointer is declared instead)
    */
    /*UWORD8 au1_cu_recur_coeffs[MAX_LUMA_COEFFS_CTB + MAX_CHRM_COEFFS_CTB];*/

    UWORD8 *pu1_cu_recur_coeffs;

    UWORD8 *apu1_cu_level_pingpong_coeff_buf_addr[2];

    WORD16 *api2_cu_level_pingpong_deq_buf_addr[2];

    UWORD8 *pu1_ecd_data;

    /* OPT: flag to skip parent CU=4TU eval during recursion */
    UWORD8 is_parent_cu_rdopt;

    /**
    *   Array of structs containing block merge data for
    *   4 32x32 CU's in indices 1 - 4 and 64x64 CU at 0
    *
    *   NOTE(review): no such array follows; this comment appears to be left
    *   over from a removed field and does not describe the flag below
    */
    UWORD8 u1_cabac_states_next_row_copied_flag;

    UWORD8 u1_cabac_states_first_cu_copied_flag;

    UWORD32 u4_cur_ctb_wd;

    UWORD32 u4_cur_ctb_ht;

    /* thread id of the current context */
    WORD32 thrd_id;

    /** Number of processing threads created run time */
    WORD32 i4_num_proc_thrds;

    /* Instance number of bit-rate for multiple bit-rate encode */
    WORD32 i4_bitrate_instance_num;

    WORD32 i4_num_bitrates;

    WORD32 i4_enc_frm_id;

    /* Flag to indicate if chroma needs to be considered for cost calculation */
    WORD32 i4_consider_chroma_cost;

    /* Number of modes to be evaluated for intra */
    WORD32 i4_num_modes_to_evaluate_intra;

    /* Number of modes to be evaluated for inter */
    WORD32 i4_num_modes_to_evaluate_inter;
    /*pointers for struct to hold RC parameters for each bit-rate instance */
    enc_loop_rc_params_t
        *aaps_enc_loop_rc_params[MAX_NUM_ENC_LOOP_PARALLEL][IHEVCE_MAX_NUM_BITRATES];

    /** Pointer to structure containing function pointers of common*/
    func_selector_t *ps_func_selector;

    /* Flag to control Top Right Sync for during Merge */
    UWORD8 u1_use_top_at_ctb_boundary;

    UWORD8 u1_is_input_data_hbd;

    UWORD8 u1_bit_depth;

    /* 0 - 400; 1 - 420; 2 - 422; 3 - 444 */
    UWORD8 u1_chroma_array_type;

    rc_quant_t *ps_rc_quant_ctxt;

    sao_ctxt_t s_sao_ctxt_t;

    /* Offset to get the Qp for the last CU of upper CTB-row.
    This offset is from the current tile top row QP map start.
    This will only be consumed by the first CU of current CTB-row
    iff [it is skip && entropy sync is off] */
    WORD32 *pi4_offset_for_last_cu_qp;

    /* NOTE(review): the two modifiers below are of type double even though */
    /* their names carry the 'i4_' prefix */
    double i4_lamda_modifier;
    double i4_uv_lamda_modifier;
    WORD32 i4_temporal_layer_id;

    UWORD8 u1_disable_intra_eval;

    WORD32 i4_quant_round_tu[2][32 * 32];

    WORD32 *pi4_quant_round_factor_tu_0_1[5];
    WORD32 *pi4_quant_round_factor_tu_1_2[5];

    WORD32 i4_quant_round_4x4[2][4 * 4];
    WORD32 i4_quant_round_8x8[2][8 * 8];
    WORD32 i4_quant_round_16x16[2][16 * 16];
    WORD32 i4_quant_round_32x32[2][32 * 32];

    WORD32 *pi4_quant_round_factor_cu_ctb_0_1[5];
    WORD32 *pi4_quant_round_factor_cu_ctb_1_2[5];

    WORD32 i4_quant_round_cr_4x4[2][4 * 4];
    WORD32 i4_quant_round_cr_8x8[2][8 * 8];
    WORD32 i4_quant_round_cr_16x16[2][16 * 16];

    WORD32 *pi4_quant_round_factor_cr_cu_ctb_0_1[3];
    WORD32 *pi4_quant_round_factor_cr_cu_ctb_1_2[3];
    /* cost for not coding cu residue i.e forcing no residue syntax as 1 */
    LWORD64 i8_cu_not_coded_cost;

    /* dependency manager for forward ME  sync */
    void *pv_dep_mngr_encloop_dep_me;

    /* NOTE(review): the two arrays below hold LWORD64 elements even though */
    /* their names carry the 'ai4_' prefix */
    LWORD64 ai4_source_satd_8x8[64];

    LWORD64 ai4_source_chroma_satd[256];

    /* NOTE(review): both u1_is_refPic and i4_is_ref_pic (below) exist in */
    /* this struct - confirm whether they are meant to carry the same value */
    UWORD8 u1_is_refPic;

    WORD32 i4_qp_mod;

    WORD32 i4_is_ref_pic;

    WORD32 i4_chroma_format;

    /* NOTE(review): i4_temporal_layer_id is also declared above - confirm */
    /* how the two fields differ */
    WORD32 i4_temporal_layer;

    WORD32 i4_use_const_lamda_modifier;

    double f_i_pic_lamda_modifier;

    LWORD64 i8_distortion;

    WORD32 i4_use_ctb_level_lamda;

    float f_str_ratio;

    /* Flag to indicate if current frame is to be shared with other clients.
    Used only in distributed-encoding */
    WORD32 i4_share_flag;

    /* Pointer to the current recon being processed.
    Needed for enabling TMVP in dist-encoding */
    void *pv_frm_recon;

    ihevce_cmn_opt_func_t s_cmn_opt_func;

    /* The ME analogue to the struct above was not included since */
    /* that would have entailed inclusion of all ME specific */
    /* header files */
    /*FT_SAD_EVALUATOR **/

    /*FT_SAD_EVALUATOR **/
    void *pv_evalsad_pt_npu_mxn_8bit;
    UWORD8 u1_enable_psyRDOPT;

    UWORD8 u1_is_stasino_enabled;

    UWORD32 u4_psy_strength;
    /*Sub PIC rc context */

    WORD32 i4_sub_pic_level_rc;
    WORD32 i4_num_ctb_for_out_scale;

    /**
     * Accumulated bits of all cu for required CTBS estimated during RDO evaluation.
     * Required for sub pic level RC. Reset when required CU/CTB count is reached.
     */
    UWORD32 u4_total_cu_bits;

    UWORD32 u4_total_cu_bits_mul_qs;

    UWORD32 u4_total_cu_hdr_bits;

    UWORD32 u4_cu_tot_bits_into_qscale;

    UWORD32 u4_cu_tot_bits;

    /*Scale added to the current qscale, output from sub pic rc*/
    WORD32 i4_cu_qp_sub_pic_rc;

    /*Frame level L1 IPE sad*/
    LWORD64 i8_frame_l1_ipe_sad;

    /*Frame level L0 IPE satd*/
    LWORD64 i8_frame_l0_ipe_satd;

    /*Frame level L1 ME sad*/
    LWORD64 i8_frame_l1_me_sad;

    /*Frame level L1 activity factor*/
    LWORD64 i8_frame_l1_activity_fact;
    /*bits estimated for frame calculated for sub pic rc bit control */
    WORD32 ai4_frame_bits_estimated[MAX_NUM_ENC_LOOP_PARALLEL][IHEVCE_MAX_NUM_BITRATES];
    /** I Scene cut */
    WORD32 i4_is_I_scenecut;

    /** Non Scene cut */
    WORD32 i4_is_non_I_scenecut;

    /** Frames for which online/offline model is not valid */
    WORD32 i4_is_model_valid;

    /** Steady State Frame */
    //WORD32 i4_is_steady_state;

    WORD32 i4_is_first_query;

    /* Pointer to Tile params base */
    void *pv_tile_params_base;

    /** The index of column tile for which it is working */
    WORD32 i4_tile_col_idx;

    WORD32 i4_max_search_range_horizontal;

    WORD32 i4_max_search_range_vertical;

    WORD32 i4_is_ctb_qp_modified;

    WORD32 i4_display_num;

    WORD32 i4_pred_qp;

    /*assumption of qg size is 8x8 block size*/
    WORD32 ai4_qp_qg[8 * 8];

    WORD32 i4_last_cu_qp_from_prev_ctb;

    WORD32 i4_prev_QP;

    UWORD8 u1_max_inter_tr_depth;

    UWORD8 u1_max_intra_tr_depth;

} ihevce_enc_loop_ctxt_t;

/*****************************************************************************/
/* Enums                                                                     */
/*****************************************************************************/

/**
* @brief RDOQ_LEVELS_T: This enumeration specifies the RDOQ mode of operation
*
*  NO_RDOQ        (0) : RDOQ is not performed
*  BEST_CAND_RDOQ (1) : RDOQ for final candidate only
*  ALL_CAND_RDOQ  (2) : RDOQ for all candidates
*
*  The numeric values are spelled out so that they are visible at a glance;
*  they are the same values the compiler assigns implicitly.
*/
typedef enum
{
    NO_RDOQ = 0,
    BEST_CAND_RDOQ = 1,
    ALL_CAND_RDOQ = 2
} RDOQ_LEVELS_T;

/**
* @brief QUANT_ROUNDING_COEFF_LEVELS_T: This enumeration specifies the level
*        at which the quant rounding values are computed
*
*  FIXED_QUANT_ROUNDING      (0) : Fixed Quant rounding values are used
*  NCTB_LEVEL_QUANT_ROUNDING (1) : NCTB level computed Quant rounding values are used
*  CTB_LEVEL_QUANT_ROUNDING  (2) : CTB level computed Quant rounding values are used
*  CU_LEVEL_QUANT_ROUNDING   (3) : CU level computed Quant rounding values are used
*  TU_LEVEL_QUANT_ROUNDING   (4) : TU level computed Quant rounding values are used
*  CHROMA_QUANT_ROUNDING     (5) : Chroma Quant rounding values are used for chroma
*
*  Default is for all candidates; based on RDOQ_LEVELS_T it may be restricted
*  to the best candidate
*/
typedef enum
{
    FIXED_QUANT_ROUNDING = 0,
    NCTB_LEVEL_QUANT_ROUNDING = 1,
    CTB_LEVEL_QUANT_ROUNDING = 2,
    CU_LEVEL_QUANT_ROUNDING = 3,
    TU_LEVEL_QUANT_ROUNDING = 4,
    CHROMA_QUANT_ROUNDING = 5
} QUANT_ROUNDING_COEFF_LEVELS_T;

/*****************************************************************************/
/* Enums                                                                     */
/*****************************************************************************/

/**
* @brief SBH_LEVELS_T: This enumeration specifies the SBH mode of operation
*
*  NO_SBH        (0) : SBH is not performed
*  BEST_CAND_SBH (1) : SBH for final candidate only
*  ALL_CAND_SBH  (2) : SBH for all candidates
*/
typedef enum
{
    NO_SBH = 0,
    BEST_CAND_SBH = 1,
    ALL_CAND_SBH = 2
} SBH_LEVELS_T;

/**
* @brief ZCBF_LEVELS_T: This enumeration specifies the ZeroCBF RDO mode of operation
*
*  NO_ZCBF     (0) : ZCBF RDO is not performed
*  ZCBF_ENABLE (1) : ZCBF RDO is enabled (for all candidates)
*/
typedef enum
{
    NO_ZCBF = 0,
    ZCBF_ENABLE = 1
} ZCBF_LEVELS_T;

/**
******************************************************************************
*  @brief  Encode loop master context structure
*
*  Holds the pointers to the individual encode loop contexts together with
*  the storage and dependency manager handles that are kept per frame in
*  parallel and per bit-rate instance.
******************************************************************************
*/
typedef struct
{
    /** Array of pointers to encode loop context structures */
    ihevce_enc_loop_ctxt_t *aps_enc_loop_thrd_ctxt[MAX_NUM_FRM_PROC_THRDS_ENC];

    /** Number of processing threads created run time */
    WORD32 i4_num_proc_thrds;

    /**
    *  Array of top row cu skip flags (1 bit per 8x8CU)
    */
    UWORD8 au1_cu_skip_top_row[HEVCE_MAX_WIDTH >> 6];

    /** Context models stored at the end of second CTB in a row,
    *  stored in packed form pState[bits6-1] | MPS[bit0]
    *  for each CTB row
    *  using entropy sync model in RD opt
    */
    UWORD8 au1_ctxt_models[MAX_NUM_CTB_ROWS_FRM][IHEVC_CAB_CTXT_END];

    /** Dependency manager for controlling EncLoop Top-Right CU dependency
    * One per each bit-rate and one per each frame in parallel
    */
    void *aapv_dep_mngr_enc_loop_cu_top_right[MAX_NUM_ENC_LOOP_PARALLEL][IHEVCE_MAX_NUM_BITRATES];

    /** Dependency manager for controlling Deblocking Top dependency
    * One per each bit-rate and one per each frame in parallel
    */
    void *aapv_dep_mngr_enc_loop_dblk[MAX_NUM_ENC_LOOP_PARALLEL][IHEVCE_MAX_NUM_BITRATES];

    /** number of bit-rate instances running */
    WORD32 i4_num_bitrates;

    /** number of enc frames running in parallel */
    WORD32 i4_num_enc_loop_frm_pllel;

    /* Pointer to Tile params base */
    void *pv_tile_params_base;
    /* Offset to get the Qp for the last CU of upper CTB-row.
    This offset is from the current tile top row QP map start.

    This will only be consumed by the first CU of current CTB-row
    iff [it is skip && entropy sync is off]
    There is one entry for every tile-column because the offset remains
    constant for all tiles lying in a tile-column */
    WORD32 ai4_offset_for_last_cu_qp[MAX_TILE_COLUMNS];
} ihevce_enc_loop_master_ctxt_t;

/**
******************************************************************************
*  @brief  This struct is used for storing data required by the block merge
*          function
*
*  Most members are pointers to data owned elsewhere plus scalar settings;
*  the struct itself only embeds the apu1_wt_inp pointer array.
******************************************************************************
*/
typedef struct
{
    /* Pointers to per block-size data (8x8, 16x16, 32x32, 64x64) */
    block_data_8x8_t *ps_8x8_data;

    block_data_16x16_t *ps_16x16_data;

    block_data_32x32_t *ps_32x32_data;

    block_data_64x64_t *ps_64x64_data;

    part_type_results_t **ps_32x32_results;

    cur_ctb_cu_tree_t *ps_cu_tree;

    ipe_l0_ctb_analyse_for_me_t *ps_cur_ipe_ctb;

    mv_pred_ctxt_t *ps_mv_pred_ctxt;

    /* Pointer to an array of (HEVCE_MAX_REF_PICS * 2) recon_pic_buf_t pointers */
    recon_pic_buf_t *(*aps_ref_list)[HEVCE_MAX_REF_PICS * 2];

    /* Top / left / current neighbour data at 4x4 level */
    nbr_4x4_t *ps_top_nbr_4x4;

    nbr_4x4_t *ps_left_nbr_4x4;

    nbr_4x4_t *ps_curr_nbr_4x4;

    UWORD8 *pu1_inp;

    UWORD8 *pu1_ctb_nbr_map;

    WORD32 i4_nbr_map_strd;

    WORD32 inp_stride;

    WORD32 i4_ctb_x_off;

    WORD32 i4_ctb_y_off;

    WORD32 use_satd_for_err_calc;

    WORD32 lambda;

    WORD32 lambda_q_shift;

    WORD32 frm_qstep;

    WORD32 num_4x4_in_ctb;

    UWORD8 *pu1_wkg_mem;

    UWORD8 **ppu1_pred;

    UWORD8 u1_bidir_enabled;

    UWORD8 u1_max_tr_depth;

    WORD32 i4_ctb_pos;

    WORD32 i4_ctb_size;

    UWORD8 *apu1_wt_inp[MAX_REFS_SEARCHABLE + 1];

    /** Pointer of Dep Mngr for EncLoop Top-Right CU dependency */
    void *pv_dep_mngr_enc_loop_cu_top_right;
    /** The current cu row no. for Dep Manager to Check */
    WORD32 i4_dep_mngr_cur_cu_row_no;
    /** The Top cu row no. for Dep Manager to Check */
    WORD32 i4_dep_mngr_top_cu_row_no;

    WORD8 i1_quality_preset;

    /* Flag to control Top Right Sync for during Merge */
    UWORD8 u1_use_top_at_ctb_boundary;

} block_merge_input_t;

/* Structure which stores the info regarding the TUs present in the CU. */
/* One instance describes a single TU: its size, its x/y offset and the */
/* cost and early cbf values associated with it. */
typedef struct tu_prms_t
{
    /* TU size (units not visible here - TODO confirm) */
    UWORD8 u1_tu_size;

    /* x offset of the TU (reference point and units - TODO confirm) */
    UWORD8 u1_x_off;

    /* y offset of the TU (reference point and units - TODO confirm) */
    UWORD8 u1_y_off;

    WORD32 i4_tu_cost;

    WORD32 i4_early_cbf;

} tu_prms_t;

/* Bundle of pointers handed to the CU final update step. */
/* Every member is a pointer (or pointer to pointer), so the pointed-to */
/* values can be modified through this struct. */
/* NOTE(review): presumably these are the CTB-row output cursors that are */
/* advanced once a CU is finalised - confirm against the consumer. */
typedef struct
{
    cu_enc_loop_out_t **pps_cu_final;

    pu_t **pps_row_pu;

    tu_enc_loop_out_t **pps_row_tu;

    UWORD8 **ppu1_row_ecd_data;

    WORD32 *pi4_num_pus_in_ctb;

    WORD32 *pi4_last_cu_pos_in_ctb;

    WORD32 *pi4_last_cu_size;

    UWORD8 *pu1_num_cus_in_ctb_out;

} cu_final_update_prms;

/* Parameters for final mode processing of a CU: source, prediction and */
/* recon buffers with their strides, the CU position / size / qp and a few */
/* control flags. */
/* NOTE(review): presumably consumed by the function reached through */
/* 'pv_final_rdopt_mode_prcs' in ihevce_enc_loop_ctxt_t - confirm. */
typedef struct
{
    cu_nbr_prms_t *ps_cu_nbr_prms;

    cu_inter_cand_t *ps_best_inter_cand;

    enc_loop_chrm_cu_buf_prms_t *ps_chrm_cu_buf_prms;

    WORD32 packed_pred_mode;

    WORD32 rd_opt_best_idx;

    /* Source buffer and its stride */
    void *pv_src;

    WORD32 src_strd;

    /* Prediction buffer and its stride */
    void *pv_pred;

    WORD32 pred_strd;

    /* Chroma prediction buffer and its stride */
    void *pv_pred_chrm;

    WORD32 pred_chrm_strd;

    UWORD8 *pu1_final_ecd_data;

    /* csbf buffer and its stride */
    UWORD8 *pu1_csbf_buf;

    WORD32 csbf_strd;

    /* Luma recon buffer and its stride */
    void *pv_luma_recon;

    WORD32 recon_luma_strd;

    /* Chroma recon buffer and its stride */
    void *pv_chrm_recon;

    WORD32 recon_chrma_strd;

    UWORD8 u1_cu_pos_x;

    UWORD8 u1_cu_pos_y;

    UWORD8 u1_cu_size;

    WORD8 i1_cu_qp;

    UWORD8 u1_will_cabac_state_change;

    UWORD8 u1_recompute_sbh_and_rdoq;

    UWORD8 u1_is_first_pass;

    /* Present only when the noise term build option is enabled */
#if USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS
    UWORD8 u1_is_cu_noisy;
#endif

} final_mode_process_prms_t;

/**
 * State holding a best inter candidate together with luma and chroma
 * prediction buffers embedded by value.
 *
 * Passed by pointer to functions of type pf_cu_mode_decide and
 * pf_store_cu_results. Note that the two embedded arrays make this a
 * large object (2 * MAX_CU_SIZE * MAX_CU_SIZE bytes each).
 */
typedef struct
{
    /** Best candidate, stored by value */
    cu_inter_cand_t s_best_cand;

    /* Luma prediction samples.                                         */
    /* The size is twice of what is required to ensure availability */
    /* of adequate space for 'HBD' case */
    UWORD8 au1_pred_luma[MAX_CU_SIZE * MAX_CU_SIZE * 2];

    /* Chroma prediction samples.                                       */
    /* The size is twice of what is required to ensure availability */
    /* of adequate space for 422 case */
    UWORD8 au1_pred_chroma[MAX_CU_SIZE * MAX_CU_SIZE * 2];
} final_mode_state_t;

/**
 * Parameter block for the mixed inter modes selector.
 *
 * NOTE(review): member descriptions are derived from names and types only;
 * which members are inputs and which are outputs, and the units of the
 * strides, are not visible in this header - confirm against the selector
 * implementation.
 */
typedef struct
{
    /** Datastore for mixed-mode inter candidates */
    cu_mixed_mode_inter_t *ps_mixed_modes_datastore;

    /** ME candidates (count in u1_num_me_cands) */
    cu_inter_cand_t *ps_me_cands;

    /** Merge candidates (count in u1_num_merge_cands) */
    cu_inter_cand_t *ps_merge_cands;

    /** MV prediction context */
    mv_pred_ctxt_t *ps_mv_pred_ctxt;

    /** Inter prediction (motion compensation) context */
    inter_pred_ctxt_t *ps_mc_ctxt;

    /** CTB neighbour map */
    UWORD8 *pu1_ctb_nbr_map;

    /** Source buffer (untyped) */
    void *pv_src;

    /** Neighbour 4x4 buffer of the CU */
    nbr_4x4_t *ps_cu_nbr_buf;

    /** Left neighbour 4x4 entries */
    nbr_4x4_t *ps_left_nbr_4x4;

    /** Top neighbour 4x4 entries */
    nbr_4x4_t *ps_top_nbr_4x4;

    /** Top-left neighbour 4x4 entry */
    nbr_4x4_t *ps_topleft_nbr_4x4;

    /** Stride of the CTB neighbour map */
    WORD32 i4_ctb_nbr_map_stride;

    /** Stride of the source buffer */
    WORD32 i4_src_strd;

    /** Stride of the left neighbour 4x4 buffer */
    WORD32 i4_nbr_4x4_left_strd;

    /** Size of the CU */
    UWORD8 u1_cu_size;

    /** Horizontal position of the CU */
    UWORD8 u1_cu_pos_x;

    /** Vertical position of the CU */
    UWORD8 u1_cu_pos_y;

    /** Number of ME candidates */
    UWORD8 u1_num_me_cands;

    /** Number of merge candidates */
    UWORD8 u1_num_merge_cands;

    /** Upper limit on the number of mixed-mode candidates to select */
    UWORD8 u1_max_num_mixed_mode_cands_to_select;

    /** Maximum number of merge candidates */
    UWORD8 u1_max_merge_candidates;

    /** Flag: whether SATD is used for merge evaluation */
    UWORD8 u1_use_satd_for_merge_eval;

} ihevce_mixed_inter_modes_selector_prms_t;

/**
 * Per-plane data stored in a node of the TU tree (see tu_tree_node_t, which
 * embeds one instance for luma and two each for Cb and Cr).
 *
 * NOTE(review): member descriptions are derived from names and types only -
 * confirm the exact semantics against the TU tree implementation.
 *
 * The structure layout depends on ENABLE_INTER_ZCU_COST: every translation
 * unit using this type must be compiled with the same value of that macro.
 */
typedef struct
{
    /** SSD value (64-bit) */
    LWORD64 i8_ssd;

    /** Cost value (64-bit) */
    LWORD64 i8_cost;

    /* Member present only when ENABLE_INTER_ZCU_COST is non-zero */
#if ENABLE_INTER_ZCU_COST
    /** Cost of the not-coded alternative (64-bit) */
    LWORD64 i8_not_coded_cost;
#endif

    /** SAD value */
    UWORD32 u4_sad;

    /** Bit count */
    WORD32 i4_bits;

    /** Number of bytes used for entropy-coding data */
    WORD32 i4_num_bytes_used_for_ecd;

    /** Zero-column information (encoding not visible here - TODO confirm) */
    WORD32 i4_zero_col;

    /** Zero-row information (encoding not visible here - TODO confirm) */
    WORD32 i4_zero_row;

    /** CBF value (presumably coded block flag - TODO confirm) */
    UWORD8 u1_cbf;

    /** Identifier of the recon buffer in use */
    UWORD8 u1_reconBufId;

    /** Flag: whether this node's data is valid */
    UWORD8 u1_is_valid_node;

    /** Size of the block described by this node */
    UWORD8 u1_size;

    /** Horizontal position of the block */
    UWORD8 u1_posx;

    /** Vertical position of the block */
    UWORD8 u1_posy;
} tu_node_data_t;

/**
 * Node of the TU tree. Each node links to up to four child nodes
 * (tl / tr / bl / br) and carries per-plane data for luma, Cb and Cr.
 *
 * NOTE(review): how an absent child is represented (e.g. NULL pointer vs.
 * u1_is_valid_node == 0) is not visible in this header - confirm against
 * the tree construction code.
 */
typedef struct tu_tree_node_t
{
    /** Child node: tl (presumably top-left quadrant) */
    struct tu_tree_node_t *ps_child_node_tl;

    /** Child node: tr (presumably top-right quadrant) */
    struct tu_tree_node_t *ps_child_node_tr;

    /** Child node: bl (presumably bottom-left quadrant) */
    struct tu_tree_node_t *ps_child_node_bl;

    /** Child node: br (presumably bottom-right quadrant) */
    struct tu_tree_node_t *ps_child_node_br;

    /** Luma data of this node */
    tu_node_data_t s_luma_data;

    /* Cb data of this node.                           */
    /* 2 because of the 2 subTU's when input is 422 */
    tu_node_data_t as_cb_data[2];

    /* Cr data of this node; two entries, as for as_cb_data */
    tu_node_data_t as_cr_data[2];

    /** Flag: whether this node is valid */
    UWORD8 u1_is_valid_node;

} tu_tree_node_t;

/*****************************************************************************/
/* Extern Variable Declarations                                              */
/*****************************************************************************/

/*****************************************************************************/
/* Extern Function Declarations                                              */
/*****************************************************************************/

/*****************************************************************************/
/* Typedefs                                                                  */
/*****************************************************************************/
/**
 * Function-pointer type for a CU mode decision routine.
 *
 * Takes the enc-loop context, the CU parameters, the CU analysis data, a
 * final mode state, an entropy-coding data buffer and collocated PU
 * information (PU array, PU map and a start PU index). Returns a 64-bit
 * value.
 *
 * NOTE(review): the meaning of the return value (presumably an RD cost) and
 * the in/out direction of each pointer are not visible in this header -
 * confirm against the functions assigned to this type.
 */
typedef LWORD64 (*pf_cu_mode_decide)(
    ihevce_enc_loop_ctxt_t *ps_ctxt,
    enc_loop_cu_prms_t *ps_cu_prms,
    cu_analyse_t *ps_cu_analyse,
    final_mode_state_t *ps_final_mode_state,
    UWORD8 *pu1_ecd_data,
    pu_col_mv_t *ps_col_pu,
    UWORD8 *pu1_col_pu_map,
    WORD32 col_start_pu_idx);

/**
 * Function-pointer type for the inter RD-opt MC / MVP stage of a CU.
 *
 * Takes the enc-loop context, one inter candidate, the CU size and
 * position, the left / top / top-left 4x4 neighbour entries with the left
 * neighbour stride, and the index of the current buffer. Returns a 64-bit
 * value.
 *
 * NOTE(review): the meaning of the return value (presumably a cost) and the
 * units of cu_size / cu_pos_x / cu_pos_y are not visible in this header -
 * confirm against the functions assigned to this type.
 */
typedef LWORD64 (*pf_inter_rdopt_cu_mc_mvp)(
    ihevce_enc_loop_ctxt_t *ps_ctxt,
    cu_inter_cand_t *ps_inter_cand,
    WORD32 cu_size,
    WORD32 cu_pos_x,
    WORD32 cu_pos_y,
    nbr_4x4_t *ps_left_nbr_4x4,
    nbr_4x4_t *ps_top_nbr_4x4,
    nbr_4x4_t *ps_topleft_nbr_4x4,
    WORD32 nbr_4x4_left_strd,
    WORD32 curr_buf_idx);

/**
 * Function-pointer type for the inter RD-opt NTU stage of a CU.
 *
 * Takes the enc-loop context, the CU parameters, the source buffer, the CU
 * size and position, the index of the current buffer, the chroma CU buffer
 * parameters, one inter candidate, the CU analysis data and an alpha
 * stim multiplier. Returns a 64-bit value.
 *
 * NOTE(review): the meaning of the return value (presumably an RD cost) and
 * the scaling of i4_alpha_stim_multiplier are not visible in this header -
 * confirm against the functions assigned to this type.
 */
typedef LWORD64 (*pf_inter_rdopt_cu_ntu)(
    ihevce_enc_loop_ctxt_t *ps_ctxt,
    enc_loop_cu_prms_t *ps_cu_prms,
    void *pv_src,
    WORD32 cu_size,
    WORD32 cu_pos_x,
    WORD32 cu_pos_y,
    WORD32 curr_buf_idx,
    enc_loop_chrm_cu_buf_prms_t *ps_chrm_cu_buf_prms,
    cu_inter_cand_t *ps_inter_cand,
    cu_analyse_t *ps_cu_analyse,
    WORD32 i4_alpha_stim_multiplier);

/**
 * Function-pointer type for the intra chroma prediction mode selector.
 *
 * Takes the enc-loop context, the chroma CU buffer parameters, the CU
 * analysis data, the current RD-opt index, a TU mode, an alpha stim
 * multiplier and a noisy-CU flag. Returns nothing; any result is
 * presumably written through the pointer arguments - TODO confirm.
 *
 * Note: u1_is_cu_noisy is part of this signature unconditionally, whereas
 * the member of the same name in final_mode_process_prms_t exists only when
 * USE_NOISE_TERM_IN_ZERO_CODING_DECISION_ALGORITHMS is non-zero.
 */
typedef void (*pf_intra_chroma_pred_mode_selector)(
    ihevce_enc_loop_ctxt_t *ps_ctxt,
    enc_loop_chrm_cu_buf_prms_t *ps_chrm_cu_buf_prms,
    cu_analyse_t *ps_cu_analyse,
    WORD32 rd_opt_curr_idx,
    WORD32 tu_mode,
    WORD32 i4_alpha_stim_multiplier,
    UWORD8 u1_is_cu_noisy);

/**
 * Function-pointer type for the intra RD-opt NTU stage of a CU.
 *
 * Takes the enc-loop context, the CU parameters, a prediction buffer with
 * its stride, the chroma CU buffer parameters, the luma mode(s), the CU
 * analysis data, the current source buffer, the left / top / top-left
 * neighbour buffers of the CU, the left and top 4x4 neighbour entries with
 * their strides, the index of the current buffer, a processing-mode
 * selector and an alpha stim multiplier. Returns a 64-bit value.
 *
 * NOTE(review): the meaning of the return value (presumably an RD cost),
 * the valid values of func_proc_mode and the in/out direction of each
 * pointer are not visible in this header - confirm against the functions
 * assigned to this type.
 */
typedef LWORD64 (*pf_intra_rdopt_cu_ntu)(
    ihevce_enc_loop_ctxt_t *ps_ctxt,
    enc_loop_cu_prms_t *ps_cu_prms,
    void *pv_pred_org,
    WORD32 pred_strd_org,
    enc_loop_chrm_cu_buf_prms_t *ps_chrm_cu_buf_prms,
    UWORD8 *pu1_luma_mode,
    cu_analyse_t *ps_cu_analyse,
    void *pv_curr_src,
    void *pv_cu_left,
    void *pv_cu_top,
    void *pv_cu_top_left,
    nbr_4x4_t *ps_left_nbr_4x4,
    nbr_4x4_t *ps_top_nbr_4x4,
    WORD32 nbr_4x4_left_strd,
    WORD32 cu_left_stride,
    WORD32 curr_buf_idx,
    WORD32 func_proc_mode,
    WORD32 i4_alpha_stim_multiplier);

/**
 * Function-pointer type for the final RD-opt mode processing routine.
 *
 * Takes the enc-loop context and a final_mode_process_prms_t parameter
 * block. Returns nothing.
 */
typedef void (*pf_final_rdopt_mode_prcs)(
    ihevce_enc_loop_ctxt_t *ps_ctxt, final_mode_process_prms_t *ps_prms);

/**
 * Function-pointer type for the routine that stores the results of a CU.
 *
 * Takes the enc-loop context, the CU parameters and a final mode state.
 * Returns nothing.
 *
 * NOTE(review): whether ps_final_state is read, written or both is not
 * visible in this header - confirm against the functions assigned here.
 */
typedef void (*pf_store_cu_results)(
    ihevce_enc_loop_ctxt_t *ps_ctxt,
    enc_loop_cu_prms_t *ps_cu_prms,
    final_mode_state_t *ps_final_state);

/**
 * Function-pointer type for the enc-loop CU bottom copy routine.
 *
 * Takes the enc-loop context, the CU parameters, the CU node context of the
 * encoder output, and the current CU position expressed both within the row
 * and within the CTB. Returns nothing.
 *
 * NOTE(review): what is copied and to where (presumably bottom-edge data of
 * the CU kept for later neighbours) is not visible in this header - confirm.
 */
typedef void (*pf_enc_loop_cu_bot_copy)(
    ihevce_enc_loop_ctxt_t *ps_ctxt,
    enc_loop_cu_prms_t *ps_cu_prms,
    ihevce_enc_cu_node_ctxt_t *ps_enc_out_ctxt,
    WORD32 curr_cu_pos_in_row,
    WORD32 curr_cu_pos_in_ctb);

/**
 * Function-pointer type for the enc-loop CTB left copy routine.
 *
 * Takes the enc-loop context and the CU parameters. Returns nothing.
 *
 * NOTE(review): what is copied and to where (presumably data kept as the
 * left neighbour for the next CTB) is not visible in this header - confirm.
 */
typedef void (*pf_enc_loop_ctb_left_copy)(
    ihevce_enc_loop_ctxt_t *ps_ctxt, enc_loop_cu_prms_t *ps_cu_prms);

#endif /* _IHEVCE_ENC_LOOP_STRUCTS_H_ */