1 /******************************************************************************
2  *
3  * Copyright (C) 2018 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
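/**
*******************************************************************************
* @file
*  ihevce_stasino_helpers.c
*
* @brief
*  Helper functions for block variance computation, noise-term (stimulus)
*  injection into distortion and psycho-visual RD cost calculation.
*
* @author
*  Ittiam
*
* @par List of Functions:
*  - ihevce_calc_variance()
*  - ihevce_calc_variance_signed()
*  - ihevce_calc_chroma_variance()
*  - ihevce_inject_stim_into_distortion()
*  - ihevce_determine_cu_noise_based_on_8x8Blk_data()
*  - ihevce_psy_rd_cost_croma()
*  - ihevce_psy_rd_cost()
*  - ihevce_calc_stim_injected_variance()
*  - ihevce_calc_variance_for_diff_weights()
*
* @remarks
*  None
*
*******************************************************************************
*/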
37 
38 /*****************************************************************************/
39 /* File Includes                                                             */
40 /*****************************************************************************/
41 /* System include files */
42 #include <stdio.h>
43 #include <stdlib.h>
44 #include <assert.h>
45 #include <string.h>
46 
47 /* User include files */
48 #include "ihevc_typedefs.h"
49 #include "itt_video_api.h"
50 #include "ihevce_api.h"
51 
52 #include "rc_cntrl_param.h"
53 #include "rc_frame_info_collector.h"
54 #include "rc_look_ahead_params.h"
55 
56 #include "ihevc_defs.h"
57 #include "ihevc_structs.h"
58 #include "ihevc_platform_macros.h"
59 #include "ihevc_deblk.h"
60 #include "ihevc_itrans_recon.h"
61 #include "ihevc_chroma_itrans_recon.h"
62 #include "ihevc_chroma_intra_pred.h"
63 #include "ihevc_intra_pred.h"
64 #include "ihevc_inter_pred.h"
65 #include "ihevc_mem_fns.h"
66 #include "ihevc_padding.h"
67 #include "ihevc_weighted_pred.h"
68 #include "ihevc_sao.h"
69 #include "ihevc_resi_trans.h"
70 #include "ihevc_quant_iquant_ssd.h"
71 #include "ihevc_cabac_tables.h"
72 
73 #include "ihevce_defs.h"
74 #include "ihevce_lap_enc_structs.h"
75 #include "ihevce_multi_thrd_structs.h"
76 #include "ihevce_me_common_defs.h"
77 #include "ihevce_had_satd.h"
78 #include "ihevce_error_codes.h"
79 #include "ihevce_bitstream.h"
80 #include "ihevce_cabac.h"
81 #include "ihevce_rdoq_macros.h"
82 #include "ihevce_function_selector.h"
83 #include "ihevce_enc_structs.h"
84 #include "ihevce_entropy_structs.h"
85 #include "ihevce_cmn_utils_instr_set_router.h"
86 #include "ihevce_enc_loop_structs.h"
87 #include "ihevce_stasino_helpers.h"
88 
89 /*****************************************************************************/
90 /* Function Definitions                                                      */
91 /*****************************************************************************/
92 
93 /**
94 *******************************************************************************
95 *
96 * @brief
97 *  This function calculates the variance of given data set.
98 *
99 * @par Description:
100 *  This function is mainly used to find the variance of the block of pixel values.
101 *  The block can be rectangular also. Single pass variance calculation
102 *  implementation.
103 *
104 * @param[in] p_input
105 *  The input buffer to calculate the variance.
106 *
107 * @param[out] pi4_mean
108 *  Pointer ot the mean of the datset
109 *
110 * @param[out] pi4_variance
111 *  Pointer tot he variabce of the data set
112 *
113 * @param[in] u1_is_hbd
114 *  1 if the data is in  high bit depth
115 *
116 * @param[in] stride
117 *  Stride for the input buffer
118 *
119 * @param[in] block_height
120 *  height of the pixel block
121 *
122 * @param[in] block_width
123 *  width of the pixel block
124 *
125 * @remarks
126 *  None
127 *
128 *******************************************************************************
129 */
ihevce_calc_variance(void * pv_input,WORD32 i4_stride,WORD32 * pi4_mean,UWORD32 * pu4_variance,UWORD8 u1_block_height,UWORD8 u1_block_width,UWORD8 u1_is_hbd,UWORD8 u1_disable_normalization)130 void ihevce_calc_variance(
131     void *pv_input,
132     WORD32 i4_stride,
133     WORD32 *pi4_mean,
134     UWORD32 *pu4_variance,
135     UWORD8 u1_block_height,
136     UWORD8 u1_block_width,
137     UWORD8 u1_is_hbd,
138     UWORD8 u1_disable_normalization)
139 {
140     UWORD8 *pui1_buffer;  // pointer for 8 bit usecase
141     WORD32 i, j;
142     WORD32 total_elements;
143 
144     LWORD64 mean;
145     ULWORD64 variance;
146     ULWORD64 sum;
147     ULWORD64 sq_sum;
148 
149     /* intialisation */
150     total_elements = u1_block_height * u1_block_width;
151     mean = 0;
152     variance = 0;
153     sum = 0;
154     sq_sum = 0;
155 
156     /* handle the case of 8/10 bit depth separately */
157     if(!u1_is_hbd)
158     {
159         pui1_buffer = (UWORD8 *)pv_input;
160 
161         /* loop over all the values in the block */
162         for(i = 0; i < u1_block_height; i++)
163         {
164             /* loop over a row in the block */
165             for(j = 0; j < u1_block_width; j++)
166             {
167                 sum += pui1_buffer[i * i4_stride + j];
168                 sq_sum += (pui1_buffer[i * i4_stride + j] * pui1_buffer[i * i4_stride + j]);
169             }
170         }
171 
172         if(!u1_disable_normalization)
173         {
174             mean = sum / total_elements;
175             variance =
176                 ((total_elements * sq_sum) - (sum * sum)) / (total_elements * (total_elements));
177         }
178         else
179         {
180             mean = sum;
181             variance = ((total_elements * sq_sum) - (sum * sum));
182         }
183     }
184 
185     /* copy back the values to the output variables */
186     *pi4_mean = mean;
187     *pu4_variance = variance;
188 }
189 
190 /**
191 *******************************************************************************
192 *
193 * @brief
194 *  This function calcluates the variance of given data set which is WORD16
195 *
196 * @par Description:
197 *  This function is mainly used to find the variance of the block of pixel values.
198 *  Single pass variance calculation implementation.
199 *
200 * @param[in] pv_input
201 *  The input buffer to calculate the variance.
202 *
203 *
204 * @param[in] stride
205 *  Stride for the input buffer
206 *
207 * @param[out] pi4_mean
208 *  Pointer ot the mean of the datset
209 *
210 * @param[out] pi4_variance
211 *  Pointer tot he variabce of the data set
212 *
213 * @param[in] block_height
214 *  height of the pixel block
215 *
216 * @param[in] block_width
217 *  width of the pixel block
218 *
219 *
220 * @remarks
221 *  None
222 *
223 *******************************************************************************/
ihevce_calc_variance_signed(WORD16 * pv_input,WORD32 i4_stride,WORD32 * pi4_mean,UWORD32 * pu4_variance,UWORD8 u1_block_height,UWORD8 u1_block_width)224 void ihevce_calc_variance_signed(
225     WORD16 *pv_input,
226     WORD32 i4_stride,
227     WORD32 *pi4_mean,
228     UWORD32 *pu4_variance,
229     UWORD8 u1_block_height,
230     UWORD8 u1_block_width)
231 {
232     WORD16 *pi2_buffer;  // poinbter for 10 bit use case
233 
234     WORD32 i, j;
235     WORD32 total_elements;
236 
237     LWORD64 mean;
238     LWORD64 variance;
239     LWORD64 sum;
240     LWORD64 sq_sum;
241 
242     /* intialisation */
243     total_elements = u1_block_height * u1_block_width;
244     mean = 0;
245     variance = 0;
246     sum = 0;
247     sq_sum = 0;
248 
249     pi2_buffer = pv_input;
250 
251     for(i = 0; i < u1_block_height; i++)
252     {
253         for(j = 0; j < u1_block_width; j++)
254         {
255             sum += pi2_buffer[i * i4_stride + j];
256             sq_sum += (pi2_buffer[i * i4_stride + j] * pi2_buffer[i * i4_stride + j]);
257         }
258     }
259 
260     mean = sum;  /// total_elements;
261     variance = ((total_elements * sq_sum) - (sum * sum));  // / (total_elements * (total_elements) )
262 
263     /* copy back the values to the output variables */
264     *pi4_mean = mean;
265     *pu4_variance = variance;
266 }
267 
268 /**
269 *******************************************************************************
270 *
271 * @brief
272 *  This function calculates the variance of a chrominance plane for 420SP data
273 *
274 * @par Description:
275 *  This function is mainly used to find the variance of the block of pixel values.
276 *  The block can be rectangular also. Single pass variance calculation
277 *  implementation.
278 *
279 * @param[in] p_input
280 *  The input buffer to calculate the variance.
281 *
282 * @param[in] stride
283 *  Stride for the input buffer
284 *
285 * @param[out] pi4_mean
286 *  Pointer ot the mean of the datset
287 *
288 * @param[out] pi4_variance
289 *  Pointer tot he variabce of the data set
290 *
291 * @param[in] block_height
292 *  height of the pixel block
293 *
294 * @param[in] block_width
295 *  width of the pixel block
296 *
297 * @param[in] u1_is_hbd
298 *  1 if the data is in  high bit depth
299 *
300 * @param[in] e_chroma_plane
301 *  is U or V
302 *
303 * @remarks
304 *  None
305 *
306 *******************************************************************************
307 */
ihevce_calc_chroma_variance(void * pv_input,WORD32 i4_stride,WORD32 * pi4_mean,UWORD32 * pu4_variance,UWORD8 u1_block_height,UWORD8 u1_block_width,UWORD8 u1_is_hbd,CHROMA_PLANE_ID_T e_chroma_plane)308 void ihevce_calc_chroma_variance(
309     void *pv_input,
310     WORD32 i4_stride,
311     WORD32 *pi4_mean,
312     UWORD32 *pu4_variance,
313     UWORD8 u1_block_height,
314     UWORD8 u1_block_width,
315     UWORD8 u1_is_hbd,
316     CHROMA_PLANE_ID_T e_chroma_plane)
317 {
318     UWORD8 *pui1_buffer;  // pointer for 8 bit usecase
319     WORD32 i, j;
320     WORD32 total_elements;
321 
322     LWORD64 mean;
323     ULWORD64 variance;
324     LWORD64 sum;
325     LWORD64 sq_sum;
326 
327     /* intialisation */
328     total_elements = u1_block_height * u1_block_width;
329     mean = 0;
330     variance = 0;
331     sum = 0;
332     sq_sum = 0;
333 
334     /* handle the case of 8/10 bit depth separately */
335     if(!u1_is_hbd)
336     {
337         pui1_buffer = (UWORD8 *)pv_input;
338 
339         pui1_buffer += e_chroma_plane;
340 
341         /* loop over all the values in the block */
342         for(i = 0; i < u1_block_height; i++)
343         {
344             /* loop over a row in the block */
345             for(j = 0; j < u1_block_width; j++)
346             {
347                 sum += pui1_buffer[i * i4_stride + j * 2];
348                 sq_sum += (pui1_buffer[i * i4_stride + j * 2] * pui1_buffer[i * i4_stride + j * 2]);
349             }
350         }
351 
352         mean = sum / total_elements;
353         variance = ((total_elements * sq_sum) - (sum * sum)) / (total_elements * (total_elements));
354     }
355 
356     /* copy back the values to the output variables */
357     *pi4_mean = mean;
358     *pu4_variance = variance;
359 }
360 
/**
*******************************************************************************
*
* @brief
*  Scales a distortion value by a noise term derived from the source and
*  prediction variances, unless psy-RDOPT is enabled.
*
* @par Description:
*  When u1_enable_psyRDOPT is non-zero the distortion is returned untouched.
*  Otherwise the variances of the square source and prediction blocks are
*  computed (ihevce_calc_variance() when e_chroma_plane is NULL_PLANE,
*  ihevce_calc_chroma_variance() for any other plane), fed to
*  ihevce_compute_noise_term(), and the resulting term is applied to the
*  distortion through MULTIPLY_STIM_WITH_DISTORTION.
*
* @param[in] pv_src
*  Source block
*
* @param[in] i4_src_stride
*  Stride of the source block
*
* @param[in] pv_pred
*  Prediction block
*
* @param[in] i4_pred_stride
*  Stride of the prediction block
*
* @param[in] i8_distortion
*  Distortion to be modified
*
* @param[in] i4_alpha_stim_multiplier
*  Multiplier forwarded to ihevce_compute_noise_term()
*
* @param[in] u1_blk_size
*  Block size; used as both height and width
*
* @param[in] u1_is_hbd
*  High bit depth flag, forwarded to the variance functions
*
* @param[in] u1_enable_psyRDOPT
*  Non-zero to bypass the noise-term injection
*
* @param[in] e_chroma_plane
*  NULL_PLANE for luma, otherwise the chroma component to process
*
* @return
*  The (possibly modified) distortion
*
*******************************************************************************
*/
LWORD64 ihevce_inject_stim_into_distortion(
    void *pv_src,
    WORD32 i4_src_stride,
    void *pv_pred,
    WORD32 i4_pred_stride,
    LWORD64 i8_distortion,
    WORD32 i4_alpha_stim_multiplier,
    UWORD8 u1_blk_size,
    UWORD8 u1_is_hbd,
    UWORD8 u1_enable_psyRDOPT,
    CHROMA_PLANE_ID_T e_chroma_plane)
{
    UWORD32 u4_src_variance;
    UWORD32 u4_pred_variance;
    WORD32 i4_mean; /* required by the variance API, value not used here */
    WORD32 i4_noise_term;

    /* psy-RDOPT and noise-term injection are mutually exclusive */
    if(u1_enable_psyRDOPT)
    {
        return i8_distortion;
    }

    if(NULL_PLANE != e_chroma_plane)
    {
        ihevce_calc_chroma_variance(
            pv_src,
            i4_src_stride,
            &i4_mean,
            &u4_src_variance,
            u1_blk_size,
            u1_blk_size,
            u1_is_hbd,
            e_chroma_plane);

        ihevce_calc_chroma_variance(
            pv_pred,
            i4_pred_stride,
            &i4_mean,
            &u4_pred_variance,
            u1_blk_size,
            u1_blk_size,
            u1_is_hbd,
            e_chroma_plane);
    }
    else
    {
        ihevce_calc_variance(
            pv_src,
            i4_src_stride,
            &i4_mean,
            &u4_src_variance,
            u1_blk_size,
            u1_blk_size,
            u1_is_hbd,
            0);

        ihevce_calc_variance(
            pv_pred,
            i4_pred_stride,
            &i4_mean,
            &u4_pred_variance,
            u1_blk_size,
            u1_blk_size,
            u1_is_hbd,
            0);
    }

    i4_noise_term =
        ihevce_compute_noise_term(i4_alpha_stim_multiplier, u4_src_variance, u4_pred_variance);

    MULTIPLY_STIM_WITH_DISTORTION(i8_distortion, i4_noise_term, STIM_Q_FORMAT, ALPHA_Q_FORMAT);

    return i8_distortion;
}
437 
/**
*******************************************************************************
*
* @brief
*  Decides whether a CU is noisy from the per-8x8-block noise flags.
*
* @par Description:
*  A CU of size 8 (or smaller) takes the flag stored for its block, read from
*  index (u1_cu_x_pos / 8) + u1_cu_y_pos. A larger CU is split into four
*  quadrants of half the size; each quadrant is classified by the same rule
*  and the CU is reported noisy when at least two quadrants are noisy.
*
* @param[in] pu1_is_8x8Blk_noisy
*  Array of per-8x8-block noise flags
*
* @param[in] u1_cu_x_pos
*  Horizontal position of the CU
*
* @param[in] u1_cu_y_pos
*  Vertical position of the CU
*  NOTE(review): the index formula matches a raster layout with 8 flags per
*  row only when this is a multiple of 8 - confirm against the callers
*
* @param[in] u1_cu_size
*  CU size; expected to be 8, 16, 32 or 64
*
* @return
*  The block's flag for a CU of size <= 8, otherwise 1 if the CU is noisy
*  and 0 if it is not
*
*******************************************************************************
*/
UWORD8 ihevce_determine_cu_noise_based_on_8x8Blk_data(
    UWORD8 *pu1_is_8x8Blk_noisy, UWORD8 u1_cu_x_pos, UWORD8 u1_cu_y_pos, UWORD8 u1_cu_size)
{
    const UWORD8 u1_half_size = u1_cu_size / 2;
    UWORD8 u1_num_noisy_children = 0;
    UWORD8 u1_quadrant;

    /* "<= 8" rather than "== 8": a size that never halves to exactly 8 */
    /* (0, 4, 18, ...) would otherwise recurse without ever terminating  */
    if(u1_cu_size <= 8)
    {
        const UWORD8 u1_blk_index = (u1_cu_x_pos / 8) + u1_cu_y_pos;

        return pu1_is_8x8Blk_noisy[u1_blk_index];
    }

    /* quadrant order: top-left, top-right, bottom-left, bottom-right */
    for(u1_quadrant = 0; u1_quadrant < 4; u1_quadrant++)
    {
        const UWORD8 u1_x = u1_cu_x_pos + ((u1_quadrant & 1) ? u1_half_size : 0);
        const UWORD8 u1_y = u1_cu_y_pos + ((u1_quadrant & 2) ? u1_half_size : 0);

        u1_num_noisy_children += ihevce_determine_cu_noise_based_on_8x8Blk_data(
            pu1_is_8x8Blk_noisy, u1_x, u1_y, u1_half_size);
    }

    return (u1_num_noisy_children >= 2);
}
466 
467 /*!
468 ******************************************************************************
469 * \if Function name : ihevce_psy_rd_cost_croma \endif
470 *
471 * \brief
472 *    Calculates the psyco visual cost for RD opt. This is
473 *
474 * \param[in] pui4_source_satd
475 *   This is the pointer to the array of 8x8 satd of the corresponding source CTB. This is pre calculated.
476 * \param[in] *pui1_recon
477 *   This si the pointer to the pred data.
478 * \param[in] recon_stride
479 *   This si the pred stride
480 * \param[in] pic_type
481 *   Picture type.
482 * \param[in] layer_id
483 *   Indicates the temporal layer.
484 * \param[in] lambda
485 *   This is the weighting factor for the cost.
486 * \param[in] is_hbd
487 *   This is the high bit depth flag which indicates if the bit depth of the pixels is 10 bit or 8 bit.
488 * \param[in] sub_sampling_type
489 *   This is the chroma subsampling type. 11 - for 420 and 13 for 422
490 * \return
491 *    the cost for the psyRDopt
492 *
493 * \author
494 *  Ittiam
495 *
496 *****************************************************************************
497 */
ihevce_psy_rd_cost_croma(LWORD64 * pui4_source_satd,void * p_recon,WORD32 recon_stride_vert,WORD32 recond_stride_horz,WORD32 cu_size_luma,WORD32 pic_type,WORD32 layer_id,WORD32 lambda,WORD32 start_index,WORD32 is_hbd,WORD32 sub_sampling_type,ihevce_cmn_opt_func_t * ps_cmn_utils_optimised_function_list)498 LWORD64 ihevce_psy_rd_cost_croma(
499     LWORD64 *pui4_source_satd,
500     void *p_recon,
501     WORD32 recon_stride_vert,
502     WORD32 recond_stride_horz,
503     WORD32 cu_size_luma,
504     WORD32 pic_type,
505     WORD32 layer_id,
506     WORD32 lambda,
507     WORD32 start_index,
508     WORD32 is_hbd,
509     WORD32 sub_sampling_type,
510     ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list)
511 {
512     /* declare local variables to store the SATD values for the pred  for the current block. */
513     LWORD64 psy_rd_cost;
514     UWORD32 lambda_mod;
515     WORD32 psy_factor;
516 
517     /* declare local variables */
518     WORD32 i;
519     WORD32 cu_total_size;
520     WORD32 num_comp_had_blocks;
521 
522     UWORD8 *pu1_l0_block;
523     UWORD8 *pu1_l0_block_prev;
524     UWORD8 *pu1_recon;
525     WORD32 ht_offset;
526     WORD32 wd_offset;
527     WORD32 cu_ht;
528     WORD32 cu_wd;
529 
530     WORD32 num_horz_blocks;
531 
532     WORD16 pi2_residue_had[64];
533     /* this is used as a buffer with all values equal to 0. This is emulate the case with
534        pred being zero in HAD fucntion */
535     UWORD8 ai1_zeros_buffer[64];
536 
537     WORD32 had_block_size;
538     LWORD64 source_satd;  // to hold source for current 8x8 block
539     LWORD64 recon_satd;  // holds the current recon 8x8 satd
540 
541     WORD32 index_for_src_satd;
542 
543     (void)recond_stride_horz;
544     (void)pic_type;
545     (void)layer_id;
546     if(!is_hbd)
547     {
548         pu1_recon = (UWORD8 *)p_recon;
549     }
550 
551     /**** initialize the variables ****/
552     had_block_size = 4;
553 
554     if(sub_sampling_type == 1)  // 420
555     {
556         cu_ht = cu_size_luma / 2;
557         cu_wd = cu_size_luma / 2;
558     }
559     else
560     {
561         cu_ht = cu_size_luma;
562         cu_wd = cu_size_luma / 2;
563     }
564 
565     num_horz_blocks = 2 * cu_wd / had_block_size;  //ctb_width / had_block_size;
566     ht_offset = -had_block_size;
567     wd_offset = 0;  //-had_block_size;
568 
569     cu_total_size = cu_ht * cu_wd;
570     num_comp_had_blocks = 2 * cu_total_size / (had_block_size * had_block_size);
571 
572     index_for_src_satd = start_index;
573 
574     for(i = 0; i < 64; i++)
575     {
576         ai1_zeros_buffer[i] = 0;
577     }
578 
579     psy_factor = PSY_STRENGTH_CHROMA;
580     psy_rd_cost = 0;
581     lambda_mod = lambda * psy_factor;
582 
583     /************************************************************/
584     /* loop over for every 4x4 blocks in the CU for Cb */
585     for(i = 0; i < num_comp_had_blocks; i++)
586     {
587         if(i % num_horz_blocks == 0)
588         {
589             wd_offset = -had_block_size;
590             ht_offset += had_block_size;
591         }
592         wd_offset += had_block_size;
593 
594         /* source satd for the current 8x8 block */
595         source_satd = pui4_source_satd[index_for_src_satd];
596 
597         if(i % 2 != 0)
598         {
599             if(!is_hbd)
600             {
601                 pu1_l0_block = pu1_l0_block_prev + 1;
602             }
603         }
604         else
605         {
606             if(!is_hbd)
607             {
608                 /* get memory pointers for each of L0 and L1 blocks whose hadamard has to be computed */
609                 pu1_l0_block = pu1_recon + recon_stride_vert * ht_offset + wd_offset;
610                 pu1_l0_block_prev = pu1_l0_block;
611             }
612         }
613 
614         if(had_block_size == 4)
615         {
616             if(!is_hbd)
617             {
618                 recon_satd = ps_cmn_utils_optimised_function_list->pf_chroma_AC_HAD_4x4_8bit(
619                     pu1_l0_block,
620                     recon_stride_vert,
621                     ai1_zeros_buffer,
622                     had_block_size,
623                     pi2_residue_had,
624                     had_block_size);
625             }
626 
627             /* get the additional cost function based on the absolute SATD diff of source and recon. */
628             psy_rd_cost += (lambda_mod * llabs(source_satd - recon_satd));
629 
630             index_for_src_satd++;
631 
632             if((i % num_horz_blocks) == (num_horz_blocks - 1))
633             {
634                 index_for_src_satd -= num_horz_blocks;
635                 index_for_src_satd +=
636                     (MAX_CU_SIZE / 8); /* Assuming CTB size = 64 and blocksize = 8 */
637             }
638 
639         }  // if had block size ==4
640     }  // for loop for all 4x4 block in the cu
641 
642     psy_rd_cost = psy_rd_cost >> (Q_PSY_STRENGTH_CHROMA + LAMBDA_Q_SHIFT);
643     /* reutrn the additional cost for the psy RD opt */
644     return (psy_rd_cost);
645 }
646 
647 /*!
648 ******************************************************************************
649 * \if Function name : ihevce_psy_rd_cost \endif
650 *
651 * \brief
652 *    Calculates the psyco visual cost for RD opt. This is
653 *
654 * \param[in] pui4_source_satd
655 *   This is the pointer to the array of 8x8 satd of the corresponding source CTB. This is pre calculated.
656 * \param[in] *pui1_recon
657 *   This si the pointer to the pred data.
658 * \param[in] recon_stride
659 *   This si the pred stride
660 * \param[in] pic_type
661 *   Picture type.
662 * \param[in] layer_id
663 *   Indicates the temporal layer.
664 * \param[in] lambda
665 *   This is the weighting factor for the cost.
666 *
667 * \return
668 *    the cost for the psyRDopt
669 *
670 * \author
671 *  Ittiam
672 *
673 *****************************************************************************
674 */
ihevce_psy_rd_cost(LWORD64 * pui4_source_satd,void * pv_recon,WORD32 recon_stride_vert,WORD32 recond_stride_horz,WORD32 cu_size,WORD32 pic_type,WORD32 layer_id,WORD32 lambda,WORD32 start_index,WORD32 is_hbd,UWORD32 u4_psy_strength,ihevce_cmn_opt_func_t * ps_cmn_utils_optimised_function_list)675 LWORD64 ihevce_psy_rd_cost(
676     LWORD64 *pui4_source_satd,
677     void *pv_recon,
678     WORD32 recon_stride_vert,
679     WORD32 recond_stride_horz,
680     WORD32 cu_size,
681     WORD32 pic_type,
682     WORD32 layer_id,
683     WORD32 lambda,
684     WORD32 start_index,
685     WORD32 is_hbd,
686     UWORD32 u4_psy_strength,
687     ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list)
688 {
689     /* declare local variables to store the SATD values for the pred  for the current block. */
690     LWORD64 psy_rd_cost;  // TODO : check if overflow is there.
691     UWORD32 lambda_mod;
692     WORD32 psy_factor;
693 
694     /* declare local variables */
695     WORD32 i;
696     WORD32 cu_total_size;
697     WORD32 num_comp_had_blocks;
698 
699     UWORD8 *pu1_l0_block;
700     UWORD8 *pu1_recon;
701 
702     WORD32 ht_offset;
703     WORD32 wd_offset;
704     WORD32 cu_ht;
705     WORD32 cu_wd;
706 
707     WORD32 num_horz_blocks;
708 
709     //WORD16 pi2_residue_had[64];
710     WORD16 pi2_residue_had_zscan[64];
711     //WORD16 pi2_residue[64];
712     /* this is used as a buffer with all values equal to 0. This is emulate the case with
713        pred being zero in HAD fucntion */
714     UWORD8 ai1_zeros_buffer[64];
715 
716     WORD32 had_block_size;
717     LWORD64 source_satd;  // to hold source for current 8x8 block
718     LWORD64 recon_satd;  // holds the current recon 8x8 satd
719 
720     WORD32 index_for_src_satd;
721 
722     (void)recond_stride_horz;
723     (void)pic_type;
724     (void)layer_id;
725     /***** initialize the variables ****/
726     had_block_size = 8;
727     cu_ht = cu_size;
728     cu_wd = cu_size;
729 
730     num_horz_blocks = cu_wd / had_block_size;  //ctb_width / had_block_size;
731 
732     ht_offset = -had_block_size;
733     wd_offset = 0 - had_block_size;
734 
735     cu_total_size = cu_ht * cu_wd;
736     num_comp_had_blocks = cu_total_size / (had_block_size * had_block_size);
737 
738     index_for_src_satd = start_index;
739 
740     for(i = 0; i < 64; i++)
741     {
742         ai1_zeros_buffer[i] = 0;
743     }
744     psy_factor = u4_psy_strength;  //PSY_STRENGTH;
745     psy_rd_cost = 0;
746     lambda_mod = lambda * psy_factor;
747 
748     if(!is_hbd)
749     {
750         pu1_recon = (UWORD8 *)pv_recon;
751     }
752 
753     /**************************************************************/
754     /* loop over for every 8x8 blocks in the CU */
755     for(i = 0; i < num_comp_had_blocks; i++)
756     {
757         if(i % num_horz_blocks == 0)
758         {
759             wd_offset = -had_block_size;
760             ht_offset += had_block_size;
761         }
762         wd_offset += had_block_size;
763 
764         /* source satd for the current 8x8 block */
765         source_satd = pui4_source_satd[index_for_src_satd];
766 
767         if(had_block_size == 8)
768         {
769             //WORD32 index;
770             //WORD32 u4_satd;
771             //WORD32 dst_strd = 8;
772             //WORD32 i4_frm_qstep = 0;
773             //WORD32 early_cbf;
774             if(!is_hbd)
775             {
776                 /* get memory pointers for each of L0 and L1 blocks whose hadamard has to be computed */
777                 pu1_l0_block = pu1_recon + recon_stride_vert * ht_offset + wd_offset;
778 
779                 recon_satd = ps_cmn_utils_optimised_function_list->pf_AC_HAD_8x8_8bit(
780                     pu1_l0_block,
781                     recon_stride_vert,
782                     ai1_zeros_buffer,
783                     had_block_size,
784                     pi2_residue_had_zscan,
785                     had_block_size);
786             }
787 
788             /* get the additional cost function based on the absolute SATD diff of source and recon. */
789             psy_rd_cost += (lambda_mod * llabs(source_satd - recon_satd));
790 
791             index_for_src_satd++;
792             if((i % num_horz_blocks) == (num_horz_blocks - 1))
793             {
794                 index_for_src_satd -= num_horz_blocks;
795                 index_for_src_satd +=
796                     (MAX_CU_SIZE / 8); /* Assuming CTB size = 64 and blocksize = 8 */
797             }
798         }  // if
799     }  // for loop
800     psy_rd_cost = psy_rd_cost >> (Q_PSY_STRENGTH + LAMBDA_Q_SHIFT);
801 
802     /* reutrn the additional cost for the psy RD opt */
803     return (psy_rd_cost);
804 }
805 
/**
*******************************************************************************
*
* @brief
*  Derives a range-limited variance for one partition from its accumulated
*  sigmaX and sigmaXSquared values.
*
* @par Description:
*  Computes pu8_sigmaXSquared[id] - pu8_sigmaX[id] * pu8_sigmaX[id]. When the
*  inverse weight differs from the default source weight, the weight is first
*  shifted right by i4_inv_wt_shift_val and the result is scaled by the
*  squared weight through SHR_NEG. GETRANGE64 then supplies a bit count for
*  the value; anything above 27 is shifted out of the stored result and
*  reported as the return value.
*
* @param[in] pu8_sigmaX
*  Per-partition sigmaX values
*
* @param[in] pu8_sigmaXSquared
*  Per-partition sigmaXSquared values
*
* @param[out] u8_var
*  Receives the (possibly down-shifted) variance
*
* @param[in] i4_inv_wpred_wt
*  Inverse weighted-prediction weight
*
* @param[in] i4_inv_wt_shift_val
*  Right shift applied to the inverse weight before use
*
* @param[in] i4_wpred_log_wdc
*  Log weight denominator used in the scaling shift
*
* @param[in] i4_part_id
*  Index of the partition in the sigma arrays
*
* @return
*  Number of bits by which *u8_var was shifted right (0 if none)
*
*******************************************************************************
*/
unsigned long ihevce_calc_stim_injected_variance(
    ULWORD64 *pu8_sigmaX,
    ULWORD64 *pu8_sigmaXSquared,
    ULWORD64 *u8_var,
    WORD32 i4_inv_wpred_wt,
    WORD32 i4_inv_wt_shift_val,
    WORD32 i4_wpred_log_wdc,
    WORD32 i4_part_id)
{
    const WORD32 i4_default_src_wt = ((1 << 15) + (WGHT_DEFAULT >> 1)) / WGHT_DEFAULT;
    const ULWORD64 u8_sigma_x = pu8_sigmaX[i4_part_id];

    ULWORD64 u8_scaled_var = pu8_sigmaXSquared[i4_part_id] - (u8_sigma_x * u8_sigma_x);
    WORD32 i4_num_bits;
    WORD32 i4_excess_bits;

    /* a non-default weight needs the variance rescaled by the squared weight */
    if(i4_inv_wpred_wt != i4_default_src_wt)
    {
        i4_inv_wpred_wt = i4_inv_wpred_wt >> i4_inv_wt_shift_val;

        u8_scaled_var = SHR_NEG(
            (u8_scaled_var * i4_inv_wpred_wt * i4_inv_wpred_wt),
            (30 - (2 * i4_inv_wt_shift_val) - i4_wpred_log_wdc * 2));
    }

    GETRANGE64(i4_num_bits, u8_scaled_var);

    /* keep at most 27 bits; the caller is told how many were dropped */
    i4_excess_bits = (i4_num_bits > 27) ? (i4_num_bits - 27) : 0;

    *u8_var = u8_scaled_var >> i4_excess_bits;

    return i4_excess_bits;
}
845 
/**
*******************************************************************************
*
* @brief
*  Derives a range-limited, un-normalised CU variance from per-partition
*  sigmaX / sigmaXSquared values, applying per-partition inverse weights on
*  the source side.
*
* @par Description:
*  For each partition the pixel count is taken as
*  (b4_wd + 1) * (b4_ht + 1) * 4 * 4. sigmaXSquared is divided by that count;
*  sigmaX is used as is. When u1_is_for_src is set and the partition's inverse
*  weight differs from the default source weight, the weight is shifted right
*  by its shift value (the shifted weight is written back to pi4_inv_wt) and
*  both sums are rescaled through SHR_NEG. The result is
*    (cu_size * cu_size) * (sqr[0] + sqr[1]) - (x[0] + x[1]) * (x[0] + x[1])
*  GETRANGE64 then supplies a bit count for the value; anything above 27 is
*  shifted out of the stored result and reported as the return value.
*
* @param[in] pu8_sigmaX
*  sigmaX values, indexed by pe_part_id[k] (source) or by k (otherwise)
*
* @param[in] pu8_sigmaXSquared
*  sigmaXSquared values, indexed like pu8_sigmaX
*
* @param[out] u8_var
*  Receives the (possibly down-shifted) variance
*
* @param[in,out] pi4_inv_wt
*  Per-partition inverse weights; modified in place when a non-default weight
*  is applied on the source side
*
* @param[in] pi4_inv_wt_shift_val
*  Per-partition right shift for the inverse weights
*
* @param[in] ps_result
*  Per-partition results; only pu.b4_wd and pu.b4_ht are read
*
* @param[in] i4_wpred_log_wdc
*  Log weight denominator used in the scaling shifts
*
* @param[in] pe_part_id
*  Partition ids, used as sigma array indices when u1_is_for_src is set
*
* @param[in] u1_cu_size
*  CU size
*
* @param[in] u1_num_parts
*  Number of partitions; must not exceed MAX_NUM_INTER_PARTS
*
* @param[in] u1_is_for_src
*  Non-zero when the sums belong to the source (enables weighting and the
*  pe_part_id based indexing)
*
* @return
*  Number of bits by which *u8_var was shifted right (0 if none)
*
*******************************************************************************
*/
unsigned long ihevce_calc_variance_for_diff_weights(
    ULWORD64 *pu8_sigmaX,
    ULWORD64 *pu8_sigmaXSquared,
    ULWORD64 *u8_var,
    WORD32 *pi4_inv_wt,
    WORD32 *pi4_inv_wt_shift_val,
    pu_result_t *ps_result,
    WORD32 i4_wpred_log_wdc,
    PART_ID_T *pe_part_id,
    UWORD8 u1_cu_size,
    UWORD8 u1_num_parts,
    UWORD8 u1_is_for_src)
{
    const WORD32 i4_default_src_wt = ((1 << 15) + (WGHT_DEFAULT >> 1)) / WGHT_DEFAULT;
    const UWORD32 u4_base_blk_size = 4;
    const UWORD32 u4_tot_num_pixels = u1_cu_size * u1_cu_size;

    ULWORD64 u8_temp_sigmaX[MAX_NUM_INTER_PARTS] = { 0, 0 };
    ULWORD64 u8_temp_sigmaXsquared[MAX_NUM_INTER_PARTS] = { 0, 0 };
    ULWORD64 u8_sum_x;
    ULWORD64 u8_z;

    WORD32 i4_k;
    WORD32 i4_bits_req;
    WORD32 i4_excess_bits;

    for(i4_k = 0; i4_k < u1_num_parts; i4_k++)
    {
        const UWORD32 u4_wd = ps_result[i4_k].pu.b4_wd + 1;
        const UWORD32 u4_ht = ps_result[i4_k].pu.b4_ht + 1;

        /* Held in 32 bits: a 16 x 16 base-block partition gives 256, which */
        /* wrapped to 0 in an 8 bit counter and made the division below     */
        /* divide by zero.                                                  */
        const UWORD32 u4_num_base_blks = u4_wd * u4_ht;
        const UWORD32 u4_num_pixels_in_part =
            u4_num_base_blks * u4_base_blk_size * u4_base_blk_size;

        const UWORD8 u1_index = u1_is_for_src ? (UWORD8)pe_part_id[i4_k] : (UWORD8)i4_k;

        u8_temp_sigmaXsquared[i4_k] = pu8_sigmaXSquared[u1_index] / u4_num_pixels_in_part;
        u8_temp_sigmaX[i4_k] = pu8_sigmaX[u1_index];

        if(u1_is_for_src && (pi4_inv_wt[i4_k] != i4_default_src_wt))
        {
            /* the shifted weight is stored back, as callers observed before */
            pi4_inv_wt[i4_k] = pi4_inv_wt[i4_k] >> pi4_inv_wt_shift_val[i4_k];

            u8_temp_sigmaX[i4_k] = SHR_NEG(
                (u8_temp_sigmaX[i4_k] * pi4_inv_wt[i4_k]),
                (15 - pi4_inv_wt_shift_val[i4_k] - i4_wpred_log_wdc));

            u8_temp_sigmaXsquared[i4_k] = SHR_NEG(
                (u8_temp_sigmaXsquared[i4_k] * pi4_inv_wt[i4_k] * pi4_inv_wt[i4_k]),
                (30 - (2 * pi4_inv_wt_shift_val[i4_k]) - i4_wpred_log_wdc * 2));
        }
    }

    u8_sum_x = u8_temp_sigmaX[0] + u8_temp_sigmaX[1];

    u8_z = (u4_tot_num_pixels * (u8_temp_sigmaXsquared[0] + u8_temp_sigmaXsquared[1])) -
           (u8_sum_x * u8_sum_x);

    GETRANGE64(i4_bits_req, u8_z);

    /* keep at most 27 bits; the caller is told how many were dropped */
    i4_excess_bits = (i4_bits_req > 27) ? (i4_bits_req - 27) : 0;

    *u8_var = u8_z >> i4_excess_bits;

    return i4_excess_bits;
}
924