1 /******************************************************************************
2  *
3  * Copyright (C) 2018 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 /**
21  *******************************************************************************
22  * @file
23  *  ihevc_quant_iquant_ssd.c
24  *
25  * @brief
26  *  Contains function definitions for quantization, followed by Inverse
27  *  quantization to find transform domain SSD
28  *
29  * @author
30  *  100453, 100578
31  *
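 * @par List of Functions:
 *   - ihevc_quant_iquant_ssd()
 *   - ihevc_quant_iquant()
 *   - ihevc_quant_iquant_ssd_rdoq()
 *   - ihevc_quant_iquant_rdoq()
 *   - ihevc_quant_iquant_ssd_flat_scale_mat()
 *   - ihevc_quant_iquant_flat_scale_mat()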
35  *
36  * @remarks
37  *  None
38  *
39  *******************************************************************************
40  */
41 
42 #include <stdio.h>
43 #include <string.h>
44 #include <stdlib.h>
45 #include "ihevc_typedefs.h"
46 #include "ihevc_macros.h"
47 #include "ihevc_platform_macros.h"
48 #include "ihevc_defs.h"
49 #include "ihevc_debug.h"
50 #include "ihevc_trans_tables.h"
51 #include "ihevc_quant_iquant_ssd.h"
52 #include "ihevc_func_selector.h"
53 #include "ihevc_trans_macros.h"
54 #include <assert.h>
55 
56 /*****************************************************************************/
57 /* Globals                                                                   */
58 /*****************************************************************************/
59 
60 
61 /**
62  *******************************************************************************
63  *
64  * @brief
65  *  This function performs quantization, followed by Inverse
66  *  quantization to find transform domain SSD
67  *
68  * @par Description:
69  *  Performs quantization on coeffs
70  *
71  * @param[in] pi2_coeffs
72  *  4x4 Coeffs
73  *
74  * @param[in] pi2_quant_coeff
75  *  Scaling Matrix
76  *
77  * @param[out] pi2_dst
78  *  Output 4x4 coefficients
79  *
80  * @param[in] qp_div
81  *  Quantization parameter / 6
82  *
83  * @param[in] qp_rem
84  *  Quantization parameter % 6
85  *
86  * @param[in] src_strd
87  *  Input stride
88  *
89  * @param[in] dst_strd
90  *  Output Stride
91  *
92  * @param[out] csbf
93  *  coded sub block flag
94  *
95  * @param[in] csbf_strd
96  *  coded sub block flag
97  *
98  * @param[out] zero_col
99  *  zero column flag
100  *
101  * @param[out] zero_row
102  *  zero column flag
103  *
104  * @returns  cbf
105  * coded block flag
106  *
107  * @remarks
108  *  None
109  *
110  *******************************************************************************
111  */
112 
ihevc_quant_iquant_ssd(WORD16 * pi2_coeffs,WORD16 * pi2_quant_coeff,WORD16 * pi2_q_dst,WORD16 * pi2_iq_dst,WORD32 trans_size,WORD32 qp_div,WORD32 qp_rem,WORD32 q_add,WORD32 * pi4_quant_round_factor_0_1,WORD32 * pi4_quant_round_factor_1_2,WORD32 src_strd,WORD32 dst_q_strd,WORD32 dst_iq_strd,UWORD8 * csbf,WORD32 csbf_strd,WORD32 * zero_col,WORD32 * zero_row,WORD16 * pi2_dequant_coeff,LWORD64 * pi8_cost)113 WORD32 ihevc_quant_iquant_ssd
114     (
115     WORD16 *pi2_coeffs,
116     WORD16 *pi2_quant_coeff,
117     WORD16 *pi2_q_dst,
118     WORD16 *pi2_iq_dst,
119     WORD32  trans_size,
120     WORD32 qp_div,/* qpscaled / 6 */
121     WORD32 qp_rem,/* qpscaled % 6 */
122     WORD32 q_add,
123     WORD32 *pi4_quant_round_factor_0_1,
124     WORD32 *pi4_quant_round_factor_1_2,
125     WORD32 src_strd,
126     WORD32 dst_q_strd,
127     WORD32 dst_iq_strd,
128     UWORD8 *csbf,
129     WORD32 csbf_strd,
130     WORD32 *zero_col,
131     WORD32 *zero_row,
132     WORD16 *pi2_dequant_coeff,
133     LWORD64 *pi8_cost
134     )
135 {
136     WORD32 i, j;
137     WORD32 log2_size;
138     WORD16 *pi2_q_dst_orig;
139     WORD32 cbf = 0;
140     WORD32 bit_depth,shift_iq;
141     WORD32 val;
142     WORD16 i2_temp;
143     WORD32 ssd_cost = 0;
144 
145     (void)pi4_quant_round_factor_0_1;
146     (void)pi4_quant_round_factor_1_2;
147     pi2_q_dst_orig  = pi2_q_dst;
148 
149     /* Quant initialization */
150     GETRANGE(log2_size, trans_size);
151     log2_size -= 1;
152 
153     bit_depth = 8 + 0;
154     shift_iq = bit_depth + log2_size - 5;
155 
156     for(i = 0; i < trans_size; i++)
157     {
158         for(j = 0; j < trans_size; j++)
159         {
160             /*  Back up the coefficients before Quantization    */
161             i2_temp = pi2_coeffs[j];
162 
163             /*  Quantization    */
164             QUANT(pi2_q_dst[j], pi2_coeffs[j],
165                   pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
166                   log2_size, q_add);
167 
168             /*  Inverse Quantization    */
169             IQUANT(pi2_iq_dst[j],
170                    pi2_q_dst[j], /*pi2_src[index*src_strd]*/
171                    pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem],
172                    /*pi2_dequant_coeff[index*trans_size] * g_ihevc_iquant_scales[qp_rem] */
173                    shift_iq,
174                    qp_div);
175 
176             /*  SSD Computation & Accumulation  */
177             val = i2_temp - pi2_iq_dst[j];
178             ssd_cost += val*val;
179 
180         }
181 
182         pi2_q_dst   += dst_q_strd;
183         pi2_iq_dst  += dst_iq_strd;
184         pi2_quant_coeff += trans_size;
185         pi2_coeffs += src_strd;
186         pi2_dequant_coeff += trans_size;
187     }
188 
189     /* Store the cost */
190     *pi8_cost = ssd_cost;
191 
192     /* CSBF update */
193     {
194         WORD32 block_row, block_col;
195         WORD32 row, col;
196         WORD16 *pi2_block;
197         UWORD32 temp_zero_col = 0;
198         UWORD32 temp_zero_row = 0;
199 
200         pi2_q_dst = pi2_q_dst_orig;
201 
202         for(block_row = 0; block_row < trans_size; block_row += 4)
203         {
204             //block_col is incrementing by 1 for easy update of csbf pointer
205             for(block_col = 0; block_col < trans_size / 4; block_col++)
206             {
207                 pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
208                 *(csbf + block_col) = 0;
209 
210                 for(row = 0; row < 4; row++)
211                 {
212                     for(col = 0; col < 4; col++)
213                     {
214                         if(pi2_block[row * dst_q_strd + col] != 0)
215                         {
216                             *(csbf + block_col) = 1;
217                             break;
218                         }
219                     }
220                     if(*(csbf + block_col) == 1)
221                     {
222                         /* zero_col update *//* temp_zero_col = ~zero_col */
223                         temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
224                         // zero col can be optimized further. Now clearing the
225                         // entire 4 bits corresponding to 4 colums of 4x4 block
226                         // even if any 4x4 csbf is set
227 
228                         /* zero row update */ /* temp_zero_row = ~zero_row */
229                         temp_zero_row = (temp_zero_row) | (0xFU << block_row);
230                         // zero row can be optimized further. Now clearing the
231                         // entire 4 bits corresponding to 4 rows of 4x4 block
232                         // even if any 4x4 csbf is set
233 
234                         break;
235                     }
236                 }
237 
238                 cbf = cbf || (*(csbf + block_col)); // cbf update
239             }
240             csbf += csbf_strd;
241         }
242 
243         *zero_col = ~temp_zero_col; //final zero_col storing
244         *zero_row = ~temp_zero_row; //final zero_row storing
245     }
246 
247     return cbf;
248 }
249 
250 /**
251  *******************************************************************************
252  *
253  * @brief
254  *  This function performs quantization, followed by Inverse
255  *  quantization
256  *
257  * @par Description:
258  *  Performs quantization on coeffs
259  *
260  * @param[in] pi2_coeffs
261  *  4x4 Coeffs
262  *
263  * @param[in] pi2_quant_coeff
264  *  Scaling Matrix
265  *
266  * @param[out] pi2_dst
267  *  Output 4x4 coefficients
268  *
269  * @param[in] qp_div
270  *  Quantization parameter / 6
271  *
272  * @param[in] qp_rem
273  *  Quantization parameter % 6
274  *
275  * @param[in] src_strd
276  *  Input stride
277  *
278  * @param[in] dst_strd
279  *  Output Stride
280  *
281  * @param[out] csbf
282  *  coded sub block flag
283  *
284  * @param[in] csbf_strd
285  *  coded sub block flag
286  *
287  * @param[out] zero_col
288  *  zero column flag
289  *
290  * @param[out] zero_row
291  *  zero column flag
292  *
293  * @returns  cbf
294  * coded block flag
295  *
296  * @remarks
297  *  None
298  *
299  *******************************************************************************
300  */
301 
ihevc_quant_iquant(WORD16 * pi2_coeffs,WORD16 * pi2_quant_coeff,WORD16 * pi2_q_dst,WORD16 * pi2_iq_dst,WORD32 trans_size,WORD32 qp_div,WORD32 qp_rem,WORD32 q_add,WORD32 * pi4_quant_round_factor_0_1,WORD32 * pi4_quant_round_factor_1_2,WORD32 src_strd,WORD32 dst_q_strd,WORD32 dst_iq_strd,UWORD8 * csbf,WORD32 csbf_strd,WORD32 * zero_col,WORD32 * zero_row,WORD16 * pi2_dequant_coeff,LWORD64 * pi8_cost)302 WORD32 ihevc_quant_iquant
303     (
304     WORD16 *pi2_coeffs,
305     WORD16 *pi2_quant_coeff,
306     WORD16 *pi2_q_dst,
307     WORD16 *pi2_iq_dst,
308     WORD32  trans_size,
309     WORD32 qp_div,/* qpscaled / 6 */
310     WORD32 qp_rem,/* qpscaled % 6 */
311     WORD32 q_add,
312     WORD32 *pi4_quant_round_factor_0_1,
313     WORD32 *pi4_quant_round_factor_1_2,
314     WORD32 src_strd,
315     WORD32 dst_q_strd,
316     WORD32 dst_iq_strd,
317     UWORD8 *csbf,
318     WORD32 csbf_strd,
319     WORD32 *zero_col,
320     WORD32 *zero_row,
321     WORD16 *pi2_dequant_coeff,
322     LWORD64 *pi8_cost
323     )
324 {
325     WORD32 i, j;
326     WORD32 log2_size;
327     WORD16 *pi2_q_dst_orig;
328     WORD32 cbf = 0;
329     WORD32 bit_depth,shift_iq;
330     WORD16 i2_temp;
331 
332     (void)pi8_cost;
333     (void)pi4_quant_round_factor_0_1;
334     (void)pi4_quant_round_factor_1_2;
335     pi2_q_dst_orig  = pi2_q_dst;
336 
337     /* Quant initialization */
338     GETRANGE(log2_size, trans_size);
339     log2_size -= 1;
340 
341     bit_depth = 8;
342     shift_iq = bit_depth + log2_size - 5;
343 
344     for(i = 0; i < trans_size; i++)
345     {
346         for(j = 0; j < trans_size; j++)
347         {
348             /*  Back up the coefficients before Quantization    */
349             i2_temp = pi2_coeffs[j];
350 
351             /*  Quantization    */
352             QUANT(pi2_q_dst[j], pi2_coeffs[j],
353                   pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
354                   log2_size, q_add);
355 
356             /*  Inverse Quantization    */
357             IQUANT(pi2_iq_dst[j],
358                    pi2_q_dst[j], /*pi2_src[index*src_strd]*/
359                    pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem],
360                    shift_iq,
361                    qp_div);
362         }
363 
364         pi2_q_dst   += dst_q_strd;
365         pi2_iq_dst  += dst_iq_strd;
366         pi2_quant_coeff += trans_size;
367         pi2_coeffs += src_strd;
368         pi2_dequant_coeff += trans_size;
369     }
370 
371     /* CSBF update */
372     {
373         WORD32 block_row, block_col;
374         WORD32 row, col;
375         WORD16 *pi2_block;
376         UWORD32 temp_zero_col = 0;
377         UWORD32 temp_zero_row = 0;
378 
379         pi2_q_dst = pi2_q_dst_orig;
380 
381         for(block_row = 0; block_row < trans_size; block_row += 4)
382         {
383             //block_col is incrementing by 1 for easy update of csbf pointer
384             for(block_col = 0; block_col < trans_size / 4; block_col++)
385             {
386                 pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
387                 *(csbf + block_col) = 0;
388 
389                 for(row = 0; row < 4; row++)
390                 {
391                     for(col = 0; col < 4; col++)
392                     {
393                         if(pi2_block[row * dst_q_strd + col] != 0)
394                         {
395                             *(csbf + block_col) = 1;
396                             break;
397                         }
398                     }
399                     if(*(csbf + block_col) == 1)
400                     {
401                         /* zero_col update *//* temp_zero_col = ~zero_col */
402                         temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
403                         // zero col can be optimized further. Now clearing the
404                         // entire 4 bits corresponding to 4 colums of 4x4 block
405                         // even if any 4x4 csbf is set
406 
407                         /* zero row update */ /* temp_zero_row = ~zero_row */
408                         temp_zero_row = (temp_zero_row) | (0xFU << block_row);
409                         // zero row can be optimized further. Now clearing the
410                         // entire 4 bits corresponding to 4 rows of 4x4 block
411                         // even if any 4x4 csbf is set
412 
413                         break;
414                     }
415                 }
416 
417                 cbf = cbf || (*(csbf + block_col)); // cbf update
418             }
419 
420             csbf += csbf_strd;
421         }
422 
423         *zero_col = ~temp_zero_col; //final zero_col storing
424         *zero_row = ~temp_zero_row; //final zero_row storing
425     }
426 
427     return cbf;
428 }
429 
430 /**
431  *******************************************************************************
432  *
433  * @brief
434  *  This function performs quantization, followed by Inverse
435  *  quantization to find transform domain SSD
436  *
437  * @par Description:
438  *  Performs quantization on coeffs
439  *
440  * @param[in] pi2_coeffs
441  *  4x4 Coeffs
442  *
443  * @param[in] pi2_quant_coeff
444  *  Scaling Matrix
445  *
446  * @param[out] pi2_dst
447  *  Output 4x4 coefficients
448  *
449  * @param[in] qp_div
450  *  Quantization parameter / 6
451  *
452  * @param[in] qp_rem
453  *  Quantization parameter % 6
454  *
455  * @param[in] src_strd
456  *  Input stride
457  *
458  * @param[in] dst_strd
459  *  Output Stride
460  *
461  * @param[out] csbf
462  *  coded sub block flag
463  *
464  * @param[in] csbf_strd
465  *  coded sub block flag
466  *
467  * @param[out] zero_col
468  *  zero column flag
469  *
470  * @param[out] zero_row
471  *  zero column flag
472  *
473  * @returns  cbf
474  * coded block flag
475  *
476  * @remarks
477  *  None
478  *
479  *******************************************************************************
480  */
481 
ihevc_quant_iquant_ssd_rdoq(WORD16 * pi2_coeffs,WORD16 * pi2_quant_coeff,WORD16 * pi2_q_dst,WORD16 * pi2_iq_dst,WORD32 trans_size,WORD32 qp_div,WORD32 qp_rem,WORD32 q_add,WORD32 * pi4_quant_round_factor_0_1,WORD32 * pi4_quant_round_factor_1_2,WORD32 src_strd,WORD32 dst_q_strd,WORD32 dst_iq_strd,UWORD8 * csbf,WORD32 csbf_strd,WORD32 * zero_col,WORD32 * zero_row,WORD16 * pi2_dequant_coeff,LWORD64 * pi8_cost)482 WORD32 ihevc_quant_iquant_ssd_rdoq
483     (
484     WORD16 *pi2_coeffs,
485     WORD16 *pi2_quant_coeff,
486     WORD16 *pi2_q_dst,
487     WORD16 *pi2_iq_dst,
488     WORD32  trans_size,
489     WORD32 qp_div,/* qpscaled / 6 */
490     WORD32 qp_rem,/* qpscaled % 6 */
491     WORD32 q_add,
492     WORD32 *pi4_quant_round_factor_0_1,
493     WORD32 *pi4_quant_round_factor_1_2,
494     WORD32 src_strd,
495     WORD32 dst_q_strd,
496     WORD32 dst_iq_strd,
497     UWORD8 *csbf,
498     WORD32 csbf_strd,
499     WORD32 *zero_col,
500     WORD32 *zero_row,
501     WORD16 *pi2_dequant_coeff,
502     LWORD64 *pi8_cost
503     )
504 {
505     WORD32 i, j;
506     WORD32 log2_size;
507     WORD16 *pi2_q_dst_orig;
508     WORD32 cbf = 0;
509     WORD32 bit_depth,shift_iq;
510     WORD32 val;
511     WORD16 i2_temp;
512     WORD32 ssd_cost = 0;
513 
514     (void)pi4_quant_round_factor_0_1;
515     (void)pi4_quant_round_factor_1_2;
516     pi2_q_dst_orig  = pi2_q_dst;
517 
518     GETRANGE(log2_size, trans_size);
519     log2_size -= 1;
520 
521     bit_depth = 8 + 0;
522     shift_iq = bit_depth + log2_size - 5;
523 
524     for(i = 0; i < trans_size; i++)
525     {
526         for(j = 0; j < trans_size; j++)
527         {
528             /*  Back up the coefficients before Quantization    */
529             i2_temp = pi2_coeffs[j];
530 
531             /*  Quantization    */
532             QUANT(pi2_q_dst[j], pi2_coeffs[j],
533                 pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
534                 log2_size, q_add);
535 
536 
537             if (abs(pi2_q_dst[j]) > 1)
538             {
539                 QUANT(pi2_q_dst[j],i2_temp,
540                     pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
541                     log2_size, ((1 << QUANT_ROUND_FACTOR_Q)/2));
542 
543             }
544 
545 
546             /*  Inverse Quantization    */
547             IQUANT(pi2_iq_dst[j],
548                 pi2_q_dst[j], /*pi2_src[index*src_strd]*/
549                 pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem],
550                 /*pi2_dequant_coeff[index*trans_size] * g_ihevc_iquant_scales[qp_rem] */
551                 shift_iq,
552                 qp_div);
553 
554             /*  SSD Computation & Accumulation  */
555             val = i2_temp - pi2_iq_dst[j];
556             ssd_cost += val*val;
557 
558         }
559 
560         pi2_q_dst   += dst_q_strd;
561         pi2_iq_dst  += dst_iq_strd;
562         pi2_quant_coeff += trans_size;
563         pi2_coeffs += src_strd;
564         pi2_dequant_coeff += trans_size;
565     }
566     /* Store the cost */
567     *pi8_cost = ssd_cost;
568 
569     /* CSBF update */
570     {
571         WORD32 block_row, block_col;
572         WORD32 row, col;
573         WORD16 *pi2_block;
574         UWORD32 temp_zero_col = 0;
575         UWORD32 temp_zero_row = 0;
576 
577         pi2_q_dst = pi2_q_dst_orig;
578 
579         for(block_row = 0; block_row < trans_size; block_row += 4)
580         {
581             //block_col is incrementing by 1 for easy update of csbf pointer
582             for(block_col = 0; block_col < trans_size / 4; block_col++)
583             {
584                 pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
585                 *(csbf + block_col) = 0;
586 
587                 for(row = 0; row < 4; row++)
588                 {
589                     for(col = 0; col < 4; col++)
590                     {
591                         if(pi2_block[row * dst_q_strd + col] != 0)
592                         {
593                             *(csbf + block_col) = 1;
594                             break;
595                         }
596                     }
597                     if(*(csbf + block_col) == 1)
598                     {
599                         /* zero_col update *//* temp_zero_col = ~zero_col */
600                         temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
601                         // zero col can be optimized further. Now clearing the
602                         // entire 4 bits corresponding to 4 colums of 4x4 block
603                         // even if any 4x4 csbf is set
604 
605                         /* zero row update */ /* temp_zero_row = ~zero_row */
606                         temp_zero_row = (temp_zero_row) | (0xFU << block_row);
607                         // zero row can be optimized further. Now clearing the
608                         // entire 4 bits corresponding to 4 rows of 4x4 block
609                         // even if any 4x4 csbf is set
610 
611                         break;
612                     }
613                 }
614 
615                 cbf = cbf || (*(csbf + block_col)); // cbf update
616             }
617             csbf += csbf_strd;
618         }
619 
620         *zero_col = ~temp_zero_col; //final zero_col storing
621         *zero_row = ~temp_zero_row; //final zero_row storing
622     }
623 
624     return cbf;
625 }
626 
ihevc_quant_iquant_rdoq(WORD16 * pi2_coeffs,WORD16 * pi2_quant_coeff,WORD16 * pi2_q_dst,WORD16 * pi2_iq_dst,WORD32 trans_size,WORD32 qp_div,WORD32 qp_rem,WORD32 q_add,WORD32 * pi4_quant_round_factor_0_1,WORD32 * pi4_quant_round_factor_1_2,WORD32 src_strd,WORD32 dst_q_strd,WORD32 dst_iq_strd,UWORD8 * csbf,WORD32 csbf_strd,WORD32 * zero_col,WORD32 * zero_row,WORD16 * pi2_dequant_coeff,LWORD64 * pi8_cost)627 WORD32 ihevc_quant_iquant_rdoq
628     (
629     WORD16 *pi2_coeffs,
630     WORD16 *pi2_quant_coeff,
631     WORD16 *pi2_q_dst,
632     WORD16 *pi2_iq_dst,
633     WORD32  trans_size,
634     WORD32 qp_div,/* qpscaled / 6 */
635     WORD32 qp_rem,/* qpscaled % 6 */
636     WORD32 q_add,
637     WORD32 *pi4_quant_round_factor_0_1,
638     WORD32 *pi4_quant_round_factor_1_2,
639     WORD32 src_strd,
640     WORD32 dst_q_strd,
641     WORD32 dst_iq_strd,
642     UWORD8 *csbf,
643     WORD32 csbf_strd,
644     WORD32 *zero_col,
645     WORD32 *zero_row,
646     WORD16 *pi2_dequant_coeff,
647     LWORD64 *pi8_cost
648     )
649 {
650     WORD32 i, j;
651     WORD32 log2_size;
652     WORD16 *pi2_q_dst_orig;
653     WORD32 cbf = 0;
654     WORD32 bit_depth,shift_iq;
655     WORD16 i2_temp;
656 
657     (void)pi8_cost;
658     (void)pi4_quant_round_factor_0_1;
659     (void)pi4_quant_round_factor_1_2;
660     pi2_q_dst_orig  = pi2_q_dst;
661 
662     GETRANGE(log2_size, trans_size);
663     log2_size -= 1;
664 
665     bit_depth = 8 + 0;
666     shift_iq = bit_depth + log2_size - 5;
667 
668     for(i = 0; i < trans_size; i++)
669     {
670         for(j = 0; j < trans_size; j++)
671         {
672             /*  Back up the coefficients before Quantization    */
673             i2_temp = pi2_coeffs[j];
674 
675             /*  Quantization    */
676             QUANT(pi2_q_dst[j], pi2_coeffs[j],
677                 pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
678                 log2_size, q_add);
679 
680             if (abs(pi2_q_dst[j]) > 1)
681             {
682                 QUANT(pi2_q_dst[j],i2_temp,
683                     pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
684                     log2_size, ((1 << QUANT_ROUND_FACTOR_Q)/2));
685             }
686 
687             /*  Inverse Quantization    */
688             IQUANT(pi2_iq_dst[j],
689                 pi2_q_dst[j], /*pi2_src[index*src_strd]*/
690                 pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem],
691                 shift_iq,
692                 qp_div);
693         }
694 
695         pi2_q_dst   += dst_q_strd;
696         pi2_iq_dst  += dst_iq_strd;
697         pi2_quant_coeff += trans_size;
698         pi2_coeffs += src_strd;
699         pi2_dequant_coeff += trans_size;
700     }
701 
702     /* CSBF update */
703     {
704         WORD32 block_row, block_col;
705         WORD32 row, col;
706         WORD16 *pi2_block;
707         UWORD32 temp_zero_col = 0;
708         UWORD32 temp_zero_row = 0;
709 
710         pi2_q_dst = pi2_q_dst_orig;
711 
712         for(block_row = 0; block_row < trans_size; block_row += 4)
713         {
714             //block_col is incrementing by 1 for easy update of csbf pointer
715             for(block_col = 0; block_col < trans_size / 4; block_col++)
716             {
717                 pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
718                 *(csbf + block_col) = 0;
719 
720                 for(row = 0; row < 4; row++)
721                 {
722                     for(col = 0; col < 4; col++)
723                     {
724                         if(pi2_block[row * dst_q_strd + col] != 0)
725                         {
726                             *(csbf + block_col) = 1;
727                             break;
728                         }
729                     }
730                     if(*(csbf + block_col) == 1)
731                     {
732                         /* zero_col update *//* temp_zero_col = ~zero_col */
733                         temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
734                         // zero col can be optimized further. Now clearing the
735                         // entire 4 bits corresponding to 4 colums of 4x4 block
736                         // even if any 4x4 csbf is set
737 
738                         /* zero row update */ /* temp_zero_row = ~zero_row */
739                         temp_zero_row = (temp_zero_row) | (0xFU << block_row);
740                         // zero row can be optimized further. Now clearing the
741                         // entire 4 bits corresponding to 4 rows of 4x4 block
742                         // even if any 4x4 csbf is set
743 
744                         break;
745                     }
746                 }
747 
748                 cbf = cbf || (*(csbf + block_col)); // cbf update
749             }
750             csbf += csbf_strd;
751         }
752 
753         *zero_col = ~temp_zero_col; //final zero_col storing
754         *zero_row = ~temp_zero_row; //final zero_row storing
755     }
756 
757     return cbf;
758 }
759 
760 /**
761  *******************************************************************************
762  *
763  * @brief
764  *  This function performs quantization(using flat scale matrix), followed by
765  *  inverse quantization to find transform domain SSD
766  *
767  * @par Description:
768  *  Performs quantization on coeffs
769  *
770  * @param[in] pi2_coeffs
771  *  4x4 Coeffs
772  *
773  * @param[in] pi2_quant_coeff
774  *  Scaling Matrix
775  *
776  * @param[out] pi2_dst
777  *  Output 4x4 coefficients
778  *
779  * @param[in] qp_div
780  *  Quantization parameter / 6
781  *
782  * @param[in] qp_rem
783  *  Quantization parameter % 6
784  *
785  * @param[in] src_strd
786  *  Input stride
787  *
788  * @param[in] dst_strd
789  *  Output Stride
790  *
791  * @param[out] csbf
792  *  coded sub block flag
793  *
794  * @param[in] csbf_strd
795  *  coded sub block flag
796  *
797  * @param[out] zero_col
798  *  zero column flag
799  *
800  * @param[out] zero_row
801  *  zero column flag
802  *
803  * @returns  cbf
804  * coded block flag
805  *
806  * @remarks
807  *  None
808  *
809  *******************************************************************************
810  */
811 
ihevc_quant_iquant_ssd_flat_scale_mat(WORD16 * pi2_coeffs,WORD16 * pi2_quant_coeff,WORD16 * pi2_q_dst,WORD16 * pi2_iq_dst,WORD32 trans_size,WORD32 qp_div,WORD32 qp_rem,WORD32 q_add,WORD32 * pi4_quant_round_factor_0_1,WORD32 * pi4_quant_round_factor_1_2,WORD32 src_strd,WORD32 dst_q_strd,WORD32 dst_iq_strd,UWORD8 * csbf,WORD32 csbf_strd,WORD32 * zero_col,WORD32 * zero_row,WORD16 * pi2_dequant_coeff,LWORD64 * pi8_cost)812 WORD32 ihevc_quant_iquant_ssd_flat_scale_mat
813     (
814     WORD16 *pi2_coeffs,
815     WORD16 *pi2_quant_coeff,
816     WORD16 *pi2_q_dst,
817     WORD16 *pi2_iq_dst,
818     WORD32  trans_size,
819     WORD32 qp_div,/* qpscaled / 6 */
820     WORD32 qp_rem,/* qpscaled % 6 */
821     WORD32 q_add,
822     WORD32 *pi4_quant_round_factor_0_1,
823     WORD32 *pi4_quant_round_factor_1_2,
824     WORD32 src_strd,
825     WORD32 dst_q_strd,
826     WORD32 dst_iq_strd,
827     UWORD8 *csbf,
828     WORD32 csbf_strd,
829     WORD32 *zero_col,
830     WORD32 *zero_row,
831     WORD16 *pi2_dequant_coeff,
832     LWORD64 *pi8_cost
833     )
834 {
835     WORD32 i, j;
836     WORD32 log2_size;
837     WORD16 *pi2_q_dst_orig;
838     WORD32 cbf = 0;
839     WORD32 bit_depth,shift_iq;
840     WORD32 val;
841     WORD16 i2_temp;
842     /* Initialize cost to zero */
843     WORD32 ssd_cost = 0;
844 
845     (void)pi4_quant_round_factor_0_1;
846     (void)pi4_quant_round_factor_1_2;
847     pi2_q_dst_orig  = pi2_q_dst;
848 
849     /* Quant initialization */
850     GETRANGE(log2_size, trans_size);
851     log2_size -= 1;
852 
853     bit_depth = 8 + 0;
854     shift_iq = bit_depth + log2_size - 5;
855 
856     for(i = 0; i < trans_size; i++)
857     {
858         for(j = 0; j < trans_size; j++)
859         {
860             /*  Back up the coefficients before Quantization    */
861             i2_temp = pi2_coeffs[j];
862 
863             /*QUANT(pi2_dst[j], pi2_coeffs[j],
864             pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
865             log2_size, q_add);*/
866 
867             /* modified by 1028 */
868             /*  Quantization    */
869             QUANT_NO_WEIGHTMAT(pi2_q_dst[j], pi2_coeffs[j],
870                   g_ihevc_quant_scales[qp_rem], qp_div,
871                   log2_size, q_add);
872 
873             if(pi2_q_dst[j] == 0)
874             {
875                 pi2_iq_dst[j] = 0;
876             }
877             else
878             {
879             /*  Inverse Quantization    */
880             IQUANT(pi2_iq_dst[j],
881                     pi2_q_dst[j], /*pi2_src[index*src_strd]*/
882                     pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem], /*pi2_dequant_coeff[index*trans_size] * g_ihevc_iquant_scales[qp_rem] */
883                     shift_iq,
884                     qp_div);
885             }
886 
887             /*  SSD Computation & Accumulation  */
888             val = i2_temp - pi2_iq_dst[j];
889             ssd_cost += val*val;
890 
891         }
892 
893         pi2_q_dst   += dst_q_strd;
894         pi2_iq_dst  += dst_iq_strd;
895         pi2_quant_coeff += trans_size;
896         pi2_coeffs += src_strd;
897         pi2_dequant_coeff += trans_size;
898     }
899     /* Store the cost */
900     *pi8_cost = ssd_cost;
901 
902     /* CSBF update */
903     {
904         WORD32 block_row, block_col;
905         WORD32 row, col;
906         WORD16 *pi2_block;
907         UWORD32 temp_zero_col = 0;
908         UWORD32 temp_zero_row = 0;
909 
910         pi2_q_dst = pi2_q_dst_orig;
911 
912         for(block_row = 0; block_row < trans_size; block_row += 4)
913         {
914             //block_col is incrementing by 1 for easy update of csbf pointer
915             for(block_col = 0; block_col < trans_size / 4; block_col++)
916             {
917                 pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
918                 *(csbf + block_col) = 0;
919 
920                 for(row = 0; row < 4; row++)
921                 {
922                     for(col = 0; col < 4; col++)
923                     {
924                         if(pi2_block[row * dst_q_strd + col] != 0)
925                         {
926                             *(csbf + block_col) = 1;
927                             break;
928                         }
929                     }
930                     if(*(csbf + block_col) == 1)
931                     {
932                         /* zero_col update *//* temp_zero_col = ~zero_col */
933                         temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
934                         // zero col can be optimized further. Now clearing the
935                         // entire 4 bits corresponding to 4 colums of 4x4 block
936                         // even if any 4x4 csbf is set
937 
938                         /* zero row update */ /* temp_zero_row = ~zero_row */
939                         temp_zero_row = (temp_zero_row) | (0xFU << block_row);
940                         // zero row can be optimized further. Now clearing the
941                         // entire 4 bits corresponding to 4 rows of 4x4 block
942                         // even if any 4x4 csbf is set
943 
944                         break;
945                     }
946                 }
947 
948                 cbf = cbf || (*(csbf + block_col)); // cbf update
949             }
950             csbf += csbf_strd;
951         }
952 
953         *zero_col = ~temp_zero_col; //final zero_col storing
954         *zero_row = ~temp_zero_row; //final zero_row storing
955     }
956 
957     return cbf;
958 }
959 
ihevc_quant_iquant_flat_scale_mat(WORD16 * pi2_coeffs,WORD16 * pi2_quant_coeff,WORD16 * pi2_q_dst,WORD16 * pi2_iq_dst,WORD32 trans_size,WORD32 qp_div,WORD32 qp_rem,WORD32 q_add,WORD32 * pi4_quant_round_factor_0_1,WORD32 * pi4_quant_round_factor_1_2,WORD32 src_strd,WORD32 dst_q_strd,WORD32 dst_iq_strd,UWORD8 * csbf,WORD32 csbf_strd,WORD32 * zero_col,WORD32 * zero_row,WORD16 * pi2_dequant_coeff,LWORD64 * pi8_cost)960 WORD32 ihevc_quant_iquant_flat_scale_mat
961     (
962     WORD16 *pi2_coeffs,
963     WORD16 *pi2_quant_coeff,
964     WORD16 *pi2_q_dst,
965     WORD16 *pi2_iq_dst,
966     WORD32  trans_size,
967     WORD32 qp_div,/* qpscaled / 6 */
968     WORD32 qp_rem,/* qpscaled % 6 */
969     WORD32 q_add,
970     WORD32 *pi4_quant_round_factor_0_1,
971     WORD32 *pi4_quant_round_factor_1_2,
972     WORD32 src_strd,
973     WORD32 dst_q_strd,
974     WORD32 dst_iq_strd,
975     UWORD8 *csbf,
976     WORD32 csbf_strd,
977     WORD32 *zero_col,
978     WORD32 *zero_row,
979     WORD16 *pi2_dequant_coeff,
980     LWORD64 *pi8_cost
981     )
982 {
983     WORD32 i, j;
984     WORD32 log2_size;
985     WORD16 *pi2_q_dst_orig;
986     WORD32 cbf = 0;
987     WORD32 bit_depth,shift_iq;
988     WORD16 i2_temp;
989 
990     (void)pi8_cost;
991     (void)pi4_quant_round_factor_0_1;
992     (void)pi4_quant_round_factor_1_2;
993     pi2_q_dst_orig  = pi2_q_dst;
994 
995     /* Quant initialization */
996     GETRANGE(log2_size, trans_size);
997     log2_size -= 1;
998 
999     bit_depth = 8 + 0;
1000     shift_iq = bit_depth + log2_size - 5;
1001 
1002     for(i = 0; i < trans_size; i++)
1003     {
1004         for(j = 0; j < trans_size; j++)
1005         {
1006             /*  Back up the coefficients before Quantization    */
1007             i2_temp = pi2_coeffs[j];
1008 
1009             /*  Quantization    */
1010             QUANT_NO_WEIGHTMAT(pi2_q_dst[j], pi2_coeffs[j],
1011                   g_ihevc_quant_scales[qp_rem], qp_div,
1012                   log2_size, q_add);
1013 
1014             if(pi2_q_dst[j] == 0)
1015             {
1016                 pi2_iq_dst[j] = 0;
1017             }
1018             else
1019             {
1020             /*  Inverse Quantization    */
1021             IQUANT(pi2_iq_dst[j],
1022                     pi2_q_dst[j], /*pi2_src[index*src_strd]*/
1023                     pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem], /*pi2_dequant_coeff[index*trans_size] * g_ihevc_iquant_scales[qp_rem] */
1024                     shift_iq,
1025                     qp_div);
1026             }
1027         }
1028 
1029         pi2_q_dst   += dst_q_strd;
1030         pi2_iq_dst  += dst_iq_strd;
1031         pi2_quant_coeff += trans_size;
1032         pi2_coeffs += src_strd;
1033         pi2_dequant_coeff += trans_size;
1034     }
1035 
1036     /* CSBF update */
1037     {
1038         WORD32 block_row, block_col;
1039         WORD32 row, col;
1040         WORD16 *pi2_block;
1041         UWORD32 temp_zero_col = 0;
1042         UWORD32 temp_zero_row = 0;
1043 
1044         pi2_q_dst = pi2_q_dst_orig;
1045 
1046         for(block_row = 0; block_row < trans_size; block_row += 4)
1047         {
1048             //block_col is incrementing by 1 for easy update of csbf pointer
1049             for(block_col = 0; block_col < trans_size / 4; block_col++)
1050             {
1051                 pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
1052                 *(csbf + block_col) = 0;
1053 
1054                 for(row = 0; row < 4; row++)
1055                 {
1056                     for(col = 0; col < 4; col++)
1057                     {
1058                         if(pi2_block[row * dst_q_strd + col] != 0)
1059                         {
1060                             *(csbf + block_col) = 1;
1061                             break;
1062                         }
1063                     }
1064                     if(*(csbf + block_col) == 1)
1065                     {
1066                         /* zero_col update *//* temp_zero_col = ~zero_col */
1067                         temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
1068                         // zero col can be optimized further. Now clearing the
1069                         // entire 4 bits corresponding to 4 colums of 4x4 block
1070                         // even if any 4x4 csbf is set
1071 
1072                         /* zero row update */ /* temp_zero_row = ~zero_row */
1073                         temp_zero_row = (temp_zero_row) | (0xFU << block_row);
1074                         // zero row can be optimized further. Now clearing the
1075                         // entire 4 bits corresponding to 4 rows of 4x4 block
1076                         // even if any 4x4 csbf is set
1077 
1078                         break;
1079                     }
1080                 }
1081 
1082                 cbf = cbf || (*(csbf + block_col)); // cbf update
1083             }
1084             csbf += csbf_strd;
1085         }
1086 
1087         *zero_col = ~temp_zero_col; //final zero_col storing
1088         *zero_row = ~temp_zero_row; //final zero_row storing
1089     }
1090 
1091     return cbf;
1092 }
1093 
1094 /**
1095  *******************************************************************************
1096  *
1097  * @brief
1098  *  This function performs quantization(using flat scale matrix), followed by
1099  *  inverse quantization to find transform domain SSD; when we perform RDOQ.
1100  *  In case the quantized value turns out to be grater than 1, we then requantize
1101  *  use half rounding.
1102  *
1103  * @par Description:
1104  *  Performs quantization on coeffs
1105  *
1106  * @param[in] pi2_coeffs
1107  *  4x4 Coeffs
1108  *
1109  * @param[in] pi2_quant_coeff
1110  *  Scaling Matrix
1111  *
1112  * @param[out] pi2_dst
1113  *  Output 4x4 coefficients
1114  *
1115  * @param[in] qp_div
1116  *  Quantization parameter / 6
1117  *
1118  * @param[in] qp_rem
1119  *  Quantization parameter % 6
1120  *
1121  * @param[in] src_strd
1122  *  Input stride
1123  *
1124  * @param[in] dst_strd
1125  *  Output Stride
1126  *
1127  * @param[out] csbf
1128  *  coded sub block flag
1129  *
1130  * @param[in] csbf_strd
1131  *  coded sub block flag
1132  *
1133  * @param[out] zero_col
1134  *  zero column flag
1135  *
1136  * @param[out] zero_row
1137  *  zero column flag
1138  *
1139  * @returns  cbf
1140  * coded block flag
1141  *
1142  * @remarks
1143  *  None
1144  *
1145  *******************************************************************************
1146  */
1147 
ihevc_quant_iquant_ssd_flat_scale_mat_rdoq(WORD16 * pi2_coeffs,WORD16 * pi2_quant_coeff,WORD16 * pi2_q_dst,WORD16 * pi2_iq_dst,WORD32 trans_size,WORD32 qp_div,WORD32 qp_rem,WORD32 q_add,WORD32 * pi4_quant_round_factor_0_1,WORD32 * pi4_quant_round_factor_1_2,WORD32 src_strd,WORD32 dst_q_strd,WORD32 dst_iq_strd,UWORD8 * csbf,WORD32 csbf_strd,WORD32 * zero_col,WORD32 * zero_row,WORD16 * pi2_dequant_coeff,LWORD64 * pi8_cost)1148 WORD32 ihevc_quant_iquant_ssd_flat_scale_mat_rdoq
1149     (
1150     WORD16 *pi2_coeffs,
1151     WORD16 *pi2_quant_coeff,
1152     WORD16 *pi2_q_dst,
1153     WORD16 *pi2_iq_dst,
1154     WORD32  trans_size,
1155     WORD32 qp_div,/* qpscaled / 6 */
1156     WORD32 qp_rem,/* qpscaled % 6 */
1157     WORD32 q_add,
1158     WORD32 *pi4_quant_round_factor_0_1,
1159     WORD32 *pi4_quant_round_factor_1_2,
1160     WORD32 src_strd,
1161     WORD32 dst_q_strd,
1162     WORD32 dst_iq_strd,
1163     UWORD8 *csbf,
1164     WORD32 csbf_strd,
1165     WORD32 *zero_col,
1166     WORD32 *zero_row,
1167     WORD16 *pi2_dequant_coeff,
1168     LWORD64 *pi8_cost
1169     )
1170 {
1171     WORD32 i, j;
1172     WORD32 log2_size;
1173     WORD16 *pi2_q_dst_orig;
1174     WORD32 cbf = 0;
1175     WORD32 bit_depth,shift_iq;
1176     WORD32 val;
1177     WORD16 i2_temp;
1178     /* Initialize cost to zero */
1179     WORD32 ssd_cost = 0;
1180 
1181     (void)pi4_quant_round_factor_0_1;
1182     (void)pi4_quant_round_factor_1_2;
1183     pi2_q_dst_orig  = pi2_q_dst;
1184 
1185     /* Quant initialization */
1186     GETRANGE(log2_size, trans_size);
1187     log2_size -= 1;
1188 
1189     bit_depth = 8 + 0;
1190     shift_iq = bit_depth + log2_size - 5;
1191 
1192     for(i = 0; i < trans_size; i++)
1193     {
1194         for(j = 0; j < trans_size; j++)
1195         {
1196             WORD16 i2_temp1;
1197             /*  Back up the coefficients before Quantization    */
1198             i2_temp = pi2_coeffs[j];
1199 
1200             /*QUANT(pi2_dst[j], pi2_coeffs[j],
1201             pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
1202             log2_size, q_add);*/
1203 
1204             /* modified by 1028 */
1205             /*  Quantization    */
1206 
1207             if (1)
1208             {
1209                 QUANT_NO_WEIGHTMAT(pi2_q_dst[j], pi2_coeffs[j],
1210                   g_ihevc_quant_scales[qp_rem], qp_div,
1211                   log2_size, q_add);
1212             }
1213             else
1214             {                                                                                                                                                                \
1215                 WORD16 inp = pi2_coeffs[j],out = pi2_q_dst[j];
1216                 WORD32 quant_coeff = g_ihevc_quant_scales[qp_rem];
1217                 WORD32 log2_trans_size = log2_size;
1218                 WORD32 tmp;                                                                                                                                                  \
1219                 WORD32 sign;                                                                                                                                                 \
1220                 WORD32 bit_depth,transform_shift;                                                                                                                            \
1221                 WORD32  q_bits, quant_multiplier;                                                                                                                            \
1222                                                                                                                                                                                 \
1223                 /* q_bits and q_add calculation*/                                                                                                                            \
1224                 /* To be moved outside in neon. To be computer once per transform call */                                                                                    \
1225                 bit_depth = 8;                                                                                                                                               \
1226                 transform_shift = MAX_TR_DYNAMIC_RANGE - bit_depth - log2_trans_size;                                                                                        \
1227                 quant_multiplier = 4 ; /* because quant_coeff are multiplied by 16. Instead of multiplying, we can reduce the division factor q_bits by 4 */                 \
1228                 q_bits = QUANT_SHIFT + qp_div + transform_shift + SCALING_Q_SHIFT - quant_multiplier - FLAT_RESCALE_MAT_Q_SHIFT /* 2048 */;                                                                       \
1229                                                                                                                                                                                 \
1230                 sign = (inp)<0 ? -1:1;                                                                                                                                       \
1231                                                                                                                                                                                 \
1232                 tmp = (WORD32)(abs(inp));                                                                                                                                    \
1233                 tmp = tmp * (quant_coeff);                                                                                                                                   \
1234                 tmp = tmp + (((WORD32)q_add) << (q_bits - QUANT_ROUND_FACTOR_Q));                                                                                            \
1235                 tmp = tmp >> q_bits;                                                                                                                                         \
1236                                                                                                                                                                                 \
1237                 tmp = tmp * sign;                                                                                                                                            \
1238                 out = (WORD16) CLIP_S16(tmp);                                                                                                                                \
1239             }
1240             i2_temp1 = pi2_q_dst[j];
1241             if (abs(pi2_q_dst[j]) > 1)
1242             {
1243                 QUANT_NO_WEIGHTMAT(pi2_q_dst[j], i2_temp,
1244                   g_ihevc_quant_scales[qp_rem], qp_div,
1245                   log2_size, ((1 << QUANT_ROUND_FACTOR_Q)/2));
1246             }
1247 
1248 
1249             ASSERT(abs(i2_temp1-pi2_q_dst[j]) <= 1);
1250             ASSERT(abs(i2_temp1) <= abs(pi2_q_dst[j]));
1251 
1252 
1253             /*  Inverse Quantization    */
1254             IQUANT(pi2_iq_dst[j],
1255                     pi2_q_dst[j], /*pi2_src[index*src_strd]*/
1256                     pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem], /*pi2_dequant_coeff[index*trans_size] * g_ihevc_iquant_scales[qp_rem] */
1257                     shift_iq,
1258                     qp_div);
1259 
1260             /*  SSD Computation & Accumulation  */
1261             val = i2_temp - pi2_iq_dst[j];
1262             ssd_cost += val*val;
1263 
1264         }
1265 
1266         pi2_q_dst   += dst_q_strd;
1267         pi2_iq_dst  += dst_iq_strd;
1268         pi2_quant_coeff += trans_size;
1269         pi2_coeffs += src_strd;
1270         pi2_dequant_coeff += trans_size;
1271 
1272     }
1273     /* Store the cost */
1274     *pi8_cost = ssd_cost;
1275 
1276     /* CSBF update */
1277     {
1278         WORD32 block_row, block_col;
1279         WORD32 row, col;
1280         WORD16 *pi2_block;
1281         UWORD32 temp_zero_col = 0;
1282         UWORD32 temp_zero_row = 0;
1283 
1284         pi2_q_dst = pi2_q_dst_orig;
1285 
1286         for(block_row = 0; block_row < trans_size; block_row += 4)
1287         {
1288             //block_col is incrementing by 1 for easy update of csbf pointer
1289             for(block_col = 0; block_col < trans_size / 4; block_col++)
1290             {
1291                 pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
1292                 *(csbf + block_col) = 0;
1293 
1294                 for(row = 0; row < 4; row++)
1295                 {
1296                     for(col = 0; col < 4; col++)
1297                     {
1298                         if(pi2_block[row * dst_q_strd + col] != 0)
1299                         {
1300                             *(csbf + block_col) = 1;
1301                             break;
1302                         }
1303                     }
1304                     if(*(csbf + block_col) == 1)
1305                     {
1306                         /* zero_col update *//* temp_zero_col = ~zero_col */
1307                         temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
1308                         // zero col can be optimized further. Now clearing the
1309                         // entire 4 bits corresponding to 4 colums of 4x4 block
1310                         // even if any 4x4 csbf is set
1311 
1312                         /* zero row update */ /* temp_zero_row = ~zero_row */
1313                         temp_zero_row = (temp_zero_row) | (0xFU << block_row);
1314                         // zero row can be optimized further. Now clearing the
1315                         // entire 4 bits corresponding to 4 rows of 4x4 block
1316                         // even if any 4x4 csbf is set
1317 
1318                         break;
1319                     }
1320                 }
1321 
1322                 cbf = cbf || (*(csbf + block_col)); // cbf update
1323             }
1324             csbf += csbf_strd;
1325         }
1326 
1327         *zero_col = ~temp_zero_col; //final zero_col storing
1328         *zero_row = ~temp_zero_row; //final zero_row storing
1329     }
1330     return cbf;
1331 }
1332 
ihevc_quant_iquant_flat_scale_mat_rdoq(WORD16 * pi2_coeffs,WORD16 * pi2_quant_coeff,WORD16 * pi2_q_dst,WORD16 * pi2_iq_dst,WORD32 trans_size,WORD32 qp_div,WORD32 qp_rem,WORD32 q_add,WORD32 * pi4_quant_round_factor_0_1,WORD32 * pi4_quant_round_factor_1_2,WORD32 src_strd,WORD32 dst_q_strd,WORD32 dst_iq_strd,UWORD8 * csbf,WORD32 csbf_strd,WORD32 * zero_col,WORD32 * zero_row,WORD16 * pi2_dequant_coeff,LWORD64 * pi8_cost)1333 WORD32 ihevc_quant_iquant_flat_scale_mat_rdoq
1334     (
1335     WORD16 *pi2_coeffs,
1336     WORD16 *pi2_quant_coeff,
1337     WORD16 *pi2_q_dst,
1338     WORD16 *pi2_iq_dst,
1339     WORD32  trans_size,
1340     WORD32 qp_div,/* qpscaled / 6 */
1341     WORD32 qp_rem,/* qpscaled % 6 */
1342     WORD32 q_add,
1343     WORD32 *pi4_quant_round_factor_0_1,
1344     WORD32 *pi4_quant_round_factor_1_2,
1345     WORD32 src_strd,
1346     WORD32 dst_q_strd,
1347     WORD32 dst_iq_strd,
1348     UWORD8 *csbf,
1349     WORD32 csbf_strd,
1350     WORD32 *zero_col,
1351     WORD32 *zero_row,
1352     WORD16 *pi2_dequant_coeff,
1353     LWORD64 *pi8_cost
1354     )
1355 {
1356     WORD32 i, j;
1357     WORD32 log2_size;
1358     WORD16 *pi2_q_dst_orig;
1359     WORD32 cbf = 0;
1360     WORD32 bit_depth,shift_iq;
1361     WORD16 i2_temp;
1362 
1363     (void)pi8_cost;
1364     (void)pi4_quant_round_factor_0_1;
1365     (void)pi4_quant_round_factor_1_2;
1366     pi2_q_dst_orig  = pi2_q_dst;
1367 
1368     /* Quant initialization */
1369     GETRANGE(log2_size, trans_size);
1370     log2_size -= 1;
1371 
1372     bit_depth = 8 + 0;
1373     shift_iq = bit_depth + log2_size - 5;
1374 
1375     for(i = 0; i < trans_size; i++)
1376     {
1377         for(j = 0; j < trans_size; j++)
1378         {
1379             WORD16 i2_temp1;
1380             /*  Back up the coefficients before Quantization    */
1381             i2_temp = pi2_coeffs[j];
1382 
1383             QUANT_NO_WEIGHTMAT(pi2_q_dst[j], pi2_coeffs[j],
1384                 g_ihevc_quant_scales[qp_rem], qp_div,
1385                 log2_size, q_add);
1386 
1387             i2_temp1 = pi2_q_dst[j];
1388 
1389             if (abs(pi2_q_dst[j]) > 1)
1390             {
1391                 QUANT_NO_WEIGHTMAT(pi2_q_dst[j], i2_temp,
1392                     g_ihevc_quant_scales[qp_rem], qp_div,
1393                     log2_size, ((1 << QUANT_ROUND_FACTOR_Q)/2));
1394             }
1395 
1396             ASSERT(abs(i2_temp1-pi2_q_dst[j]) <= 1);
1397             ASSERT(abs(i2_temp1) <= abs(pi2_q_dst[j]));
1398 
1399             IQUANT(pi2_iq_dst[j],
1400                 pi2_q_dst[j], /*pi2_src[index*src_strd]*/
1401                 pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem], /*pi2_dequant_coeff[index*trans_size] * g_ihevc_iquant_scales[qp_rem] */
1402                 shift_iq,
1403                 qp_div);
1404         }
1405 
1406         pi2_q_dst   += dst_q_strd;
1407         pi2_iq_dst  += dst_iq_strd;
1408         pi2_quant_coeff += trans_size;
1409         pi2_coeffs += src_strd;
1410         pi2_dequant_coeff += trans_size;
1411     }
1412 
1413     /* CSBF update */
1414     {
1415         WORD32 block_row, block_col;
1416         WORD32 row, col;
1417         WORD16 *pi2_block;
1418         UWORD32 temp_zero_col = 0;
1419         UWORD32 temp_zero_row = 0;
1420 
1421         pi2_q_dst = pi2_q_dst_orig;
1422 
1423         for(block_row = 0; block_row < trans_size; block_row += 4)
1424         {
1425             //block_col is incrementing by 1 for easy update of csbf pointer
1426             for(block_col = 0; block_col < trans_size / 4; block_col++)
1427             {
1428                 pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
1429                 *(csbf + block_col) = 0;
1430 
1431                 for(row = 0; row < 4; row++)
1432                 {
1433                     for(col = 0; col < 4; col++)
1434                     {
1435                         if(pi2_block[row * dst_q_strd + col] != 0)
1436                         {
1437                             *(csbf + block_col) = 1;
1438                             break;
1439                         }
1440                     }
1441                     if(*(csbf + block_col) == 1)
1442                     {
1443                         /* zero_col update *//* temp_zero_col = ~zero_col */
1444                         temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
1445                         // zero col can be optimized further. Now clearing the
1446                         // entire 4 bits corresponding to 4 colums of 4x4 block
1447                         // even if any 4x4 csbf is set
1448 
1449                         /* zero row update */ /* temp_zero_row = ~zero_row */
1450                         temp_zero_row = (temp_zero_row) | (0xFU << block_row);
1451                         // zero row can be optimized further. Now clearing the
1452                         // entire 4 bits corresponding to 4 rows of 4x4 block
1453                         // even if any 4x4 csbf is set
1454 
1455                         break;
1456                     }
1457                 }
1458 
1459                 cbf = cbf || (*(csbf + block_col)); // cbf update
1460             }
1461             csbf += csbf_strd;
1462         }
1463 
1464         *zero_col = ~temp_zero_col; //final zero_col storing
1465         *zero_row = ~temp_zero_row; //final zero_row storing
1466     }
1467 
1468     return cbf;
1469 }
1470 
1471 
1472 /**
1473 *******************************************************************************
1474 *
1475 * @brief
1476 *  This function performs quantization, followed by Inverse
1477 *  quantization to find transform domain SSD
1478 *
1479 * @par Description:
1480 *  Performs quantization on coeffs
1481 *
1482 * @param[in] pi2_coeffs
1483 *  4x4 Coeffs
1484 *
1485 * @param[in] pi2_quant_coeff
1486 *  Scaling Matrix
1487 *
1488 * @param[out] pi2_dst
1489 *  Output 4x4 coefficients
1490 *
1491 * @param[in] qp_div
1492 *  Quantization parameter / 6
1493 *
1494 * @param[in] qp_rem
1495 *  Quantization parameter % 6
1496 *
1497 * @param[in] src_strd
1498 *  Input stride
1499 *
1500 * @param[in] dst_strd
1501 *  Output Stride
1502 *
1503 * @param[out] csbf
1504 *  coded sub block flag
1505 *
1506 * @param[in] csbf_strd
1507 *  coded sub block flag
1508 *
1509 * @param[out] zero_col
1510 *  zero column flag
1511 *
1512 * @param[out] zero_row
1513 *  zero column flag
1514 *
1515 * @returns  cbf
1516 * coded block flag
1517 *
1518 * @remarks
1519 *  None
1520 *
1521 *******************************************************************************
1522 */
1523 
ihevc_q_iq_ssd_var_rnd_fact(WORD16 * pi2_coeffs,WORD16 * pi2_quant_coeff,WORD16 * pi2_q_dst,WORD16 * pi2_iq_dst,WORD32 trans_size,WORD32 qp_div,WORD32 qp_rem,WORD32 q_add,WORD32 * pi4_quant_round_factor_0_1,WORD32 * pi4_quant_round_factor_1_2,WORD32 src_strd,WORD32 dst_q_strd,WORD32 dst_iq_strd,UWORD8 * csbf,WORD32 csbf_strd,WORD32 * zero_col,WORD32 * zero_row,WORD16 * pi2_dequant_coeff,LWORD64 * pi8_cost)1524 WORD32 ihevc_q_iq_ssd_var_rnd_fact
1525     (
1526     WORD16 *pi2_coeffs,
1527     WORD16 *pi2_quant_coeff,
1528     WORD16 *pi2_q_dst,
1529     WORD16 *pi2_iq_dst,
1530     WORD32  trans_size,
1531     WORD32 qp_div,/* qpscaled / 6 */
1532     WORD32 qp_rem,/* qpscaled % 6 */
1533     WORD32 q_add,
1534     WORD32 *pi4_quant_round_factor_0_1,
1535     WORD32 *pi4_quant_round_factor_1_2,
1536     WORD32 src_strd,
1537     WORD32 dst_q_strd,
1538     WORD32 dst_iq_strd,
1539     UWORD8 *csbf,
1540     WORD32 csbf_strd,
1541     WORD32 *zero_col,
1542     WORD32 *zero_row,
1543     WORD16 *pi2_dequant_coeff,
1544     LWORD64 *pi8_cost
1545     )
1546 {
1547     WORD32 i, j;
1548     WORD32 log2_size;
1549     WORD16 *pi2_q_dst_orig;
1550     WORD32 cbf = 0;
1551     WORD32 bit_depth,shift_iq;
1552     WORD32 val;
1553     WORD16 i2_temp;
1554     //WORD16 i2_temp_1;
1555     /* Initialize cost to zero */
1556     WORD32 ssd_cost = 0;
1557 
1558     (void)q_add;
1559     pi2_q_dst_orig  = pi2_q_dst;
1560 
1561 
1562     /* Quant initialization */
1563     GETRANGE(log2_size, trans_size);
1564     log2_size -= 1;
1565 
1566     bit_depth = 8 + 0;
1567     shift_iq = bit_depth + log2_size - 5;
1568 
1569     for(i = 0; i < trans_size; i++)
1570     {
1571         for(j = 0; j < trans_size; j++)
1572         {
1573             /*  Back up the coefficients before Quantization    */
1574             i2_temp = pi2_coeffs[j];
1575 
1576 
1577             {
1578                 /*  Quantization    */
1579                 QUANT(pi2_q_dst[j],i2_temp,
1580                     pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
1581                     log2_size, 0);
1582                 if (abs(pi2_q_dst[j]) >= 2)
1583                 {
1584                     QUANT(pi2_q_dst[j],i2_temp,
1585                         pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
1586                         log2_size, ((1 << QUANT_ROUND_FACTOR_Q)/2));
1587 
1588                 }
1589                 else if (abs(pi2_q_dst[j]) >= 1)
1590                 {
1591                     QUANT(pi2_q_dst[j],i2_temp,
1592                         pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
1593                         log2_size, *pi4_quant_round_factor_1_2);
1594                 }
1595 
1596                 else
1597                 {
1598                     /*  Quantization    */
1599                     QUANT(pi2_q_dst[j],i2_temp,
1600                         pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
1601                         log2_size, *pi4_quant_round_factor_0_1);
1602                 }
1603 
1604             }
1605 
1606 
1607 
1608             /*  Inverse Quantization    */
1609             IQUANT(pi2_iq_dst[j],
1610                 pi2_q_dst[j], /*pi2_src[index*src_strd]*/
1611                 pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem],
1612                 /*pi2_dequant_coeff[index*trans_size] * g_ihevc_iquant_scales[qp_rem] */
1613                 shift_iq,
1614                 qp_div);
1615 
1616             /*  SSD Computation & Accumulation  */
1617             val = i2_temp - pi2_iq_dst[j];
1618             ssd_cost += val*val;
1619 
1620             pi4_quant_round_factor_0_1++;
1621             pi4_quant_round_factor_1_2++;
1622         }
1623 
1624         pi2_q_dst   += dst_q_strd;
1625         pi2_iq_dst  += dst_iq_strd;
1626         pi2_quant_coeff += trans_size;
1627         pi2_coeffs += src_strd;
1628         pi2_dequant_coeff += trans_size;
1629     }
1630     /* Store the cost */
1631     *pi8_cost = ssd_cost;
1632 
1633     /* CSBF update */
1634     {
1635         WORD32 block_row, block_col;
1636         WORD32 row, col;
1637         WORD16 *pi2_block;
1638         UWORD32 temp_zero_col = 0;
1639         UWORD32 temp_zero_row = 0;
1640 
1641         pi2_q_dst = pi2_q_dst_orig;
1642 
1643         for(block_row = 0; block_row < trans_size; block_row += 4)
1644         {
1645             //block_col is incrementing by 1 for easy update of csbf pointer
1646             for(block_col = 0; block_col < trans_size / 4; block_col++)
1647             {
1648                 pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
1649                 *(csbf + block_col) = 0;
1650 
1651                 for(row = 0; row < 4; row++)
1652                 {
1653                     for(col = 0; col < 4; col++)
1654                     {
1655                         if(pi2_block[row * dst_q_strd + col] != 0)
1656                         {
1657                             *(csbf + block_col) = 1;
1658                             break;
1659                         }
1660                     }
1661                     if(*(csbf + block_col) == 1)
1662                     {
1663                         /* zero_col update *//* temp_zero_col = ~zero_col */
1664                         temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
1665                         // zero col can be optimized further. Now clearing the
1666                         // entire 4 bits corresponding to 4 colums of 4x4 block
1667                         // even if any 4x4 csbf is set
1668 
1669                         /* zero row update */ /* temp_zero_row = ~zero_row */
1670                         temp_zero_row = (temp_zero_row) | (0xFU << block_row);
1671                         // zero row can be optimized further. Now clearing the
1672                         // entire 4 bits corresponding to 4 rows of 4x4 block
1673                         // even if any 4x4 csbf is set
1674 
1675                         break;
1676                     }
1677                 }
1678 
1679                 cbf = cbf || (*(csbf + block_col)); // cbf update
1680             }
1681             csbf += csbf_strd;
1682         }
1683 
1684         *zero_col = ~temp_zero_col; //final zero_col storing
1685         *zero_row = ~temp_zero_row; //final zero_row storing
1686     }
1687 
1688     return cbf;
1689 }
1690 
ihevc_q_iq_var_rnd_fact(WORD16 * pi2_coeffs,WORD16 * pi2_quant_coeff,WORD16 * pi2_q_dst,WORD16 * pi2_iq_dst,WORD32 trans_size,WORD32 qp_div,WORD32 qp_rem,WORD32 q_add,WORD32 * pi4_quant_round_factor_0_1,WORD32 * pi4_quant_round_factor_1_2,WORD32 src_strd,WORD32 dst_q_strd,WORD32 dst_iq_strd,UWORD8 * csbf,WORD32 csbf_strd,WORD32 * zero_col,WORD32 * zero_row,WORD16 * pi2_dequant_coeff,LWORD64 * pi8_cost)1691 WORD32 ihevc_q_iq_var_rnd_fact
1692     (
1693     WORD16 *pi2_coeffs,
1694     WORD16 *pi2_quant_coeff,
1695     WORD16 *pi2_q_dst,
1696     WORD16 *pi2_iq_dst,
1697     WORD32  trans_size,
1698     WORD32 qp_div,/* qpscaled / 6 */
1699     WORD32 qp_rem,/* qpscaled % 6 */
1700     WORD32 q_add,
1701     WORD32 *pi4_quant_round_factor_0_1,
1702     WORD32 *pi4_quant_round_factor_1_2,
1703     WORD32 src_strd,
1704     WORD32 dst_q_strd,
1705     WORD32 dst_iq_strd,
1706     UWORD8 *csbf,
1707     WORD32 csbf_strd,
1708     WORD32 *zero_col,
1709     WORD32 *zero_row,
1710     WORD16 *pi2_dequant_coeff,
1711     LWORD64 *pi8_cost
1712     )
1713 {
1714     WORD32 i, j;
1715     WORD32 log2_size;
1716     WORD16 *pi2_q_dst_orig;
1717     WORD32 cbf = 0;
1718     WORD32 bit_depth,shift_iq;
1719     WORD16 i2_temp;
1720 
1721     (void)q_add;
1722     (void)pi8_cost;
1723     pi2_q_dst_orig  = pi2_q_dst;
1724 
1725     GETRANGE(log2_size, trans_size);
1726     log2_size -= 1;
1727 
1728     bit_depth = 8 + 0;
1729     shift_iq = bit_depth + log2_size - 5;
1730 
1731     for(i = 0; i < trans_size; i++)
1732     {
1733         for(j = 0; j < trans_size; j++)
1734         {
1735             i2_temp = pi2_coeffs[j];
1736 
1737             {
1738                 QUANT(pi2_q_dst[j],i2_temp,
1739                     pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
1740                     log2_size, 0);
1741 
1742                 if (abs(pi2_q_dst[j]) >= 2)
1743                 {
1744                     QUANT(pi2_q_dst[j],i2_temp,
1745                         pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
1746                         log2_size, ((1 << QUANT_ROUND_FACTOR_Q)/2));
1747                 }
1748                 else if (abs(pi2_q_dst[j]) >= 1)
1749                 {
1750                     QUANT(pi2_q_dst[j],i2_temp,
1751                         pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
1752                         log2_size, *pi4_quant_round_factor_1_2);
1753                 }
1754                 else
1755                 {
1756                     QUANT(pi2_q_dst[j],i2_temp,
1757                         pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
1758                         log2_size, *pi4_quant_round_factor_0_1);
1759                 }
1760             }
1761 
1762             IQUANT(pi2_iq_dst[j],
1763                 pi2_q_dst[j], /*pi2_src[index*src_strd]*/
1764                 pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem],
1765                 shift_iq,
1766                 qp_div);
1767 
1768             pi4_quant_round_factor_0_1++;
1769             pi4_quant_round_factor_1_2++;
1770         }
1771 
1772         pi2_q_dst   += dst_q_strd;
1773         pi2_iq_dst  += dst_iq_strd;
1774         pi2_quant_coeff += trans_size;
1775         pi2_coeffs += src_strd;
1776         pi2_dequant_coeff += trans_size;
1777     }
1778 
1779     /* CSBF update */
1780     {
1781         WORD32 block_row, block_col;
1782         WORD32 row, col;
1783         WORD16 *pi2_block;
1784         UWORD32 temp_zero_col = 0;
1785         UWORD32 temp_zero_row = 0;
1786 
1787         pi2_q_dst = pi2_q_dst_orig;
1788 
1789         for(block_row = 0; block_row < trans_size; block_row += 4)
1790         {
1791             //block_col is incrementing by 1 for easy update of csbf pointer
1792             for(block_col = 0; block_col < trans_size / 4; block_col++)
1793             {
1794                 pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
1795                 *(csbf + block_col) = 0;
1796 
1797                 for(row = 0; row < 4; row++)
1798                 {
1799                     for(col = 0; col < 4; col++)
1800                     {
1801                         if(pi2_block[row * dst_q_strd + col] != 0)
1802                         {
1803                             *(csbf + block_col) = 1;
1804                             break;
1805                         }
1806                     }
1807                     if(*(csbf + block_col) == 1)
1808                     {
1809                         /* zero_col update *//* temp_zero_col = ~zero_col */
1810                         temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
1811                         // zero col can be optimized further. Now clearing the
1812                         // entire 4 bits corresponding to 4 colums of 4x4 block
1813                         // even if any 4x4 csbf is set
1814 
1815                         /* zero row update */ /* temp_zero_row = ~zero_row */
1816                         temp_zero_row = (temp_zero_row) | (0xFU << block_row);
1817                         // zero row can be optimized further. Now clearing the
1818                         // entire 4 bits corresponding to 4 rows of 4x4 block
1819                         // even if any 4x4 csbf is set
1820 
1821                         break;
1822                     }
1823                 }
1824 
1825                 cbf = cbf || (*(csbf + block_col)); // cbf update
1826             }
1827             csbf += csbf_strd;
1828         }
1829 
1830         *zero_col = ~temp_zero_col; //final zero_col storing
1831         *zero_row = ~temp_zero_row; //final zero_row storing
1832     }
1833 
1834     return cbf;
1835 }
1836 
1837 /**
1838 *******************************************************************************
1839 *
1840 * @brief
1841 *  This function performs quantization(using flat scale matrix), followed by
1842 *  inverse quantization to find transform domain SSD; when we perform RDOQ.
1843 *  In case the quantized value turns out to be grater than 1, we then requantize
1844 *  use half rounding.
1845 *
1846 * @par Description:
1847 *  Performs quantization on coeffs
1848 *
1849 * @param[in] pi2_coeffs
1850 *  4x4 Coeffs
1851 *
1852 * @param[in] pi2_quant_coeff
1853 *  Scaling Matrix
1854 *
1855 * @param[out] pi2_dst
1856 *  Output 4x4 coefficients
1857 *
1858 * @param[in] qp_div
1859 *  Quantization parameter / 6
1860 *
1861 * @param[in] qp_rem
1862 *  Quantization parameter % 6
1863 *
1864 * @param[in] src_strd
1865 *  Input stride
1866 *
1867 * @param[in] dst_strd
1868 *  Output Stride
1869 *
1870 * @param[out] csbf
1871 *  coded sub block flag
1872 *
1873 * @param[in] csbf_strd
1874 *  coded sub block flag
1875 *
1876 * @param[out] zero_col
1877 *  zero column flag
1878 *
1879 * @param[out] zero_row
1880 *  zero column flag
1881 *
1882 * @returns  cbf
1883 * coded block flag
1884 *
1885 * @remarks
1886 *  None
1887 *
1888 *******************************************************************************
1889 */
1890 
ihevc_q_iq_ssd_flat_scale_mat_var_rnd_fact(WORD16 * pi2_coeffs,WORD16 * pi2_quant_coeff,WORD16 * pi2_q_dst,WORD16 * pi2_iq_dst,WORD32 trans_size,WORD32 qp_div,WORD32 qp_rem,WORD32 q_add,WORD32 * pi4_quant_round_factor_0_1,WORD32 * pi4_quant_round_factor_1_2,WORD32 src_strd,WORD32 dst_q_strd,WORD32 dst_iq_strd,UWORD8 * csbf,WORD32 csbf_strd,WORD32 * zero_col,WORD32 * zero_row,WORD16 * pi2_dequant_coeff,LWORD64 * pi8_cost)1891 WORD32 ihevc_q_iq_ssd_flat_scale_mat_var_rnd_fact
1892     (
1893     WORD16 *pi2_coeffs,
1894     WORD16 *pi2_quant_coeff,
1895     WORD16 *pi2_q_dst,
1896     WORD16 *pi2_iq_dst,
1897     WORD32  trans_size,
1898     WORD32 qp_div,/* qpscaled / 6 */
1899     WORD32 qp_rem,/* qpscaled % 6 */
1900     WORD32 q_add,
1901     WORD32 *pi4_quant_round_factor_0_1,
1902     WORD32 *pi4_quant_round_factor_1_2,
1903     WORD32 src_strd,
1904     WORD32 dst_q_strd,
1905     WORD32 dst_iq_strd,
1906     UWORD8 *csbf,
1907     WORD32 csbf_strd,
1908     WORD32 *zero_col,
1909     WORD32 *zero_row,
1910     WORD16 *pi2_dequant_coeff,
1911     LWORD64 *pi8_cost
1912     )
1913 {
1914     WORD32 i, j;
1915     WORD32 log2_size;
1916     WORD16 *pi2_q_dst_orig;
1917     WORD32 cbf = 0;
1918     WORD32 bit_depth,shift_iq;
1919     WORD32 val;
1920     WORD16 i2_temp;
1921     /* Initialize cost to zero */
1922     WORD32 ssd_cost = 0;
1923 
1924     (void)q_add;
1925     pi2_q_dst_orig  = pi2_q_dst;
1926 
1927     /* Quant initialization */
1928     GETRANGE(log2_size, trans_size);
1929     log2_size -= 1;
1930 
1931     bit_depth = 8 + 0;
1932     shift_iq = bit_depth + log2_size - 5;
1933 
1934     for(i = 0; i < trans_size; i++)
1935     {
1936         for(j = 0; j < trans_size; j++)
1937         {
1938             WORD16 i2_temp1;
1939             /*  Back up the coefficients before Quantization    */
1940             i2_temp = pi2_coeffs[j];
1941 
1942             /*QUANT(pi2_dst[j], pi2_coeffs[j],
1943             pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
1944             log2_size, q_add);*/
1945 
1946             /* modified by 1028 */
1947             /*  Quantization    */
1948 
1949 
1950             {
1951                 QUANT_NO_WEIGHTMAT(pi2_q_dst[j], pi2_coeffs[j],
1952                     g_ihevc_quant_scales[qp_rem], qp_div,
1953                     log2_size, 0);
1954 
1955                 i2_temp1 = pi2_q_dst[j];
1956 
1957                 if (abs(pi2_q_dst[j]) >= 2)
1958                 {
1959                     QUANT_NO_WEIGHTMAT(pi2_q_dst[j], i2_temp,
1960                         g_ihevc_quant_scales[qp_rem], qp_div,
1961                         log2_size, ((1 << QUANT_ROUND_FACTOR_Q)/2));
1962                 }
1963                 else if (abs(pi2_q_dst[j]) >= 1)
1964                 {
1965                     QUANT_NO_WEIGHTMAT(pi2_q_dst[j], pi2_coeffs[j],
1966                         g_ihevc_quant_scales[qp_rem], qp_div,
1967                         log2_size, *pi4_quant_round_factor_1_2);
1968                 }
1969 
1970                 else
1971                 {
1972                     QUANT_NO_WEIGHTMAT(pi2_q_dst[j], pi2_coeffs[j],
1973                         g_ihevc_quant_scales[qp_rem], qp_div,
1974                         log2_size, *pi4_quant_round_factor_0_1);
1975                 }
1976 
1977             }
1978 
1979 
1980 
1981 
1982             ASSERT(abs(i2_temp1-pi2_q_dst[j]) <= 1);
1983 
1984 
1985             /*  Inverse Quantization    */
1986             IQUANT(pi2_iq_dst[j],
1987                 pi2_q_dst[j], /*pi2_src[index*src_strd]*/
1988                 pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem], /*pi2_dequant_coeff[index*trans_size] * g_ihevc_iquant_scales[qp_rem] */
1989                 shift_iq,
1990                 qp_div);
1991 
1992             /*  SSD Computation & Accumulation  */
1993             val = i2_temp - pi2_iq_dst[j];
1994             ssd_cost += val*val;
1995 
1996             pi4_quant_round_factor_0_1++;
1997             pi4_quant_round_factor_1_2++;
1998         }
1999 
2000         pi2_q_dst   += dst_q_strd;
2001         pi2_iq_dst  += dst_iq_strd;
2002         pi2_quant_coeff += trans_size;
2003         pi2_coeffs += src_strd;
2004         pi2_dequant_coeff += trans_size;
2005 
2006     }
2007     /* Store the cost */
2008     *pi8_cost = ssd_cost;
2009 
2010     /* CSBF update */
2011     {
2012         WORD32 block_row, block_col;
2013         WORD32 row, col;
2014         WORD16 *pi2_block;
2015         UWORD32 temp_zero_col = 0;
2016         UWORD32 temp_zero_row = 0;
2017 
2018         pi2_q_dst = pi2_q_dst_orig;
2019 
2020         for(block_row = 0; block_row < trans_size; block_row += 4)
2021         {
2022             //block_col is incrementing by 1 for easy update of csbf pointer
2023             for(block_col = 0; block_col < trans_size / 4; block_col++)
2024             {
2025                 pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
2026                 *(csbf + block_col) = 0;
2027 
2028                 for(row = 0; row < 4; row++)
2029                 {
2030                     for(col = 0; col < 4; col++)
2031                     {
2032                         if(pi2_block[row * dst_q_strd + col] != 0)
2033                         {
2034                             *(csbf + block_col) = 1;
2035                             break;
2036                         }
2037                     }
2038                     if(*(csbf + block_col) == 1)
2039                     {
2040                         /* zero_col update *//* temp_zero_col = ~zero_col */
2041                         temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
2042                         // zero col can be optimized further. Now clearing the
2043                         // entire 4 bits corresponding to 4 colums of 4x4 block
2044                         // even if any 4x4 csbf is set
2045 
2046                         /* zero row update */ /* temp_zero_row = ~zero_row */
2047                         temp_zero_row = (temp_zero_row) | (0xFU << block_row);
2048                         // zero row can be optimized further. Now clearing the
2049                         // entire 4 bits corresponding to 4 rows of 4x4 block
2050                         // even if any 4x4 csbf is set
2051 
2052                         break;
2053                     }
2054                 }
2055 
2056                 cbf = cbf || (*(csbf + block_col)); // cbf update
2057             }
2058             csbf += csbf_strd;
2059         }
2060 
2061         *zero_col = ~temp_zero_col; //final zero_col storing
2062         *zero_row = ~temp_zero_row; //final zero_row storing
2063     }
2064     return cbf;
2065 }
2066 
ihevc_q_iq_flat_scale_mat_var_rnd_fact(WORD16 * pi2_coeffs,WORD16 * pi2_quant_coeff,WORD16 * pi2_q_dst,WORD16 * pi2_iq_dst,WORD32 trans_size,WORD32 qp_div,WORD32 qp_rem,WORD32 q_add,WORD32 * pi4_quant_round_factor_0_1,WORD32 * pi4_quant_round_factor_1_2,WORD32 src_strd,WORD32 dst_q_strd,WORD32 dst_iq_strd,UWORD8 * csbf,WORD32 csbf_strd,WORD32 * zero_col,WORD32 * zero_row,WORD16 * pi2_dequant_coeff,LWORD64 * pi8_cost)2067 WORD32 ihevc_q_iq_flat_scale_mat_var_rnd_fact
2068     (
2069     WORD16 *pi2_coeffs,
2070     WORD16 *pi2_quant_coeff,
2071     WORD16 *pi2_q_dst,
2072     WORD16 *pi2_iq_dst,
2073     WORD32  trans_size,
2074     WORD32 qp_div,/* qpscaled / 6 */
2075     WORD32 qp_rem,/* qpscaled % 6 */
2076     WORD32 q_add,
2077     WORD32 *pi4_quant_round_factor_0_1,
2078     WORD32 *pi4_quant_round_factor_1_2,
2079     WORD32 src_strd,
2080     WORD32 dst_q_strd,
2081     WORD32 dst_iq_strd,
2082     UWORD8 *csbf,
2083     WORD32 csbf_strd,
2084     WORD32 *zero_col,
2085     WORD32 *zero_row,
2086     WORD16 *pi2_dequant_coeff,
2087     LWORD64 *pi8_cost
2088     )
2089 {
2090     WORD32 i, j;
2091     WORD32 log2_size;
2092     WORD16 *pi2_q_dst_orig;
2093     WORD32 cbf = 0;
2094     WORD32 bit_depth,shift_iq;
2095     WORD16 i2_temp;
2096 
2097     (void)q_add;
2098     (void)pi8_cost;
2099     pi2_q_dst_orig  = pi2_q_dst;
2100 
2101     GETRANGE(log2_size, trans_size);
2102     log2_size -= 1;
2103 
2104     bit_depth = 8 + 0;
2105     shift_iq = bit_depth + log2_size - 5;
2106 
2107     for(i = 0; i < trans_size; i++)
2108     {
2109         for(j = 0; j < trans_size; j++)
2110         {
2111             WORD16 i2_temp1;
2112 
2113             i2_temp = pi2_coeffs[j];
2114 
2115             {
2116                 QUANT_NO_WEIGHTMAT(pi2_q_dst[j], pi2_coeffs[j],
2117                     g_ihevc_quant_scales[qp_rem], qp_div,
2118                     log2_size, 0);
2119 
2120                 i2_temp1 = pi2_q_dst[j];
2121 
2122                 if (abs(pi2_q_dst[j]) >= 2)
2123                 {
2124                     QUANT_NO_WEIGHTMAT(pi2_q_dst[j], i2_temp,
2125                         g_ihevc_quant_scales[qp_rem], qp_div,
2126                         log2_size, ((1 << QUANT_ROUND_FACTOR_Q)/2));
2127                 }
2128                 else if (abs(pi2_q_dst[j]) >= 1)
2129                 {
2130                     QUANT_NO_WEIGHTMAT(pi2_q_dst[j], pi2_coeffs[j],
2131                         g_ihevc_quant_scales[qp_rem], qp_div,
2132                         log2_size, *pi4_quant_round_factor_1_2);
2133                 }
2134                 else
2135                 {
2136                     QUANT_NO_WEIGHTMAT(pi2_q_dst[j], pi2_coeffs[j],
2137                         g_ihevc_quant_scales[qp_rem], qp_div,
2138                         log2_size, *pi4_quant_round_factor_0_1);
2139                 }
2140             }
2141 
2142             ASSERT(abs(i2_temp1-pi2_q_dst[j]) <= 1);
2143 
2144             IQUANT(pi2_iq_dst[j],
2145                 pi2_q_dst[j], /*pi2_src[index*src_strd]*/
2146                 pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem],
2147                 shift_iq,
2148                 qp_div);
2149 
2150             pi4_quant_round_factor_0_1++;
2151             pi4_quant_round_factor_1_2++;
2152         }
2153 
2154         pi2_q_dst   += dst_q_strd;
2155         pi2_iq_dst  += dst_iq_strd;
2156         pi2_quant_coeff += trans_size;
2157         pi2_coeffs += src_strd;
2158         pi2_dequant_coeff += trans_size;
2159 
2160     }
2161 
2162     /* CSBF update */
2163     {
2164         WORD32 block_row, block_col;
2165         WORD32 row, col;
2166         WORD16 *pi2_block;
2167         UWORD32 temp_zero_col = 0;
2168         UWORD32 temp_zero_row = 0;
2169 
2170         pi2_q_dst = pi2_q_dst_orig;
2171 
2172         for(block_row = 0; block_row < trans_size; block_row += 4)
2173         {
2174             //block_col is incrementing by 1 for easy update of csbf pointer
2175             for(block_col = 0; block_col < trans_size / 4; block_col++)
2176             {
2177                 pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
2178                 *(csbf + block_col) = 0;
2179 
2180                 for(row = 0; row < 4; row++)
2181                 {
2182                     for(col = 0; col < 4; col++)
2183                     {
2184                         if(pi2_block[row * dst_q_strd + col] != 0)
2185                         {
2186                             *(csbf + block_col) = 1;
2187                             break;
2188                         }
2189                     }
2190                     if(*(csbf + block_col) == 1)
2191                     {
2192                         /* zero_col update *//* temp_zero_col = ~zero_col */
2193                         temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
2194                         // zero col can be optimized further. Now clearing the
2195                         // entire 4 bits corresponding to 4 colums of 4x4 block
2196                         // even if any 4x4 csbf is set
2197 
2198                         /* zero row update */ /* temp_zero_row = ~zero_row */
2199                         temp_zero_row = (temp_zero_row) | (0xFU << block_row);
2200                         // zero row can be optimized further. Now clearing the
2201                         // entire 4 bits corresponding to 4 rows of 4x4 block
2202                         // even if any 4x4 csbf is set
2203 
2204                         break;
2205                     }
2206                 }
2207 
2208                 cbf = cbf || (*(csbf + block_col)); // cbf update
2209             }
2210             csbf += csbf_strd;
2211         }
2212 
2213         *zero_col = ~temp_zero_col; //final zero_col storing
2214         *zero_row = ~temp_zero_row; //final zero_row storing
2215     }
2216     return cbf;
2217 }
2218