1 /******************************************************************************
2  *
3  * Copyright (C) 2015 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 
21 /**
22 ******************************************************************************
23 * @file
24 *  ih264e_core_coding.h
25 *
26 * @brief
27 *  This file contains extern declarations of core coding routines
28 *
29 * @author
30 *  ittiam
31 *
32 * @remarks
33 *  none
34 ******************************************************************************
35 */
36 
37 #ifndef IH264E_CORE_CODING_H_
38 #define IH264E_CORE_CODING_H_
39 
40 /*****************************************************************************/
41 /* Constant Macros                                                           */
42 /*****************************************************************************/
43 
/**
******************************************************************************
 *  @brief      Flag values that switch the Hadamard transform of the DC
 *              coefficients on or off
******************************************************************************
 */
#define ENABLE_DC_TRANSFORM     1   /* DC coefficients are Hadamard transformed */
#define DISABLE_DC_TRANSFORM    0   /* DC Hadamard transform is skipped         */
51 
/**
*******************************************************************************
 *  @brief block counts of a macroblock
*******************************************************************************
 */
#define NUM_LUMA4x4_BLOCKS_IN_MB        16
#define NUM_CHROMA4x4_BLOCKS_IN_MB      8
#define NUM_4X4_BLKS_LUMA_MB_ROW        4
#define DC_COEFF_CNT_LUMA_MB            16

/**
*******************************************************************************
 *  @brief dimensions of a 4x4 block, expressed through TRANS_SIZE_4 (which is
 *         not defined in this header)
*******************************************************************************
 */
#define SIZE_4X4_BLK_HRZ                TRANS_SIZE_4
#define SIZE_4X4_BLK_VERT               TRANS_SIZE_4

/**
*******************************************************************************
 *  @brief bit masks for DC and AC control flags
 *
 *  Luma   : AC flags occupy the upper 16 bits, DC flags the lower 16 bits
 *  Chroma : AC flags occupy the top byte (upper nibble U, lower nibble V),
 *           DC flags occupy bits 15..8 (upper nibble U, lower nibble V)
*******************************************************************************
 */
#define CNTRL_FLAG_AC_MASK_LUMA         0xFFFF0000
#define CNTRL_FLAG_DC_MASK_LUMA         0x0000FFFF

#define CNTRL_FLAG_AC_MASK_CHROMA_U     0xF0000000
#define CNTRL_FLAG_AC_MASK_CHROMA_V     0x0F000000

#define CNTRL_FLAG_DC_MASK_CHROMA_U     0x0000F000
#define CNTRL_FLAG_DC_MASK_CHROMA_V     0x00000F00

/* U and V masks combined */
#define CNTRL_FLAG_AC_MASK_CHROMA       ( CNTRL_FLAG_AC_MASK_CHROMA_U | CNTRL_FLAG_AC_MASK_CHROMA_V )
#define CNTRL_FLAG_DC_MASK_CHROMA       ( CNTRL_FLAG_DC_MASK_CHROMA_U | CNTRL_FLAG_DC_MASK_CHROMA_V )

/* bits 15 and 14 of the control word */
#define CNTRL_FLAG_DCBLK_MASK_CHROMA    0x0000C000
79 
80 /**
81 *******************************************************************************
82  *  @brief macros for transforms
83 *******************************************************************************
84  */
/**
*******************************************************************************
 *  @brief Takes the next block id out of a control word
 *
 *  blk_lin_id receives CLZ(u4_cntrl); u4_cntrl is then masked with
 *  (0x7FFFFFFF >> blk_lin_id), which clears the top (blk_lin_id + 1) bits.
 *  Assuming CLZ returns the number of leading zero bits of a 32 bit value
 *  (CLZ is defined outside this header - TODO confirm), this removes the most
 *  significant set bit and reports its distance from the MSB.
 *
 *  Both arguments must be modifiable lvalues and are evaluated more than
 *  once, so they must be free of side effects.
 *
 *  NOTE(review): if CLZ(0) yields 32 the shift count equals the operand width,
 *  which is undefined behaviour; callers presumably only invoke this while
 *  u4_cntrl is non zero - confirm.
 *
 *  The macro expands to a plain compound statement, like the other macros in
 *  this header; a trailing ';' at the call site is allowed but not required.
*******************************************************************************
 */
#define DEQUEUE_BLKID_FROM_CONTROL(u4_cntrl, blk_lin_id)                       \
{                                                                              \
    (blk_lin_id) = CLZ(u4_cntrl);                                              \
    (u4_cntrl) &= (0x7FFFFFFF >> (blk_lin_id));                                \
}
90 
/**
*******************************************************************************
 *  @brief Converts a linear 4x4 block index of a luma macroblock into x and y
 *         offsets
 *
 *  With four blocks per row, the column (u4_blk_id % 4) and the row
 *  (u4_blk_id / 4) are each multiplied by 4:
 *      i4_offset_x = (u4_blk_id % 4) * 4
 *      i4_offset_y = (u4_blk_id / 4) * 4
 *
 *  u4_blk_id may be any expression (it is parenthesised inside the macro) but
 *  is evaluated twice, so it must be free of side effects. i4_offset_x and
 *  i4_offset_y must be modifiable lvalues.
*******************************************************************************
 */
#define IND2SUB_LUMA_MB(u4_blk_id, i4_offset_x, i4_offset_y)                   \
{                                                                              \
    (i4_offset_x) = ((u4_blk_id) % 4) << 2;                                    \
    (i4_offset_y) = ((u4_blk_id) / 4) << 2;                                    \
}
96 
/**
*******************************************************************************
 *  @brief Converts a linear 4x4 block index (0..7) of a chroma macroblock
 *         into x and y offsets
 *
 *      i4_offset_x = 8 * (bit 0 of u4_blk_id) + (1 if u4_blk_id > 3, else 0)
 *      i4_offset_y = 4 * (bit 1 of u4_blk_id)
 *
 *  NOTE(review): the extra 1 added to x for ids above 3 and the step of 8
 *  between horizontally adjacent blocks presumably address the second plane
 *  of a U/V interleaved buffer - confirm against the callers.
 *
 *  u4_blk_id may be any expression (it is parenthesised inside the macro) but
 *  is evaluated three times, so it must be free of side effects. i4_offset_x
 *  and i4_offset_y must be modifiable lvalues.
*******************************************************************************
 */
#define IND2SUB_CHROMA_MB(u4_blk_id, i4_offset_x, i4_offset_y)                 \
{                                                                              \
    (i4_offset_x) = (((u4_blk_id) & 0x1) << 3) + ((u4_blk_id) > 3);            \
    (i4_offset_y) = ((u4_blk_id) & 0x2) << 1;                                  \
}
102 
103 
104 /*****************************************************************************/
105 /* Function Declarations                                                     */
106 /*****************************************************************************/
107 
108 /**
109 *******************************************************************************
110 *
111 * @brief
112 *  This function performs does the DCT transform then Hadamard transform
113 *  and quantization for a macroblock when the mb mode is intra 16x16 mode
114 *
115 * @par Description:
116 *  First  cf4 is done on all 16 4x4 blocks of the 16x16 input block.
117 *  Then hadamard transform is done on the DC coefficients
118 *  Quantization is then performed on the 16x16 block, 4x4 wise
119 *
120 * @param[in] pu1_src
121 *  Pointer to source sub-block
122 *
123 * @param[in] pu1_pred
124 *  Pointer to prediction sub-block
125 *
126 * @param[in] pi2_out
127 *  Pointer to residual sub-block
128 *  The output will be in linear format
129 *  The first 16 continuous locations will contain the values of Dc block
130 *  After DC block and a stride 1st AC block will follow
131 *  After one more stride next AC block will follow
132 *  The blocks will be in raster scan order
133 *
134 * @param[in] src_strd
135 *  Source stride
136 *
137 * @param[in] pred_strd
138 *  Prediction stride
139 *
140 * @param[in] dst_strd
141 *  Destination stride
142 *
143 * @param[in] pu2_scale_matrix
144 *  The quantization matrix for 4x4 transform
145 *
146 * @param[in] pu2_threshold_matrix
147 *  Threshold matrix
148 *
149 * @param[in] u4_qbits
150 *  15+QP/6
151 *
152 * @param[in] u4_round_factor
153 *  Round factor for quant
154 *
155 * @param[out] pu1_nnz
156 *  Memory to store the non-zeros after transform
157 *  The first byte will be the nnz of DC block
158 *  From the next byte the AC nnzs will be stored in raster scan order
159 *
160 * @param u4_dc_flag
161 *  Signals if Dc transform is to be done or not
162 *   1 -> Dc transform will be done
163 *   0 -> Dc transform will not be done
164 *
165 * @remarks
166 *
167 *******************************************************************************
168 */
169 void ih264e_luma_16x16_resi_trans_dctrans_quant(
170                 codec_t *ps_codec, UWORD8 *pu1_src, UWORD8 *pu1_pred,
171                 WORD16 *pi2_out, WORD32 src_strd, WORD32 pred_strd,
172                 WORD32 dst_strd, const UWORD16 *pu2_scale_matrix,
173                 const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits,
174                 UWORD32 u4_round_factor, UWORD8 *pu1_nnz, UWORD32 u4_dc_flag);
175 
176 /**
177 *******************************************************************************
178 *
179 * @brief
180 *  This function performs the intra 16x16 inverse transform process for H264
181 *  it includes inverse Dc transform, inverse quant and then inverse transform
182 *
183 * @par Description:
184 *
185 * @param[in] pi2_src
186 *  Input data, 16x16 size
187 *  First 16 mem locations will have the Dc coffs in rater scan order in linear fashion
188 *  after a stride 1st AC clock will be present again in raster can order
189 *  Then each AC block of the 16x16 block will follow in raster scan order
190 *
191 * @param[in] pu1_pred
192 *  The predicted data, 16x16 size
193 *  Block by block form
194 *
195 * @param[in] pu1_out
196 *  Output 16x16
197 *  In block by block form
198 *
199 * @param[in] src_strd
200 *  Source stride
201 *
202 * @param[in] pred_strd
203 *  input stride for prediction buffer
204 *
205 * @param[in] out_strd
206 *  input stride for output buffer
207 *
208 * @param[in] pu2_iscale_mat
209 *  Inverse quantization matrix for 4x4 transform
210 *
211 * @param[in] pu2_weigh_mat
212 *  weight matrix of 4x4 transform
213 *
214 * @param[in] qp_div
215 *  QP/6
216 *
217 * @param[in] pi4_tmp
218 *  Input temporary buffer
219 *  needs to be at least 20 in size
220 *
221 * @param[in] pu4_cntrl
222 *  Controls the transform path
223 *  total Last 17 bits are used
224 *  the 16th th bit will correspond to DC block
225 *  and 32-17 will correspond to the ac blocks in raster scan order
226 *  bit equaling zero indicates that the entire 4x4 block is zero for DC
227 *  For AC blocks a bit equaling zero will mean that all 15 AC coffs of the block is nonzero
228 *
229 * @param[in] pi4_tmp
230 *  Input temporary buffer
231 *  needs to be at least COFF_CNT_SUB_BLK_4x4+COFF_CNT_SUB_BLK_4x4 size
232 *
233 * @returns
234 *  none
235 *
236 * @remarks
237 *  The all zero case must be taken care outside
238 *
239 *******************************************************************************
240 */
241 void ih264e_luma_16x16_idctrans_iquant_itrans_recon(
242                 codec_t *ps_codec, WORD16 *pi2_src, UWORD8 *pu1_pred,
243                 UWORD8 *pu1_out, WORD32 src_strd, WORD32 pred_strd,
244                 WORD32 out_strd, const UWORD16 *pu2_iscale_mat,
245                 const UWORD16 *pu2_weigh_mat, UWORD32 qp_div, UWORD32 u4_cntrl,
246                 UWORD32 u4_dc_trans_flag, WORD32 *pi4_tmp);
247 
248 /**
249 *******************************************************************************
250 *
251 * @brief
252 *  This function performs does the DCT transform then Hadamard transform
253 *  and quantization for a chroma macroblock
254 *
255 * @par Description:
256 *  First  cf4 is done on all 16 4x4 blocks of the 8x8input block
257 *  Then hadamard transform is done on the DC coefficients
258 *  Quantization is then performed on the 8x8 block, 4x4 wise
259 *
260 * @param[in] pu1_src
261 *  Pointer to source sub-block
262 *  The input is in interleaved format for two chroma planes
263 *
264 * @param[in] pu1_pred
265 *  Pointer to prediction sub-block
266 *  Prediction is in inter leaved format
267 *
268 * @param[in] pi2_out
269 *  Pointer to residual sub-block
270 *  The output will be in linear format
271 *  The first 4 continuous locations will contain the values of DC block for U
272 *  and then next 4 will contain for V.
273 *  After DC block and a stride 1st AC block of U plane will follow
274 *  After one more stride next AC block of V plane will follow
275 *  The blocks will be in raster scan order
276 *
277 *  After all the AC blocks of U plane AC blocks of V plane will follow in exact
278 *  same way
279 *
280 * @param[in] src_strd
281 *  Source stride
282 *
283 * @param[in] pred_strd
284 *  Prediction stride
285 *
286 * @param[in] dst_strd
287 *  Destination stride
288 *
289 * @param[in] pu2_scale_matrix
290 *  The quantization matrix for 4x4 transform
291 *
292 * @param[in] pu2_threshold_matrix
293 *  Threshold matrix
294 *
295 * @param[in] u4_qbits
296 *  15+QP/6
297 *
298 * @param[in] u4_round_factor
299 *  Round factor for quant
300 *
301 * @param[out] pu1_nnz
302 *  Memory to store the non-zeros after transform
303 *  The first byte will be the nnz od DC block for U plane
304 *  From the next byte the AC nnzs will be storerd in raster scan order
305 *  The fifth byte will be nnz of Dc block of V plane
306 *  Then Ac blocks will follow
307 *
308 * @param u4_dc_flag
309 *  Signals if Dc transform is to be done or not
310 *   1 -> Dc transform will be done
311 *   0 -> Dc transform will not be done
312 *
313 * @remarks
314 *
315 *******************************************************************************
316 */
317 void ih264e_chroma_8x8_resi_trans_dctrans_quant(
318                 codec_t *ps_codec, UWORD8 *pu1_src, UWORD8 *pu1_pred,
319                 WORD16 *pi2_out, WORD32 src_strd, WORD32 pred_strd,
320                 WORD32 out_strd, const UWORD16 *pu2_scale_matrix,
321                 const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits,
322                 UWORD32 u4_round_factor, UWORD8 *pu1_nnz_c);
323 
324 /**
325 *******************************************************************************
326 * @brief
327 *  This function performs the inverse transform with process for chroma MB of H264
328 *
329 * @par Description:
330 *  Does inverse DC transform ,inverse quantization inverse transform
331 *
332 * @param[in] pi2_src
333 *  Input data, 16x16 size
334 *  The input is in the form of, first 4 locations will contain DC coeffs of
335 *  U plane, next 4 will contain DC coeffs of V plane, then AC blocks of U plane
336 *  in raster scan order will follow, each block as linear array in raster scan order.
337 *  After a stride next AC block will follow. After all AC blocks of U plane
338 *  V plane AC blocks will follow in exact same order.
339 *
340 * @param[in] pu1_pred
341 *  The predicted data, 8x16 size, U and V interleaved
342 *
343 * @param[in] pu1_out
344 *  Output 8x16, U and V interleaved
345 *
346 * @param[in] src_strd
347 *  Source stride
348 *
349 * @param[in] pred_strd
350 *  input stride for prediction buffer
351 *
352 * @param[in] out_strd
353 *  input stride for output buffer
354 *
355 * @param[in] pu2_iscale_mat
356 *  Inverse quantization martix for 4x4 transform
357 *
358 * @param[in] pu2_weigh_mat
359 *  weight matrix of 4x4 transform
360 *
361 * @param[in] qp_div
362 *  QP/6
363 *
364 * @param[in] pi4_tmp
365 *  Input temporary buffer
366 *  needs to be at least COFF_CNT_SUB_BLK_4x4 + Number of Dc cofss for chroma * number of planes
367 *  in size
368 *
369 * @param[in] pu4_cntrl
370 *  Controls the transform path
371 *  the 15 th bit will correspond to DC block of U plane , 14th will indicate the V plane Dc block
372 *  32-28 bits will indicate AC blocks of U plane in raster scan order
373 *  27-23 bits will indicate AC blocks of V plane in rater scan order
374 *  The bit 1 implies that there is at least one non zero coff in a block
375 *
376 * @returns
377 *  none
378 *
379 * @remarks
380 *******************************************************************************
381 */
382 void ih264e_chroma_8x8_idctrans_iquant_itrans_recon(
383                 codec_t *ps_codec, WORD16 *pi2_src, UWORD8 *pu1_pred,
384                 UWORD8 *pu1_out, WORD32 src_strd, WORD32 pred_strd,
385                 WORD32 out_strd, const UWORD16 *pu2_iscale_mat,
386                 const UWORD16 *pu2_weigh_mat, UWORD32 qp_div, UWORD32 u4_cntrl,
387                 WORD32 *pi4_tmp);
388 
389 /**
390 ******************************************************************************
391 *
392 * @brief  This function packs residue of an i16x16 luma mb for entropy coding
393 *
394 * @par   Description
395 *  An i16 macro block contains two classes of units, dc 4x4 block and
396 *  4x4 ac blocks. while packing the mb, the dc block is sent first, and
397 *  the 16 ac blocks are sent next in scan order. Each and every block is
398 *  represented by 3 parameters (nnz, significant coefficient map and the
399 *  residue coefficients itself). If a 4x4 unit does not have any coefficients
400 *  then only nnz is sent. Inside a 4x4 block the individual coefficients are
401 *  sent in scan order.
402 *
403 *  The first byte of each block will be nnz of the block, if it is non zero,
404 *  a 2 byte significance map is sent. This is followed by nonzero coefficients.
405 *  This is repeated for 1 dc + 16 ac blocks.
406 *
407 * @param[in]  pi2_res_mb
408 *  pointer to residue mb
409 *
410 * @param[in, out]  pv_mb_coeff_data
411 *  buffer pointing to packed residue coefficients
412 *
413 * @param[in]  u4_res_strd
414 *  residual block stride
415 *
416 * @param[out]  u1_cbp_l
417 *  coded block pattern luma
418 *
419 * @param[in]   pu1_nnz
420 *  number of non zero coefficients in each 4x4 unit
421 *
422 * @param[out]
423 *  Control signal for inverse transform of 16x16 blocks
424 *
425 * @return none
426 *
427 * @ remarks
428 *
429 ******************************************************************************
430 */
431 void ih264e_pack_l_mb_i16(WORD16 *pi2_res_mb, void **pv_mb_coeff_data,
432                           WORD32 i4_res_strd, UWORD8 *u1_cbp_l, UWORD8 *pu1_nnz,
433                           UWORD32 *pu4_cntrl);
434 
435 /**
436 ******************************************************************************
437 *
438 * @brief  This function packs residue of an i8x8 chroma mb for entropy coding
439 *
440 * @par   Description
441 *  An i8 chroma macro block contains two classes of units, dc 2x2 block and
442 *  4x4 ac blocks. while packing the mb, the dc block is sent first, and
443 *  the 4 ac blocks are sent next in scan order. Each and every block is
444 *  represented by 3 parameters (nnz, significant coefficient map and the
445 *  residue coefficients itself). If a 4x4 unit does not have any coefficients
446 *  then only nnz is sent. Inside a 4x4 block the individual coefficients are
447 *  sent in scan order.
448 *
449 *  The first byte of each block will be nnz of the block, if it is non zero,
450 *  a 2 byte significance map is sent. This is followed by nonzero coefficients.
451 *  This is repeated for 1 dc + 4 ac blocks.
452 *
453 * @param[in]  pi2_res_mb
454 *  pointer to residue mb
455 *
456 * @param[in, out]  pv_mb_coeff_data
457 *  buffer pointing to packed residue coefficients
458 *
459 * @param[in]  u4_res_strd
460 *  residual block stride
461 *
462 * @param[out]  u1_cbp_c
463 *  coded block pattern chroma
464 *
465 * @param[in]   pu1_nnz
466 *  number of non zero coefficients in each 4x4 unit
467 *
468 * @param[out]   pu1_nnz
469 *  Control signal for inverse transform
470 *
471 * @param[in]   u4_swap_uv
472 *  Swaps the order of U and V planes in entropy bitstream
473 *
474 * @return none
475 *
476 * @ remarks
477 *
478 ******************************************************************************
479 */
480 void ih264e_pack_c_mb(WORD16 *pi2_res_mb, void **pv_mb_coeff_data,
481                       WORD32 i4_res_strd, UWORD8 *u1_cbp_c, UWORD8 *pu1_nnz,
482                       UWORD32 u4_kill_coffs_flag, UWORD32 *pu4_cntrl,
483                       UWORD32 u4_swap_uv);
484 
485 /**
486 *******************************************************************************
487 *
488 * @brief performs luma core coding when intra mode is i16x16
489 *
490 * @par Description:
491 *  If the current mb is to be coded as intra of mb type i16x16, the mb is first
492 *  predicted using one of i16x16 prediction filters, basing on the intra mode
493 *  chosen. Then, error is computed between the input blk and the estimated blk.
494 *  This error is transformed (hierarchical transform i.e., dct followed by hada-
495 *  -mard), quantized. The quantized coefficients are packed in scan order for
496 *  entropy coding.
497 *
498 * @param[in] ps_proc_ctxt
499 *  pointer to the current macro block context
500 *
501 * @returns u1_cbp_l
502 *  coded block pattern luma
503 *
504 * @remarks none
505 *
506 *******************************************************************************
507 */
508 UWORD8 ih264e_code_luma_intra_macroblock_16x16
509         (
510             process_ctxt_t *ps_proc
511         );
512 
513 /**
514 *******************************************************************************
515 *
516 * @brief performs luma core coding when intra mode is i4x4
517 *
518 * @par Description:
519 *  If the current mb is to be coded as intra of mb type i4x4, the mb is first
520 *  predicted using one of i4x4 prediction filters, basing on the intra mode
521 *  chosen. Then, error is computed between the input blk and the estimated blk.
522 *  This error is dct transformed and quantized. The quantized coefficients are
523 *  packed in scan order for entropy coding.
524 *
525 * @param[in] ps_proc_ctxt
526 *  pointer to the current macro block context
527 *
528 * @returns u1_cbp_l
529 *  coded block pattern luma
530 *
531 * @remarks
532 *  The traversal of 4x4 subblocks in the 16x16 macroblock is as per the scan order
533 *  mentioned in h.264 specification
534 *
535 *******************************************************************************
536 */
537 UWORD8 ih264e_code_luma_intra_macroblock_4x4
538         (
539             process_ctxt_t *ps_proc
540         );
541 
542 /**
543 *******************************************************************************
544 *
545 * @brief performs luma core coding when intra mode is i4x4
546 *
547 * @par Description:
548 *  If the current mb is to be coded as intra of mb type i4x4, the mb is first
549 *  predicted using one of i4x4 prediction filters, basing on the intra mode
550 *  chosen. Then, error is computed between the input blk and the estimated blk.
551 *  This error is dct transformed and quantized. The quantized coefficients are
552 *  packed in scan order for entropy coding.
553 *
554 * @param[in] ps_proc_ctxt
555 *  pointer to the current macro block context
556 *
557 * @returns u1_cbp_l
558 *  coded block pattern luma
559 *
560 * @remarks
561 *  The traversal of 4x4 subblocks in the 16x16 macroblock is as per the scan order
562 *  mentioned in h.264 specification
563 *
564 *******************************************************************************
565 */
566 UWORD8 ih264e_code_luma_intra_macroblock_4x4_rdopt_on
567         (
568             process_ctxt_t *ps_proc
569         );
570 
571 /**
572 *******************************************************************************
573 *
574 * @brief performs chroma core coding for intra macro blocks
575 *
576 * @par Description:
577 *  If the current MB is to be intra coded with mb type chroma I8x8, the MB is
578 *  first predicted using intra 8x8 prediction filters. The predicted data is
579 *  compared with the input for error and the error is transformed. The DC
580 *  coefficients of each transformed sub blocks are further transformed using
581 *  Hadamard transform. The resulting coefficients are quantized, packed and sent
582 *  for entropy coding.
583 *
584 * @param[in] ps_proc_ctxt
585 *  pointer to the current macro block context
586 *
587 * @returns u1_cbp_c
588 *  coded block pattern chroma
589 *
590 * @remarks
591 *  The traversal of 4x4 subblocks in the 8x8 macroblock is as per the scan order
592 *  mentioned in h.264 specification
593 *
594 *******************************************************************************
595 */
596 UWORD8 ih264e_code_chroma_intra_macroblock_8x8
597         (
598             process_ctxt_t *ps_proc
599         );
600 
601 /**
602 *******************************************************************************
603 * @brief performs luma core coding when  mode is inter
604 *
605 * @par Description:
606 *  If the current mb is to be coded as inter predicted mb,based on the sub mb
607 *  partitions and corresponding motion vectors generated by ME, prediction is done.
608 *  Then, error is computed between the input blk and the estimated blk.
609 *  This error is transformed ( dct and with out hadamard), quantized. The
610 *  quantized coefficients are packed in scan order for entropy coding.
611 *
612 * @param[in] ps_proc_ctxt
613 *  pointer to the current macro block context
614 *
615 * @returns u1_cbp_l
616 *  coded block pattern luma
617 *
618 * @remarks none
619 *
620 *******************************************************************************
621 */
622 UWORD8 ih264e_code_luma_inter_macroblock_16x16
623         (
624             process_ctxt_t *ps_proc
625         );
626 
627 /**
628 *******************************************************************************
629 * @brief performs chroma core coding for inter macro blocks
630 *
631 * @par Description:
632 *  If the current mb is to be coded as inter predicted mb, based on the sub mb
633 *  partitions and corresponding motion vectors generated by ME, prediction is done.
634 *  Then, error is computed between the input blk and the estimated blk.
635 *  This error is transformed, quantized. The quantized coefficients
636 *  are packed in scan order for entropy coding.
637 *
638 * @param[in] ps_proc_ctxt
639 *  pointer to the current macro block context
640 *
641 * @returns u1_cbp_l
642 *  coded block pattern luma
643 *
644 * @remarks none
645 *
646 *******************************************************************************
647 */
648 UWORD8 ih264e_code_chroma_inter_macroblock_8x8
649         (
650             process_ctxt_t *ps_proc
651         );
652 
653 #endif /* IH264E_CORE_CODING_H_ */
654