1 /******************************************************************************
2  *
3  * Copyright (C) 2015 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
/**
 *******************************************************************************
 * @file
 *  ih264_resi_trans_quant.c
 *
 * @brief
 *  Contains function definitions for the single stage forward transform for
 *  H.264. The resi_trans_quant functions calculate the residue, apply the
 *  forward transform (cf) and then quantize the result; the hadamard_quant
 *  functions transform and quantize coefficients that are passed in.
 *
 * @author
 *  Ittiam
 *
 * @par List of Functions:
 *  - ih264_resi_trans_quant_4x4
 *  - ih264_resi_trans_quant_chroma_4x4
 *  - ih264_hadamard_quant_4x4
 *  - ih264_hadamard_quant_2x2_uv
 *  - ih264_resi_trans_quant_8x8
 *
 * @remarks
 *******************************************************************************
 */
42 
43 /*****************************************************************************/
44 /* File Includes                                                             */
45 /*****************************************************************************/
46 
47 /* System include files */
48 #include <stddef.h>
49 
50 /* User include files */
51 #include "ih264_typedefs.h"
52 #include "ih264_defs.h"
53 #include "ih264_size_defs.h"
54 #include "ih264_macros.h"
55 #include "ih264_trans_macros.h"
56 #include "ih264_trans_data.h"
57 #include "ih264_structs.h"
58 #include "ih264_trans_quant_itrans_iquant.h"
59 
60 /**
61  *******************************************************************************
62  *
63  * @brief
64  *   This function performs forward transform and quantization on a 4*4 block
65  *
66  * @par Description:
67  *   The function accepts source buffer and estimation buffer. From these, it
68  *   computes the residue. This is residue is then transformed and quantized.
69  *   The transform and quantization are in placed computed. They use the residue
70  *   buffer for this.
71  *
72  * @param[in] pu1_src
73  *   Pointer to source sub-block
74  *
75  * @param[in] pu1_pred
76  *   Pointer to prediction sub-block
77  *
78  * @param[in] pi2_out
79  *   Pointer to residual sub-block
80  *
81  * @param[in] src_strd
82  *   Source stride
83  *
84  * @param[in] pred_strd
85  *   Prediction stride
86  *
87  * @param[in] dst_strd
88  *   Destination stride
89  *
90  * @param[in] u4_qbits
91  *    QP_BITS_h264_4x4 + floor(QP/6)
92  *
93  * @param[in] pu2_threshold_matrix
94  *   Pointer to Forward Quant Threshold Matrix
95  *
96  * @param[in] pu2_scale_matrix
97  *   Pointer to Forward Quant Scale Matrix
98  *
99  * @param[in] u4_round_factor
100  *   Quantization Round factor
101  *
102  * @param[out] pu1_nnz
103  *   Total non-zero coefficients in the current sub-block
104  *
105  * @returns
106  *
107  * @remarks
108  *   None
109  *
110  *******************************************************************************
111  */
ih264_resi_trans_quant_4x4(UWORD8 * pu1_src,UWORD8 * pu1_pred,WORD16 * pi2_out,WORD32 src_strd,WORD32 pred_strd,const UWORD16 * pu2_scale_matrix,const UWORD16 * pu2_threshold_matrix,UWORD32 u4_qbits,UWORD32 u4_round_factor,UWORD8 * pu1_nnz,WORD16 * pi2_alt_dc_addr)112 void ih264_resi_trans_quant_4x4(UWORD8 *pu1_src,
113                                 UWORD8 *pu1_pred,
114                                 WORD16 *pi2_out,
115                                 WORD32 src_strd,
116                                 WORD32 pred_strd,
117                                 const UWORD16 *pu2_scale_matrix,
118                                 const UWORD16 *pu2_threshold_matrix,
119                                 UWORD32 u4_qbits,
120                                 UWORD32 u4_round_factor,
121                                 UWORD8 *pu1_nnz,
122                                 WORD16 *pi2_alt_dc_addr)
123 {
124     UWORD32 i;
125     WORD32  x0, x1, x2, x3, x4, x5, x6, x7;
126     WORD32  i4_value, i4_sign;
127     UWORD32 u4_abs_value;
128     WORD16  *pi2_out_tmp = pi2_out;
129     UWORD32 u4_nonzero_coeff = 0;
130 
131     for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
132     {
133         /* computing prediction error (residue) */
134         x4 = pu1_src[0] - pu1_pred[0];
135         x5 = pu1_src[1] - pu1_pred[1];
136         x6 = pu1_src[2] - pu1_pred[2];
137         x7 = pu1_src[3] - pu1_pred[3];
138 
139         /* Horizontal transform */
140         x0 = x4 + x7;
141         x1 = x5 + x6;
142         x2 = x5 - x6;
143         x3 = x4 - x7;
144 
145         pi2_out_tmp[0] = x0 + x1;
146         pi2_out_tmp[1] = (x3 <<1) + x2;
147         pi2_out_tmp[2] = x0 - x1;
148         pi2_out_tmp[3] = x3 - (x2<<1);
149 
150         /* pointing to next row; */
151         pu1_src += src_strd;
152         pu1_pred += pred_strd;
153         pi2_out_tmp += 4;
154 
155     }
156     pi2_out_tmp = pi2_out;
157     for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
158     {
159 
160         /* Vertical transform and quantization */
161         x4 = pi2_out_tmp[0];
162         x5 = pi2_out_tmp[4];
163         x6 = pi2_out_tmp[8];
164         x7 = pi2_out_tmp[12];
165 
166 
167         x0 = x4 + x7;
168         x1 = x5 + x6;
169         x2 = x5 - x6;
170         x3 = x4 - x7;
171 
172         /* quantization is done in place */
173 
174         i4_value = x0 + x1;
175 
176         if(i==0)
177         {
178           (*pi2_alt_dc_addr) = i4_value;
179         }
180 
181         FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0], pu2_scale_matrix[0], u4_round_factor, u4_qbits, u4_nonzero_coeff);
182         pi2_out_tmp[0] = i4_value;
183 
184 
185         i4_value = (x3 << 1) + x2;
186         FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[4], pu2_scale_matrix[4], u4_round_factor, u4_qbits, u4_nonzero_coeff);
187         pi2_out_tmp[4] = i4_value;
188 
189 
190         i4_value = x0 - x1;
191         FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[8], pu2_scale_matrix[8], u4_round_factor, u4_qbits, u4_nonzero_coeff);
192         pi2_out_tmp[8] = i4_value;
193 
194 
195         i4_value = x3 - (x2 << 1);
196         FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[12], pu2_scale_matrix[12], u4_round_factor, u4_qbits, u4_nonzero_coeff);
197         pi2_out_tmp[12] = i4_value;
198 
199         pi2_out_tmp ++;
200         pu2_scale_matrix++;
201         pu2_threshold_matrix++;
202     }
203 
204     /* Return total nonzero coefficients in the current sub block */
205     *pu1_nnz =  u4_nonzero_coeff;
206 }
207 /**
208  *******************************************************************************
209  *
210  * @brief
211  *   This function performs forward transform and quantization on a 4*4 chroma block
212  *   with interleaved values
213  *
214  * @par Description:
215  *   The function accepts source buffer and estimation buffer. From these, it
216  *   computes the residue. This is residue is then transformed and quantized.
217  *   The transform and quantization are in placed computed. They use the residue
218  *   buffer for this.
219  *
220  * @param[in] pu1_src
221  *   Pointer to source sub-block
222  *
223  * @param[in] pu1_pred
224  *   Pointer to prediction sub-block
225  *
226  * @param[in] pi2_out
227  *   Pointer to residual sub-block
228  *
229  * @param[in] src_strd
230  *   Source stride
231  *
232  * @param[in] pred_strd
233  *   Prediction stride
234  *
235  * @param[in] dst_strd
236  *   Destination stride
237  *
238  * @param[in] u4_qbits
239  *    QP_BITS_h264_4x4 + floor(QP/6)
240  *
241  * @param[in] pu2_threshold_matrix
242  *   Pointer to Forward Quant Threshold Matrix
243  *
244  * @param[in] pu2_scale_matrix
245  *   Pointer to Forward Quant Scale Matrix
246  *
247  * @param[in] u4_round_factor
248  *   Quantization Round factor
249  *
250  * @param[out] pu1_nnz
251  *   Total non-zero coefficients in the current sub-block
252  *
253  * @returns
254  *
255  * @remarks
256  *   None
257  *
258  *******************************************************************************
259  */
ih264_resi_trans_quant_chroma_4x4(UWORD8 * pu1_src,UWORD8 * pu1_pred,WORD16 * pi2_out,WORD32 src_strd,WORD32 pred_strd,const UWORD16 * pu2_scale_matrix,const UWORD16 * pu2_threshold_matrix,UWORD32 u4_qbits,UWORD32 u4_round_factor,UWORD8 * pu1_nnz,WORD16 * pu1_dc_alt_addr)260 void ih264_resi_trans_quant_chroma_4x4(UWORD8 *pu1_src,
261                                        UWORD8 *pu1_pred,
262                                        WORD16 *pi2_out,
263                                        WORD32 src_strd,
264                                        WORD32 pred_strd,
265                                        const UWORD16 *pu2_scale_matrix,
266                                        const UWORD16 *pu2_threshold_matrix,
267                                        UWORD32 u4_qbits,
268                                        UWORD32 u4_round_factor,
269                                        UWORD8 *pu1_nnz,
270                                        WORD16 *pu1_dc_alt_addr)
271 {
272     UWORD32 i;
273     WORD32  x0, x1, x2, x3, x4, x5, x6, x7;
274     WORD32  i4_value, i4_sign;
275     UWORD32 u4_abs_value;
276     WORD16  *pi2_out_tmp = pi2_out;
277     UWORD32 u4_nonzero_coeff = 0;
278 
279     for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
280     {
281         /* computing prediction error (residue) */
282         x4 = pu1_src[0] - pu1_pred[0];
283         x5 = pu1_src[2] - pu1_pred[2];
284         x6 = pu1_src[4] - pu1_pred[4];
285         x7 = pu1_src[6] - pu1_pred[6];
286 
287         /* Horizontal transform */
288         x0 = x4 + x7;
289         x1 = x5 + x6;
290         x2 = x5 - x6;
291         x3 = x4 - x7;
292 
293         pi2_out_tmp[0] = x0 + x1;
294         pi2_out_tmp[1] = (x3 <<1) + x2;
295         pi2_out_tmp[2] = x0 - x1;
296         pi2_out_tmp[3] = x3 - (x2<<1);
297 
298         /* pointing to next row; */
299         pu1_src += src_strd;
300         pu1_pred += pred_strd;
301         pi2_out_tmp += 4;
302 
303     }
304     pi2_out_tmp = pi2_out;
305     for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
306     {
307 
308         /* Vertical transform and quantization */
309         x4 = pi2_out_tmp[0];
310         x5 = pi2_out_tmp[4];
311         x6 = pi2_out_tmp[8];
312         x7 = pi2_out_tmp[12];
313 
314 
315         x0 = x4 + x7;
316         x1 = x5 + x6;
317         x2 = x5 - x6;
318         x3 = x4 - x7;
319 
320         /* quantization is done in place */
321 
322         i4_value = x0 + x1;
323 
324         if(i==0)
325         {
326           *pu1_dc_alt_addr = i4_value;
327         }
328 
329         FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
330                   pu2_scale_matrix[0], u4_round_factor, u4_qbits,
331                   u4_nonzero_coeff);
332         pi2_out_tmp[0] = i4_value;
333 
334         i4_value = (x3 << 1) + x2;
335         FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[4],
336                   pu2_scale_matrix[4], u4_round_factor, u4_qbits,
337                   u4_nonzero_coeff);
338         pi2_out_tmp[4] = i4_value;
339 
340         i4_value = x0 - x1;
341         FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[8],
342                   pu2_scale_matrix[8], u4_round_factor, u4_qbits,
343                   u4_nonzero_coeff);
344         pi2_out_tmp[8] = i4_value;
345 
346         i4_value = x3 - (x2 << 1);
347         FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[12],
348                   pu2_scale_matrix[12], u4_round_factor, u4_qbits,
349                   u4_nonzero_coeff);
350         pi2_out_tmp[12] = i4_value;
351 
352         pi2_out_tmp ++;
353         pu2_scale_matrix++;
354         pu2_threshold_matrix++;
355     }
356 
357     /* Return total nonzero coefficients in the current sub block */
358     *pu1_nnz =  u4_nonzero_coeff;
359 }
360 
361 /**
362  *******************************************************************************
363  *
364  * @brief
365  *   This function performs forward hadamard transform and quantization on a 4*4 block
366  *
367  * @par Description:
368  *   The function accepts source buffer and estimation buffer. From these, it
369  *   computes the residue. This is residue is then transformed and quantized.
370  *   The transform and quantization are in placed computed. They use the residue
371  *   buffer for this.
372  *
373  * @param[in] pu1_src
374  *   Pointer to source sub-block
375  *
376  * @param[in] pu1_pred
377  *   Pointer to prediction sub-block
378  *
379  * @param[in] pi2_out
380  *   Pointer to residual sub-block
381  *
382  * @param[in] src_strd
383  *   Source stride
384  *
385  * @param[in] pred_strd
386  *   Prediction stride
387  *
388  * @param[in] dst_strd
389  *   Destination stride
390  *
391  * @param[in] u4_qbits
392  *    QP_BITS_h264_4x4 + floor(QP/6)
393  *
394  * @param[in] pu2_threshold_matrix
395  *   Pointer to Forward Quant Threshold Matrix
396  *
397  * @param[in] pu2_scale_matrix
398  *   Pointer to Forward Quant Scale Matrix
399  *
400  * @param[in] u4_round_factor
401  *   Quantization Round factor
402  *
403  * @param[out] pu1_nnz
404  *   Total non-zero coefficients in the current sub-block
405  *
406  * @returns
407  *
408  * @remarks
409  *   None
410  *
411  */
412 
ih264_hadamard_quant_4x4(WORD16 * pi2_src,WORD16 * pi2_dst,const UWORD16 * pu2_scale_matrix,const UWORD16 * pu2_threshold_matrix,UWORD32 u4_qbits,UWORD32 u4_round_factor,UWORD8 * pu1_nnz)413 void ih264_hadamard_quant_4x4(WORD16 *pi2_src,
414                               WORD16 *pi2_dst,
415                               const UWORD16 *pu2_scale_matrix,
416                               const UWORD16 *pu2_threshold_matrix,
417                               UWORD32 u4_qbits,
418                               UWORD32 u4_round_factor,
419                               UWORD8 *pu1_nnz)
420 {
421   WORD32 i;
422   WORD32 x0,x1,x2,x3,x4,x5,x6,x7,i4_value;
423   UWORD32 u4_abs_value;
424   WORD32 i4_sign;
425 
426   *pu1_nnz = 0;
427 
428   for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
429     {
430         x4 = pi2_src[0];
431         x5 = pi2_src[1];
432         x6 = pi2_src[2];
433         x7 = pi2_src[3];
434 
435         x0 = x4 + x7;
436         x1 = x5 + x6;
437         x2 = x5 - x6;
438         x3 = x4 - x7;
439 
440         pi2_dst[0] = x0 + x1;
441         pi2_dst[1] = x3 + x2;
442         pi2_dst[2] = x0 - x1;
443         pi2_dst[3] = x3 - x2;
444 
445         pi2_src += 4;
446         pi2_dst += 4;
447     }
448 
449     /* Vertical transform and quantization */
450     pi2_dst -= SUB_BLK_WIDTH_4x4<<2;
451 
452     for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
453     {
454         x4 = pi2_dst[0];
455         x5 = pi2_dst[4];
456         x6 = pi2_dst[8];
457         x7 = pi2_dst[12] ;
458 
459         x0 = x4 + x7;
460         x1 = x5 + x6;
461         x2 = x5 - x6;
462         x3 = x4 - x7;
463 
464 
465         i4_value = (x0 + x1) >> 1;
466         FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
467                   pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
468         pi2_dst[0] = i4_value;
469 
470         i4_value = (x3 + x2) >> 1;
471         FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
472                   pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
473         pi2_dst[4] = i4_value;
474 
475         i4_value = (x0 - x1) >> 1;
476         FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
477                   pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
478         pi2_dst[8] = i4_value;
479 
480         i4_value = (x3 - x2) >> 1;
481         FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
482                   pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
483         pi2_dst[12] = i4_value;
484 
485         pi2_dst ++;
486     }
487 }
488 
489 /**
490  *******************************************************************************
491  *
492  * @brief
493  *   This function performs forward hadamard transform and quantization on a 2*2 block
494  *   for both U and V planes
495  *
496  * @par Description:
497  *   The function accepts source buffer and estimation buffer. From these, it
498  *   computes the residue. This is residue is then transformed and quantized.
499  *   The transform and quantization are in placed computed. They use the residue
500  *   buffer for this.
501  *
502  * @param[in] pu1_src
503  *   Pointer to source sub-block
504  *
505  * @param[in] pu1_pred
506  *   Pointer to prediction sub-block
507  *
508  * @param[in] pi2_out
509  *   Pointer to residual sub-block
510  *
511  * @param[in] src_strd
512  *   Source stride
513  *
514  * @param[in] pred_strd
515  *   Prediction stride
516  *
517  * @param[in] dst_strd
518  *   Destination stride
519  *
520  * @param[in] u4_qbits
521  *    QP_BITS_h264_4x4 + floor(QP/6)
522  *
523  * @param[in] pu2_threshold_matrix
524  *   Pointer to Forward Quant Threshold Matrix
525  *
526  * @param[in] pu2_scale_matrix
527  *   Pointer to Forward Quant Scale Matrix
528  *
529  * @param[in] u4_round_factor
530  *   Quantization Round factor
531  *
532  * @param[out] pu1_nnz
533  *   Total non-zero coefficients in the current sub-block
534  *
535  * @returns
536  *
537  * @remarks
538  *   NNZ for dc is populated at 0 and 5th position of pu1_nnz
539  *
540  */
541 
ih264_hadamard_quant_2x2_uv(WORD16 * pi2_src,WORD16 * pi2_dst,const UWORD16 * pu2_scale_matrix,const UWORD16 * pu2_threshold_matrix,UWORD32 u4_qbits,UWORD32 u4_round_factor,UWORD8 * pu1_nnz)542 void ih264_hadamard_quant_2x2_uv(WORD16 *pi2_src,
543                                  WORD16 *pi2_dst,
544                                  const UWORD16 *pu2_scale_matrix,
545                                  const UWORD16 *pu2_threshold_matrix,
546                                  UWORD32 u4_qbits,
547                                  UWORD32 u4_round_factor,
548                                  UWORD8 *pu1_nnz)
549 {
550     WORD32 x0, x1, x2, x3, x4, x5, x6, x7;
551     WORD32 i4_value, i4_sign, plane;
552     UWORD32 u4_abs_value;
553 
554     for(plane = 0; plane < 2; plane++)
555     {
556         pu1_nnz[plane] = 0;
557 
558         /* Horizontal transform */
559         x4 = pi2_src[0];
560         x5 = pi2_src[1];
561         x6 = pi2_src[2];
562         x7 = pi2_src[3];
563 
564         x0 = x4 + x5;
565         x1 = x4 - x5;
566         x2 = x6 + x7;
567         x3 = x6 - x7;
568 
569         /* Vertical transform and quantization */
570         i4_value = (x0 + x2);
571         FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
572                   pu2_scale_matrix[0], u4_round_factor, u4_qbits,
573                   pu1_nnz[plane]);
574         pi2_dst[0] = i4_value;
575 
576         i4_value = (x0 - x2);
577         FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
578                   pu2_scale_matrix[0], u4_round_factor, u4_qbits,
579                   pu1_nnz[plane]);
580         pi2_dst[2] = i4_value;
581 
582         i4_value = (x1 - x3);
583         FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
584                   pu2_scale_matrix[0], u4_round_factor, u4_qbits,
585                   pu1_nnz[plane]);
586         pi2_dst[3] = i4_value;
587 
588         i4_value = (x1 + x3);
589         FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
590                   pu2_scale_matrix[0], u4_round_factor, u4_qbits,
591                   pu1_nnz[plane]);
592         pi2_dst[1] = i4_value;
593 
594         pi2_dst += 4;
595         pi2_src += 4;
596 
597     }
598 }
599 
600 /*
601  *******************************************************************************
602  *
603  * @brief
604  *  This function performs Single stage forward transform CF8 and quantization on 8*8 blocks
605  *  for h.264
606  *
607  * @par Description:
608  *  Performs single stage 8x8 forward transform CF8 after calculating the residue
609  *  The result is then quantized
610  *
611  * @param[in] pu1_src
612  *  Input 8x8 pixels
613  *
614  * @param[in] pu1_pred
615  *  Input 8x8 pixels
616  *
617  * @param[in] pi1_out
618  * Output 8x8 pixels
619  *
620  * @param[in] u4_thresh
621  *  Threshold under which the coeffs are not quantized
622  *
623  *  @param[in] u4_qp_div
624  *  QP/6
625  *
626  *  @param[in] u4_qp_rem
627  *  QP%6
628  *
629  * @param[in] u2_src_stride
630  *  Source stride
631  *
632  * @param[in] pred_strd
633  * stride for prediciton buffer
634  *
635  *  @param[in] dst_strd
636  *  stride for destination buffer
637  *
638  *  @param[in] pu4_quant_mat
639  *  Pointer to the 4x4 quantization matrix
640  *
641  * @returns  Void
642  *
643  *
644  *******************************************************************************
645  */
ih264_resi_trans_quant_8x8(UWORD8 * pu1_src,UWORD8 * pu1_pred,WORD16 * pi2_out,WORD32 src_strd,WORD32 pred_strd,const UWORD16 * pu2_scale_matrix,const UWORD16 * pu2_threshold_matrix,UWORD32 u4_qbits,UWORD32 u4_round_factor,UWORD8 * pu1_nnz,WORD16 * pu1_dc_alt_addr)646 void ih264_resi_trans_quant_8x8(UWORD8 *pu1_src,
647                                 UWORD8 *pu1_pred,
648                                 WORD16 *pi2_out,
649                                 WORD32 src_strd,
650                                 WORD32 pred_strd,
651                                 const UWORD16 *pu2_scale_matrix,
652                                 const UWORD16 *pu2_threshold_matrix,
653                                 UWORD32 u4_qbits,
654                                 UWORD32 u4_round_factor,
655                                 UWORD8 *pu1_nnz,
656                                 WORD16 *pu1_dc_alt_addr)
657 
658 {
659     WORD16 *pi2_out_tmp = pi2_out;
660     UWORD32 i;
661     WORD32 a0, a1, a2, a3, a4, a5, a6, a7;
662     WORD32 r0, r1, r2, r3, r4, r5, r6, r7;
663     WORD32 i4_sign;
664     UWORD32 u4_abs_value;
665     UWORD32 u4_nonzero_coeff = 0;
666 
667     UNUSED(pu1_dc_alt_addr);
668 
669     /*Horizontal transform */
670     /* we are going to use the a's and r's in a twisted way since */
671     /*i dont want to declare more variables */
672     for(i = 0; i < SUB_BLK_WIDTH_8x8; ++i)
673     {
674         r0 = pu1_src[0];
675         r0 -= pu1_pred[0];
676         r1 = pu1_src[1];
677         r1 -= pu1_pred[1];
678         r2 = pu1_src[2];r2 -= pu1_pred[2];
679         r3 = pu1_src[3];r3 -= pu1_pred[3];
680         r4 = pu1_src[4];r4 -= pu1_pred[4];
681         r5 = pu1_src[5];r5 -= pu1_pred[5];
682         r6 = pu1_src[6];r6 -= pu1_pred[6];
683         r7 = pu1_src[7];r7 -= pu1_pred[7];
684 
685 
686         a0 = r0 + r7;
687         a1 = r1 + r6;
688         a2 = r2 + r5;
689         a3 = r3 + r4;
690 
691         a4 = a0 + a3;
692         a5 = a1 + a2;
693         a6 = a0 - a3;
694         a7 = a1 - a2;
695 
696         pi2_out_tmp[0] = a4 + a5;
697 
698         pi2_out_tmp[2] = a6 + (a7>>1);
699         pi2_out_tmp[4] = a4 - a5;
700         pi2_out_tmp[6] = (a6>>1) - a7;
701 
702         a0 = r0 - r7;
703         a1 = r1 - r6;
704         a2 = r2 - r5;
705         a3 = r3 - r4;
706 
707         a4 = a1 + a2 + ((a0>>1) + a0);
708         a5 = a0 - a3 - ((a2>>1) + a2);
709         a6 = a0 + a3 - ((a1>>1) + a1);
710         a7 = a1 - a2 + ((a3>>1) + a3);
711 
712         pi2_out_tmp[1] = a4 + (a7>>2);
713         pi2_out_tmp[3] = a5 + (a6>>2);
714         pi2_out_tmp[5] = a6 - (a5>>2);
715         pi2_out_tmp[7] = (a4>>2) - a7;
716 
717         pu1_src += src_strd;
718         pu1_pred += pred_strd;
719         pi2_out_tmp += 8;
720     }
721 
722     /*vertical transform and quant */
723 
724     pi2_out_tmp = pi2_out;
725 
726     for (i = 0; i < SUB_BLK_WIDTH_8x8; ++i)
727     {
728 
729         r0 = pi2_out_tmp[0];
730         r1 = pi2_out_tmp[8];
731         r2 = pi2_out_tmp[16];
732         r3 = pi2_out_tmp[24];
733         r4 = pi2_out_tmp[32];
734         r5 = pi2_out_tmp[40];
735         r6 = pi2_out_tmp[48];
736         r7 = pi2_out_tmp[56];
737 
738         a0 = r0 + r7;
739         a1 = r1 + r6;
740         a2 = r2 + r5;
741         a3 = r3 + r4;
742 
743         a4 = a0 + a3;
744         a5 = a1 + a2;
745         a6 = a0 - a3;
746         a7 = a1 - a2;
747 
748         a0 = r0 - r7;
749         a1 = r1 - r6;
750         a2 = r2 - r5;
751         a3 = r3 - r4;
752 
753         r0 = a4 + a5;
754         r2 = a6 + (a7>>1);
755         r4 = a4 - a5;
756         r6 = (a6>>1) - a7;
757 
758         a4 = a1 + a2 + ((a0>>1) + a0);
759         a5 = a0 - a3 - ((a2>>1) + a2);
760         a6 = a0 + a3 - ((a1>>1) + a1);
761         a7 = a1 - a2 + ((a3>>1) + a3);
762 
763         r1 = a4 + (a7>>2);
764         r3 = a5 + (a6>>2);
765         r5 = a6 - (a5>>2);
766         r7 = (a4>>2) - a7;
767 
768         FWD_QUANT(r0, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
769                   pu2_scale_matrix[0], u4_round_factor, u4_qbits,
770                   u4_nonzero_coeff);
771         pi2_out_tmp[0] = r0;
772 
773         FWD_QUANT(r1, u4_abs_value, i4_sign, pu2_threshold_matrix[8],
774                   pu2_scale_matrix[8], u4_round_factor, u4_qbits,
775                   u4_nonzero_coeff);
776         pi2_out_tmp[8] = r1;
777 
778         FWD_QUANT(r2, u4_abs_value, i4_sign, pu2_threshold_matrix[16],
779                   pu2_scale_matrix[16], u4_round_factor, u4_qbits,
780                   u4_nonzero_coeff);
781         pi2_out_tmp[16] = r2;
782 
783         FWD_QUANT(r3, u4_abs_value, i4_sign, pu2_threshold_matrix[24],
784                   pu2_scale_matrix[24], u4_round_factor, u4_qbits,
785                   u4_nonzero_coeff);
786         pi2_out_tmp[24] = r3;
787 
788         FWD_QUANT(r4, u4_abs_value, i4_sign, pu2_threshold_matrix[32],
789                   pu2_scale_matrix[32], u4_round_factor, u4_qbits,
790                   u4_nonzero_coeff);
791         pi2_out_tmp[32] = r4;
792 
793         FWD_QUANT(r5, u4_abs_value, i4_sign, pu2_threshold_matrix[40],
794                   pu2_scale_matrix[40], u4_round_factor, u4_qbits,
795                   u4_nonzero_coeff);
796         pi2_out_tmp[40] = r5;
797 
798         FWD_QUANT(r6, u4_abs_value, i4_sign, pu2_threshold_matrix[48],
799                   pu2_scale_matrix[48], u4_round_factor, u4_qbits,
800                   u4_nonzero_coeff);
801         pi2_out_tmp[48] = r6;
802 
803         FWD_QUANT(r7, u4_abs_value, i4_sign, pu2_threshold_matrix[56],
804                   pu2_scale_matrix[56], u4_round_factor, u4_qbits,
805                   u4_nonzero_coeff);
806         pi2_out_tmp[56] = r7;
807 
808         pi2_out_tmp++;
809         pu2_scale_matrix++;
810         pu2_threshold_matrix++;
811     }
812        /* Return total nonzero coefficients in the current sub block */
813         *pu1_nnz =  u4_nonzero_coeff;
814 }
815