1 /******************************************************************************
2 *
3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 ******************************************************************************/
18 /**
19  *******************************************************************************
20  * @file
21  *  ihevc_chroma_itrans_recon_16x16.c
22  *
23  * @brief
24  *  Contains function definitions for 16x16 inverse transform  and reconstruction
25  * of chroma interleaved data.
26  *
27  * @author
28  *  100470
29  *
30  * @par List of Functions:
31  *  - ihevc_chroma_itrans_recon_16x16()
32  *
33  * @remarks
34  *  None
35  *
36  *******************************************************************************
37  */
38 
39 #include <stdio.h>
40 #include <string.h>
41 #include "ihevc_typedefs.h"
42 #include "ihevc_macros.h"
43 #include "ihevc_platform_macros.h"
44 #include "ihevc_defs.h"
45 #include "ihevc_trans_tables.h"
46 #include "ihevc_chroma_itrans_recon.h"
47 #include "ihevc_func_selector.h"
48 #include "ihevc_trans_macros.h"
49 
50 /* All the functions work on one component (U or V) of interleaved data, depending upon the pointer passed to them */
51 /* Data visualization */
52 /* U V U V U V U V */
53 /* U V U V U V U V */
54 /* U V U V U V U V */
55 /* U V U V U V U V */
56 /* If the pointer points to first byte of above stream (U) , functions will operate on U component */
57 /* If the pointer points to second byte of above stream (V) , functions will operate on V component */
58 
59 
60 /**
61  *******************************************************************************
62  *
63  * @brief
64  *  This function performs Inverse transform  and reconstruction for 16x16
65  * input block
66  *
67  * @par Description:
68  *  Performs inverse transform and adds the prediction  data and clips output
69  * to 8 bit
70  *
71  * @param[in] pi2_src
72  *  Input 16x16 coefficients
73  *
74  * @param[in] pi2_tmp
75  *  Temporary 16x16 buffer for storing inverse transform
76  *  1st stage output
77  *
78  * @param[in] pu1_pred
79  *  Prediction 16x16 block
80  *
81  * @param[out] pu1_dst
82  *  Output 16x16 block
83  *
84  * @param[in] src_strd
85  *  Input stride
86  *
87  * @param[in] pred_strd
88  *  Prediction stride
89  *
90  * @param[in] dst_strd
91  *  Output Stride
92  *
93  * @param[in] zero_cols
94  *  Zero columns in pi2_src
95  *
96  * @param[in] zero_rows
97  *  Zero rows in pi2_src
98  *
99  * @returns  Void
100  *
101  * @remarks
102  *  None
103  *
104  *******************************************************************************
105  */
106 
107 
ihevc_chroma_itrans_recon_16x16(WORD16 * pi2_src,WORD16 * pi2_tmp,UWORD8 * pu1_pred,UWORD8 * pu1_dst,WORD32 src_strd,WORD32 pred_strd,WORD32 dst_strd,WORD32 zero_cols,WORD32 zero_rows)108 void ihevc_chroma_itrans_recon_16x16(WORD16 *pi2_src,
109                                      WORD16 *pi2_tmp,
110                                      UWORD8 *pu1_pred,
111                                      UWORD8 *pu1_dst,
112                                      WORD32 src_strd,
113                                      WORD32 pred_strd,
114                                      WORD32 dst_strd,
115                                      WORD32 zero_cols,
116                                      WORD32 zero_rows)
117 {
118     WORD32 j, k;
119     WORD32 e[8], o[8];
120     WORD32 ee[4], eo[4];
121     WORD32 eee[2], eeo[2];
122     WORD32 add;
123     WORD32 shift;
124     WORD16 *pi2_tmp_orig;
125     WORD32 trans_size;
126     WORD32 row_limit_2nd_stage, zero_rows_2nd_stage = zero_cols;
127 
128     trans_size = TRANS_SIZE_16;
129     pi2_tmp_orig = pi2_tmp;
130 
131     if((zero_cols & 0xFFF0) == 0xFFF0)
132         row_limit_2nd_stage = 4;
133     else if((zero_cols & 0xFF00) == 0xFF00)
134         row_limit_2nd_stage = 8;
135     else
136         row_limit_2nd_stage = TRANS_SIZE_16;
137 
138     if((zero_rows & 0xFFF0) == 0xFFF0) /* First 4 rows of input are non-zero */
139     {
140         /************************************************************************************************/
141         /**********************************START - IT_RECON_16x16****************************************/
142         /************************************************************************************************/
143 
144         /* Inverse Transform 1st stage */
145         shift = IT_SHIFT_STAGE_1;
146         add = 1 << (shift - 1);
147 
148         for(j = 0; j < row_limit_2nd_stage; j++)
149         {
150             /* Checking for Zero Cols */
151             if((zero_cols & 1) == 1)
152             {
153                 memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
154             }
155             else
156             {
157                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
158                 for(k = 0; k < 8; k++)
159                 {
160                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_src[src_strd]
161                                     + g_ai2_ihevc_trans_16[3][k]
162                                                     * pi2_src[3 * src_strd];
163                 }
164                 for(k = 0; k < 4; k++)
165                 {
166                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_src[2 * src_strd];
167                 }
168                 eeo[0] = 0;
169                 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_src[0];
170                 eeo[1] = 0;
171                 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_src[0];
172 
173                 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
174                 for(k = 0; k < 2; k++)
175                 {
176                     ee[k] = eee[k] + eeo[k];
177                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
178                 }
179                 for(k = 0; k < 4; k++)
180                 {
181                     e[k] = ee[k] + eo[k];
182                     e[k + 4] = ee[3 - k] - eo[3 - k];
183                 }
184                 for(k = 0; k < 8; k++)
185                 {
186                     pi2_tmp[k] =
187                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
188                     pi2_tmp[k + 8] =
189                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
190                 }
191             }
192             pi2_src++;
193             pi2_tmp += trans_size;
194             zero_cols = zero_cols >> 1;
195         }
196 
197         pi2_tmp = pi2_tmp_orig;
198 
199         /* Inverse Transform 2nd stage */
200         shift = IT_SHIFT_STAGE_2;
201         add = 1 << (shift - 1);
202         if((zero_rows_2nd_stage & 0xFFF0) == 0xFFF0) /* First 4 rows of output of 1st stage are non-zero */
203         {
204             for(j = 0; j < trans_size; j++)
205             {
206                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
207                 for(k = 0; k < 8; k++)
208                 {
209                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
210                                     + g_ai2_ihevc_trans_16[3][k]
211                                                     * pi2_tmp[3 * trans_size];
212                 }
213                 for(k = 0; k < 4; k++)
214                 {
215                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size];
216                 }
217                 eeo[0] = 0;
218                 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
219                 eeo[1] = 0;
220                 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
221 
222                 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
223                 for(k = 0; k < 2; k++)
224                 {
225                     ee[k] = eee[k] + eeo[k];
226                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
227                 }
228                 for(k = 0; k < 4; k++)
229                 {
230                     e[k] = ee[k] + eo[k];
231                     e[k + 4] = ee[3 - k] - eo[3 - k];
232                 }
233                 for(k = 0; k < 8; k++)
234                 {
235                     WORD32 itrans_out;
236                     itrans_out =
237                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
238                     pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
239                     itrans_out =
240                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
241                     pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
242                 }
243                 pi2_tmp++;
244                 pu1_pred += pred_strd;
245                 pu1_dst += dst_strd;
246             }
247         }
248         else if((zero_rows_2nd_stage & 0xFF00) == 0xFF00) /* First 8 rows of output of 1st stage are non-zero */
249         {
250             for(j = 0; j < trans_size; j++)
251             {
252                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
253                 for(k = 0; k < 8; k++)
254                 {
255                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
256                                     + g_ai2_ihevc_trans_16[3][k]
257                                                     * pi2_tmp[3 * trans_size]
258                                     + g_ai2_ihevc_trans_16[5][k]
259                                                     * pi2_tmp[5 * trans_size]
260                                     + g_ai2_ihevc_trans_16[7][k]
261                                                     * pi2_tmp[7 * trans_size];
262                 }
263                 for(k = 0; k < 4; k++)
264                 {
265                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
266                                     + g_ai2_ihevc_trans_16[6][k]
267                                                     * pi2_tmp[6 * trans_size];
268                 }
269                 eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size];
270                 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
271                 eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size];
272                 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
273 
274                 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
275                 for(k = 0; k < 2; k++)
276                 {
277                     ee[k] = eee[k] + eeo[k];
278                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
279                 }
280                 for(k = 0; k < 4; k++)
281                 {
282                     e[k] = ee[k] + eo[k];
283                     e[k + 4] = ee[3 - k] - eo[3 - k];
284                 }
285                 for(k = 0; k < 8; k++)
286                 {
287                     WORD32 itrans_out;
288                     itrans_out =
289                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
290                     pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
291                     itrans_out =
292                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
293                     pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
294                 }
295                 pi2_tmp++;
296                 pu1_pred += pred_strd;
297                 pu1_dst += dst_strd;
298             }
299         }
300         else /* All rows of output of 1st stage are non-zero */
301         {
302             for(j = 0; j < trans_size; j++)
303             {
304                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
305                 for(k = 0; k < 8; k++)
306                 {
307                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
308                                     + g_ai2_ihevc_trans_16[3][k]
309                                                     * pi2_tmp[3 * trans_size]
310                                     + g_ai2_ihevc_trans_16[5][k]
311                                                     * pi2_tmp[5 * trans_size]
312                                     + g_ai2_ihevc_trans_16[7][k]
313                                                     * pi2_tmp[7 * trans_size]
314                                     + g_ai2_ihevc_trans_16[9][k]
315                                                     * pi2_tmp[9 * trans_size]
316                                     + g_ai2_ihevc_trans_16[11][k]
317                                                     * pi2_tmp[11 * trans_size]
318                                     + g_ai2_ihevc_trans_16[13][k]
319                                                     * pi2_tmp[13 * trans_size]
320                                     + g_ai2_ihevc_trans_16[15][k]
321                                                     * pi2_tmp[15 * trans_size];
322                 }
323                 for(k = 0; k < 4; k++)
324                 {
325                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
326                                     + g_ai2_ihevc_trans_16[6][k]
327                                                     * pi2_tmp[6 * trans_size]
328                                     + g_ai2_ihevc_trans_16[10][k]
329                                                     * pi2_tmp[10 * trans_size]
330                                     + g_ai2_ihevc_trans_16[14][k]
331                                                     * pi2_tmp[14 * trans_size];
332                 }
333                 eeo[0] =
334                                 g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size]
335                                                 + g_ai2_ihevc_trans_16[12][0]
336                                                                 * pi2_tmp[12
337                                                                                 * trans_size];
338                 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0]
339                                 + g_ai2_ihevc_trans_16[8][0] * pi2_tmp[8 * trans_size];
340                 eeo[1] =
341                                 g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size]
342                                                 + g_ai2_ihevc_trans_16[12][1]
343                                                                 * pi2_tmp[12
344                                                                                 * trans_size];
345                 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0]
346                                 + g_ai2_ihevc_trans_16[8][1] * pi2_tmp[8 * trans_size];
347 
348                 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
349                 for(k = 0; k < 2; k++)
350                 {
351                     ee[k] = eee[k] + eeo[k];
352                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
353                 }
354                 for(k = 0; k < 4; k++)
355                 {
356                     e[k] = ee[k] + eo[k];
357                     e[k + 4] = ee[3 - k] - eo[3 - k];
358                 }
359                 for(k = 0; k < 8; k++)
360                 {
361                     WORD32 itrans_out;
362                     itrans_out =
363                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
364                     pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
365                     itrans_out =
366                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
367                     pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
368                 }
369                 pi2_tmp++;
370                 pu1_pred += pred_strd;
371                 pu1_dst += dst_strd;
372             }
373         }
374         /************************************************************************************************/
375         /************************************END - IT_RECON_16x16****************************************/
376         /************************************************************************************************/
377     }
378     else if((zero_rows & 0xFF00) == 0xFF00) /* First 8 rows of input are non-zero */
379     {
380         /************************************************************************************************/
381         /**********************************START - IT_RECON_16x16****************************************/
382         /************************************************************************************************/
383 
384         /* Inverse Transform 1st stage */
385         shift = IT_SHIFT_STAGE_1;
386         add = 1 << (shift - 1);
387 
388         for(j = 0; j < row_limit_2nd_stage; j++)
389         {
390             /* Checking for Zero Cols */
391             if((zero_cols & 1) == 1)
392             {
393                 memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
394             }
395             else
396             {
397                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
398                 for(k = 0; k < 8; k++)
399                 {
400                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_src[src_strd]
401                                     + g_ai2_ihevc_trans_16[3][k]
402                                                     * pi2_src[3 * src_strd]
403                                     + g_ai2_ihevc_trans_16[5][k]
404                                                     * pi2_src[5 * src_strd]
405                                     + g_ai2_ihevc_trans_16[7][k]
406                                                     * pi2_src[7 * src_strd];
407                 }
408                 for(k = 0; k < 4; k++)
409                 {
410                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_src[2 * src_strd]
411                                     + g_ai2_ihevc_trans_16[6][k]
412                                                     * pi2_src[6 * src_strd];
413                 }
414                 eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_src[4 * src_strd];
415                 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_src[0];
416                 eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_src[4 * src_strd];
417                 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_src[0];
418 
419                 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
420                 for(k = 0; k < 2; k++)
421                 {
422                     ee[k] = eee[k] + eeo[k];
423                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
424                 }
425                 for(k = 0; k < 4; k++)
426                 {
427                     e[k] = ee[k] + eo[k];
428                     e[k + 4] = ee[3 - k] - eo[3 - k];
429                 }
430                 for(k = 0; k < 8; k++)
431                 {
432                     pi2_tmp[k] =
433                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
434                     pi2_tmp[k + 8] =
435                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
436                 }
437             }
438             pi2_src++;
439             pi2_tmp += trans_size;
440             zero_cols = zero_cols >> 1;
441         }
442 
443         pi2_tmp = pi2_tmp_orig;
444 
445         /* Inverse Transform 2nd stage */
446         shift = IT_SHIFT_STAGE_2;
447         add = 1 << (shift - 1);
448         if((zero_rows_2nd_stage & 0xFFF0) == 0xFFF0) /* First 4 rows of output of 1st stage are non-zero */
449         {
450             for(j = 0; j < trans_size; j++)
451             {
452                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
453                 for(k = 0; k < 8; k++)
454                 {
455                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
456                                     + g_ai2_ihevc_trans_16[3][k]
457                                                     * pi2_tmp[3 * trans_size];
458                 }
459                 for(k = 0; k < 4; k++)
460                 {
461                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size];
462                 }
463                 eeo[0] = 0;
464                 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
465                 eeo[1] = 0;
466                 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
467 
468                 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
469                 for(k = 0; k < 2; k++)
470                 {
471                     ee[k] = eee[k] + eeo[k];
472                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
473                 }
474                 for(k = 0; k < 4; k++)
475                 {
476                     e[k] = ee[k] + eo[k];
477                     e[k + 4] = ee[3 - k] - eo[3 - k];
478                 }
479                 for(k = 0; k < 8; k++)
480                 {
481                     WORD32 itrans_out;
482                     itrans_out =
483                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
484                     pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
485                     itrans_out =
486                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
487                     pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
488                 }
489                 pi2_tmp++;
490                 pu1_pred += pred_strd;
491                 pu1_dst += dst_strd;
492             }
493         }
494         else if((zero_rows_2nd_stage & 0xFF00) == 0xFF00) /* First 8 rows of output of 1st stage are non-zero */
495         {
496             for(j = 0; j < trans_size; j++)
497             {
498                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
499                 for(k = 0; k < 8; k++)
500                 {
501                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
502                                     + g_ai2_ihevc_trans_16[3][k]
503                                                     * pi2_tmp[3 * trans_size]
504                                     + g_ai2_ihevc_trans_16[5][k]
505                                                     * pi2_tmp[5 * trans_size]
506                                     + g_ai2_ihevc_trans_16[7][k]
507                                                     * pi2_tmp[7 * trans_size];
508                 }
509                 for(k = 0; k < 4; k++)
510                 {
511                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
512                                     + g_ai2_ihevc_trans_16[6][k]
513                                                     * pi2_tmp[6 * trans_size];
514                 }
515                 eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size];
516                 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
517                 eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size];
518                 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
519 
520                 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
521                 for(k = 0; k < 2; k++)
522                 {
523                     ee[k] = eee[k] + eeo[k];
524                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
525                 }
526                 for(k = 0; k < 4; k++)
527                 {
528                     e[k] = ee[k] + eo[k];
529                     e[k + 4] = ee[3 - k] - eo[3 - k];
530                 }
531                 for(k = 0; k < 8; k++)
532                 {
533                     WORD32 itrans_out;
534                     itrans_out =
535                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
536                     pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
537                     itrans_out =
538                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
539                     pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
540                 }
541                 pi2_tmp++;
542                 pu1_pred += pred_strd;
543                 pu1_dst += dst_strd;
544             }
545         }
546         else /* All rows of output of 1st stage are non-zero */
547         {
548             for(j = 0; j < trans_size; j++)
549             {
550                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
551                 for(k = 0; k < 8; k++)
552                 {
553                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
554                                     + g_ai2_ihevc_trans_16[3][k]
555                                                     * pi2_tmp[3 * trans_size]
556                                     + g_ai2_ihevc_trans_16[5][k]
557                                                     * pi2_tmp[5 * trans_size]
558                                     + g_ai2_ihevc_trans_16[7][k]
559                                                     * pi2_tmp[7 * trans_size]
560                                     + g_ai2_ihevc_trans_16[9][k]
561                                                     * pi2_tmp[9 * trans_size]
562                                     + g_ai2_ihevc_trans_16[11][k]
563                                                     * pi2_tmp[11 * trans_size]
564                                     + g_ai2_ihevc_trans_16[13][k]
565                                                     * pi2_tmp[13 * trans_size]
566                                     + g_ai2_ihevc_trans_16[15][k]
567                                                     * pi2_tmp[15 * trans_size];
568                 }
569                 for(k = 0; k < 4; k++)
570                 {
571                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
572                                     + g_ai2_ihevc_trans_16[6][k]
573                                                     * pi2_tmp[6 * trans_size]
574                                     + g_ai2_ihevc_trans_16[10][k]
575                                                     * pi2_tmp[10 * trans_size]
576                                     + g_ai2_ihevc_trans_16[14][k]
577                                                     * pi2_tmp[14 * trans_size];
578                 }
579                 eeo[0] =
580                                 g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size]
581                                                 + g_ai2_ihevc_trans_16[12][0]
582                                                                 * pi2_tmp[12
583                                                                                 * trans_size];
584                 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0]
585                                 + g_ai2_ihevc_trans_16[8][0] * pi2_tmp[8 * trans_size];
586                 eeo[1] =
587                                 g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size]
588                                                 + g_ai2_ihevc_trans_16[12][1]
589                                                                 * pi2_tmp[12
590                                                                                 * trans_size];
591                 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0]
592                                 + g_ai2_ihevc_trans_16[8][1] * pi2_tmp[8 * trans_size];
593 
594                 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
595                 for(k = 0; k < 2; k++)
596                 {
597                     ee[k] = eee[k] + eeo[k];
598                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
599                 }
600                 for(k = 0; k < 4; k++)
601                 {
602                     e[k] = ee[k] + eo[k];
603                     e[k + 4] = ee[3 - k] - eo[3 - k];
604                 }
605                 for(k = 0; k < 8; k++)
606                 {
607                     WORD32 itrans_out;
608                     itrans_out =
609                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
610                     pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
611                     itrans_out =
612                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
613                     pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
614                 }
615                 pi2_tmp++;
616                 pu1_pred += pred_strd;
617                 pu1_dst += dst_strd;
618             }
619         }
620         /************************************************************************************************/
621         /************************************END - IT_RECON_16x16****************************************/
622         /************************************************************************************************/
623     }
624     else /* All rows of input are non-zero */
625     {
626         /************************************************************************************************/
627         /**********************************START - IT_RECON_16x16****************************************/
628         /************************************************************************************************/
629 
630         /* Inverse Transform 1st stage */
631         shift = IT_SHIFT_STAGE_1;
632         add = 1 << (shift - 1);
633 
634         for(j = 0; j < row_limit_2nd_stage; j++)
635         {
636             /* Checking for Zero Cols */
637             if((zero_cols & 1) == 1)
638             {
639                 memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
640             }
641             else
642             {
643                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
644                 for(k = 0; k < 8; k++)
645                 {
646                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_src[src_strd]
647                                     + g_ai2_ihevc_trans_16[3][k]
648                                                     * pi2_src[3 * src_strd]
649                                     + g_ai2_ihevc_trans_16[5][k]
650                                                     * pi2_src[5 * src_strd]
651                                     + g_ai2_ihevc_trans_16[7][k]
652                                                     * pi2_src[7 * src_strd]
653                                     + g_ai2_ihevc_trans_16[9][k]
654                                                     * pi2_src[9 * src_strd]
655                                     + g_ai2_ihevc_trans_16[11][k]
656                                                     * pi2_src[11 * src_strd]
657                                     + g_ai2_ihevc_trans_16[13][k]
658                                                     * pi2_src[13 * src_strd]
659                                     + g_ai2_ihevc_trans_16[15][k]
660                                                     * pi2_src[15 * src_strd];
661                 }
662                 for(k = 0; k < 4; k++)
663                 {
664                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_src[2 * src_strd]
665                                     + g_ai2_ihevc_trans_16[6][k]
666                                                     * pi2_src[6 * src_strd]
667                                     + g_ai2_ihevc_trans_16[10][k]
668                                                     * pi2_src[10 * src_strd]
669                                     + g_ai2_ihevc_trans_16[14][k]
670                                                     * pi2_src[14 * src_strd];
671                 }
672                 eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_src[4 * src_strd]
673                                 + g_ai2_ihevc_trans_16[12][0]
674                                                 * pi2_src[12 * src_strd];
675                 eee[0] =
676                                 g_ai2_ihevc_trans_16[0][0] * pi2_src[0]
677                                                 + g_ai2_ihevc_trans_16[8][0]
678                                                                 * pi2_src[8
679                                                                                 * src_strd];
680                 eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_src[4 * src_strd]
681                                 + g_ai2_ihevc_trans_16[12][1]
682                                                 * pi2_src[12 * src_strd];
683                 eee[1] =
684                                 g_ai2_ihevc_trans_16[0][1] * pi2_src[0]
685                                                 + g_ai2_ihevc_trans_16[8][1]
686                                                                 * pi2_src[8
687                                                                                 * src_strd];
688 
689                 /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
690                 for(k = 0; k < 2; k++)
691                 {
692                     ee[k] = eee[k] + eeo[k];
693                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
694                 }
695                 for(k = 0; k < 4; k++)
696                 {
697                     e[k] = ee[k] + eo[k];
698                     e[k + 4] = ee[3 - k] - eo[3 - k];
699                 }
700                 for(k = 0; k < 8; k++)
701                 {
702                     pi2_tmp[k] =
703                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
704                     pi2_tmp[k + 8] =
705                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
706                 }
707             }
708             pi2_src++;
709             pi2_tmp += trans_size;
710             zero_cols = zero_cols >> 1;
711         }
712 
713         pi2_tmp = pi2_tmp_orig;
714 
715         /* Inverse Transform 2nd stage */
716         shift = IT_SHIFT_STAGE_2;
717         add = 1 << (shift - 1);
718         if((zero_rows_2nd_stage & 0xFFF0) == 0xFFF0) /* First 4 rows of output of 1st stage are non-zero */
719         {
720             for(j = 0; j < trans_size; j++)
721             {
722                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
723                 for(k = 0; k < 8; k++)
724                 {
725                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
726                                     + g_ai2_ihevc_trans_16[3][k]
727                                                     * pi2_tmp[3 * trans_size];
728                 }
729                 for(k = 0; k < 4; k++)
730                 {
731                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size];
732                 }
733                 eeo[0] = 0;
734                 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
735                 eeo[1] = 0;
736                 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
737 
738                 /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
739                 for(k = 0; k < 2; k++)
740                 {
741                     ee[k] = eee[k] + eeo[k];
742                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
743                 }
744                 for(k = 0; k < 4; k++)
745                 {
746                     e[k] = ee[k] + eo[k];
747                     e[k + 4] = ee[3 - k] - eo[3 - k];
748                 }
749                 for(k = 0; k < 8; k++)
750                 {
751                     WORD32 itrans_out;
752                     itrans_out =
753                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
754                     pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
755                     itrans_out =
756                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
757                     pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
758                 }
759                 pi2_tmp++;
760                 pu1_pred += pred_strd;
761                 pu1_dst += dst_strd;
762             }
763         }
764         else if((zero_rows_2nd_stage & 0xFF00) == 0xFF00) /* First 8 rows of output of 1st stage are non-zero */
765         {
766             for(j = 0; j < trans_size; j++)
767             {
768                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
769                 for(k = 0; k < 8; k++)
770                 {
771                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
772                                     + g_ai2_ihevc_trans_16[3][k]
773                                                     * pi2_tmp[3 * trans_size]
774                                     + g_ai2_ihevc_trans_16[5][k]
775                                                     * pi2_tmp[5 * trans_size]
776                                     + g_ai2_ihevc_trans_16[7][k]
777                                                     * pi2_tmp[7 * trans_size];
778                 }
779                 for(k = 0; k < 4; k++)
780                 {
781                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
782                                     + g_ai2_ihevc_trans_16[6][k]
783                                                     * pi2_tmp[6 * trans_size];
784                 }
785                 eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size];
786                 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
787                 eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size];
788                 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
789 
790                 /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
791                 for(k = 0; k < 2; k++)
792                 {
793                     ee[k] = eee[k] + eeo[k];
794                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
795                 }
796                 for(k = 0; k < 4; k++)
797                 {
798                     e[k] = ee[k] + eo[k];
799                     e[k + 4] = ee[3 - k] - eo[3 - k];
800                 }
801                 for(k = 0; k < 8; k++)
802                 {
803                     WORD32 itrans_out;
804                     itrans_out =
805                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
806                     pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
807                     itrans_out =
808                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
809                     pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
810                 }
811                 pi2_tmp++;
812                 pu1_pred += pred_strd;
813                 pu1_dst += dst_strd;
814             }
815         }
816         else /* All rows of output of 1st stage are non-zero */
817         {
818             for(j = 0; j < trans_size; j++)
819             {
820                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
821                 for(k = 0; k < 8; k++)
822                 {
823                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
824                                     + g_ai2_ihevc_trans_16[3][k]
825                                                     * pi2_tmp[3 * trans_size]
826                                     + g_ai2_ihevc_trans_16[5][k]
827                                                     * pi2_tmp[5 * trans_size]
828                                     + g_ai2_ihevc_trans_16[7][k]
829                                                     * pi2_tmp[7 * trans_size]
830                                     + g_ai2_ihevc_trans_16[9][k]
831                                                     * pi2_tmp[9 * trans_size]
832                                     + g_ai2_ihevc_trans_16[11][k]
833                                                     * pi2_tmp[11 * trans_size]
834                                     + g_ai2_ihevc_trans_16[13][k]
835                                                     * pi2_tmp[13 * trans_size]
836                                     + g_ai2_ihevc_trans_16[15][k]
837                                                     * pi2_tmp[15 * trans_size];
838                 }
839                 for(k = 0; k < 4; k++)
840                 {
841                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
842                                     + g_ai2_ihevc_trans_16[6][k]
843                                                     * pi2_tmp[6 * trans_size]
844                                     + g_ai2_ihevc_trans_16[10][k]
845                                                     * pi2_tmp[10 * trans_size]
846                                     + g_ai2_ihevc_trans_16[14][k]
847                                                     * pi2_tmp[14 * trans_size];
848                 }
849                 eeo[0] =
850                                 g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size]
851                                                 + g_ai2_ihevc_trans_16[12][0]
852                                                                 * pi2_tmp[12
853                                                                                 * trans_size];
854                 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0]
855                                 + g_ai2_ihevc_trans_16[8][0] * pi2_tmp[8 * trans_size];
856                 eeo[1] =
857                                 g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size]
858                                                 + g_ai2_ihevc_trans_16[12][1]
859                                                                 * pi2_tmp[12
860                                                                                 * trans_size];
861                 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0]
862                                 + g_ai2_ihevc_trans_16[8][1] * pi2_tmp[8 * trans_size];
863 
864                 /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
865                 for(k = 0; k < 2; k++)
866                 {
867                     ee[k] = eee[k] + eeo[k];
868                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
869                 }
870                 for(k = 0; k < 4; k++)
871                 {
872                     e[k] = ee[k] + eo[k];
873                     e[k + 4] = ee[3 - k] - eo[3 - k];
874                 }
875                 for(k = 0; k < 8; k++)
876                 {
877                     WORD32 itrans_out;
878                     itrans_out =
879                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
880                     pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
881                     itrans_out =
882                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
883                     pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
884                 }
885                 pi2_tmp++;
886                 pu1_pred += pred_strd;
887                 pu1_dst += dst_strd;
888             }
889         }
890         /************************************************************************************************/
891         /************************************END - IT_RECON_16x16****************************************/
892         /************************************************************************************************/
893     }
894 }
895 
896