1 /******************************************************************************
2 *
3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 ******************************************************************************/
18 /**
19  *******************************************************************************
20  * @file
21  *  ihevc_itrans_recon_16x16.c
22  *
23  * @brief
24  *  Contains function definitions for inverse transform  and reconstruction 16x16
25  *
26  *
27  * @author
28  *  100470
29  *
30  * @par List of Functions:
31  *  - ihevc_itrans_recon_16x16()
32  *
33  * @remarks
34  *  None
35  *
36  *******************************************************************************
37  */
38 #include <stdio.h>
39 #include <string.h>
40 #include "ihevc_typedefs.h"
41 #include "ihevc_macros.h"
42 #include "ihevc_platform_macros.h"
43 #include "ihevc_defs.h"
44 #include "ihevc_trans_tables.h"
45 #include "ihevc_itrans_recon.h"
46 #include "ihevc_func_selector.h"
47 #include "ihevc_trans_macros.h"
48 
49 /**
50  *******************************************************************************
51  *
52  * @brief
53  *  This function performs inverse transform and reconstruction for a 16x16
54  *  input block
55  *
56  * @par Description:
57  *  Performs the inverse transform, adds the prediction data and clips the
58  *  output to 8 bit
59  *
60  * @param[in] pi2_src
61  *  Input 16x16 coefficients
62  *
63  * @param[in] pi2_tmp
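64  *  Temporary 16x16 buffer for storing the inverse transform
65  *  1st stage output. It is written by the 1st stage and then
66  *  read back as the input of the 2nd stage, so it is scratch
67  *  storage rather than a pure input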
68  *
69  * @param[in] pu1_pred
70  *  Prediction 16x16 block
71  *
72  * @param[out] pu1_dst
73  *  Output 16x16 block
74  *
75  * @param[in] src_strd
76  *  Input stride
77  *
78  * @param[in] pred_strd
79  *  Prediction stride
80  *
81  * @param[in] dst_strd
82  *  Output Stride
83  *
84  * @param[in] zero_cols
85  *  Bit mask of zero columns in pi2_src
86  *
87  * @param[in] zero_rows
88  *  Bit mask of zero rows in pi2_src
89  *
90  * @returns  Void
91  *
92  * @remarks
93  *  None
94  *
95  *******************************************************************************
96  */
97 
ihevc_itrans_recon_16x16(WORD16 * pi2_src,WORD16 * pi2_tmp,UWORD8 * pu1_pred,UWORD8 * pu1_dst,WORD32 src_strd,WORD32 pred_strd,WORD32 dst_strd,WORD32 zero_cols,WORD32 zero_rows)98 void ihevc_itrans_recon_16x16(WORD16 *pi2_src,
99                               WORD16 *pi2_tmp,
100                               UWORD8 *pu1_pred,
101                               UWORD8 *pu1_dst,
102                               WORD32 src_strd,
103                               WORD32 pred_strd,
104                               WORD32 dst_strd,
105                               WORD32 zero_cols,
106                               WORD32 zero_rows)
107 {
108     WORD32 j, k;
109     WORD32 e[8], o[8];
110     WORD32 ee[4], eo[4];
111     WORD32 eee[2], eeo[2];
112     WORD32 add;
113     WORD32 shift;
114     WORD16 *pi2_tmp_orig;
115     WORD32 trans_size;
116     WORD32 zero_rows_2nd_stage = zero_cols;
117     WORD32 row_limit_2nd_stage;
118 
119     if((zero_cols & 0xFFF0) == 0xFFF0)
120         row_limit_2nd_stage = 4;
121     else if((zero_cols & 0xFF00) == 0xFF00)
122         row_limit_2nd_stage = 8;
123     else
124         row_limit_2nd_stage = TRANS_SIZE_16;
125 
126     trans_size = TRANS_SIZE_16;
127     pi2_tmp_orig = pi2_tmp;
128     if((zero_rows & 0xFFF0) == 0xFFF0)  /* First 4 rows of input are non-zero */
129     {
130         /* Inverse Transform 1st stage */
131         /************************************************************************************************/
132         /**********************************START - IT_RECON_16x16****************************************/
133         /************************************************************************************************/
134 
135         shift = IT_SHIFT_STAGE_1;
136         add = 1 << (shift - 1);
137 
138         for(j = 0; j < row_limit_2nd_stage; j++)
139         {
140             /* Checking for Zero Cols */
141             if((zero_cols & 1) == 1)
142             {
143                 memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
144             }
145             else
146             {
147                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
148                 for(k = 0; k < 8; k++)
149                 {
150                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_src[src_strd]
151                                     + g_ai2_ihevc_trans_16[3][k]
152                                                     * pi2_src[3 * src_strd];
153                 }
154                 for(k = 0; k < 4; k++)
155                 {
156                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_src[2 * src_strd];
157                 }
158                 eeo[0] = 0;
159                 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_src[0];
160                 eeo[1] = 0;
161                 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_src[0];
162 
163                 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
164                 for(k = 0; k < 2; k++)
165                 {
166                     ee[k] = eee[k] + eeo[k];
167                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
168                 }
169                 for(k = 0; k < 4; k++)
170                 {
171                     e[k] = ee[k] + eo[k];
172                     e[k + 4] = ee[3 - k] - eo[3 - k];
173                 }
174                 for(k = 0; k < 8; k++)
175                 {
176                     pi2_tmp[k] =
177                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
178                     pi2_tmp[k + 8] =
179                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
180                 }
181             }
182             pi2_src++;
183             pi2_tmp += trans_size;
184             zero_cols = zero_cols >> 1;
185         }
186 
187         pi2_tmp = pi2_tmp_orig;
188 
189         /* Inverse Transform 2nd stage */
190         shift = IT_SHIFT_STAGE_2;
191         add = 1 << (shift - 1);
192 
193         if((zero_rows_2nd_stage & 0xFFF0) == 0xFFF0) /* First 4 rows of output of 1st stage are non-zero */
194         {
195             for(j = 0; j < trans_size; j++)
196             {
197                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
198                 for(k = 0; k < 8; k++)
199                 {
200                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
201                                     + g_ai2_ihevc_trans_16[3][k]
202                                                     * pi2_tmp[3 * trans_size];
203                 }
204                 for(k = 0; k < 4; k++)
205                 {
206                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size];
207                 }
208                 eeo[0] = 0;
209                 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
210                 eeo[1] = 0;
211                 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
212 
213                 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
214                 for(k = 0; k < 2; k++)
215                 {
216                     ee[k] = eee[k] + eeo[k];
217                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
218                 }
219                 for(k = 0; k < 4; k++)
220                 {
221                     e[k] = ee[k] + eo[k];
222                     e[k + 4] = ee[3 - k] - eo[3 - k];
223                 }
224                 for(k = 0; k < 8; k++)
225                 {
226                     WORD32 itrans_out;
227                     itrans_out =
228                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
229                     pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
230                     itrans_out =
231                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
232                     pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
233                 }
234                 pi2_tmp++;
235                 pu1_pred += pred_strd;
236                 pu1_dst += dst_strd;
237             }
238         }
239         else if((zero_rows_2nd_stage & 0xFF00) == 0xFF00) /* First 4 rows of output of 1st stage are non-zero */
240         {
241             for(j = 0; j < trans_size; j++)
242             {
243                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
244                 for(k = 0; k < 8; k++)
245                 {
246                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
247                                     + g_ai2_ihevc_trans_16[3][k]
248                                                     * pi2_tmp[3 * trans_size]
249                                     + g_ai2_ihevc_trans_16[5][k]
250                                                     * pi2_tmp[5 * trans_size]
251                                     + g_ai2_ihevc_trans_16[7][k]
252                                                     * pi2_tmp[7 * trans_size];
253                 }
254                 for(k = 0; k < 4; k++)
255                 {
256                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
257                                     + g_ai2_ihevc_trans_16[6][k]
258                                                     * pi2_tmp[6 * trans_size];
259                 }
260                 eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size];
261                 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
262                 eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size];
263                 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
264 
265                 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
266                 for(k = 0; k < 2; k++)
267                 {
268                     ee[k] = eee[k] + eeo[k];
269                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
270                 }
271                 for(k = 0; k < 4; k++)
272                 {
273                     e[k] = ee[k] + eo[k];
274                     e[k + 4] = ee[3 - k] - eo[3 - k];
275                 }
276                 for(k = 0; k < 8; k++)
277                 {
278                     WORD32 itrans_out;
279                     itrans_out =
280                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
281                     pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
282                     itrans_out =
283                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
284                     pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
285                 }
286                 pi2_tmp++;
287                 pu1_pred += pred_strd;
288                 pu1_dst += dst_strd;
289             }
290         }
291         else /* All rows of output of 1st stage are non-zero */
292         {
293             for(j = 0; j < trans_size; j++)
294             {
295                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
296                 for(k = 0; k < 8; k++)
297                 {
298                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
299                                     + g_ai2_ihevc_trans_16[3][k]
300                                                     * pi2_tmp[3 * trans_size]
301                                     + g_ai2_ihevc_trans_16[5][k]
302                                                     * pi2_tmp[5 * trans_size]
303                                     + g_ai2_ihevc_trans_16[7][k]
304                                                     * pi2_tmp[7 * trans_size]
305                                     + g_ai2_ihevc_trans_16[9][k]
306                                                     * pi2_tmp[9 * trans_size]
307                                     + g_ai2_ihevc_trans_16[11][k]
308                                                     * pi2_tmp[11 * trans_size]
309                                     + g_ai2_ihevc_trans_16[13][k]
310                                                     * pi2_tmp[13 * trans_size]
311                                     + g_ai2_ihevc_trans_16[15][k]
312                                                     * pi2_tmp[15 * trans_size];
313                 }
314                 for(k = 0; k < 4; k++)
315                 {
316                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
317                                     + g_ai2_ihevc_trans_16[6][k]
318                                                     * pi2_tmp[6 * trans_size]
319                                     + g_ai2_ihevc_trans_16[10][k]
320                                                     * pi2_tmp[10 * trans_size]
321                                     + g_ai2_ihevc_trans_16[14][k]
322                                                     * pi2_tmp[14 * trans_size];
323                 }
324                 eeo[0] =
325                                 g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size]
326                                                 + g_ai2_ihevc_trans_16[12][0]
327                                                                 * pi2_tmp[12
328                                                                                 * trans_size];
329                 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0]
330                                 + g_ai2_ihevc_trans_16[8][0] * pi2_tmp[8 * trans_size];
331                 eeo[1] =
332                                 g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size]
333                                                 + g_ai2_ihevc_trans_16[12][1]
334                                                                 * pi2_tmp[12
335                                                                                 * trans_size];
336                 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0]
337                                 + g_ai2_ihevc_trans_16[8][1] * pi2_tmp[8 * trans_size];
338 
339                 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
340                 for(k = 0; k < 2; k++)
341                 {
342                     ee[k] = eee[k] + eeo[k];
343                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
344                 }
345                 for(k = 0; k < 4; k++)
346                 {
347                     e[k] = ee[k] + eo[k];
348                     e[k + 4] = ee[3 - k] - eo[3 - k];
349                 }
350                 for(k = 0; k < 8; k++)
351                 {
352                     WORD32 itrans_out;
353                     itrans_out =
354                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
355                     pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
356                     itrans_out =
357                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
358                     pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
359                 }
360                 pi2_tmp++;
361                 pu1_pred += pred_strd;
362                 pu1_dst += dst_strd;
363             }
364         }
365         /************************************************************************************************/
366         /************************************END - IT_RECON_16x16****************************************/
367         /************************************************************************************************/
368     }
369     else if((zero_rows & 0xFF00) == 0xFF00)  /* First 8 rows of input are non-zero */
370     {
371         /* Inverse Transform 1st stage */
372         /************************************************************************************************/
373         /**********************************START - IT_RECON_16x16****************************************/
374         /************************************************************************************************/
375 
376         shift = IT_SHIFT_STAGE_1;
377         add = 1 << (shift - 1);
378 
379         for(j = 0; j < row_limit_2nd_stage; j++)
380         {
381             /* Checking for Zero Cols */
382             if((zero_cols & 1) == 1)
383             {
384                 memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
385             }
386             else
387             {
388                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
389                 for(k = 0; k < 8; k++)
390                 {
391                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_src[src_strd]
392                                     + g_ai2_ihevc_trans_16[3][k]
393                                                     * pi2_src[3 * src_strd]
394                                     + g_ai2_ihevc_trans_16[5][k]
395                                                     * pi2_src[5 * src_strd]
396                                     + g_ai2_ihevc_trans_16[7][k]
397                                                     * pi2_src[7 * src_strd];
398                 }
399                 for(k = 0; k < 4; k++)
400                 {
401                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_src[2 * src_strd]
402                                     + g_ai2_ihevc_trans_16[6][k]
403                                                     * pi2_src[6 * src_strd];
404                 }
405                 eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_src[4 * src_strd];
406                 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_src[0];
407                 eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_src[4 * src_strd];
408                 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_src[0];
409 
410                 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
411                 for(k = 0; k < 2; k++)
412                 {
413                     ee[k] = eee[k] + eeo[k];
414                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
415                 }
416                 for(k = 0; k < 4; k++)
417                 {
418                     e[k] = ee[k] + eo[k];
419                     e[k + 4] = ee[3 - k] - eo[3 - k];
420                 }
421                 for(k = 0; k < 8; k++)
422                 {
423                     pi2_tmp[k] =
424                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
425                     pi2_tmp[k + 8] =
426                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
427                 }
428             }
429             pi2_src++;
430             pi2_tmp += trans_size;
431             zero_cols = zero_cols >> 1;
432         }
433 
434         pi2_tmp = pi2_tmp_orig;
435 
436         /* Inverse Transform 2nd stage */
437         shift = IT_SHIFT_STAGE_2;
438         add = 1 << (shift - 1);
439 
440         if((zero_rows_2nd_stage & 0xFFF0) == 0xFFF0) /* First 4 rows of output of 1st stage are non-zero */
441         {
442             for(j = 0; j < trans_size; j++)
443             {
444                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
445                 for(k = 0; k < 8; k++)
446                 {
447                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
448                                     + g_ai2_ihevc_trans_16[3][k]
449                                                     * pi2_tmp[3 * trans_size];
450                 }
451                 for(k = 0; k < 4; k++)
452                 {
453                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size];
454                 }
455                 eeo[0] = 0;
456                 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
457                 eeo[1] = 0;
458                 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
459 
460                 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
461                 for(k = 0; k < 2; k++)
462                 {
463                     ee[k] = eee[k] + eeo[k];
464                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
465                 }
466                 for(k = 0; k < 4; k++)
467                 {
468                     e[k] = ee[k] + eo[k];
469                     e[k + 4] = ee[3 - k] - eo[3 - k];
470                 }
471                 for(k = 0; k < 8; k++)
472                 {
473                     WORD32 itrans_out;
474                     itrans_out =
475                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
476                     pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
477                     itrans_out =
478                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
479                     pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
480                 }
481                 pi2_tmp++;
482                 pu1_pred += pred_strd;
483                 pu1_dst += dst_strd;
484             }
485         }
486         else if((zero_rows_2nd_stage & 0xFF00) == 0xFF00) /* First 4 rows of output of 1st stage are non-zero */
487         {
488             for(j = 0; j < trans_size; j++)
489             {
490                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
491                 for(k = 0; k < 8; k++)
492                 {
493                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
494                                     + g_ai2_ihevc_trans_16[3][k]
495                                                     * pi2_tmp[3 * trans_size]
496                                     + g_ai2_ihevc_trans_16[5][k]
497                                                     * pi2_tmp[5 * trans_size]
498                                     + g_ai2_ihevc_trans_16[7][k]
499                                                     * pi2_tmp[7 * trans_size];
500                 }
501                 for(k = 0; k < 4; k++)
502                 {
503                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
504                                     + g_ai2_ihevc_trans_16[6][k]
505                                                     * pi2_tmp[6 * trans_size];
506                 }
507                 eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size];
508                 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
509                 eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size];
510                 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
511 
512                 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
513                 for(k = 0; k < 2; k++)
514                 {
515                     ee[k] = eee[k] + eeo[k];
516                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
517                 }
518                 for(k = 0; k < 4; k++)
519                 {
520                     e[k] = ee[k] + eo[k];
521                     e[k + 4] = ee[3 - k] - eo[3 - k];
522                 }
523                 for(k = 0; k < 8; k++)
524                 {
525                     WORD32 itrans_out;
526                     itrans_out =
527                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
528                     pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
529                     itrans_out =
530                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
531                     pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
532                 }
533                 pi2_tmp++;
534                 pu1_pred += pred_strd;
535                 pu1_dst += dst_strd;
536             }
537         }
538         else /* All rows of output of 1st stage are non-zero */
539         {
540             for(j = 0; j < trans_size; j++)
541             {
542                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
543                 for(k = 0; k < 8; k++)
544                 {
545                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
546                                     + g_ai2_ihevc_trans_16[3][k]
547                                                     * pi2_tmp[3 * trans_size]
548                                     + g_ai2_ihevc_trans_16[5][k]
549                                                     * pi2_tmp[5 * trans_size]
550                                     + g_ai2_ihevc_trans_16[7][k]
551                                                     * pi2_tmp[7 * trans_size]
552                                     + g_ai2_ihevc_trans_16[9][k]
553                                                     * pi2_tmp[9 * trans_size]
554                                     + g_ai2_ihevc_trans_16[11][k]
555                                                     * pi2_tmp[11 * trans_size]
556                                     + g_ai2_ihevc_trans_16[13][k]
557                                                     * pi2_tmp[13 * trans_size]
558                                     + g_ai2_ihevc_trans_16[15][k]
559                                                     * pi2_tmp[15 * trans_size];
560                 }
561                 for(k = 0; k < 4; k++)
562                 {
563                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
564                                     + g_ai2_ihevc_trans_16[6][k]
565                                                     * pi2_tmp[6 * trans_size]
566                                     + g_ai2_ihevc_trans_16[10][k]
567                                                     * pi2_tmp[10 * trans_size]
568                                     + g_ai2_ihevc_trans_16[14][k]
569                                                     * pi2_tmp[14 * trans_size];
570                 }
571                 eeo[0] =
572                                 g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size]
573                                                 + g_ai2_ihevc_trans_16[12][0]
574                                                                 * pi2_tmp[12
575                                                                                 * trans_size];
576                 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0]
577                                 + g_ai2_ihevc_trans_16[8][0] * pi2_tmp[8 * trans_size];
578                 eeo[1] =
579                                 g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size]
580                                                 + g_ai2_ihevc_trans_16[12][1]
581                                                                 * pi2_tmp[12
582                                                                                 * trans_size];
583                 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0]
584                                 + g_ai2_ihevc_trans_16[8][1] * pi2_tmp[8 * trans_size];
585 
586                 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
587                 for(k = 0; k < 2; k++)
588                 {
589                     ee[k] = eee[k] + eeo[k];
590                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
591                 }
592                 for(k = 0; k < 4; k++)
593                 {
594                     e[k] = ee[k] + eo[k];
595                     e[k + 4] = ee[3 - k] - eo[3 - k];
596                 }
597                 for(k = 0; k < 8; k++)
598                 {
599                     WORD32 itrans_out;
600                     itrans_out =
601                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
602                     pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
603                     itrans_out =
604                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
605                     pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
606                 }
607                 pi2_tmp++;
608                 pu1_pred += pred_strd;
609                 pu1_dst += dst_strd;
610             }
611         }
612         /************************************************************************************************/
613         /************************************END - IT_RECON_16x16****************************************/
614         /************************************************************************************************/
615     }
616     else  /* All rows of input are non-zero */
617     {
618         /* Inverse Transform 1st stage */
619         /************************************************************************************************/
620         /**********************************START - IT_RECON_16x16****************************************/
621         /************************************************************************************************/
622 
623         shift = IT_SHIFT_STAGE_1;
624         add = 1 << (shift - 1);
625 
626         for(j = 0; j < row_limit_2nd_stage; j++)
627         {
628             /* Checking for Zero Cols */
629             if((zero_cols & 1) == 1)
630             {
631                 memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
632             }
633             else
634             {
635                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
636                 for(k = 0; k < 8; k++)
637                 {
638                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_src[src_strd]
639                                     + g_ai2_ihevc_trans_16[3][k]
640                                                     * pi2_src[3 * src_strd]
641                                     + g_ai2_ihevc_trans_16[5][k]
642                                                     * pi2_src[5 * src_strd]
643                                     + g_ai2_ihevc_trans_16[7][k]
644                                                     * pi2_src[7 * src_strd]
645                                     + g_ai2_ihevc_trans_16[9][k]
646                                                     * pi2_src[9 * src_strd]
647                                     + g_ai2_ihevc_trans_16[11][k]
648                                                     * pi2_src[11 * src_strd]
649                                     + g_ai2_ihevc_trans_16[13][k]
650                                                     * pi2_src[13 * src_strd]
651                                     + g_ai2_ihevc_trans_16[15][k]
652                                                     * pi2_src[15 * src_strd];
653                 }
654                 for(k = 0; k < 4; k++)
655                 {
656                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_src[2 * src_strd]
657                                     + g_ai2_ihevc_trans_16[6][k]
658                                                     * pi2_src[6 * src_strd]
659                                     + g_ai2_ihevc_trans_16[10][k]
660                                                     * pi2_src[10 * src_strd]
661                                     + g_ai2_ihevc_trans_16[14][k]
662                                                     * pi2_src[14 * src_strd];
663                 }
664                 eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_src[4 * src_strd]
665                                 + g_ai2_ihevc_trans_16[12][0]
666                                                 * pi2_src[12 * src_strd];
667                 eee[0] =
668                                 g_ai2_ihevc_trans_16[0][0] * pi2_src[0]
669                                                 + g_ai2_ihevc_trans_16[8][0]
670                                                                 * pi2_src[8
671                                                                                 * src_strd];
672                 eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_src[4 * src_strd]
673                                 + g_ai2_ihevc_trans_16[12][1]
674                                                 * pi2_src[12 * src_strd];
675                 eee[1] =
676                                 g_ai2_ihevc_trans_16[0][1] * pi2_src[0]
677                                                 + g_ai2_ihevc_trans_16[8][1]
678                                                                 * pi2_src[8
679                                                                                 * src_strd];
680 
681                 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
682                 for(k = 0; k < 2; k++)
683                 {
684                     ee[k] = eee[k] + eeo[k];
685                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
686                 }
687                 for(k = 0; k < 4; k++)
688                 {
689                     e[k] = ee[k] + eo[k];
690                     e[k + 4] = ee[3 - k] - eo[3 - k];
691                 }
692                 for(k = 0; k < 8; k++)
693                 {
694                     pi2_tmp[k] =
695                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
696                     pi2_tmp[k + 8] =
697                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
698                 }
699             }
700             pi2_src++;
701             pi2_tmp += trans_size;
702             zero_cols = zero_cols >> 1;
703         }
704 
705         pi2_tmp = pi2_tmp_orig;
706 
707         /* Inverse Transform 2nd stage */
708         shift = IT_SHIFT_STAGE_2;
709         add = 1 << (shift - 1);
710 
711         if((zero_rows_2nd_stage & 0xFFF0) == 0xFFF0) /* First 4 rows of output of 1st stage are non-zero */
712         {
713             for(j = 0; j < trans_size; j++)
714             {
715                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
716                 for(k = 0; k < 8; k++)
717                 {
718                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
719                                     + g_ai2_ihevc_trans_16[3][k]
720                                                     * pi2_tmp[3 * trans_size];
721                 }
722                 for(k = 0; k < 4; k++)
723                 {
724                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size];
725                 }
726                 eeo[0] = 0;
727                 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
728                 eeo[1] = 0;
729                 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
730 
731                 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
732                 for(k = 0; k < 2; k++)
733                 {
734                     ee[k] = eee[k] + eeo[k];
735                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
736                 }
737                 for(k = 0; k < 4; k++)
738                 {
739                     e[k] = ee[k] + eo[k];
740                     e[k + 4] = ee[3 - k] - eo[3 - k];
741                 }
742                 for(k = 0; k < 8; k++)
743                 {
744                     WORD32 itrans_out;
745                     itrans_out =
746                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
747                     pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
748                     itrans_out =
749                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
750                     pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
751                 }
752                 pi2_tmp++;
753                 pu1_pred += pred_strd;
754                 pu1_dst += dst_strd;
755             }
756         }
        else if((zero_rows_2nd_stage & 0xFF00) == 0xFF00) /* First 8 rows of output of 1st stage are non-zero */
758         {
759             for(j = 0; j < trans_size; j++)
760             {
761                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
762                 for(k = 0; k < 8; k++)
763                 {
764                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
765                                     + g_ai2_ihevc_trans_16[3][k]
766                                                     * pi2_tmp[3 * trans_size]
767                                     + g_ai2_ihevc_trans_16[5][k]
768                                                     * pi2_tmp[5 * trans_size]
769                                     + g_ai2_ihevc_trans_16[7][k]
770                                                     * pi2_tmp[7 * trans_size];
771                 }
772                 for(k = 0; k < 4; k++)
773                 {
774                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
775                                     + g_ai2_ihevc_trans_16[6][k]
776                                                     * pi2_tmp[6 * trans_size];
777                 }
778                 eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size];
779                 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
780                 eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size];
781                 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
782 
783                 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
784                 for(k = 0; k < 2; k++)
785                 {
786                     ee[k] = eee[k] + eeo[k];
787                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
788                 }
789                 for(k = 0; k < 4; k++)
790                 {
791                     e[k] = ee[k] + eo[k];
792                     e[k + 4] = ee[3 - k] - eo[3 - k];
793                 }
794                 for(k = 0; k < 8; k++)
795                 {
796                     WORD32 itrans_out;
797                     itrans_out =
798                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
799                     pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
800                     itrans_out =
801                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
802                     pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
803                 }
804                 pi2_tmp++;
805                 pu1_pred += pred_strd;
806                 pu1_dst += dst_strd;
807             }
808         }
809         else /* All rows of output of 1st stage are non-zero */
810         {
811             for(j = 0; j < trans_size; j++)
812             {
813                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
814                 for(k = 0; k < 8; k++)
815                 {
816                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
817                                     + g_ai2_ihevc_trans_16[3][k]
818                                                     * pi2_tmp[3 * trans_size]
819                                     + g_ai2_ihevc_trans_16[5][k]
820                                                     * pi2_tmp[5 * trans_size]
821                                     + g_ai2_ihevc_trans_16[7][k]
822                                                     * pi2_tmp[7 * trans_size]
823                                     + g_ai2_ihevc_trans_16[9][k]
824                                                     * pi2_tmp[9 * trans_size]
825                                     + g_ai2_ihevc_trans_16[11][k]
826                                                     * pi2_tmp[11 * trans_size]
827                                     + g_ai2_ihevc_trans_16[13][k]
828                                                     * pi2_tmp[13 * trans_size]
829                                     + g_ai2_ihevc_trans_16[15][k]
830                                                     * pi2_tmp[15 * trans_size];
831                 }
832                 for(k = 0; k < 4; k++)
833                 {
834                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
835                                     + g_ai2_ihevc_trans_16[6][k]
836                                                     * pi2_tmp[6 * trans_size]
837                                     + g_ai2_ihevc_trans_16[10][k]
838                                                     * pi2_tmp[10 * trans_size]
839                                     + g_ai2_ihevc_trans_16[14][k]
840                                                     * pi2_tmp[14 * trans_size];
841                 }
842                 eeo[0] =
843                                 g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size]
844                                                 + g_ai2_ihevc_trans_16[12][0]
845                                                                 * pi2_tmp[12
846                                                                                 * trans_size];
847                 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0]
848                                 + g_ai2_ihevc_trans_16[8][0] * pi2_tmp[8 * trans_size];
849                 eeo[1] =
850                                 g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size]
851                                                 + g_ai2_ihevc_trans_16[12][1]
852                                                                 * pi2_tmp[12
853                                                                                 * trans_size];
854                 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0]
855                                 + g_ai2_ihevc_trans_16[8][1] * pi2_tmp[8 * trans_size];
856 
857                 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
858                 for(k = 0; k < 2; k++)
859                 {
860                     ee[k] = eee[k] + eeo[k];
861                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
862                 }
863                 for(k = 0; k < 4; k++)
864                 {
865                     e[k] = ee[k] + eo[k];
866                     e[k + 4] = ee[3 - k] - eo[3 - k];
867                 }
868                 for(k = 0; k < 8; k++)
869                 {
870                     WORD32 itrans_out;
871                     itrans_out =
872                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
873                     pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
874                     itrans_out =
875                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
876                     pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
877                 }
878                 pi2_tmp++;
879                 pu1_pred += pred_strd;
880                 pu1_dst += dst_strd;
881             }
882         }
883         /************************************************************************************************/
884         /************************************END - IT_RECON_16x16****************************************/
885         /************************************************************************************************/
886     }
887 
888 }
889 
890