1 /******************************************************************************
2 *
3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 ******************************************************************************/
18 /**
19  *******************************************************************************
20  * @file
21  *  ihevc_itrans.c
22  *
23  * @brief
24  *  Contains function definitions for single stage  inverse transform
25  *
26  * @author
27  *  100470
28  *
29  * @par List of Functions:
30  *  - ihevc_itrans_4x4_ttype1()
31  *  - ihevc_itrans_4x4()
32  *  - ihevc_itrans_8x8()
33  *  - ihevc_itrans_16x16()
34  *  - ihevc_itrans_32x32()
35  *
36  * @remarks
37  *  None
38  *
39  *******************************************************************************
40  */
41 #include <stdio.h>
42 #include <string.h>
43 #include "ihevc_typedefs.h"
44 #include "ihevc_macros.h"
45 #include "ihevc_platform_macros.h"
46 #include "ihevc_defs.h"
47 #include "ihevc_trans_tables.h"
48 #include "ihevc_func_selector.h"
49 #include "ihevc_trans_macros.h"
50 
51 #define NON_OPTIMIZED 1
52 
53 /**
54  *******************************************************************************
55  *
56  * @brief
57  *  This function performs Single stage  Inverse transform type 1 (DST) for
58  * 4x4 input block
59  *
60  * @par Description:
61  *  Performs single stage 4x4 inverse transform type 1  by utilizing the
62  * symmetry of transformation matrix  and reducing number of multiplications
63  * wherever  possible but keeping the number of operations
64  * (addition,multiplication and shift)same
65  *
66  * @param[in] pi2_src
67  *  Input 4x4 coefficients
68  *
69  * @param[out] pi2_dst
70  *  Output 4x4 block
71  *
72  * @param[in] src_strd
73  *  Input stride
74  *
75  * @param[in] dst_strd
76  *  Output Stride
77  *
78  * @param[in] i4_shift
79  *  Output shift
80  *
81  * @param[in] zero_cols
82  *  Zero columns in pi2_src
83  *
84  * @returns  Void
85  *
86  * @remarks
87  *  None
88  *
89  *******************************************************************************
90  */
91 
92 
void ihevc_itrans_4x4_ttype1(WORD16 *pi2_src,
                             WORD16 *pi2_dst,
                             WORD32 src_strd,
                             WORD32 dst_strd,
                             WORD32 i4_shift,
                             WORD32 zero_cols)
{
    /* Rounding offset applied just before the final right shift */
    const WORD32 rnd = 1 << (i4_shift - 1);
    WORD32 col;

    /* Each pass consumes one source column (elements src_strd apart) and
     * emits one destination row of 4 samples. zero_cols is shifted right
     * every pass, so bit 0 always describes the current column. */
    for(col = 0; col < TRANS_SIZE_4;
        col++, pi2_src++, pi2_dst += dst_strd, zero_cols >>= 1)
    {
        WORD32 sum02, sum23, diff03, mid;

        if((zero_cols & 1) != 0)
        {
            /* Column flagged as all-zero: the output row is zero as well */
            memset(pi2_dst, 0, TRANS_SIZE_4 * sizeof(WORD16));
            continue;
        }

        /* Shared partial terms reused by several outputs */
        sum02 = pi2_src[0] + pi2_src[2 * src_strd];
        sum23 = pi2_src[2 * src_strd] + pi2_src[3 * src_strd];
        diff03 = pi2_src[0] - pi2_src[3 * src_strd];
        mid = 74 * pi2_src[src_strd];

        pi2_dst[0] = CLIP_S16((29 * sum02 + 55 * sum23 + mid + rnd) >> i4_shift);
        pi2_dst[1] = CLIP_S16((55 * diff03 - 29 * sum23 + mid + rnd) >> i4_shift);
        /* This output reads the source again after the two stores above,
         * exactly where the reads have always been performed */
        pi2_dst[2] = CLIP_S16((74 * (pi2_src[0] - pi2_src[2 * src_strd]
                                     + pi2_src[3 * src_strd]) + rnd) >> i4_shift);
        pi2_dst[3] = CLIP_S16((55 * sum02 + 29 * diff03 - mid + rnd) >> i4_shift);
    }
}
134 
135 
136 /**
137  *******************************************************************************
138  *
139  * @brief
140  *  This function performs Single stage  Inverse transform for 4x4 input
141  * block
142  *
143  * @par Description:
144  *  Performs single stage 4x4 inverse transform by utilizing  the symmetry of
145  * transformation matrix and reducing number  of multiplications wherever
146  * possible but keeping the  number of operations(addition,multiplication and
147  * shift)  same
148  *
149  * @param[in] pi2_src
150  *  Input 4x4 coefficients
151  *
152  * @param[out] pi2_dst
153  *  Output 4x4 block
154  *
155  * @param[in] src_strd
156  *  Input stride
157  *
158  * @param[in] dst_strd
159  *  Output Stride
160  *
161  * @param[in] i4_shift
162  *  Output shift
163  *
164  * @param[in] zero_cols
165  *  Zero columns in pi2_src
166  *
167  * @returns  Void
168  *
169  * @remarks
170  *  None
171  *
172  *******************************************************************************
173  */
174 
175 #if NON_OPTIMIZED
void ihevc_itrans_4x4(WORD16 *pi2_src,
                      WORD16 *pi2_dst,
                      WORD32 src_strd,
                      WORD32 dst_strd,
                      WORD32 i4_shift,
                      WORD32 zero_cols)
{
    /* Rounding offset applied just before the final right shift */
    const WORD32 rnd = 1 << (i4_shift - 1);
    WORD32 col;

    /* One source column in, one destination row out per pass; bit 0 of
     * zero_cols always refers to the column currently being processed. */
    for(col = 0; col < TRANS_SIZE_4;
        col++, pi2_src++, pi2_dst += dst_strd, zero_cols >>= 1)
    {
        WORD32 s0, s1, s2, s3;
        WORD32 even0, even1, odd0, odd1;

        if((zero_cols & 1) != 0)
        {
            /* Column flagged as all-zero: emit a zero row and move on */
            memset(pi2_dst, 0, TRANS_SIZE_4 * sizeof(WORD16));
            continue;
        }

        /* Fetch the four coefficients of this column */
        s0 = pi2_src[0];
        s1 = pi2_src[src_strd];
        s2 = pi2_src[2 * src_strd];
        s3 = pi2_src[3 * src_strd];

        /* Odd part is built from rows 1 and 3 of the table, even part from
         * rows 0 and 2; only table columns 0 and 1 are needed because the
         * remaining outputs are obtained by the +/- butterfly below. */
        odd0 = g_ai2_ihevc_trans_4[1][0] * s1 + g_ai2_ihevc_trans_4[3][0] * s3;
        odd1 = g_ai2_ihevc_trans_4[1][1] * s1 + g_ai2_ihevc_trans_4[3][1] * s3;
        even0 = g_ai2_ihevc_trans_4[0][0] * s0 + g_ai2_ihevc_trans_4[2][0] * s2;
        even1 = g_ai2_ihevc_trans_4[0][1] * s0 + g_ai2_ihevc_trans_4[2][1] * s2;

        pi2_dst[0] = CLIP_S16(((even0 + odd0 + rnd) >> i4_shift));
        pi2_dst[1] = CLIP_S16(((even1 + odd1 + rnd) >> i4_shift));
        pi2_dst[2] = CLIP_S16(((even1 - odd1 + rnd) >> i4_shift));
        pi2_dst[3] = CLIP_S16(((even0 - odd0 + rnd) >> i4_shift));
    }
}
224 #else
void ihevc_itrans_4x4(WORD16 *pi2_src,
                      WORD16 *pi2_dst,
                      WORD32 src_strd,
                      WORD32 dst_strd,
                      WORD32 i4_shift,
                      WORD32 zero_cols)
{
    WORD32 j;
    WORD32 e[2], o[2];
    WORD32 add;

    /* Rounding offset applied just before the final right shift */
    add = 1 << (i4_shift - 1);

    /***************************************************************************/
    /* Transform Matrix 4x4                                                    */
    /*      0   1   2   3                                                      */
    /* 0 { 64, 64, 64, 64},                                                    */
    /* 1 { 83, 36,-36,-83},                                                    */
    /* 2 { 64,-64,-64, 64},                                                    */
    /* 3 { 36,-83, 83,-36}                                                     */
    /***************************************************************************/

    /* One source column in, one destination row out per iteration; bit 0 of
     * zero_cols always refers to the column currently being processed. */
    for(j = 0; j < TRANS_SIZE_4; j++)
    {
        /* Checking for Zero Cols */
        if((zero_cols & 1) == 1)
        {
            memset(pi2_dst, 0, TRANS_SIZE_4 * sizeof(WORD16));
        }
        else
        {
            WORD32 s0 = pi2_src[0];
            WORD32 s1 = pi2_src[src_strd];
            WORD32 s2 = pi2_src[2 * src_strd];
            WORD32 s3 = pi2_src[3 * src_strd];

            /* Common term of o[0] and o[1]:                        */
            /*   83*s1 + 36*s3 = 36*(s1 + s3) +  47*s1              */
            /*   36*s1 - 83*s3 = 36*(s1 + s3) - 119*s3              */
            WORD32 temp = (s1 + s3) * 36;

            o[0] = temp + 47 * s1;
            o[1] = temp - 119 * s3;

            /* Rows 0 and 2 only contain +/-64. A multiplication is used */
            /* instead of "<< 6" because the operand can be negative and */
            /* left-shifting a negative signed value is undefined in C.  */
            e[0] = (s0 + s2) * 64;
            e[1] = (s0 - s2) * 64;

            pi2_dst[0] =
                            CLIP_S16(((e[0] + o[0] + add) >> i4_shift));
            pi2_dst[1] =
                            CLIP_S16(((e[1] + o[1] + add) >> i4_shift));
            pi2_dst[2] =
                            CLIP_S16(((e[1] - o[1] + add) >> i4_shift));
            pi2_dst[3] =
                            CLIP_S16(((e[0] - o[0] + add) >> i4_shift));
        }
        pi2_src++;
        pi2_dst += dst_strd;
        zero_cols = zero_cols >> 1;
    }
}
280 #endif
281 
282 /**
283  *******************************************************************************
284  *
285  * @brief
286  *  This function performs Single stage  Inverse transform for 8x8 input
287  * block
288  *
289  * @par Description:
290  *  Performs single stage 8x8 inverse transform by utilizing  the symmetry of
291  * transformation matrix and reducing number  of multiplications wherever
292  * possible but keeping the  number of operations(addition,multiplication and
293  * shift)  same
294  *
295  * @param[in] pi2_src
296  *  Input 8x8 coefficients
297  *
298  * @param[out] pi2_dst
299  *  Output 8x8 block
300  *
301  * @param[in] src_strd
302  *  Input stride
303  *
304  * @param[in] dst_strd
305  *  Output Stride
306  *
307  * @param[in] i4_shift
308  *  Output shift
309  *
310  * @param[in] zero_cols
311  *  Zero columns in pi2_src
312  *
313  * @returns  Void
314  *
315  * @remarks
316  *  None
317  *
318  *******************************************************************************
319  */
320 
321 #if NON_OPTIMIZED
void ihevc_itrans_8x8(WORD16 *pi2_src,
                      WORD16 *pi2_dst,
                      WORD32 src_strd,
                      WORD32 dst_strd,
                      WORD32 i4_shift,
                      WORD32 zero_cols)
{
    /* Rounding offset applied just before the final right shift */
    const WORD32 rnd = 1 << (i4_shift - 1);
    WORD32 col, k, row;

    /* One source column in, one destination row out per pass; bit 0 of
     * zero_cols always refers to the column currently being processed. */
    for(col = 0; col < TRANS_SIZE_8;
        col++, pi2_src++, pi2_dst += dst_strd, zero_cols >>= 1)
    {
        WORD32 odd[4], even[4];
        WORD32 ee0, ee1, eo0, eo1;

        if((zero_cols & 1) != 0)
        {
            /* Column flagged as all-zero: emit a zero row and move on */
            memset(pi2_dst, 0, TRANS_SIZE_8 * sizeof(WORD16));
            continue;
        }

        /* Odd part: accumulate table rows 1, 3, 5, 7 against the matching
         * source rows for each of the first four table columns */
        for(k = 0; k < 4; k++)
        {
            WORD32 acc = 0;

            for(row = 1; row < 8; row += 2)
            {
                acc += g_ai2_ihevc_trans_8[row][k] * pi2_src[row * src_strd];
            }
            odd[k] = acc;
        }

        /* Even part, split once more into its own odd (rows 2, 6) and
         * even (rows 0, 4) halves */
        eo0 = g_ai2_ihevc_trans_8[2][0] * pi2_src[2 * src_strd]
                        + g_ai2_ihevc_trans_8[6][0] * pi2_src[6 * src_strd];
        eo1 = g_ai2_ihevc_trans_8[2][1] * pi2_src[2 * src_strd]
                        + g_ai2_ihevc_trans_8[6][1] * pi2_src[6 * src_strd];
        ee0 = g_ai2_ihevc_trans_8[0][0] * pi2_src[0]
                        + g_ai2_ihevc_trans_8[4][0] * pi2_src[4 * src_strd];
        ee1 = g_ai2_ihevc_trans_8[0][1] * pi2_src[0]
                        + g_ai2_ihevc_trans_8[4][1] * pi2_src[4 * src_strd];

        even[0] = ee0 + eo0;
        even[1] = ee1 + eo1;
        even[2] = ee1 - eo1;
        even[3] = ee0 - eo0;

        /* Final butterfly: output k takes the sum, its mirror (7 - k) the
         * difference of the same even/odd pair */
        for(k = 0; k < 4; k++)
        {
            pi2_dst[k] =
                            CLIP_S16(((even[k] + odd[k] + rnd) >> i4_shift));
            pi2_dst[7 - k] =
                            CLIP_S16(((even[k] - odd[k] + rnd) >> i4_shift));
        }
    }
}
384 
385 #else
void ihevc_itrans_8x8(WORD16 *pi2_src,
                      WORD16 *pi2_dst,
                      WORD32 src_strd,
                      WORD32 dst_strd,
                      WORD32 i4_shift,
                      WORD32 zero_cols)
{
    /* Transform Matrix 8x8                          */
    /*              0    1    2   3   4   5   6   7  */
    /*     0 -      64   64   64  64  64  64  64  64 */
    /*     1 -      89   75   50  18 -18 -50 -75 -89 */
    /*     2 -      83   36  -36 -83 -83 -36  36  83 */
    /*     3 -      75  -18  -89 -50  50  89  18 -75 */
    /*     4 -      64  -64  -64  64  64 -64 -64  64 */
    /*     5 -      50  -89   18  75 -75 -18  89 -50 */
    /*     6 -      36  -83   83 -36 -36  83 -83  36 */
    /*     7 -      18  -50   75 -89  89 -75  50 -18 */

    /* Rows 0 and 4 only need a scale by 64              */
    /* Rows 2 and 6 need only two distinct coefficients  */
    /* Rows 1, 3, 5 and 7 have odd mirror symmetry       */
    WORD32 j, k;
    WORD32 temp1, temp2;
    WORD32 e[4], o[4];
    WORD32 ee[2], eo[2];
    WORD32 add;

    /* Rounding offset applied just before the final right shift */
    add = 1 << (i4_shift - 1);

    /* One source column in, one destination row out per iteration; bit 0 of
     * zero_cols always refers to the column currently being processed. */
    for(j = 0; j < TRANS_SIZE_8; j++)
    {
        /* Checking for Zero Cols */
        if((zero_cols & 1) == 1)
        {
            memset(pi2_dst, 0, TRANS_SIZE_8 * sizeof(WORD16));
        }
        else
        {
            WORD32 s0 = pi2_src[0];
            WORD32 s1 = pi2_src[src_strd];
            WORD32 s2 = pi2_src[2 * src_strd];
            WORD32 s3 = pi2_src[3 * src_strd];
            WORD32 s4 = pi2_src[4 * src_strd];
            WORD32 s5 = pi2_src[5 * src_strd];
            WORD32 s6 = pi2_src[6 * src_strd];
            WORD32 s7 = pi2_src[7 * src_strd];

            /* Multiplications by 32 and 64 are written as multiplications */
            /* rather than left shifts: the operands may be negative and   */
            /* left-shifting a negative signed value is undefined in C.    */

            /* o[0] = 89*s1 + 75*s3 + 50*s5 + 18*s7                        */
            /* o[1] = 75*s1 - 18*s3 - 89*s5 - 50*s7                        */
            /* Factored so that the pair shares two products               */
            temp1 = (s1 + s3) * 75;
            temp2 = (s5 + s7) * 50;

            o[0] = temp1 + 14 * s1 + temp2 - s7 * 32;
            o[1] = temp1 - 93 * s3 - temp2 - 39 * s5;

            /* o[2] = 50*s1 - 89*s3 + 18*s5 + 75*s7                        */
            /* o[3] = 18*s1 - 50*s3 + 75*s5 - 89*s7                        */
            temp1 = (s1 - s3) * 50;
            temp2 = (s5 + s7) * 75;

            o[2] = temp1 - 39 * s3 + temp2 - 57 * s5;
            o[3] = temp1 - s1 * 32 + temp2 - 164 * s7;

            /* eo[0] = 83*s2 + 36*s6                                       */
            /* eo[1] = 36*s2 - 83*s6                                       */
            temp1 = (s2 + s6) * 36;
            eo[0] = temp1 + 47 * s2;
            eo[1] = temp1 - 119 * s6;

            /* ee[0] = 64*s0 + 64*s4                                       */
            /* ee[1] = 64*s0 - 64*s4                                       */
            ee[0] = (s0 + s4) * 64;
            ee[1] = (s0 - s4) * 64;

            e[0] = ee[0] + eo[0];
            e[3] = ee[0] - eo[0];
            e[1] = ee[1] + eo[1];
            e[2] = ee[1] - eo[1];

            for(k = 0; k < 4; k++)
            {
                pi2_dst[k] =
                                CLIP_S16(((e[k] + o[k] + add) >> i4_shift));
                pi2_dst[k + 4] =
                                CLIP_S16(((e[3 - k] - o[3 - k] + add) >> i4_shift));
            }
        }
        pi2_src++;
        pi2_dst += dst_strd;
        zero_cols = zero_cols >> 1;
    }

}
502 #endif
503 
504 
505 /**
506  *******************************************************************************
507  *
508  * @brief
509  *  This function performs Single stage  Inverse transform for 16x16 input
510  * block
511  *
512  * @par Description:
513  *  Performs single stage 16x16 inverse transform by  utilizing the symmetry
514  * of transformation matrix  and reducing number of multiplications wherever
515  * possible  but keeping the number of operations  (addition,multiplication
516  * and shift) same
517  *
518  * @param[in] pi2_src
519  *  Input 16x16 coefficients
520  *
521  * @param[out] pi2_dst
522  *  Output 16x16 block
523  *
524  * @param[in] src_strd
525  *  Input stride
526  *
527  * @param[in] dst_strd
528  *  Output Stride
529  *
530  * @param[in] i4_shift
531  *  Output shift
532  *
533  * @param[in] zero_cols
534  *  Zero columns in pi2_src
535  *
536  * @returns  Void
537  *
538  * @remarks
539  *  None
540  *
541  *******************************************************************************
542  */
543 
544 #if NON_OPTIMIZED
void ihevc_itrans_16x16(WORD16 *pi2_src,
                        WORD16 *pi2_dst,
                        WORD32 src_strd,
                        WORD32 dst_strd,
                        WORD32 i4_shift,
                        WORD32 zero_cols)
{
    /* Rounding offset applied just before the final right shift */
    const WORD32 rnd = 1 << (i4_shift - 1);
    WORD32 col, k, row;

    /* One source column in, one destination row out per pass; bit 0 of
     * zero_cols always refers to the column currently being processed. */
    for(col = 0; col < TRANS_SIZE_16;
        col++, pi2_src++, pi2_dst += dst_strd, zero_cols >>= 1)
    {
        WORD32 odd[8], even[8];
        WORD32 ee[4], eo[4];
        WORD32 eee[2], eeo[2];

        if((zero_cols & 1) != 0)
        {
            /* Column flagged as all-zero: emit a zero row and move on */
            memset(pi2_dst, 0, TRANS_SIZE_16 * sizeof(WORD16));
            continue;
        }

        /* Level 1 odd part: table rows 1, 3, ..., 15 */
        for(k = 0; k < 8; k++)
        {
            WORD32 acc = 0;

            for(row = 1; row < 16; row += 2)
            {
                acc += g_ai2_ihevc_trans_16[row][k] * pi2_src[row * src_strd];
            }
            odd[k] = acc;
        }

        /* Level 2 odd part: table rows 2, 6, 10, 14 */
        for(k = 0; k < 4; k++)
        {
            WORD32 acc = 0;

            for(row = 2; row < 16; row += 4)
            {
                acc += g_ai2_ihevc_trans_16[row][k] * pi2_src[row * src_strd];
            }
            eo[k] = acc;
        }

        /* Level 3: odd half from rows 4 and 12, even half from rows 0 and 8 */
        for(k = 0; k < 2; k++)
        {
            eeo[k] = g_ai2_ihevc_trans_16[4][k] * pi2_src[4 * src_strd]
                            + g_ai2_ihevc_trans_16[12][k] * pi2_src[12 * src_strd];
            eee[k] = g_ai2_ihevc_trans_16[0][k] * pi2_src[0]
                            + g_ai2_ihevc_trans_16[8][k] * pi2_src[8 * src_strd];
        }

        /* Fold the levels back together; at every level entry k receives
         * the sum and its mirror entry the difference of the same pair */
        ee[0] = eee[0] + eeo[0];
        ee[1] = eee[1] + eeo[1];
        ee[2] = eee[1] - eeo[1];
        ee[3] = eee[0] - eeo[0];

        for(k = 0; k < 4; k++)
        {
            even[k] = ee[k] + eo[k];
            even[7 - k] = ee[k] - eo[k];
        }

        for(k = 0; k < 8; k++)
        {
            pi2_dst[k] =
                            CLIP_S16(((even[k] + odd[k] + rnd) >> i4_shift));
            pi2_dst[15 - k] =
                            CLIP_S16(((even[k] - odd[k] + rnd) >> i4_shift));
        }
    }
}
639 #else
void ihevc_itrans_16x16(WORD16 *pi2_src,
                        WORD16 *pi2_dst,
                        WORD32 src_strd,
                        WORD32 dst_strd,
                        WORD32 i4_shift,
                        WORD32 zero_cols)
{
    WORD32 j, k;
    WORD32 e[8], o[8];
    WORD32 ee[4], eo[4];
    WORD32 eee[2], eeo[2];
    WORD32 add;
    WORD32 temp1, temp2;

    /* Rounding offset applied just before the final right shift */
    add = 1 << (i4_shift - 1);
    /***************************************************************************/
    /* Transform Matrix 16x16                                                  */
    /*       0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15     */
    /* 0  { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64},   */
    /* 1  { 90, 87, 80, 70, 57, 43, 25,  9, -9,-25,-43,-57,-70,-80,-87,-90},   */
    /* 2  { 89, 75, 50, 18,-18,-50,-75,-89,-89,-75,-50,-18, 18, 50, 75, 89},   */
    /* 3  { 87, 57,  9,-43,-80,-90,-70,-25, 25, 70, 90, 80, 43, -9,-57,-87},   */
    /* 4  { 83, 36,-36,-83,-83,-36, 36, 83, 83, 36,-36,-83,-83,-36, 36, 83},   */
    /* 5  { 80,  9,-70,-87,-25, 57, 90, 43,-43,-90,-57, 25, 87, 70, -9,-80},   */
    /* 6  { 75,-18,-89,-50, 50, 89, 18,-75,-75, 18, 89, 50,-50,-89,-18, 75},   */
    /* 7  { 70,-43,-87,  9, 90, 25,-80,-57, 57, 80,-25,-90, -9, 87, 43,-70},   */
    /* 8  { 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64},   */
    /* 9  { 57,-80,-25, 90, -9,-87, 43, 70,-70,-43, 87,  9,-90, 25, 80,-57},   */
    /* 10 { 50,-89, 18, 75,-75,-18, 89,-50,-50, 89,-18,-75, 75, 18,-89, 50},   */
    /* 11 { 43,-90, 57, 25,-87, 70,  9,-80, 80, -9,-70, 87,-25,-57, 90,-43},   */
    /* 12 { 36,-83, 83,-36,-36, 83,-83, 36, 36,-83, 83,-36,-36, 83,-83, 36},   */
    /* 13 { 25,-70, 90,-80, 43,  9,-57, 87,-87, 57, -9,-43, 80,-90, 70,-25},   */
    /* 14 { 18,-50, 75,-89, 89,-75, 50,-18,-18, 50,-75, 89,-89, 75,-50, 18},   */
    /* 15 {  9,-25, 43,-57, 70,-80, 87,-90, 90,-87, 80,-70, 57,-43, 25, -9}    */
    /***************************************************************************/

    /* One source column in, one destination row out per iteration; bit 0 of
     * zero_cols always refers to the column currently being processed. */
    for(j = 0; j < TRANS_SIZE_16; j++)
    {
        /* Checking for Zero Cols */
        if((zero_cols & 1) == 1)
        {
            memset(pi2_dst, 0, TRANS_SIZE_16 * sizeof(WORD16));
        }
        else
        {
            /* Coefficients of the current column, widened to 32 bits */
            WORD32 s[16];

            for(k = 0; k < 16; k++)
            {
                s[k] = pi2_src[k * src_strd];
            }

            /* Multiplications by 32 and 64 are written as multiplications */
            /* rather than left shifts: the operands may be negative and   */
            /* left-shifting a negative signed value is undefined in C.    */

            /* Level 1 odd part: o[k] is column k of rows 1, 3, ..., 15    */
            o[0] = 90 * s[1] + 87 * s[3] + 80 * s[5] + 70 * s[7]
                            + 57 * s[9] + 43 * s[11] + 25 * s[13] + 9 * s[15];

            o[1] = 87 * s[1] + 57 * s[3] + 9 * s[5] + -43 * s[7]
                            + -80 * s[9] + -90 * s[11] + -70 * s[13] + -25 * s[15];

            o[2] = 80 * s[1] + 9 * s[3] + -70 * s[5] + -87 * s[7]
                            + -25 * s[9] + 57 * s[11] + 90 * s[13] + 43 * s[15];

            o[3] = 70 * s[1] + -43 * s[3] + -87 * s[5] + 9 * s[7]
                            + 90 * s[9] + 25 * s[11] + -80 * s[13] + -57 * s[15];

            o[4] = 57 * s[1] + -80 * s[3] + -25 * s[5] + 90 * s[7]
                            + -9 * s[9] + -87 * s[11] + 43 * s[13] + 70 * s[15];

            o[5] = 43 * s[1] + -90 * s[3] + 57 * s[5] + 25 * s[7]
                            + -87 * s[9] + 70 * s[11] + 9 * s[13] + -80 * s[15];

            o[6] = 25 * s[1] + -70 * s[3] + 90 * s[5] + -80 * s[7]
                            + 43 * s[9] + 9 * s[11] + -57 * s[13] + 87 * s[15];

            o[7] = 9 * s[1] + -25 * s[3] + 43 * s[5] + -57 * s[7]
                            + 70 * s[9] + -80 * s[11] + 87 * s[13] + -90 * s[15];

            /* Level 2 odd part (rows 2, 6, 10, 14), factored so that each */
            /* pair of outputs shares two products:                        */
            /*   eo[0] = 89*s2 + 75*s6 + 50*s10 + 18*s14                   */
            /*   eo[1] = 75*s2 - 18*s6 - 89*s10 - 50*s14                   */
            temp1 = (s[2] + s[6]) * 75;
            temp2 = (s[10] + s[14]) * 50;
            eo[0] = temp1 + 14 * s[2] + temp2 - s[14] * 32;
            eo[1] = temp1 - 93 * s[6] - temp2 - 39 * s[10];

            /*   eo[2] = 50*s2 - 89*s6 + 18*s10 + 75*s14                   */
            /*   eo[3] = 18*s2 - 50*s6 + 75*s10 - 89*s14                   */
            temp1 = (s[2] - s[6]) * 50;
            temp2 = (s[10] + s[14]) * 75;
            eo[2] = temp1 - 39 * s[6] + temp2 - 57 * s[10];
            eo[3] = temp1 - s[2] * 32 + temp2 - 164 * s[14];

            /* Level 3 odd part (rows 4, 12):                              */
            /*   eeo[0] = 83*s4 + 36*s12,  eeo[1] = 36*s4 - 83*s12         */
            temp1 = (s[4] + s[12]) * 36;
            eeo[0] = temp1 + 47 * s[4];
            eeo[1] = temp1 - 119 * s[12];

            /* Level 3 even part (rows 0, 8): only +/-64 coefficients      */
            eee[0] = (s[0] + s[8]) * 64;
            eee[1] = (s[0] - s[8]) * 64;

            /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
            for(k = 0; k < 2; k++)
            {
                ee[k] = eee[k] + eeo[k];
                ee[k + 2] = eee[1 - k] - eeo[1 - k];
            }
            for(k = 0; k < 4; k++)
            {
                e[k] = ee[k] + eo[k];
                e[k + 4] = ee[3 - k] - eo[3 - k];
            }
            for(k = 0; k < 8; k++)
            {
                pi2_dst[k] =
                                CLIP_S16(((e[k] + o[k] + add) >> i4_shift));
                pi2_dst[k + 8] =
                                CLIP_S16(((e[7 - k] - o[7 - k] + add) >> i4_shift));
            }
        }
        pi2_src++;
        pi2_dst += dst_strd;
        zero_cols = zero_cols >> 1;
    }
}
803 #endif
804 
805 /**
806  *******************************************************************************
807  *
808  * @brief
809  *  This function performs Single stage  Inverse transform for 32x32 input
810  * block
811  *
812  * @par Description:
813  *  Performs single stage 32x32 inverse transform by  utilizing the symmetry
814  * of transformation matrix and  reducing number of multiplications wherever
815  * possible  but keeping the number of operations  (addition,multiplication
816  * and shift) same
817  *
818  * @param[in] pi2_src
819  *  Input 32x32 coefficients
820  *
821  * @param[out] pi2_dst
822  *  Output 32x32 block
823  *
824  * @param[in] src_strd
825  *  Input stride
826  *
827  * @param[in] dst_strd
828  *  Output Stride
829  *
830  * @param[in] i4_shift
831  *  Output shift
832  *
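 * @param[in] zero_cols
 *  Bit mask, LSB first: when bit j is set, column j of pi2_src is not read
 *  and the 32 outputs produced for that column are written as zeros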
835  *
836  * @returns  Void
837  *
838  * @remarks
839  *  None
840  *
841  *******************************************************************************
842  */
843 
844 
void ihevc_itrans_32x32(WORD16 *pi2_src,
                        WORD16 *pi2_dst,
                        WORD32 src_strd,
                        WORD32 dst_strd,
                        WORD32 i4_shift,
                        WORD32 zero_cols)
{
    /* Partial sums of the even/odd decomposition, one pair per level */
    WORD32 ai4_even[16], ai4_odd[16];
    WORD32 ai4_ee[8], ai4_eo[8];
    WORD32 ai4_eee[4], ai4_eeo[4];
    WORD32 ai4_eeee[2], ai4_eeeo[2];
    WORD32 col, idx, row;
    /* Rounding offset added before the final right shift by i4_shift */
    const WORD32 i4_rnd = 1 << (i4_shift - 1);

    /*
     * One pass per input column: the column is read from pi2_src with a
     * row step of src_strd and its 32 results are stored contiguously at
     * pi2_dst. After each pass the source moves one column to the right,
     * the destination moves by dst_strd and the next zero_cols bit becomes
     * the LSB.
     */
    for(col = 0; col < TRANS_SIZE_32;
        col++, pi2_src++, pi2_dst += dst_strd, zero_cols >>= 1)
    {
        /* Column flagged in zero_cols: emit zeros without reading input */
        if(zero_cols & 1)
        {
            memset(pi2_dst, 0, TRANS_SIZE_32 * sizeof(WORD16));
            continue;
        }

        /* Odd part: input rows 1, 3, ..., 31 */
        for(idx = 0; idx < 16; idx++)
        {
            WORD32 i4_acc = 0;

            for(row = 1; row < 32; row += 2)
            {
                i4_acc += g_ai2_ihevc_trans_32[row][idx]
                                * pi2_src[row * src_strd];
            }
            ai4_odd[idx] = i4_acc;
        }

        /* Even-odd part: input rows 2, 6, ..., 30 */
        for(idx = 0; idx < 8; idx++)
        {
            WORD32 i4_acc = 0;

            for(row = 2; row < 32; row += 4)
            {
                i4_acc += g_ai2_ihevc_trans_32[row][idx]
                                * pi2_src[row * src_strd];
            }
            ai4_eo[idx] = i4_acc;
        }

        /* Even-even-odd part: input rows 4, 12, 20, 28 */
        for(idx = 0; idx < 4; idx++)
        {
            WORD32 i4_acc = 0;

            for(row = 4; row < 32; row += 8)
            {
                i4_acc += g_ai2_ihevc_trans_32[row][idx]
                                * pi2_src[row * src_strd];
            }
            ai4_eeo[idx] = i4_acc;
        }

        /* Innermost level: rows 8 and 24 (odd), rows 0 and 16 (even) */
        for(idx = 0; idx < 2; idx++)
        {
            ai4_eeeo[idx] = g_ai2_ihevc_trans_32[8][idx]
                            * pi2_src[8 * src_strd]
                            + g_ai2_ihevc_trans_32[24][idx]
                                            * pi2_src[24 * src_strd];
            ai4_eeee[idx] = g_ai2_ihevc_trans_32[0][idx] * pi2_src[0]
                            + g_ai2_ihevc_trans_32[16][idx]
                                            * pi2_src[16 * src_strd];
        }

        /*
         * Butterfly recombination, doubling the vector length at each
         * level: the sum fills position idx and the difference fills the
         * mirrored position from the end.
         */
        for(idx = 0; idx < 2; idx++)
        {
            ai4_eee[idx] = ai4_eeee[idx] + ai4_eeeo[idx];
            ai4_eee[3 - idx] = ai4_eeee[idx] - ai4_eeeo[idx];
        }
        for(idx = 0; idx < 4; idx++)
        {
            ai4_ee[idx] = ai4_eee[idx] + ai4_eeo[idx];
            ai4_ee[7 - idx] = ai4_eee[idx] - ai4_eeo[idx];
        }
        for(idx = 0; idx < 8; idx++)
        {
            ai4_even[idx] = ai4_ee[idx] + ai4_eo[idx];
            ai4_even[15 - idx] = ai4_ee[idx] - ai4_eo[idx];
        }

        /* Final level: round, shift and clip to the 16-bit output */
        for(idx = 0; idx < 16; idx++)
        {
            pi2_dst[idx] =
                            CLIP_S16(((ai4_even[idx] + ai4_odd[idx] + i4_rnd) >> i4_shift));
            pi2_dst[31 - idx] =
                            CLIP_S16(((ai4_even[idx] - ai4_odd[idx] + i4_rnd) >> i4_shift));
        }
    }
}
974 
975