1 /******************************************************************************
2  *
3  * Copyright (C) 2018 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 /**
21 *******************************************************************************
22 * @file
23 *  ihevce_decomp_pre_intra_pass_neon.c
24 *
25 * @brief
26 *  Contains functions to perform input scaling
27 *
28 * @author
29 *  Ittiam
30 *
31 * @par List of Functions:
32 *
33 * @remarks
34 *  None
35 *
36 ********************************************************************************
37 */
38 /*****************************************************************************/
39 /* File Includes                                                             */
40 /*****************************************************************************/
41 /* System include files */
42 #include <stdio.h>
43 #include <string.h>
44 #include <assert.h>
45 #include <arm_neon.h>
46 
47 /* User include files */
48 #include "ihevc_typedefs.h"
49 #include "ihevc_macros.h"
50 #include "ihevc_platform_macros.h"
51 #include "itt_video_api.h"
52 #include "ihevc_defs.h"
53 #include "ihevc_cmn_utils_neon.h"
54 #include "ihevce_ipe_instr_set_router.h"
55 
56 /*****************************************************************************/
57 /* Function Definitions                                                      */
58 /*****************************************************************************/
/**
*******************************************************************************
* @brief
*  Reference C path: 2:1 down-scaling of a block in both directions using a
*  separable, symmetric 7-tap filter { -18, 0, 80, 132, 80, 0, -18 }
*  (taps sum to 1 << FILT_TAP_Q).
*
* @par Description:
*  Pass 1 (horizontal): source rows -3 .. ht + 1 (relative to pu1_src) are
*  filtered at every even column and stored at half width in the scratch
*  buffer. Each filtered sample reads source columns j - 3 .. j + 3.
*  Pass 2 (vertical): starting at scratch row 3 (i.e. source row 0), every
*  second scratch row is filtered using rows -3 .. +3 around it and written
*  to the destination.
*  Both passes add a rounding term, shift down by FILT_TAP_Q and pass the
*  result through CLIP_U8 before storing.
*
* @param[in]  pu1_src      top-left sample of the block to be scaled
* @param[in]  src_strd     stride of the source buffer
* @param[out] pu1_scrtch   scratch buffer; ht + 5 rows of scrtch_strd are written
* @param[in]  scrtch_strd  stride of the scratch buffer
* @param[out] pu1_dst      destination; (wd >> 1) samples are written per output row
* @param[in]  dst_strd     stride of the destination buffer
* @param[in]  ht           height of the source block
* @param[in]  wd           width of the source block
*******************************************************************************
*/
void ihevce_scaling_filter_mxn(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_scrtch,
    WORD32 scrtch_strd,
    UWORD8 *pu1_dst,
    WORD32 dst_strd,
    WORD32 ht,
    WORD32 wd)
{
#define FILT_TAP_Q 8
#define N_TAPS 7
    /* Only taps 0..3 are indexed below; the filter is symmetric about tap 3 */
    const WORD16 ai2_taps[N_TAPS] = { -18, 0, 80, 132, 80, 0, -18 };
    const WORD32 rnd = 1 << (FILT_TAP_Q - 1);
    const WORD32 out_wd = wd >> 1;
    UWORD8 *pu1_in_row = pu1_src - 3 * src_strd;
    UWORD8 *pu1_mid_row = pu1_scrtch;
    WORD32 row, col;
    WORD32 acc;

    /* Pass 1: horizontal filtering + column decimation.                     */
    /* Three extra rows above and two below are produced so that the         */
    /* vertical pass has its full 7-row support.                             */
    for(row = -3; row < ht + 2; row++)
    {
        for(col = 0; col < wd; col += 2)
        {
            const UWORD8 *pu1_c = pu1_in_row + col;

            acc = rnd;
            acc += ai2_taps[3] * pu1_c[0];
            acc += ai2_taps[2] * (pu1_c[-1] + pu1_c[1]);
            acc += ai2_taps[1] * (pu1_c[2] + pu1_c[-2]);
            acc += ai2_taps[0] * (pu1_c[3] + pu1_c[-3]);
            acc >>= FILT_TAP_Q;

            pu1_mid_row[col >> 1] = CLIP_U8(acc);
        }
        pu1_in_row += src_strd;
        pu1_mid_row += scrtch_strd;
    }

    /* Pass 2: vertical filtering + row decimation.                          */
    /* Scratch row 3 holds the filtered version of source row 0.             */
    pu1_mid_row = pu1_scrtch + 3 * scrtch_strd;
    for(row = 0; row < ht; row += 2)
    {
        for(col = 0; col < out_wd; col++)
        {
            const UWORD8 *pu1_c = pu1_mid_row + col;

            acc = rnd;
            acc += ai2_taps[3] * pu1_c[0];
            acc += ai2_taps[2] * (pu1_c[scrtch_strd] + pu1_c[-scrtch_strd]);
            acc += ai2_taps[1] * (pu1_c[2 * scrtch_strd] + pu1_c[-2 * scrtch_strd]);
            acc += ai2_taps[0] * (pu1_c[3 * scrtch_strd] + pu1_c[-3 * scrtch_strd]);
            acc >>= FILT_TAP_Q;

            pu1_dst[col] = CLIP_U8(acc);
        }
        pu1_dst += dst_strd;
        pu1_mid_row += (scrtch_strd << 1);
    }
}
114 
/**
*******************************************************************************
* @brief
*  Down-scales one block of the input by 2 in each direction and, for blocks
*  on the edges, extends the destination with replicated border samples.
*
* @par Description:
*  - A block within (N_TAPS >> 1) samples of any edge of the wd x ht area is
*    first copied (together with whatever real neighbours exist) into a local
*    buffer; the missing neighbours are then synthesised by replicating the
*    nearest row / column. Interior blocks are filtered straight from pu1_src.
*  - The block is filtered with ihevce_scaling_filter_mxn_neon when its width
*    is a multiple of 16, otherwise with ihevce_scaling_filter_mxn.
*  - Destination padding: 16 columns to the left for blocks with
*    wd_offset == 0; right padding for the block that ends at wd; for that
*    same block, 16 rows of top padding when ht_offset == 0 and bottom
*    padding when the block also ends at ht.
*
* @param[in]  pu1_src      base of the source; the block starts at
*                          pu1_src + wd_offset + ht_offset * src_strd
* @param[in]  src_strd     stride of the source
* @param[out] pu1_dst      destination row band for this block; only the
*                          horizontal offset (wd_offset >> 1) is applied here
* @param[in]  dst_strd     stride of the destination
* @param[in]  wd           total width the block offsets refer to (must be even)
* @param[in]  ht           total height the block offsets refer to (must be even)
* @param[in]  pu1_wkg_mem  scratch memory handed to the filter
* @param[in]  ht_offset    vertical position of the block
* @param[in]  block_ht     nominal block height (<= MAX_CTB_SIZE)
* @param[in]  wd_offset    horizontal position of the block
* @param[in]  block_wd     nominal block width (<= MAX_CTB_SIZE)
* @param[in]  pf_copy_2d   2D copy routine, called as
*                          (dst, dst_strd, src, src_strd, wd, ht)
*******************************************************************************
*/
void ihevce_scale_by_2_neon(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_dst,
    WORD32 dst_strd,
    WORD32 wd,
    WORD32 ht,
    UWORD8 *pu1_wkg_mem,
    WORD32 ht_offset,
    WORD32 block_ht,
    WORD32 wd_offset,
    WORD32 block_wd,
    FT_COPY_2D *pf_copy_2d)
{
#define MAX_BLK_SZ (MAX_CTB_SIZE + ((N_TAPS >> 1) << 1))
    /* Local copy of a boundary block, with room for the filter margin on all sides */
    UWORD8 au1_pad_blk[MAX_BLK_SZ * MAX_BLK_SZ];
    const UWORD32 u4_pad_strd = MAX_BLK_SZ;
    /* Number of neighbouring samples the filter needs on each side */
    const WORD32 margin = N_TAPS >> 1;
    /* Width of the fixed left / top padding applied to the destination */
    const WORD32 edge_pad = 16;
    /* Position of the block's top-left sample inside au1_pad_blk */
    UWORD8 *const pu1_pad_org = au1_pad_blk + u4_pad_strd * margin + margin;

    FT_SCALING_FILTER_BY_2 *apf_filter[2];
    UWORD8 *pu1_filt_in;
    WORD32 filt_in_strd;
    WORD32 at_left, at_right, at_top, at_bottom;
    WORD32 out_rows;
    WORD32 row;

    assert((wd & 1) == 0);
    assert((ht & 1) == 0);
    assert(block_wd <= MAX_CTB_SIZE);
    assert(block_ht <= MAX_CTB_SIZE);

    /* [0]: generic C filter, [1]: NEON filter (selected for widths that are multiples of 16) */
    apf_filter[0] = ihevce_scaling_filter_mxn;
    apf_filter[1] = ihevce_scaling_filter_mxn_neon;

    /* Which sides of the block lack a full filter margin of real samples */
    at_left = (wd_offset < margin);
    at_top = (ht_offset < margin);
    at_right = ((wd_offset + block_wd) > (wd - margin));
    at_bottom = ((ht_offset + block_ht) > (ht - margin));

    /* A block touching the right / bottom edge may be partial: use the remainder */
    if(at_right)
    {
        const WORD32 rem_wd = wd % block_wd;

        if(rem_wd != 0)
        {
            block_wd = rem_wd;
        }
    }
    if(at_bottom)
    {
        const WORD32 rem_ht = ht % block_ht;

        if(rem_ht != 0)
        {
            block_ht = rem_ht;
        }
    }

    if(at_left || at_right || at_top || at_bottom)
    {
        /* Real neighbours are copied only on the sides that are not at an edge */
        const WORD32 lead_cols = at_left ? 0 : margin;
        const WORD32 lead_rows = at_top ? 0 : margin;
        const WORD32 trail_cols = at_right ? 0 : margin;
        const WORD32 trail_rows = at_bottom ? 0 : margin;
        UWORD8 *pu1_from = pu1_src + (wd_offset - lead_cols) + (ht_offset - lead_rows) * src_strd;
        UWORD8 *pu1_to =
            au1_pad_blk + u4_pad_strd * (margin - lead_rows) + (margin - lead_cols);

        pf_copy_2d(
            pu1_to,
            u4_pad_strd,
            pu1_from,
            src_strd,
            block_wd + lead_cols + trail_cols,
            block_ht + lead_rows + trail_rows);

        pu1_filt_in = pu1_pad_org;
        filt_in_strd = u4_pad_strd;
    }
    else
    {
        /* Interior block: all neighbours exist in the source itself */
        pu1_filt_in = pu1_src + wd_offset + ht_offset * src_strd;
        filt_in_strd = src_strd;
    }

    /* Top margin: replicate the first block row upwards.                    */
    /* Rows are replicated before columns; the column passes below run over  */
    /* all block_ht + 2 * margin rows and so also fill the corners.          */
    if(at_top)
    {
        UWORD8 *pu1_first = au1_pad_blk + u4_pad_strd * margin;

        for(row = margin - 1; row >= 0; row--)
        {
            memcpy(au1_pad_blk + u4_pad_strd * row, pu1_first, block_wd + 2 * margin);
        }
    }

    /* Bottom margin: replicate the last block row downwards */
    if(at_bottom)
    {
        UWORD8 *pu1_last = au1_pad_blk + u4_pad_strd * margin + (block_ht - 1) * u4_pad_strd;

        for(row = 1; row <= margin; row++)
        {
            memcpy(pu1_last + u4_pad_strd * row, pu1_last, block_wd + 2 * margin);
        }
    }

    /* Left margin: replicate the first block column into the columns before it */
    if(at_left)
    {
        UWORD8 *pu1_row = au1_pad_blk;

        for(row = 0; row < block_ht + 2 * margin; row++, pu1_row += u4_pad_strd)
        {
            memset(pu1_row, pu1_row[margin], margin);
        }
    }

    /* Right margin: replicate the last block column into the columns after it */
    if(at_right)
    {
        UWORD8 *pu1_edge = au1_pad_blk + margin + block_wd - 1;

        for(row = 0; row < block_ht + 2 * margin; row++, pu1_edge += u4_pad_strd)
        {
            memset(pu1_edge + 1, pu1_edge[0], margin);
        }
    }

    /* Filter the block; the scratch stride equals the down-scaled block width */
    apf_filter[(block_wd % 16) == 0](
        pu1_filt_in,
        filt_in_strd,
        pu1_wkg_mem,
        block_wd >> 1,
        pu1_dst + (wd_offset >> 1),
        dst_strd,
        block_ht,
        block_wd);

    out_rows = block_ht >> 1;

    /* Destination left padding (16 samples) for the first block of a row */
    if(0 == wd_offset)
    {
        UWORD8 *pu1_row = pu1_dst;

        for(row = 0; row < out_rows; row++, pu1_row += dst_strd)
        {
            memset(pu1_row - edge_pad, pu1_row[0], edge_pad);
        }
    }

    /* Everything below runs only once the last block of a row has been produced */
    if(wd == wd_offset + block_wd)
    {
        const WORD32 out_wd = wd >> 1;
        /* 16 + distance to the next multiple of 16, plus 4 further samples */
        const WORD32 right_pad = edge_pad + CEIL16(out_wd) - out_wd + 4;
        UWORD8 *pu1_row = pu1_dst + out_wd - 1;

        /* Destination right padding: replicate the last output sample of each row */
        for(row = 0; row < out_rows; row++, pu1_row += dst_strd)
        {
            memset(pu1_row + 1, pu1_row[0], right_pad);
        }

        /* Destination top padding: 16 copies of the first (already padded) output row */
        if(0 == ht_offset)
        {
            UWORD8 *pu1_line = pu1_dst - edge_pad;

            for(row = 1; row <= edge_pad; row++)
            {
                memcpy(pu1_line - (row * dst_strd), pu1_line, dst_strd);
            }
        }

        /* Destination bottom padding: only when this block also ends at ht */
        if(ht == ht_offset + block_ht)
        {
            const WORD32 out_ht = ht >> 1;
            /* 16 + distance to the next multiple of 16, plus 4 further rows */
            const WORD32 bottom_pad = edge_pad + CEIL16(out_ht) - out_ht + 4;
            UWORD8 *pu1_line = pu1_dst + (((block_ht >> 1) - 1) * dst_strd) - edge_pad;

            for(row = 1; row <= bottom_pad; row++)
            {
                memcpy(pu1_line + (row * dst_strd), pu1_line, dst_strd);
            }
        }
    }
}
303