1 /******************************************************************************
2 *
3 * Copyright (C) 2018 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 *****************************************************************************
18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 /**
21 *******************************************************************************
22 * @file
23 * ihevce_decomp_pre_intra_pass_neon.c
24 *
25 * @brief
26 * Contains functions to perform input scaling
27 *
28 * @author
29 * Ittiam
30 *
31 * @par List of Functions:
32 *
33 * @remarks
34 * None
35 *
36 ********************************************************************************
37 */
38 /*****************************************************************************/
39 /* File Includes */
40 /*****************************************************************************/
41 /* System include files */
42 #include <stdio.h>
43 #include <string.h>
44 #include <assert.h>
45 #include <arm_neon.h>
46
47 /* User include files */
48 #include "ihevc_typedefs.h"
49 #include "ihevc_macros.h"
50 #include "ihevc_platform_macros.h"
51 #include "itt_video_api.h"
52 #include "ihevc_defs.h"
53 #include "ihevc_cmn_utils_neon.h"
54 #include "ihevce_ipe_instr_set_router.h"
55
56 /*****************************************************************************/
57 /* Function Definitions */
58 /*****************************************************************************/
/**
*******************************************************************************
*
* @brief
*  Scalar (plain C) separable 7-tap filter that halves a block in both
*  dimensions.
*
* @par Description:
*  Pass 1 filters horizontally and keeps every second column, writing the
*  result to the scratch buffer. It processes source rows -3 .. ht + 1
*  (ht + 5 rows in total) so that pass 2 has the rows its taps reach.
*  Pass 2 filters the scratch buffer vertically and keeps every second row,
*  writing (wd >> 1) bytes per output row to the destination.
*  The taps are { -18, 0, 80, 132, 80, 0, -18 }; results are rounded,
*  shifted right by FILT_TAP_Q and clipped with CLIP_U8.
*
* @param[in] pu1_src
*  Top-left pixel of the block. The filter reads 3 pixels to the left and
*  right of every processed column, 3 rows above row 0 and rows up to
*  ht + 1 below it.
*
* @param[in] src_strd
*  Source stride
*
* @param[out] pu1_scrtch
*  Scratch buffer for the horizontally filtered rows; scratch row k holds
*  source row (k - 3), so ht + 5 rows are written.
*
* @param[in] scrtch_strd
*  Scratch buffer stride
*
* @param[out] pu1_dst
*  Destination for the down-scaled block
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] ht
*  Block height in source pixels
*
* @param[in] wd
*  Block width in source pixels
*
* @returns
*  None
*
*******************************************************************************
*/
void ihevce_scaling_filter_mxn(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_scrtch,
    WORD32 scrtch_strd,
    UWORD8 *pu1_dst,
    WORD32 dst_strd,
    WORD32 ht,
    WORD32 wd)
{
/* Both macros are intentionally left defined: code further down the file */
/* uses N_TAPS.                                                            */
#define FILT_TAP_Q 8
#define N_TAPS 7
    const WORD16 i4_ftaps[N_TAPS] = { -18, 0, 80, 132, 80, 0, -18 };
    const WORD32 i4_round = 1 << (FILT_TAP_Q - 1);
    UWORD8 *pu1_in_row;
    UWORD8 *pu1_mid_row;
    WORD32 row, col;
    WORD32 acc;

    /* Pass 1: horizontal filter + column decimation for rows -3 .. ht + 1 */
    pu1_in_row = pu1_src - 3 * src_strd;
    pu1_mid_row = pu1_scrtch;
    for(row = 0; row < ht + 5; row++)
    {
        for(col = 0; col < wd; col += 2)
        {
            const UWORD8 *pu1_c = pu1_in_row + col;

            acc = i4_ftaps[3] * pu1_c[0];
            acc += i4_ftaps[2] * (pu1_c[-1] + pu1_c[1]);
            acc += i4_ftaps[1] * (pu1_c[2] + pu1_c[-2]);
            acc += i4_ftaps[0] * (pu1_c[3] + pu1_c[-3]);
            acc = (acc + i4_round) >> FILT_TAP_Q;

            pu1_mid_row[col >> 1] = CLIP_U8(acc);
        }
        pu1_in_row += src_strd;
        pu1_mid_row += scrtch_strd;
    }

    /* Pass 2: vertical filter + row decimation. Scratch row 3 corresponds */
    /* to source row 0.                                                     */
    pu1_mid_row = pu1_scrtch + 3 * scrtch_strd;
    for(row = 0; row < ht; row += 2)
    {
        for(col = 0; col < (wd >> 1); col++)
        {
            const UWORD8 *pu1_c = pu1_mid_row + col;

            acc = i4_ftaps[3] * pu1_c[0];
            acc += i4_ftaps[2] * (pu1_c[scrtch_strd] + pu1_c[-scrtch_strd]);
            acc += i4_ftaps[1] * (pu1_c[2 * scrtch_strd] + pu1_c[-2 * scrtch_strd]);
            acc += i4_ftaps[0] * (pu1_c[3 * scrtch_strd] + pu1_c[-3 * scrtch_strd]);
            acc = (acc + i4_round) >> FILT_TAP_Q;

            pu1_dst[col] = CLIP_U8(acc);
        }
        pu1_dst += dst_strd;
        pu1_mid_row += 2 * scrtch_strd;
    }
}
114
/**
*******************************************************************************
*
* @brief
*  Down-scales one block of a plane by 2 in both dimensions and extends the
*  destination borders when the block lies on a frame edge.
*
* @par Description:
*  - A block closer than (N_TAPS >> 1) pixels to any frame edge is copied
*    (through pf_copy_2d) into a local buffer and the missing filter support
*    is produced by replicating the nearest row / column. Other blocks are
*    filtered straight from the source.
*  - The block is filtered with ihevce_scaling_filter_mxn_neon when its
*    width is a multiple of 16, otherwise with ihevce_scaling_filter_mxn.
*  - When wd_offset is 0, 16 bytes to the left of every output row are
*    filled with the first output pixel of that row.
*  - When the block ends the row of blocks (wd == wd_offset + block_wd):
*    the bytes right of every output row are filled with the last output
*    pixel; if ht_offset is 0 the first output row is replicated into the
*    16 rows above; if the block also ends the frame the last output row is
*    replicated downwards.
*
* @param[in] pu1_src
*  Source plane; the block starts at pu1_src + wd_offset + ht_offset * src_strd
*
* @param[in] src_strd
*  Source stride
*
* @param[out] pu1_dst
*  Destination row start for this block. Only (wd_offset >> 1) is added
*  inside this function; no vertical offset is applied here.
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wd
*  Source width, asserted to be even
*
* @param[in] ht
*  Source height, asserted to be even
*
* @param[in] pu1_wkg_mem
*  Scratch memory handed to the filter with a stride of (block_wd >> 1)
*
* @param[in] ht_offset
*  Vertical position of the block inside the source
*
* @param[in] block_ht
*  Block height, asserted to be <= MAX_CTB_SIZE
*
* @param[in] wd_offset
*  Horizontal position of the block inside the source
*
* @param[in] block_wd
*  Block width, asserted to be <= MAX_CTB_SIZE
*
* @param[in] pf_copy_2d
*  2D copy routine, called as (dst, dst_strd, src, src_strd, wd, ht)
*
* @returns
*  None
*
*******************************************************************************
*/
void ihevce_scale_by_2_neon(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_dst,
    WORD32 dst_strd,
    WORD32 wd,
    WORD32 ht,
    UWORD8 *pu1_wkg_mem,
    WORD32 ht_offset,
    WORD32 block_ht,
    WORD32 wd_offset,
    WORD32 block_wd,
    FT_COPY_2D *pf_copy_2d)
{
#define MAX_BLK_SZ (MAX_CTB_SIZE + ((N_TAPS >> 1) << 1))
    UWORD8 au1_cpy[MAX_BLK_SZ * MAX_BLK_SZ];
    UWORD32 cpy_strd = MAX_BLK_SZ;
    /* number of support pixels the filter needs on each side of the block */
    const WORD32 tap_pad = N_TAPS >> 1;

    FT_SCALING_FILTER_BY_2 *pf_filter;
    UWORD8 *pu1_in;
    WORD32 in_strd;
    WORD32 at_left, at_right, at_top, at_bottom;
    WORD32 out_rows;
    WORD32 row;

    assert((wd & 1) == 0);
    assert((ht & 1) == 0);
    assert(block_wd <= MAX_CTB_SIZE);
    assert(block_ht <= MAX_CTB_SIZE);

    /* which frame edges does the filter support of this block cross? */
    at_left = (wd_offset < tap_pad) ? 1 : 0;
    at_top = (ht_offset < tap_pad) ? 1 : 0;
    at_right = ((wd_offset + block_wd) > (wd - tap_pad)) ? 1 : 0;
    at_bottom = ((ht_offset + block_ht) > (ht - tap_pad)) ? 1 : 0;

    /* last block of a row / column may be partial */
    if(at_right && (wd % block_wd != 0))
    {
        block_wd = (wd % block_wd);
    }
    if(at_bottom && (ht % block_ht != 0))
    {
        block_ht = (ht % block_ht);
    }

    if(at_left || at_right || at_bottom || at_top)
    {
        /* Edge block: gather the available pixels in au1_cpy so that the */
        /* block itself always starts at (tap_pad, tap_pad) in that buffer */
        UWORD8 *pu1_cpy_dst = au1_cpy;
        UWORD8 *pu1_cpy_src = pu1_src + wd_offset + ht_offset * src_strd;
        WORD32 cpy_wd = block_wd;
        WORD32 cpy_ht = block_ht;

        if(at_left)
        {
            pu1_cpy_dst += tap_pad;
        }
        else
        {
            pu1_cpy_src -= tap_pad;
            cpy_wd += tap_pad;
        }

        if(at_top)
        {
            pu1_cpy_dst += cpy_strd * tap_pad;
        }
        else
        {
            pu1_cpy_src -= src_strd * tap_pad;
            cpy_ht += tap_pad;
        }

        if(!at_right)
        {
            cpy_wd += tap_pad;
        }
        if(!at_bottom)
        {
            cpy_ht += tap_pad;
        }

        pf_copy_2d(pu1_cpy_dst, cpy_strd, pu1_cpy_src, src_strd, cpy_wd, cpy_ht);

        pu1_in = au1_cpy + cpy_strd * tap_pad + tap_pad;
        in_strd = cpy_strd;
    }
    else
    {
        /* Interior block: filter directly from the source */
        pu1_in = pu1_src + wd_offset + ht_offset * src_strd;
        in_strd = src_strd;
    }

    /* top padding: replicate the first block row into the rows above it */
    if(at_top)
    {
        UWORD8 *pu1_first_row = au1_cpy + cpy_strd * tap_pad;

        for(row = tap_pad - 1; row >= 0; row--)
        {
            memcpy(au1_cpy + cpy_strd * row, pu1_first_row, block_wd + 2 * tap_pad);
        }
    }

    /* bottom padding: replicate the last block row into the rows below it */
    if(at_bottom)
    {
        UWORD8 *pu1_last_row = au1_cpy + cpy_strd * (block_ht + 2);

        for(row = 1; row <= tap_pad; row++)
        {
            memcpy(pu1_last_row + cpy_strd * row, pu1_last_row, block_wd + 2 * tap_pad);
        }
    }

    /* left padding: done after top/bottom so the corners get filled too */
    if(at_left)
    {
        UWORD8 *pu1_row = au1_cpy;

        for(row = 0; row < block_ht + 2 * tap_pad; row++)
        {
            memset(pu1_row, pu1_row[tap_pad], tap_pad);
            pu1_row += cpy_strd;
        }
    }

    /* right padding */
    if(at_right)
    {
        UWORD8 *pu1_row = au1_cpy + tap_pad + block_wd;

        for(row = 0; row < block_ht + 2 * tap_pad; row++)
        {
            memset(pu1_row, pu1_row[-1], tap_pad);
            pu1_row += cpy_strd;
        }
    }

    /* NEON kernel is used only for widths that are a multiple of 16 */
    if(block_wd % 16 == 0)
    {
        pf_filter = ihevce_scaling_filter_mxn_neon;
    }
    else
    {
        pf_filter = ihevce_scaling_filter_mxn;
    }
    pf_filter(
        pu1_in,
        in_strd,
        pu1_wkg_mem,
        block_wd >> 1,
        pu1_dst + (wd_offset >> 1),
        dst_strd,
        block_ht,
        block_wd);

    out_rows = block_ht >> 1;

    /* Left padding of 16 for 1st block of every row */
    if(wd_offset == 0)
    {
        UWORD8 *pu1_line = pu1_dst;

        for(row = 0; row < out_rows; row++)
        {
            memset(pu1_line - 16, pu1_line[0], 16);
            pu1_line += dst_strd;
        }
    }

    /* Everything below is done only once the last block of a row of blocks */
    /* has been processed                                                   */
    if(wd != wd_offset + block_wd)
    {
        return;
    }

    /* Right padding of (16 + CEIL16(wd/2) - wd/2 + 4) bytes per output row */
    {
        WORD32 right_pad = 16 + CEIL16((wd >> 1)) - (wd >> 1) + 4;
        UWORD8 *pu1_last_pix = pu1_dst + (wd >> 1) - 1;

        for(row = 0; row < out_rows; row++)
        {
            memset(pu1_last_pix + 1, pu1_last_pix[0], right_pad);
            pu1_last_pix += dst_strd;
        }
    }

    /* Top padding of 16 rows: each copy is dst_strd bytes long and starts */
    /* at the left pad of the first output row                              */
    if(ht_offset == 0)
    {
        UWORD8 *pu1_first_line = pu1_dst - 16;

        for(row = 1; row <= 16; row++)
        {
            memcpy(pu1_first_line - (row * dst_strd), pu1_first_line, dst_strd);
        }
    }

    /* Bottom padding of (16 + CEIL16(ht/2) - ht/2 + 4) rows, only once the */
    /* end of the frame has been reached                                     */
    if(ht - ht_offset - block_ht == 0)
    {
        WORD32 bottom_pad = 16 + CEIL16((ht >> 1)) - (ht >> 1) + 4;
        UWORD8 *pu1_last_line = pu1_dst + ((out_rows - 1) * dst_strd) - 16;

        for(row = 1; row <= bottom_pad; row++)
        {
            memcpy(pu1_last_line + (row * dst_strd), pu1_last_line, dst_strd);
        }
    }
}
303