1 /******************************************************************************
2  *
3  * Copyright (C) 2018 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 /**
21 ******************************************************************************
22 * @file
23 *  ihevce_subpel_neon.c
24 *
25 * @brief
26 *  Subpel refinement modules for ME algo
27 *
28 * @author
29 *  Ittiam
30 *
31 * @par List of Functions:
32 *
33 * @remarks
34 *  None
35 *
36 ********************************************************************************
37 */
38 
39 /*****************************************************************************/
40 /* File Includes                                                             */
41 /*****************************************************************************/
42 /* System include files */
43 #include <stdio.h>
44 #include <string.h>
45 #include <assert.h>
46 #include <arm_neon.h>
47 
48 /* User include files */
49 #include "ihevc_typedefs.h"
50 #include "itt_video_api.h"
51 #include "ihevc_cmn_utils_neon.h"
52 #include "ihevc_chroma_itrans_recon.h"
53 #include "ihevc_chroma_intra_pred.h"
54 #include "ihevc_debug.h"
55 #include "ihevc_deblk.h"
56 #include "ihevc_defs.h"
57 #include "ihevc_itrans_recon.h"
58 #include "ihevc_intra_pred.h"
59 #include "ihevc_inter_pred.h"
60 #include "ihevc_macros.h"
61 #include "ihevc_mem_fns.h"
62 #include "ihevc_padding.h"
63 #include "ihevc_quant_iquant_ssd.h"
64 #include "ihevc_resi_trans.h"
65 #include "ihevc_sao.h"
66 #include "ihevc_structs.h"
67 #include "ihevc_weighted_pred.h"
68 
69 #include "rc_cntrl_param.h"
70 #include "rc_frame_info_collector.h"
71 #include "rc_look_ahead_params.h"
72 
73 #include "ihevce_api.h"
74 #include "ihevce_defs.h"
75 #include "ihevce_lap_enc_structs.h"
76 #include "ihevce_multi_thrd_structs.h"
77 #include "ihevce_function_selector.h"
78 #include "ihevce_me_common_defs.h"
79 #include "ihevce_enc_structs.h"
80 #include "ihevce_had_satd.h"
81 #include "ihevce_ipe_instr_set_router.h"
82 #include "ihevce_global_tables.h"
83 
84 #include "hme_datatype.h"
85 #include "hme_common_defs.h"
86 #include "hme_interface.h"
87 #include "hme_defs.h"
88 
89 #include "ihevce_me_instr_set_router.h"
90 
91 /*****************************************************************************/
92 /* Function Declarations                                                     */
93 /*****************************************************************************/
/* Declared through the FT_CALC_SATD_AND_RESULT function type; the matching
 * definition appears later in this file.
 * NOTE(review): presumably this is the typedef the ME instruction-set router
 * uses for this hook - confirm in ihevce_me_instr_set_router.h. */
FT_CALC_SATD_AND_RESULT hme_evalsatd_update_1_best_result_pt_pu_16x16_neon;

/* Prototype only - the function is not defined in this file.
 * It is called from hme_evalsatd_update_1_best_result_pt_pu_16x16_neon() once
 * per 8x8 quadrant with pi2_dst4x4 = NULL, dst_strd = 0, hsad_stride = 4 and
 * i4_frm_qstep = 0. The caller treats the return value as the quadrant's SATD
 * and reads per-4x4 values back from pi4_hsad. */
WORD32 ihevce_had4_4x4_neon(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    WORD16 *pi2_dst4x4,
    WORD32 dst_strd,
    WORD32 *pi4_hsad,
    WORD32 hsad_stride,
    WORD32 i4_frm_qstep);
106 
107 /*****************************************************************************/
108 /* Function Definitions                                                      */
109 /*****************************************************************************/
110 
/**
 * @brief Rounded average of two 4x4 blocks of 8-bit samples.
 *
 * Each source block is fetched into a single 16-byte vector with
 * load_unaligned_u8q(), the two vectors are combined lane-wise with
 * vrhaddq_u8() (rounding halving add), and the result is written out with
 * store_unaligned_u8q().
 * NOTE(review): the helpers presumably pack/unpack four 4-byte rows using the
 * given stride - confirm in ihevc_cmn_utils_neon.h.
 *
 * @param pu1_src_a   first source block
 * @param pu1_src_b   second source block
 * @param src_a_strd  stride passed to the loader for pu1_src_a
 * @param src_b_strd  stride passed to the loader for pu1_src_b
 * @param pu1_dst     destination block
 * @param dst_strd    stride passed to the store helper for pu1_dst
 */
static void hme_4x4_qpel_interp_avg_neon(
    UWORD8 *pu1_src_a,
    UWORD8 *pu1_src_b,
    WORD32 src_a_strd,
    WORD32 src_b_strd,
    UWORD8 *pu1_dst,
    WORD32 dst_strd)
{
    const uint8x16_t blk_a = load_unaligned_u8q(pu1_src_a, src_a_strd);
    const uint8x16_t blk_b = load_unaligned_u8q(pu1_src_b, src_b_strd);

    store_unaligned_u8q(pu1_dst, dst_strd, vrhaddq_u8(blk_a, blk_b));
}
125 
/**
 * @brief Rounded average of two 8-byte-wide strips, processed row by row.
 *
 * For each of the ht rows, 8 bytes are loaded from both sources, combined
 * with vrhadd_u8() (rounding halving add per byte lane) and stored to the
 * destination. Nothing is done when ht is zero or negative.
 *
 * @param pu1_src_a   first source, top-left of the strip
 * @param pu1_src_b   second source, top-left of the strip
 * @param src_a_strd  row stride of pu1_src_a in bytes
 * @param src_b_strd  row stride of pu1_src_b in bytes
 * @param pu1_dst     destination, top-left of the strip
 * @param dst_strd    row stride of pu1_dst in bytes
 * @param ht          number of rows to process
 */
static void hme_8xn_qpel_interp_avg_neon(
    UWORD8 *pu1_src_a,
    UWORD8 *pu1_src_b,
    WORD32 src_a_strd,
    WORD32 src_b_strd,
    UWORD8 *pu1_dst,
    WORD32 dst_strd,
    WORD32 ht)
{
    WORD32 rows_left;

    for(rows_left = ht; rows_left > 0; rows_left--)
    {
        const uint8x8_t row_a = vld1_u8(pu1_src_a);
        const uint8x8_t row_b = vld1_u8(pu1_src_b);

        vst1_u8(pu1_dst, vrhadd_u8(row_a, row_b));

        /* Step all three pointers to the next row */
        pu1_dst += dst_strd;
        pu1_src_b += src_b_strd;
        pu1_src_a += src_a_strd;
    }
}
149 
/**
 * @brief Rounded average of two 16-byte-wide strips, processed row by row.
 *
 * For each of the ht rows, 16 bytes are loaded from both sources, combined
 * with vrhaddq_u8() (rounding halving add per byte lane) and stored to the
 * destination. Nothing is done when ht is zero or negative.
 *
 * @param pu1_src_a   first source, top-left of the strip
 * @param pu1_src_b   second source, top-left of the strip
 * @param src_a_strd  row stride of pu1_src_a in bytes
 * @param src_b_strd  row stride of pu1_src_b in bytes
 * @param pu1_dst     destination, top-left of the strip
 * @param dst_strd    row stride of pu1_dst in bytes
 * @param ht          number of rows to process
 */
static void hme_16xn_qpel_interp_avg_neon(
    UWORD8 *pu1_src_a,
    UWORD8 *pu1_src_b,
    WORD32 src_a_strd,
    WORD32 src_b_strd,
    UWORD8 *pu1_dst,
    WORD32 dst_strd,
    WORD32 ht)
{
    WORD32 rows_left;

    for(rows_left = ht; rows_left > 0; rows_left--)
    {
        const uint8x16_t row_a = vld1q_u8(pu1_src_a);
        const uint8x16_t row_b = vld1q_u8(pu1_src_b);

        vst1q_u8(pu1_dst, vrhaddq_u8(row_a, row_b));

        /* Step all three pointers to the next row */
        pu1_dst += dst_strd;
        pu1_src_b += src_b_strd;
        pu1_src_a += src_a_strd;
    }
}
173 
/**
 * @brief Rounded average of two 32-byte-wide strips, processed row by row.
 *
 * Each row is handled as two 16-byte halves (offsets 0 and 16). Both halves
 * are loaded and averaged with vrhaddq_u8() before either half is stored.
 * Nothing is done when ht is zero or negative.
 *
 * @param pu1_src_a   first source, top-left of the strip
 * @param pu1_src_b   second source, top-left of the strip
 * @param src_a_strd  row stride of pu1_src_a in bytes
 * @param src_b_strd  row stride of pu1_src_b in bytes
 * @param pu1_dst     destination, top-left of the strip
 * @param dst_strd    row stride of pu1_dst in bytes
 * @param ht          number of rows to process
 */
static void hme_32xn_qpel_interp_avg_neon(
    UWORD8 *pu1_src_a,
    UWORD8 *pu1_src_b,
    WORD32 src_a_strd,
    WORD32 src_b_strd,
    UWORD8 *pu1_dst,
    WORD32 dst_strd,
    WORD32 ht)
{
    WORD32 rows_left = ht;

    while(rows_left > 0)
    {
        /* Bytes 0..15 of the row */
        const uint8x16_t lo_a = vld1q_u8(pu1_src_a);
        const uint8x16_t lo_b = vld1q_u8(pu1_src_b);
        const uint8x16_t lo_avg = vrhaddq_u8(lo_a, lo_b);

        /* Bytes 16..31 of the row */
        const uint8x16_t hi_a = vld1q_u8(pu1_src_a + 16);
        const uint8x16_t hi_b = vld1q_u8(pu1_src_b + 16);
        const uint8x16_t hi_avg = vrhaddq_u8(hi_a, hi_b);

        vst1q_u8(pu1_dst, lo_avg);
        vst1q_u8(pu1_dst + 16, hi_avg);

        pu1_dst += dst_strd;
        pu1_src_b += src_b_strd;
        pu1_src_a += src_a_strd;
        rows_left--;
    }
}
202 
/**
 * @brief Rounded average of two blocks whose width and height are multiples
 *        of 4 (both are asserted).
 *
 * The block is walked in horizontal bands of 4 rows. Within a band the
 * remaining width is consumed greedily with the widest kernel that still
 * fits: 32, then 16, then 8, and finally the 4x4 kernel.
 *
 * @param pu1_src_a   first source, top-left of the block
 * @param pu1_src_b   second source, top-left of the block
 * @param src_a_strd  row stride of pu1_src_a in bytes
 * @param src_b_strd  row stride of pu1_src_b in bytes
 * @param pu1_dst     destination, top-left of the block
 * @param dst_strd    row stride of pu1_dst in bytes
 * @param blk_wd      block width in bytes
 * @param blk_ht      block height in rows
 */
static void hme_4mx4n_qpel_interp_avg_neon(
    UWORD8 *pu1_src_a,
    UWORD8 *pu1_src_b,
    WORD32 src_a_strd,
    WORD32 src_b_strd,
    UWORD8 *pu1_dst,
    WORD32 dst_strd,
    WORD32 blk_wd,
    WORD32 blk_ht)
{
    WORD32 row, col;

    assert(blk_wd % 4 == 0);
    assert(blk_ht % 4 == 0);

    for(row = 0; row < blk_ht; row += 4)
    {
        col = 0;

        while(col < blk_wd)
        {
            UWORD8 *pu1_a = pu1_src_a + col;
            UWORD8 *pu1_b = pu1_src_b + col;
            UWORD8 *pu1_out = pu1_dst + col;
            const WORD32 wd_left = blk_wd - col;

            if(wd_left >= 32)
            {
                hme_32xn_qpel_interp_avg_neon(
                    pu1_a, pu1_b, src_a_strd, src_b_strd, pu1_out, dst_strd, 4);
                col += 32;
            }
            else if(wd_left >= 16)
            {
                hme_16xn_qpel_interp_avg_neon(
                    pu1_a, pu1_b, src_a_strd, src_b_strd, pu1_out, dst_strd, 4);
                col += 16;
            }
            else if(wd_left >= 8)
            {
                hme_8xn_qpel_interp_avg_neon(
                    pu1_a, pu1_b, src_a_strd, src_b_strd, pu1_out, dst_strd, 4);
                col += 8;
            }
            else
            {
                hme_4x4_qpel_interp_avg_neon(
                    pu1_a, pu1_b, src_a_strd, src_b_strd, pu1_out, dst_strd);
                col += 4;
            }
        }

        /* Move down to the next band of 4 rows */
        pu1_dst += (4 * dst_strd);
        pu1_src_b += (4 * src_b_strd);
        pu1_src_a += (4 * src_a_strd);
    }
}
254 
/**
 * @brief Produces the prediction block for a quarter-pel motion vector and
 *        publishes it through ps_prms->pu1_final_out / i4_final_out_stride.
 *
 * The low two bits of each MV component select an entry of
 * gas_qpel_inp_buf_cfg; the remaining bits give the integer displacement into
 * the reference planes. The selected entry names two reference buffers with
 * x/y offsets:
 *  - if both buffer ids are equal, no averaging is done and the output simply
 *    points into that reference buffer (fxfy/hxfy/fxhy/hxhy positions, per the
 *    original comment);
 *  - otherwise the two buffers are averaged into
 *    ps_prms->apu1_interp_out[i4_buf_id] and the output points there.
 *
 * @param ps_prms    interpolation parameters; pu1_final_out and
 *                   i4_final_out_stride are written
 * @param i4_mv_x    horizontal MV in quarter-pel units
 * @param i4_mv_y    vertical MV in quarter-pel units
 * @param i4_buf_id  index of the output buffer in apu1_interp_out
 */
void hme_qpel_interp_avg_neon(interp_prms_t *ps_prms, S32 i4_mv_x, S32 i4_mv_y, S32 i4_buf_id)
{
    const S32 i4_stride = ps_prms->i4_ref_stride;

    /* Integer-pel part of the MV as a byte offset into a reference plane */
    const S32 i4_fpel_off = (i4_mv_x >> 2) + (i4_mv_y >> 2) * i4_stride;

    /* Descriptor chosen by the fractional part (y selects row, x selects column) */
    qpel_input_buf_cfg_t *ps_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y & 3][i4_mv_x & 3];

    U08 *pu1_first = ps_prms->ppu1_ref[ps_cfg->i1_buf_id1] +
                     (ps_cfg->i1_buf_xoff1 + i4_fpel_off) + (ps_cfg->i1_buf_yoff1 * i4_stride);

    if(ps_cfg->i1_buf_id1 == ps_cfg->i1_buf_id2)
    {
        /* Single-buffer position: hand back a pointer into the reference */
        ps_prms->pu1_final_out = pu1_first;
        ps_prms->i4_final_out_stride = i4_stride;
    }
    else
    {
        U08 *pu1_second = ps_prms->ppu1_ref[ps_cfg->i1_buf_id2] +
                          (ps_cfg->i1_buf_xoff2 + i4_fpel_off) +
                          (ps_cfg->i1_buf_yoff2 * i4_stride);
        U08 *pu1_avg = ps_prms->apu1_interp_out[i4_buf_id];

        hme_4mx4n_qpel_interp_avg_neon(
            pu1_first,
            pu1_second,
            i4_stride,
            i4_stride,
            pu1_avg,
            ps_prms->i4_out_stride,
            ps_prms->i4_blk_wd,
            ps_prms->i4_blk_ht);

        ps_prms->pu1_final_out = pu1_avg;
        ps_prms->i4_final_out_stride = ps_prms->i4_out_stride;
    }
}
303 
// TODO: Can this function and the one above be unified?
/**
 * @brief Averages the two reference buffers selected by a quarter-pel MV into
 *        ps_prms->apu1_interp_out[i4_buf_id] and records the result in the
 *        caller's per-buffer tables.
 *
 * The low two bits of each MV component select an entry of
 * gas_qpel_inp_buf_cfg; the remaining bits give the integer displacement.
 * Unlike hme_qpel_interp_avg_neon(), the averaging kernel is always run, even
 * when both descriptor entries name the same buffer, and ps_prms itself is
 * not modified.
 *
 * @param ps_prms           interpolation parameters (read only here)
 * @param i4_mv_x           horizontal MV in quarter-pel units
 * @param i4_mv_y           vertical MV in quarter-pel units
 * @param i4_buf_id         index of the output buffer; also the slot written
 *                          in ppu1_final and pi4_final_stride
 * @param ppu1_final        table receiving the output pointer
 * @param pi4_final_stride  table receiving the output stride
 */
void hme_qpel_interp_avg_1pt_neon(
    interp_prms_t *ps_prms,
    S32 i4_mv_x,
    S32 i4_mv_y,
    S32 i4_buf_id,
    U08 **ppu1_final,
    S32 *pi4_final_stride)
{
    const S32 i4_stride = ps_prms->i4_ref_stride;

    /* Integer-pel part of the MV as a byte offset into a reference plane */
    const S32 i4_fpel_off = (i4_mv_x >> 2) + (i4_mv_y >> 2) * i4_stride;

    /* Descriptor chosen by the fractional part (y selects row, x selects column) */
    qpel_input_buf_cfg_t *ps_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y & 3][i4_mv_x & 3];

    U08 *pu1_first = ps_prms->ppu1_ref[ps_cfg->i1_buf_id1] +
                     (ps_cfg->i1_buf_xoff1 + i4_fpel_off) + (ps_cfg->i1_buf_yoff1 * i4_stride);
    U08 *pu1_second = ps_prms->ppu1_ref[ps_cfg->i1_buf_id2] +
                      (ps_cfg->i1_buf_xoff2 + i4_fpel_off) + (ps_cfg->i1_buf_yoff2 * i4_stride);
    U08 *pu1_avg = ps_prms->apu1_interp_out[i4_buf_id];

    hme_4mx4n_qpel_interp_avg_neon(
        pu1_first,
        pu1_second,
        i4_stride,
        i4_stride,
        pu1_avg,
        ps_prms->i4_out_stride,
        ps_prms->i4_blk_wd,
        ps_prms->i4_blk_ht);

    ppu1_final[i4_buf_id] = pu1_avg;
    pi4_final_stride[i4_buf_id] = ps_prms->i4_out_stride;
}
348 
/**
 * @brief Interpolates the two quarter-pel points one step away from
 *        (i4_mv_x, i4_mv_y) in the vertical direction.
 *
 * (mv_y + 1) is produced first into output buffer 3, then (mv_y - 1) into
 * output buffer 1; the matching slots of ppu1_final / pi4_final_stride are
 * filled by hme_qpel_interp_avg_1pt_neon().
 *
 * @param ps_prms           interpolation parameters
 * @param i4_mv_x           horizontal MV of the centre point, quarter-pel units
 * @param i4_mv_y           vertical MV of the centre point, quarter-pel units
 * @param ppu1_final        table receiving the output pointers
 * @param pi4_final_stride  table receiving the output strides
 */
void hme_qpel_interp_avg_2pt_vert_with_reuse_neon(
    interp_prms_t *ps_prms, S32 i4_mv_x, S32 i4_mv_y, U08 **ppu1_final, S32 *pi4_final_stride)
{
    const S32 i4_buf_id_plus = 3;
    const S32 i4_buf_id_minus = 1;

    hme_qpel_interp_avg_1pt_neon(
        ps_prms, i4_mv_x, i4_mv_y + 1, i4_buf_id_plus, ppu1_final, pi4_final_stride);

    hme_qpel_interp_avg_1pt_neon(
        ps_prms, i4_mv_x, i4_mv_y - 1, i4_buf_id_minus, ppu1_final, pi4_final_stride);
}
356 
/**
 * @brief Interpolates the two quarter-pel points one step away from
 *        (i4_mv_x, i4_mv_y) in the horizontal direction.
 *
 * (mv_x + 1) is produced first into output buffer 2, then (mv_x - 1) into
 * output buffer 0; the matching slots of ppu1_final / pi4_final_stride are
 * filled by hme_qpel_interp_avg_1pt_neon().
 *
 * @param ps_prms           interpolation parameters
 * @param i4_mv_x           horizontal MV of the centre point, quarter-pel units
 * @param i4_mv_y           vertical MV of the centre point, quarter-pel units
 * @param ppu1_final        table receiving the output pointers
 * @param pi4_final_stride  table receiving the output strides
 */
void hme_qpel_interp_avg_2pt_horz_with_reuse_neon(
    interp_prms_t *ps_prms, S32 i4_mv_x, S32 i4_mv_y, U08 **ppu1_final, S32 *pi4_final_stride)
{
    const S32 i4_buf_id_plus = 2;
    const S32 i4_buf_id_minus = 0;

    hme_qpel_interp_avg_1pt_neon(
        ps_prms, i4_mv_x + 1, i4_mv_y, i4_buf_id_plus, ppu1_final, pi4_final_stride);

    hme_qpel_interp_avg_1pt_neon(
        ps_prms, i4_mv_x - 1, i4_mv_y, i4_buf_id_minus, ppu1_final, pi4_final_stride);
}
364 
/**
 * @brief Evaluates SATD for one candidate on a 16x16 block, fills the
 *        per-partition cost grid, and updates the best result of every valid
 *        partition that the candidate improves.
 *
 * Steps:
 *  1. ihevce_had4_4x4_neon() is called once per 8x8 quadrant; its return
 *     value is taken as the quadrant SATD and it fills ai4_satd_4x4 through
 *     the pi4_hsad argument.
 *  2. ps_prms->pi4_sad_grid is populated for 2Nx2N, NxN, Nx2N/2NxN and the
 *     asymmetric partitions from those sums.
 *  3. For each valid partition, total cost = clipped SATD + stored MV cost;
 *     if it is lower than the stored total cost, the stored cost, MV and
 *     reference index (row 0 of the refine context tables) are replaced.
 *
 * @param ps_prms         error-computation parameters (input/reference
 *                        pointers and strides, output SAD grid)
 * @param ps_result_prms  candidate MV / reference index and the sub-pel
 *                        refine context to update
 */
void hme_evalsatd_update_1_best_result_pt_pu_16x16_neon(
    err_prms_t *ps_prms, result_upd_prms_t *ps_result_prms)
{
    mv_refine_ctxt_t *ps_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
    S32 *pi4_grid = ps_prms->pi4_sad_grid;
    S32 *pi4_part_ids = &ps_ctxt->ai4_part_id[0];

    U08 *pu1_inp = ps_prms->pu1_inp;
    U08 *pu1_ref = ps_prms->pu1_ref;
    S32 inp_stride = ps_prms->i4_inp_stride;
    S32 ref_stride = ps_prms->i4_ref_stride;

    S32 ai4_satd_4x4[16];
    S32 ai4_satd_8x8[4];
    S32 satd_tl, satd_tr, satd_bl, satd_br, satd_16x16;
    S32 satd_amp_left, satd_amp_right, satd_amp_top, satd_amp_bottom;
    S32 bx, by, k;

    /* One HAD call per 8x8 quadrant, visited TL, TR, BL, BR.
     * NOTE(review): the base index (bx * 2 + by * 8) with a stride of 4
     * suggests the callee fills ai4_satd_4x4 as a 4x4 raster grid, whereas
     * the AMP sums below index it as if each quadrant owned four consecutive
     * entries (e.g. 0, 2, 8, 10 for the left column).
     * ihevce_had4_4x4_neon() is not visible here - confirm which layout it
     * writes. */
    for(by = 0; by < 2; by++)
    {
        for(bx = 0; bx < 2; bx++)
        {
            U08 *pu1_src = pu1_inp + bx * 8 + by * inp_stride * 8;
            U08 *pu1_pred = pu1_ref + bx * 8 + by * ref_stride * 8;
            S32 *pi4_hsad = &ai4_satd_4x4[bx * 2 + by * 8];

            ai4_satd_8x8[by * 2 + bx] = ihevce_had4_4x4_neon(
                pu1_src, inp_stride, pu1_pred, ref_stride, NULL, 0, pi4_hsad, 4, 0);
        }
    }

    satd_tl = ai4_satd_8x8[0];
    satd_tr = ai4_satd_8x8[1];
    satd_bl = ai4_satd_8x8[2];
    satd_br = ai4_satd_8x8[3];
    satd_16x16 = satd_tl + satd_tr + satd_bl + satd_br;

    /* 16x16 and the four 8x8 partitions */
    pi4_grid[PART_ID_2Nx2N] = satd_16x16;
    pi4_grid[PART_ID_NxN_TL] = satd_tl;
    pi4_grid[PART_ID_NxN_TR] = satd_tr;
    pi4_grid[PART_ID_NxN_BL] = satd_bl;
    pi4_grid[PART_ID_NxN_BR] = satd_br;

    /* 8x16 and 16x8 partitions */
    pi4_grid[PART_ID_Nx2N_L] = satd_tl + satd_bl;
    pi4_grid[PART_ID_Nx2N_R] = satd_tr + satd_br;
    pi4_grid[PART_ID_2NxN_T] = satd_tl + satd_tr;
    pi4_grid[PART_ID_2NxN_B] = satd_bl + satd_br;

    /* Narrow side of each asymmetric partition, summed from 4x4 SATDs */
    satd_amp_left = ai4_satd_4x4[0] + ai4_satd_4x4[2] + ai4_satd_4x4[8] + ai4_satd_4x4[10];
    satd_amp_right = ai4_satd_4x4[5] + ai4_satd_4x4[7] + ai4_satd_4x4[13] + ai4_satd_4x4[15];
    satd_amp_top = ai4_satd_4x4[0] + ai4_satd_4x4[1] + ai4_satd_4x4[4] + ai4_satd_4x4[5];
    satd_amp_bottom = ai4_satd_4x4[10] + ai4_satd_4x4[11] + ai4_satd_4x4[14] + ai4_satd_4x4[15];

    pi4_grid[PART_ID_nLx2N_L] = satd_amp_left;
    pi4_grid[PART_ID_nRx2N_R] = satd_amp_right;
    pi4_grid[PART_ID_2NxnU_T] = satd_amp_top;
    pi4_grid[PART_ID_2NxnD_B] = satd_amp_bottom;

    /* Wide side = whole block minus the narrow side */
    pi4_grid[PART_ID_nLx2N_R] = satd_16x16 - satd_amp_left;
    pi4_grid[PART_ID_nRx2N_L] = satd_16x16 - satd_amp_right;
    pi4_grid[PART_ID_2NxnU_B] = satd_16x16 - satd_amp_top;
    pi4_grid[PART_ID_2NxnD_T] = satd_16x16 - satd_amp_bottom;

    /* Replace the stored best of each valid partition when this candidate
     * has a strictly lower total cost */
    for(k = 0; k < ps_ctxt->i4_num_valid_parts; k++)
    {
        S32 part_id = pi4_part_ids[k];
        /* Context tables are indexed by partition id when more than 8
         * partitions are valid, otherwise by position in the valid list */
        S32 slot = (ps_ctxt->i4_num_valid_parts > 8) ? part_id : k;
        S32 mv_cost = ps_ctxt->i2_mv_cost[0][slot];
        S32 satd = CLIP3(pi4_grid[part_id], 0, 0x7fff);
        S32 cand_cost = CLIP_S16(satd + mv_cost);
        S32 stored_cost = CLIP_S16(ps_ctxt->i2_tot_cost[0][slot]);

        if(cand_cost >= stored_cost)
        {
            continue;
        }

        ps_ctxt->i2_tot_cost[0][slot] = cand_cost;
        ps_ctxt->i2_mv_cost[0][slot] = mv_cost;
        ps_ctxt->i2_mv_x[0][slot] = ps_result_prms->i2_mv_x;
        ps_ctxt->i2_mv_y[0][slot] = ps_result_prms->i2_mv_y;
        ps_ctxt->i2_ref_idx[0][slot] = ps_result_prms->i1_ref_idx;
    }
}
445