/******************************************************************************
 *
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/

/**
*******************************************************************************
* @file
*  ihevc_quant_iquant_ssd_neon_intr.c
*
* @brief
*  Contains function definitions for quantization followed by inverse
*  quantization, used to compute the transform-domain SSD
*
* @author
*  100736
*
* @par List of Functions:
*   - ihevc_quant_iquant_ssd_flat_scale_mat_neon()
*   - ihevc_q_iq_ssd_flat_scale_mat_var_rnd_fact_neon()
*
* @remarks
*
*
*******************************************************************************
*/
/* System include files */
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

/* User include files */
#include "ihevc_typedefs.h"
#include "ihevc_macros.h"
#include "ihevc_platform_macros.h"
#include "ihevc_defs.h"
#include "ihevc_debug.h"
#include "ihevc_trans_tables.h"
#include "ihevc_quant_iquant_ssd.h"
#include "ihevc_func_selector.h"
#include "ihevc_trans_macros.h"
#include "arm_neon.h"

/*****************************************************************************/
/* Function Definitions                                                      */
/*****************************************************************************/

WORD32 ihevc_quant_iquant_ssd_flat_scale_mat_neon(
    WORD16 *pi2_coeffs,
    WORD16 *pi2_quant_coeff,
    WORD16 *pi2_q_dst,
    WORD16 *pi2_iq_dst,
    WORD32 trans_size,
    WORD32 qp_div,
    WORD32 qp_rem,
    WORD32 q_add,
    WORD32 *pi4_quant_round_factor_0_1,
    WORD32 *pi4_quant_round_factor_1_2,
    WORD32 src_strd,
    WORD32 dst_q_strd,
    WORD32 dst_iq_strd,
    UWORD8 *csbf,
    WORD32 csbf_strd,
    WORD32 *zero_col,
    WORD32 *zero_row,
    WORD16 *pi2_dequant_coeff,
    LWORD64 *pi8_cost)
{
    WORD32 i, j;
    WORD32 log2_size;
    WORD32 cbf = 0;

    WORD16 qm = 4;
    WORD16 bd = 8;
    WORD32 q_bits, tr, temp;
    WORD32 block_col = 0;
    WORD32 block_row = 0;
    WORD32 temp_zero_col = 0;
    WORD32 temp_zero_row = 0;

    WORD32 sh;
    WORD32 s_iq;
    WORD32 sh_tmp;

    // ssd
    int32x4_t ssd0 = vdupq_n_s32(0);
    int32x2_t ssd1;
    WORD32 ssd;
    // const
    const int16x4_t zero = vdup_n_s16(0);
    const int16x4_t zero_d = vdup_n_s16(0);
    const int16x4_t sq = vdup_n_s16(g_ihevc_quant_scales[qp_rem]);
    const int16x4_t siq = vdup_n_s16((g_ihevc_iquant_scales_flat_scale[qp_rem]));
    // src
    int16x4_t s0, s1, s2, s3;
    // q-iq
    int16x4_t q0, q1, q2, q3;
    int16x4_t iq0, iq1, iq2, iq3;
    // residue
    int32x4_t r0, r1, r2, r3;
    // sign
    uint16x4_t psgn0, psgn1, psgn2, psgn3;
    uint16x4_t nsgn0, nsgn1, nsgn2, nsgn3;
    // abs(src)
    int16x4_t abs_s0, abs_s1, abs_s2, abs_s3;
    // q-temp
    int32x4_t qtmp_0, qtmp_1, qtmp_2, qtmp_3;
    int16x4_t pq0, pq1, pq2, pq3;
    int16x4_t nq0, nq1, nq2, nq3;
    // iq-temp
    int32x4_t iqtmp_0, iqtmp_1, iqtmp_2, iqtmp_3;

    int32x4_t add_q;
    int32x4_t add_iq = vdupq_n_s32(1);
    int32x4_t sh_iq_1;
    int32x4_t sh_iq;
    int32x4_t q_v_bits;

    (void)pi4_quant_round_factor_0_1;
    (void)pi4_quant_round_factor_1_2;
    (void)pi2_dequant_coeff;

    GETRANGE(log2_size, trans_size);
    log2_size -= 1;

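    /* Quantization: q = sign(c) * ((|c| * quant_scale + round) >> q_bits).   */
    /* q_bits combines the base QUANT_SHIFT, qp/6 and the transform-size      */
    /* normalization, corrected for the Q-formats of the scaling matrices.    */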
    tr = MAX_TR_DYNAMIC_RANGE - bd - log2_size;
    q_bits = QUANT_SHIFT + qp_div + tr + SCALING_Q_SHIFT - qm - FLAT_RESCALE_MAT_Q_SHIFT;
    temp = (((WORD32)q_add) << (q_bits - QUANT_ROUND_FACTOR_Q));

    q_v_bits = vdupq_n_s32(-q_bits);
    add_q = vdupq_n_s32(temp);

    sh = bd + log2_size - 5;

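    /* Inverse quantization with the flat scale folded in:                    */
    /* iq = (q * iquant_scale + (1 << (sh - qp_div - 1))) >> (sh - qp_div)    */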
    sh_tmp = (sh - qp_div - 1);
    sh_iq_1 = vdupq_n_s32(sh_tmp);
    add_iq = vshlq_s32(add_iq, sh_iq_1);

    s_iq = (-(sh - qp_div));
    sh_iq = vdupq_n_s32(s_iq);

    for(i = 0; i < trans_size; i += 4)
    {
        for(j = 0; j < trans_size; j += 4)
        {
            s0 = vld1_s16(pi2_coeffs + j);
            s1 = vld1_s16(pi2_coeffs + j + (src_strd));
            s2 = vld1_s16(pi2_coeffs + j + (2 * src_strd));
            s3 = vld1_s16(pi2_coeffs + j + (3 * src_strd));

            /* quantization */
            /* sign */
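            /* psgn lanes are all-ones where the coeff is >= 0, nsgn where it */
            /* is negative; the masks restore the sign after quantizing |src| */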
            psgn0 = vcge_s16(s0, zero);
            psgn1 = vcge_s16(s1, zero);
            psgn2 = vcge_s16(s2, zero);
            psgn3 = vcge_s16(s3, zero);

            nsgn0 = vclt_s16(s0, zero);
            nsgn1 = vclt_s16(s1, zero);
            nsgn2 = vclt_s16(s2, zero);
            nsgn3 = vclt_s16(s3, zero);

            /* |src| */
            abs_s0 = vabs_s16(s0);
            abs_s1 = vabs_s16(s1);
            abs_s2 = vabs_s16(s2);
            abs_s3 = vabs_s16(s3);

            /* tmp = tmp * quant_coeff */
            qtmp_0 = vmull_s16(abs_s0, sq);
            qtmp_1 = vmull_s16(abs_s1, sq);
            qtmp_2 = vmull_s16(abs_s2, sq);
            qtmp_3 = vmull_s16(abs_s3, sq);

            /* tmp += (((WORD32)q_add) << (q_bits - QUANT_ROUND_FACTOR_Q)) */
            qtmp_0 = vaddq_s32(qtmp_0, add_q);
            qtmp_1 = vaddq_s32(qtmp_1, add_q);
            qtmp_2 = vaddq_s32(qtmp_2, add_q);
            qtmp_3 = vaddq_s32(qtmp_3, add_q);

            /* tmp >>= q_bits; */
            qtmp_0 = vshlq_s32(qtmp_0, q_v_bits);
            qtmp_1 = vshlq_s32(qtmp_1, q_v_bits);
            qtmp_2 = vshlq_s32(qtmp_2, q_v_bits);
            qtmp_3 = vshlq_s32(qtmp_3, q_v_bits);

            /* clip */
            q0 = vqmovn_s32(qtmp_0);
            q1 = vqmovn_s32(qtmp_1);
            q2 = vqmovn_s32(qtmp_2);
            q3 = vqmovn_s32(qtmp_3);

            /* restore sign */
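            /* q = (q & psgn) - (q & nsgn): lanes with a negative source keep */
            /* only the nsgn copy, so the subtraction negates them            */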
            pq0 = vand_s16(q0, vreinterpret_s16_u16(psgn0));
            pq1 = vand_s16(q1, vreinterpret_s16_u16(psgn1));
            pq2 = vand_s16(q2, vreinterpret_s16_u16(psgn2));
            pq3 = vand_s16(q3, vreinterpret_s16_u16(psgn3));

            nq0 = vand_s16(q0, vreinterpret_s16_u16(nsgn0));
            nq1 = vand_s16(q1, vreinterpret_s16_u16(nsgn1));
            nq2 = vand_s16(q2, vreinterpret_s16_u16(nsgn2));
            nq3 = vand_s16(q3, vreinterpret_s16_u16(nsgn3));

            q0 = vsub_s16(pq0, nq0);
            q1 = vsub_s16(pq1, nq1);
            q2 = vsub_s16(pq2, nq2);
            q3 = vsub_s16(pq3, nq3);

            /* store */
            vst1_s16((pi2_q_dst + j), q0);
            vst1_s16((pi2_q_dst + j + dst_q_strd), q1);
            vst1_s16((pi2_q_dst + j + (2 * dst_q_strd)), q2);
            vst1_s16((pi2_q_dst + j + (3 * dst_q_strd)), q3);

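            /* coded sub-block flag: view each 4x16-bit row as one 64-bit     */
            /* lane; the 4x4 sub-block is coded iff any lane is nonzero       */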
            *(csbf + block_col) = 0;
            if(vget_lane_s64(vreinterpret_s64_s16(q0), 0) ||
               vget_lane_s64(vreinterpret_s64_s16(q1), 0) ||
               vget_lane_s64(vreinterpret_s64_s16(q2), 0) ||
               vget_lane_s64(vreinterpret_s64_s16(q3), 0))
            {
                *(csbf + block_col) = 1;
            }

            if(*(csbf + block_col) == 1)
            {
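                /* mark the four columns/rows of this sub-block as non-zero;  */
                /* the masks are inverted at the end, so a set bit in         */
                /* zero_col/zero_row means that column/row is all zero        */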
                temp_zero_col |= (0xF << block_col * 4);
                temp_zero_row |= (0xF << block_row);

                /* inverse quantization */
                iqtmp_0 = vmull_s16(q0, siq);
                iqtmp_1 = vmull_s16(q1, siq);
                iqtmp_2 = vmull_s16(q2, siq);
                iqtmp_3 = vmull_s16(q3, siq);

                iqtmp_0 = vaddq_s32(iqtmp_0, add_iq);
                iqtmp_1 = vaddq_s32(iqtmp_1, add_iq);
                iqtmp_2 = vaddq_s32(iqtmp_2, add_iq);
                iqtmp_3 = vaddq_s32(iqtmp_3, add_iq);

                iqtmp_0 = vshlq_s32(iqtmp_0, sh_iq);
                iqtmp_1 = vshlq_s32(iqtmp_1, sh_iq);
                iqtmp_2 = vshlq_s32(iqtmp_2, sh_iq);
                iqtmp_3 = vshlq_s32(iqtmp_3, sh_iq);

                /* clip */
                iq0 = vqmovn_s32(iqtmp_0);
                iq1 = vqmovn_s32(iqtmp_1);
                iq2 = vqmovn_s32(iqtmp_2);
                iq3 = vqmovn_s32(iqtmp_3);

                /* store */
                vst1_s16((pi2_iq_dst + j), iq0);
                vst1_s16((pi2_iq_dst + j + dst_iq_strd), iq1);
                vst1_s16((pi2_iq_dst + j + (2 * dst_iq_strd)), iq2);
                vst1_s16((pi2_iq_dst + j + (3 * dst_iq_strd)), iq3);

                /* ssd */
                /* trans_coeff - inv.quant */
                r0 = vsubl_s16(s0, iq0);
                r1 = vsubl_s16(s1, iq1);
                r2 = vsubl_s16(s2, iq2);
                r3 = vsubl_s16(s3, iq3);

                /* SD */
                r0 = vmulq_s32(r0, r0);
                r1 = vmulq_s32(r1, r1);
                r2 = vmulq_s32(r2, r2);
                r3 = vmulq_s32(r3, r3);
            }
            else
            {
                /* store */
                vst1_s16((pi2_iq_dst + j), zero_d);
                vst1_s16((pi2_iq_dst + j + dst_iq_strd), zero_d);
                vst1_s16((pi2_iq_dst + j + (2 * dst_iq_strd)), zero_d);
                vst1_s16((pi2_iq_dst + j + (3 * dst_iq_strd)), zero_d);

                /* SD */
                r0 = vmull_s16(s0, s0);
                r1 = vmull_s16(s1, s1);
                r2 = vmull_s16(s2, s2);
                r3 = vmull_s16(s3, s3);
            }

            /* SSD */
            r0 = vaddq_s32(r0, r1);
            r2 = vaddq_s32(r2, r3);

            r0 = vaddq_s32(r0, r2);

            /* SSD Accumulation */
            ssd0 = vaddq_s32(ssd0, r0);

            cbf = cbf || (*(csbf + block_col));  // cbf update
            block_col++;
        }

        block_col = 0;
        block_row += 4;
        csbf += csbf_strd;

        pi2_coeffs += 4 * src_strd;
        pi2_q_dst += 4 * dst_q_strd;
        pi2_iq_dst += 4 * dst_iq_strd;
        pi2_quant_coeff += 4 * trans_size;
    }

    /* SSD Computation */
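    /* fold the four 32-bit partial sums into one scalar with two pairwise    */
    /* adds                                                                   */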
    ssd1 = vpadd_s32(vget_low_s32(ssd0), vget_high_s32(ssd0));
    ssd1 = vpadd_s32(ssd1, ssd1);
    ssd = vget_lane_s32(ssd1, 0);

    *zero_col = ~temp_zero_col;  // final zero_col storing
    *zero_row = ~temp_zero_row;  // final zero_row storing

    /* Store the cost */
    *pi8_cost = ssd;

    return cbf;
}

WORD32 ihevc_q_iq_ssd_flat_scale_mat_var_rnd_fact_neon(
    WORD16 *pi2_coeffs,
    WORD16 *pi2_quant_coeff,
    WORD16 *pi2_q_dst,
    WORD16 *pi2_iq_dst,
    WORD32 trans_size,
    WORD32 qp_div, /* qpscaled / 6 */
    WORD32 qp_rem, /* qpscaled % 6 */
    WORD32 q_add,
    WORD32 *pi4_quant_round_factor_0_1,
    WORD32 *pi4_quant_round_factor_1_2,
    WORD32 src_strd,
    WORD32 dst_q_strd,
    WORD32 dst_iq_strd,
    UWORD8 *csbf,
    WORD32 csbf_strd,
    WORD32 *zero_col,
    WORD32 *zero_row,
    WORD16 *pi2_dequant_coeff,
    LWORD64 *pi8_cost)
{
    WORD32 i, j;
    WORD32 log2_size;
    WORD32 cbf = 0;

    WORD16 qm = 4;
    WORD16 bd = 8;
    WORD32 q_bits, tr;
    WORD32 block_col = 0;
    WORD32 block_row = 0;
    WORD32 temp_zero_col = 0;
    WORD32 temp_zero_row = 0;

    WORD32 sh;
    WORD32 s_iq;
    WORD32 sh_tmp;

    // ssd
    int32x4_t ssd0 = vdupq_n_s32(0);
    int32x2_t ssd1;
    WORD32 ssd;
    // const
    const int16x8_t zero = vdupq_n_s16(0);
    const int16x4_t zero_d = vdup_n_s16(0);
    const int16x8_t one = vdupq_n_s16(1);
    const int16x8_t two = vdupq_n_s16(2);
    const int16x4_t sq = vdup_n_s16(g_ihevc_quant_scales[qp_rem]);
    const int16x4_t siq = vdup_n_s16((g_ihevc_iquant_scales_flat_scale[qp_rem]));
    // src
    int16x4_t s0, s1, s2, s3;
    // sign
    uint16x8_t psgn0, psgn1;
    uint16x8_t nsgn0, nsgn1;
    int16x8_t pq0, pq1;
    int16x8_t nq0, nq1;
    // abs(src)
    int16x4_t abs_s0, abs_s1, abs_s2, abs_s3;
    // q-temp
    int32x4_t mul_0, mul_1, mul_2, mul_3;
    int32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3;
    int16x8_t q_00, q_01;
    int16x8_t q_10, q_11;
    int16x8_t q_20, q_21;
    int16x8_t q_30, q_31;
    // cmp
    uint16x8_t cmp_00, cmp_01;
    uint16x8_t cmp_10, cmp_11;
    uint16x8_t cmp_20, cmp_21;
    // iq-temp
    int32x4_t iqtmp_0, iqtmp_1, iqtmp_2, iqtmp_3;
    int16x4_t iq0, iq1, iq2, iq3;
    // residue
    int32x4_t r0, r1, r2, r3;
    // add_q
    int32x4_t add_q;
    int32x4_t add_q0, add_q1, add_q2, add_q3;
    int32x4_t add_iq = vdupq_n_s32(1);
    int32x4_t sh_iq_1;
    int32x4_t sh_iq;
    int32x4_t q_v_bits;
    int32x4_t stmp;

    (void)q_add;
    (void)pi2_dequant_coeff;
    GETRANGE(log2_size, trans_size);
    log2_size -= 1;

    tr = MAX_TR_DYNAMIC_RANGE - bd - log2_size;
    q_bits = QUANT_SHIFT + qp_div + tr + SCALING_Q_SHIFT - qm - FLAT_RESCALE_MAT_Q_SHIFT;

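    /* Rounding-factor setup: the default rounding offset is 1/2 in          */
    /* QUANT_ROUND_FACTOR_Q format, shifted up to the q_bits domain.         */
    /* Coefficients whose trial level (zero rounding) falls below 2 are      */
    /* re-quantized with per-coefficient factors taken from                  */
    /* pi4_quant_round_factor_0_1 / pi4_quant_round_factor_1_2.              */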
    stmp = vdupq_n_s32(q_bits - QUANT_ROUND_FACTOR_Q);

    add_q = vdupq_n_s32((1 << QUANT_ROUND_FACTOR_Q) / 2);
    add_q = vshlq_s32(add_q, stmp);

    q_v_bits = vdupq_n_s32(-q_bits);

    sh = bd + log2_size - 5;

    sh_tmp = (sh - qp_div - 1);
    sh_iq_1 = vdupq_n_s32(sh_tmp);
    add_iq = vshlq_s32(add_iq, sh_iq_1);

    s_iq = (-(sh - qp_div));
    sh_iq = vdupq_n_s32(s_iq);

    for(i = 0; i < trans_size; i += 4)
    {
        for(j = 0; j < trans_size; j += 4)
        {
            s0 = vld1_s16(pi2_coeffs + j);
            s1 = vld1_s16(pi2_coeffs + j + (src_strd));
            s2 = vld1_s16(pi2_coeffs + j + (2 * src_strd));
            s3 = vld1_s16(pi2_coeffs + j + (3 * src_strd));

            /* quantization */
            /* sign */
            psgn0 = vcgeq_s16(vcombine_s16(s0, s1), zero);
            psgn1 = vcgeq_s16(vcombine_s16(s2, s3), zero);

            nsgn0 = vcltq_s16(vcombine_s16(s0, s1), zero);
            nsgn1 = vcltq_s16(vcombine_s16(s2, s3), zero);

            /* |src| */
            abs_s0 = vabs_s16(s0);
            abs_s1 = vabs_s16(s1);
            abs_s2 = vabs_s16(s2);
            abs_s3 = vabs_s16(s3);

            /* tmp = tmp * quant_coeff */
            mul_0 = vmull_s16(abs_s0, sq);
            mul_1 = vmull_s16(abs_s1, sq);
            mul_2 = vmull_s16(abs_s2, sq);
            mul_3 = vmull_s16(abs_s3, sq);

            /* qadd = 0 */
            /* tmp >>= q_bits; */
            q_tmp0 = vshlq_s32(mul_0, q_v_bits);
            q_tmp1 = vshlq_s32(mul_1, q_v_bits);
            q_tmp2 = vshlq_s32(mul_2, q_v_bits);
            q_tmp3 = vshlq_s32(mul_3, q_v_bits);

            /* clip */
            q_00 = vcombine_s16(vqmovn_s32(q_tmp0), vqmovn_s32(q_tmp1));
            q_01 = vcombine_s16(vqmovn_s32(q_tmp2), vqmovn_s32(q_tmp3));

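            /* q_00/q_01 hold trial levels quantized with zero rounding; they */
            /* only classify each coefficient against the thresholds 1 and 2  */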
            /* compare q_00, q_01 with 2 */
            cmp_00 = vcltq_s16(q_00, two);
            cmp_01 = vcltq_s16(q_01, two);

            /* qadd = (1 << QUANT_ROUND_FACTOR_Q) / 2 */
            /* tmp >>= q_bits; */
            q_tmp0 = vaddq_s32(mul_0, add_q);
            q_tmp1 = vaddq_s32(mul_1, add_q);
            q_tmp2 = vaddq_s32(mul_2, add_q);
            q_tmp3 = vaddq_s32(mul_3, add_q);

            q_tmp0 = vshlq_s32(q_tmp0, q_v_bits);
            q_tmp1 = vshlq_s32(q_tmp1, q_v_bits);
            q_tmp2 = vshlq_s32(q_tmp2, q_v_bits);
            q_tmp3 = vshlq_s32(q_tmp3, q_v_bits);

            /* clip */
            q_10 = vcombine_s16(vqmovn_s32(q_tmp0), vqmovn_s32(q_tmp1));
            q_11 = vcombine_s16(vqmovn_s32(q_tmp2), vqmovn_s32(q_tmp3));

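            /* the per-coefficient rounding factors are loaded and applied    */
            /* only when at least one trial level in the sub-block is < 2     */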
            if(vget_lane_s64(vreinterpret_s64_u16(vget_low_u16(cmp_00)), 0) ||
               vget_lane_s64(vreinterpret_s64_u16(vget_high_u16(cmp_00)), 0) ||
               vget_lane_s64(vreinterpret_s64_u16(vget_low_u16(cmp_01)), 0) ||
               vget_lane_s64(vreinterpret_s64_u16(vget_high_u16(cmp_01)), 0))
            {
                /* qadd = *pi4_quant_round_factor_1_2 */
                /* tmp >>= q_bits; */
                add_q0 = vld1q_s32(pi4_quant_round_factor_1_2 + j);
                add_q1 = vld1q_s32(pi4_quant_round_factor_1_2 + j + (trans_size));
                add_q2 = vld1q_s32(pi4_quant_round_factor_1_2 + j + (2 * trans_size));
                add_q3 = vld1q_s32(pi4_quant_round_factor_1_2 + j + (3 * trans_size));

                add_q0 = vshlq_s32(add_q0, stmp);
                add_q1 = vshlq_s32(add_q1, stmp);
                add_q2 = vshlq_s32(add_q2, stmp);
                add_q3 = vshlq_s32(add_q3, stmp);

                q_tmp0 = vaddq_s32(mul_0, add_q0);
                q_tmp1 = vaddq_s32(mul_1, add_q1);
                q_tmp2 = vaddq_s32(mul_2, add_q2);
                q_tmp3 = vaddq_s32(mul_3, add_q3);

                q_tmp0 = vshlq_s32(q_tmp0, q_v_bits);
                q_tmp1 = vshlq_s32(q_tmp1, q_v_bits);
                q_tmp2 = vshlq_s32(q_tmp2, q_v_bits);
                q_tmp3 = vshlq_s32(q_tmp3, q_v_bits);

                /* clip */
                q_20 = vcombine_s16(vqmovn_s32(q_tmp0), vqmovn_s32(q_tmp1));
                q_21 = vcombine_s16(vqmovn_s32(q_tmp2), vqmovn_s32(q_tmp3));

                /* qadd = *pi4_quant_round_factor_0_1 */
                /* tmp >>= q_bits; */
                add_q0 = vld1q_s32(pi4_quant_round_factor_0_1 + j);
                add_q1 = vld1q_s32(pi4_quant_round_factor_0_1 + j + (trans_size));
                add_q2 = vld1q_s32(pi4_quant_round_factor_0_1 + j + (2 * trans_size));
                add_q3 = vld1q_s32(pi4_quant_round_factor_0_1 + j + (3 * trans_size));

                add_q0 = vshlq_s32(add_q0, stmp);
                add_q1 = vshlq_s32(add_q1, stmp);
                add_q2 = vshlq_s32(add_q2, stmp);
                add_q3 = vshlq_s32(add_q3, stmp);

                q_tmp0 = vaddq_s32(mul_0, add_q0);
                q_tmp1 = vaddq_s32(mul_1, add_q1);
                q_tmp2 = vaddq_s32(mul_2, add_q2);
                q_tmp3 = vaddq_s32(mul_3, add_q3);

                q_tmp0 = vshlq_s32(q_tmp0, q_v_bits);
                q_tmp1 = vshlq_s32(q_tmp1, q_v_bits);
                q_tmp2 = vshlq_s32(q_tmp2, q_v_bits);
                q_tmp3 = vshlq_s32(q_tmp3, q_v_bits);

                /* clip */
                q_30 = vcombine_s16(vqmovn_s32(q_tmp0), vqmovn_s32(q_tmp1));
                q_31 = vcombine_s16(vqmovn_s32(q_tmp2), vqmovn_s32(q_tmp3));

                /* compare q_00, q_01 with 1 */
                cmp_10 = vcltq_s16(q_00, one);
                cmp_11 = vcltq_s16(q_01, one);

                cmp_20 = vbicq_u16(cmp_00, cmp_10);
                cmp_21 = vbicq_u16(cmp_01, cmp_11);

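                /* per-lane select: level < 1 takes the factor_0_1 result     */
                /* (q_30/q_31), 1 <= level < 2 takes the factor_1_2 result    */
                /* (q_20/q_21), the rest keep the default-rounding q_10/q_11  */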
                q_10 = vbslq_s16(cmp_10, q_30, q_10);
                q_11 = vbslq_s16(cmp_11, q_31, q_11);

                q_10 = vbslq_s16(cmp_20, q_20, q_10);
                q_11 = vbslq_s16(cmp_21, q_21, q_11);
            }

            /* restore sign */
            pq0 = vandq_s16(q_10, vreinterpretq_s16_u16(psgn0));
            pq1 = vandq_s16(q_11, vreinterpretq_s16_u16(psgn1));

            nq0 = vandq_s16(q_10, vreinterpretq_s16_u16(nsgn0));
            nq1 = vandq_s16(q_11, vreinterpretq_s16_u16(nsgn1));

            q_10 = vsubq_s16(pq0, nq0);
            q_11 = vsubq_s16(pq1, nq1);

            /* store */
            vst1_s16((pi2_q_dst + j), vget_low_s16(q_10));
            vst1_s16((pi2_q_dst + j + dst_q_strd), vget_high_s16(q_10));
            vst1_s16((pi2_q_dst + j + (2 * dst_q_strd)), vget_low_s16(q_11));
            vst1_s16((pi2_q_dst + j + (3 * dst_q_strd)), vget_high_s16(q_11));

            *(csbf + block_col) = 0;
            if(vget_lane_s64(vreinterpret_s64_s16(vget_low_s16(q_10)), 0) ||
               vget_lane_s64(vreinterpret_s64_s16(vget_high_s16(q_10)), 0) ||
               vget_lane_s64(vreinterpret_s64_s16(vget_low_s16(q_11)), 0) ||
               vget_lane_s64(vreinterpret_s64_s16(vget_high_s16(q_11)), 0))
            {
                *(csbf + block_col) = 1;
            }

            if(*(csbf + block_col) == 1)
            {
                temp_zero_col |= (0xF << block_col * 4);
                temp_zero_row |= (0xF << block_row);

                /* inverse quantization */
                iqtmp_0 = vmull_s16(vget_low_s16(q_10), siq);
                iqtmp_1 = vmull_s16(vget_high_s16(q_10), siq);
                iqtmp_2 = vmull_s16(vget_low_s16(q_11), siq);
                iqtmp_3 = vmull_s16(vget_high_s16(q_11), siq);

                iqtmp_0 = vaddq_s32(iqtmp_0, add_iq);
                iqtmp_1 = vaddq_s32(iqtmp_1, add_iq);
                iqtmp_2 = vaddq_s32(iqtmp_2, add_iq);
                iqtmp_3 = vaddq_s32(iqtmp_3, add_iq);

                iqtmp_0 = vshlq_s32(iqtmp_0, sh_iq);
                iqtmp_1 = vshlq_s32(iqtmp_1, sh_iq);
                iqtmp_2 = vshlq_s32(iqtmp_2, sh_iq);
                iqtmp_3 = vshlq_s32(iqtmp_3, sh_iq);

                /* clip */
                iq0 = vqmovn_s32(iqtmp_0);
                iq1 = vqmovn_s32(iqtmp_1);
                iq2 = vqmovn_s32(iqtmp_2);
                iq3 = vqmovn_s32(iqtmp_3);

                /* store */
                vst1_s16((pi2_iq_dst + j), iq0);
                vst1_s16((pi2_iq_dst + j + dst_iq_strd), iq1);
                vst1_s16((pi2_iq_dst + j + (2 * dst_iq_strd)), iq2);
                vst1_s16((pi2_iq_dst + j + (3 * dst_iq_strd)), iq3);

                /* ssd */
                /* trans_coeff - inv.quant */
                r0 = vsubl_s16(s0, iq0);
                r1 = vsubl_s16(s1, iq1);
                r2 = vsubl_s16(s2, iq2);
                r3 = vsubl_s16(s3, iq3);

                /* SD */
                r0 = vmulq_s32(r0, r0);
                r1 = vmulq_s32(r1, r1);
                r2 = vmulq_s32(r2, r2);
                r3 = vmulq_s32(r3, r3);
            }
            else
            {
                /* store */
                vst1_s16((pi2_iq_dst + j), zero_d);
                vst1_s16((pi2_iq_dst + j + dst_iq_strd), zero_d);
                vst1_s16((pi2_iq_dst + j + (2 * dst_iq_strd)), zero_d);
                vst1_s16((pi2_iq_dst + j + (3 * dst_iq_strd)), zero_d);

                /* SD */
                r0 = vmull_s16(s0, s0);
                r1 = vmull_s16(s1, s1);
                r2 = vmull_s16(s2, s2);
                r3 = vmull_s16(s3, s3);
            }

            /* SSD */
            r0 = vaddq_s32(r0, r1);
            r2 = vaddq_s32(r2, r3);

            r0 = vaddq_s32(r0, r2);

            /* SSD Accumulation */
            ssd0 = vaddq_s32(ssd0, r0);

            cbf = cbf || (*(csbf + block_col));  // cbf update
            block_col++;
        }

        block_col = 0;
        block_row += 4;
        csbf += csbf_strd;

        pi2_coeffs += 4 * src_strd;
        pi2_q_dst += 4 * dst_q_strd;
        pi2_iq_dst += 4 * dst_iq_strd;
        pi2_quant_coeff += 4 * trans_size;
        pi4_quant_round_factor_1_2 += 4 * trans_size;
        pi4_quant_round_factor_0_1 += 4 * trans_size;
    }

    /* SSD Computation */
    ssd1 = vpadd_s32(vget_low_s32(ssd0), vget_high_s32(ssd0));
    ssd1 = vpadd_s32(ssd1, ssd1);
    ssd = vget_lane_s32(ssd1, 0);

    *zero_col = ~temp_zero_col;  // final zero_col storing
    *zero_row = ~temp_zero_row;  // final zero_row storing

    /* Store the cost */
    *pi8_cost = ssd;

    return cbf;
}