/******************************************************************************
*
* Copyright (C) 2018 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*****************************************************************************
* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/

/**
*******************************************************************************
* @file
*  ihevc_quant_iquant_ssd_neon_intr.c
*
* @brief
*  Contains function definitions for quantization, followed by inverse
*  quantization to find transform domain SSD
*
* @author
*  100736
*
* @par List of Functions:
*  - ihevc_quant_iquant_ssd_flat_scale_mat_neon()
*  - ihevc_q_iq_ssd_flat_scale_mat_var_rnd_fact_neon()
*
* @remarks
*
*
*******************************************************************************
*/
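
/*
 * Scalar sketch of the per-coefficient math that both NEON kernels below
 * vectorize four coefficients at a time. The names level, round_add and
 * iq_dst_val here are illustrative only (not variables in this file), the
 * 16-bit saturation steps are omitted, and the actual q_bits/sh values and
 * scale tables are set up inside each function:
 *
 *   level      = (abs(coeff) * g_ihevc_quant_scales[qp_rem] + round_add) >> q_bits;
 *   q_dst_val  = (coeff < 0) ? -level : level;                      // quantization
 *   iq_dst_val = (q_dst_val * g_ihevc_iquant_scales_flat_scale[qp_rem]
 *                    + (1 << (sh - qp_div - 1))) >> (sh - qp_div);  // inverse quant
 *   ssd       += (coeff - iq_dst_val) * (coeff - iq_dst_val);       // SSD cost
 *
 * round_add is the kernel-specific rounding offset. 4x4 sub-blocks that
 * quantize to all zeros skip the inverse quantization and accumulate
 * coeff * coeff, which is the same SSD formula with iq_dst_val = 0.
 */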
/* System include files */
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

/* User include files */
#include "ihevc_typedefs.h"
#include "ihevc_macros.h"
#include "ihevc_platform_macros.h"
#include "ihevc_defs.h"
#include "ihevc_debug.h"
#include "ihevc_trans_tables.h"
#include "ihevc_quant_iquant_ssd.h"
#include "ihevc_func_selector.h"
#include "ihevc_trans_macros.h"
#include "arm_neon.h"

/*****************************************************************************/
/* Function Definitions                                                      */
/*****************************************************************************/

WORD32 ihevc_quant_iquant_ssd_flat_scale_mat_neon(
    WORD16 *pi2_coeffs,
    WORD16 *pi2_quant_coeff,
    WORD16 *pi2_q_dst,
    WORD16 *pi2_iq_dst,
    WORD32 trans_size,
    WORD32 qp_div,
    WORD32 qp_rem,
    WORD32 q_add,
    WORD32 *pi4_quant_round_factor_0_1,
    WORD32 *pi4_quant_round_factor_1_2,
    WORD32 src_strd,
    WORD32 dst_q_strd,
    WORD32 dst_iq_strd,
    UWORD8 *csbf,
    WORD32 csbf_strd,
    WORD32 *zero_col,
    WORD32 *zero_row,
    WORD16 *pi2_dequant_coeff,
    LWORD64 *pi8_cost)
{
    WORD32 i, j;
    WORD32 log2_size;
    WORD32 cbf = 0;

    WORD16 qm = 4;
    WORD16 bd = 8;
    WORD32 q_bits, tr, temp;
    WORD32 block_col = 0;
    WORD32 block_row = 0;
    WORD32 temp_zero_col = 0;
    WORD32 temp_zero_row = 0;

    WORD32 sh;
    WORD32 s_iq;
    WORD32 sh_tmp;

    // ssd
    int32x4_t ssd0 = vdupq_n_s32(0);
    int32x2_t ssd1;
    WORD32 ssd;
    // const
    const int16x4_t zero = vdup_n_s16(0);
    const int16x4_t zero_d = vdup_n_s16(0);
    const int16x4_t sq = vdup_n_s16(g_ihevc_quant_scales[qp_rem]);
    const int16x4_t siq = vdup_n_s16((g_ihevc_iquant_scales_flat_scale[qp_rem]));
    // src
    int16x4_t s0, s1, s2, s3;
    // q-iq
    int16x4_t q0, q1, q2, q3;
    int16x4_t iq0, iq1, iq2, iq3;
    // residue
    int32x4_t r0, r1, r2, r3;
    // sign
    uint16x4_t psgn0, psgn1, psgn2, psgn3;
    uint16x4_t nsgn0, nsgn1, nsgn2, nsgn3;
    // abs(src)
    int16x4_t abs_s0, abs_s1, abs_s2, abs_s3;
    // q-temp
    int32x4_t qtmp_0, qtmp_1, qtmp_2, qtmp_3;
    int16x4_t pq0, pq1, pq2, pq3;
    int16x4_t nq0, nq1, nq2, nq3;
    // iq-temp
    int32x4_t iqtmp_0, iqtmp_1, iqtmp_2, iqtmp_3;

    int32x4_t add_q;
    int32x4_t add_iq = vdupq_n_s32(1);
    int32x4_t sh_iq_1;
    int32x4_t sh_iq;
    int32x4_t q_v_bits;

    (void)pi4_quant_round_factor_0_1;
    (void)pi4_quant_round_factor_1_2;
    (void)pi2_dequant_coeff;

    GETRANGE(log2_size, trans_size);
    log2_size -= 1;

    tr = MAX_TR_DYNAMIC_RANGE - bd - log2_size;
    q_bits = QUANT_SHIFT + qp_div + tr + SCALING_Q_SHIFT - qm - FLAT_RESCALE_MAT_Q_SHIFT;
    temp = (((WORD32)q_add) << (q_bits - QUANT_ROUND_FACTOR_Q));

    q_v_bits = vdupq_n_s32(-q_bits);
    add_q = vdupq_n_s32(temp);

    sh = bd + log2_size - 5;

    sh_tmp = (sh - qp_div - 1);
    sh_iq_1 = vdupq_n_s32(sh_tmp);
    add_iq = vshlq_s32(add_iq, sh_iq_1);

    s_iq = (-(sh - qp_div));
    sh_iq = vdupq_n_s32(s_iq);

    for(i = 0; i < trans_size; i += 4)
    {
        for(j = 0; j < trans_size; j += 4)
        {
            s0 = vld1_s16(pi2_coeffs + j);
            s1 = vld1_s16(pi2_coeffs + j + (src_strd));
            s2 = vld1_s16(pi2_coeffs + j + (2 * src_strd));
            s3 = vld1_s16(pi2_coeffs + j + (3 * src_strd));

            /* quantization */
            /* sign */
            psgn0 = vcge_s16(s0, zero);
            psgn1 = vcge_s16(s1, zero);
            psgn2 = vcge_s16(s2, zero);
            psgn3 = vcge_s16(s3, zero);

            nsgn0 = vclt_s16(s0, zero);
            nsgn1 = vclt_s16(s1, zero);
            nsgn2 = vclt_s16(s2, zero);
            nsgn3 = vclt_s16(s3, zero);

            /* |src| */
            abs_s0 = vabs_s16(s0);
            abs_s1 = vabs_s16(s1);
            abs_s2 = vabs_s16(s2);
            abs_s3 = vabs_s16(s3);

            /* tmp = tmp * quant_coeff */
            qtmp_0 = vmull_s16(abs_s0, sq);
            qtmp_1 = vmull_s16(abs_s1, sq);
            qtmp_2 = vmull_s16(abs_s2, sq);
            qtmp_3 = vmull_s16(abs_s3, sq);

            /* tmp += (((WORD32)q_add) << (q_bits - QUANT_ROUND_FACTOR_Q)) */
            qtmp_0 = vaddq_s32(qtmp_0, add_q);
            qtmp_1 = vaddq_s32(qtmp_1, add_q);
            qtmp_2 = vaddq_s32(qtmp_2, add_q);
            qtmp_3 = vaddq_s32(qtmp_3, add_q);

            /* tmp >>= q_bits; */
            qtmp_0 = vshlq_s32(qtmp_0, q_v_bits);
            qtmp_1 = vshlq_s32(qtmp_1, q_v_bits);
            qtmp_2 = vshlq_s32(qtmp_2, q_v_bits);
            qtmp_3 = vshlq_s32(qtmp_3, q_v_bits);

            /* clip */
            q0 = vqmovn_s32(qtmp_0);
            q1 = vqmovn_s32(qtmp_1);
            q2 = vqmovn_s32(qtmp_2);
            q3 = vqmovn_s32(qtmp_3);

            /* restore sign */
            pq0 = vand_s16(q0, vreinterpret_s16_u16(psgn0));
            pq1 = vand_s16(q1, vreinterpret_s16_u16(psgn1));
            pq2 = vand_s16(q2, vreinterpret_s16_u16(psgn2));
            pq3 = vand_s16(q3, vreinterpret_s16_u16(psgn3));

            nq0 = vand_s16(q0, vreinterpret_s16_u16(nsgn0));
            nq1 = vand_s16(q1, vreinterpret_s16_u16(nsgn1));
            nq2 = vand_s16(q2, vreinterpret_s16_u16(nsgn2));
            nq3 = vand_s16(q3, vreinterpret_s16_u16(nsgn3));

            q0 = vsub_s16(pq0, nq0);
            q1 = vsub_s16(pq1, nq1);
            q2 = vsub_s16(pq2, nq2);
            q3 = vsub_s16(pq3, nq3);

            /* store */
            vst1_s16((pi2_q_dst + j), q0);
            vst1_s16((pi2_q_dst + j + dst_q_strd), q1);
            vst1_s16((pi2_q_dst + j + (2 * dst_q_strd)), q2);
            vst1_s16((pi2_q_dst + j + (3 * dst_q_strd)), q3);

            *(csbf + block_col) = 0;
            if(vget_lane_s64(vreinterpret_s64_s16(q0), 0) ||
               vget_lane_s64(vreinterpret_s64_s16(q1), 0) ||
               vget_lane_s64(vreinterpret_s64_s16(q2), 0) ||
               vget_lane_s64(vreinterpret_s64_s16(q3), 0))
            {
                *(csbf + block_col) = 1;
            }

            if(*(csbf + block_col) == 1)
            {
                temp_zero_col |= (0xF << block_col * 4);
                temp_zero_row |= (0xF << block_row);

                /* inverse quantization */
                iqtmp_0 = vmull_s16(q0, siq);
                iqtmp_1 = vmull_s16(q1, siq);
                iqtmp_2 = vmull_s16(q2, siq);
                iqtmp_3 = vmull_s16(q3, siq);

                iqtmp_0 = vaddq_s32(iqtmp_0, add_iq);
                iqtmp_1 = vaddq_s32(iqtmp_1, add_iq);
                iqtmp_2 = vaddq_s32(iqtmp_2, add_iq);
                iqtmp_3 = vaddq_s32(iqtmp_3, add_iq);

                iqtmp_0 = vshlq_s32(iqtmp_0, sh_iq);
                iqtmp_1 = vshlq_s32(iqtmp_1, sh_iq);
                iqtmp_2 = vshlq_s32(iqtmp_2, sh_iq);
                iqtmp_3 = vshlq_s32(iqtmp_3, sh_iq);

                /* clip */
                iq0 = vqmovn_s32(iqtmp_0);
                iq1 = vqmovn_s32(iqtmp_1);
                iq2 = vqmovn_s32(iqtmp_2);
                iq3 = vqmovn_s32(iqtmp_3);

                /* store */
                vst1_s16((pi2_iq_dst + j), iq0);
                vst1_s16((pi2_iq_dst + j + dst_iq_strd), iq1);
                vst1_s16((pi2_iq_dst + j + (2 * dst_iq_strd)), iq2);
                vst1_s16((pi2_iq_dst + j + (3 * dst_iq_strd)), iq3);

                /* ssd */
                /* trans_coeff - inv.quant */
                r0 = vsubl_s16(s0, iq0);
                r1 = vsubl_s16(s1, iq1);
                r2 = vsubl_s16(s2, iq2);
                r3 = vsubl_s16(s3, iq3);

                /* SD */
                r0 = vmulq_s32(r0, r0);
                r1 = vmulq_s32(r1, r1);
                r2 = vmulq_s32(r2, r2);
                r3 = vmulq_s32(r3, r3);
            }
            else
            {
                /* store */
                vst1_s16((pi2_iq_dst + j), zero_d);
                vst1_s16((pi2_iq_dst + j + dst_iq_strd), zero_d);
                vst1_s16((pi2_iq_dst + j + (2 * dst_iq_strd)), zero_d);
                vst1_s16((pi2_iq_dst + j + (3 * dst_iq_strd)), zero_d);

                /* SD */
                r0 = vmull_s16(s0, s0);
                r1 = vmull_s16(s1, s1);
                r2 = vmull_s16(s2, s2);
                r3 = vmull_s16(s3, s3);
            }

            /* SSD */
            r0 = vaddq_s32(r0, r1);
            r2 = vaddq_s32(r2, r3);

            r0 = vaddq_s32(r0, r2);

            /* SSD Accumulation */
            ssd0 = vaddq_s32(ssd0, r0);

            cbf = cbf || (*(csbf + block_col)); // cbf update
            block_col++;
        }

        block_col = 0;
        block_row += 4;
        csbf += csbf_strd;

        pi2_coeffs += 4 * src_strd;
        pi2_q_dst += 4 * dst_q_strd;
        pi2_iq_dst += 4 * dst_iq_strd;
        pi2_quant_coeff += 4 * trans_size;
    }

    /* SSD Computation */
    ssd1 = vpadd_s32(vget_low_s32(ssd0), vget_high_s32(ssd0));
    ssd1 = vpadd_s32(ssd1, ssd1);
    ssd = vget_lane_s32(ssd1, 0);

    *zero_col = ~temp_zero_col; //final zero_col storing
    *zero_row = ~temp_zero_row; //final zero_row storing

    /* Store the cost */
    *pi8_cost = ssd;

    return cbf;
}

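/*
 * The _var_rnd_fact variant below differs from the flat scale kernel above
 * only in how the quantization rounding offset is chosen. A scalar sketch of
 * the selection it performs (level0, round, scale and pos are illustrative
 * names, not variables in this file):
 *
 *   level0 = (abs(coeff) * scale) >> q_bits;        // quantize with no offset
 *   if(level0 >= 2)
 *       round = (1 << QUANT_ROUND_FACTOR_Q) / 2;    // default 1/2 rounding
 *   else if(level0 == 1)
 *       round = pi4_quant_round_factor_1_2[pos];    // value lies in [1, 2)
 *   else
 *       round = pi4_quant_round_factor_0_1[pos];    // value lies in [0, 1)
 *   level  = (abs(coeff) * scale + (round << (q_bits - QUANT_ROUND_FACTOR_Q)))
 *                >> q_bits;
 *
 * Inverse quantization and SSD accumulation are identical to the kernel
 * above; the q_add argument is unused here.
 */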
WORD32 ihevc_q_iq_ssd_flat_scale_mat_var_rnd_fact_neon(
    WORD16 *pi2_coeffs,
    WORD16 *pi2_quant_coeff,
    WORD16 *pi2_q_dst,
    WORD16 *pi2_iq_dst,
    WORD32 trans_size,
    WORD32 qp_div, /* qpscaled / 6 */
    WORD32 qp_rem, /* qpscaled % 6 */
    WORD32 q_add,
    WORD32 *pi4_quant_round_factor_0_1,
    WORD32 *pi4_quant_round_factor_1_2,
    WORD32 src_strd,
    WORD32 dst_q_strd,
    WORD32 dst_iq_strd,
    UWORD8 *csbf,
    WORD32 csbf_strd,
    WORD32 *zero_col,
    WORD32 *zero_row,
    WORD16 *pi2_dequant_coeff,
    LWORD64 *pi8_cost)
{
    WORD32 i, j;
    WORD32 log2_size;
    WORD32 cbf = 0;

    WORD16 qm = 4;
    WORD16 bd = 8;
    WORD32 q_bits, tr;
    WORD32 block_col = 0;
    WORD32 block_row = 0;
    WORD32 temp_zero_col = 0;
    WORD32 temp_zero_row = 0;

    WORD32 sh;
    WORD32 s_iq;
    WORD32 sh_tmp;

    // ssd
    int32x4_t ssd0 = vdupq_n_s32(0);
    int32x2_t ssd1;
    WORD32 ssd;
    // const
    const int16x8_t zero = vdupq_n_s16(0);
    const int16x4_t zero_d = vdup_n_s16(0);
    const int16x8_t one = vdupq_n_s16(1);
    const int16x8_t two = vdupq_n_s16(2);
    const int16x4_t sq = vdup_n_s16(g_ihevc_quant_scales[qp_rem]);
    const int16x4_t siq = vdup_n_s16((g_ihevc_iquant_scales_flat_scale[qp_rem]));
    // src
    int16x4_t s0, s1, s2, s3;
    // sign
    uint16x8_t psgn0, psgn1;
    uint16x8_t nsgn0, nsgn1;
    int16x8_t pq0, pq1;
    int16x8_t nq0, nq1;
    // abs(src)
    int16x4_t abs_s0, abs_s1, abs_s2, abs_s3;
    // q-temp
    int32x4_t mul_0, mul_1, mul_2, mul_3;
    int32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3;
    int16x8_t q_00, q_01;
    int16x8_t q_10, q_11;
    int16x8_t q_20, q_21;
    int16x8_t q_30, q_31;
    // cmp
    uint16x8_t cmp_00, cmp_01;
    uint16x8_t cmp_10, cmp_11;
    uint16x8_t cmp_20, cmp_21;
    // iq-temp
    int32x4_t iqtmp_0, iqtmp_1, iqtmp_2, iqtmp_3;
    int16x4_t iq0, iq1, iq2, iq3;
    // residue
    int32x4_t r0, r1, r2, r3;
    // add_q
    int32x4_t add_q;
    int32x4_t add_q0, add_q1, add_q2, add_q3;
    int32x4_t add_iq = vdupq_n_s32(1);
    int32x4_t sh_iq_1;
    int32x4_t sh_iq;
    int32x4_t q_v_bits;
    int32x4_t stmp;

    (void)q_add;
    (void)pi2_dequant_coeff;
    GETRANGE(log2_size, trans_size);
    log2_size -= 1;

    tr = MAX_TR_DYNAMIC_RANGE - bd - log2_size;
    q_bits = QUANT_SHIFT + qp_div + tr + SCALING_Q_SHIFT - qm - FLAT_RESCALE_MAT_Q_SHIFT;

    stmp = vdupq_n_s32(q_bits - QUANT_ROUND_FACTOR_Q);

    add_q = vdupq_n_s32((1 << QUANT_ROUND_FACTOR_Q) / 2);
    add_q = vshlq_s32(add_q, stmp);

    q_v_bits = vdupq_n_s32(-q_bits);

    sh = bd + log2_size - 5;

    sh_tmp = (sh - qp_div - 1);
    sh_iq_1 = vdupq_n_s32(sh_tmp);
    add_iq = vshlq_s32(add_iq, sh_iq_1);

    s_iq = (-(sh - qp_div));
    sh_iq = vdupq_n_s32(s_iq);

    for(i = 0; i < trans_size; i += 4)
    {
        for(j = 0; j < trans_size; j += 4)
        {
            s0 = vld1_s16(pi2_coeffs + j);
            s1 = vld1_s16(pi2_coeffs + j + (src_strd));
            s2 = vld1_s16(pi2_coeffs + j + (2 * src_strd));
            s3 = vld1_s16(pi2_coeffs + j + (3 * src_strd));

            /* quantization */
            /* sign */
            psgn0 = vcgeq_s16(vcombine_s16(s0, s1), zero);
            psgn1 = vcgeq_s16(vcombine_s16(s2, s3), zero);

            nsgn0 = vcltq_s16(vcombine_s16(s0, s1), zero);
            nsgn1 = vcltq_s16(vcombine_s16(s2, s3), zero);

            /* |src| */
            abs_s0 = vabs_s16(s0);
            abs_s1 = vabs_s16(s1);
            abs_s2 = vabs_s16(s2);
            abs_s3 = vabs_s16(s3);

            /* tmp = tmp * quant_coeff */
            mul_0 = vmull_s16(abs_s0, sq);
            mul_1 = vmull_s16(abs_s1, sq);
            mul_2 = vmull_s16(abs_s2, sq);
            mul_3 = vmull_s16(abs_s3, sq);

            /* qadd = 0 */
            /* tmp >>= q_bits; */
            q_tmp0 = vshlq_s32(mul_0, q_v_bits);
            q_tmp1 = vshlq_s32(mul_1, q_v_bits);
            q_tmp2 = vshlq_s32(mul_2, q_v_bits);
            q_tmp3 = vshlq_s32(mul_3, q_v_bits);

            /* clip */
            q_00 = vcombine_s16(vqmovn_s32(q_tmp0), vqmovn_s32(q_tmp1));
            q_01 = vcombine_s16(vqmovn_s32(q_tmp2), vqmovn_s32(q_tmp3));

            /* compare q_00, q_01 with 2 */
            cmp_00 = vcltq_s16(q_00, two);
            cmp_01 = vcltq_s16(q_01, two);

            /* qadd = (1 << QUANT_ROUND_FACTOR_Q) / 2 */
            /* tmp >>= q_bits; */
            q_tmp0 = vaddq_s32(mul_0, add_q);
            q_tmp1 = vaddq_s32(mul_1, add_q);
            q_tmp2 = vaddq_s32(mul_2, add_q);
            q_tmp3 = vaddq_s32(mul_3, add_q);

            q_tmp0 = vshlq_s32(q_tmp0, q_v_bits);
            q_tmp1 = vshlq_s32(q_tmp1, q_v_bits);
            q_tmp2 = vshlq_s32(q_tmp2, q_v_bits);
            q_tmp3 = vshlq_s32(q_tmp3, q_v_bits);

            /* clip */
            q_10 = vcombine_s16(vqmovn_s32(q_tmp0), vqmovn_s32(q_tmp1));
            q_11 = vcombine_s16(vqmovn_s32(q_tmp2), vqmovn_s32(q_tmp3));

            if(vget_lane_s64(vreinterpret_s64_u16(vget_low_u16(cmp_00)), 0) ||
               vget_lane_s64(vreinterpret_s64_u16(vget_high_u16(cmp_00)), 0) ||
               vget_lane_s64(vreinterpret_s64_u16(vget_low_u16(cmp_01)), 0) ||
               vget_lane_s64(vreinterpret_s64_u16(vget_high_u16(cmp_01)), 0))
            {
                /* qadd = *pi4_quant_round_factor_1_2 */
                /* tmp >>= q_bits; */
                add_q0 = vld1q_s32(pi4_quant_round_factor_1_2 + j);
                add_q1 = vld1q_s32(pi4_quant_round_factor_1_2 + j + (trans_size));
                add_q2 = vld1q_s32(pi4_quant_round_factor_1_2 + j + (2 * trans_size));
                add_q3 = vld1q_s32(pi4_quant_round_factor_1_2 + j + (3 * trans_size));

                add_q0 = vshlq_s32(add_q0, stmp);
                add_q1 = vshlq_s32(add_q1, stmp);
                add_q2 = vshlq_s32(add_q2, stmp);
                add_q3 = vshlq_s32(add_q3, stmp);

                q_tmp0 = vaddq_s32(mul_0, add_q0);
                q_tmp1 = vaddq_s32(mul_1, add_q1);
                q_tmp2 = vaddq_s32(mul_2, add_q2);
                q_tmp3 = vaddq_s32(mul_3, add_q3);

                q_tmp0 = vshlq_s32(q_tmp0, q_v_bits);
                q_tmp1 = vshlq_s32(q_tmp1, q_v_bits);
                q_tmp2 = vshlq_s32(q_tmp2, q_v_bits);
                q_tmp3 = vshlq_s32(q_tmp3, q_v_bits);

                /* clip */
                q_20 = vcombine_s16(vqmovn_s32(q_tmp0), vqmovn_s32(q_tmp1));
                q_21 = vcombine_s16(vqmovn_s32(q_tmp2), vqmovn_s32(q_tmp3));

                /* qadd = *pi4_quant_round_factor_0_1 */
                /* tmp >>= q_bits; */
                add_q0 = vld1q_s32(pi4_quant_round_factor_0_1 + j);
                add_q1 = vld1q_s32(pi4_quant_round_factor_0_1 + j + (trans_size));
                add_q2 = vld1q_s32(pi4_quant_round_factor_0_1 + j + (2 * trans_size));
                add_q3 = vld1q_s32(pi4_quant_round_factor_0_1 + j + (3 * trans_size));

                add_q0 = vshlq_s32(add_q0, stmp);
                add_q1 = vshlq_s32(add_q1, stmp);
                add_q2 = vshlq_s32(add_q2, stmp);
                add_q3 = vshlq_s32(add_q3, stmp);

                q_tmp0 = vaddq_s32(mul_0, add_q0);
                q_tmp1 = vaddq_s32(mul_1, add_q1);
                q_tmp2 = vaddq_s32(mul_2, add_q2);
                q_tmp3 = vaddq_s32(mul_3, add_q3);

                q_tmp0 = vshlq_s32(q_tmp0, q_v_bits);
                q_tmp1 = vshlq_s32(q_tmp1, q_v_bits);
                q_tmp2 = vshlq_s32(q_tmp2, q_v_bits);
                q_tmp3 = vshlq_s32(q_tmp3, q_v_bits);

                /* clip */
                q_30 = vcombine_s16(vqmovn_s32(q_tmp0), vqmovn_s32(q_tmp1));
                q_31 = vcombine_s16(vqmovn_s32(q_tmp2), vqmovn_s32(q_tmp3));

                /* compare q_00, q_01 with 1 */
                cmp_10 = vcltq_s16(q_00, one);
                cmp_11 = vcltq_s16(q_01, one);

                cmp_20 = vbicq_u16(cmp_00, cmp_10);
                cmp_21 = vbicq_u16(cmp_01, cmp_11);

                q_10 = vbslq_s16(cmp_10, q_30, q_10);
                q_11 = vbslq_s16(cmp_11, q_31, q_11);

                q_10 = vbslq_s16(cmp_20, q_20, q_10);
                q_11 = vbslq_s16(cmp_21, q_21, q_11);
            }

            /* restore sign */
            pq0 = vandq_s16(q_10, vreinterpretq_s16_u16(psgn0));
            pq1 = vandq_s16(q_11, vreinterpretq_s16_u16(psgn1));

            nq0 = vandq_s16(q_10, vreinterpretq_s16_u16(nsgn0));
            nq1 = vandq_s16(q_11, vreinterpretq_s16_u16(nsgn1));

            q_10 = vsubq_s16(pq0, nq0);
            q_11 = vsubq_s16(pq1, nq1);

            /* store */
            vst1_s16((pi2_q_dst + j), vget_low_s16(q_10));
            vst1_s16((pi2_q_dst + j + dst_q_strd), vget_high_s16(q_10));
            vst1_s16((pi2_q_dst + j + (2 * dst_q_strd)), vget_low_s16(q_11));
            vst1_s16((pi2_q_dst + j + (3 * dst_q_strd)), vget_high_s16(q_11));

            *(csbf + block_col) = 0;
            if(vget_lane_s64(vreinterpret_s64_s16(vget_low_s16(q_10)), 0) ||
               vget_lane_s64(vreinterpret_s64_s16(vget_high_s16(q_10)), 0) ||
               vget_lane_s64(vreinterpret_s64_s16(vget_low_s16(q_11)), 0) ||
               vget_lane_s64(vreinterpret_s64_s16(vget_high_s16(q_11)), 0))
            {
                *(csbf + block_col) = 1;
            }

            if(*(csbf + block_col) == 1)
            {
                temp_zero_col |= (0xF << block_col * 4);
                temp_zero_row |= (0xF << block_row);

                /* inverse quantization */
                iqtmp_0 = vmull_s16(vget_low_s16(q_10), siq);
                iqtmp_1 = vmull_s16(vget_high_s16(q_10), siq);
                iqtmp_2 = vmull_s16(vget_low_s16(q_11), siq);
                iqtmp_3 = vmull_s16(vget_high_s16(q_11), siq);

                iqtmp_0 = vaddq_s32(iqtmp_0, add_iq);
                iqtmp_1 = vaddq_s32(iqtmp_1, add_iq);
                iqtmp_2 = vaddq_s32(iqtmp_2, add_iq);
                iqtmp_3 = vaddq_s32(iqtmp_3, add_iq);

                iqtmp_0 = vshlq_s32(iqtmp_0, sh_iq);
                iqtmp_1 = vshlq_s32(iqtmp_1, sh_iq);
                iqtmp_2 = vshlq_s32(iqtmp_2, sh_iq);
                iqtmp_3 = vshlq_s32(iqtmp_3, sh_iq);

                /* clip */
                iq0 = vqmovn_s32(iqtmp_0);
                iq1 = vqmovn_s32(iqtmp_1);
                iq2 = vqmovn_s32(iqtmp_2);
                iq3 = vqmovn_s32(iqtmp_3);

                /* store */
                vst1_s16((pi2_iq_dst + j), iq0);
                vst1_s16((pi2_iq_dst + j + dst_iq_strd), iq1);
                vst1_s16((pi2_iq_dst + j + (2 * dst_iq_strd)), iq2);
                vst1_s16((pi2_iq_dst + j + (3 * dst_iq_strd)), iq3);

                /* ssd */
                /* trans_coeff - inv.quant */
                r0 = vsubl_s16(s0, iq0);
                r1 = vsubl_s16(s1, iq1);
                r2 = vsubl_s16(s2, iq2);
                r3 = vsubl_s16(s3, iq3);

                /* SD */
                r0 = vmulq_s32(r0, r0);
                r1 = vmulq_s32(r1, r1);
                r2 = vmulq_s32(r2, r2);
                r3 = vmulq_s32(r3, r3);
            }
            else
            {
                /* store */
                vst1_s16((pi2_iq_dst + j), zero_d);
                vst1_s16((pi2_iq_dst + j + dst_iq_strd), zero_d);
                vst1_s16((pi2_iq_dst + j + (2 * dst_iq_strd)), zero_d);
                vst1_s16((pi2_iq_dst + j + (3 * dst_iq_strd)), zero_d);

                /* SD */
                r0 = vmull_s16(s0, s0);
                r1 = vmull_s16(s1, s1);
                r2 = vmull_s16(s2, s2);
                r3 = vmull_s16(s3, s3);
            }

            /* SSD */
            r0 = vaddq_s32(r0, r1);
            r2 = vaddq_s32(r2, r3);

            r0 = vaddq_s32(r0, r2);

            /* SSD Accumulation */
            ssd0 = vaddq_s32(ssd0, r0);

            cbf = cbf || (*(csbf + block_col)); // cbf update
            block_col++;
        }

        block_col = 0;
        block_row += 4;
        csbf += csbf_strd;

        pi2_coeffs += 4 * src_strd;
        pi2_q_dst += 4 * dst_q_strd;
        pi2_iq_dst += 4 * dst_iq_strd;
        pi2_quant_coeff += 4 * trans_size;
        pi4_quant_round_factor_1_2 += 4 * trans_size;
        pi4_quant_round_factor_0_1 += 4 * trans_size;
    }

    /* SSD Computation */
    ssd1 = vpadd_s32(vget_low_s32(ssd0), vget_high_s32(ssd0));
    ssd1 = vpadd_s32(ssd1, ssd1);
    ssd = vget_lane_s32(ssd1, 0);

    *zero_col = ~temp_zero_col; //final zero_col storing
    *zero_row = ~temp_zero_row; //final zero_row storing

    /* Store the cost */
    *pi8_cost = ssd;

    return cbf;
}