1 /*
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <arm_neon.h>
12
13 #include "./vp8_rtcd.h"
14 #include "vp8/encoder/block.h"
15
/*
 * Inverse zig-zag lookup, indexed by a coefficient's storage position in the
 * block (shown here as a 4x4 grid for readability).  Each entry is that
 * coefficient's 1-based position in the zig-zag scan.  Positions start at 1
 * so that a value of 0 can stand for "coefficient quantized to zero" when the
 * table is masked against the non-zero lanes.
 */
static const uint16_t inv_zig_zag[16] = {
  1,  2,  6,  7,
  3,  5,  8,  13,
  4,  9,  12, 14,
  10, 11, 15, 16
};
18
/*
 * NEON fast quantizer for one block of 16 coefficients.
 *
 * Reads 16 int16 values each from b->coeff, b->round, b->quant_fast and
 * d->dequant, processed as two 8-lane halves ("lo" = elements 0..7,
 * "hi" = elements 8..15).
 *
 * Writes:
 *   d->qcoeff[0..15]  - quantized coefficients, sign restored
 *   d->dqcoeff[0..15] - qcoeff * dequant (low 16 bits of each product)
 *   *d->eob           - largest inv_zig_zag[] entry among the non-zero
 *                       quantized coefficients, or 0 if all are zero
 *
 * All inputs are loaded before any output is stored.
 */
void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) {
  /* All-ones test mask: (v & 0xffff) != 0 is simply v != 0. */
  const int16x8_t all_ones = vdupq_n_s16(-1);

  const int16x8_t coeff_lo = vld1q_s16(b->coeff);
  const int16x8_t coeff_hi = vld1q_s16(b->coeff + 8);
  const int16x8_t round_lo = vld1q_s16(b->round);
  const int16x8_t round_hi = vld1q_s16(b->round + 8);
  const int16x8_t quant_lo = vld1q_s16(b->quant_fast);
  const int16x8_t quant_hi = vld1q_s16(b->quant_fast + 8);
  const int16x8_t dequant_lo = vld1q_s16(d->dequant);
  const int16x8_t dequant_hi = vld1q_s16(d->dequant + 8);
  const uint16x8_t scan_lo = vld1q_u16(inv_zig_zag);
  const uint16x8_t scan_hi = vld1q_u16(inv_zig_zag + 8);

  /* Per-lane sign mask: 0 for non-negative input, -1 for negative. */
  const int16x8_t sign_lo = vshrq_n_s16(coeff_lo, 15);
  const int16x8_t sign_hi = vshrq_n_s16(coeff_hi, 15);

  /* Magnitude plus rounding offset: |z| + round. */
  const int16x8_t biased_lo = vaddq_s16(vabsq_s16(coeff_lo), round_lo);
  const int16x8_t biased_hi = vaddq_s16(vabsq_s16(coeff_hi), round_hi);

  /*
   * vqdmulhq yields (2 * a * b) >> 16; the extra shift right by one undoes
   * the doubling so the result is (a * b) >> 16.
   */
  const int16x8_t level_lo = vshrq_n_s16(vqdmulhq_s16(biased_lo, quant_lo), 1);
  const int16x8_t level_hi = vshrq_n_s16(vqdmulhq_s16(biased_hi, quant_hi), 1);

  /* Put the sign back: (v ^ s) - s negates v exactly where s == -1. */
  const int16x8_t q_lo = vsubq_s16(veorq_s16(level_lo, sign_lo), sign_lo);
  const int16x8_t q_hi = vsubq_s16(veorq_s16(level_hi, sign_hi), sign_hi);

  /*
   * Keep the scan position of every non-zero lane (zero lanes become 0),
   * then fold both halves together with a lane-wise maximum.
   */
  const uint16x8_t pos_lo = vandq_u16(vtstq_s16(q_lo, all_ones), scan_lo);
  const uint16x8_t pos_hi = vandq_u16(vtstq_s16(q_hi, all_ones), scan_hi);
  const uint16x8_t pos_max = vmaxq_u16(pos_lo, pos_hi);

#ifdef __aarch64__
  /* AArch64 has a single-instruction horizontal maximum. */
  *d->eob = (int8_t)vmaxvq_u16(pos_max);
#else
  {
    /* 32-bit ARM: reduce 8 lanes -> 4 -> 2 -> 1 step by step. */
    const uint16x4_t max_d16 =
        vmax_u16(vget_low_u16(pos_max), vget_high_u16(pos_max));
    const uint32x4_t max_q32 = vmovl_u16(max_d16);
    const uint32x2_t max_pair =
        vmax_u32(vget_low_u32(max_q32), vget_high_u32(max_q32));
    const uint32x2_t max_d32 = vpmax_u32(max_pair, max_pair);

    /* Only byte lane 0 of the reduced vector is stored to *d->eob. */
    vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(max_d32), 0);
  }
#endif  // __aarch64__

  /* qcoeff = quantized level */
  vst1q_s16(d->qcoeff, q_lo);
  vst1q_s16(d->qcoeff + 8, q_hi);

  /* dqcoeff = quantized level * dequant */
  vst1q_s16(d->dqcoeff, vmulq_s16(dequant_lo, q_lo));
  vst1q_s16(d->dqcoeff + 8, vmulq_s16(dequant_hi, q_hi));
}
92