1 /*
2  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <arm_neon.h>
12 #include "vp8/encoder/block.h"
13 
/*
 * Lookup table indexed by a coefficient's position in the 16-entry block.
 * The values are a permutation of 1..16 and are consistent with the
 * one-based position of each coefficient in a 4x4 zig-zag scan.
 *
 * vp8_fast_quantize_b_neon() masks this table with the non-zero lanes of the
 * quantized block and takes the maximum, so a result of 0 means that no
 * coefficient survived quantization.
 */
static const uint16_t inv_zig_zag[16] = {
    /* entries  0.. 3 */ 1,  2,  6,  7,
    /* entries  4.. 7 */ 3,  5,  8,  13,
    /* entries  8..11 */ 4,  9,  12, 14,
    /* entries 12..15 */ 10, 11, 15, 16,
};
20 
/*
 * NEON "fast" quantizer for one block of 16 coefficients.
 *
 * Reads 16 int16 values from each of b->coeff, b->round, b->quant_fast and
 * d->dequant, handling them as two 8-lane vectors (entries 0-7 and 8-15).
 *
 * Writes:
 *   d->qcoeff[0..15]  - the quantized coefficients
 *   d->dqcoeff[0..15] - the quantized coefficients times d->dequant
 *   *d->eob           - the largest inv_zig_zag[] entry among the non-zero
 *                       quantized coefficients, or 0 if all are zero
 *
 * Every input is loaded before any output is stored.
 */
void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) {
    /* Inputs, split into low (0-7) and high (8-15) halves. */
    const int16x8_t coeff_lo = vld1q_s16(b->coeff);
    const int16x8_t coeff_hi = vld1q_s16(b->coeff + 8);
    const int16x8_t round_lo = vld1q_s16(b->round);
    const int16x8_t round_hi = vld1q_s16(b->round + 8);
    const int16x8_t quant_lo = vld1q_s16(b->quant_fast);
    const int16x8_t quant_hi = vld1q_s16(b->quant_fast + 8);
    const int16x8_t dequant_lo = vld1q_s16(d->dequant);
    const int16x8_t dequant_hi = vld1q_s16(d->dequant + 8);
    const uint16x8_t scan_lo = vld1q_u16(inv_zig_zag);
    const uint16x8_t scan_hi = vld1q_u16(inv_zig_zag + 8);
    const int16x8_t all_bits = vdupq_n_s16(-1);

    /* Sign mask per lane: -1 where the coefficient is negative, else 0. */
    const int16x8_t sign_lo = vshrq_n_s16(coeff_lo, 15);
    const int16x8_t sign_hi = vshrq_n_s16(coeff_hi, 15);

    /* |coeff| + round */
    const int16x8_t biased_lo = vaddq_s16(vabsq_s16(coeff_lo), round_lo);
    const int16x8_t biased_hi = vaddq_s16(vabsq_s16(coeff_hi), round_hi);

    /*
     * vqdmulhq returns the high half of the *doubled* product, so the
     * result is shifted right by one to undo the doubling.
     */
    const int16x8_t mag_lo =
        vshrq_n_s16(vqdmulhq_s16(biased_lo, quant_lo), 1);
    const int16x8_t mag_hi =
        vshrq_n_s16(vqdmulhq_s16(biased_hi, quant_hi), 1);

    /*
     * Re-apply the sign: (m ^ s) - s negates m where s == -1 and leaves it
     * unchanged where s == 0.
     */
    const int16x8_t q_lo = vsubq_s16(veorq_s16(mag_lo, sign_lo), sign_lo);
    const int16x8_t q_hi = vsubq_s16(veorq_s16(mag_hi, sign_hi), sign_hi);

    /*
     * Testing against all-ones gives 0xFFFF in every non-zero lane; ANDing
     * with the table keeps the table entry for those lanes and 0 elsewhere.
     */
    const uint16x8_t pos_lo = vandq_u16(vtstq_s16(q_lo, all_bits), scan_lo);
    const uint16x8_t pos_hi = vandq_u16(vtstq_s16(q_hi, all_bits), scan_hi);

    /* Horizontal maximum: 16 lanes -> 8 -> 4 -> (widened) 2 -> 1. */
    const uint16x8_t max8 = vmaxq_u16(pos_lo, pos_hi);
    const uint16x4_t max4 = vmax_u16(vget_low_u16(max8), vget_high_u16(max8));
    const uint32x4_t wide4 = vmovl_u16(max4);
    const uint32x2_t max2 =
        vmax_u32(vget_low_u32(wide4), vget_high_u32(wide4));
    const uint32x2_t eob_vec = vpmax_u32(max2, max2);

    /* Quantized coefficients. */
    vst1q_s16(d->qcoeff, q_lo);
    vst1q_s16(d->qcoeff + 8, q_hi);

    /* De-quantized coefficients: dequant * quantized value. */
    vst1q_s16(d->dqcoeff, vmulq_s16(dequant_lo, q_lo));
    vst1q_s16(d->dqcoeff + 8, vmulq_s16(dequant_hi, q_hi));

    /*
     * Store byte lane 0 of the reduced maximum through d->eob. The maximum
     * is at most 16 (the largest table entry), so it fits in one byte.
     * NOTE(review): this relies on byte lane 0 being the low byte of the
     * 32-bit lane - confirm for big-endian builds.
     */
    vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(eob_vec), 0);
}
90