1 /*
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <assert.h>
12 #include <emmintrin.h>
13
14 #include "./vpx_dsp_rtcd.h"
15 #include "vpx_dsp/vpx_dsp_common.h"
16 #include "vpx_mem/vpx_mem.h"
17 #include "vpx_ports/mem.h"
18
19 #if CONFIG_VP9_HIGHBITDEPTH
// Quantizes `count` coefficients, processing them four at a time as 32-bit
// lanes of an SSE2 register.
//
// coeff_ptr is read with aligned 128-bit loads, so it must be 16-byte aligned.
// For every table (zbin, round, quant, quant_shift, dequant) element [0] is
// applied to coefficient 0 and element [1] to every other coefficient.
//
// qcoeff_ptr and dqcoeff_ptr are zeroed first; only coefficients whose
// magnitude is >= the zero-bin threshold are then quantized and written.
// *eob_ptr receives one plus the largest iscan[] value among positions whose
// quantized magnitude is non-zero, or 0 when there is none.
//
// scan is unused. skip_block must be 0 (checked by assert only).
void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
                                int skip_block, const int16_t *zbin_ptr,
                                const int16_t *round_ptr,
                                const int16_t *quant_ptr,
                                const int16_t *quant_shift_ptr,
                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
                                const int16_t *scan, const int16_t *iscan) {
  int live_regs = (int)count / 4;  // registers left after trimming the tail
  int last_iscan = -1;
  int reg, lane;
  __m128i zbins[2];
  __m128i nzbins[2];

  // Index 0 is used for the first register only: its lowest lane holds
  // zbin_ptr[0], the remaining lanes zbin_ptr[1]. Index 1 is all zbin_ptr[1].
  zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1],
                           (int)zbin_ptr[0]);
  zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]);

  // Negated thresholds (0 - zbin) for the lower edge of the dead zone.
  nzbins[0] = _mm_sub_epi32(_mm_setzero_si128(), zbins[0]);
  nzbins[1] = _mm_sub_epi32(_mm_setzero_si128(), zbins[1]);

  (void)scan;
  (void)skip_block;
  assert(!skip_block);

  memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));

  // Pre-scan pass: walk backwards from the last register and drop every
  // trailing register whose four lanes all lie strictly inside
  // (-zbin, zbin). Stop at the first register that does not.
  while (live_regs > 0) {
    const int tail = live_regs - 1;
    const __m128i vals =
        _mm_load_si128((const __m128i *)(coeff_ptr + tail * 4));
    const __m128i below = _mm_cmplt_epi32(vals, zbins[tail != 0]);
    const __m128i above = _mm_cmpgt_epi32(vals, nzbins[tail != 0]);
    const int in_dead_zone = _mm_movemask_epi8(_mm_and_si128(below, above));
    if (in_dead_zone != 0xffff) {
      break;
    }
    --live_regs;
  }

  // Quantization pass over the registers that survived the trim.
  for (reg = 0; reg < live_regs; ++reg) {
    const __m128i vals = _mm_load_si128((const __m128i *)(coeff_ptr + reg * 4));
    // sign is all-ones for negative lanes, zero otherwise; (x ^ s) - s is |x|.
    const __m128i sign = _mm_srai_epi32(vals, 31);
    const __m128i mag = _mm_sub_epi32(_mm_xor_si128(vals, sign), sign);
    const __m128i thresh = zbins[reg != 0];
    // SSE2 has no >= compare, so combine > and ==.
    const __m128i at_or_over = _mm_or_si128(_mm_cmpgt_epi32(mag, thresh),
                                            _mm_cmpeq_epi32(mag, thresh));
    // One mask bit per byte: lane `n` owns bits 4n..4n+3.
    const int lane_mask = _mm_movemask_epi8(at_or_over);
    int mags[4];
    int signs[4];

    _mm_storeu_si128((__m128i *)mags, mag);
    _mm_storeu_si128((__m128i *)signs, sign);

    for (lane = 0; lane < 4; ++lane) {
      if (lane_mask & (1 << (4 * lane))) {
        const int pos = 4 * reg + lane;
        const int tbl = pos != 0;
        const int64_t rounded = mags[lane] + round_ptr[tbl];
        const int64_t scaled = ((rounded * quant_ptr[tbl]) >> 16) + rounded;
        const uint32_t abs_qcoeff =
            (uint32_t)((scaled * quant_shift_ptr[tbl]) >> 16);
        // Re-apply the sign with the same xor/subtract trick.
        qcoeff_ptr[pos] = (int)(abs_qcoeff ^ signs[lane]) - signs[lane];
        dqcoeff_ptr[pos] = qcoeff_ptr[pos] * dequant_ptr[tbl];
        if (abs_qcoeff && iscan[pos] > last_iscan) {
          last_iscan = iscan[pos];
        }
      }
    }
  }

  *eob_ptr = last_iscan + 1;
}
95
// 32x32 variant of the high-bitdepth quantizer.
//
// The zero-bin and rounding values are halved with ROUND_POWER_OF_TWO(x, 1),
// the final shift is 15, and the dequantized value is divided by 2.
//
// coeff_ptr is read with aligned 128-bit loads, so it must be 16-byte aligned.
// For every table, element [0] is applied to coefficient 0 and element [1] to
// every other coefficient.
//
// qcoeff_ptr and dqcoeff_ptr are zeroed first. A SIMD pre-scan collects the
// positions whose coefficient is not strictly inside (-zbin, zbin); only
// those positions are quantized afterwards. *eob_ptr receives one plus the
// largest iscan[] value among positions whose quantized magnitude is
// non-zero, or 0 when there is none.
//
// scan is unused. skip_block must be 0 (checked by assert only).
// NOTE(review): the position buffer holds 1024 entries, so n_coeffs is
// presumably never larger than 1024 -- confirm against callers.
void vpx_highbd_quantize_b_32x32_sse2(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan) {
  const int half_zbin_dc = ROUND_POWER_OF_TWO(zbin_ptr[0], 1);
  const int half_zbin_ac = ROUND_POWER_OF_TWO(zbin_ptr[1], 1);
  __m128i zbins[2];
  __m128i nzbins[2];
  int picked[1024];  // positions selected by the pre-scan pass
  int num_picked = 0;
  int last_iscan = -1;
  int reg, lane, n;

  (void)scan;
  (void)skip_block;
  assert(!skip_block);

  // Index 0 is used for the first register only: its lowest lane holds the
  // halved zbin_ptr[0], the remaining lanes the halved zbin_ptr[1].
  zbins[0] =
      _mm_set_epi32(half_zbin_ac, half_zbin_ac, half_zbin_ac, half_zbin_dc);
  zbins[1] = _mm_set1_epi32(half_zbin_ac);

  // Negated thresholds (0 - zbin) for the lower edge of the dead zone.
  nzbins[0] = _mm_sub_epi32(_mm_setzero_si128(), zbins[0]);
  nzbins[1] = _mm_sub_epi32(_mm_setzero_si128(), zbins[1]);

  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

  // Pre-scan pass: record every position whose coefficient is outside the
  // open interval (-zbin, zbin).
  for (reg = 0; reg < n_coeffs / 4; ++reg) {
    const __m128i vals = _mm_load_si128((const __m128i *)(coeff_ptr + reg * 4));
    const __m128i below = _mm_cmplt_epi32(vals, zbins[reg != 0]);
    const __m128i above = _mm_cmpgt_epi32(vals, nzbins[reg != 0]);
    // One mask bit per byte: lane `n` owns bits 4n..4n+3, all set when that
    // lane is inside the dead zone.
    const int in_dead_zone = _mm_movemask_epi8(_mm_and_si128(below, above));

    for (lane = 0; lane < 4; ++lane) {
      if (!(in_dead_zone & (0xf << (4 * lane)))) {
        picked[num_picked++] = reg * 4 + lane;
      }
    }
  }

  // Quantization pass: only the positions picked above are processed.
  // num_picked may be zero, in which case nothing is written.
  for (n = 0; n < num_picked; ++n) {
    const int pos = picked[n];
    const int tbl = pos != 0;
    const int value = coeff_ptr[pos];
    // sign is -1 for negative values, 0 otherwise; (x ^ s) - s is |x|.
    const int sign = (value >> 31);
    const int magnitude = (value ^ sign) - sign;
    const int64_t rounded = magnitude + ROUND_POWER_OF_TWO(round_ptr[tbl], 1);
    const int64_t scaled = ((rounded * quant_ptr[tbl]) >> 16) + rounded;
    const uint32_t abs_qcoeff =
        (uint32_t)((scaled * quant_shift_ptr[tbl]) >> 15);

    // Re-apply the sign with the same xor/subtract trick.
    qcoeff_ptr[pos] = (int)(abs_qcoeff ^ sign) - sign;
    dqcoeff_ptr[pos] = qcoeff_ptr[pos] * dequant_ptr[tbl] / 2;
    if (abs_qcoeff && iscan[pos] > last_iscan) {
      last_iscan = iscan[pos];
    }
  }

  *eob_ptr = last_iscan + 1;
}
156 #endif
157