1 /*
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <emmintrin.h>
12 #include <xmmintrin.h>
13
14 #include "./vpx_dsp_rtcd.h"
15 #include "vpx/vpx_integer.h"
16
load_coefficients(const tran_low_t * coeff_ptr)17 static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) {
18 #if CONFIG_VP9_HIGHBITDEPTH
19 return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1],
20 (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3], (int16_t)coeff_ptr[4],
21 (int16_t)coeff_ptr[5], (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]);
22 #else
23 return _mm_load_si128((const __m128i *)coeff_ptr);
24 #endif
25 }
26
store_coefficients(__m128i coeff_vals,tran_low_t * coeff_ptr)27 static INLINE void store_coefficients(__m128i coeff_vals,
28 tran_low_t *coeff_ptr) {
29 #if CONFIG_VP9_HIGHBITDEPTH
30 __m128i one = _mm_set1_epi16(1);
31 __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one);
32 __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one);
33 __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi);
34 __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi);
35 _mm_store_si128((__m128i*)(coeff_ptr), coeff_vals_1);
36 _mm_store_si128((__m128i*)(coeff_ptr + 4), coeff_vals_2);
37 #else
38 _mm_store_si128((__m128i*)(coeff_ptr), coeff_vals);
39 #endif
40 }
41
vpx_quantize_b_sse2(const tran_low_t * coeff_ptr,intptr_t n_coeffs,int skip_block,const int16_t * zbin_ptr,const int16_t * round_ptr,const int16_t * quant_ptr,const int16_t * quant_shift_ptr,tran_low_t * qcoeff_ptr,tran_low_t * dqcoeff_ptr,const int16_t * dequant_ptr,uint16_t * eob_ptr,const int16_t * scan_ptr,const int16_t * iscan_ptr)42 void vpx_quantize_b_sse2(const tran_low_t* coeff_ptr, intptr_t n_coeffs,
43 int skip_block, const int16_t* zbin_ptr,
44 const int16_t* round_ptr, const int16_t* quant_ptr,
45 const int16_t* quant_shift_ptr, tran_low_t* qcoeff_ptr,
46 tran_low_t* dqcoeff_ptr, const int16_t* dequant_ptr,
47 uint16_t* eob_ptr,
48 const int16_t* scan_ptr,
49 const int16_t* iscan_ptr) {
50 __m128i zero;
51 (void)scan_ptr;
52
53 coeff_ptr += n_coeffs;
54 iscan_ptr += n_coeffs;
55 qcoeff_ptr += n_coeffs;
56 dqcoeff_ptr += n_coeffs;
57 n_coeffs = -n_coeffs;
58 zero = _mm_setzero_si128();
59 if (!skip_block) {
60 __m128i eob;
61 __m128i zbin;
62 __m128i round, quant, dequant, shift;
63 {
64 __m128i coeff0, coeff1;
65
66 // Setup global values
67 {
68 __m128i pw_1;
69 zbin = _mm_load_si128((const __m128i*)zbin_ptr);
70 round = _mm_load_si128((const __m128i*)round_ptr);
71 quant = _mm_load_si128((const __m128i*)quant_ptr);
72 pw_1 = _mm_set1_epi16(1);
73 zbin = _mm_sub_epi16(zbin, pw_1);
74 dequant = _mm_load_si128((const __m128i*)dequant_ptr);
75 shift = _mm_load_si128((const __m128i*)quant_shift_ptr);
76 }
77
78 {
79 __m128i coeff0_sign, coeff1_sign;
80 __m128i qcoeff0, qcoeff1;
81 __m128i qtmp0, qtmp1;
82 __m128i cmp_mask0, cmp_mask1;
83 // Do DC and first 15 AC
84 coeff0 = load_coefficients(coeff_ptr + n_coeffs);
85 coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8);
86
87 // Poor man's sign extract
88 coeff0_sign = _mm_srai_epi16(coeff0, 15);
89 coeff1_sign = _mm_srai_epi16(coeff1, 15);
90 qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
91 qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
92 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
93 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
94
95 cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
96 zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
97 cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
98 qcoeff0 = _mm_adds_epi16(qcoeff0, round);
99 round = _mm_unpackhi_epi64(round, round);
100 qcoeff1 = _mm_adds_epi16(qcoeff1, round);
101 qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
102 quant = _mm_unpackhi_epi64(quant, quant);
103 qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
104 qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
105 qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
106 qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
107 shift = _mm_unpackhi_epi64(shift, shift);
108 qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
109
110 // Reinsert signs
111 qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
112 qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
113 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
114 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
115
116 // Mask out zbin threshold coeffs
117 qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
118 qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
119
120 store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs);
121 store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8);
122
123 coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
124 dequant = _mm_unpackhi_epi64(dequant, dequant);
125 coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
126
127 store_coefficients(coeff0, dqcoeff_ptr + n_coeffs);
128 store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8);
129 }
130
131 {
132 // Scan for eob
133 __m128i zero_coeff0, zero_coeff1;
134 __m128i nzero_coeff0, nzero_coeff1;
135 __m128i iscan0, iscan1;
136 __m128i eob1;
137 zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
138 zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
139 nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
140 nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
141 iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
142 iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
143 // Add one to convert from indices to counts
144 iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
145 iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
146 eob = _mm_and_si128(iscan0, nzero_coeff0);
147 eob1 = _mm_and_si128(iscan1, nzero_coeff1);
148 eob = _mm_max_epi16(eob, eob1);
149 }
150 n_coeffs += 8 * 2;
151 }
152
153 // AC only loop
154 while (n_coeffs < 0) {
155 __m128i coeff0, coeff1;
156 {
157 __m128i coeff0_sign, coeff1_sign;
158 __m128i qcoeff0, qcoeff1;
159 __m128i qtmp0, qtmp1;
160 __m128i cmp_mask0, cmp_mask1;
161
162 coeff0 = load_coefficients(coeff_ptr + n_coeffs);
163 coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8);
164
165 // Poor man's sign extract
166 coeff0_sign = _mm_srai_epi16(coeff0, 15);
167 coeff1_sign = _mm_srai_epi16(coeff1, 15);
168 qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
169 qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
170 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
171 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
172
173 cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
174 cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
175 qcoeff0 = _mm_adds_epi16(qcoeff0, round);
176 qcoeff1 = _mm_adds_epi16(qcoeff1, round);
177 qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
178 qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
179 qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
180 qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
181 qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
182 qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
183
184 // Reinsert signs
185 qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
186 qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
187 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
188 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
189
190 // Mask out zbin threshold coeffs
191 qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
192 qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
193
194 store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs);
195 store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8);
196
197 coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
198 coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
199
200 store_coefficients(coeff0, dqcoeff_ptr + n_coeffs);
201 store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8);
202 }
203
204 {
205 // Scan for eob
206 __m128i zero_coeff0, zero_coeff1;
207 __m128i nzero_coeff0, nzero_coeff1;
208 __m128i iscan0, iscan1;
209 __m128i eob0, eob1;
210 zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
211 zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
212 nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
213 nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
214 iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
215 iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
216 // Add one to convert from indices to counts
217 iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
218 iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
219 eob0 = _mm_and_si128(iscan0, nzero_coeff0);
220 eob1 = _mm_and_si128(iscan1, nzero_coeff1);
221 eob0 = _mm_max_epi16(eob0, eob1);
222 eob = _mm_max_epi16(eob, eob0);
223 }
224 n_coeffs += 8 * 2;
225 }
226
227 // Accumulate EOB
228 {
229 __m128i eob_shuffled;
230 eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
231 eob = _mm_max_epi16(eob, eob_shuffled);
232 eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
233 eob = _mm_max_epi16(eob, eob_shuffled);
234 eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
235 eob = _mm_max_epi16(eob, eob_shuffled);
236 *eob_ptr = _mm_extract_epi16(eob, 1);
237 }
238 } else {
239 do {
240 store_coefficients(zero, dqcoeff_ptr + n_coeffs);
241 store_coefficients(zero, dqcoeff_ptr + n_coeffs + 8);
242 store_coefficients(zero, qcoeff_ptr + n_coeffs);
243 store_coefficients(zero, qcoeff_ptr + n_coeffs + 8);
244 n_coeffs += 8 * 2;
245 } while (n_coeffs < 0);
246 *eob_ptr = 0;
247 }
248 }
249