/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10 
11 #include <emmintrin.h>
12 #include <xmmintrin.h>
13 
14 #include "./vpx_dsp_rtcd.h"
15 #include "vpx/vpx_integer.h"
16 
load_coefficients(const tran_low_t * coeff_ptr)17 static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) {
18 #if CONFIG_VP9_HIGHBITDEPTH
19   return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1],
20       (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3], (int16_t)coeff_ptr[4],
21       (int16_t)coeff_ptr[5], (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]);
22 #else
23   return _mm_load_si128((const __m128i *)coeff_ptr);
24 #endif
25 }
26 
store_coefficients(__m128i coeff_vals,tran_low_t * coeff_ptr)27 static INLINE void store_coefficients(__m128i coeff_vals,
28                                       tran_low_t *coeff_ptr) {
29 #if CONFIG_VP9_HIGHBITDEPTH
30   __m128i one = _mm_set1_epi16(1);
31   __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one);
32   __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one);
33   __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi);
34   __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi);
35   _mm_store_si128((__m128i*)(coeff_ptr), coeff_vals_1);
36   _mm_store_si128((__m128i*)(coeff_ptr + 4), coeff_vals_2);
37 #else
38   _mm_store_si128((__m128i*)(coeff_ptr), coeff_vals);
39 #endif
40 }
41 
vpx_quantize_b_sse2(const tran_low_t * coeff_ptr,intptr_t n_coeffs,int skip_block,const int16_t * zbin_ptr,const int16_t * round_ptr,const int16_t * quant_ptr,const int16_t * quant_shift_ptr,tran_low_t * qcoeff_ptr,tran_low_t * dqcoeff_ptr,const int16_t * dequant_ptr,uint16_t * eob_ptr,const int16_t * scan_ptr,const int16_t * iscan_ptr)42 void vpx_quantize_b_sse2(const tran_low_t* coeff_ptr, intptr_t n_coeffs,
43                          int skip_block, const int16_t* zbin_ptr,
44                          const int16_t* round_ptr, const int16_t* quant_ptr,
45                          const int16_t* quant_shift_ptr, tran_low_t* qcoeff_ptr,
46                          tran_low_t* dqcoeff_ptr, const int16_t* dequant_ptr,
47                          uint16_t* eob_ptr,
48                          const int16_t* scan_ptr,
49                          const int16_t* iscan_ptr) {
50   __m128i zero;
51   (void)scan_ptr;
52 
53   coeff_ptr += n_coeffs;
54   iscan_ptr += n_coeffs;
55   qcoeff_ptr += n_coeffs;
56   dqcoeff_ptr += n_coeffs;
57   n_coeffs = -n_coeffs;
58   zero = _mm_setzero_si128();
59   if (!skip_block) {
60     __m128i eob;
61     __m128i zbin;
62     __m128i round, quant, dequant, shift;
63     {
64       __m128i coeff0, coeff1;
65 
66       // Setup global values
67       {
68         __m128i pw_1;
69         zbin = _mm_load_si128((const __m128i*)zbin_ptr);
70         round = _mm_load_si128((const __m128i*)round_ptr);
71         quant = _mm_load_si128((const __m128i*)quant_ptr);
72         pw_1 = _mm_set1_epi16(1);
73         zbin = _mm_sub_epi16(zbin, pw_1);
74         dequant = _mm_load_si128((const __m128i*)dequant_ptr);
75         shift = _mm_load_si128((const __m128i*)quant_shift_ptr);
76       }
77 
78       {
79         __m128i coeff0_sign, coeff1_sign;
80         __m128i qcoeff0, qcoeff1;
81         __m128i qtmp0, qtmp1;
82         __m128i cmp_mask0, cmp_mask1;
83         // Do DC and first 15 AC
84         coeff0 = load_coefficients(coeff_ptr + n_coeffs);
85         coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8);
86 
87         // Poor man's sign extract
88         coeff0_sign = _mm_srai_epi16(coeff0, 15);
89         coeff1_sign = _mm_srai_epi16(coeff1, 15);
90         qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
91         qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
92         qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
93         qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
94 
95         cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
96         zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
97         cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
98         qcoeff0 = _mm_adds_epi16(qcoeff0, round);
99         round = _mm_unpackhi_epi64(round, round);
100         qcoeff1 = _mm_adds_epi16(qcoeff1, round);
101         qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
102         quant = _mm_unpackhi_epi64(quant, quant);
103         qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
104         qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
105         qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
106         qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
107         shift = _mm_unpackhi_epi64(shift, shift);
108         qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
109 
110         // Reinsert signs
111         qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
112         qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
113         qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
114         qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
115 
116         // Mask out zbin threshold coeffs
117         qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
118         qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
119 
120         store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs);
121         store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8);
122 
123         coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
124         dequant = _mm_unpackhi_epi64(dequant, dequant);
125         coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
126 
127         store_coefficients(coeff0, dqcoeff_ptr + n_coeffs);
128         store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8);
129       }
130 
131       {
132         // Scan for eob
133         __m128i zero_coeff0, zero_coeff1;
134         __m128i nzero_coeff0, nzero_coeff1;
135         __m128i iscan0, iscan1;
136         __m128i eob1;
137         zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
138         zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
139         nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
140         nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
141         iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
142         iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
143         // Add one to convert from indices to counts
144         iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
145         iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
146         eob = _mm_and_si128(iscan0, nzero_coeff0);
147         eob1 = _mm_and_si128(iscan1, nzero_coeff1);
148         eob = _mm_max_epi16(eob, eob1);
149       }
150       n_coeffs += 8 * 2;
151     }
152 
153     // AC only loop
154     while (n_coeffs < 0) {
155       __m128i coeff0, coeff1;
156       {
157         __m128i coeff0_sign, coeff1_sign;
158         __m128i qcoeff0, qcoeff1;
159         __m128i qtmp0, qtmp1;
160         __m128i cmp_mask0, cmp_mask1;
161 
162         coeff0 = load_coefficients(coeff_ptr + n_coeffs);
163         coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8);
164 
165         // Poor man's sign extract
166         coeff0_sign = _mm_srai_epi16(coeff0, 15);
167         coeff1_sign = _mm_srai_epi16(coeff1, 15);
168         qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
169         qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
170         qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
171         qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
172 
173         cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
174         cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
175         qcoeff0 = _mm_adds_epi16(qcoeff0, round);
176         qcoeff1 = _mm_adds_epi16(qcoeff1, round);
177         qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
178         qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
179         qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
180         qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
181         qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
182         qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
183 
184         // Reinsert signs
185         qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
186         qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
187         qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
188         qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
189 
190         // Mask out zbin threshold coeffs
191         qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
192         qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
193 
194         store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs);
195         store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8);
196 
197         coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
198         coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
199 
200         store_coefficients(coeff0, dqcoeff_ptr + n_coeffs);
201         store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8);
202       }
203 
204       {
205         // Scan for eob
206         __m128i zero_coeff0, zero_coeff1;
207         __m128i nzero_coeff0, nzero_coeff1;
208         __m128i iscan0, iscan1;
209         __m128i eob0, eob1;
210         zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
211         zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
212         nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
213         nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
214         iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
215         iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
216         // Add one to convert from indices to counts
217         iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
218         iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
219         eob0 = _mm_and_si128(iscan0, nzero_coeff0);
220         eob1 = _mm_and_si128(iscan1, nzero_coeff1);
221         eob0 = _mm_max_epi16(eob0, eob1);
222         eob = _mm_max_epi16(eob, eob0);
223       }
224       n_coeffs += 8 * 2;
225     }
226 
227     // Accumulate EOB
228     {
229       __m128i eob_shuffled;
230       eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
231       eob = _mm_max_epi16(eob, eob_shuffled);
232       eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
233       eob = _mm_max_epi16(eob, eob_shuffled);
234       eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
235       eob = _mm_max_epi16(eob, eob_shuffled);
236       *eob_ptr = _mm_extract_epi16(eob, 1);
237     }
238   } else {
239     do {
240       store_coefficients(zero, dqcoeff_ptr + n_coeffs);
241       store_coefficients(zero, dqcoeff_ptr + n_coeffs + 8);
242       store_coefficients(zero, qcoeff_ptr + n_coeffs);
243       store_coefficients(zero, qcoeff_ptr + n_coeffs + 8);
244       n_coeffs += 8 * 2;
245     } while (n_coeffs < 0);
246     *eob_ptr = 0;
247   }
248 }
249