/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "vpx_config.h"
#include "vp8_rtcd.h"
#include "vpx_ports/x86.h"
#include "vpx_mem/vpx_mem.h"
#include "vp8/encoder/block.h"
#include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */

#include <mmintrin.h>  /* MMX */
#include <xmmintrin.h> /* SSE */
#include <emmintrin.h> /* SSE2 */

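/* Pick the end of block (eob) by scanning coefficients in zig-zag order.
 * i is the 1-based scan position and z is the raster index of that position.
 * If the rebalanced magnitude x[z] is below the current zero-run boost, or
 * the quantized value y[z] is zero, the coefficient is skipped and the boost
 * pointer advances to the next zero-run entry. Otherwise the value is written
 * to qcoeff, eob is updated to i, and the boost pointer is reset to the start
 * of zrun_zbin_boost. */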
#define SELECT_EOB(i, z)                    \
  do {                                      \
    short boost = *zbin_boost_ptr;          \
    int cmp = (x[z] < boost) | (y[z] == 0); \
    zbin_boost_ptr++;                       \
    if (cmp) break;                         \
    qcoeff_ptr[z] = y[z];                   \
    eob = i;                                \
    zbin_boost_ptr = b->zrun_zbin_boost;    \
  } while (0)

void vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d) {
  char eob = 0;
  short *zbin_boost_ptr;
  short *qcoeff_ptr = d->qcoeff;
  DECLARE_ALIGNED(16, short, x[16]);
  DECLARE_ALIGNED(16, short, y[16]);

  __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1;
  __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift));
  __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8));
  __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
  __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8));
  __m128i zbin_extra = _mm_cvtsi32_si128(b->zbin_extra);
  __m128i zbin0 = _mm_load_si128((__m128i *)(b->zbin));
  __m128i zbin1 = _mm_load_si128((__m128i *)(b->zbin + 8));
  __m128i round0 = _mm_load_si128((__m128i *)(b->round));
  __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
  __m128i quant0 = _mm_load_si128((__m128i *)(b->quant));
  __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8));
  __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
  __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));

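  /* Clear all 16 qcoeff values (16 * sizeof(short) = 32 bytes); SELECT_EOB
   * below only writes back the coefficients that pass the zero-bin check. */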
  memset(qcoeff_ptr, 0, 32);

  /* Duplicate to all lanes. */
  zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0);
  zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra);

  /* Sign of z: z >> 15 */
  sz0 = _mm_srai_epi16(z0, 15);
  sz1 = _mm_srai_epi16(z1, 15);

  /* x = abs(z): (z ^ sz) - sz */
  x0 = _mm_xor_si128(z0, sz0);
  x1 = _mm_xor_si128(z1, sz1);
  x0 = _mm_sub_epi16(x0, sz0);
  x1 = _mm_sub_epi16(x1, sz1);

  /* zbin[] + zbin_extra */
  zbin0 = _mm_add_epi16(zbin0, zbin_extra);
  zbin1 = _mm_add_epi16(zbin1, zbin_extra);

  /* In C x is compared to zbin where zbin = zbin[] + boost + extra. Rebalance
   * the equation because boost is the only value which can change:
   * x - (zbin[] + extra) >= boost */
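  /* Illustrative numbers (not from the codec tables): with |z| = 40,
   * zbin[] = 24, extra = 6 and boost = 8, the C code tests 40 >= 24 + 8 + 6.
   * Here we precompute 40 - (24 + 6) = 10 so SELECT_EOB only has to compare
   * 10 against the boost value 8. */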
  x_minus_zbin0 = _mm_sub_epi16(x0, zbin0);
  x_minus_zbin1 = _mm_sub_epi16(x1, zbin1);

  _mm_store_si128((__m128i *)(x), x_minus_zbin0);
  _mm_store_si128((__m128i *)(x + 8), x_minus_zbin1);

  /* All the remaining calculations are valid whether they are done now with
   * simd or later inside the loop one at a time. */
  x0 = _mm_add_epi16(x0, round0);
  x1 = _mm_add_epi16(x1, round1);

  y0 = _mm_mulhi_epi16(x0, quant0);
  y1 = _mm_mulhi_epi16(x1, quant1);

  y0 = _mm_add_epi16(y0, x0);
  y1 = _mm_add_epi16(y1, x1);

  /* Instead of shifting each value independently we convert the scaling
   * factor with 1 << (16 - shift) so we can use multiply/return high half. */
  y0 = _mm_mulhi_epi16(y0, quant_shift0);
  y1 = _mm_mulhi_epi16(y1, quant_shift1);

  /* Return the sign: (y ^ sz) - sz */
  y0 = _mm_xor_si128(y0, sz0);
  y1 = _mm_xor_si128(y1, sz1);
  y0 = _mm_sub_epi16(y0, sz0);
  y1 = _mm_sub_epi16(y1, sz1);

  _mm_store_si128((__m128i *)(y), y0);
  _mm_store_si128((__m128i *)(y + 8), y1);

  zbin_boost_ptr = b->zrun_zbin_boost;

  /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. */
  SELECT_EOB(1, 0);
  SELECT_EOB(2, 1);
  SELECT_EOB(3, 4);
  SELECT_EOB(4, 8);
  SELECT_EOB(5, 5);
  SELECT_EOB(6, 2);
  SELECT_EOB(7, 3);
  SELECT_EOB(8, 6);
  SELECT_EOB(9, 9);
  SELECT_EOB(10, 12);
  SELECT_EOB(11, 13);
  SELECT_EOB(12, 10);
  SELECT_EOB(13, 7);
  SELECT_EOB(14, 11);
  SELECT_EOB(15, 14);
  SELECT_EOB(16, 15);

  y0 = _mm_load_si128((__m128i *)(d->qcoeff));
  y1 = _mm_load_si128((__m128i *)(d->qcoeff + 8));

  /* dqcoeff = qcoeff * dequant */
  y0 = _mm_mullo_epi16(y0, dequant0);
  y1 = _mm_mullo_epi16(y1, dequant1);

  _mm_store_si128((__m128i *)(d->dqcoeff), y0);
  _mm_store_si128((__m128i *)(d->dqcoeff + 8), y1);

  *d->eob = eob;
}

void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d) {
  __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
  __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8));
  __m128i round0 = _mm_load_si128((__m128i *)(b->round));
  __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
  __m128i quant_fast0 = _mm_load_si128((__m128i *)(b->quant_fast));
  __m128i quant_fast1 = _mm_load_si128((__m128i *)(b->quant_fast + 8));
  __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
  __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
  __m128i inv_zig_zag0 =
      _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag));
  __m128i inv_zig_zag1 =
      _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag + 8));
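  /* vp8_default_inv_zig_zag maps each raster position to its 1-based zig-zag
   * scan index; it is used below to derive eob without a per-coefficient
   * table lookup. */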

  __m128i sz0, sz1, x0, x1, y0, y1, xdq0, xdq1, zeros, ones;

  /* sign of z: z >> 15 */
  sz0 = _mm_srai_epi16(z0, 15);
  sz1 = _mm_srai_epi16(z1, 15);

  /* x = abs(z): (z ^ sz) - sz */
  x0 = _mm_xor_si128(z0, sz0);
  x1 = _mm_xor_si128(z1, sz1);
  x0 = _mm_sub_epi16(x0, sz0);
  x1 = _mm_sub_epi16(x1, sz1);

  /* x += round */
  x0 = _mm_add_epi16(x0, round0);
  x1 = _mm_add_epi16(x1, round1);

  /* y = (x * quant) >> 16 */
  y0 = _mm_mulhi_epi16(x0, quant_fast0);
  y1 = _mm_mulhi_epi16(x1, quant_fast1);

  /* Restore the sign of z: x = (y ^ sz) - sz */
  y0 = _mm_xor_si128(y0, sz0);
  y1 = _mm_xor_si128(y1, sz1);
  x0 = _mm_sub_epi16(y0, sz0);
  x1 = _mm_sub_epi16(y1, sz1);

  /* qcoeff = x */
  _mm_store_si128((__m128i *)(d->qcoeff), x0);
  _mm_store_si128((__m128i *)(d->qcoeff + 8), x1);

  /* x * dequant */
  xdq0 = _mm_mullo_epi16(x0, dequant0);
  xdq1 = _mm_mullo_epi16(x1, dequant1);

  /* dqcoeff = x * dequant */
  _mm_store_si128((__m128i *)(d->dqcoeff), xdq0);
  _mm_store_si128((__m128i *)(d->dqcoeff + 8), xdq1);

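  /* Find eob: keep the 1-based scan index of every nonzero coefficient, zero
   * the rest, then reduce with a horizontal maximum over the 16 lanes. */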
  /* build a mask for the zig zag */
  zeros = _mm_setzero_si128();

  x0 = _mm_cmpeq_epi16(x0, zeros);
  x1 = _mm_cmpeq_epi16(x1, zeros);

  ones = _mm_cmpeq_epi16(zeros, zeros);

  x0 = _mm_xor_si128(x0, ones);
  x1 = _mm_xor_si128(x1, ones);

  x0 = _mm_and_si128(x0, inv_zig_zag0);
  x1 = _mm_and_si128(x1, inv_zig_zag1);

  x0 = _mm_max_epi16(x0, x1);

  /* now down to 8 */
  x1 = _mm_shuffle_epi32(x0, 0xE);  // 0b00001110

  x0 = _mm_max_epi16(x0, x1);

  /* only 4 left */
  x1 = _mm_shufflelo_epi16(x0, 0xE);  // 0b00001110

  x0 = _mm_max_epi16(x0, x1);

  /* okay, just 2! */
  x1 = _mm_shufflelo_epi16(x0, 0x1);  // 0b00000001

  x0 = _mm_max_epi16(x0, x1);

  *d->eob = 0xFF & _mm_cvtsi128_si32(x0);
}