/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <emmintrin.h>

#include "./vp9_rtcd.h"
#include "vpx_ports/mem.h"

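// Computes the minimum and maximum absolute pixel difference between the 8x8
// source block at s (stride p) and the 8x8 prediction block at d (stride dp),
// reduced over all 64 pixel positions.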
void vp9_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
                         int *min, int *max) {
  __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
  u0 = _mm_setzero_si128();
  // Row 0
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff0 = _mm_max_epi16(diff, negdiff);
  // Row 1
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(absdiff0, absdiff);
  minabsdiff = _mm_min_epi16(absdiff0, absdiff);
  // Row 2
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 3
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 4
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 5
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 6
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 7
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);

  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8));
  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32));
  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16));
  *max = _mm_extract_epi16(maxabsdiff, 0);

  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8));
  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32));
  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16));
  *min = _mm_extract_epi16(minabsdiff, 0);
}

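// Returns the rounded average of the 8x8 pixel block at s with stride p.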
unsigned int vp9_avg_8x8_sse2(const uint8_t *s, int p) {
  __m128i s0, s1, u0;
  unsigned int avg = 0;
  u0 = _mm_setzero_si128();
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);

  s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8));
  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 32));
  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
  avg = _mm_extract_epi16(s0, 0);
  return (avg + 32) >> 6;
}

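// Returns the rounded average of the 4x4 pixel block at s with stride p.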
unsigned int vp9_avg_4x4_sse2(const uint8_t *s, int p) {
  __m128i s0, s1, u0;
  unsigned int avg = 0;
  u0 = _mm_setzero_si128();
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);

  s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4));
  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
  avg = _mm_extract_epi16(s0, 0);
  return (avg + 8) >> 4;
}

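// One 1-D 8-point Hadamard butterfly applied across the eight vectors in
// in[]. When iter == 0 the result is also transposed in place, so a second
// call with iter == 1 completes the 2-D transform.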
static void hadamard_col8_sse2(__m128i *in, int iter) {
  __m128i a0 = in[0];
  __m128i a1 = in[1];
  __m128i a2 = in[2];
  __m128i a3 = in[3];
  __m128i a4 = in[4];
  __m128i a5 = in[5];
  __m128i a6 = in[6];
  __m128i a7 = in[7];

  __m128i b0 = _mm_add_epi16(a0, a1);
  __m128i b1 = _mm_sub_epi16(a0, a1);
  __m128i b2 = _mm_add_epi16(a2, a3);
  __m128i b3 = _mm_sub_epi16(a2, a3);
  __m128i b4 = _mm_add_epi16(a4, a5);
  __m128i b5 = _mm_sub_epi16(a4, a5);
  __m128i b6 = _mm_add_epi16(a6, a7);
  __m128i b7 = _mm_sub_epi16(a6, a7);

  a0 = _mm_add_epi16(b0, b2);
  a1 = _mm_add_epi16(b1, b3);
  a2 = _mm_sub_epi16(b0, b2);
  a3 = _mm_sub_epi16(b1, b3);
  a4 = _mm_add_epi16(b4, b6);
  a5 = _mm_add_epi16(b5, b7);
  a6 = _mm_sub_epi16(b4, b6);
  a7 = _mm_sub_epi16(b5, b7);

  if (iter == 0) {
    b0 = _mm_add_epi16(a0, a4);
    b7 = _mm_add_epi16(a1, a5);
    b3 = _mm_add_epi16(a2, a6);
    b4 = _mm_add_epi16(a3, a7);
    b2 = _mm_sub_epi16(a0, a4);
    b6 = _mm_sub_epi16(a1, a5);
    b1 = _mm_sub_epi16(a2, a6);
    b5 = _mm_sub_epi16(a3, a7);

    a0 = _mm_unpacklo_epi16(b0, b1);
    a1 = _mm_unpacklo_epi16(b2, b3);
    a2 = _mm_unpackhi_epi16(b0, b1);
    a3 = _mm_unpackhi_epi16(b2, b3);
    a4 = _mm_unpacklo_epi16(b4, b5);
    a5 = _mm_unpacklo_epi16(b6, b7);
    a6 = _mm_unpackhi_epi16(b4, b5);
    a7 = _mm_unpackhi_epi16(b6, b7);

    b0 = _mm_unpacklo_epi32(a0, a1);
    b1 = _mm_unpacklo_epi32(a4, a5);
    b2 = _mm_unpackhi_epi32(a0, a1);
    b3 = _mm_unpackhi_epi32(a4, a5);
    b4 = _mm_unpacklo_epi32(a2, a3);
    b5 = _mm_unpacklo_epi32(a6, a7);
    b6 = _mm_unpackhi_epi32(a2, a3);
    b7 = _mm_unpackhi_epi32(a6, a7);

    in[0] = _mm_unpacklo_epi64(b0, b1);
    in[1] = _mm_unpackhi_epi64(b0, b1);
    in[2] = _mm_unpacklo_epi64(b2, b3);
    in[3] = _mm_unpackhi_epi64(b2, b3);
    in[4] = _mm_unpacklo_epi64(b4, b5);
    in[5] = _mm_unpackhi_epi64(b4, b5);
    in[6] = _mm_unpacklo_epi64(b6, b7);
    in[7] = _mm_unpackhi_epi64(b6, b7);
  } else {
    in[0] = _mm_add_epi16(a0, a4);
    in[7] = _mm_add_epi16(a1, a5);
    in[3] = _mm_add_epi16(a2, a6);
    in[4] = _mm_add_epi16(a3, a7);
    in[2] = _mm_sub_epi16(a0, a4);
    in[6] = _mm_sub_epi16(a1, a5);
    in[1] = _mm_sub_epi16(a2, a6);
    in[5] = _mm_sub_epi16(a3, a7);
  }
}

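// 2-D 8x8 Hadamard transform of the residual block src_diff (stride
// src_stride); the 64 resulting coefficients are stored at coeff.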
void vp9_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
                           int16_t *coeff) {
  __m128i src[8];
  src[0] = _mm_load_si128((const __m128i *)src_diff);
  src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride));

  hadamard_col8_sse2(src, 0);
  hadamard_col8_sse2(src, 1);

  _mm_store_si128((__m128i *)coeff, src[0]);
  coeff += 8;
  _mm_store_si128((__m128i *)coeff, src[1]);
  coeff += 8;
  _mm_store_si128((__m128i *)coeff, src[2]);
  coeff += 8;
  _mm_store_si128((__m128i *)coeff, src[3]);
  coeff += 8;
  _mm_store_si128((__m128i *)coeff, src[4]);
  coeff += 8;
  _mm_store_si128((__m128i *)coeff, src[5]);
  coeff += 8;
  _mm_store_si128((__m128i *)coeff, src[6]);
  coeff += 8;
  _mm_store_si128((__m128i *)coeff, src[7]);
}

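// 16x16 Hadamard transform built from four 8x8 transforms followed by a
// second-level butterfly across the four sub-blocks; the intermediate sums
// are shifted right by one to keep the results within 16-bit range.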
void vp9_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
                             int16_t *coeff) {
  int idx;
  for (idx = 0; idx < 4; ++idx) {
    int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride
                                + (idx & 0x01) * 8;
    vp9_hadamard_8x8_sse2(src_ptr, src_stride, coeff + idx * 64);
  }

  for (idx = 0; idx < 64; idx += 8) {
    __m128i coeff0 = _mm_load_si128((const __m128i *)coeff);
    __m128i coeff1 = _mm_load_si128((const __m128i *)(coeff + 64));
    __m128i coeff2 = _mm_load_si128((const __m128i *)(coeff + 128));
    __m128i coeff3 = _mm_load_si128((const __m128i *)(coeff + 192));

    __m128i b0 = _mm_add_epi16(coeff0, coeff1);
    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
    __m128i b2 = _mm_add_epi16(coeff2, coeff3);
    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);

    b0 = _mm_srai_epi16(b0, 1);
    b1 = _mm_srai_epi16(b1, 1);
    b2 = _mm_srai_epi16(b2, 1);
    b3 = _mm_srai_epi16(b3, 1);

    coeff0 = _mm_add_epi16(b0, b2);
    coeff1 = _mm_add_epi16(b1, b3);
    _mm_store_si128((__m128i *)coeff, coeff0);
    _mm_store_si128((__m128i *)(coeff + 64), coeff1);

    coeff2 = _mm_sub_epi16(b0, b2);
    coeff3 = _mm_sub_epi16(b1, b3);
    _mm_store_si128((__m128i *)(coeff + 128), coeff2);
    _mm_store_si128((__m128i *)(coeff + 192), coeff3);

    coeff += 8;
  }
}

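// Sum of absolute values of the transform coefficients (SATD). coeff is
// expected to be 16-byte aligned and length a multiple of 8; note that the
// accumulation and the return value are 16-bit.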
int16_t vp9_satd_sse2(const int16_t *coeff, int length) {
  int i;
  __m128i sum = _mm_load_si128((const __m128i *)coeff);
  __m128i sign = _mm_srai_epi16(sum, 15);
  __m128i val = _mm_xor_si128(sum, sign);
  sum = _mm_sub_epi16(val, sign);
  coeff += 8;

  for (i = 8; i < length; i += 8) {
    __m128i src_line = _mm_load_si128((const __m128i *)coeff);
    sign = _mm_srai_epi16(src_line, 15);
    val = _mm_xor_si128(src_line, sign);
    val = _mm_sub_epi16(val, sign);
    sum = _mm_add_epi16(sum, val);
    coeff += 8;
  }

  val = _mm_srli_si128(sum, 8);
  sum = _mm_add_epi16(sum, val);
  val = _mm_srli_epi64(sum, 32);
  sum = _mm_add_epi16(sum, val);
  val = _mm_srli_epi32(sum, 16);
  sum = _mm_add_epi16(sum, val);

  return _mm_extract_epi16(sum, 0);
}

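// Integral projection along rows: accumulates each of the 16 pixel columns
// over `height` rows of ref (stride ref_stride) and stores the normalized
// sums (right-shifted by 5 for height 64, by 4 for height 32, otherwise by 3)
// in hbuf.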
void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const *ref,
                          const int ref_stride, const int height) {
  int idx;
  __m128i zero = _mm_setzero_si128();
  __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
  __m128i s0 = _mm_unpacklo_epi8(src_line, zero);
  __m128i s1 = _mm_unpackhi_epi8(src_line, zero);
  __m128i t0, t1;
  int height_1 = height - 1;
  ref += ref_stride;

  for (idx = 1; idx < height_1; idx += 2) {
    src_line = _mm_loadu_si128((const __m128i *)ref);
    t0 = _mm_unpacklo_epi8(src_line, zero);
    t1 = _mm_unpackhi_epi8(src_line, zero);
    s0 = _mm_adds_epu16(s0, t0);
    s1 = _mm_adds_epu16(s1, t1);
    ref += ref_stride;

    src_line = _mm_loadu_si128((const __m128i *)ref);
    t0 = _mm_unpacklo_epi8(src_line, zero);
    t1 = _mm_unpackhi_epi8(src_line, zero);
    s0 = _mm_adds_epu16(s0, t0);
    s1 = _mm_adds_epu16(s1, t1);
    ref += ref_stride;
  }

  src_line = _mm_loadu_si128((const __m128i *)ref);
  t0 = _mm_unpacklo_epi8(src_line, zero);
  t1 = _mm_unpackhi_epi8(src_line, zero);
  s0 = _mm_adds_epu16(s0, t0);
  s1 = _mm_adds_epu16(s1, t1);

  if (height == 64) {
    s0 = _mm_srai_epi16(s0, 5);
    s1 = _mm_srai_epi16(s1, 5);
  } else if (height == 32) {
    s0 = _mm_srai_epi16(s0, 4);
    s1 = _mm_srai_epi16(s1, 4);
  } else {
    s0 = _mm_srai_epi16(s0, 3);
    s1 = _mm_srai_epi16(s1, 3);
  }

  _mm_storeu_si128((__m128i *)hbuf, s0);
  hbuf += 8;
  _mm_storeu_si128((__m128i *)hbuf, s1);
}

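// Integral projection along columns: returns the sum of `width` pixels of the
// single row at ref; ref is expected to be 16-byte aligned and width a
// multiple of 16.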
int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width) {
  __m128i zero = _mm_setzero_si128();
  __m128i src_line = _mm_load_si128((const __m128i *)ref);
  __m128i s0 = _mm_sad_epu8(src_line, zero);
  __m128i s1;
  int i;

  for (i = 16; i < width; i += 16) {
    ref += 16;
    src_line = _mm_load_si128((const __m128i *)ref);
    s1 = _mm_sad_epu8(src_line, zero);
    s0 = _mm_adds_epu16(s0, s1);
  }

  s1 = _mm_srli_si128(s0, 8);
  s0 = _mm_adds_epu16(s0, s1);

  return _mm_extract_epi16(s0, 0);
}

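// For the difference d = ref - src over n = 4 << bwl int16_t elements,
// returns sum(d * d) - sum(d)^2 / n, i.e. n times the variance of d. src is
// expected to be 16-byte aligned; ref may be unaligned.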
int vp9_vector_var_sse2(int16_t const *ref, int16_t const *src,
                        const int bwl) {
  int idx;
  int width = 4 << bwl;
  int16_t mean;
  __m128i v0 = _mm_loadu_si128((const __m128i *)ref);
  __m128i v1 = _mm_load_si128((const __m128i *)src);
  __m128i diff = _mm_subs_epi16(v0, v1);
  __m128i sum = diff;
  __m128i sse = _mm_madd_epi16(diff, diff);

  ref += 8;
  src += 8;

  for (idx = 8; idx < width; idx += 8) {
    v0 = _mm_loadu_si128((const __m128i *)ref);
    v1 = _mm_load_si128((const __m128i *)src);
    diff = _mm_subs_epi16(v0, v1);

    sum = _mm_add_epi16(sum, diff);
    v0 = _mm_madd_epi16(diff, diff);
    sse = _mm_add_epi32(sse, v0);

    ref += 8;
    src += 8;
  }

  v0 = _mm_srli_si128(sum, 8);
  sum = _mm_add_epi16(sum, v0);
  v0 = _mm_srli_epi64(sum, 32);
  sum = _mm_add_epi16(sum, v0);
  v0 = _mm_srli_epi32(sum, 16);
  sum = _mm_add_epi16(sum, v0);

  v1 = _mm_srli_si128(sse, 8);
  sse = _mm_add_epi32(sse, v1);
  v1 = _mm_srli_epi64(sse, 32);
  sse = _mm_add_epi32(sse, v1);

  mean = _mm_extract_epi16(sum, 0);

  return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2));
}