/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <emmintrin.h>

#include "./vp9_rtcd.h"
#include "vpx_ports/mem.h"

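// Computes the minimum and maximum absolute difference between the 8x8
// block at s (stride p) and the 8x8 block at d (stride dp). Each row is
// widened to 16 bits and abs(diff) is formed as max(diff, 0 - diff); the
// per-lane results are then reduced horizontally into *min and *max.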
void vp9_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
                         int *min, int *max) {
  __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
  u0  = _mm_setzero_si128();
  // Row 0
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff0 = _mm_max_epi16(diff, negdiff);
  // Row 1
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(absdiff0, absdiff);
  minabsdiff = _mm_min_epi16(absdiff0, absdiff);
  // Row 2
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 3
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 4
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 5
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 6
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 7
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);

  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8));
  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32));
  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16));
  *max = _mm_extract_epi16(maxabsdiff, 0);

  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8));
  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32));
  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16));
  *min = _mm_extract_epi16(minabsdiff, 0);
}

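// Returns the rounded average of the 8x8 block at s (stride p): the 64
// pixels are summed in 16-bit lanes, reduced horizontally, then rounded
// with (sum + 32) >> 6.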
unsigned int vp9_avg_8x8_sse2(const uint8_t *s, int p) {
  __m128i s0, s1, u0;
  unsigned int avg = 0;
  u0  = _mm_setzero_si128();
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);

  s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8));
  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 32));
  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
  avg = _mm_extract_epi16(s0, 0);
  return (avg + 32) >> 6;
}

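// Returns the rounded average of the 4x4 block at s (stride p). A full
// 8-pixel row is loaded each time, but the horizontal reduction only
// gathers the four leftmost lanes before rounding with (sum + 8) >> 4.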
unsigned int vp9_avg_4x4_sse2(const uint8_t *s, int p) {
  __m128i s0, s1, u0;
  unsigned int avg = 0;
  u0  = _mm_setzero_si128();
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);

  s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4));
  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
  avg = _mm_extract_epi16(s0, 0);
  return (avg + 8) >> 4;
}

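// One pass of an 8x8 Hadamard butterfly over the eight vectors in in[0..7].
// Both passes write their outputs in the same permuted order; pass 0
// (iter == 0) additionally transposes the 8x8 result so that pass 1
// (iter == 1) operates on the other dimension of the block.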
static void hadamard_col8_sse2(__m128i *in, int iter) {
  __m128i a0 = in[0];
  __m128i a1 = in[1];
  __m128i a2 = in[2];
  __m128i a3 = in[3];
  __m128i a4 = in[4];
  __m128i a5 = in[5];
  __m128i a6 = in[6];
  __m128i a7 = in[7];

  __m128i b0 = _mm_add_epi16(a0, a1);
  __m128i b1 = _mm_sub_epi16(a0, a1);
  __m128i b2 = _mm_add_epi16(a2, a3);
  __m128i b3 = _mm_sub_epi16(a2, a3);
  __m128i b4 = _mm_add_epi16(a4, a5);
  __m128i b5 = _mm_sub_epi16(a4, a5);
  __m128i b6 = _mm_add_epi16(a6, a7);
  __m128i b7 = _mm_sub_epi16(a6, a7);

  a0 = _mm_add_epi16(b0, b2);
  a1 = _mm_add_epi16(b1, b3);
  a2 = _mm_sub_epi16(b0, b2);
  a3 = _mm_sub_epi16(b1, b3);
  a4 = _mm_add_epi16(b4, b6);
  a5 = _mm_add_epi16(b5, b7);
  a6 = _mm_sub_epi16(b4, b6);
  a7 = _mm_sub_epi16(b5, b7);

  if (iter == 0) {
    b0 = _mm_add_epi16(a0, a4);
    b7 = _mm_add_epi16(a1, a5);
    b3 = _mm_add_epi16(a2, a6);
    b4 = _mm_add_epi16(a3, a7);
    b2 = _mm_sub_epi16(a0, a4);
    b6 = _mm_sub_epi16(a1, a5);
    b1 = _mm_sub_epi16(a2, a6);
    b5 = _mm_sub_epi16(a3, a7);

    a0 = _mm_unpacklo_epi16(b0, b1);
    a1 = _mm_unpacklo_epi16(b2, b3);
    a2 = _mm_unpackhi_epi16(b0, b1);
    a3 = _mm_unpackhi_epi16(b2, b3);
    a4 = _mm_unpacklo_epi16(b4, b5);
    a5 = _mm_unpacklo_epi16(b6, b7);
    a6 = _mm_unpackhi_epi16(b4, b5);
    a7 = _mm_unpackhi_epi16(b6, b7);

    b0 = _mm_unpacklo_epi32(a0, a1);
    b1 = _mm_unpacklo_epi32(a4, a5);
    b2 = _mm_unpackhi_epi32(a0, a1);
    b3 = _mm_unpackhi_epi32(a4, a5);
    b4 = _mm_unpacklo_epi32(a2, a3);
    b5 = _mm_unpacklo_epi32(a6, a7);
    b6 = _mm_unpackhi_epi32(a2, a3);
    b7 = _mm_unpackhi_epi32(a6, a7);

    in[0] = _mm_unpacklo_epi64(b0, b1);
    in[1] = _mm_unpackhi_epi64(b0, b1);
    in[2] = _mm_unpacklo_epi64(b2, b3);
    in[3] = _mm_unpackhi_epi64(b2, b3);
    in[4] = _mm_unpacklo_epi64(b4, b5);
    in[5] = _mm_unpackhi_epi64(b4, b5);
    in[6] = _mm_unpacklo_epi64(b6, b7);
    in[7] = _mm_unpackhi_epi64(b6, b7);
  } else {
    in[0] = _mm_add_epi16(a0, a4);
    in[7] = _mm_add_epi16(a1, a5);
    in[3] = _mm_add_epi16(a2, a6);
    in[4] = _mm_add_epi16(a3, a7);
    in[2] = _mm_sub_epi16(a0, a4);
    in[6] = _mm_sub_epi16(a1, a5);
    in[1] = _mm_sub_epi16(a2, a6);
    in[5] = _mm_sub_epi16(a3, a7);
  }
}

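// 8x8 Hadamard transform of the residual block src_diff (stride src_stride),
// written contiguously to coeff. The aligned loads assume each src_diff row
// is 16-byte aligned.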
void vp9_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
                           int16_t *coeff) {
  __m128i src[8];
  src[0] = _mm_load_si128((const __m128i *)src_diff);
  src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride));

  hadamard_col8_sse2(src, 0);
  hadamard_col8_sse2(src, 1);

  _mm_store_si128((__m128i *)coeff, src[0]);
  coeff += 8;
  _mm_store_si128((__m128i *)coeff, src[1]);
  coeff += 8;
  _mm_store_si128((__m128i *)coeff, src[2]);
  coeff += 8;
  _mm_store_si128((__m128i *)coeff, src[3]);
  coeff += 8;
  _mm_store_si128((__m128i *)coeff, src[4]);
  coeff += 8;
  _mm_store_si128((__m128i *)coeff, src[5]);
  coeff += 8;
  _mm_store_si128((__m128i *)coeff, src[6]);
  coeff += 8;
  _mm_store_si128((__m128i *)coeff, src[7]);
}

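// 16x16 Hadamard built from four 8x8 transforms, one per quadrant, followed
// by a second-level butterfly across the four coefficient sets. The >> 1
// keeps the combined values within 16-bit range.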
void vp9_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
                             int16_t *coeff) {
  int idx;
  for (idx = 0; idx < 4; ++idx) {
    int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride
                                + (idx & 0x01) * 8;
    vp9_hadamard_8x8_sse2(src_ptr, src_stride, coeff + idx * 64);
  }

  for (idx = 0; idx < 64; idx += 8) {
    __m128i coeff0 = _mm_load_si128((const __m128i *)coeff);
    __m128i coeff1 = _mm_load_si128((const __m128i *)(coeff + 64));
    __m128i coeff2 = _mm_load_si128((const __m128i *)(coeff + 128));
    __m128i coeff3 = _mm_load_si128((const __m128i *)(coeff + 192));

    __m128i b0 = _mm_add_epi16(coeff0, coeff1);
    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
    __m128i b2 = _mm_add_epi16(coeff2, coeff3);
    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);

    b0 = _mm_srai_epi16(b0, 1);
    b1 = _mm_srai_epi16(b1, 1);
    b2 = _mm_srai_epi16(b2, 1);
    b3 = _mm_srai_epi16(b3, 1);

    coeff0 = _mm_add_epi16(b0, b2);
    coeff1 = _mm_add_epi16(b1, b3);
    _mm_store_si128((__m128i *)coeff, coeff0);
    _mm_store_si128((__m128i *)(coeff + 64), coeff1);

    coeff2 = _mm_sub_epi16(b0, b2);
    coeff3 = _mm_sub_epi16(b1, b3);
    _mm_store_si128((__m128i *)(coeff + 128), coeff2);
    _mm_store_si128((__m128i *)(coeff + 192), coeff3);

    coeff += 8;
  }
}

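// Sum of absolute values of the transform coefficients, using the sign-flip
// identity abs(x) = (x ^ (x >> 15)) - (x >> 15). Accumulation is 16-bit, so
// this relies on the total SATD fitting in a signed 16-bit result.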
int16_t vp9_satd_sse2(const int16_t *coeff, int length) {
  int i;
  __m128i sum = _mm_load_si128((const __m128i *)coeff);
  __m128i sign = _mm_srai_epi16(sum, 15);
  __m128i val = _mm_xor_si128(sum, sign);
  sum = _mm_sub_epi16(val, sign);
  coeff += 8;

  for (i = 8; i < length; i += 8) {
    __m128i src_line = _mm_load_si128((const __m128i *)coeff);
    sign = _mm_srai_epi16(src_line, 15);
    val = _mm_xor_si128(src_line, sign);
    val = _mm_sub_epi16(val, sign);
    sum = _mm_add_epi16(sum, val);
    coeff += 8;
  }

  val = _mm_srli_si128(sum, 8);
  sum = _mm_add_epi16(sum, val);
  val = _mm_srli_epi64(sum, 32);
  sum = _mm_add_epi16(sum, val);
  val = _mm_srli_epi32(sum, 16);
  sum = _mm_add_epi16(sum, val);

  return _mm_extract_epi16(sum, 0);
}

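// Vertical projection: sums each of the 16 columns of ref over `height`
// rows (first and last rows are handled outside the two-row loop), then
// stores the sums, normalized by height / 2, to hbuf. The else branch of
// the normalization assumes height == 16.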
void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const *ref,
                          const int ref_stride, const int height) {
  int idx;
  __m128i zero = _mm_setzero_si128();
  __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
  __m128i s0 = _mm_unpacklo_epi8(src_line, zero);
  __m128i s1 = _mm_unpackhi_epi8(src_line, zero);
  __m128i t0, t1;
  int height_1 = height - 1;
  ref += ref_stride;

  for (idx = 1; idx < height_1; idx += 2) {
    src_line = _mm_loadu_si128((const __m128i *)ref);
    t0 = _mm_unpacklo_epi8(src_line, zero);
    t1 = _mm_unpackhi_epi8(src_line, zero);
    s0 = _mm_adds_epu16(s0, t0);
    s1 = _mm_adds_epu16(s1, t1);
    ref += ref_stride;

    src_line = _mm_loadu_si128((const __m128i *)ref);
    t0 = _mm_unpacklo_epi8(src_line, zero);
    t1 = _mm_unpackhi_epi8(src_line, zero);
    s0 = _mm_adds_epu16(s0, t0);
    s1 = _mm_adds_epu16(s1, t1);
    ref += ref_stride;
  }

  src_line = _mm_loadu_si128((const __m128i *)ref);
  t0 = _mm_unpacklo_epi8(src_line, zero);
  t1 = _mm_unpackhi_epi8(src_line, zero);
  s0 = _mm_adds_epu16(s0, t0);
  s1 = _mm_adds_epu16(s1, t1);

  if (height == 64) {
    s0 = _mm_srai_epi16(s0, 5);
    s1 = _mm_srai_epi16(s1, 5);
  } else if (height == 32) {
    s0 = _mm_srai_epi16(s0, 4);
    s1 = _mm_srai_epi16(s1, 4);
  } else {
    s0 = _mm_srai_epi16(s0, 3);
    s1 = _mm_srai_epi16(s1, 3);
  }

  _mm_storeu_si128((__m128i *)hbuf, s0);
  hbuf += 8;
  _mm_storeu_si128((__m128i *)hbuf, s1);
}

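// Horizontal projection: returns the sum of `width` consecutive pixels at
// ref, 16 at a time via SAD against zero. The aligned loads assume ref is
// 16-byte aligned.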
int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width) {
  __m128i zero = _mm_setzero_si128();
  __m128i src_line = _mm_load_si128((const __m128i *)ref);
  __m128i s0 = _mm_sad_epu8(src_line, zero);
  __m128i s1;
  int i;

  for (i = 16; i < width; i += 16) {
    ref += 16;
    src_line = _mm_load_si128((const __m128i *)ref);
    s1 = _mm_sad_epu8(src_line, zero);
    s0 = _mm_adds_epu16(s0, s1);
  }

  s1 = _mm_srli_si128(s0, 8);
  s0 = _mm_adds_epu16(s0, s1);

  return _mm_extract_epi16(s0, 0);
}

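// Unnormalized variance of the difference between two int16 vectors of
// length 4 << bwl: returns sse - sum^2 / n, where the variable `mean`
// actually holds the raw sum of the differences. src must be 16-byte
// aligned (aligned load); ref may be unaligned.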
int vp9_vector_var_sse2(int16_t const *ref, int16_t const *src,
                        const int bwl) {
  int idx;
  int width = 4 << bwl;
  int16_t mean;
  __m128i v0 = _mm_loadu_si128((const __m128i *)ref);
  __m128i v1 = _mm_load_si128((const __m128i *)src);
  __m128i diff = _mm_subs_epi16(v0, v1);
  __m128i sum = diff;
  __m128i sse = _mm_madd_epi16(diff, diff);

  ref += 8;
  src += 8;

  for (idx = 8; idx < width; idx += 8) {
    v0 = _mm_loadu_si128((const __m128i *)ref);
    v1 = _mm_load_si128((const __m128i *)src);
    diff = _mm_subs_epi16(v0, v1);

    sum = _mm_add_epi16(sum, diff);
    v0  = _mm_madd_epi16(diff, diff);
    sse = _mm_add_epi32(sse, v0);

    ref += 8;
    src += 8;
  }

  v0  = _mm_srli_si128(sum, 8);
  sum = _mm_add_epi16(sum, v0);
  v0  = _mm_srli_epi64(sum, 32);
  sum = _mm_add_epi16(sum, v0);
  v0  = _mm_srli_epi32(sum, 16);
  sum = _mm_add_epi16(sum, v0);

  v1  = _mm_srli_si128(sse, 8);
  sse = _mm_add_epi32(sse, v1);
  v1  = _mm_srli_epi64(sse, 32);
  sse = _mm_add_epi32(sse, v1);

  mean = _mm_extract_epi16(sum, 0);

  return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2));
}