1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <assert.h>
13 #include <immintrin.h>
14 
15 #include "aom_dsp/x86/synonyms.h"
16 
17 #include "aom/aom_integer.h"
18 
19 #include "av1/common/reconinter.h"
20 
21 #define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
22 
23 /**
24  * See av1_wedge_sse_from_residuals_c
25  */
av1_wedge_sse_from_residuals_sse2(const int16_t * r1,const int16_t * d,const uint8_t * m,int N)26 uint64_t av1_wedge_sse_from_residuals_sse2(const int16_t *r1, const int16_t *d,
27                                            const uint8_t *m, int N) {
28   int n = -N;
29   int n8 = n + 8;
30 
31   uint64_t csse;
32 
33   const __m128i v_mask_max_w = _mm_set1_epi16(MAX_MASK_VALUE);
34   const __m128i v_zext_q = xx_set1_64_from_32i(0xffffffff);
35 
36   __m128i v_acc0_q = _mm_setzero_si128();
37 
38   assert(N % 64 == 0);
39 
40   r1 += N;
41   d += N;
42   m += N;
43 
44   do {
45     const __m128i v_r0_w = xx_load_128(r1 + n);
46     const __m128i v_r1_w = xx_load_128(r1 + n8);
47     const __m128i v_d0_w = xx_load_128(d + n);
48     const __m128i v_d1_w = xx_load_128(d + n8);
49     const __m128i v_m01_b = xx_load_128(m + n);
50 
51     const __m128i v_rd0l_w = _mm_unpacklo_epi16(v_d0_w, v_r0_w);
52     const __m128i v_rd0h_w = _mm_unpackhi_epi16(v_d0_w, v_r0_w);
53     const __m128i v_rd1l_w = _mm_unpacklo_epi16(v_d1_w, v_r1_w);
54     const __m128i v_rd1h_w = _mm_unpackhi_epi16(v_d1_w, v_r1_w);
55     const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128());
56     const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128());
57 
58     const __m128i v_m0l_w = _mm_unpacklo_epi16(v_m0_w, v_mask_max_w);
59     const __m128i v_m0h_w = _mm_unpackhi_epi16(v_m0_w, v_mask_max_w);
60     const __m128i v_m1l_w = _mm_unpacklo_epi16(v_m1_w, v_mask_max_w);
61     const __m128i v_m1h_w = _mm_unpackhi_epi16(v_m1_w, v_mask_max_w);
62 
63     const __m128i v_t0l_d = _mm_madd_epi16(v_rd0l_w, v_m0l_w);
64     const __m128i v_t0h_d = _mm_madd_epi16(v_rd0h_w, v_m0h_w);
65     const __m128i v_t1l_d = _mm_madd_epi16(v_rd1l_w, v_m1l_w);
66     const __m128i v_t1h_d = _mm_madd_epi16(v_rd1h_w, v_m1h_w);
67 
68     const __m128i v_t0_w = _mm_packs_epi32(v_t0l_d, v_t0h_d);
69     const __m128i v_t1_w = _mm_packs_epi32(v_t1l_d, v_t1h_d);
70 
71     const __m128i v_sq0_d = _mm_madd_epi16(v_t0_w, v_t0_w);
72     const __m128i v_sq1_d = _mm_madd_epi16(v_t1_w, v_t1_w);
73 
74     const __m128i v_sum0_q = _mm_add_epi64(_mm_and_si128(v_sq0_d, v_zext_q),
75                                            _mm_srli_epi64(v_sq0_d, 32));
76     const __m128i v_sum1_q = _mm_add_epi64(_mm_and_si128(v_sq1_d, v_zext_q),
77                                            _mm_srli_epi64(v_sq1_d, 32));
78 
79     v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum0_q);
80     v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum1_q);
81 
82     n8 += 16;
83     n += 16;
84   } while (n);
85 
86   v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8));
87 
88 #if ARCH_X86_64
89   csse = (uint64_t)_mm_cvtsi128_si64(v_acc0_q);
90 #else
91   xx_storel_64(&csse, v_acc0_q);
92 #endif
93 
94   return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
95 }
96 
97 /**
98  * See av1_wedge_sign_from_residuals_c
99  */
av1_wedge_sign_from_residuals_sse2(const int16_t * ds,const uint8_t * m,int N,int64_t limit)100 int av1_wedge_sign_from_residuals_sse2(const int16_t *ds, const uint8_t *m,
101                                        int N, int64_t limit) {
102   int64_t acc;
103 
104   __m128i v_sign_d;
105   __m128i v_acc0_d = _mm_setzero_si128();
106   __m128i v_acc1_d = _mm_setzero_si128();
107   __m128i v_acc_q;
108 
109   // Input size limited to 8192 by the use of 32 bit accumulators and m
110   // being between [0, 64]. Overflow might happen at larger sizes,
111   // though it is practically impossible on real video input.
112   assert(N < 8192);
113   assert(N % 64 == 0);
114 
115   do {
116     const __m128i v_m01_b = xx_load_128(m);
117     const __m128i v_m23_b = xx_load_128(m + 16);
118     const __m128i v_m45_b = xx_load_128(m + 32);
119     const __m128i v_m67_b = xx_load_128(m + 48);
120 
121     const __m128i v_d0_w = xx_load_128(ds);
122     const __m128i v_d1_w = xx_load_128(ds + 8);
123     const __m128i v_d2_w = xx_load_128(ds + 16);
124     const __m128i v_d3_w = xx_load_128(ds + 24);
125     const __m128i v_d4_w = xx_load_128(ds + 32);
126     const __m128i v_d5_w = xx_load_128(ds + 40);
127     const __m128i v_d6_w = xx_load_128(ds + 48);
128     const __m128i v_d7_w = xx_load_128(ds + 56);
129 
130     const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128());
131     const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128());
132     const __m128i v_m2_w = _mm_unpacklo_epi8(v_m23_b, _mm_setzero_si128());
133     const __m128i v_m3_w = _mm_unpackhi_epi8(v_m23_b, _mm_setzero_si128());
134     const __m128i v_m4_w = _mm_unpacklo_epi8(v_m45_b, _mm_setzero_si128());
135     const __m128i v_m5_w = _mm_unpackhi_epi8(v_m45_b, _mm_setzero_si128());
136     const __m128i v_m6_w = _mm_unpacklo_epi8(v_m67_b, _mm_setzero_si128());
137     const __m128i v_m7_w = _mm_unpackhi_epi8(v_m67_b, _mm_setzero_si128());
138 
139     const __m128i v_p0_d = _mm_madd_epi16(v_d0_w, v_m0_w);
140     const __m128i v_p1_d = _mm_madd_epi16(v_d1_w, v_m1_w);
141     const __m128i v_p2_d = _mm_madd_epi16(v_d2_w, v_m2_w);
142     const __m128i v_p3_d = _mm_madd_epi16(v_d3_w, v_m3_w);
143     const __m128i v_p4_d = _mm_madd_epi16(v_d4_w, v_m4_w);
144     const __m128i v_p5_d = _mm_madd_epi16(v_d5_w, v_m5_w);
145     const __m128i v_p6_d = _mm_madd_epi16(v_d6_w, v_m6_w);
146     const __m128i v_p7_d = _mm_madd_epi16(v_d7_w, v_m7_w);
147 
148     const __m128i v_p01_d = _mm_add_epi32(v_p0_d, v_p1_d);
149     const __m128i v_p23_d = _mm_add_epi32(v_p2_d, v_p3_d);
150     const __m128i v_p45_d = _mm_add_epi32(v_p4_d, v_p5_d);
151     const __m128i v_p67_d = _mm_add_epi32(v_p6_d, v_p7_d);
152 
153     const __m128i v_p0123_d = _mm_add_epi32(v_p01_d, v_p23_d);
154     const __m128i v_p4567_d = _mm_add_epi32(v_p45_d, v_p67_d);
155 
156     v_acc0_d = _mm_add_epi32(v_acc0_d, v_p0123_d);
157     v_acc1_d = _mm_add_epi32(v_acc1_d, v_p4567_d);
158 
159     ds += 64;
160     m += 64;
161 
162     N -= 64;
163   } while (N);
164 
165   v_sign_d = _mm_cmplt_epi32(v_acc0_d, _mm_setzero_si128());
166   v_acc0_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc0_d, v_sign_d),
167                            _mm_unpackhi_epi32(v_acc0_d, v_sign_d));
168 
169   v_sign_d = _mm_cmplt_epi32(v_acc1_d, _mm_setzero_si128());
170   v_acc1_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc1_d, v_sign_d),
171                            _mm_unpackhi_epi32(v_acc1_d, v_sign_d));
172 
173   v_acc_q = _mm_add_epi64(v_acc0_d, v_acc1_d);
174 
175   v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));
176 
177 #if ARCH_X86_64
178   acc = (uint64_t)_mm_cvtsi128_si64(v_acc_q);
179 #else
180   xx_storel_64(&acc, v_acc_q);
181 #endif
182 
183   return acc > limit;
184 }
185 
// Negate under mask: 16-bit lanes of v_v_w whose v_mask_w lane is all-ones
// are replaced by their two's-complement negation; lanes whose mask is zero
// pass through unchanged.
static INLINE __m128i negm_epi16(__m128i v_v_w, __m128i v_mask_w) {
  // With mask == 0xffff: (v ^ mask) - mask == ~v + 1 == -v.
  // With mask == 0:      (v ^ 0) - 0 == v.
  const __m128i v_flipped_w = _mm_xor_si128(v_v_w, v_mask_w);
  return _mm_sub_epi16(v_flipped_w, v_mask_w);
}
190 
191 /**
192  * av1_wedge_compute_delta_squares_c
193  */
av1_wedge_compute_delta_squares_sse2(int16_t * d,const int16_t * a,const int16_t * b,int N)194 void av1_wedge_compute_delta_squares_sse2(int16_t *d, const int16_t *a,
195                                           const int16_t *b, int N) {
196   const __m128i v_neg_w =
197       _mm_set_epi16(0xffff, 0, 0xffff, 0, 0xffff, 0, 0xffff, 0);
198 
199   assert(N % 64 == 0);
200 
201   do {
202     const __m128i v_a0_w = xx_load_128(a);
203     const __m128i v_b0_w = xx_load_128(b);
204     const __m128i v_a1_w = xx_load_128(a + 8);
205     const __m128i v_b1_w = xx_load_128(b + 8);
206     const __m128i v_a2_w = xx_load_128(a + 16);
207     const __m128i v_b2_w = xx_load_128(b + 16);
208     const __m128i v_a3_w = xx_load_128(a + 24);
209     const __m128i v_b3_w = xx_load_128(b + 24);
210 
211     const __m128i v_ab0l_w = _mm_unpacklo_epi16(v_a0_w, v_b0_w);
212     const __m128i v_ab0h_w = _mm_unpackhi_epi16(v_a0_w, v_b0_w);
213     const __m128i v_ab1l_w = _mm_unpacklo_epi16(v_a1_w, v_b1_w);
214     const __m128i v_ab1h_w = _mm_unpackhi_epi16(v_a1_w, v_b1_w);
215     const __m128i v_ab2l_w = _mm_unpacklo_epi16(v_a2_w, v_b2_w);
216     const __m128i v_ab2h_w = _mm_unpackhi_epi16(v_a2_w, v_b2_w);
217     const __m128i v_ab3l_w = _mm_unpacklo_epi16(v_a3_w, v_b3_w);
218     const __m128i v_ab3h_w = _mm_unpackhi_epi16(v_a3_w, v_b3_w);
219 
220     // Negate top word of pairs
221     const __m128i v_abl0n_w = negm_epi16(v_ab0l_w, v_neg_w);
222     const __m128i v_abh0n_w = negm_epi16(v_ab0h_w, v_neg_w);
223     const __m128i v_abl1n_w = negm_epi16(v_ab1l_w, v_neg_w);
224     const __m128i v_abh1n_w = negm_epi16(v_ab1h_w, v_neg_w);
225     const __m128i v_abl2n_w = negm_epi16(v_ab2l_w, v_neg_w);
226     const __m128i v_abh2n_w = negm_epi16(v_ab2h_w, v_neg_w);
227     const __m128i v_abl3n_w = negm_epi16(v_ab3l_w, v_neg_w);
228     const __m128i v_abh3n_w = negm_epi16(v_ab3h_w, v_neg_w);
229 
230     const __m128i v_r0l_w = _mm_madd_epi16(v_ab0l_w, v_abl0n_w);
231     const __m128i v_r0h_w = _mm_madd_epi16(v_ab0h_w, v_abh0n_w);
232     const __m128i v_r1l_w = _mm_madd_epi16(v_ab1l_w, v_abl1n_w);
233     const __m128i v_r1h_w = _mm_madd_epi16(v_ab1h_w, v_abh1n_w);
234     const __m128i v_r2l_w = _mm_madd_epi16(v_ab2l_w, v_abl2n_w);
235     const __m128i v_r2h_w = _mm_madd_epi16(v_ab2h_w, v_abh2n_w);
236     const __m128i v_r3l_w = _mm_madd_epi16(v_ab3l_w, v_abl3n_w);
237     const __m128i v_r3h_w = _mm_madd_epi16(v_ab3h_w, v_abh3n_w);
238 
239     const __m128i v_r0_w = _mm_packs_epi32(v_r0l_w, v_r0h_w);
240     const __m128i v_r1_w = _mm_packs_epi32(v_r1l_w, v_r1h_w);
241     const __m128i v_r2_w = _mm_packs_epi32(v_r2l_w, v_r2h_w);
242     const __m128i v_r3_w = _mm_packs_epi32(v_r3l_w, v_r3h_w);
243 
244     xx_store_128(d, v_r0_w);
245     xx_store_128(d + 8, v_r1_w);
246     xx_store_128(d + 16, v_r2_w);
247     xx_store_128(d + 24, v_r3_w);
248 
249     a += 32;
250     b += 32;
251     d += 32;
252     N -= 32;
253   } while (N);
254 }
255