/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <immintrin.h>

#include "config/aom_config.h"

#include "aom_ports/mem.h"
#include "aom/aom_integer.h"

#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
#include "aom_dsp/x86/synonyms.h"

////////////////////////////////////////////////////////////////////////////////
// 8 bit
////////////////////////////////////////////////////////////////////////////////

static INLINE unsigned int obmc_sad_w4_avx2(const uint8_t *pre,
                                            const int pre_stride,
                                            const int32_t *wsrc,
                                            const int32_t *mask,
                                            const int height) {
  int n = 0;
  __m256i v_sad_d = _mm256_setzero_si256();
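  // Bias of (1 << 12) >> 1 == 1 << 11 makes the final >> 12 shift round to
  // nearest rather than truncate, i.e. it implements ROUND_POWER_OF_TWO(x, 12).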
  const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);

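  // Each iteration consumes two 4-pixel rows (eight 32-bit lanes); every
  // height instantiated for width 4 below is even, so no tail handling is
  // needed.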
  do {
    const __m128i v_p_b_0 = xx_loadl_32(pre);
    const __m128i v_p_b_1 = xx_loadl_32(pre + pre_stride);
    const __m128i v_p_b = _mm_unpacklo_epi32(v_p_b_0, v_p_b_1);
    const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n));
    const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));

    const __m256i v_p_d = _mm256_cvtepu8_epi32(v_p_b);

    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    // than pmulld but produces the same result with these inputs.
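    // (With the high 16 bits of every 32-bit lane zero, pmaddwd computes
    // lo * lo + hi * hi = pre * mask + 0, and both 16-bit halves are
    // nonnegative, so the signed 16-bit multiplies cannot overflow.)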
    const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d);

    const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d);
    const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d);

    // Rounded absolute difference
    const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d);
    const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12);

    v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d);

    n += 8;
    pre += pre_stride << 1;
  } while (n < 8 * (height >> 1));

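  // Reduce: add the two 128-bit halves of the accumulator, then horizontally
  // sum the remaining four 32-bit lanes.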
  __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
  __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
  v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
  return xx_hsum_epi32_si32(v_sad_d_0);
}

static INLINE unsigned int obmc_sad_w8n_avx2(
    const uint8_t *pre, const int pre_stride, const int32_t *wsrc,
    const int32_t *mask, const int width, const int height) {
  const int pre_step = pre_stride - width;
  int n = 0;
  __m256i v_sad_d = _mm256_setzero_si256();
  const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
  assert(width >= 8);
  assert(IS_POWER_OF_TWO(width));

  do {
    const __m128i v_p0_b = xx_loadl_64(pre + n);
    const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n));
    const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));

    const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p0_b);

    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    // than pmulld but produces the same result with these inputs.
    const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);

    const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d);
    const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d);

    // Rounded absolute difference
    const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d);
    const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12);

    v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d);

    n += 8;

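    // wsrc and mask are contiguous width * height arrays, but pre is strided:
    // whenever n reaches a row boundary, skip the remainder of the pre row.
    // The mask test equals n % width == 0 because width is a power of two.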
    if ((n & (width - 1)) == 0) pre += pre_step;
  } while (n < width * height);

  __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
  __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
  v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
  return xx_hsum_epi32_si32(v_sad_d_0);
}

#define OBMCSADWXH(w, h)                                          \
  unsigned int aom_obmc_sad##w##x##h##_avx2(                      \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,    \
      const int32_t *msk) {                                       \
    if (w == 4) {                                                 \
      return obmc_sad_w4_avx2(pre, pre_stride, wsrc, msk, h);     \
    } else {                                                      \
      return obmc_sad_w8n_avx2(pre, pre_stride, wsrc, msk, w, h); \
    }                                                             \
  }
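
// For example, OBMCSADWXH(8, 4) expands to:
//
//   unsigned int aom_obmc_sad8x4_avx2(const uint8_t *pre, int pre_stride,
//                                     const int32_t *wsrc,
//                                     const int32_t *msk) {
//     if (8 == 4) {
//       return obmc_sad_w4_avx2(pre, pre_stride, wsrc, msk, 4);
//     } else {
//       return obmc_sad_w8n_avx2(pre, pre_stride, wsrc, msk, 8, 4);
//     }
//   }
//
// The compiler folds the constant w == 4 test, so each instantiation reduces
// to a direct call into one of the two kernels above.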

OBMCSADWXH(128, 128)
OBMCSADWXH(128, 64)
OBMCSADWXH(64, 128)
OBMCSADWXH(64, 64)
OBMCSADWXH(64, 32)
OBMCSADWXH(32, 64)
OBMCSADWXH(32, 32)
OBMCSADWXH(32, 16)
OBMCSADWXH(16, 32)
OBMCSADWXH(16, 16)
OBMCSADWXH(16, 8)
OBMCSADWXH(8, 16)
OBMCSADWXH(8, 8)
OBMCSADWXH(8, 4)
OBMCSADWXH(4, 8)
OBMCSADWXH(4, 4)
OBMCSADWXH(4, 16)
OBMCSADWXH(16, 4)
OBMCSADWXH(8, 32)
OBMCSADWXH(32, 8)
OBMCSADWXH(16, 64)
OBMCSADWXH(64, 16)

////////////////////////////////////////////////////////////////////////////////
// High bit-depth
////////////////////////////////////////////////////////////////////////////////

static INLINE unsigned int hbd_obmc_sad_w4_avx2(const uint8_t *pre8,
                                                const int pre_stride,
                                                const int32_t *wsrc,
                                                const int32_t *mask,
                                                const int height) {
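  // High bit-depth buffers store uint16_t samples behind a uint8_t pointer;
  // CONVERT_TO_SHORTPTR recovers the uint16_t view.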
  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
  int n = 0;
  __m256i v_sad_d = _mm256_setzero_si256();
  const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
  do {
    const __m128i v_p_w_0 = xx_loadl_64(pre);
    const __m128i v_p_w_1 = xx_loadl_64(pre + pre_stride);
    const __m128i v_p_w = _mm_unpacklo_epi64(v_p_w_0, v_p_w_1);
    const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n));
    const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));

    const __m256i v_p_d = _mm256_cvtepu16_epi32(v_p_w);

    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    // than pmulld but produces the same result with these inputs.
    const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d);

    const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d);
    const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d);

    // Rounded absolute difference
    const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d);
    const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12);

    v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d);

    n += 8;
    pre += pre_stride << 1;
  } while (n < 8 * (height >> 1));

  __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
  __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
  v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
  return xx_hsum_epi32_si32(v_sad_d_0);
}

static INLINE unsigned int hbd_obmc_sad_w8n_avx2(
    const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
    const int32_t *mask, const int width, const int height) {
  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
  const int pre_step = pre_stride - width;
  int n = 0;
  __m256i v_sad_d = _mm256_setzero_si256();
  const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);

  assert(width >= 8);
  assert(IS_POWER_OF_TWO(width));

  do {
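    // Eight 16-bit pixels span a full 16 bytes, so a 128-bit load is needed
    // here, where the 8-bit kernel used a 64-bit xx_loadl_64.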
    const __m128i v_p0_w = _mm_lddqu_si128((__m128i *)(pre + n));
    const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n));
    const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));

    const __m256i v_p0_d = _mm256_cvtepu16_epi32(v_p0_w);

    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    // than pmulld but produces the same result with these inputs.
    const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);

    const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d);
    const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d);

    // Rounded absolute difference
    const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d);
    const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12);

    v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d);

    n += 8;

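    // Same row-boundary test as the 8-bit kernel; since width is a power of
    // two, n % width == 0 is equivalent to the masked comparison used there.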
    if (n % width == 0) pre += pre_step;
  } while (n < width * height);

  __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
  __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
  v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
  return xx_hsum_epi32_si32(v_sad_d_0);
}

#define HBD_OBMCSADWXH(w, h)                                           \
  unsigned int aom_highbd_obmc_sad##w##x##h##_avx2(                    \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,         \
      const int32_t *mask) {                                           \
    if (w == 4) {                                                      \
      return hbd_obmc_sad_w4_avx2(pre, pre_stride, wsrc, mask, h);     \
    } else {                                                           \
      return hbd_obmc_sad_w8n_avx2(pre, pre_stride, wsrc, mask, w, h); \
    }                                                                  \
  }
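
// As with OBMCSADWXH above, the constant w == 4 branch folds away at compile
// time, leaving a direct call to the matching high bit-depth kernel.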

HBD_OBMCSADWXH(128, 128)
HBD_OBMCSADWXH(128, 64)
HBD_OBMCSADWXH(64, 128)
HBD_OBMCSADWXH(64, 64)
HBD_OBMCSADWXH(64, 32)
HBD_OBMCSADWXH(32, 64)
HBD_OBMCSADWXH(32, 32)
HBD_OBMCSADWXH(32, 16)
HBD_OBMCSADWXH(16, 32)
HBD_OBMCSADWXH(16, 16)
HBD_OBMCSADWXH(16, 8)
HBD_OBMCSADWXH(8, 16)
HBD_OBMCSADWXH(8, 8)
HBD_OBMCSADWXH(8, 4)
HBD_OBMCSADWXH(4, 8)
HBD_OBMCSADWXH(4, 4)
HBD_OBMCSADWXH(4, 16)
HBD_OBMCSADWXH(16, 4)
HBD_OBMCSADWXH(8, 32)
HBD_OBMCSADWXH(32, 8)
HBD_OBMCSADWXH(16, 64)
HBD_OBMCSADWXH(64, 16)