/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <immintrin.h>

#include "config/aom_config.h"

#include "aom_ports/mem.h"
#include "aom/aom_integer.h"

#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
#include "aom_dsp/x86/synonyms.h"
////////////////////////////////////////////////////////////////////////////////
// 8 bit
////////////////////////////////////////////////////////////////////////////////

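// Each kernel below accumulates, per pixel, the absolute difference between
// the weighted source and the mask-scaled prediction, rounded by 12 bits.
// A scalar sketch of that computation (for illustration only; the C
// reference implementation lives elsewhere in the library):
//
//   unsigned int sad = 0;
//   for (int y = 0; y < height; ++y) {
//     for (int x = 0; x < width; ++x)
//       sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12);
//     pre += pre_stride;
//     wsrc += width;
//     mask += width;
//   }
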
static AOM_FORCE_INLINE unsigned int obmc_sad_w4(const uint8_t *pre,
                                                 const int pre_stride,
                                                 const int32_t *wsrc,
                                                 const int32_t *mask,
                                                 const int height) {
  const int pre_step = pre_stride - 4;
  int n = 0;
  __m128i v_sad_d = _mm_setzero_si128();

  do {
    const __m128i v_p_b = xx_loadl_32(pre + n);
    const __m128i v_m_d = xx_load_128(mask + n);
    const __m128i v_w_d = xx_load_128(wsrc + n);

    const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);

    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    // than pmulld but produces the same result with these inputs.
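    // Concretely: each 32-bit lane holds 0x0000xxxx with xxxx < 0x8000, so
    // pmaddwd computes lo(p) * lo(m) + hi(p) * hi(m) = p * m + 0 * 0, i.e.
    // the same full product that pmulld would produce.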
    const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);

    const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
    const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d);

    // Rounded absolute difference
    const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12);

    v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d);

    n += 4;

    if (n % 4 == 0) pre += pre_step;
  } while (n < 4 * height);

  return xx_hsum_epi32_si32(v_sad_d);
}

static AOM_FORCE_INLINE unsigned int obmc_sad_w8n(
    const uint8_t *pre, const int pre_stride, const int32_t *wsrc,
    const int32_t *mask, const int width, const int height) {
  const int pre_step = pre_stride - width;
  int n = 0;
  __m128i v_sad_d = _mm_setzero_si128();

  assert(width >= 8);
  assert(IS_POWER_OF_TWO(width));

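  // Each iteration consumes 8 pixels as two 4-pixel halves; once a full row
  // of `width` pixels has been processed, `pre_step` advances `pre` to the
  // start of the next prediction row (wsrc and mask are contiguous).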
  do {
    const __m128i v_p1_b = xx_loadl_32(pre + n + 4);
    const __m128i v_m1_d = xx_load_128(mask + n + 4);
    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
    const __m128i v_p0_b = xx_loadl_32(pre + n);
    const __m128i v_m0_d = xx_load_128(mask + n);
    const __m128i v_w0_d = xx_load_128(wsrc + n);

    const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b);
    const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b);

    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    // than pmulld but produces the same result with these inputs.
    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);

    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
    const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
    const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);

    // Rounded absolute difference
    const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12);
    const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12);

    v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d);
    v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d);

    n += 8;

    if (n % width == 0) pre += pre_step;
  } while (n < width * height);

  return xx_hsum_epi32_si32(v_sad_d);
}

#define OBMCSADWXH(w, h)                                       \
  unsigned int aom_obmc_sad##w##x##h##_sse4_1(                 \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
      const int32_t *msk) {                                    \
    if (w == 4) {                                              \
      return obmc_sad_w4(pre, pre_stride, wsrc, msk, h);       \
    } else {                                                   \
      return obmc_sad_w8n(pre, pre_stride, wsrc, msk, w, h);   \
    }                                                          \
  }

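// For example, OBMCSADWXH(8, 16) below expands (roughly) to:
//
//   unsigned int aom_obmc_sad8x16_sse4_1(const uint8_t *pre, int pre_stride,
//                                        const int32_t *wsrc,
//                                        const int32_t *msk) {
//     return obmc_sad_w8n(pre, pre_stride, wsrc, msk, 8, 16);
//   }
//
// since the `w == 4` branch is resolved at compile time.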
OBMCSADWXH(128, 128)
OBMCSADWXH(128, 64)
OBMCSADWXH(64, 128)
OBMCSADWXH(64, 64)
OBMCSADWXH(64, 32)
OBMCSADWXH(32, 64)
OBMCSADWXH(32, 32)
OBMCSADWXH(32, 16)
OBMCSADWXH(16, 32)
OBMCSADWXH(16, 16)
OBMCSADWXH(16, 8)
OBMCSADWXH(8, 16)
OBMCSADWXH(8, 8)
OBMCSADWXH(8, 4)
OBMCSADWXH(4, 8)
OBMCSADWXH(4, 4)
OBMCSADWXH(4, 16)
OBMCSADWXH(16, 4)
OBMCSADWXH(8, 32)
OBMCSADWXH(32, 8)
OBMCSADWXH(16, 64)
OBMCSADWXH(64, 16)

////////////////////////////////////////////////////////////////////////////////
// High bit-depth
////////////////////////////////////////////////////////////////////////////////

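// The high bit-depth kernels below mirror the 8-bit kernels above: `pre8` is
// reinterpreted as a uint16_t pointer via CONVERT_TO_SHORTPTR, and the
// prediction samples are widened with _mm_cvtepu16_epi32 instead of
// _mm_cvtepu8_epi32; the weighting, difference, rounding and accumulation
// steps are unchanged.
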
static AOM_FORCE_INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *pre8,
                                                     const int pre_stride,
                                                     const int32_t *wsrc,
                                                     const int32_t *mask,
                                                     const int height) {
  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
  const int pre_step = pre_stride - 4;
  int n = 0;
  __m128i v_sad_d = _mm_setzero_si128();

  do {
    const __m128i v_p_w = xx_loadl_64(pre + n);
    const __m128i v_m_d = xx_load_128(mask + n);
    const __m128i v_w_d = xx_load_128(wsrc + n);

    const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w);

    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    // than pmulld but produces the same result with these inputs.
    const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);

    const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
    const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d);

    // Rounded absolute difference
    const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12);

    v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d);

    n += 4;

    if (n % 4 == 0) pre += pre_step;
  } while (n < 4 * height);

  return xx_hsum_epi32_si32(v_sad_d);
}

static AOM_FORCE_INLINE unsigned int hbd_obmc_sad_w8n(
    const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
    const int32_t *mask, const int width, const int height) {
  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
  const int pre_step = pre_stride - width;
  int n = 0;
  __m128i v_sad_d = _mm_setzero_si128();

  assert(width >= 8);
  assert(IS_POWER_OF_TWO(width));

  do {
    const __m128i v_p1_w = xx_loadl_64(pre + n + 4);
    const __m128i v_m1_d = xx_load_128(mask + n + 4);
    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
    const __m128i v_p0_w = xx_loadl_64(pre + n);
    const __m128i v_m0_d = xx_load_128(mask + n);
    const __m128i v_w0_d = xx_load_128(wsrc + n);

    const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w);
    const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w);

    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    // than pmulld but produces the same result with these inputs.
    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);

    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
    const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
    const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);

    // Rounded absolute difference
    const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12);
    const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12);

    v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d);
    v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d);

    n += 8;

    if (n % width == 0) pre += pre_step;
  } while (n < width * height);

  return xx_hsum_epi32_si32(v_sad_d);
}

#define HBD_OBMCSADWXH(w, h)                                      \
  unsigned int aom_highbd_obmc_sad##w##x##h##_sse4_1(             \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,    \
      const int32_t *mask) {                                      \
    if (w == 4) {                                                 \
      return hbd_obmc_sad_w4(pre, pre_stride, wsrc, mask, h);     \
    } else {                                                      \
      return hbd_obmc_sad_w8n(pre, pre_stride, wsrc, mask, w, h); \
    }                                                             \
  }

HBD_OBMCSADWXH(128, 128)
HBD_OBMCSADWXH(128, 64)
HBD_OBMCSADWXH(64, 128)
HBD_OBMCSADWXH(64, 64)
HBD_OBMCSADWXH(64, 32)
HBD_OBMCSADWXH(32, 64)
HBD_OBMCSADWXH(32, 32)
HBD_OBMCSADWXH(32, 16)
HBD_OBMCSADWXH(16, 32)
HBD_OBMCSADWXH(16, 16)
HBD_OBMCSADWXH(16, 8)
HBD_OBMCSADWXH(8, 16)
HBD_OBMCSADWXH(8, 8)
HBD_OBMCSADWXH(8, 4)
HBD_OBMCSADWXH(4, 8)
HBD_OBMCSADWXH(4, 4)
HBD_OBMCSADWXH(4, 16)
HBD_OBMCSADWXH(16, 4)
HBD_OBMCSADWXH(8, 32)
HBD_OBMCSADWXH(32, 8)
HBD_OBMCSADWXH(16, 64)
HBD_OBMCSADWXH(64, 16)