/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <immintrin.h> /* AVX2 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"

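// AVX2 implementation of the VP9 wide ("_16") horizontal loop filter.
// mb_lpf_horizontal_edge_w_avx2_8() filters an 8-pixel-wide edge and
// mb_lpf_horizontal_edge_w_avx2_16() a 16-pixel-wide edge; both read the
// eight rows above (p7..p0) and below (q0..q7) the edge at s, where p is
// the row stride in bytes.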
static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p,
                                            const unsigned char *_blimit,
                                            const unsigned char *_limit,
                                            const unsigned char *_thresh) {
  __m128i mask, hev, flat, flat2;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi8(1);
  __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
  __m128i abs_p1p0;

  const __m128i thresh =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0]));
  const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0]));
  const __m128i blimit =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0]));

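  // Each qXpX register holds two rows at once: row pX in the low 64 bits and
  // row qX in the high 64 bits, so the p and q sides of the edge are filtered
  // together. p1q1/p0q0 are the same data with the two halves swapped.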
  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
  q4p4 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p)));
  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
  q3p3 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p)));
  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
  q2p2 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p)));
  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
  q1p1 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p)));
  p1q1 = _mm_shuffle_epi32(q1p1, 78);
  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
  q0p0 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p)));
  p0q0 = _mm_shuffle_epi32(q0p0, 78);

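  // Build the filter mask: a column is filtered only when
  // |p0 - q0| * 2 + |p1 - q1| / 2 <= blimit and every neighbouring-pixel
  // difference up to p3/q3 is <= limit. hev flags columns whose first
  // difference exceeds thresh.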
  {
    __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
    abs_p1p0 =
        _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), _mm_subs_epu8(q0p0, q1p1));
    abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
    fe = _mm_set1_epi8(0xfe);
    ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    abs_p0q0 =
        _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), _mm_subs_epu8(p0q0, q0p0));
    abs_p1q1 =
        _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), _mm_subs_epu8(p1q1, q1p1));
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
    mask = _mm_max_epu8(abs_p1p0, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;

    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(q2p2, q1p1), _mm_subs_epu8(q1p1, q2p2)),
        _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), _mm_subs_epu8(q2p2, q3p3)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

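  // Standard 4-tap filter applied to both packed halves at once. The signed
  // >> 3 of the byte-sized filter values is done by widening into the high
  // byte of each 16-bit lane (unpacklo with zero) and arithmetic-shifting by
  // 11; packing filter2 against -filter1 lets a single saturating add apply
  // +Filter2 to p0 (low half) and -Filter1 to q0 (high half).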
  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i t1 = _mm_set1_epi16(0x1);
    __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
    __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
    __m128i qs0 = _mm_xor_si128(p0q0, t80);
    __m128i qs1 = _mm_xor_si128(p1q1, t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;
    __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
    __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;

    filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, qs0ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    filter1 = _mm_unpacklo_epi8(zero, filter1);
    filter1 = _mm_srai_epi16(filter1, 0xB);
    filter2 = _mm_unpacklo_epi8(zero, filter2);
    filter2 = _mm_srai_epi16(filter2, 0xB);

    /* Filter1 >> 3 */
    filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
    qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);

    /* filt >> 1 */
    filt = _mm_adds_epi16(filter1, t1);
    filt = _mm_srai_epi16(filt, 1);
    filt = _mm_andnot_si128(
        _mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), filt);
    filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
    qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
    // loopfilter done

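    // flat selects columns where p3..p1 and q1..q3 all lie within 1 of
    // p0/q0 (7-tap filtering of p2..q2); flat2 additionally requires
    // p7..p4 and q4..q7 to be that close (15-tap filtering of p6..q6).
    // Both are gated by the filter mask.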
    {
      __m128i work;
      flat = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(q2p2, q0p0), _mm_subs_epu8(q0p0, q2p2)),
          _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), _mm_subs_epu8(q0p0, q3p3)));
      flat = _mm_max_epu8(abs_p1p0, flat);
      flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
      flat = _mm_subs_epu8(flat, one);
      flat = _mm_cmpeq_epi8(flat, zero);
      flat = _mm_and_si128(flat, mask);

      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
      q5p5 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p)));

      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
      q6p6 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p)));

      flat2 = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), _mm_subs_epu8(q0p0, q4p4)),
          _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), _mm_subs_epu8(q0p0, q5p5)));

      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
      q7p7 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p)));

      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), _mm_subs_epu8(q0p0, q6p6)),
          _mm_or_si128(_mm_subs_epu8(q7p7, q0p0), _mm_subs_epu8(q0p0, q7p7)));

      flat2 = _mm_max_epu8(work, flat2);
      flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
      flat2 = _mm_subs_epu8(flat2, one);
      flat2 = _mm_cmpeq_epi8(flat2, zero);
      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
    }

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // flat and wide flat calculations
    {
      const __m128i eight = _mm_set1_epi16(8);
      const __m128i four = _mm_set1_epi16(4);
      __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
      __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
      __m128i pixelFilter_p, pixelFilter_q;
      __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
      __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;

      p7_16 = _mm_unpacklo_epi8(q7p7, zero);
      p6_16 = _mm_unpacklo_epi8(q6p6, zero);
      p5_16 = _mm_unpacklo_epi8(q5p5, zero);
      p4_16 = _mm_unpacklo_epi8(q4p4, zero);
      p3_16 = _mm_unpacklo_epi8(q3p3, zero);
      p2_16 = _mm_unpacklo_epi8(q2p2, zero);
      p1_16 = _mm_unpacklo_epi8(q1p1, zero);
      p0_16 = _mm_unpacklo_epi8(q0p0, zero);
      q0_16 = _mm_unpackhi_epi8(q0p0, zero);
      q1_16 = _mm_unpackhi_epi8(q1p1, zero);
      q2_16 = _mm_unpackhi_epi8(q2p2, zero);
      q3_16 = _mm_unpackhi_epi8(q3p3, zero);
      q4_16 = _mm_unpackhi_epi8(q4p4, zero);
      q5_16 = _mm_unpackhi_epi8(q5p5, zero);
      q6_16 = _mm_unpackhi_epi8(q6p6, zero);
      q7_16 = _mm_unpackhi_epi8(q7p7, zero);

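      // Running-sum evaluation of the 7- and 15-tap filters in 16-bit lanes:
      // pixelFilter_p starts as the rounded sum of p6..p0 and q6..q0, and
      // each later output drops the farthest sample from one side while
      // adding another p7 (or q7), so every tap costs only a few
      // adds/subtracts. pixetFilter_p2p1p0 plays the same role for the
      // 7-tap (flat) filter.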
      pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
                                    _mm_add_epi16(p4_16, p3_16));
      pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
                                    _mm_add_epi16(q4_16, q3_16));

      pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
      pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);

      pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
      pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
      pixelFilter_p =
          _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
      pixetFilter_p2p1p0 = _mm_add_epi16(
          four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4);
      flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);

      flat_q0p0 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(p7_16, p7_16);
      sum_q7 = _mm_add_epi16(q7_16, q7_16);
      sum_p3 = _mm_add_epi16(p3_16, p3_16);
      sum_q3 = _mm_add_epi16(q3_16, q3_16);

      pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4);
      flat2_q1p1 = _mm_packus_epi16(res_p, res_q);

      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
      flat_q1p1 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      sum_p3 = _mm_add_epi16(sum_p3, p3_16);
      sum_q3 = _mm_add_epi16(sum_q3, q3_16);

      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4);
      flat2_q2p2 = _mm_packus_epi16(res_p, res_q);

      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);

      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
      flat_q2p2 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4);
      flat2_q3p3 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4);
      flat2_q4p4 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4);
      flat2_q5p5 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4);
      flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
    }
    // wide flat
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

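    // Broadcast the low 64 bits of each mask to both halves (shuffle control
    // 68 = 0b01000100) so they cover the packed p and q rows, then blend:
    // flat selects the 7-tap results for p2..q2 and flat2 the 15-tap results
    // for p6..q6; anything else keeps the 4-tap (or unfiltered) value.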
    flat = _mm_shuffle_epi32(flat, 68);
    flat2 = _mm_shuffle_epi32(flat2, 68);

    q2p2 = _mm_andnot_si128(flat, q2p2);
    flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat_q2p2);

    qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
    flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
    q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);

    qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
    flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
    q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);

    q6p6 = _mm_andnot_si128(flat2, q6p6);
    flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
    q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
    _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
    _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));

    q5p5 = _mm_andnot_si128(flat2, q5p5);
    flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
    q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
    _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
    _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));

    q4p4 = _mm_andnot_si128(flat2, q4p4);
    flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
    q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
    _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
    _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));

    q3p3 = _mm_andnot_si128(flat2, q3p3);
    flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
    q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
    _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
    _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));

    q2p2 = _mm_andnot_si128(flat2, q2p2);
    flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
    _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
    _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));

    q1p1 = _mm_andnot_si128(flat2, q1p1);
    flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
    q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
    _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
    _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));

    q0p0 = _mm_andnot_si128(flat2, q0p0);
    flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
    q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
    _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
    _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
  }
}

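// Shuffle control for _mm256_shuffle_epi8 that zero-extends the 16 bytes of
// each 128-bit lane into 16-bit words: indices with the high bit set (128)
// make the shuffle write zero.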
DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = {
  0, 128, 1, 128, 2, 128, 3, 128, 4, 128, 5, 128, 6, 128, 7, 128,
  8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128
};

static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
                                             const unsigned char *_blimit,
                                             const unsigned char *_limit,
                                             const unsigned char *_thresh) {
  __m128i mask, hev, flat, flat2;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi8(1);
  __m128i p7, p6, p5;
  __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
  __m128i q5, q6, q7;
  __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4, q256_4,
      p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, p256_0, q256_0;

  const __m128i thresh =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0]));
  const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0]));
  const __m128i blimit =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0]));

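  // Each 16-byte row is loaded once and broadcast into both 128-bit lanes of
  // a 256-bit register. The mask and 4-tap filter below work on the 128-bit
  // view; the broadcast copy feeds the 256-bit flat/wide-flat sums, where a
  // lane-local byte shuffle zero-extends columns 0-7 in lane 0 and columns
  // 8-15 in lane 1.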
  p256_4 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 5 * p)));
  p256_3 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p)));
  p256_2 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p)));
  p256_1 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p)));
  p256_0 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p)));
  q256_0 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p)));
  q256_1 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p)));
  q256_2 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p)));
  q256_3 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p)));
  q256_4 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 4 * p)));

  p4 = _mm256_castsi256_si128(p256_4);
  p3 = _mm256_castsi256_si128(p256_3);
  p2 = _mm256_castsi256_si128(p256_2);
  p1 = _mm256_castsi256_si128(p256_1);
  p0 = _mm256_castsi256_si128(p256_0);
  q0 = _mm256_castsi256_si128(q256_0);
  q1 = _mm256_castsi256_si128(q256_1);
  q2 = _mm256_castsi256_si128(q256_2);
  q3 = _mm256_castsi256_si128(q256_3);
  q4 = _mm256_castsi256_si128(q256_4);

  {
    const __m128i abs_p1p0 =
        _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
    const __m128i abs_q1q0 =
        _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
    const __m128i fe = _mm_set1_epi8(0xfe);
    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    __m128i abs_p0q0 =
        _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
    __m128i abs_p1q1 =
        _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
    __m128i work;
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
    mask = _mm_max_epu8(flat, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
        _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
    mask = _mm_max_epu8(work, mask);
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
        _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

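  // Standard 4-tap filter on all 16 columns. The arithmetic >> 3 of the
  // signed byte-sized filter values is emulated with an unsigned 16-bit
  // shift followed by a sign fix-up (the te0/t1f masks restore the high
  // bits of negative values).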
  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i te0 = _mm_set1_epi8(0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);

    __m128i ps1 = _mm_xor_si128(p1, t80);
    __m128i ps0 = _mm_xor_si128(p0, t80);
    __m128i qs0 = _mm_xor_si128(q0, t80);
    __m128i qs1 = _mm_xor_si128(q1, t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;
    __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1,
        flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4, flat2_q5,
        flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1, flat_q2;

    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    /* Filter1 >> 3 */
    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);
    qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);

    /* Filter2 >> 3 */
    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);
    ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);

    /* filt >> 1 */
    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);
    filt = _mm_andnot_si128(hev, filt);
    ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
    qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
    // loopfilter done

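    // flat/flat2 flatness tests as in the 8-wide kernel: flat covers the
    // samples up to p3/q3 and flat2 extends the ±1 check against p0/q0 out
    // to p7/q7, loading the outer rows on demand.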
    {
      __m128i work;
      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
          _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
      flat = _mm_max_epu8(work, flat);
      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
          _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
      flat = _mm_max_epu8(work, flat);
      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)),
          _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4)));
      flat = _mm_subs_epu8(flat, one);
      flat = _mm_cmpeq_epi8(flat, zero);
      flat = _mm_and_si128(flat, mask);

      p256_5 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s - 6 * p)));
      q256_5 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s + 5 * p)));
      p5 = _mm256_castsi256_si128(p256_5);
      q5 = _mm256_castsi256_si128(q256_5);
      flat2 = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)),
          _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5)));

      flat2 = _mm_max_epu8(work, flat2);
      p256_6 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s - 7 * p)));
      q256_6 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s + 6 * p)));
      p6 = _mm256_castsi256_si128(p256_6);
      q6 = _mm256_castsi256_si128(q256_6);
      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)),
          _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6)));

      flat2 = _mm_max_epu8(work, flat2);

      p256_7 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s - 8 * p)));
      q256_7 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s + 7 * p)));
      p7 = _mm256_castsi256_si128(p256_7);
      q7 = _mm256_castsi256_si128(q256_7);
      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)),
          _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7)));

      flat2 = _mm_max_epu8(work, flat2);
      flat2 = _mm_subs_epu8(flat2, one);
      flat2 = _mm_cmpeq_epi8(flat2, zero);
      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
    }

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // flat and wide flat calculations
    {
      const __m256i eight = _mm256_set1_epi16(8);
      const __m256i four = _mm256_set1_epi16(4);
      __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0,
          pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;

      const __m256i filter =
          _mm256_load_si256((__m256i const *)filt_loopfilter_avx2);
      p256_7 = _mm256_shuffle_epi8(p256_7, filter);
      p256_6 = _mm256_shuffle_epi8(p256_6, filter);
      p256_5 = _mm256_shuffle_epi8(p256_5, filter);
      p256_4 = _mm256_shuffle_epi8(p256_4, filter);
      p256_3 = _mm256_shuffle_epi8(p256_3, filter);
      p256_2 = _mm256_shuffle_epi8(p256_2, filter);
      p256_1 = _mm256_shuffle_epi8(p256_1, filter);
      p256_0 = _mm256_shuffle_epi8(p256_0, filter);
      q256_0 = _mm256_shuffle_epi8(q256_0, filter);
      q256_1 = _mm256_shuffle_epi8(q256_1, filter);
      q256_2 = _mm256_shuffle_epi8(q256_2, filter);
      q256_3 = _mm256_shuffle_epi8(q256_3, filter);
      q256_4 = _mm256_shuffle_epi8(q256_4, filter);
      q256_5 = _mm256_shuffle_epi8(q256_5, filter);
      q256_6 = _mm256_shuffle_epi8(q256_6, filter);
      q256_7 = _mm256_shuffle_epi8(q256_7, filter);

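      // Each 256-bit row now holds columns 0-7 as 16-bit words in lane 0 and
      // columns 8-15 in lane 1. After the per-lane packus below, the
      // permute4x64 with control 168 (qwords 0,2,2,2) gathers both lanes'
      // low quadwords, so the cast back to __m128i yields all 16 filtered
      // bytes.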
      pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5),
                                       _mm256_add_epi16(p256_4, p256_3));
      pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5),
                                       _mm256_add_epi16(q256_4, q256_3));

      pixetFilter_p2p1p0 =
          _mm256_add_epi16(p256_0, _mm256_add_epi16(p256_2, p256_1));
      pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);

      pixetFilter_q2q1q0 =
          _mm256_add_epi16(q256_0, _mm256_add_epi16(q256_2, q256_1));
      pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);

      pixelFilter_p = _mm256_add_epi16(
          eight, _mm256_add_epi16(pixelFilter_p, pixelFilter_q));

      pixetFilter_p2p1p0 = _mm256_add_epi16(
          four, _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(p256_7, p256_0)),
          4);
      flat2_p0 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(q256_7, q256_0)),
          4);
      flat2_q0 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixetFilter_p2p1p0,
                           _mm256_add_epi16(p256_3, p256_0)),
          3);
      flat_p0 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixetFilter_p2p1p0,
                           _mm256_add_epi16(q256_3, q256_0)),
          3);
      flat_q0 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      sum_p7 = _mm256_add_epi16(p256_7, p256_7);
      sum_q7 = _mm256_add_epi16(q256_7, q256_7);
      sum_p3 = _mm256_add_epi16(p256_3, p256_3);
      sum_q3 = _mm256_add_epi16(q256_3, q256_3);

      pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6);
      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_1)),
          4);
      flat2_p1 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_1)),
          4);
      flat2_q1 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2);
      pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixetFilter_p2p1p0,
                           _mm256_add_epi16(sum_p3, p256_1)),
          3);
      flat_p1 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixetFilter_q2q1q0,
                           _mm256_add_epi16(sum_q3, q256_1)),
          3);
      flat_q1 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
      sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
      sum_p3 = _mm256_add_epi16(sum_p3, p256_3);
      sum_q3 = _mm256_add_epi16(sum_q3, q256_3);

      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5);
      pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_2)),
          4);
      flat2_p2 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_2)),
          4);
      flat2_q2 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1);
      pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixetFilter_p2p1p0,
                           _mm256_add_epi16(sum_p3, p256_2)),
          3);
      flat_p2 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixetFilter_q2q1q0,
                           _mm256_add_epi16(sum_q3, q256_2)),
          3);
      flat_q2 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
      sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4);
      pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_3)),
          4);
      flat2_p3 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_3)),
          4);
      flat2_q3 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
      sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3);
      pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_4)),
          4);
      flat2_p4 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_4)),
          4);
      flat2_q4 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
      sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2);
      pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_5)),
          4);
      flat2_p5 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_5)),
          4);
      flat2_q5 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
      sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1);
      pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_6)),
          4);
      flat2_p6 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_6)),
          4);
      flat2_q6 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
    }

    // wide flat
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

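    // Blend and store: flat selects the 7-tap results for p2..q2, flat2 the
    // 15-tap results for p6..q6; remaining columns keep the 4-tap (or
    // original) values.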
    p2 = _mm_andnot_si128(flat, p2);
    flat_p2 = _mm_and_si128(flat, flat_p2);
    p2 = _mm_or_si128(flat_p2, p2);

    p1 = _mm_andnot_si128(flat, ps1);
    flat_p1 = _mm_and_si128(flat, flat_p1);
    p1 = _mm_or_si128(flat_p1, p1);

    p0 = _mm_andnot_si128(flat, ps0);
    flat_p0 = _mm_and_si128(flat, flat_p0);
    p0 = _mm_or_si128(flat_p0, p0);

    q0 = _mm_andnot_si128(flat, qs0);
    flat_q0 = _mm_and_si128(flat, flat_q0);
    q0 = _mm_or_si128(flat_q0, q0);

    q1 = _mm_andnot_si128(flat, qs1);
    flat_q1 = _mm_and_si128(flat, flat_q1);
    q1 = _mm_or_si128(flat_q1, q1);

    q2 = _mm_andnot_si128(flat, q2);
    flat_q2 = _mm_and_si128(flat, flat_q2);
    q2 = _mm_or_si128(flat_q2, q2);

    p6 = _mm_andnot_si128(flat2, p6);
    flat2_p6 = _mm_and_si128(flat2, flat2_p6);
    p6 = _mm_or_si128(flat2_p6, p6);
    _mm_storeu_si128((__m128i *)(s - 7 * p), p6);

    p5 = _mm_andnot_si128(flat2, p5);
    flat2_p5 = _mm_and_si128(flat2, flat2_p5);
    p5 = _mm_or_si128(flat2_p5, p5);
    _mm_storeu_si128((__m128i *)(s - 6 * p), p5);

    p4 = _mm_andnot_si128(flat2, p4);
    flat2_p4 = _mm_and_si128(flat2, flat2_p4);
    p4 = _mm_or_si128(flat2_p4, p4);
    _mm_storeu_si128((__m128i *)(s - 5 * p), p4);

    p3 = _mm_andnot_si128(flat2, p3);
    flat2_p3 = _mm_and_si128(flat2, flat2_p3);
    p3 = _mm_or_si128(flat2_p3, p3);
    _mm_storeu_si128((__m128i *)(s - 4 * p), p3);

    p2 = _mm_andnot_si128(flat2, p2);
    flat2_p2 = _mm_and_si128(flat2, flat2_p2);
    p2 = _mm_or_si128(flat2_p2, p2);
    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);

    p1 = _mm_andnot_si128(flat2, p1);
    flat2_p1 = _mm_and_si128(flat2, flat2_p1);
    p1 = _mm_or_si128(flat2_p1, p1);
    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);

    p0 = _mm_andnot_si128(flat2, p0);
    flat2_p0 = _mm_and_si128(flat2, flat2_p0);
    p0 = _mm_or_si128(flat2_p0, p0);
    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);

    q0 = _mm_andnot_si128(flat2, q0);
    flat2_q0 = _mm_and_si128(flat2, flat2_q0);
    q0 = _mm_or_si128(flat2_q0, q0);
    _mm_storeu_si128((__m128i *)(s - 0 * p), q0);

    q1 = _mm_andnot_si128(flat2, q1);
    flat2_q1 = _mm_and_si128(flat2, flat2_q1);
    q1 = _mm_or_si128(flat2_q1, q1);
    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);

    q2 = _mm_andnot_si128(flat2, q2);
    flat2_q2 = _mm_and_si128(flat2, flat2_q2);
    q2 = _mm_or_si128(flat2_q2, q2);
    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);

    q3 = _mm_andnot_si128(flat2, q3);
    flat2_q3 = _mm_and_si128(flat2, flat2_q3);
    q3 = _mm_or_si128(flat2_q3, q3);
    _mm_storeu_si128((__m128i *)(s + 3 * p), q3);

    q4 = _mm_andnot_si128(flat2, q4);
    flat2_q4 = _mm_and_si128(flat2, flat2_q4);
    q4 = _mm_or_si128(flat2_q4, q4);
    _mm_storeu_si128((__m128i *)(s + 4 * p), q4);

    q5 = _mm_andnot_si128(flat2, q5);
    flat2_q5 = _mm_and_si128(flat2, flat2_q5);
    q5 = _mm_or_si128(flat2_q5, q5);
    _mm_storeu_si128((__m128i *)(s + 5 * p), q5);

    q6 = _mm_andnot_si128(flat2, q6);
    flat2_q6 = _mm_and_si128(flat2, flat2_q6);
    q6 = _mm_or_si128(flat2_q6, q6);
    _mm_storeu_si128((__m128i *)(s + 6 * p), q6);
  }
}

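// Dispatch: count == 1 filters a single 8-pixel-wide edge; otherwise a
// 16-pixel-wide edge is filtered in one pass.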
void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p,
                                const unsigned char *_blimit,
                                const unsigned char *_limit,
                                const unsigned char *_thresh, int count) {
  if (count == 1)
    mb_lpf_horizontal_edge_w_avx2_8(s, p, _blimit, _limit, _thresh);
  else
    mb_lpf_horizontal_edge_w_avx2_16(s, p, _blimit, _limit, _thresh);
}