/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <immintrin.h>  /* AVX2 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"

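/*
 * Wide loop filter across a horizontal edge, applied to 8 pixel columns.
 * s points at the first row below the edge (q0) and p is the row stride,
 * so rows p7..p0 lie above the edge and q0..q7 below it.  Each qXpX
 * register packs the eight pX bytes in its low half and the eight qX
 * bytes in its high half, so both sides of the edge are filtered together.
 */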
static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p,
        const unsigned char *_blimit, const unsigned char *_limit,
        const unsigned char *_thresh) {
    __m128i mask, hev, flat, flat2;
    const __m128i zero = _mm_set1_epi16(0);
    const __m128i one = _mm_set1_epi8(1);
    __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
    __m128i abs_p1p0;

    const __m128i thresh = _mm_broadcastb_epi8(
            _mm_cvtsi32_si128((int) _thresh[0]));
    const __m128i limit = _mm_broadcastb_epi8(
            _mm_cvtsi32_si128((int) _limit[0]));
    const __m128i blimit = _mm_broadcastb_epi8(
            _mm_cvtsi32_si128((int) _blimit[0]));

    q4p4 = _mm_loadl_epi64((__m128i *) (s - 5 * p));
    q4p4 = _mm_castps_si128(
            _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *) (s + 4 * p)));
    q3p3 = _mm_loadl_epi64((__m128i *) (s - 4 * p));
    q3p3 = _mm_castps_si128(
            _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *) (s + 3 * p)));
    q2p2 = _mm_loadl_epi64((__m128i *) (s - 3 * p));
    q2p2 = _mm_castps_si128(
            _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *) (s + 2 * p)));
    q1p1 = _mm_loadl_epi64((__m128i *) (s - 2 * p));
    q1p1 = _mm_castps_si128(
            _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *) (s + 1 * p)));
    p1q1 = _mm_shuffle_epi32(q1p1, 78);
    q0p0 = _mm_loadl_epi64((__m128i *) (s - 1 * p));
    q0p0 = _mm_castps_si128(
            _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *) (s - 0 * p)));
    p0q0 = _mm_shuffle_epi32(q0p0, 78);
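    /* _mm_shuffle_epi32(x, 78) swaps the two 64-bit halves (78 == 0b01001110),
     * so p1q1/p0q0 mirror q1p1/q0p0 for the cross-edge differences below. */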

    {
        __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
        abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0),
                _mm_subs_epu8(q0p0, q1p1));
        abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
        fe = _mm_set1_epi8(0xfe);
        ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
        abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0),
                _mm_subs_epu8(p0q0, q0p0));
        abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1),
                _mm_subs_epu8(p1q1, q1p1));
        flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
        hev = _mm_subs_epu8(flat, thresh);
        hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

        abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
        abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
        mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
        mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
        // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
        mask = _mm_max_epu8(abs_p1p0, mask);
        // mask |= (abs(p1 - p0) > limit) * -1;
        // mask |= (abs(q1 - q0) > limit) * -1;

        work = _mm_max_epu8(
                _mm_or_si128(_mm_subs_epu8(q2p2, q1p1),
                        _mm_subs_epu8(q1p1, q2p2)),
                _mm_or_si128(_mm_subs_epu8(q3p3, q2p2),
                        _mm_subs_epu8(q2p2, q3p3)));
        mask = _mm_max_epu8(work, mask);
        mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
        mask = _mm_subs_epu8(mask, limit);
        mask = _mm_cmpeq_epi8(mask, zero);
    }
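    /* mask now holds 0xff in every byte lane where |p0-q0|*2 + |p1-q1|/2 is
     * within blimit and all neighbouring tap differences are within limit,
     * i.e. the lanes the loop filter is allowed to modify. */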

    // lp filter
    {
        const __m128i t4 = _mm_set1_epi8(4);
        const __m128i t3 = _mm_set1_epi8(3);
        const __m128i t80 = _mm_set1_epi8(0x80);
        const __m128i t1 = _mm_set1_epi16(0x1);
        __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
        __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
        __m128i qs0 = _mm_xor_si128(p0q0, t80);
        __m128i qs1 = _mm_xor_si128(p1q1, t80);
        __m128i filt;
        __m128i work_a;
        __m128i filter1, filter2;
        __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
        __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;

        filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
        work_a = _mm_subs_epi8(qs0, qs0ps0);
        filt = _mm_adds_epi8(filt, work_a);
        filt = _mm_adds_epi8(filt, work_a);
        filt = _mm_adds_epi8(filt, work_a);
        /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
        filt = _mm_and_si128(filt, mask);

        filter1 = _mm_adds_epi8(filt, t4);
        filter2 = _mm_adds_epi8(filt, t3);

        filter1 = _mm_unpacklo_epi8(zero, filter1);
        filter1 = _mm_srai_epi16(filter1, 0xB);
        filter2 = _mm_unpacklo_epi8(zero, filter2);
        filter2 = _mm_srai_epi16(filter2, 0xB);
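        /* There is no 8-bit arithmetic shift, so the signed bytes are placed
         * in the high byte of each 16-bit lane (unpacklo with zero) and
         * arithmetic-shifted by 11, which is filter >> 3 with the sign kept.
         * Packing filter2 with -filter1 below lets one adds_epi8 apply
         * +Filter2 to the p0 half and -Filter1 to the q0 half of qs0ps0. */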

        /* Filter1 >> 3 */
        filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
        qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);

        /* filt >> 1 */
        filt = _mm_adds_epi16(filter1, t1);
        filt = _mm_srai_epi16(filt, 1);
        filt = _mm_andnot_si128(
                _mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), filt);
        filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
        qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
        // loopfilter done

        {
            __m128i work;
            flat = _mm_max_epu8(
                    _mm_or_si128(_mm_subs_epu8(q2p2, q0p0),
                            _mm_subs_epu8(q0p0, q2p2)),
                    _mm_or_si128(_mm_subs_epu8(q3p3, q0p0),
                            _mm_subs_epu8(q0p0, q3p3)));
            flat = _mm_max_epu8(abs_p1p0, flat);
            flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
            flat = _mm_subs_epu8(flat, one);
            flat = _mm_cmpeq_epi8(flat, zero);
            flat = _mm_and_si128(flat, mask);

            q5p5 = _mm_loadl_epi64((__m128i *) (s - 6 * p));
            q5p5 = _mm_castps_si128(
                    _mm_loadh_pi(_mm_castsi128_ps(q5p5),
                            (__m64 *) (s + 5 * p)));

            q6p6 = _mm_loadl_epi64((__m128i *) (s - 7 * p));
            q6p6 = _mm_castps_si128(
                    _mm_loadh_pi(_mm_castsi128_ps(q6p6),
                            (__m64 *) (s + 6 * p)));

            flat2 = _mm_max_epu8(
                    _mm_or_si128(_mm_subs_epu8(q4p4, q0p0),
                            _mm_subs_epu8(q0p0, q4p4)),
                    _mm_or_si128(_mm_subs_epu8(q5p5, q0p0),
                            _mm_subs_epu8(q0p0, q5p5)));

            q7p7 = _mm_loadl_epi64((__m128i *) (s - 8 * p));
            q7p7 = _mm_castps_si128(
                    _mm_loadh_pi(_mm_castsi128_ps(q7p7),
                            (__m64 *) (s + 7 * p)));

            work = _mm_max_epu8(
                    _mm_or_si128(_mm_subs_epu8(q6p6, q0p0),
                            _mm_subs_epu8(q0p0, q6p6)),
                    _mm_or_si128(_mm_subs_epu8(q7p7, q0p0),
                            _mm_subs_epu8(q0p0, q7p7)));

            flat2 = _mm_max_epu8(work, flat2);
            flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
            flat2 = _mm_subs_epu8(flat2, one);
            flat2 = _mm_cmpeq_epi8(flat2, zero);
            flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
        }
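        /* flat marks lanes where p1..p3/q1..q3 are all within 1 of p0/q0 and
         * selects the 7-tap (flat_*) filter; flat2 additionally requires
         * p4..p7/q4..q7 to be within 1 and selects the wide 15-tap (flat2_*)
         * filter.  Both are already ANDed with mask. */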

        // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        // flat and wide flat calculations
        {
            const __m128i eight = _mm_set1_epi16(8);
            const __m128i four = _mm_set1_epi16(4);
            __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
            __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
            __m128i pixelFilter_p, pixelFilter_q;
            __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
            __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;

            p7_16 = _mm_unpacklo_epi8(q7p7, zero);
            p6_16 = _mm_unpacklo_epi8(q6p6, zero);
            p5_16 = _mm_unpacklo_epi8(q5p5, zero);
            p4_16 = _mm_unpacklo_epi8(q4p4, zero);
            p3_16 = _mm_unpacklo_epi8(q3p3, zero);
            p2_16 = _mm_unpacklo_epi8(q2p2, zero);
            p1_16 = _mm_unpacklo_epi8(q1p1, zero);
            p0_16 = _mm_unpacklo_epi8(q0p0, zero);
            q0_16 = _mm_unpackhi_epi8(q0p0, zero);
            q1_16 = _mm_unpackhi_epi8(q1p1, zero);
            q2_16 = _mm_unpackhi_epi8(q2p2, zero);
            q3_16 = _mm_unpackhi_epi8(q3p3, zero);
            q4_16 = _mm_unpackhi_epi8(q4p4, zero);
            q5_16 = _mm_unpackhi_epi8(q5p5, zero);
            q6_16 = _mm_unpackhi_epi8(q6p6, zero);
            q7_16 = _mm_unpackhi_epi8(q7p7, zero);

            pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
                    _mm_add_epi16(p4_16, p3_16));
            pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
                    _mm_add_epi16(q4_16, q3_16));

            pixetFilter_p2p1p0 = _mm_add_epi16(p0_16,
                    _mm_add_epi16(p2_16, p1_16));
            pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);

            pixetFilter_q2q1q0 = _mm_add_epi16(q0_16,
                    _mm_add_epi16(q2_16, q1_16));
            pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
            pixelFilter_p = _mm_add_epi16(eight,
                    _mm_add_epi16(pixelFilter_p, pixelFilter_q));
            pixetFilter_p2p1p0 = _mm_add_epi16(four,
                    _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
            res_p = _mm_srli_epi16(
                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)),
                    4);
            res_q = _mm_srli_epi16(
                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)),
                    4);
            flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
            res_p = _mm_srli_epi16(
                    _mm_add_epi16(pixetFilter_p2p1p0,
                            _mm_add_epi16(p3_16, p0_16)), 3);
            res_q = _mm_srli_epi16(
                    _mm_add_epi16(pixetFilter_p2p1p0,
                            _mm_add_epi16(q3_16, q0_16)), 3);

            flat_q0p0 = _mm_packus_epi16(res_p, res_q);

            sum_p7 = _mm_add_epi16(p7_16, p7_16);
            sum_q7 = _mm_add_epi16(q7_16, q7_16);
            sum_p3 = _mm_add_epi16(p3_16, p3_16);
            sum_q3 = _mm_add_epi16(q3_16, q3_16);

            pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
            pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
            res_p = _mm_srli_epi16(
                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)),
                    4);
            res_q = _mm_srli_epi16(
                    _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)),
                    4);
            flat2_q1p1 = _mm_packus_epi16(res_p, res_q);

            pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
            pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
            res_p = _mm_srli_epi16(
                    _mm_add_epi16(pixetFilter_p2p1p0,
                            _mm_add_epi16(sum_p3, p1_16)), 3);
            res_q = _mm_srli_epi16(
                    _mm_add_epi16(pixetFilter_q2q1q0,
                            _mm_add_epi16(sum_q3, q1_16)), 3);
            flat_q1p1 = _mm_packus_epi16(res_p, res_q);

            sum_p7 = _mm_add_epi16(sum_p7, p7_16);
            sum_q7 = _mm_add_epi16(sum_q7, q7_16);
            sum_p3 = _mm_add_epi16(sum_p3, p3_16);
            sum_q3 = _mm_add_epi16(sum_q3, q3_16);

            pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
            pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
            res_p = _mm_srli_epi16(
                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)),
                    4);
            res_q = _mm_srli_epi16(
                    _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)),
                    4);
            flat2_q2p2 = _mm_packus_epi16(res_p, res_q);

            pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
            pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);

            res_p = _mm_srli_epi16(
                    _mm_add_epi16(pixetFilter_p2p1p0,
                            _mm_add_epi16(sum_p3, p2_16)), 3);
            res_q = _mm_srli_epi16(
                    _mm_add_epi16(pixetFilter_q2q1q0,
                            _mm_add_epi16(sum_q3, q2_16)), 3);
            flat_q2p2 = _mm_packus_epi16(res_p, res_q);

            sum_p7 = _mm_add_epi16(sum_p7, p7_16);
            sum_q7 = _mm_add_epi16(sum_q7, q7_16);
            pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
            pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
            res_p = _mm_srli_epi16(
                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)),
                    4);
            res_q = _mm_srli_epi16(
                    _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)),
                    4);
            flat2_q3p3 = _mm_packus_epi16(res_p, res_q);

            sum_p7 = _mm_add_epi16(sum_p7, p7_16);
            sum_q7 = _mm_add_epi16(sum_q7, q7_16);
            pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
            pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
            res_p = _mm_srli_epi16(
                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)),
                    4);
            res_q = _mm_srli_epi16(
                    _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)),
                    4);
            flat2_q4p4 = _mm_packus_epi16(res_p, res_q);

            sum_p7 = _mm_add_epi16(sum_p7, p7_16);
            sum_q7 = _mm_add_epi16(sum_q7, q7_16);
            pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
            pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
            res_p = _mm_srli_epi16(
                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)),
                    4);
            res_q = _mm_srli_epi16(
                    _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)),
                    4);
            flat2_q5p5 = _mm_packus_epi16(res_p, res_q);

            sum_p7 = _mm_add_epi16(sum_p7, p7_16);
            sum_q7 = _mm_add_epi16(sum_q7, q7_16);
            pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
            pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
            res_p = _mm_srli_epi16(
                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)),
                    4);
            res_q = _mm_srli_epi16(
                    _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)),
                    4);
            flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
        }
        // wide flat
        // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

        flat = _mm_shuffle_epi32(flat, 68);
        flat2 = _mm_shuffle_epi32(flat2, 68);
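        /* 68 == 0b01000100 duplicates the low 64 bits of flat/flat2 into both
         * halves, so a single mask covers the p half and the q half of each
         * qXpX register in the blends below. */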

        q2p2 = _mm_andnot_si128(flat, q2p2);
        flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
        q2p2 = _mm_or_si128(q2p2, flat_q2p2);

        qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
        flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
        q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);

        qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
        flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
        q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);

        q6p6 = _mm_andnot_si128(flat2, q6p6);
        flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
        q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
        _mm_storel_epi64((__m128i *) (s - 7 * p), q6p6);
        _mm_storeh_pi((__m64 *) (s + 6 * p), _mm_castsi128_ps(q6p6));

        q5p5 = _mm_andnot_si128(flat2, q5p5);
        flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
        q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
        _mm_storel_epi64((__m128i *) (s - 6 * p), q5p5);
        _mm_storeh_pi((__m64 *) (s + 5 * p), _mm_castsi128_ps(q5p5));

        q4p4 = _mm_andnot_si128(flat2, q4p4);
        flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
        q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
        _mm_storel_epi64((__m128i *) (s - 5 * p), q4p4);
        _mm_storeh_pi((__m64 *) (s + 4 * p), _mm_castsi128_ps(q4p4));

        q3p3 = _mm_andnot_si128(flat2, q3p3);
        flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
        q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
        _mm_storel_epi64((__m128i *) (s - 4 * p), q3p3);
        _mm_storeh_pi((__m64 *) (s + 3 * p), _mm_castsi128_ps(q3p3));

        q2p2 = _mm_andnot_si128(flat2, q2p2);
        flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
        q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
        _mm_storel_epi64((__m128i *) (s - 3 * p), q2p2);
        _mm_storeh_pi((__m64 *) (s + 2 * p), _mm_castsi128_ps(q2p2));

        q1p1 = _mm_andnot_si128(flat2, q1p1);
        flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
        q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
        _mm_storel_epi64((__m128i *) (s - 2 * p), q1p1);
        _mm_storeh_pi((__m64 *) (s + 1 * p), _mm_castsi128_ps(q1p1));

        q0p0 = _mm_andnot_si128(flat2, q0p0);
        flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
        q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
        _mm_storel_epi64((__m128i *) (s - 1 * p), q0p0);
        _mm_storeh_pi((__m64 *) (s - 0 * p), _mm_castsi128_ps(q0p0));
    }
}

DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = {
  0, 128, 1, 128, 2, 128, 3, 128, 4, 128, 5, 128, 6, 128, 7, 128,
  8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128
};

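/*
 * Same wide filter for a 16-pixel-wide edge.  Each full 16-byte row is
 * loaded with _mm256_broadcast_pd so the row occupies both 128-bit lanes of
 * a ymm register.  filt_loopfilter_avx2 above pairs every byte index with
 * 128; a shuffle-control byte with bit 7 set makes _mm256_shuffle_epi8 write
 * zero, so a single shuffle later zero-extends all 16 bytes of a row to
 * 16-bit words (words 0-7 in the low lane, 8-15 in the high lane).
 */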
static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
        const unsigned char *_blimit, const unsigned char *_limit,
        const unsigned char *_thresh) {
    __m128i mask, hev, flat, flat2;
    const __m128i zero = _mm_set1_epi16(0);
    const __m128i one = _mm_set1_epi8(1);
    __m128i p7, p6, p5;
    __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
    __m128i q5, q6, q7;
    __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4,
            q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1,
            p256_0, q256_0;

    const __m128i thresh = _mm_broadcastb_epi8(
            _mm_cvtsi32_si128((int) _thresh[0]));
    const __m128i limit = _mm_broadcastb_epi8(
            _mm_cvtsi32_si128((int) _limit[0]));
    const __m128i blimit = _mm_broadcastb_epi8(
            _mm_cvtsi32_si128((int) _blimit[0]));

    p256_4 = _mm256_castpd_si256(_mm256_broadcast_pd(
                                (__m128d const *)(s - 5 * p)));
    p256_3 = _mm256_castpd_si256(_mm256_broadcast_pd(
                                (__m128d const *)(s - 4 * p)));
    p256_2 = _mm256_castpd_si256(_mm256_broadcast_pd(
                                (__m128d const *)(s - 3 * p)));
    p256_1 = _mm256_castpd_si256(_mm256_broadcast_pd(
                                (__m128d const *)(s - 2 * p)));
    p256_0 = _mm256_castpd_si256(_mm256_broadcast_pd(
                                (__m128d const *)(s - 1 * p)));
    q256_0 = _mm256_castpd_si256(_mm256_broadcast_pd(
                                (__m128d const *)(s - 0 * p)));
    q256_1 = _mm256_castpd_si256(_mm256_broadcast_pd(
                                (__m128d const *)(s + 1 * p)));
    q256_2 = _mm256_castpd_si256(_mm256_broadcast_pd(
                                (__m128d const *)(s + 2 * p)));
    q256_3 = _mm256_castpd_si256(_mm256_broadcast_pd(
                                (__m128d const *)(s + 3 * p)));
    q256_4 = _mm256_castpd_si256(_mm256_broadcast_pd(
                                (__m128d const *)(s + 4 * p)));

    p4 = _mm256_castsi256_si128(p256_4);
    p3 = _mm256_castsi256_si128(p256_3);
    p2 = _mm256_castsi256_si128(p256_2);
    p1 = _mm256_castsi256_si128(p256_1);
    p0 = _mm256_castsi256_si128(p256_0);
    q0 = _mm256_castsi256_si128(q256_0);
    q1 = _mm256_castsi256_si128(q256_1);
    q2 = _mm256_castsi256_si128(q256_2);
    q3 = _mm256_castsi256_si128(q256_3);
    q4 = _mm256_castsi256_si128(q256_4);

    {
        const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
                _mm_subs_epu8(p0, p1));
        const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
                _mm_subs_epu8(q0, q1));
        const __m128i fe = _mm_set1_epi8(0xfe);
        const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
        __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
                _mm_subs_epu8(q0, p0));
        __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
                _mm_subs_epu8(q1, p1));
        __m128i work;
        flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
        hev = _mm_subs_epu8(flat, thresh);
        hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

        abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
        abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
        mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
        mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
        // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
        mask = _mm_max_epu8(flat, mask);
        // mask |= (abs(p1 - p0) > limit) * -1;
        // mask |= (abs(q1 - q0) > limit) * -1;
        work = _mm_max_epu8(
                _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
                _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
        mask = _mm_max_epu8(work, mask);
        work = _mm_max_epu8(
                _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
                _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
        mask = _mm_max_epu8(work, mask);
        mask = _mm_subs_epu8(mask, limit);
        mask = _mm_cmpeq_epi8(mask, zero);
    }

    // lp filter
    {
        const __m128i t4 = _mm_set1_epi8(4);
        const __m128i t3 = _mm_set1_epi8(3);
        const __m128i t80 = _mm_set1_epi8(0x80);
        const __m128i te0 = _mm_set1_epi8(0xe0);
        const __m128i t1f = _mm_set1_epi8(0x1f);
        const __m128i t1 = _mm_set1_epi8(0x1);
        const __m128i t7f = _mm_set1_epi8(0x7f);

        __m128i ps1 = _mm_xor_si128(p1, t80);
        __m128i ps0 = _mm_xor_si128(p0, t80);
        __m128i qs0 = _mm_xor_si128(q0, t80);
        __m128i qs1 = _mm_xor_si128(q1, t80);
        __m128i filt;
        __m128i work_a;
        __m128i filter1, filter2;
        __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1,
                flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4,
                flat2_q5, flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1,
                flat_q2;

        filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
        work_a = _mm_subs_epi8(qs0, ps0);
        filt = _mm_adds_epi8(filt, work_a);
        filt = _mm_adds_epi8(filt, work_a);
        filt = _mm_adds_epi8(filt, work_a);
        /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
        filt = _mm_and_si128(filt, mask);

        filter1 = _mm_adds_epi8(filt, t4);
        filter2 = _mm_adds_epi8(filt, t3);

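        /* Signed 8-bit >> 3 is emulated below: shift the 16-bit lanes
         * logically, keep only the low 5 valid bits (t1f), and OR the sign
         * bits back in (te0) where cmpgt marked the byte negative; the
         * later >> 1 uses t7f/t80 the same way. */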
        /* Filter1 >> 3 */
        work_a = _mm_cmpgt_epi8(zero, filter1);
        filter1 = _mm_srli_epi16(filter1, 3);
        work_a = _mm_and_si128(work_a, te0);
        filter1 = _mm_and_si128(filter1, t1f);
        filter1 = _mm_or_si128(filter1, work_a);
        qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);

        /* Filter2 >> 3 */
        work_a = _mm_cmpgt_epi8(zero, filter2);
        filter2 = _mm_srli_epi16(filter2, 3);
        work_a = _mm_and_si128(work_a, te0);
        filter2 = _mm_and_si128(filter2, t1f);
        filter2 = _mm_or_si128(filter2, work_a);
        ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);

        /* filt >> 1 */
        filt = _mm_adds_epi8(filter1, t1);
        work_a = _mm_cmpgt_epi8(zero, filt);
        filt = _mm_srli_epi16(filt, 1);
        work_a = _mm_and_si128(work_a, t80);
        filt = _mm_and_si128(filt, t7f);
        filt = _mm_or_si128(filt, work_a);
        filt = _mm_andnot_si128(hev, filt);
        ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
        qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
        // loopfilter done

        {
            __m128i work;
            work = _mm_max_epu8(
                    _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
                    _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
            flat = _mm_max_epu8(work, flat);
            work = _mm_max_epu8(
                    _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
                    _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
            flat = _mm_max_epu8(work, flat);
            work = _mm_max_epu8(
                    _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)),
                    _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4)));
            flat = _mm_subs_epu8(flat, one);
            flat = _mm_cmpeq_epi8(flat, zero);
            flat = _mm_and_si128(flat, mask);

            p256_5 = _mm256_castpd_si256(_mm256_broadcast_pd(
                                        (__m128d const *)(s - 6 * p)));
            q256_5 = _mm256_castpd_si256(_mm256_broadcast_pd(
                                        (__m128d const *)(s + 5 * p)));
            p5 = _mm256_castsi256_si128(p256_5);
            q5 = _mm256_castsi256_si128(q256_5);
            flat2 = _mm_max_epu8(
                    _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)),
                    _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5)));

            flat2 = _mm_max_epu8(work, flat2);
            p256_6 = _mm256_castpd_si256(_mm256_broadcast_pd(
                                        (__m128d const *)(s - 7 * p)));
            q256_6 = _mm256_castpd_si256(_mm256_broadcast_pd(
                                        (__m128d const *)(s + 6 * p)));
            p6 = _mm256_castsi256_si128(p256_6);
            q6 = _mm256_castsi256_si128(q256_6);
            work = _mm_max_epu8(
                    _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)),
                    _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6)));

            flat2 = _mm_max_epu8(work, flat2);

            p256_7 = _mm256_castpd_si256(_mm256_broadcast_pd(
                                        (__m128d const *)(s - 8 * p)));
            q256_7 = _mm256_castpd_si256(_mm256_broadcast_pd(
                                        (__m128d const *)(s + 7 * p)));
            p7 = _mm256_castsi256_si128(p256_7);
            q7 = _mm256_castsi256_si128(q256_7);
            work = _mm_max_epu8(
                    _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)),
                    _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7)));

            flat2 = _mm_max_epu8(work, flat2);
            flat2 = _mm_subs_epu8(flat2, one);
            flat2 = _mm_cmpeq_epi8(flat2, zero);
            flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
        }

        // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        // flat and wide flat calculations
        {
            const __m256i eight = _mm256_set1_epi16(8);
            const __m256i four = _mm256_set1_epi16(4);
            __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0,
                    pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p,
                    res_q;

            const __m256i filter = _mm256_load_si256(
                                  (__m256i const *)filt_loopfilter_avx2);
            p256_7 = _mm256_shuffle_epi8(p256_7, filter);
            p256_6 = _mm256_shuffle_epi8(p256_6, filter);
            p256_5 = _mm256_shuffle_epi8(p256_5, filter);
            p256_4 = _mm256_shuffle_epi8(p256_4, filter);
            p256_3 = _mm256_shuffle_epi8(p256_3, filter);
            p256_2 = _mm256_shuffle_epi8(p256_2, filter);
            p256_1 = _mm256_shuffle_epi8(p256_1, filter);
            p256_0 = _mm256_shuffle_epi8(p256_0, filter);
            q256_0 = _mm256_shuffle_epi8(q256_0, filter);
            q256_1 = _mm256_shuffle_epi8(q256_1, filter);
            q256_2 = _mm256_shuffle_epi8(q256_2, filter);
            q256_3 = _mm256_shuffle_epi8(q256_3, filter);
            q256_4 = _mm256_shuffle_epi8(q256_4, filter);
            q256_5 = _mm256_shuffle_epi8(q256_5, filter);
            q256_6 = _mm256_shuffle_epi8(q256_6, filter);
            q256_7 = _mm256_shuffle_epi8(q256_7, filter);
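            /* Every row is now 16 zero-extended 16-bit words per ymm register,
             * so the 7-tap and 15-tap sums below cover all 16 pixels at once. */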

            pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5),
                    _mm256_add_epi16(p256_4, p256_3));
            pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5),
                    _mm256_add_epi16(q256_4, q256_3));

            pixetFilter_p2p1p0 = _mm256_add_epi16(p256_0,
                    _mm256_add_epi16(p256_2, p256_1));
            pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);

            pixetFilter_q2q1q0 = _mm256_add_epi16(q256_0,
                    _mm256_add_epi16(q256_2, q256_1));
            pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);

            pixelFilter_p = _mm256_add_epi16(eight,
                    _mm256_add_epi16(pixelFilter_p, pixelFilter_q));

            pixetFilter_p2p1p0 = _mm256_add_epi16(four,
                    _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));

            res_p = _mm256_srli_epi16(
                    _mm256_add_epi16(pixelFilter_p,
                            _mm256_add_epi16(p256_7, p256_0)), 4);

            flat2_p0 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
                            168));
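            /* _mm256_packus_epi16(res, res) leaves the 16 packed bytes in
             * qwords 0 and 2; _mm256_permute4x64_epi64 with 168 (0b10101000)
             * gathers them into the low 128 bits before the cast to __m128i. */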

            res_q = _mm256_srli_epi16(
                    _mm256_add_epi16(pixelFilter_p,
                            _mm256_add_epi16(q256_7, q256_0)), 4);

            flat2_q0 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
                            168));

            res_p = _mm256_srli_epi16(
                    _mm256_add_epi16(pixetFilter_p2p1p0,
                            _mm256_add_epi16(p256_3, p256_0)), 3);

            flat_p0 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
                            168));

            res_q = _mm256_srli_epi16(
                    _mm256_add_epi16(pixetFilter_p2p1p0,
                            _mm256_add_epi16(q256_3, q256_0)), 3);

            flat_q0 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
                            168));

            sum_p7 = _mm256_add_epi16(p256_7, p256_7);

            sum_q7 = _mm256_add_epi16(q256_7, q256_7);

            sum_p3 = _mm256_add_epi16(p256_3, p256_3);

            sum_q3 = _mm256_add_epi16(q256_3, q256_3);

            pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6);

            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6);

            res_p = _mm256_srli_epi16(
                    _mm256_add_epi16(pixelFilter_p,
                            _mm256_add_epi16(sum_p7, p256_1)), 4);

            flat2_p1 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
                            168));

            res_q = _mm256_srli_epi16(
                    _mm256_add_epi16(pixelFilter_q,
                            _mm256_add_epi16(sum_q7, q256_1)), 4);

            flat2_q1 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
                            168));

            pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2);

            pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2);

            res_p = _mm256_srli_epi16(
                    _mm256_add_epi16(pixetFilter_p2p1p0,
                            _mm256_add_epi16(sum_p3, p256_1)), 3);

            flat_p1 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
                            168));

            res_q = _mm256_srli_epi16(
                    _mm256_add_epi16(pixetFilter_q2q1q0,
                            _mm256_add_epi16(sum_q3, q256_1)), 3);

            flat_q1 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
                            168));

            sum_p7 = _mm256_add_epi16(sum_p7, p256_7);

            sum_q7 = _mm256_add_epi16(sum_q7, q256_7);

            sum_p3 = _mm256_add_epi16(sum_p3, p256_3);

            sum_q3 = _mm256_add_epi16(sum_q3, q256_3);

            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5);

            pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5);

            res_p = _mm256_srli_epi16(
                    _mm256_add_epi16(pixelFilter_p,
                            _mm256_add_epi16(sum_p7, p256_2)), 4);

            flat2_p2 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
                            168));

            res_q = _mm256_srli_epi16(
                    _mm256_add_epi16(pixelFilter_q,
                            _mm256_add_epi16(sum_q7, q256_2)), 4);

            flat2_q2 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
                            168));

            pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1);

            pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1);

            res_p = _mm256_srli_epi16(
                    _mm256_add_epi16(pixetFilter_p2p1p0,
                            _mm256_add_epi16(sum_p3, p256_2)), 3);

            flat_p2 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
                            168));

            res_q = _mm256_srli_epi16(
                    _mm256_add_epi16(pixetFilter_q2q1q0,
                            _mm256_add_epi16(sum_q3, q256_2)), 3);

            flat_q2 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
                            168));

            sum_p7 = _mm256_add_epi16(sum_p7, p256_7);

            sum_q7 = _mm256_add_epi16(sum_q7, q256_7);

            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4);

            pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4);

            res_p = _mm256_srli_epi16(
                    _mm256_add_epi16(pixelFilter_p,
                            _mm256_add_epi16(sum_p7, p256_3)), 4);

            flat2_p3 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
                            168));

            res_q = _mm256_srli_epi16(
                    _mm256_add_epi16(pixelFilter_q,
                            _mm256_add_epi16(sum_q7, q256_3)), 4);

            flat2_q3 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
                            168));

            sum_p7 = _mm256_add_epi16(sum_p7, p256_7);

            sum_q7 = _mm256_add_epi16(sum_q7, q256_7);

            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3);

            pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3);

            res_p = _mm256_srli_epi16(
                    _mm256_add_epi16(pixelFilter_p,
                            _mm256_add_epi16(sum_p7, p256_4)), 4);

            flat2_p4 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
                            168));

            res_q = _mm256_srli_epi16(
                    _mm256_add_epi16(pixelFilter_q,
                            _mm256_add_epi16(sum_q7, q256_4)), 4);

            flat2_q4 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
                            168));

            sum_p7 = _mm256_add_epi16(sum_p7, p256_7);

            sum_q7 = _mm256_add_epi16(sum_q7, q256_7);

            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2);

            pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2);

            res_p = _mm256_srli_epi16(
                    _mm256_add_epi16(pixelFilter_p,
                            _mm256_add_epi16(sum_p7, p256_5)), 4);

            flat2_p5 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
                            168));

            res_q = _mm256_srli_epi16(
                    _mm256_add_epi16(pixelFilter_q,
                            _mm256_add_epi16(sum_q7, q256_5)), 4);

            flat2_q5 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
                            168));

            sum_p7 = _mm256_add_epi16(sum_p7, p256_7);

            sum_q7 = _mm256_add_epi16(sum_q7, q256_7);

            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1);

            pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1);

            res_p = _mm256_srli_epi16(
                    _mm256_add_epi16(pixelFilter_p,
                            _mm256_add_epi16(sum_p7, p256_6)), 4);

            flat2_p6 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
                            168));

            res_q = _mm256_srli_epi16(
                    _mm256_add_epi16(pixelFilter_q,
                            _mm256_add_epi16(sum_q7, q256_6)), 4);

            flat2_q6 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
                            168));
        }

        // wide flat
        // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

        p2 = _mm_andnot_si128(flat, p2);
        flat_p2 = _mm_and_si128(flat, flat_p2);
        p2 = _mm_or_si128(flat_p2, p2);

        p1 = _mm_andnot_si128(flat, ps1);
        flat_p1 = _mm_and_si128(flat, flat_p1);
        p1 = _mm_or_si128(flat_p1, p1);

        p0 = _mm_andnot_si128(flat, ps0);
        flat_p0 = _mm_and_si128(flat, flat_p0);
        p0 = _mm_or_si128(flat_p0, p0);

        q0 = _mm_andnot_si128(flat, qs0);
        flat_q0 = _mm_and_si128(flat, flat_q0);
        q0 = _mm_or_si128(flat_q0, q0);

        q1 = _mm_andnot_si128(flat, qs1);
        flat_q1 = _mm_and_si128(flat, flat_q1);
        q1 = _mm_or_si128(flat_q1, q1);

        q2 = _mm_andnot_si128(flat, q2);
        flat_q2 = _mm_and_si128(flat, flat_q2);
        q2 = _mm_or_si128(flat_q2, q2);

        p6 = _mm_andnot_si128(flat2, p6);
        flat2_p6 = _mm_and_si128(flat2, flat2_p6);
        p6 = _mm_or_si128(flat2_p6, p6);
        _mm_storeu_si128((__m128i *) (s - 7 * p), p6);

        p5 = _mm_andnot_si128(flat2, p5);
        flat2_p5 = _mm_and_si128(flat2, flat2_p5);
        p5 = _mm_or_si128(flat2_p5, p5);
        _mm_storeu_si128((__m128i *) (s - 6 * p), p5);

        p4 = _mm_andnot_si128(flat2, p4);
        flat2_p4 = _mm_and_si128(flat2, flat2_p4);
        p4 = _mm_or_si128(flat2_p4, p4);
        _mm_storeu_si128((__m128i *) (s - 5 * p), p4);

        p3 = _mm_andnot_si128(flat2, p3);
        flat2_p3 = _mm_and_si128(flat2, flat2_p3);
        p3 = _mm_or_si128(flat2_p3, p3);
        _mm_storeu_si128((__m128i *) (s - 4 * p), p3);

        p2 = _mm_andnot_si128(flat2, p2);
        flat2_p2 = _mm_and_si128(flat2, flat2_p2);
        p2 = _mm_or_si128(flat2_p2, p2);
        _mm_storeu_si128((__m128i *) (s - 3 * p), p2);

        p1 = _mm_andnot_si128(flat2, p1);
        flat2_p1 = _mm_and_si128(flat2, flat2_p1);
        p1 = _mm_or_si128(flat2_p1, p1);
        _mm_storeu_si128((__m128i *) (s - 2 * p), p1);

        p0 = _mm_andnot_si128(flat2, p0);
        flat2_p0 = _mm_and_si128(flat2, flat2_p0);
        p0 = _mm_or_si128(flat2_p0, p0);
        _mm_storeu_si128((__m128i *) (s - 1 * p), p0);

        q0 = _mm_andnot_si128(flat2, q0);
        flat2_q0 = _mm_and_si128(flat2, flat2_q0);
        q0 = _mm_or_si128(flat2_q0, q0);
        _mm_storeu_si128((__m128i *) (s - 0 * p), q0);

        q1 = _mm_andnot_si128(flat2, q1);
        flat2_q1 = _mm_and_si128(flat2, flat2_q1);
        q1 = _mm_or_si128(flat2_q1, q1);
        _mm_storeu_si128((__m128i *) (s + 1 * p), q1);

        q2 = _mm_andnot_si128(flat2, q2);
        flat2_q2 = _mm_and_si128(flat2, flat2_q2);
        q2 = _mm_or_si128(flat2_q2, q2);
        _mm_storeu_si128((__m128i *) (s + 2 * p), q2);

        q3 = _mm_andnot_si128(flat2, q3);
        flat2_q3 = _mm_and_si128(flat2, flat2_q3);
        q3 = _mm_or_si128(flat2_q3, q3);
        _mm_storeu_si128((__m128i *) (s + 3 * p), q3);

        q4 = _mm_andnot_si128(flat2, q4);
        flat2_q4 = _mm_and_si128(flat2, flat2_q4);
        q4 = _mm_or_si128(flat2_q4, q4);
        _mm_storeu_si128((__m128i *) (s + 4 * p), q4);

        q5 = _mm_andnot_si128(flat2, q5);
        flat2_q5 = _mm_and_si128(flat2, flat2_q5);
        q5 = _mm_or_si128(flat2_q5, q5);
        _mm_storeu_si128((__m128i *) (s + 5 * p), q5);

        q6 = _mm_andnot_si128(flat2, q6);
        flat2_q6 = _mm_and_si128(flat2, flat2_q6);
        q6 = _mm_or_si128(flat2_q6, q6);
        _mm_storeu_si128((__m128i *) (s + 6 * p), q6);
    }
}

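/*
 * Public entry point: count == 1 filters an 8-pixel-wide edge, anything
 * else takes the 16-pixel path.
 */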
void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p,
        const unsigned char *_blimit, const unsigned char *_limit,
        const unsigned char *_thresh, int count) {
    if (count == 1)
        mb_lpf_horizontal_edge_w_avx2_8(s, p, _blimit, _limit, _thresh);
    else
        mb_lpf_horizontal_edge_w_avx2_16(s, p, _blimit, _limit, _thresh);
}