1 /*
2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <emmintrin.h>  // SSE2
12 #include "vp9/common/vp9_loopfilter.h"
13 #include "vpx_ports/emmintrin_compat.h"
14 
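// 8-pixel-wide variant of the wide (flat2) horizontal loop filter; called by
// vp9_lpf_horizontal_16_sse2() below when count == 1.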
15 static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s,
16                                             int p,
17                                             const unsigned char *_blimit,
18                                             const unsigned char *_limit,
19                                             const unsigned char *_thresh) {
20   const __m128i zero = _mm_set1_epi16(0);
21   const __m128i one = _mm_set1_epi8(1);
22   const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
23   const __m128i limit = _mm_load_si128((const __m128i *)_limit);
24   const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
25   __m128i mask, hev, flat, flat2;
26   __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
27   __m128i abs_p1p0;
28 
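  // Each register pairs a p row (low 64 bits) with the matching q row
  // (high 64 bits), e.g. q4p4 = { p4[0..7], q4[0..7] }; both sides of the
  // edge are then filtered by the same instructions.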
29   q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
30   q4p4 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q4p4),
31                                        (__m64 *)(s + 4 * p)));
32   q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
33   q3p3 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q3p3),
34                                        (__m64 *)(s + 3 * p)));
35   q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
36   q2p2 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q2p2),
37                                        (__m64 *)(s + 2 * p)));
38   q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
39   q1p1 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q1p1),
40                                        (__m64 *)(s + 1 * p)));
41   p1q1 = _mm_shuffle_epi32(q1p1, 78);
42   q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
43   q0p0 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q0p0),
44                                        (__m64 *)(s - 0 * p)));
45   p0q0 = _mm_shuffle_epi32(q0p0, 78);
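  // _mm_shuffle_epi32(x, 78) swaps the 64-bit halves: p1q1/p0q0 hold the
  // q row in the low half and the p row in the high half.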
46 
47   {
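    // filter_mask and hev_mask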
48     __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
49     abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0),
50                             _mm_subs_epu8(q0p0, q1p1));
51     abs_q1q0 =  _mm_srli_si128(abs_p1p0, 8);
52     fe = _mm_set1_epi8(0xfe);
53     ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
54     abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0),
55                             _mm_subs_epu8(p0q0, q0p0));
56     abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1),
57                             _mm_subs_epu8(p1q1, q1p1));
58     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
59     hev = _mm_subs_epu8(flat, thresh);
60     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
61 
62     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
63     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
64     mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
65     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
66     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
67     mask = _mm_max_epu8(abs_p1p0, mask);
68     // mask |= (abs(p1 - p0) > limit) * -1;
69     // mask |= (abs(q1 - q0) > limit) * -1;
70 
71     work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q1p1),
72                                      _mm_subs_epu8(q1p1, q2p2)),
73                         _mm_or_si128(_mm_subs_epu8(q3p3, q2p2),
74                                      _mm_subs_epu8(q2p2, q3p3)));
75     mask = _mm_max_epu8(work, mask);
76     mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
77     mask = _mm_subs_epu8(mask, limit);
78     mask = _mm_cmpeq_epi8(mask, zero);
79   }
80 
81   // lp filter
82   {
83     const __m128i t4 = _mm_set1_epi8(4);
84     const __m128i t3 = _mm_set1_epi8(3);
85     const __m128i t80 = _mm_set1_epi8(0x80);
86     const __m128i t1 = _mm_set1_epi16(0x1);
87     __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
88     __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
89     __m128i qs0 = _mm_xor_si128(p0q0, t80);
90     __m128i qs1 = _mm_xor_si128(p1q1, t80);
91     __m128i filt;
92     __m128i work_a;
93     __m128i filter1, filter2;
94     __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
95     __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
96 
97     filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
98     work_a = _mm_subs_epi8(qs0, qs0ps0);
99     filt = _mm_adds_epi8(filt, work_a);
100     filt = _mm_adds_epi8(filt, work_a);
101     filt = _mm_adds_epi8(filt, work_a);
102     // (vp9_filter + 3 * (qs0 - ps0)) & mask
103     filt = _mm_and_si128(filt, mask);
104 
105     filter1 = _mm_adds_epi8(filt, t4);
106     filter2 = _mm_adds_epi8(filt, t3);
107 
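    // Arithmetic >> 3 on the signed bytes: move each byte into the high half
    // of a 16-bit lane, shift right by 11, then pack back down.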
108     filter1 = _mm_unpacklo_epi8(zero, filter1);
109     filter1 = _mm_srai_epi16(filter1, 0xB);
110     filter2 = _mm_unpacklo_epi8(zero, filter2);
111     filter2 = _mm_srai_epi16(filter2, 0xB);
112 
113     // Pack {Filter2, -Filter1} (both already >> 3) so one saturating add
    // applies +Filter2 to ps0 (low half) and -Filter1 to qs0 (high half).
114     filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
115     qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);
116 
117     // filt >> 1
118     filt = _mm_adds_epi16(filter1, t1);
119     filt = _mm_srai_epi16(filt, 1);
120     filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
121                             filt);
122     filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
123     qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
124     // loopfilter done
125 
126     {
127       __m128i work;
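      // flat_mask4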
128       flat = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q0p0),
129                                        _mm_subs_epu8(q0p0, q2p2)),
130                           _mm_or_si128(_mm_subs_epu8(q3p3, q0p0),
131                                        _mm_subs_epu8(q0p0, q3p3)));
132       flat = _mm_max_epu8(abs_p1p0, flat);
133       flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
134       flat = _mm_subs_epu8(flat, one);
135       flat = _mm_cmpeq_epi8(flat, zero);
136       flat = _mm_and_si128(flat, mask);
137 
138       q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
139       q5p5 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q5p5),
140                                            (__m64 *)(s + 5 * p)));
141 
142       q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
143       q6p6 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q6p6),
144                                            (__m64 *)(s + 6 * p)));
145 
146       flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q4p4, q0p0),
147                                         _mm_subs_epu8(q0p0, q4p4)),
148                            _mm_or_si128(_mm_subs_epu8(q5p5, q0p0),
149                                         _mm_subs_epu8(q0p0, q5p5)));
150 
151       q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
152       q7p7 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q7p7),
153                                            (__m64 *)(s + 7 * p)));
154 
155       work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q6p6, q0p0),
156                                        _mm_subs_epu8(q0p0, q6p6)),
157                           _mm_or_si128(_mm_subs_epu8(q7p7, q0p0),
158                                        _mm_subs_epu8(q0p0, q7p7)));
159 
160       flat2 = _mm_max_epu8(work, flat2);
161       flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
162       flat2 = _mm_subs_epu8(flat2, one);
163       flat2 = _mm_cmpeq_epi8(flat2, zero);
164       flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
165     }
166 
167     // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
168     // flat and wide flat calculations
169     {
170       const __m128i eight = _mm_set1_epi16(8);
171       const __m128i four = _mm_set1_epi16(4);
172       __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
173       __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
174       __m128i pixelFilter_p, pixelFilter_q;
175       __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
176       __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
177 
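      // Widen to 16 bits and build running sums: pixelFilter_p/q accumulate
      // the taps of the wide (flat2) filter, pixetFilter_p2p1p0/_q2q1q0 the
      // taps of the narrower flat filter; each output below slides these
      // sums by one sample.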
178       p7_16 = _mm_unpacklo_epi8(q7p7, zero);
179       p6_16 = _mm_unpacklo_epi8(q6p6, zero);
180       p5_16 = _mm_unpacklo_epi8(q5p5, zero);
181       p4_16 = _mm_unpacklo_epi8(q4p4, zero);
182       p3_16 = _mm_unpacklo_epi8(q3p3, zero);
183       p2_16 = _mm_unpacklo_epi8(q2p2, zero);
184       p1_16 = _mm_unpacklo_epi8(q1p1, zero);
185       p0_16 = _mm_unpacklo_epi8(q0p0, zero);
186       q0_16 = _mm_unpackhi_epi8(q0p0, zero);
187       q1_16 = _mm_unpackhi_epi8(q1p1, zero);
188       q2_16 = _mm_unpackhi_epi8(q2p2, zero);
189       q3_16 = _mm_unpackhi_epi8(q3p3, zero);
190       q4_16 = _mm_unpackhi_epi8(q4p4, zero);
191       q5_16 = _mm_unpackhi_epi8(q5p5, zero);
192       q6_16 = _mm_unpackhi_epi8(q6p6, zero);
193       q7_16 = _mm_unpackhi_epi8(q7p7, zero);
194 
195       pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
196                                     _mm_add_epi16(p4_16, p3_16));
197       pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
198                                     _mm_add_epi16(q4_16, q3_16));
199 
200       pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
201       pixelFilter_p =  _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
202 
203       pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
204       pixelFilter_q =  _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
205       pixelFilter_p =  _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p,
206                                                          pixelFilter_q));
207       pixetFilter_p2p1p0 =   _mm_add_epi16(four,
208                                            _mm_add_epi16(pixetFilter_p2p1p0,
209                                                          pixetFilter_q2q1q0));
210       res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
211                                            _mm_add_epi16(p7_16, p0_16)), 4);
212       res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
213                                            _mm_add_epi16(q7_16, q0_16)), 4);
214       flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
215       res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
216                                            _mm_add_epi16(p3_16, p0_16)), 3);
217       res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
218                                            _mm_add_epi16(q3_16, q0_16)), 3);
219 
220       flat_q0p0 = _mm_packus_epi16(res_p, res_q);
221 
222       sum_p7 = _mm_add_epi16(p7_16, p7_16);
223       sum_q7 = _mm_add_epi16(q7_16, q7_16);
224       sum_p3 = _mm_add_epi16(p3_16, p3_16);
225       sum_q3 = _mm_add_epi16(q3_16, q3_16);
226 
227       pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
228       pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
229       res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
230                              _mm_add_epi16(sum_p7, p1_16)), 4);
231       res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
232                              _mm_add_epi16(sum_q7, q1_16)), 4);
233       flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
234 
235       pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
236       pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
237       res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
238                              _mm_add_epi16(sum_p3, p1_16)), 3);
239       res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
240                              _mm_add_epi16(sum_q3, q1_16)), 3);
241       flat_q1p1 = _mm_packus_epi16(res_p, res_q);
242 
243       sum_p7 = _mm_add_epi16(sum_p7, p7_16);
244       sum_q7 = _mm_add_epi16(sum_q7, q7_16);
245       sum_p3 = _mm_add_epi16(sum_p3, p3_16);
246       sum_q3 = _mm_add_epi16(sum_q3, q3_16);
247 
248       pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
249       pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
250       res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
251                              _mm_add_epi16(sum_p7, p2_16)), 4);
252       res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
253                              _mm_add_epi16(sum_q7, q2_16)), 4);
254       flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
255 
256       pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
257       pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
258 
259       res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
260                                            _mm_add_epi16(sum_p3, p2_16)), 3);
261       res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
262                                            _mm_add_epi16(sum_q3, q2_16)), 3);
263       flat_q2p2 = _mm_packus_epi16(res_p, res_q);
264 
265       sum_p7 = _mm_add_epi16(sum_p7, p7_16);
266       sum_q7 = _mm_add_epi16(sum_q7, q7_16);
267       pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
268       pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
269       res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
270                              _mm_add_epi16(sum_p7, p3_16)), 4);
271       res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
272                              _mm_add_epi16(sum_q7, q3_16)), 4);
273       flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
274 
275       sum_p7 = _mm_add_epi16(sum_p7, p7_16);
276       sum_q7 = _mm_add_epi16(sum_q7, q7_16);
277       pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
278       pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
279       res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
280                              _mm_add_epi16(sum_p7, p4_16)), 4);
281       res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
282                              _mm_add_epi16(sum_q7, q4_16)), 4);
283       flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
284 
285       sum_p7 = _mm_add_epi16(sum_p7, p7_16);
286       sum_q7 = _mm_add_epi16(sum_q7, q7_16);
287       pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
288       pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
289       res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
290                              _mm_add_epi16(sum_p7, p5_16)), 4);
291       res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
292                              _mm_add_epi16(sum_q7, q5_16)), 4);
293       flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
294 
295       sum_p7 = _mm_add_epi16(sum_p7, p7_16);
296       sum_q7 = _mm_add_epi16(sum_q7, q7_16);
297       pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
298       pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
299       res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
300                              _mm_add_epi16(sum_p7, p6_16)), 4);
301       res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
302                              _mm_add_epi16(sum_q7, q6_16)), 4);
303       flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
304     }
305     // wide flat
306     // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
307 
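    // 68 == 0b01000100: broadcast the low 64 bits of each mask to both
    // halves so it applies to the p half and the q half of the packed
    // registers.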
308     flat = _mm_shuffle_epi32(flat, 68);
309     flat2 = _mm_shuffle_epi32(flat2, 68);
310 
311     q2p2 = _mm_andnot_si128(flat, q2p2);
312     flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
313     q2p2 = _mm_or_si128(q2p2, flat_q2p2);
314 
315     qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
316     flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
317     q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
318 
319     qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
320     flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
321     q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
322 
323     q6p6 = _mm_andnot_si128(flat2, q6p6);
324     flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
325     q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
326     _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
327     _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));
328 
329     q5p5 = _mm_andnot_si128(flat2, q5p5);
330     flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
331     q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
332     _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
333     _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));
334 
335     q4p4 = _mm_andnot_si128(flat2, q4p4);
336     flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
337     q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
338     _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
339     _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));
340 
341     q3p3 = _mm_andnot_si128(flat2, q3p3);
342     flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
343     q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
344     _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
345     _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));
346 
347     q2p2 = _mm_andnot_si128(flat2, q2p2);
348     flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
349     q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
350     _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
351     _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));
352 
353     q1p1 = _mm_andnot_si128(flat2, q1p1);
354     flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
355     q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
356     _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
357     _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));
358 
359     q0p0 = _mm_andnot_si128(flat2, q0p0);
360     flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
361     q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
362     _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
363     _mm_storeh_pi((__m64 *)(s - 0 * p),  _mm_castsi128_ps(q0p0));
364   }
365 }
366 
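// 16-pixel-wide variant of the same filter; the rows are staged in aligned
// scratch buffers and the arithmetic is done in two 8-pixel halves.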
367 static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
368                                              int p,
369                                              const unsigned char *_blimit,
370                                              const unsigned char *_limit,
371                                              const unsigned char *_thresh) {
372   DECLARE_ALIGNED_ARRAY(16, unsigned char, flat2_op, 7 * 16);
373   DECLARE_ALIGNED_ARRAY(16, unsigned char, flat2_oq, 7 * 16);
374 
375   DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op, 3 * 16);
376   DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq, 3 * 16);
377 
378   DECLARE_ALIGNED_ARRAY(16, unsigned char, ap, 8 * 16);
379   DECLARE_ALIGNED_ARRAY(16, unsigned char, aq, 8 * 16);
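  // ap/aq cache the eight rows above/below the edge; flat_op/flat_oq and
  // flat2_op/flat2_oq receive the flat and wide filter outputs before they
  // are blended back under the flat/flat2 masks.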
380 
381   const __m128i zero = _mm_set1_epi16(0);
382   const __m128i one = _mm_set1_epi8(1);
383   const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
384   const __m128i limit = _mm_load_si128((const __m128i *)_limit);
385   const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
386   __m128i mask, hev, flat, flat2;
387   __m128i p7, p6, p5;
388   __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
389   __m128i q5, q6, q7;
390   int i = 0;
391 
392   p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
393   p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
394   p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
395   p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
396   p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
397   q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
398   q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
399   q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
400   q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
401   q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
402 
403   _mm_store_si128((__m128i *)&ap[4 * 16], p4);
404   _mm_store_si128((__m128i *)&ap[3 * 16], p3);
405   _mm_store_si128((__m128i *)&ap[2 * 16], p2);
406   _mm_store_si128((__m128i *)&ap[1 * 16], p1);
407   _mm_store_si128((__m128i *)&ap[0 * 16], p0);
408   _mm_store_si128((__m128i *)&aq[4 * 16], q4);
409   _mm_store_si128((__m128i *)&aq[3 * 16], q3);
410   _mm_store_si128((__m128i *)&aq[2 * 16], q2);
411   _mm_store_si128((__m128i *)&aq[1 * 16], q1);
412   _mm_store_si128((__m128i *)&aq[0 * 16], q0);
413 
414 
415   {
416     const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
417                                           _mm_subs_epu8(p0, p1));
418     const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
419                                           _mm_subs_epu8(q0, q1));
420     const __m128i fe = _mm_set1_epi8(0xfe);
421     const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
422     __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
423                                     _mm_subs_epu8(q0, p0));
424     __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
425                                     _mm_subs_epu8(q1, p1));
426     __m128i work;
427     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
428     hev = _mm_subs_epu8(flat, thresh);
429     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
430 
431     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
432     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
433     mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
434     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
435     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
436     mask = _mm_max_epu8(flat, mask);
437     // mask |= (abs(p1 - p0) > limit) * -1;
438     // mask |= (abs(q1 - q0) > limit) * -1;
439     work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
440                                      _mm_subs_epu8(p1, p2)),
441                          _mm_or_si128(_mm_subs_epu8(p3, p2),
442                                       _mm_subs_epu8(p2, p3)));
443     mask = _mm_max_epu8(work, mask);
444     work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
445                                      _mm_subs_epu8(q1, q2)),
446                          _mm_or_si128(_mm_subs_epu8(q3, q2),
447                                       _mm_subs_epu8(q2, q3)));
448     mask = _mm_max_epu8(work, mask);
449     mask = _mm_subs_epu8(mask, limit);
450     mask = _mm_cmpeq_epi8(mask, zero);
451   }
452 
453   // lp filter
454   {
455     const __m128i t4 = _mm_set1_epi8(4);
456     const __m128i t3 = _mm_set1_epi8(3);
457     const __m128i t80 = _mm_set1_epi8(0x80);
458     const __m128i te0 = _mm_set1_epi8(0xe0);
459     const __m128i t1f = _mm_set1_epi8(0x1f);
460     const __m128i t1 = _mm_set1_epi8(0x1);
461     const __m128i t7f = _mm_set1_epi8(0x7f);
462 
463     __m128i ps1 = _mm_xor_si128(p1, t80);
464     __m128i ps0 = _mm_xor_si128(p0, t80);
465     __m128i qs0 = _mm_xor_si128(q0, t80);
466     __m128i qs1 = _mm_xor_si128(q1, t80);
467     __m128i filt;
468     __m128i work_a;
469     __m128i filter1, filter2;
470 
471     filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
472     work_a = _mm_subs_epi8(qs0, ps0);
473     filt = _mm_adds_epi8(filt, work_a);
474     filt = _mm_adds_epi8(filt, work_a);
475     filt = _mm_adds_epi8(filt, work_a);
476     // (vp9_filter + 3 * (qs0 - ps0)) & mask
477     filt = _mm_and_si128(filt, mask);
478 
479     filter1 = _mm_adds_epi8(filt, t4);
480     filter2 = _mm_adds_epi8(filt, t3);
481 
482     // Filter1 >> 3
483     work_a = _mm_cmpgt_epi8(zero, filter1);
484     filter1 = _mm_srli_epi16(filter1, 3);
485     work_a = _mm_and_si128(work_a, te0);
486     filter1 = _mm_and_si128(filter1, t1f);
487     filter1 = _mm_or_si128(filter1, work_a);
488     qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
489 
490     // Filter2 >> 3
491     work_a = _mm_cmpgt_epi8(zero, filter2);
492     filter2 = _mm_srli_epi16(filter2, 3);
493     work_a = _mm_and_si128(work_a, te0);
494     filter2 = _mm_and_si128(filter2, t1f);
495     filter2 = _mm_or_si128(filter2, work_a);
496     ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
497 
498     // filt >> 1
499     filt = _mm_adds_epi8(filter1, t1);
500     work_a = _mm_cmpgt_epi8(zero, filt);
501     filt = _mm_srli_epi16(filt, 1);
502     work_a = _mm_and_si128(work_a, t80);
503     filt = _mm_and_si128(filt, t7f);
504     filt = _mm_or_si128(filt, work_a);
505     filt = _mm_andnot_si128(hev, filt);
506     ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
507     qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
508     // loopfilter done
509 
510     {
511       __m128i work;
512       work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
513                                        _mm_subs_epu8(p0, p2)),
514                            _mm_or_si128(_mm_subs_epu8(q2, q0),
515                                         _mm_subs_epu8(q0, q2)));
516       flat = _mm_max_epu8(work, flat);
517       work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
518                                        _mm_subs_epu8(p0, p3)),
519                            _mm_or_si128(_mm_subs_epu8(q3, q0),
520                                         _mm_subs_epu8(q0, q3)));
521       flat = _mm_max_epu8(work, flat);
522       work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),
523                                        _mm_subs_epu8(p0, p4)),
524                            _mm_or_si128(_mm_subs_epu8(q4, q0),
525                                         _mm_subs_epu8(q0, q4)));
526       flat = _mm_subs_epu8(flat, one);
527       flat = _mm_cmpeq_epi8(flat, zero);
528       flat = _mm_and_si128(flat, mask);
529 
530       p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
531       q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
532       flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p5, p0),
533                                        _mm_subs_epu8(p0, p5)),
534                            _mm_or_si128(_mm_subs_epu8(q5, q0),
535                                         _mm_subs_epu8(q0, q5)));
536       _mm_store_si128((__m128i *)&ap[5 * 16], p5);
537       _mm_store_si128((__m128i *)&aq[5 * 16], q5);
538       flat2 = _mm_max_epu8(work, flat2);
539       p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
540       q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
541       work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p6, p0),
542                                        _mm_subs_epu8(p0, p6)),
543                            _mm_or_si128(_mm_subs_epu8(q6, q0),
544                                         _mm_subs_epu8(q0, q6)));
545       _mm_store_si128((__m128i *)&ap[6 * 16], p6);
546       _mm_store_si128((__m128i *)&aq[6 * 16], q6);
547       flat2 = _mm_max_epu8(work, flat2);
548 
549       p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
550       q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));
551       work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p7, p0),
552                                        _mm_subs_epu8(p0, p7)),
553                            _mm_or_si128(_mm_subs_epu8(q7, q0),
554                                         _mm_subs_epu8(q0, q7)));
555       _mm_store_si128((__m128i *)&ap[7 * 16], p7);
556       _mm_store_si128((__m128i *)&aq[7 * 16], q7);
557       flat2 = _mm_max_epu8(work, flat2);
558       flat2 = _mm_subs_epu8(flat2, one);
559       flat2 = _mm_cmpeq_epi8(flat2, zero);
560       flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
561     }
562 
563     // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
564     // flat and wide flat calculations
565     {
566       const __m128i eight = _mm_set1_epi16(8);
567       const __m128i four = _mm_set1_epi16(4);
568       __m128i temp_flat2 = flat2;
569       unsigned char *src = s;
570       int i = 0;
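      // Process the 16-pixel edge as two 8-pixel halves, widened to 16 bits
      // for the filter arithmetic.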
571       do {
572         __m128i workp_shft;
573         __m128i a, b, c;
574 
575         unsigned int off = i * 8;
576         p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[7 * 16] + off)),
577                                zero);
578         p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[6 * 16] + off)),
579                                zero);
580         p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[5 * 16] + off)),
581                                zero);
582         p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[4 * 16] + off)),
583                                zero);
584         p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[3 * 16] + off)),
585                                zero);
586         p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[2 * 16] + off)),
587                                zero);
588         p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[1 * 16] + off)),
589                                zero);
590         p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[0 * 16] + off)),
591                                zero);
592         q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[0 * 16] + off)),
593                                zero);
594         q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[1 * 16] + off)),
595                                zero);
596         q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[2 * 16] + off)),
597                                zero);
598         q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[3 * 16] + off)),
599                                zero);
600         q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[4 * 16] + off)),
601                                zero);
602         q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[5 * 16] + off)),
603                                zero);
604         q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[6 * 16] + off)),
605                                zero);
606         q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[7 * 16] + off)),
607                                zero);
608 
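        // a: running sum shared by both filters; b: remaining taps plus the
        // +4 rounding term for the flat (filter8) outputs; c: remaining taps
        // plus the +8 rounding term for the wide (flat2) outputs.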
609         c = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7);  // p7 * 7
610         c = _mm_add_epi16(_mm_slli_epi16(p6, 1), _mm_add_epi16(p4, c));
611 
612         b = _mm_add_epi16(_mm_add_epi16(p3, four), _mm_add_epi16(p3, p2));
613         a = _mm_add_epi16(p3, _mm_add_epi16(p2, p1));
614         a = _mm_add_epi16(_mm_add_epi16(p0, q0), a);
615 
616         _mm_storel_epi64((__m128i *)&flat_op[2 * 16 + i * 8],
617                          _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
618                                           , b));
619 
620         c = _mm_add_epi16(_mm_add_epi16(p5, eight), c);
621         workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
622         _mm_storel_epi64((__m128i *)&flat2_op[6 * 16 + i * 8],
623                          _mm_packus_epi16(workp_shft, workp_shft));
624 
625         a = _mm_add_epi16(q1, a);
626         b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p2)), p1);
627         _mm_storel_epi64((__m128i *)&flat_op[1 * 16 + i * 8],
628                          _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
629                                           , b));
630 
631         c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p6)), p5);
632         workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
633         _mm_storel_epi64((__m128i *)&flat2_op[5 * 16 + i * 8],
634                          _mm_packus_epi16(workp_shft, workp_shft));
635 
636         a = _mm_add_epi16(q2, a);
637         b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p1)), p0);
638         _mm_storel_epi64((__m128i *)&flat_op[i * 8],
639                          _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
640                                           , b));
641 
642         c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p5)), p4);
643         workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
644         _mm_storel_epi64((__m128i *)&flat2_op[4 * 16 + i * 8],
645                          _mm_packus_epi16(workp_shft, workp_shft));
646 
647         a = _mm_add_epi16(q3, a);
648         b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p0)), q0);
649         _mm_storel_epi64((__m128i *)&flat_oq[i * 8],
650                          _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
651                                           , b));
652 
653         c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p4)), p3);
654         workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
655         _mm_storel_epi64((__m128i *)&flat2_op[3 * 16 + i * 8],
656                          _mm_packus_epi16(workp_shft, workp_shft));
657 
658         b = _mm_add_epi16(q3, b);
659         b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p2, q0)), q1);
660         _mm_storel_epi64((__m128i *)&flat_oq[16 + i * 8],
661                          _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
662                                           , b));
663 
664         c = _mm_add_epi16(q4, c);
665         c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p3)), p2);
666         workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
667         _mm_storel_epi64((__m128i *)&flat2_op[2 * 16 + i * 8],
668                          _mm_packus_epi16(workp_shft, workp_shft));
669 
670         b = _mm_add_epi16(q3, b);
671         b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p1, q1)), q2);
672         _mm_storel_epi64((__m128i *)&flat_oq[2 * 16 + i * 8],
673                          _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
674                                           , b));
675         a = _mm_add_epi16(q5, a);
676         c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p2)), p1);
677         workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
678         _mm_storel_epi64((__m128i *)&flat2_op[16 + i * 8],
679                          _mm_packus_epi16(workp_shft, workp_shft));
680 
681         a = _mm_add_epi16(q6, a);
682         c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p1)), p0);
683         workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
684         _mm_storel_epi64((__m128i *)&flat2_op[i * 8],
685                          _mm_packus_epi16(workp_shft, workp_shft));
686 
687         a = _mm_add_epi16(q7, a);
688         c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p0)), q0);
689         workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
690         _mm_storel_epi64((__m128i *)&flat2_oq[i * 8],
691                          _mm_packus_epi16(workp_shft, workp_shft));
692 
693         a = _mm_add_epi16(q7, a);
694         c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p6, q0)), q1);
695         workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
696         _mm_storel_epi64((__m128i *)&flat2_oq[16 + i * 8],
697                          _mm_packus_epi16(workp_shft, workp_shft));
698 
699         a = _mm_add_epi16(q7, a);
700         c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p5, q1)), q2);
701         workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
702         _mm_storel_epi64((__m128i *)&flat2_oq[2 * 16 + i * 8],
703                          _mm_packus_epi16(workp_shft, workp_shft));
704 
705         a = _mm_add_epi16(q7, a);
706         c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p4, q2)), q3);
707         workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
708         _mm_storel_epi64((__m128i *)&flat2_oq[3 * 16 + i * 8],
709                          _mm_packus_epi16(workp_shft, workp_shft));
710 
711         a = _mm_add_epi16(q7, a);
712         c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p3, q3)), q4);
713         workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
714         _mm_storel_epi64((__m128i *)&flat2_oq[4 * 16 + i * 8],
715                          _mm_packus_epi16(workp_shft, workp_shft));
716 
717         a = _mm_add_epi16(q7, a);
718         c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p2, q4)), q5);
719         workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
720         _mm_storel_epi64((__m128i *)&flat2_oq[5 * 16 + i * 8],
721                          _mm_packus_epi16(workp_shft, workp_shft));
722 
723         a = _mm_add_epi16(q7, a);
724         c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p1, q5)), q6);
725         workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
726         _mm_storel_epi64((__m128i *)&flat2_oq[6 * 16 + i * 8],
727                          _mm_packus_epi16(workp_shft, workp_shft));
728 
729         temp_flat2 = _mm_srli_si128(temp_flat2, 8);
730         src += 8;
731       } while (++i < 2);
732     }
733     // wide flat
734     // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
735 
736     work_a = _mm_load_si128((__m128i *)&ap[2 * 16]);
737     p2 = _mm_load_si128((__m128i *)&flat_op[2 * 16]);
738     work_a = _mm_andnot_si128(flat, work_a);
739     p2 = _mm_and_si128(flat, p2);
740     p2 = _mm_or_si128(work_a, p2);
741     _mm_store_si128((__m128i *)&flat_op[2 * 16], p2);
742 
743     p1 = _mm_load_si128((__m128i *)&flat_op[1 * 16]);
744     work_a = _mm_andnot_si128(flat, ps1);
745     p1 = _mm_and_si128(flat, p1);
746     p1 = _mm_or_si128(work_a, p1);
747     _mm_store_si128((__m128i *)&flat_op[1 * 16], p1);
748 
749     p0 = _mm_load_si128((__m128i *)&flat_op[0]);
750     work_a = _mm_andnot_si128(flat, ps0);
751     p0 = _mm_and_si128(flat, p0);
752     p0 = _mm_or_si128(work_a, p0);
753     _mm_store_si128((__m128i *)&flat_op[0], p0);
754 
755     q0 = _mm_load_si128((__m128i *)&flat_oq[0]);
756     work_a = _mm_andnot_si128(flat, qs0);
757     q0 = _mm_and_si128(flat, q0);
758     q0 = _mm_or_si128(work_a, q0);
759     _mm_store_si128((__m128i *)&flat_oq[0], q0);
760 
761     q1 = _mm_load_si128((__m128i *)&flat_oq[1 * 16]);
762     work_a = _mm_andnot_si128(flat, qs1);
763     q1 = _mm_and_si128(flat, q1);
764     q1 = _mm_or_si128(work_a, q1);
765     _mm_store_si128((__m128i *)&flat_oq[1 * 16], q1);
766 
767     work_a = _mm_load_si128((__m128i *)&aq[2 * 16]);
768     q2 = _mm_load_si128((__m128i *)&flat_oq[2 * 16]);
769     work_a = _mm_andnot_si128(flat, work_a);
770     q2 = _mm_and_si128(flat, q2);
771     q2 = _mm_or_si128(work_a, q2);
772     _mm_store_si128((__m128i *)&flat_oq[2 * 16], q2);
773 
774     // write out op6 - op3
775     {
776       unsigned char *dst = (s - 7 * p);
777       for (i = 6; i > 2; i--) {
778         __m128i flat2_output;
779         work_a = _mm_load_si128((__m128i *)&ap[i * 16]);
780         flat2_output = _mm_load_si128((__m128i *)&flat2_op[i * 16]);
781         work_a = _mm_andnot_si128(flat2, work_a);
782         flat2_output = _mm_and_si128(flat2, flat2_output);
783         work_a = _mm_or_si128(work_a, flat2_output);
784         _mm_storeu_si128((__m128i *)dst, work_a);
785         dst += p;
786       }
787     }
788 
789     work_a = _mm_load_si128((__m128i *)&flat_op[2 * 16]);
790     p2 = _mm_load_si128((__m128i *)&flat2_op[2 * 16]);
791     work_a = _mm_andnot_si128(flat2, work_a);
792     p2 = _mm_and_si128(flat2, p2);
793     p2 = _mm_or_si128(work_a, p2);
794     _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
795 
796     work_a = _mm_load_si128((__m128i *)&flat_op[1 * 16]);
797     p1 = _mm_load_si128((__m128i *)&flat2_op[1 * 16]);
798     work_a = _mm_andnot_si128(flat2, work_a);
799     p1 = _mm_and_si128(flat2, p1);
800     p1 = _mm_or_si128(work_a, p1);
801     _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
802 
803     work_a = _mm_load_si128((__m128i *)&flat_op[0]);
804     p0 = _mm_load_si128((__m128i *)&flat2_op[0]);
805     work_a = _mm_andnot_si128(flat2, work_a);
806     p0 = _mm_and_si128(flat2, p0);
807     p0 = _mm_or_si128(work_a, p0);
808     _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
809 
810     work_a = _mm_load_si128((__m128i *)&flat_oq[0]);
811     q0 = _mm_load_si128((__m128i *)&flat2_oq[0]);
812     work_a = _mm_andnot_si128(flat2, work_a);
813     q0 = _mm_and_si128(flat2, q0);
814     q0 = _mm_or_si128(work_a, q0);
815     _mm_storeu_si128((__m128i *)(s - 0 * p), q0);
816 
817     work_a = _mm_load_si128((__m128i *)&flat_oq[1 * 16]);
818     q1 = _mm_load_si128((__m128i *)&flat2_oq[16]);
819     work_a = _mm_andnot_si128(flat2, work_a);
820     q1 = _mm_and_si128(flat2, q1);
821     q1 = _mm_or_si128(work_a, q1);
822     _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
823 
824     work_a = _mm_load_si128((__m128i *)&flat_oq[2 * 16]);
825     q2 = _mm_load_si128((__m128i *)&flat2_oq[2 * 16]);
826     work_a = _mm_andnot_si128(flat2, work_a);
827     q2 = _mm_and_si128(flat2, q2);
828     q2 = _mm_or_si128(work_a, q2);
829     _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
830 
831     // write out oq3 - oq7
832     {
833       unsigned char *dst = (s + 3 * p);
834       for (i = 3; i < 7; i++) {
835         __m128i flat2_output;
836         work_a = _mm_load_si128((__m128i *)&aq[i * 16]);
837         flat2_output = _mm_load_si128((__m128i *)&flat2_oq[i * 16]);
838         work_a = _mm_andnot_si128(flat2, work_a);
839         flat2_output = _mm_and_si128(flat2, flat2_output);
840         work_a = _mm_or_si128(work_a, flat2_output);
841         _mm_storeu_si128((__m128i *)dst, work_a);
842         dst += p;
843       }
844     }
845   }
846 }
847 
848 // TODO(yunqingwang): remove count and call these 2 functions(8 or 16) directly.
849 void vp9_lpf_horizontal_16_sse2(unsigned char *s, int p,
850                                 const unsigned char *_blimit,
851                                 const unsigned char *_limit,
852                                 const unsigned char *_thresh, int count) {
853   if (count == 1)
854     mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh);
855   else
856     mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh);
857 }
858 
859 void vp9_lpf_horizontal_8_sse2(unsigned char *s, int p,
860                                const unsigned char *_blimit,
861                                const unsigned char *_limit,
862                                const unsigned char *_thresh, int count) {
863   DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op2, 16);
864   DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op1, 16);
865   DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op0, 16);
866   DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq2, 16);
867   DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq1, 16);
868   DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq0, 16);
869   const __m128i zero = _mm_set1_epi16(0);
870   const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
871   const __m128i limit = _mm_load_si128((const __m128i *)_limit);
872   const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
873   __m128i mask, hev, flat;
874   __m128i p3, p2, p1, p0, q0, q1, q2, q3;
875   __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0;
876 
877   (void)count;
878 
879   q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),
880                             _mm_loadl_epi64((__m128i *)(s + 3 * p)));
881   q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
882                             _mm_loadl_epi64((__m128i *)(s + 2 * p)));
883   q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
884                             _mm_loadl_epi64((__m128i *)(s + 1 * p)));
885   q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
886                             _mm_loadl_epi64((__m128i *)(s - 0 * p)));
887   p1q1 = _mm_shuffle_epi32(q1p1, 78);
888   p0q0 = _mm_shuffle_epi32(q0p0, 78);
889 
890   {
891     // filter_mask and hev_mask
892     const __m128i one = _mm_set1_epi8(1);
893     const __m128i fe = _mm_set1_epi8(0xfe);
894     const __m128i ff = _mm_cmpeq_epi8(fe, fe);
895     __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
896     abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0),
897                             _mm_subs_epu8(q0p0, q1p1));
898     abs_q1q0 =  _mm_srli_si128(abs_p1p0, 8);
899 
900     abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0),
901                             _mm_subs_epu8(p0q0, q0p0));
902     abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1),
903                             _mm_subs_epu8(p1q1, q1p1));
904     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
905     hev = _mm_subs_epu8(flat, thresh);
906     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
907 
908     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
909     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
910     mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
911     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
912     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
913     mask = _mm_max_epu8(abs_p1p0, mask);
914     // mask |= (abs(p1 - p0) > limit) * -1;
915     // mask |= (abs(q1 - q0) > limit) * -1;
916 
917     work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q1p1),
918                                      _mm_subs_epu8(q1p1, q2p2)),
919                         _mm_or_si128(_mm_subs_epu8(q3p3, q2p2),
920                                      _mm_subs_epu8(q2p2, q3p3)));
921     mask = _mm_max_epu8(work, mask);
922     mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
923     mask = _mm_subs_epu8(mask, limit);
924     mask = _mm_cmpeq_epi8(mask, zero);
925 
926     // flat_mask4
927 
928     flat = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q0p0),
929                                      _mm_subs_epu8(q0p0, q2p2)),
930                         _mm_or_si128(_mm_subs_epu8(q3p3, q0p0),
931                                      _mm_subs_epu8(q0p0, q3p3)));
932     flat = _mm_max_epu8(abs_p1p0, flat);
933     flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
934     flat = _mm_subs_epu8(flat, one);
935     flat = _mm_cmpeq_epi8(flat, zero);
936     flat = _mm_and_si128(flat, mask);
937   }
938 
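  // Flat (filter8) path: compute op2..oq2 into the flat_op*/flat_oq* scratch
  // buffers; they are blended in below wherever the flat mask is set.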
939   {
940     const __m128i four = _mm_set1_epi16(4);
941     unsigned char *src = s;
942     {
943       __m128i workp_a, workp_b, workp_shft;
944       p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
945       p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
946       p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
947       p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
948       q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
949       q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
950       q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
951       q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
952 
953       workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
954       workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
955       workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
956       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
957       _mm_storel_epi64((__m128i *)&flat_op2[0],
958                        _mm_packus_epi16(workp_shft, workp_shft));
959 
960       workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
961       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
962       _mm_storel_epi64((__m128i *)&flat_op1[0],
963                        _mm_packus_epi16(workp_shft, workp_shft));
964 
965       workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
966       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
967       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
968       _mm_storel_epi64((__m128i *)&flat_op0[0],
969                        _mm_packus_epi16(workp_shft, workp_shft));
970 
971       workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
972       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
973       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
974       _mm_storel_epi64((__m128i *)&flat_oq0[0],
975                        _mm_packus_epi16(workp_shft, workp_shft));
976 
977       workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
978       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
979       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
980       _mm_storel_epi64((__m128i *)&flat_oq1[0],
981                        _mm_packus_epi16(workp_shft, workp_shft));
982 
983       workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
984       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
985       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
986       _mm_storel_epi64((__m128i *)&flat_oq2[0],
987                        _mm_packus_epi16(workp_shft, workp_shft));
988     }
989   }
990   // lp filter
991   {
992     const __m128i t4 = _mm_set1_epi8(4);
993     const __m128i t3 = _mm_set1_epi8(3);
994     const __m128i t80 = _mm_set1_epi8(0x80);
995     const __m128i t1 = _mm_set1_epi8(0x1);
996     const __m128i ps1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
997                                       t80);
998     const __m128i ps0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
999                                       t80);
1000     const __m128i qs0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)),
1001                                       t80);
1002     const __m128i qs1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)),
1003                                       t80);
1004     __m128i filt;
1005     __m128i work_a;
1006     __m128i filter1, filter2;
1007 
1008     filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
1009     work_a = _mm_subs_epi8(qs0, ps0);
1010     filt = _mm_adds_epi8(filt, work_a);
1011     filt = _mm_adds_epi8(filt, work_a);
1012     filt = _mm_adds_epi8(filt, work_a);
1013     // (vp9_filter + 3 * (qs0 - ps0)) & mask
1014     filt = _mm_and_si128(filt, mask);
1015 
1016     filter1 = _mm_adds_epi8(filt, t4);
1017     filter2 = _mm_adds_epi8(filt, t3);
1018 
1019     // Filter1 >> 3
1020     filter1 = _mm_unpacklo_epi8(zero, filter1);
1021     filter1 = _mm_srai_epi16(filter1, 11);
1022     filter1 = _mm_packs_epi16(filter1, filter1);
1023 
1024     // Filter2 >> 3
1025     filter2 = _mm_unpacklo_epi8(zero, filter2);
1026     filter2 = _mm_srai_epi16(filter2, 11);
1027     filter2 = _mm_packs_epi16(filter2, zero);
1028 
1029     // filt >> 1
1030     filt = _mm_adds_epi8(filter1, t1);
1031     filt = _mm_unpacklo_epi8(zero, filt);
1032     filt = _mm_srai_epi16(filt, 9);
1033     filt = _mm_packs_epi16(filt, zero);
1034 
1035     filt = _mm_andnot_si128(hev, filt);
1036 
1037     work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
1038     q0 = _mm_loadl_epi64((__m128i *)flat_oq0);
1039     work_a = _mm_andnot_si128(flat, work_a);
1040     q0 = _mm_and_si128(flat, q0);
1041     q0 = _mm_or_si128(work_a, q0);
1042 
1043     work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
1044     q1 = _mm_loadl_epi64((__m128i *)flat_oq1);
1045     work_a = _mm_andnot_si128(flat, work_a);
1046     q1 = _mm_and_si128(flat, q1);
1047     q1 = _mm_or_si128(work_a, q1);
1048 
1049     work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
1050     q2 = _mm_loadl_epi64((__m128i *)flat_oq2);
1051     work_a = _mm_andnot_si128(flat, work_a);
1052     q2 = _mm_and_si128(flat, q2);
1053     q2 = _mm_or_si128(work_a, q2);
1054 
1055     work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
1056     p0 = _mm_loadl_epi64((__m128i *)flat_op0);
1057     work_a = _mm_andnot_si128(flat, work_a);
1058     p0 = _mm_and_si128(flat, p0);
1059     p0 = _mm_or_si128(work_a, p0);
1060 
1061     work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
1062     p1 = _mm_loadl_epi64((__m128i *)flat_op1);
1063     work_a = _mm_andnot_si128(flat, work_a);
1064     p1 = _mm_and_si128(flat, p1);
1065     p1 = _mm_or_si128(work_a, p1);
1066 
1067     work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
1068     p2 = _mm_loadl_epi64((__m128i *)flat_op2);
1069     work_a = _mm_andnot_si128(flat, work_a);
1070     p2 = _mm_and_si128(flat, p2);
1071     p2 = _mm_or_si128(work_a, p2);
1072 
1073     _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
1074     _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
1075     _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
1076     _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
1077     _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
1078     _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
1079   }
1080 }
1081 
1082 void vp9_lpf_horizontal_8_dual_sse2(uint8_t *s, int p,
1083                                     const uint8_t *_blimit0,
1084                                     const uint8_t *_limit0,
1085                                     const uint8_t *_thresh0,
1086                                     const uint8_t *_blimit1,
1087                                     const uint8_t *_limit1,
1088                                     const uint8_t *_thresh1) {
1089   DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op2, 16);
1090   DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op1, 16);
1091   DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op0, 16);
1092   DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq2, 16);
1093   DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq1, 16);
1094   DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq0, 16);
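  // Two adjacent 8-pixel edges are filtered in one pass: each threshold
  // register holds the first edge's values in its low 64 bits and the
  // second edge's in its high 64 bits.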
1095   const __m128i zero = _mm_set1_epi16(0);
1096   const __m128i blimit =
1097       _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
1098                          _mm_load_si128((const __m128i *)_blimit1));
1099   const __m128i limit =
1100       _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
1101                          _mm_load_si128((const __m128i *)_limit1));
1102   const __m128i thresh =
1103       _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
1104                          _mm_load_si128((const __m128i *)_thresh1));
1105 
1106   __m128i mask, hev, flat;
1107   __m128i p3, p2, p1, p0, q0, q1, q2, q3;
1108 
1109   p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
1110   p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
1111   p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
1112   p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
1113   q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
1114   q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
1115   q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
1116   q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
1117   {
1118     const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
1119                                           _mm_subs_epu8(p0, p1));
1120     const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
1121                                           _mm_subs_epu8(q0, q1));
1122     const __m128i one = _mm_set1_epi8(1);
1123     const __m128i fe = _mm_set1_epi8(0xfe);
1124     const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
1125     __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
1126                                     _mm_subs_epu8(q0, p0));
1127     __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
1128                                     _mm_subs_epu8(q1, p1));
1129     __m128i work;
1130 
1131     // filter_mask and hev_mask
1132     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
1133     hev = _mm_subs_epu8(flat, thresh);
1134     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
1135 
1136     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
1137     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
1138     mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
1139     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
1140     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
1141     mask = _mm_max_epu8(flat, mask);
1142     // mask |= (abs(p1 - p0) > limit) * -1;
1143     // mask |= (abs(q1 - q0) > limit) * -1;
1144     work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
1145                                      _mm_subs_epu8(p1, p2)),
1146                          _mm_or_si128(_mm_subs_epu8(p3, p2),
1147                                       _mm_subs_epu8(p2, p3)));
1148     mask = _mm_max_epu8(work, mask);
1149     work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
1150                                      _mm_subs_epu8(q1, q2)),
1151                          _mm_or_si128(_mm_subs_epu8(q3, q2),
1152                                       _mm_subs_epu8(q2, q3)));
1153     mask = _mm_max_epu8(work, mask);
1154     mask = _mm_subs_epu8(mask, limit);
1155     mask = _mm_cmpeq_epi8(mask, zero);
1156 
1157     // flat_mask4
1158     work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
1159                                      _mm_subs_epu8(p0, p2)),
1160                          _mm_or_si128(_mm_subs_epu8(q2, q0),
1161                                       _mm_subs_epu8(q0, q2)));
1162     flat = _mm_max_epu8(work, flat);
1163     work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
1164                                      _mm_subs_epu8(p0, p3)),
1165                          _mm_or_si128(_mm_subs_epu8(q3, q0),
1166                                       _mm_subs_epu8(q0, q3)));
1167     flat = _mm_max_epu8(work, flat);
1168     flat = _mm_subs_epu8(flat, one);
1169     flat = _mm_cmpeq_epi8(flat, zero);
1170     flat = _mm_and_si128(flat, mask);
1171   }
1172   {
1173     const __m128i four = _mm_set1_epi16(4);
1174     unsigned char *src = s;
1175     int i = 0;
1176 
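         // Each pass widens one 8-pixel half to 16 bits and evaluates the
         // wide filter with a running sum, e.g.
         // op2 = (3 * p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3.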
1177     do {
1178       __m128i workp_a, workp_b, workp_shft;
1179       p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
1180       p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
1181       p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
1182       p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
1183       q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
1184       q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
1185       q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
1186       q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
1187 
1188       workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
1189       workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
1190       workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
1191       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1192       _mm_storel_epi64((__m128i *)&flat_op2[i * 8],
1193                        _mm_packus_epi16(workp_shft, workp_shft));
1194 
1195       workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
1196       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1197       _mm_storel_epi64((__m128i *)&flat_op1[i * 8],
1198                        _mm_packus_epi16(workp_shft, workp_shft));
1199 
1200       workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
1201       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
1202       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1203       _mm_storel_epi64((__m128i *)&flat_op0[i * 8],
1204                        _mm_packus_epi16(workp_shft, workp_shft));
1205 
1206       workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
1207       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
1208       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1209       _mm_storel_epi64((__m128i *)&flat_oq0[i * 8],
1210                        _mm_packus_epi16(workp_shft, workp_shft));
1211 
1212       workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
1213       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
1214       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1215       _mm_storel_epi64((__m128i *)&flat_oq1[i * 8],
1216                        _mm_packus_epi16(workp_shft, workp_shft));
1217 
1218       workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
1219       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
1220       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1221       _mm_storel_epi64((__m128i *)&flat_oq2[i * 8],
1222                        _mm_packus_epi16(workp_shft, workp_shft));
1223 
1224       src += 8;
1225     } while (++i < 2);
1226   }
1227   // lp filter
1228   {
1229     const __m128i t4 = _mm_set1_epi8(4);
1230     const __m128i t3 = _mm_set1_epi8(3);
1231     const __m128i t80 = _mm_set1_epi8(0x80);
1232     const __m128i te0 = _mm_set1_epi8(0xe0);
1233     const __m128i t1f = _mm_set1_epi8(0x1f);
1234     const __m128i t1 = _mm_set1_epi8(0x1);
1235     const __m128i t7f = _mm_set1_epi8(0x7f);
1236 
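         // Bias the four rows nearest the edge into signed range (XOR 0x80)
         // so the 4-tap filter can be computed with saturating signed math.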
1237     const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)),
1238                                       t80);
1239     const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)),
1240                                       t80);
1241     const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)),
1242                                       t80);
1243     const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)),
1244                                       t80);
1245     __m128i filt;
1246     __m128i work_a;
1247     __m128i filter1, filter2;
1248 
1249     filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
1250     work_a = _mm_subs_epi8(qs0, ps0);
1251     filt = _mm_adds_epi8(filt, work_a);
1252     filt = _mm_adds_epi8(filt, work_a);
1253     filt = _mm_adds_epi8(filt, work_a);
1254     // (vp9_filter + 3 * (qs0 - ps0)) & mask
1255     filt = _mm_and_si128(filt, mask);
1256 
1257     filter1 = _mm_adds_epi8(filt, t4);
1258     filter2 = _mm_adds_epi8(filt, t3);
1259 
1260     // Filter1 >> 3
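         // SSE2 has no 8-bit arithmetic shift: shift the 16-bit lanes
         // logically, then mask and re-insert the sign bits.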
1261     work_a = _mm_cmpgt_epi8(zero, filter1);
1262     filter1 = _mm_srli_epi16(filter1, 3);
1263     work_a = _mm_and_si128(work_a, te0);
1264     filter1 = _mm_and_si128(filter1, t1f);
1265     filter1 = _mm_or_si128(filter1, work_a);
1266 
1267     // Filter2 >> 3
1268     work_a = _mm_cmpgt_epi8(zero, filter2);
1269     filter2 = _mm_srli_epi16(filter2, 3);
1270     work_a = _mm_and_si128(work_a, te0);
1271     filter2 = _mm_and_si128(filter2, t1f);
1272     filter2 = _mm_or_si128(filter2, work_a);
1273 
1274     // filt >> 1
1275     filt = _mm_adds_epi8(filter1, t1);
1276     work_a = _mm_cmpgt_epi8(zero, filt);
1277     filt = _mm_srli_epi16(filt, 1);
1278     work_a = _mm_and_si128(work_a, t80);
1279     filt = _mm_and_si128(filt, t7f);
1280     filt = _mm_or_si128(filt, work_a);
1281 
1282     filt = _mm_andnot_si128(hev, filt);
1283 
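         // Blend the two results: where flat (already ANDed with mask) is set,
         // take the wide-filter output from the scratch buffers; elsewhere
         // keep the 4-tap result.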
1284     work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
1285     q0 = _mm_load_si128((__m128i *)flat_oq0);
1286     work_a = _mm_andnot_si128(flat, work_a);
1287     q0 = _mm_and_si128(flat, q0);
1288     q0 = _mm_or_si128(work_a, q0);
1289 
1290     work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
1291     q1 = _mm_load_si128((__m128i *)flat_oq1);
1292     work_a = _mm_andnot_si128(flat, work_a);
1293     q1 = _mm_and_si128(flat, q1);
1294     q1 = _mm_or_si128(work_a, q1);
1295 
1296     work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
1297     q2 = _mm_load_si128((__m128i *)flat_oq2);
1298     work_a = _mm_andnot_si128(flat, work_a);
1299     q2 = _mm_and_si128(flat, q2);
1300     q2 = _mm_or_si128(work_a, q2);
1301 
1302     work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
1303     p0 = _mm_load_si128((__m128i *)flat_op0);
1304     work_a = _mm_andnot_si128(flat, work_a);
1305     p0 = _mm_and_si128(flat, p0);
1306     p0 = _mm_or_si128(work_a, p0);
1307 
1308     work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
1309     p1 = _mm_load_si128((__m128i *)flat_op1);
1310     work_a = _mm_andnot_si128(flat, work_a);
1311     p1 = _mm_and_si128(flat, p1);
1312     p1 = _mm_or_si128(work_a, p1);
1313 
1314     work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
1315     p2 = _mm_load_si128((__m128i *)flat_op2);
1316     work_a = _mm_andnot_si128(flat, work_a);
1317     p2 = _mm_and_si128(flat, p2);
1318     p2 = _mm_or_si128(work_a, p2);
1319 
1320     _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
1321     _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
1322     _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
1323     _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
1324     _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
1325     _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
1326   }
1327 }
1328 
1329 void vp9_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
1330                                     const unsigned char *_blimit0,
1331                                     const unsigned char *_limit0,
1332                                     const unsigned char *_thresh0,
1333                                     const unsigned char *_blimit1,
1334                                     const unsigned char *_limit1,
1335                                     const unsigned char *_thresh1) {
1336   const __m128i blimit =
1337       _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
1338                          _mm_load_si128((const __m128i *)_blimit1));
1339   const __m128i limit =
1340       _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
1341                          _mm_load_si128((const __m128i *)_limit1));
1342   const __m128i thresh =
1343       _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
1344                          _mm_load_si128((const __m128i *)_thresh1));
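       // The two 8-pixel edges share one pass: edge 0's blimit/limit/thresh
       // occupy the low 8 bytes, edge 1's the high 8 bytes.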
1345   const __m128i zero = _mm_set1_epi16(0);
1346   __m128i p3, p2, p1, p0, q0, q1, q2, q3;
1347   __m128i mask, hev, flat;
1348 
1349   p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
1350   p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
1351   p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
1352   p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
1353   q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
1354   q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
1355   q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
1356   q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
1357 
1358   // filter_mask and hev_mask
1359   {
1360     const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
1361                                           _mm_subs_epu8(p0, p1));
1362     const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
1363                                           _mm_subs_epu8(q0, q1));
1364     const __m128i fe = _mm_set1_epi8(0xfe);
1365     const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
1366     __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
1367                                     _mm_subs_epu8(q0, p0));
1368     __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
1369                                     _mm_subs_epu8(q1, p1));
1370     __m128i work;
1371 
1372     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
1373     hev = _mm_subs_epu8(flat, thresh);
1374     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
1375 
1376     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
1377     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
1378     mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
1379     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
1380     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
1381     mask = _mm_max_epu8(flat, mask);
1382     // mask |= (abs(p1 - p0) > limit) * -1;
1383     // mask |= (abs(q1 - q0) > limit) * -1;
1384     work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
1385                                      _mm_subs_epu8(p1, p2)),
1386                          _mm_or_si128(_mm_subs_epu8(p3, p2),
1387                                       _mm_subs_epu8(p2, p3)));
1388     mask = _mm_max_epu8(work, mask);
1389     work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
1390                                      _mm_subs_epu8(q1, q2)),
1391                          _mm_or_si128(_mm_subs_epu8(q3, q2),
1392                                       _mm_subs_epu8(q2, q3)));
1393     mask = _mm_max_epu8(work, mask);
1394     mask = _mm_subs_epu8(mask, limit);
1395     mask = _mm_cmpeq_epi8(mask, zero);
1396   }
1397 
1398   // filter4
1399   {
1400     const __m128i t4 = _mm_set1_epi8(4);
1401     const __m128i t3 = _mm_set1_epi8(3);
1402     const __m128i t80 = _mm_set1_epi8(0x80);
1403     const __m128i te0 = _mm_set1_epi8(0xe0);
1404     const __m128i t1f = _mm_set1_epi8(0x1f);
1405     const __m128i t1 = _mm_set1_epi8(0x1);
1406     const __m128i t7f = _mm_set1_epi8(0x7f);
1407 
1408     const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)),
1409                                       t80);
1410     const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)),
1411                                       t80);
1412     const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)),
1413                                       t80);
1414     const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)),
1415                                       t80);
1416     __m128i filt;
1417     __m128i work_a;
1418     __m128i filter1, filter2;
1419 
1420     filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
1421     work_a = _mm_subs_epi8(qs0, ps0);
1422     filt = _mm_adds_epi8(filt, work_a);
1423     filt = _mm_adds_epi8(filt, work_a);
1424     filt = _mm_adds_epi8(filt, work_a);
1425     // (vp9_filter + 3 * (qs0 - ps0)) & mask
1426     filt = _mm_and_si128(filt, mask);
1427 
1428     filter1 = _mm_adds_epi8(filt, t4);
1429     filter2 = _mm_adds_epi8(filt, t3);
1430 
1431     // Filter1 >> 3
1432     work_a = _mm_cmpgt_epi8(zero, filter1);
1433     filter1 = _mm_srli_epi16(filter1, 3);
1434     work_a = _mm_and_si128(work_a, te0);
1435     filter1 = _mm_and_si128(filter1, t1f);
1436     filter1 = _mm_or_si128(filter1, work_a);
1437 
1438     // Filter2 >> 3
1439     work_a = _mm_cmpgt_epi8(zero, filter2);
1440     filter2 = _mm_srli_epi16(filter2, 3);
1441     work_a = _mm_and_si128(work_a, te0);
1442     filter2 = _mm_and_si128(filter2, t1f);
1443     filter2 = _mm_or_si128(filter2, work_a);
1444 
1445     // filt >> 1
1446     filt = _mm_adds_epi8(filter1, t1);
1447     work_a = _mm_cmpgt_epi8(zero, filt);
1448     filt = _mm_srli_epi16(filt, 1);
1449     work_a = _mm_and_si128(work_a, t80);
1450     filt = _mm_and_si128(filt, t7f);
1451     filt = _mm_or_si128(filt, work_a);
1452 
1453     filt = _mm_andnot_si128(hev, filt);
1454 
1455     q0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
1456     q1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
1457     p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
1458     p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
1459 
1460     _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
1461     _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
1462     _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
1463     _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
1464   }
1465 }
1466 
1467 static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
1468                                  int in_p, unsigned char *out, int out_p) {
1469   __m128i x0, x1, x2, x3, x4, x5, x6, x7;
1470   __m128i x8, x9, x10, x11, x12, x13, x14, x15;
1471 
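       // Transpose a combined 16x8 block: 16 input rows of 8 bytes (8 from
       // in0, 8 from in1) become 8 output rows of 16 bytes.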
1472   // Read in 16 lines
1473   x0 = _mm_loadl_epi64((__m128i *)in0);
1474   x8 = _mm_loadl_epi64((__m128i *)in1);
1475   x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));
1476   x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p));
1477   x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p));
1478   x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p));
1479   x3 = _mm_loadl_epi64((__m128i *)(in0 + 3 * in_p));
1480   x11 = _mm_loadl_epi64((__m128i *)(in1 + 3 * in_p));
1481   x4 = _mm_loadl_epi64((__m128i *)(in0 + 4 * in_p));
1482   x12 = _mm_loadl_epi64((__m128i *)(in1 + 4 * in_p));
1483   x5 = _mm_loadl_epi64((__m128i *)(in0 + 5 * in_p));
1484   x13 = _mm_loadl_epi64((__m128i *)(in1 + 5 * in_p));
1485   x6 = _mm_loadl_epi64((__m128i *)(in0 + 6 * in_p));
1486   x14 = _mm_loadl_epi64((__m128i *)(in1 + 6 * in_p));
1487   x7 = _mm_loadl_epi64((__m128i *)(in0 + 7 * in_p));
1488   x15 = _mm_loadl_epi64((__m128i *)(in1 + 7 * in_p));
1489 
1490   x0 = _mm_unpacklo_epi8(x0, x1);
1491   x1 = _mm_unpacklo_epi8(x2, x3);
1492   x2 = _mm_unpacklo_epi8(x4, x5);
1493   x3 = _mm_unpacklo_epi8(x6, x7);
1494 
1495   x8 = _mm_unpacklo_epi8(x8, x9);
1496   x9 = _mm_unpacklo_epi8(x10, x11);
1497   x10 = _mm_unpacklo_epi8(x12, x13);
1498   x11 = _mm_unpacklo_epi8(x14, x15);
1499 
1500   x4 = _mm_unpacklo_epi16(x0, x1);
1501   x5 = _mm_unpacklo_epi16(x2, x3);
1502   x12 = _mm_unpacklo_epi16(x8, x9);
1503   x13 = _mm_unpacklo_epi16(x10, x11);
1504 
1505   x6 = _mm_unpacklo_epi32(x4, x5);
1506   x7 = _mm_unpackhi_epi32(x4, x5);
1507   x14 = _mm_unpacklo_epi32(x12, x13);
1508   x15 = _mm_unpackhi_epi32(x12, x13);
1509 
1510   // Store first 4-line result
1511   _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
1512   _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));
1513   _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));
1514   _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15));
1515 
1516   x4 = _mm_unpackhi_epi16(x0, x1);
1517   x5 = _mm_unpackhi_epi16(x2, x3);
1518   x12 = _mm_unpackhi_epi16(x8, x9);
1519   x13 = _mm_unpackhi_epi16(x10, x11);
1520 
1521   x6 = _mm_unpacklo_epi32(x4, x5);
1522   x7 = _mm_unpackhi_epi32(x4, x5);
1523   x14 = _mm_unpacklo_epi32(x12, x13);
1524   x15 = _mm_unpackhi_epi32(x12, x13);
1525 
1526   // Store second 4-line result
1527   _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));
1528   _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));
1529   _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));
1530   _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
1531 }
1532 
1533 static INLINE void transpose(unsigned char *src[], int in_p,
1534                              unsigned char *dst[], int out_p,
1535                              int num_8x8_to_transpose) {
1536   int idx8x8 = 0;
1537   __m128i x0, x1, x2, x3, x4, x5, x6, x7;
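       // Transpose num_8x8_to_transpose independent 8x8 byte blocks; src[i]
       // and dst[i] point at the first row of the i-th block.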
1538   do {
1539     unsigned char *in = src[idx8x8];
1540     unsigned char *out = dst[idx8x8];
1541 
1542     x0 = _mm_loadl_epi64((__m128i *)(in + 0*in_p));  // 00 01 02 03 04 05 06 07
1543     x1 = _mm_loadl_epi64((__m128i *)(in + 1*in_p));  // 10 11 12 13 14 15 16 17
1544     x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p));  // 20 21 22 23 24 25 26 27
1545     x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p));  // 30 31 32 33 34 35 36 37
1546     x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p));  // 40 41 42 43 44 45 46 47
1547     x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p));  // 50 51 52 53 54 55 56 57
1548     x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p));  // 60 61 62 63 64 65 66 67
1549     x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p));  // 70 71 72 73 74 75 76 77
1550     // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
1551     x0 = _mm_unpacklo_epi8(x0, x1);
1552     // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
1553     x1 = _mm_unpacklo_epi8(x2, x3);
1554     // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
1555     x2 = _mm_unpacklo_epi8(x4, x5);
1556     // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
1557     x3 = _mm_unpacklo_epi8(x6, x7);
1558     // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
1559     x4 = _mm_unpacklo_epi16(x0, x1);
1560     // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
1561     x5 = _mm_unpacklo_epi16(x2, x3);
1562     // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
1563     x6 = _mm_unpacklo_epi32(x4, x5);
1564     // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
1565     x7 = _mm_unpackhi_epi32(x4, x5);
1566 
1567     _mm_storel_pd((double *)(out + 0*out_p),
1568                   _mm_castsi128_pd(x6));  // 00 10 20 30 40 50 60 70
1569     _mm_storeh_pd((double *)(out + 1*out_p),
1570                   _mm_castsi128_pd(x6));  // 01 11 21 31 41 51 61 71
1571     _mm_storel_pd((double *)(out + 2*out_p),
1572                   _mm_castsi128_pd(x7));  // 02 12 22 32 42 52 62 72
1573     _mm_storeh_pd((double *)(out + 3*out_p),
1574                   _mm_castsi128_pd(x7));  // 03 13 23 33 43 53 63 73
1575 
1576     // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
1577     x4 = _mm_unpackhi_epi16(x0, x1);
1578     // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
1579     x5 = _mm_unpackhi_epi16(x2, x3);
1580     // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
1581     x6 = _mm_unpacklo_epi32(x4, x5);
1582     // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
1583     x7 = _mm_unpackhi_epi32(x4, x5);
1584 
1585     _mm_storel_pd((double *)(out + 4*out_p),
1586                   _mm_castsi128_pd(x6));  // 04 14 24 34 44 54 64 74
1587     _mm_storeh_pd((double *)(out + 5*out_p),
1588                   _mm_castsi128_pd(x6));  // 05 15 25 35 45 55 65 75
1589     _mm_storel_pd((double *)(out + 6*out_p),
1590                   _mm_castsi128_pd(x7));  // 06 16 26 36 46 56 66 76
1591     _mm_storeh_pd((double *)(out + 7*out_p),
1592                   _mm_castsi128_pd(x7));  // 07 17 27 37 47 57 67 77
1593   } while (++idx8x8 < num_8x8_to_transpose);
1594 }
1595 
1596 void vp9_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
1597                                   const uint8_t *limit0,
1598                                   const uint8_t *thresh0,
1599                                   const uint8_t *blimit1,
1600                                   const uint8_t *limit1,
1601                                   const uint8_t *thresh1) {
1602   DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 16 * 8);
1603   unsigned char *src[2];
1604   unsigned char *dst[2];
1605 
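       // Handle the vertical edge by transposing the surrounding columns into
       // t_dst, filtering them there as a horizontal edge, and transposing the
       // result back into place.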
1606   // Transpose 8x16
1607   transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
1608 
1609   // Loop filtering
1610   vp9_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
1611                                  blimit1, limit1, thresh1);
1612   src[0] = t_dst;
1613   src[1] = t_dst + 8;
1614   dst[0] = s - 4;
1615   dst[1] = s - 4 + p * 8;
1616 
1617   // Transpose back
1618   transpose(src, 16, dst, p, 2);
1619 }
1620 
1621 void vp9_lpf_vertical_8_sse2(unsigned char *s, int p,
1622                              const unsigned char *blimit,
1623                              const unsigned char *limit,
1624                              const unsigned char *thresh, int count) {
1625   DECLARE_ALIGNED_ARRAY(8, unsigned char, t_dst, 8 * 8);
1626   unsigned char *src[1];
1627   unsigned char *dst[1];
1628   (void)count;
1629 
1630   // Transpose 8x8
1631   src[0] = s - 4;
1632   dst[0] = t_dst;
1633 
1634   transpose(src, p, dst, 8, 1);
1635 
1636   // Loop filtering
1637   vp9_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1);
1638 
1639   src[0] = t_dst;
1640   dst[0] = s - 4;
1641 
1642   // Transpose back
1643   transpose(src, 8, dst, p, 1);
1644 }
1645 
1646 void vp9_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
1647                                   const uint8_t *limit0,
1648                                   const uint8_t *thresh0,
1649                                   const uint8_t *blimit1,
1650                                   const uint8_t *limit1,
1651                                   const uint8_t *thresh1) {
1652   DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 16 * 8);
1653   unsigned char *src[2];
1654   unsigned char *dst[2];
1655 
1656   // Transpose 8x16
1657   transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
1658 
1659   // Loop filtering
1660   vp9_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
1661                                  blimit1, limit1, thresh1);
1662   src[0] = t_dst;
1663   src[1] = t_dst + 8;
1664 
1665   dst[0] = s - 4;
1666   dst[1] = s - 4 + p * 8;
1667 
1668   // Transpose back
1669   transpose(src, 16, dst, p, 2);
1670 }
1671 
1672 void vp9_lpf_vertical_16_sse2(unsigned char *s, int p,
1673                               const unsigned char *blimit,
1674                               const unsigned char *limit,
1675                               const unsigned char *thresh) {
1676   DECLARE_ALIGNED_ARRAY(8, unsigned char, t_dst, 8 * 16);
1677   unsigned char *src[2];
1678   unsigned char *dst[2];
1679 
1680   src[0] = s - 8;
1681   src[1] = s;
1682   dst[0] = t_dst;
1683   dst[1] = t_dst + 8 * 8;
1684 
1685   // Transpose 16x8
1686   transpose(src, p, dst, 8, 2);
1687 
1688   // Loop filtering
1689   mb_lpf_horizontal_edge_w_sse2_8(t_dst + 8 * 8, 8, blimit, limit, thresh);
1690 
1691   src[0] = t_dst;
1692   src[1] = t_dst + 8 * 8;
1693   dst[0] = s - 8;
1694   dst[1] = s;
1695 
1696   // Transpose back
1697   transpose(src, 8, dst, p, 2);
1698 }
1699 
1700 void vp9_lpf_vertical_16_dual_sse2(unsigned char *s, int p,
1701                                    const uint8_t *blimit, const uint8_t *limit,
1702                                    const uint8_t *thresh) {
1703   DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
1704 
1705   // Transpose 16x16
1706   transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
1707   transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
1708 
1709   // Loop filtering
1710   mb_lpf_horizontal_edge_w_sse2_16(t_dst + 8 * 16, 16, blimit, limit,
1711                                    thresh);
1712 
1713   // Transpose back
1714   transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
1715   transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
1716 }
1717