1 /*
2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <emmintrin.h>  // SSE2
12 
13 #include "./vpx_dsp_rtcd.h"
14 #include "vpx_ports/mem.h"
15 #include "vpx_ports/emmintrin_compat.h"
16 #include "vpx_dsp/x86/mem_sse2.h"
17 
abs_diff(__m128i a,__m128i b)18 static INLINE __m128i abs_diff(__m128i a, __m128i b) {
19   return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
20 }
21 
// Computes `hev` (high edge variance) and `mask` (filter enable) for the
// 4-tap filters below.
//
// Reads from the expanding scope: the row pairs q1p1, q0p0, p1p0, q1q0, p3p2,
// p2p1, q3q2 and q2q1 (as built by the callers in this file, the second name
// of each pair occupies the low 64 bits), zero, thresh_v (threshold bytes
// widened to 16 bits) and limit_v (blimit in the low 64 bits, limit in the
// high 64 bits).
//
// Writes: hev  - 0xff / 0x00 per pixel, identical in both 64-bit halves.
//         mask - 0xff / 0x00 per pixel in the low 64 bits, zero above.
#define FILTER_HEV_MASK                                                       \
  do {                                                                        \
    /* low half: abs(p1 - p0); high half: abs(q1 - q0) */                     \
    const __m128i inner_diff = abs_diff(q1p1, q0p0);                          \
    /* low half: abs(p0 - q0); high half: abs(p1 - q1) */                     \
    const __m128i cross_diff = abs_diff(p1p0, q1q0);                          \
    __m128i edge_sum, half_p1q1, side_max;                                    \
                                                                              \
    /* hev: max(abs(p1 - p0), abs(q1 - q0)) > thresh (16-bit compare) */      \
    hev = _mm_max_epu8(inner_diff, _mm_srli_si128(inner_diff, 8));            \
    hev = _mm_cmpgt_epi16(_mm_unpacklo_epi8(hev, zero), thresh_v);            \
    hev = _mm_packs_epi16(hev, hev);                                          \
                                                                              \
    /* abs(p1 - q1) / 2: each byte v is widened to v * 257, shifted */        \
    /* right by 9 and narrowed back to a byte */                              \
    half_p1q1 = _mm_unpackhi_epi8(cross_diff, cross_diff);                    \
    half_p1q1 = _mm_srli_epi16(half_p1q1, 9);                                 \
    half_p1q1 = _mm_packs_epi16(half_p1q1, half_p1q1);                        \
    /* low half: abs(p0 - q0) * 2 + abs(p1 - q1) / 2, saturated */            \
    edge_sum = _mm_adds_epu8(cross_diff, cross_diff);                         \
    edge_sum = _mm_adds_epu8(edge_sum, half_p1q1);                            \
                                                                              \
    /* largest neighbouring-pixel difference on either side of the edge */    \
    side_max = _mm_max_epu8(abs_diff(p3p2, p2p1), inner_diff);                \
    side_max = _mm_max_epu8(abs_diff(q3q2, q2q1), side_max);                  \
    side_max = _mm_max_epu8(side_max, _mm_srli_si128(side_max, 8));           \
                                                                              \
    /* low half is checked against blimit, high half against limit */         \
    mask = _mm_unpacklo_epi64(edge_sum, side_max);                            \
    mask = _mm_cmpeq_epi8(_mm_subs_epu8(mask, limit_v), zero);                \
    /* both checks must pass; the high half of mask becomes zero */           \
    mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));                      \
  } while (0)
59 
// Applies the 4-tap filter to p1, p0, q0 and q1.
//
// Reads from the expanding scope: p1p0 and q1q0 (row pairs, second name in
// the low 64 bits), hev and mask (as produced by FILTER_HEV_MASK) and ff
// (subtracted to add 1, i.e. expected to be all ones as set up by the callers
// in this file).
//
// Writes: ps1ps0 and qs1qs0, the filtered rows in the same pair layout and
// back in the unsigned domain. hev is clobbered (reused as scratch).
#define FILTER4                                                               \
  do {                                                                        \
    const __m128i sign_bit = _mm_set1_epi8((int8_t)0x80);                     \
    /* rounding offsets: +4 in the low half, +3 in the high half */           \
    const __m128i round_3_4 =                                                 \
        _mm_unpacklo_epi64(_mm_set1_epi8(4), _mm_set1_epi8(3));               \
    __m128i diff, base, taps, taps_hi, half;                                  \
                                                                              \
    /* move to the signed domain: ^ 0x80 */                                   \
    ps1ps0 = _mm_xor_si128(p1p0, sign_bit);                                   \
    qs1qs0 = _mm_xor_si128(q1q0, sign_bit);                                   \
                                                                              \
    /* diff low half: ps0 - qs0; high half: ps1 - qs1 (both clamped) */       \
    diff = _mm_subs_epi8(ps1ps0, qs1qs0);                                     \
    /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */                 \
    base = _mm_and_si128(_mm_srli_si128(diff, 8), hev);                       \
    /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */        \
    base = _mm_subs_epi8(base, diff);                                         \
    base = _mm_subs_epi8(base, diff);                                         \
    base = _mm_subs_epi8(base, diff);                                         \
    base = _mm_and_si128(base, mask);                                         \
    /* replicate the low half so both offsets can be applied at once */       \
    base = _mm_unpacklo_epi64(base, base);                                    \
                                                                              \
    /* filter1 = signed_char_clamp(filter + 4) >> 3; (low half) */            \
    /* filter2 = signed_char_clamp(filter + 3) >> 3; (high half) */           \
    taps = _mm_adds_epi8(base, round_3_4);                                    \
    /* arithmetic >> 3 per byte: widen to 16 bits, shift by 8 + 3 */          \
    taps_hi = _mm_srai_epi16(_mm_unpackhi_epi8(taps, taps), 11);              \
    taps = _mm_srai_epi16(_mm_unpacklo_epi8(taps, taps), 11);                 \
    taps = _mm_packs_epi16(taps, taps_hi);                                    \
                                                                              \
    /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */                     \
    half = _mm_subs_epi8(taps, ff); /* + 1 */                                 \
    half = _mm_unpacklo_epi8(half, half);                                     \
    half = _mm_srai_epi16(half, 9); /* round */                               \
    half = _mm_packs_epi16(half, half);                                       \
    half = _mm_andnot_si128(hev, half);                                       \
                                                                              \
    /* hev is reused: low half filter2, high half filter (p side) */          \
    hev = _mm_unpackhi_epi64(taps, half);                                     \
    /* taps: low half filter1, high half filter (q side) */                   \
    taps = _mm_unpacklo_epi64(taps, half);                                    \
                                                                              \
    /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */   \
    qs1qs0 = _mm_subs_epi8(qs1qs0, taps);                                     \
    /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */   \
    ps1ps0 = _mm_adds_epi8(ps1ps0, hev);                                      \
    /* back to the unsigned domain: ^ 0x80 */                                 \
    qs1qs0 = _mm_xor_si128(qs1qs0, sign_bit);                                 \
    ps1ps0 = _mm_xor_si128(ps1ps0, sign_bit);                                 \
  } while (0)
106 
vpx_lpf_horizontal_4_sse2(uint8_t * s,int pitch,const uint8_t * blimit,const uint8_t * limit,const uint8_t * thresh)107 void vpx_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit,
108                                const uint8_t *limit, const uint8_t *thresh) {
109   const __m128i zero = _mm_set1_epi16(0);
110   const __m128i limit_v =
111       _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)blimit),
112                          _mm_loadl_epi64((const __m128i *)limit));
113   const __m128i thresh_v =
114       _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)thresh), zero);
115   const __m128i ff = _mm_cmpeq_epi8(zero, zero);
116   __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
117   __m128i mask, hev;
118 
119   p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * pitch)),
120                             _mm_loadl_epi64((__m128i *)(s - 4 * pitch)));
121   q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * pitch)),
122                             _mm_loadl_epi64((__m128i *)(s + 1 * pitch)));
123   q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * pitch)),
124                             _mm_loadl_epi64((__m128i *)(s + 0 * pitch)));
125   q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * pitch)),
126                             _mm_loadl_epi64((__m128i *)(s + 3 * pitch)));
127   p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
128   p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
129   q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
130   q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
131 
132   FILTER_HEV_MASK;
133   FILTER4;
134 
135   _mm_storeh_pi((__m64 *)(s - 2 * pitch), _mm_castsi128_ps(ps1ps0));  // *op1
136   _mm_storel_epi64((__m128i *)(s - 1 * pitch), ps1ps0);               // *op0
137   _mm_storel_epi64((__m128i *)(s + 0 * pitch), qs1qs0);               // *oq0
138   _mm_storeh_pi((__m64 *)(s + 1 * pitch), _mm_castsi128_ps(qs1qs0));  // *oq1
139 }
140 
vpx_lpf_vertical_4_sse2(uint8_t * s,int pitch,const uint8_t * blimit,const uint8_t * limit,const uint8_t * thresh)141 void vpx_lpf_vertical_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit,
142                              const uint8_t *limit, const uint8_t *thresh) {
143   const __m128i zero = _mm_set1_epi16(0);
144   const __m128i limit_v =
145       _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)blimit),
146                          _mm_loadl_epi64((const __m128i *)limit));
147   const __m128i thresh_v =
148       _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)thresh), zero);
149   const __m128i ff = _mm_cmpeq_epi8(zero, zero);
150   __m128i x0, x1, x2, x3;
151   __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
152   __m128i mask, hev;
153 
154   // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
155   q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * pitch - 4)),
156                            _mm_loadl_epi64((__m128i *)(s + 1 * pitch - 4)));
157 
158   // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
159   x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * pitch - 4)),
160                          _mm_loadl_epi64((__m128i *)(s + 3 * pitch - 4)));
161 
162   // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
163   x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * pitch - 4)),
164                          _mm_loadl_epi64((__m128i *)(s + 5 * pitch - 4)));
165 
166   // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
167   x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * pitch - 4)),
168                          _mm_loadl_epi64((__m128i *)(s + 7 * pitch - 4)));
169 
170   // Transpose 8x8
171   // 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
172   p1p0 = _mm_unpacklo_epi16(q1q0, x1);
173   // 40 50 60 70 41 51 61 71  42 52 62 72 43 53 63 73
174   x0 = _mm_unpacklo_epi16(x2, x3);
175   // 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
176   p3p2 = _mm_unpacklo_epi32(p1p0, x0);
177   // 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
178   p1p0 = _mm_unpackhi_epi32(p1p0, x0);
179   p3p2 = _mm_unpackhi_epi64(p3p2, _mm_slli_si128(p3p2, 8));  // swap lo and high
180   p1p0 = _mm_unpackhi_epi64(p1p0, _mm_slli_si128(p1p0, 8));  // swap lo and high
181 
182   // 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
183   q1q0 = _mm_unpackhi_epi16(q1q0, x1);
184   // 44 54 64 74 45 55 65 75  46 56 66 76 47 57 67 77
185   x2 = _mm_unpackhi_epi16(x2, x3);
186   // 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
187   q3q2 = _mm_unpackhi_epi32(q1q0, x2);
188   // 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
189   q1q0 = _mm_unpacklo_epi32(q1q0, x2);
190 
191   q0p0 = _mm_unpacklo_epi64(p1p0, q1q0);
192   q1p1 = _mm_unpackhi_epi64(p1p0, q1q0);
193   p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
194   p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
195   q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
196 
197   FILTER_HEV_MASK;
198   FILTER4;
199 
200   // Transpose 8x4 to 4x8
201   // qs1qs0: 20 21 22 23 24 25 26 27  30 31 32 33 34 34 36 37
202   // ps1ps0: 10 11 12 13 14 15 16 17  00 01 02 03 04 05 06 07
203   // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
204   ps1ps0 = _mm_unpackhi_epi64(ps1ps0, _mm_slli_si128(ps1ps0, 8));
205   // 10 30 11 31 12 32 13 33  14 34 15 35 16 36 17 37
206   x0 = _mm_unpackhi_epi8(ps1ps0, qs1qs0);
207   // 00 20 01 21 02 22 03 23  04 24 05 25 06 26 07 27
208   ps1ps0 = _mm_unpacklo_epi8(ps1ps0, qs1qs0);
209   // 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
210   qs1qs0 = _mm_unpackhi_epi8(ps1ps0, x0);
211   // 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
212   ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0);
213 
214   storeu_uint32(s + 0 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
215   ps1ps0 = _mm_srli_si128(ps1ps0, 4);
216   storeu_uint32(s + 1 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
217   ps1ps0 = _mm_srli_si128(ps1ps0, 4);
218   storeu_uint32(s + 2 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
219   ps1ps0 = _mm_srli_si128(ps1ps0, 4);
220   storeu_uint32(s + 3 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
221 
222   storeu_uint32(s + 4 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
223   qs1qs0 = _mm_srli_si128(qs1qs0, 4);
224   storeu_uint32(s + 5 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
225   qs1qs0 = _mm_srli_si128(qs1qs0, 4);
226   storeu_uint32(s + 6 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
227   qs1qs0 = _mm_srli_si128(qs1qs0, 4);
228   storeu_uint32(s + 7 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
229 }
230 
// Wide loop filter across a horizontal edge, 8 pixels wide.
//
// `s` addresses the first pixel of the row directly below the edge (q0).
// Eight bytes are read from each of the rows s - 8 * pitch .. s + 7 * pitch,
// and the rows s - 7 * pitch .. s + 6 * pitch are written back. blimit, limit
// and thresh are each read with an aligned 16-byte load.
//
// Register naming: "qNpN" keeps row pN in the low 64 bits and row qN in the
// high 64 bits, so most operations process both sides of the edge at once.
//
// Per pixel column, one of three results is selected:
//   - the 4-tap filter output (p1..q1), when `flat` is not set;
//   - the `flat_*` averages (p2..q2, total weight 8, >> 3), when `flat` is set;
//   - the `flat2_*` averages (p6..q6, total weight 16, >> 4), when `flat2` is
//     set.
void vpx_lpf_horizontal_16_sse2(unsigned char *s, int pitch,
                                const unsigned char *blimit,
                                const unsigned char *limit,
                                const unsigned char *thresh) {
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi8(1);
  // _mm_load_si128 is an aligned load: these pointers must be 16-byte aligned.
  const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit);
  const __m128i limit_v = _mm_load_si128((const __m128i *)limit);
  const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh);
  __m128i mask, hev, flat, flat2;
  __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
  __m128i abs_p1p0;

  // Load p4..p0 into the low halves and q4..q0 into the high halves.
  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * pitch));
  q4p4 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * pitch)));
  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * pitch));
  q3p3 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * pitch)));
  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * pitch));
  q2p2 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * pitch)));
  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * pitch));
  q1p1 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * pitch)));
  // Shuffle control 78 (0x4E) exchanges the two 64-bit halves: q1 low, p1 high.
  p1q1 = _mm_shuffle_epi32(q1p1, 78);
  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * pitch));
  q0p0 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * pitch)));
  // Same half swap: q0 low, p0 high.
  p0q0 = _mm_shuffle_epi32(q0p0, 78);

  // Build hev (high edge variance) and mask (filter enable).
  {
    __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
    // low half: abs(p1 - p0); high half: abs(q1 - q0)
    abs_p1p0 = abs_diff(q1p1, q0p0);
    abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
    fe = _mm_set1_epi8((int8_t)0xfe);
    // Comparing a register with itself yields all ones.
    ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    abs_p0q0 = abs_diff(q0p0, p0q0);
    abs_p1q1 = abs_diff(q1p1, p1q1);
    // low half: max(abs(p1 - p0), abs(q1 - q0))
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    // hev = 0xff where that maximum exceeds thresh.
    hev = _mm_subs_epu8(flat, thresh_v);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    // abs(p0 - q0) * 2, saturating.
    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    // abs(p1 - q1) / 2: the low bit of every byte is cleared first so the
    // 16-bit shift cannot carry a bit into the neighbouring byte.
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    // From here on mask accumulates a running maximum; a 0xff entry from the
    // blimit test above dominates every later max.
    mask = _mm_max_epu8(abs_p1p0, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;

    // abs(p2 - p1), abs(p3 - p2) in the low half; the q equivalents above.
    work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
    mask = _mm_max_epu8(work, mask);
    // Fold the q-side maxima into the low half.
    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
    // mask = 0xff where the accumulated maximum is <= limit.
    mask = _mm_subs_epu8(mask, limit_v);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
    const __m128i t1 = _mm_set1_epi16(0x1);
    // XOR with 0x80 maps the unsigned pixels into the signed domain.
    __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
    __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
    // Half-swapped copies: qs0 holds q0 low / p0 high, qs1 holds q1 low /
    // p1 high.
    __m128i qs0 = _mm_xor_si128(p0q0, t80);
    __m128i qs1 = _mm_xor_si128(p1q1, t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;
    __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
    __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;

    // low half: signed_char_clamp(ps1 - qs1) & hev
    filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
    // low half: qs0 - ps0, added three times with saturation.
    work_a = _mm_subs_epi8(qs0, qs0ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    // (vpx_filter + 3 * (qs0 - ps0)) & mask
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    // Place each low-half byte in the upper byte of a 16-bit lane, then an
    // arithmetic shift by 8 + 3 (0xB) gives the sign-extended value >> 3.
    filter1 = _mm_unpacklo_epi8(zero, filter1);
    filter1 = _mm_srai_epi16(filter1, 0xB);
    filter2 = _mm_unpacklo_epi8(zero, filter2);
    filter2 = _mm_srai_epi16(filter2, 0xB);

    // Filter1 >> 3
    // Packed as +filter2 (low half, p side) and -filter1 (high half, q side),
    // so a single add yields ps0 + filter2 and qs0 - filter1.
    filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
    qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);

    // filt >> 1
    // (filter1 + 1) >> 1, zeroed where hev is set, then applied as +filt to
    // ps1 (low half) and -filt to qs1 (high half).
    filt = _mm_adds_epi16(filter1, t1);
    filt = _mm_srai_epi16(filt, 1);
    filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
                            filt);
    filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
    qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
    // loopfilter done

    // flat / flat2 decisions.
    {
      __m128i work;
      // flat: abs(x - p0) <= 1 for x in p1..p3 and abs(x - q0) <= 1 for x in
      // q1..q3, combined with mask.
      flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
      flat = _mm_max_epu8(abs_p1p0, flat);
      flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
      flat = _mm_subs_epu8(flat, one);
      flat = _mm_cmpeq_epi8(flat, zero);
      flat = _mm_and_si128(flat, mask);

      // Load the outer rows p5..p7 / q5..q7.
      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * pitch));
      q5p5 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * pitch)));

      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * pitch));
      q6p6 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * pitch)));
      flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0));

      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * pitch));
      q7p7 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * pitch)));
      // flat2: the same <= 1 test for rows 4..7 on both sides.
      work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0));
      flat2 = _mm_max_epu8(work, flat2);
      flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
      flat2 = _mm_subs_epu8(flat2, one);
      flat2 = _mm_cmpeq_epi8(flat2, zero);
      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
    }

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // flat and wide flat calculations
    // All sums are carried in 16-bit lanes. Two running totals are kept per
    // filter and updated incrementally from one output row to the next.
    {
      const __m128i eight = _mm_set1_epi16(8);
      const __m128i four = _mm_set1_epi16(4);
      __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
      __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
      __m128i pixelFilter_p, pixelFilter_q;
      __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
      __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;

      // Zero-extend every row to 16 bits: low halves are the p rows, high
      // halves the q rows.
      p7_16 = _mm_unpacklo_epi8(q7p7, zero);
      p6_16 = _mm_unpacklo_epi8(q6p6, zero);
      p5_16 = _mm_unpacklo_epi8(q5p5, zero);
      p4_16 = _mm_unpacklo_epi8(q4p4, zero);
      p3_16 = _mm_unpacklo_epi8(q3p3, zero);
      p2_16 = _mm_unpacklo_epi8(q2p2, zero);
      p1_16 = _mm_unpacklo_epi8(q1p1, zero);
      p0_16 = _mm_unpacklo_epi8(q0p0, zero);
      q0_16 = _mm_unpackhi_epi8(q0p0, zero);
      q1_16 = _mm_unpackhi_epi8(q1p1, zero);
      q2_16 = _mm_unpackhi_epi8(q2p2, zero);
      q3_16 = _mm_unpackhi_epi8(q3p3, zero);
      q4_16 = _mm_unpackhi_epi8(q4p4, zero);
      q5_16 = _mm_unpackhi_epi8(q5p5, zero);
      q6_16 = _mm_unpackhi_epi8(q6p6, zero);
      q7_16 = _mm_unpackhi_epi8(q7p7, zero);

      // p6 + p5 + p4 + p3 and q6 + q5 + q4 + q3.
      pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
                                    _mm_add_epi16(p4_16, p3_16));
      pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
                                    _mm_add_epi16(q4_16, q3_16));

      pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
      pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);

      pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
      pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
      // pixelFilter_p = 8 + (p6 + ... + p0) + (q0 + ... + q6)
      pixelFilter_p =
          _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
      // pixetFilter_p2p1p0 = 4 + (p2 + p1 + p0) + (q0 + q1 + q2)
      pixetFilter_p2p1p0 = _mm_add_epi16(
          four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
      // Wide result for p0 / q0.
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4);
      flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
      // Flat result for p0 / q0.
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);

      flat_q0p0 = _mm_packus_epi16(res_p, res_q);

      // sum_p7 / sum_q7 and sum_p3 / sum_q3 hold the growing multiples of the
      // outermost sample (2x here, one more per step below).
      sum_p7 = _mm_add_epi16(p7_16, p7_16);
      sum_q7 = _mm_add_epi16(q7_16, q7_16);
      sum_p3 = _mm_add_epi16(p3_16, p3_16);
      sum_q3 = _mm_add_epi16(q3_16, q3_16);

      // From here the totals diverge per side: each step drops the farthest
      // remaining sample of the opposite side.
      // Wide result for p1 / q1.
      pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4);
      flat2_q1p1 = _mm_packus_epi16(res_p, res_q);

      // Flat result for p1 / q1.
      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
      flat_q1p1 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      sum_p3 = _mm_add_epi16(sum_p3, p3_16);
      sum_q3 = _mm_add_epi16(sum_q3, q3_16);

      // Wide result for p2 / q2.
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4);
      flat2_q2p2 = _mm_packus_epi16(res_p, res_q);

      // Flat result for p2 / q2.
      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);

      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
      flat_q2p2 = _mm_packus_epi16(res_p, res_q);

      // Wide result for p3 / q3.
      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4);
      flat2_q3p3 = _mm_packus_epi16(res_p, res_q);

      // Wide result for p4 / q4.
      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4);
      flat2_q4p4 = _mm_packus_epi16(res_p, res_q);

      // Wide result for p5 / q5.
      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4);
      flat2_q5p5 = _mm_packus_epi16(res_p, res_q);

      // Wide result for p6 / q6.
      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4);
      flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
    }
    // wide flat
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    // Shuffle control 68 (0x44) copies the low 64 bits into the high 64 bits,
    // so the per-column decision covers both the p and the q half.
    flat = _mm_shuffle_epi32(flat, 68);
    flat2 = _mm_shuffle_epi32(flat2, 68);

    // First blend (flat): rows 2 keep the input unless flat; rows 1 and 0
    // start from the 4-tap result. Each select is (~m & a) | (m & b).
    q2p2 = _mm_andnot_si128(flat, q2p2);
    flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat_q2p2);

    qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
    flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
    q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);

    qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
    flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
    q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);

    // Second blend (flat2) and store, from the outermost written rows
    // (p6 / q6) inwards. The low half goes to the p row, the high half to the
    // q row. Rows p7 / q7 are only read, never written.
    q6p6 = _mm_andnot_si128(flat2, q6p6);
    flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
    q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
    _mm_storel_epi64((__m128i *)(s - 7 * pitch), q6p6);
    _mm_storeh_pi((__m64 *)(s + 6 * pitch), _mm_castsi128_ps(q6p6));

    q5p5 = _mm_andnot_si128(flat2, q5p5);
    flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
    q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
    _mm_storel_epi64((__m128i *)(s - 6 * pitch), q5p5);
    _mm_storeh_pi((__m64 *)(s + 5 * pitch), _mm_castsi128_ps(q5p5));

    q4p4 = _mm_andnot_si128(flat2, q4p4);
    flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
    q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
    _mm_storel_epi64((__m128i *)(s - 5 * pitch), q4p4);
    _mm_storeh_pi((__m64 *)(s + 4 * pitch), _mm_castsi128_ps(q4p4));

    // Rows 3 have no flat result; they change only when flat2 is set.
    q3p3 = _mm_andnot_si128(flat2, q3p3);
    flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
    q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
    _mm_storel_epi64((__m128i *)(s - 4 * pitch), q3p3);
    _mm_storeh_pi((__m64 *)(s + 3 * pitch), _mm_castsi128_ps(q3p3));

    q2p2 = _mm_andnot_si128(flat2, q2p2);
    flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
    _mm_storel_epi64((__m128i *)(s - 3 * pitch), q2p2);
    _mm_storeh_pi((__m64 *)(s + 2 * pitch), _mm_castsi128_ps(q2p2));

    q1p1 = _mm_andnot_si128(flat2, q1p1);
    flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
    q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
    _mm_storel_epi64((__m128i *)(s - 2 * pitch), q1p1);
    _mm_storeh_pi((__m64 *)(s + 1 * pitch), _mm_castsi128_ps(q1p1));

    q0p0 = _mm_andnot_si128(flat2, q0p0);
    flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
    q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
    _mm_storel_epi64((__m128i *)(s - 1 * pitch), q0p0);
    _mm_storeh_pi((__m64 *)(s - 0 * pitch), _mm_castsi128_ps(q0p0));
  }
}
562 
filter_add2_sub2(const __m128i * const total,const __m128i * const a1,const __m128i * const a2,const __m128i * const s1,const __m128i * const s2)563 static INLINE __m128i filter_add2_sub2(const __m128i *const total,
564                                        const __m128i *const a1,
565                                        const __m128i *const a2,
566                                        const __m128i *const s1,
567                                        const __m128i *const s2) {
568   __m128i x = _mm_add_epi16(*a1, *total);
569   x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(*s1, *s2)), *a2);
570   return x;
571 }
572 
filter8_mask(const __m128i * const flat,const __m128i * const other_filt,const __m128i * const f8_lo,const __m128i * const f8_hi)573 static INLINE __m128i filter8_mask(const __m128i *const flat,
574                                    const __m128i *const other_filt,
575                                    const __m128i *const f8_lo,
576                                    const __m128i *const f8_hi) {
577   const __m128i f8 =
578       _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3), _mm_srli_epi16(*f8_hi, 3));
579   const __m128i result = _mm_and_si128(*flat, f8);
580   return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
581 }
582 
filter16_mask(const __m128i * const flat,const __m128i * const other_filt,const __m128i * const f_lo,const __m128i * const f_hi)583 static INLINE __m128i filter16_mask(const __m128i *const flat,
584                                     const __m128i *const other_filt,
585                                     const __m128i *const f_lo,
586                                     const __m128i *const f_hi) {
587   const __m128i f =
588       _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4), _mm_srli_epi16(*f_hi, 4));
589   const __m128i result = _mm_and_si128(*flat, f);
590   return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
591 }
592 
// Wide loop filter across a horizontal edge, 16 adjacent columns at a time.
//
// s      points at row q0, the first row below the edge. Rows p7..p0 are read
//        from s - 8 * pitch .. s - 1 * pitch and rows q0..q7 from
//        s .. s + 7 * pitch; 16 bytes are read from every row.
// pitch  distance in bytes between two consecutive rows.
// blimit, limit, thresh
//        each is fetched with an aligned 16-byte load and compared byte by
//        byte against the per-column differences.
//
// Three candidate results are computed for every column and merged with
// byte masks: the 4-tap filter (where only `mask` holds), the 8-tap filter
// (where `flat` holds) and the 16-tap filter (where `flat2` holds). Rows
// p6..q6 are written back, 16 bytes each; rows p7 and q7 are only read.
void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int pitch,
                                     const unsigned char *blimit,
                                     const unsigned char *limit,
                                     const unsigned char *thresh) {
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi8(1);
  const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit);
  const __m128i limit_v = _mm_load_si128((const __m128i *)limit);
  const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh);
  __m128i mask, hev, flat, flat2;
  __m128i p7, p6, p5;
  __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
  __m128i q5, q6, q7;

  // Results of the narrower filters; the wide filter selects over them below.
  __m128i op2, op1, op0, oq0, oq1, oq2;

  // max(|p1 - p0|, |q1 - q0|) per column; shared by mask, flat and hev.
  __m128i max_abs_p1p0q1q0;

  // One unaligned 16-byte load per row: 8 rows above and 8 rows below the edge.
  p7 = _mm_loadu_si128((__m128i *)(s - 8 * pitch));
  p6 = _mm_loadu_si128((__m128i *)(s - 7 * pitch));
  p5 = _mm_loadu_si128((__m128i *)(s - 6 * pitch));
  p4 = _mm_loadu_si128((__m128i *)(s - 5 * pitch));
  p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch));
  p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
  p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch));
  p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch));
  q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch));
  q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
  q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
  q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
  q4 = _mm_loadu_si128((__m128i *)(s + 4 * pitch));
  q5 = _mm_loadu_si128((__m128i *)(s + 5 * pitch));
  q6 = _mm_loadu_si128((__m128i *)(s + 6 * pitch));
  q7 = _mm_loadu_si128((__m128i *)(s + 7 * pitch));

  // filter mask: ends up 0xff in the columns that are to be filtered at all.
  {
    const __m128i abs_p1p0 = abs_diff(p1, p0);
    const __m128i abs_q1q0 = abs_diff(q1, q0);
    const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
    const __m128i ff = _mm_cmpeq_epi8(zero, zero);
    __m128i abs_p0q0 = abs_diff(p0, q0);
    __m128i abs_p1q1 = abs_diff(p1, q1);
    __m128i work;
    max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);

    // |p0 - q0| * 2 (saturating) and |p1 - q1| / 2. The per-byte halving is
    // done with a 16-bit shift after clearing bit 0 of every byte (0xfe), so
    // that no bit moves into the neighbouring byte.
    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
    // 0xff where the sum exceeded blimit, 0 elsewhere.
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    // From here on `mask` is a running per-byte maximum; the 0xff written
    // above takes part in that maximum and is compared against limit with the
    // rest further down.
    mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2));
    mask = _mm_max_epu8(work, mask);
    work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2));
    mask = _mm_max_epu8(work, mask);
    // max <= limit  <=>  saturating (max - limit) == 0
    mask = _mm_subs_epu8(mask, limit_v);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  // flat:  0xff where p1..p3 are within 1 of p0, q1..q3 are within 1 of q0,
  //        and mask is set.
  // flat2: 0xff where additionally p4..p7 / q4..q7 are within 1 of p0 / q0.
  {
    __m128i work;
    work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
    flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
    work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0));
    flat = _mm_max_epu8(work, flat);
    // `work` now carries the p4/q4 term; it is folded into flat2 a few lines
    // further down, not into flat.
    work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0));
    flat = _mm_subs_epu8(flat, one);
    flat = _mm_cmpeq_epi8(flat, zero);
    flat = _mm_and_si128(flat, mask);
    flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0));
    flat2 = _mm_max_epu8(work, flat2);
    work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0));
    flat2 = _mm_max_epu8(work, flat2);
    work = _mm_max_epu8(abs_diff(p7, p0), abs_diff(q7, q0));
    flat2 = _mm_max_epu8(work, flat2);
    flat2 = _mm_subs_epu8(flat2, one);
    flat2 = _mm_cmpeq_epi8(flat2, zero);
    flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
  }

  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  // filter4
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
    const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);
    const __m128i ff = _mm_cmpeq_epi8(t4, t4);

    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;

    // Flip the top bit so the unsigned pixels can be used with the signed
    // saturating byte arithmetic below; the flip is undone when the results
    // are produced.
    op1 = _mm_xor_si128(p1, t80);
    op0 = _mm_xor_si128(p0, t80);
    oq0 = _mm_xor_si128(q0, t80);
    oq1 = _mm_xor_si128(q1, t80);

    // hev: 0xff where max(|p1 - p0|, |q1 - q0|) > thresh.
    hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
    // The (p1 - q1) term only contributes where hev is set.
    filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);

    work_a = _mm_subs_epi8(oq0, op0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    // (vpx_filter + 3 * (qs0 - ps0)) & mask
    filt = _mm_and_si128(filt, mask);
    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    // Filter1 >> 3
    // Per-byte arithmetic shift built from a 16-bit logical shift: 0x1f
    // removes the bits that crossed over from the neighbouring byte and 0xe0
    // restores the three sign bits in the bytes that were negative.
    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);
    oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);

    // Filter2 >> 3
    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);
    op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);

    // filt >> 1
    // (filter1 + 1) >> 1 with the same shift emulation (0x7f / 0x80 for a
    // one-bit shift); applied to p1 / q1 only where hev is clear.
    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);
    filt = _mm_andnot_si128(hev, filt);
    op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
    oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
    // loopfilter done

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // filter8
    // The tap sums are kept in 16-bit lanes, low and high 8 columns
    // separately. filter8_mask() shifts a sum right by 3 and keeps it only
    // where `flat` is set; elsewhere the filter4 result (or the unfiltered
    // p2 / q2) passes through.
    {
      const __m128i four = _mm_set1_epi16(4);
      const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
      const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
      const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
      const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
      const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
      const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
      const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
      const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);

      const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
      const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
      const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
      const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
      const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
      const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
      const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
      const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
      __m128i f8_lo, f8_hi;

      // 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4
      f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four),
                            _mm_add_epi16(p3_lo, p2_lo));
      f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo),
                            _mm_add_epi16(p2_lo, p1_lo));
      f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo);

      f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four),
                            _mm_add_epi16(p3_hi, p2_hi));
      f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi),
                            _mm_add_epi16(p2_hi, p1_hi));
      f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);

      op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi);

      // Every following output reuses the previous sum: filter_add2_sub2()
      // is handed the running total, two rows to add and two rows to drop.
      f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo);
      f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi);
      op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);

      f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo);
      f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi);
      op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);

      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo);
      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi);
      oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);

      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo);
      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi);
      oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);

      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo);
      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi);
      oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi);
    }

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // wide flat calculations
    // Same running-sum scheme over 16 rows. filter16_mask() shifts a sum
    // right by 4 and keeps it only where `flat2` is set; elsewhere the
    // narrower result computed above (or the unfiltered row) passes through.
    // Each row is stored as soon as it is final. The *_lo / *_hi copies are
    // all unpacked before the first row is overwritten, so the later sums
    // still use the original pixel values.
    {
      const __m128i eight = _mm_set1_epi16(8);
      const __m128i p7_lo = _mm_unpacklo_epi8(p7, zero);
      const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero);
      const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero);
      const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero);
      const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
      const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
      const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
      const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
      const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
      const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
      const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
      const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
      const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero);
      const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero);
      const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero);
      const __m128i q7_lo = _mm_unpacklo_epi8(q7, zero);

      const __m128i p7_hi = _mm_unpackhi_epi8(p7, zero);
      const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero);
      const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero);
      const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero);
      const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
      const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
      const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
      const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
      const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
      const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
      const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
      const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
      const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero);
      const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero);
      const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero);
      const __m128i q7_hi = _mm_unpackhi_epi8(q7, zero);

      __m128i f_lo;
      __m128i f_hi;

      // 7 * p7 + 2 * p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8
      f_lo = _mm_sub_epi16(_mm_slli_epi16(p7_lo, 3), p7_lo);  // p7 * 7
      f_lo =
          _mm_add_epi16(_mm_slli_epi16(p6_lo, 1), _mm_add_epi16(p4_lo, f_lo));
      f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo),
                           _mm_add_epi16(p2_lo, p1_lo));
      f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo);
      f_lo = _mm_add_epi16(_mm_add_epi16(p5_lo, eight), f_lo);

      f_hi = _mm_sub_epi16(_mm_slli_epi16(p7_hi, 3), p7_hi);  // p7 * 7
      f_hi =
          _mm_add_epi16(_mm_slli_epi16(p6_hi, 1), _mm_add_epi16(p4_hi, f_hi));
      f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi),
                           _mm_add_epi16(p2_hi, p1_hi));
      f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi);
      f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi);

      p6 = filter16_mask(&flat2, &p6, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s - 7 * pitch), p6);

      f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo);
      f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi);
      p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s - 6 * pitch), p5);

      f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo);
      f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi);
      p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s - 5 * pitch), p4);

      f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo);
      f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi);
      p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s - 4 * pitch), p3);

      // From p2 to q2 the fallback is the filter8 / filter4 result.
      f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo);
      f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi);
      op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s - 3 * pitch), op2);

      f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo);
      f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi);
      op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s - 2 * pitch), op1);

      f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo);
      f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi);
      op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s - 1 * pitch), op0);

      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo);
      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi);
      oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s - 0 * pitch), oq0);

      // All p7 terms have been dropped by now; from here each step adds
      // another q7 while dropping the oldest remaining p row.
      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo);
      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi);
      oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s + 1 * pitch), oq1);

      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo);
      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi);
      oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s + 2 * pitch), oq2);

      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo);
      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi);
      q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s + 3 * pitch), q3);

      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo);
      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi);
      q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s + 4 * pitch), q4);

      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo);
      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi);
      q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s + 5 * pitch), q5);

      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo);
      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi);
      q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s + 6 * pitch), q6);
    }
    // wide flat
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  }
}
924 
vpx_lpf_horizontal_8_sse2(unsigned char * s,int pitch,const unsigned char * blimit,const unsigned char * limit,const unsigned char * thresh)925 void vpx_lpf_horizontal_8_sse2(unsigned char *s, int pitch,
926                                const unsigned char *blimit,
927                                const unsigned char *limit,
928                                const unsigned char *thresh) {
929   DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
930   DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
931   DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
932   DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
933   DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
934   DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
935   const __m128i zero = _mm_set1_epi16(0);
936   const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit);
937   const __m128i limit_v = _mm_load_si128((const __m128i *)limit);
938   const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh);
939   __m128i mask, hev, flat;
940   __m128i p3, p2, p1, p0, q0, q1, q2, q3;
941   __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0;
942 
943   q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * pitch)),
944                             _mm_loadl_epi64((__m128i *)(s + 3 * pitch)));
945   q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * pitch)),
946                             _mm_loadl_epi64((__m128i *)(s + 2 * pitch)));
947   q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * pitch)),
948                             _mm_loadl_epi64((__m128i *)(s + 1 * pitch)));
949   q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * pitch)),
950                             _mm_loadl_epi64((__m128i *)(s - 0 * pitch)));
951   p1q1 = _mm_shuffle_epi32(q1p1, 78);
952   p0q0 = _mm_shuffle_epi32(q0p0, 78);
953 
954   {
955     // filter_mask and hev_mask
956     const __m128i one = _mm_set1_epi8(1);
957     const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
958     const __m128i ff = _mm_cmpeq_epi8(fe, fe);
959     __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
960     abs_p1p0 = abs_diff(q1p1, q0p0);
961     abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
962 
963     abs_p0q0 = abs_diff(q0p0, p0q0);
964     abs_p1q1 = abs_diff(q1p1, p1q1);
965     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
966     hev = _mm_subs_epu8(flat, thresh_v);
967     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
968 
969     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
970     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
971     mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
972     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
973     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
974     mask = _mm_max_epu8(abs_p1p0, mask);
975     // mask |= (abs(p1 - p0) > limit) * -1;
976     // mask |= (abs(q1 - q0) > limit) * -1;
977 
978     work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
979     mask = _mm_max_epu8(work, mask);
980     mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
981     mask = _mm_subs_epu8(mask, limit_v);
982     mask = _mm_cmpeq_epi8(mask, zero);
983 
984     // flat_mask4
985 
986     flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
987     flat = _mm_max_epu8(abs_p1p0, flat);
988     flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
989     flat = _mm_subs_epu8(flat, one);
990     flat = _mm_cmpeq_epi8(flat, zero);
991     flat = _mm_and_si128(flat, mask);
992   }
993 
994   {
995     const __m128i four = _mm_set1_epi16(4);
996     unsigned char *src = s;
997     {
998       __m128i workp_a, workp_b, workp_shft;
999       p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * pitch)),
1000                              zero);
1001       p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * pitch)),
1002                              zero);
1003       p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * pitch)),
1004                              zero);
1005       p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * pitch)),
1006                              zero);
1007       q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * pitch)),
1008                              zero);
1009       q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * pitch)),
1010                              zero);
1011       q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * pitch)),
1012                              zero);
1013       q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * pitch)),
1014                              zero);
1015 
1016       workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
1017       workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
1018       workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
1019       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1020       _mm_storel_epi64((__m128i *)&flat_op2[0],
1021                        _mm_packus_epi16(workp_shft, workp_shft));
1022 
1023       workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
1024       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1025       _mm_storel_epi64((__m128i *)&flat_op1[0],
1026                        _mm_packus_epi16(workp_shft, workp_shft));
1027 
1028       workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
1029       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
1030       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1031       _mm_storel_epi64((__m128i *)&flat_op0[0],
1032                        _mm_packus_epi16(workp_shft, workp_shft));
1033 
1034       workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
1035       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
1036       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1037       _mm_storel_epi64((__m128i *)&flat_oq0[0],
1038                        _mm_packus_epi16(workp_shft, workp_shft));
1039 
1040       workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
1041       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
1042       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1043       _mm_storel_epi64((__m128i *)&flat_oq1[0],
1044                        _mm_packus_epi16(workp_shft, workp_shft));
1045 
1046       workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
1047       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
1048       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1049       _mm_storel_epi64((__m128i *)&flat_oq2[0],
1050                        _mm_packus_epi16(workp_shft, workp_shft));
1051     }
1052   }
1053   // lp filter
1054   {
1055     const __m128i t4 = _mm_set1_epi8(4);
1056     const __m128i t3 = _mm_set1_epi8(3);
1057     const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
1058     const __m128i t1 = _mm_set1_epi8(0x1);
1059     const __m128i ps1 =
1060         _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * pitch)), t80);
1061     const __m128i ps0 =
1062         _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * pitch)), t80);
1063     const __m128i qs0 =
1064         _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * pitch)), t80);
1065     const __m128i qs1 =
1066         _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * pitch)), t80);
1067     __m128i filt;
1068     __m128i work_a;
1069     __m128i filter1, filter2;
1070 
1071     filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
1072     work_a = _mm_subs_epi8(qs0, ps0);
1073     filt = _mm_adds_epi8(filt, work_a);
1074     filt = _mm_adds_epi8(filt, work_a);
1075     filt = _mm_adds_epi8(filt, work_a);
1076     // (vpx_filter + 3 * (qs0 - ps0)) & mask
1077     filt = _mm_and_si128(filt, mask);
1078 
1079     filter1 = _mm_adds_epi8(filt, t4);
1080     filter2 = _mm_adds_epi8(filt, t3);
1081 
1082     // Filter1 >> 3
1083     filter1 = _mm_unpacklo_epi8(zero, filter1);
1084     filter1 = _mm_srai_epi16(filter1, 11);
1085     filter1 = _mm_packs_epi16(filter1, filter1);
1086 
1087     // Filter2 >> 3
1088     filter2 = _mm_unpacklo_epi8(zero, filter2);
1089     filter2 = _mm_srai_epi16(filter2, 11);
1090     filter2 = _mm_packs_epi16(filter2, zero);
1091 
1092     // filt >> 1
1093     filt = _mm_adds_epi8(filter1, t1);
1094     filt = _mm_unpacklo_epi8(zero, filt);
1095     filt = _mm_srai_epi16(filt, 9);
1096     filt = _mm_packs_epi16(filt, zero);
1097 
1098     filt = _mm_andnot_si128(hev, filt);
1099 
1100     work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
1101     q0 = _mm_loadl_epi64((__m128i *)flat_oq0);
1102     work_a = _mm_andnot_si128(flat, work_a);
1103     q0 = _mm_and_si128(flat, q0);
1104     q0 = _mm_or_si128(work_a, q0);
1105 
1106     work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
1107     q1 = _mm_loadl_epi64((__m128i *)flat_oq1);
1108     work_a = _mm_andnot_si128(flat, work_a);
1109     q1 = _mm_and_si128(flat, q1);
1110     q1 = _mm_or_si128(work_a, q1);
1111 
1112     work_a = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
1113     q2 = _mm_loadl_epi64((__m128i *)flat_oq2);
1114     work_a = _mm_andnot_si128(flat, work_a);
1115     q2 = _mm_and_si128(flat, q2);
1116     q2 = _mm_or_si128(work_a, q2);
1117 
1118     work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
1119     p0 = _mm_loadl_epi64((__m128i *)flat_op0);
1120     work_a = _mm_andnot_si128(flat, work_a);
1121     p0 = _mm_and_si128(flat, p0);
1122     p0 = _mm_or_si128(work_a, p0);
1123 
1124     work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
1125     p1 = _mm_loadl_epi64((__m128i *)flat_op1);
1126     work_a = _mm_andnot_si128(flat, work_a);
1127     p1 = _mm_and_si128(flat, p1);
1128     p1 = _mm_or_si128(work_a, p1);
1129 
1130     work_a = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
1131     p2 = _mm_loadl_epi64((__m128i *)flat_op2);
1132     work_a = _mm_andnot_si128(flat, work_a);
1133     p2 = _mm_and_si128(flat, p2);
1134     p2 = _mm_or_si128(work_a, p2);
1135 
1136     _mm_storel_epi64((__m128i *)(s - 3 * pitch), p2);
1137     _mm_storel_epi64((__m128i *)(s - 2 * pitch), p1);
1138     _mm_storel_epi64((__m128i *)(s - 1 * pitch), p0);
1139     _mm_storel_epi64((__m128i *)(s + 0 * pitch), q0);
1140     _mm_storel_epi64((__m128i *)(s + 1 * pitch), q1);
1141     _mm_storel_epi64((__m128i *)(s + 2 * pitch), q2);
1142   }
1143 }
1144 
vpx_lpf_horizontal_8_dual_sse2(uint8_t * s,int pitch,const uint8_t * blimit0,const uint8_t * limit0,const uint8_t * thresh0,const uint8_t * blimit1,const uint8_t * limit1,const uint8_t * thresh1)1145 void vpx_lpf_horizontal_8_dual_sse2(
1146     uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
1147     const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
1148     const uint8_t *thresh1) {
1149   DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
1150   DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
1151   DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
1152   DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
1153   DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
1154   DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
1155   const __m128i zero = _mm_set1_epi16(0);
1156   const __m128i blimit =
1157       _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)blimit0),
1158                          _mm_load_si128((const __m128i *)blimit1));
1159   const __m128i limit =
1160       _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)limit0),
1161                          _mm_load_si128((const __m128i *)limit1));
1162   const __m128i thresh =
1163       _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)thresh0),
1164                          _mm_load_si128((const __m128i *)thresh1));
1165 
1166   __m128i mask, hev, flat;
1167   __m128i p3, p2, p1, p0, q0, q1, q2, q3;
1168 
1169   p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch));
1170   p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
1171   p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch));
1172   p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch));
1173   q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch));
1174   q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
1175   q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
1176   q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
1177   {
1178     const __m128i abs_p1p0 =
1179         _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
1180     const __m128i abs_q1q0 =
1181         _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
1182     const __m128i one = _mm_set1_epi8(1);
1183     const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
1184     const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
1185     __m128i abs_p0q0 =
1186         _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
1187     __m128i abs_p1q1 =
1188         _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
1189     __m128i work;
1190 
1191     // filter_mask and hev_mask
1192     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
1193     hev = _mm_subs_epu8(flat, thresh);
1194     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
1195 
1196     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
1197     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
1198     mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
1199     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
1200     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
1201     mask = _mm_max_epu8(flat, mask);
1202     // mask |= (abs(p1 - p0) > limit) * -1;
1203     // mask |= (abs(q1 - q0) > limit) * -1;
1204     work = _mm_max_epu8(
1205         _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
1206         _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
1207     mask = _mm_max_epu8(work, mask);
1208     work = _mm_max_epu8(
1209         _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
1210         _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
1211     mask = _mm_max_epu8(work, mask);
1212     mask = _mm_subs_epu8(mask, limit);
1213     mask = _mm_cmpeq_epi8(mask, zero);
1214 
1215     // flat_mask4
1216     work = _mm_max_epu8(
1217         _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
1218         _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
1219     flat = _mm_max_epu8(work, flat);
1220     work = _mm_max_epu8(
1221         _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
1222         _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
1223     flat = _mm_max_epu8(work, flat);
1224     flat = _mm_subs_epu8(flat, one);
1225     flat = _mm_cmpeq_epi8(flat, zero);
1226     flat = _mm_and_si128(flat, mask);
1227   }
1228   {
1229     const __m128i four = _mm_set1_epi16(4);
1230     unsigned char *src = s;
1231     int i = 0;
1232 
1233     do {
1234       __m128i workp_a, workp_b, workp_shft;
1235       p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * pitch)),
1236                              zero);
1237       p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * pitch)),
1238                              zero);
1239       p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * pitch)),
1240                              zero);
1241       p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * pitch)),
1242                              zero);
1243       q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * pitch)),
1244                              zero);
1245       q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * pitch)),
1246                              zero);
1247       q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * pitch)),
1248                              zero);
1249       q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * pitch)),
1250                              zero);
1251 
1252       workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
1253       workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
1254       workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
1255       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1256       _mm_storel_epi64((__m128i *)&flat_op2[i * 8],
1257                        _mm_packus_epi16(workp_shft, workp_shft));
1258 
1259       workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
1260       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1261       _mm_storel_epi64((__m128i *)&flat_op1[i * 8],
1262                        _mm_packus_epi16(workp_shft, workp_shft));
1263 
1264       workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
1265       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
1266       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1267       _mm_storel_epi64((__m128i *)&flat_op0[i * 8],
1268                        _mm_packus_epi16(workp_shft, workp_shft));
1269 
1270       workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
1271       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
1272       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1273       _mm_storel_epi64((__m128i *)&flat_oq0[i * 8],
1274                        _mm_packus_epi16(workp_shft, workp_shft));
1275 
1276       workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
1277       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
1278       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1279       _mm_storel_epi64((__m128i *)&flat_oq1[i * 8],
1280                        _mm_packus_epi16(workp_shft, workp_shft));
1281 
1282       workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
1283       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
1284       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1285       _mm_storel_epi64((__m128i *)&flat_oq2[i * 8],
1286                        _mm_packus_epi16(workp_shft, workp_shft));
1287 
1288       src += 8;
1289     } while (++i < 2);
1290   }
1291   // lp filter
1292   {
1293     const __m128i t4 = _mm_set1_epi8(4);
1294     const __m128i t3 = _mm_set1_epi8(3);
1295     const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
1296     const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
1297     const __m128i t1f = _mm_set1_epi8(0x1f);
1298     const __m128i t1 = _mm_set1_epi8(0x1);
1299     const __m128i t7f = _mm_set1_epi8(0x7f);
1300 
1301     const __m128i ps1 =
1302         _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * pitch)), t80);
1303     const __m128i ps0 =
1304         _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * pitch)), t80);
1305     const __m128i qs0 =
1306         _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * pitch)), t80);
1307     const __m128i qs1 =
1308         _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * pitch)), t80);
1309     __m128i filt;
1310     __m128i work_a;
1311     __m128i filter1, filter2;
1312 
1313     filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
1314     work_a = _mm_subs_epi8(qs0, ps0);
1315     filt = _mm_adds_epi8(filt, work_a);
1316     filt = _mm_adds_epi8(filt, work_a);
1317     filt = _mm_adds_epi8(filt, work_a);
1318     // (vpx_filter + 3 * (qs0 - ps0)) & mask
1319     filt = _mm_and_si128(filt, mask);
1320 
1321     filter1 = _mm_adds_epi8(filt, t4);
1322     filter2 = _mm_adds_epi8(filt, t3);
1323 
1324     // Filter1 >> 3
1325     work_a = _mm_cmpgt_epi8(zero, filter1);
1326     filter1 = _mm_srli_epi16(filter1, 3);
1327     work_a = _mm_and_si128(work_a, te0);
1328     filter1 = _mm_and_si128(filter1, t1f);
1329     filter1 = _mm_or_si128(filter1, work_a);
1330 
1331     // Filter2 >> 3
1332     work_a = _mm_cmpgt_epi8(zero, filter2);
1333     filter2 = _mm_srli_epi16(filter2, 3);
1334     work_a = _mm_and_si128(work_a, te0);
1335     filter2 = _mm_and_si128(filter2, t1f);
1336     filter2 = _mm_or_si128(filter2, work_a);
1337 
1338     // filt >> 1
1339     filt = _mm_adds_epi8(filter1, t1);
1340     work_a = _mm_cmpgt_epi8(zero, filt);
1341     filt = _mm_srli_epi16(filt, 1);
1342     work_a = _mm_and_si128(work_a, t80);
1343     filt = _mm_and_si128(filt, t7f);
1344     filt = _mm_or_si128(filt, work_a);
1345 
1346     filt = _mm_andnot_si128(hev, filt);
1347 
1348     work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
1349     q0 = _mm_load_si128((__m128i *)flat_oq0);
1350     work_a = _mm_andnot_si128(flat, work_a);
1351     q0 = _mm_and_si128(flat, q0);
1352     q0 = _mm_or_si128(work_a, q0);
1353 
1354     work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
1355     q1 = _mm_load_si128((__m128i *)flat_oq1);
1356     work_a = _mm_andnot_si128(flat, work_a);
1357     q1 = _mm_and_si128(flat, q1);
1358     q1 = _mm_or_si128(work_a, q1);
1359 
1360     work_a = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
1361     q2 = _mm_load_si128((__m128i *)flat_oq2);
1362     work_a = _mm_andnot_si128(flat, work_a);
1363     q2 = _mm_and_si128(flat, q2);
1364     q2 = _mm_or_si128(work_a, q2);
1365 
1366     work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
1367     p0 = _mm_load_si128((__m128i *)flat_op0);
1368     work_a = _mm_andnot_si128(flat, work_a);
1369     p0 = _mm_and_si128(flat, p0);
1370     p0 = _mm_or_si128(work_a, p0);
1371 
1372     work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
1373     p1 = _mm_load_si128((__m128i *)flat_op1);
1374     work_a = _mm_andnot_si128(flat, work_a);
1375     p1 = _mm_and_si128(flat, p1);
1376     p1 = _mm_or_si128(work_a, p1);
1377 
1378     work_a = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
1379     p2 = _mm_load_si128((__m128i *)flat_op2);
1380     work_a = _mm_andnot_si128(flat, work_a);
1381     p2 = _mm_and_si128(flat, p2);
1382     p2 = _mm_or_si128(work_a, p2);
1383 
1384     _mm_storeu_si128((__m128i *)(s - 3 * pitch), p2);
1385     _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1);
1386     _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0);
1387     _mm_storeu_si128((__m128i *)(s + 0 * pitch), q0);
1388     _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1);
1389     _mm_storeu_si128((__m128i *)(s + 2 * pitch), q2);
1390   }
1391 }
1392 
// Applies the 4-tap loop filter across one horizontal edge, 16 columns wide.
// The columns are handled as two 8-column halves with separate thresholds.
//
//   s      - first byte of the row just below the edge (q0). Rows
//            s - 4 * pitch through s + 3 * pitch are read, 16 bytes each;
//            rows s - 2 * pitch through s + 1 * pitch are overwritten.
//   pitch  - distance in bytes between consecutive rows.
//   blimit0, limit0, thresh0 - thresholds for columns 0-7.
//   blimit1, limit1, thresh1 - thresholds for columns 8-15.
//
// Each threshold pointer is read with an aligned 128-bit load, so it must be
// 16-byte aligned with 16 readable bytes; only its low 8 bytes are used.
void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int pitch,
                                    const unsigned char *blimit0,
                                    const unsigned char *limit0,
                                    const unsigned char *thresh0,
                                    const unsigned char *blimit1,
                                    const unsigned char *limit1,
                                    const unsigned char *thresh1) {
  const __m128i zero = _mm_setzero_si128();
  // Low 8 lanes hold the first threshold set, high 8 lanes the second.
  const __m128i blimit_v =
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)blimit0),
                         _mm_load_si128((const __m128i *)blimit1));
  const __m128i limit_v =
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)limit0),
                         _mm_load_si128((const __m128i *)limit1));
  const __m128i thresh_v =
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)thresh0),
                         _mm_load_si128((const __m128i *)thresh1));
  // The eight rows around the edge: p3..p0 above it, q0..q3 below it.
  const __m128i p3 = _mm_loadu_si128((const __m128i *)(s - 4 * pitch));
  const __m128i p2 = _mm_loadu_si128((const __m128i *)(s - 3 * pitch));
  const __m128i p1 = _mm_loadu_si128((const __m128i *)(s - 2 * pitch));
  const __m128i p0 = _mm_loadu_si128((const __m128i *)(s - 1 * pitch));
  const __m128i q0 = _mm_loadu_si128((const __m128i *)s);
  const __m128i q1 = _mm_loadu_si128((const __m128i *)(s + 1 * pitch));
  const __m128i q2 = _mm_loadu_si128((const __m128i *)(s + 2 * pitch));
  const __m128i q3 = _mm_loadu_si128((const __m128i *)(s + 3 * pitch));
  __m128i mask, hev;

  // Build the per-pixel filter mask and high-edge-variance (hev) mask.
  {
    const __m128i ones = _mm_cmpeq_epi8(zero, zero);
    const __m128i drop_lsb = _mm_set1_epi8((int8_t)0xfe);
    // Unsigned |a - b|: one of the two saturating differences is always 0.
    const __m128i d_p1p0 =
        _mm_or_si128(_mm_subs_epu8(p0, p1), _mm_subs_epu8(p1, p0));
    const __m128i d_q1q0 =
        _mm_or_si128(_mm_subs_epu8(q0, q1), _mm_subs_epu8(q1, q0));
    const __m128i d_p0q0 =
        _mm_or_si128(_mm_subs_epu8(q0, p0), _mm_subs_epu8(p0, q0));
    const __m128i d_p1q1 =
        _mm_or_si128(_mm_subs_epu8(q1, p1), _mm_subs_epu8(p1, q1));
    const __m128i d_p2p1 =
        _mm_or_si128(_mm_subs_epu8(p1, p2), _mm_subs_epu8(p2, p1));
    const __m128i d_p3p2 =
        _mm_or_si128(_mm_subs_epu8(p2, p3), _mm_subs_epu8(p3, p2));
    const __m128i d_q2q1 =
        _mm_or_si128(_mm_subs_epu8(q1, q2), _mm_subs_epu8(q2, q1));
    const __m128i d_q3q2 =
        _mm_or_si128(_mm_subs_epu8(q2, q3), _mm_subs_epu8(q3, q2));
    // max(|p1 - p0|, |q1 - q0|); feeds both the hev and the limit test.
    const __m128i inner = _mm_max_epu8(d_p1p0, d_q1q0);
    // 2 * |p0 - q0| + |p1 - q1| / 2 with saturation. The low bit of every
    // byte is cleared first so the 16-bit shift moves only zeros between
    // neighbouring bytes.
    const __m128i edge =
        _mm_adds_epu8(_mm_adds_epu8(d_p0q0, d_p0q0),
                      _mm_srli_epi16(_mm_and_si128(d_p1q1, drop_lsb), 1));
    // 0xff where edge > blimit, 0 elsewhere.
    const __m128i over_blimit = _mm_xor_si128(
        _mm_cmpeq_epi8(_mm_subs_epu8(edge, blimit_v), zero), ones);
    const __m128i outer_p = _mm_max_epu8(d_p2p1, d_p3p2);
    const __m128i outer_q = _mm_max_epu8(d_q2q1, d_q3q2);
    // Largest neighbour difference, with lanes that broke blimit forced
    // to 255.
    const __m128i worst = _mm_max_epu8(_mm_max_epu8(outer_p, outer_q),
                                       _mm_max_epu8(inner, over_blimit));

    // hev: 0xff where max(|p1 - p0|, |q1 - q0|) > thresh.
    hev = _mm_xor_si128(
        _mm_cmpeq_epi8(_mm_subs_epu8(inner, thresh_v), zero), ones);
    // mask: 0xff where worst <= limit.
    mask = _mm_cmpeq_epi8(_mm_subs_epu8(worst, limit_v), zero);
  }

  // filter4
  {
    const __m128i sign = _mm_set1_epi8((int8_t)0x80);
    const __m128i four = _mm_set1_epi8(4);
    const __m128i three = _mm_set1_epi8(3);
    const __m128i one = _mm_set1_epi8(0x1);
    const __m128i sext3 = _mm_set1_epi8((int8_t)0xe0);
    const __m128i keep5 = _mm_set1_epi8(0x1f);
    const __m128i keep7 = _mm_set1_epi8(0x7f);
    // Flipping the top bit maps the unsigned pixels onto signed bytes.
    const __m128i ps1 = _mm_xor_si128(p1, sign);
    const __m128i ps0 = _mm_xor_si128(p0, sign);
    const __m128i qs0 = _mm_xor_si128(q0, sign);
    const __m128i qs1 = _mm_xor_si128(q1, sign);
    const __m128i step = _mm_subs_epi8(qs0, ps0);
    __m128i base, f1, f2, outer, neg;
    int k;

    // ((ps1 - qs1) & hev) + 3 * (qs0 - ps0), saturating after every add,
    // then gated by the filter mask.
    base = _mm_and_si128(hev, _mm_subs_epi8(ps1, qs1));
    for (k = 0; k < 3; ++k) {
      base = _mm_adds_epi8(base, step);
    }
    base = _mm_and_si128(mask, base);

    // There is no per-byte arithmetic shift, so shift the 16-bit lanes
    // logically, keep the bits that belong to each byte and OR the sign
    // extension back in for negative lanes.

    // f1 = (base + 4) >> 3
    f1 = _mm_adds_epi8(base, four);
    neg = _mm_and_si128(_mm_cmpgt_epi8(zero, f1), sext3);
    f1 = _mm_or_si128(neg, _mm_and_si128(_mm_srli_epi16(f1, 3), keep5));

    // f2 = (base + 3) >> 3
    f2 = _mm_adds_epi8(base, three);
    neg = _mm_and_si128(_mm_cmpgt_epi8(zero, f2), sext3);
    f2 = _mm_or_si128(neg, _mm_and_si128(_mm_srli_epi16(f2, 3), keep5));

    // outer = ((f1 + 1) >> 1) & ~hev, the adjustment for p1 and q1.
    outer = _mm_adds_epi8(f1, one);
    neg = _mm_and_si128(_mm_cmpgt_epi8(zero, outer), sign);
    outer = _mm_or_si128(neg, _mm_and_si128(_mm_srli_epi16(outer, 1), keep7));
    outer = _mm_andnot_si128(hev, outer);

    // Apply the adjustments and flip the sign bit back before storing.
    _mm_storeu_si128((__m128i *)(s - 2 * pitch),
                     _mm_xor_si128(_mm_adds_epi8(ps1, outer), sign));
    _mm_storeu_si128((__m128i *)(s - 1 * pitch),
                     _mm_xor_si128(_mm_adds_epi8(ps0, f2), sign));
    _mm_storeu_si128((__m128i *)(s + 0 * pitch),
                     _mm_xor_si128(_mm_subs_epi8(qs0, f1), sign));
    _mm_storeu_si128((__m128i *)(s + 1 * pitch),
                     _mm_xor_si128(_mm_subs_epi8(qs1, outer), sign));
  }
}
1528 
// Transposes two stacked 8x8 byte blocks into one block of 8 rows by 16
// columns.
//
//   in0, in1 - top-left byte of the upper and lower input block; each
//              contributes eight rows of 8 bytes, in_p bytes apart.
//   out      - top-left byte of the output; eight rows of 16 bytes are
//              written, out_p bytes apart.
//
// Output row j holds byte j of the eight in0 rows followed by byte j of the
// eight in1 rows. Every input row is loaded before the first store.
static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
                                 int in_p, unsigned char *out, int out_p) {
  // Stage 0: fetch the sixteen 8-byte input rows.
  const __m128i a0 = _mm_loadl_epi64((const __m128i *)(in0 + 0 * in_p));
  const __m128i a1 = _mm_loadl_epi64((const __m128i *)(in0 + 1 * in_p));
  const __m128i a2 = _mm_loadl_epi64((const __m128i *)(in0 + 2 * in_p));
  const __m128i a3 = _mm_loadl_epi64((const __m128i *)(in0 + 3 * in_p));
  const __m128i a4 = _mm_loadl_epi64((const __m128i *)(in0 + 4 * in_p));
  const __m128i a5 = _mm_loadl_epi64((const __m128i *)(in0 + 5 * in_p));
  const __m128i a6 = _mm_loadl_epi64((const __m128i *)(in0 + 6 * in_p));
  const __m128i a7 = _mm_loadl_epi64((const __m128i *)(in0 + 7 * in_p));
  const __m128i b0 = _mm_loadl_epi64((const __m128i *)(in1 + 0 * in_p));
  const __m128i b1 = _mm_loadl_epi64((const __m128i *)(in1 + 1 * in_p));
  const __m128i b2 = _mm_loadl_epi64((const __m128i *)(in1 + 2 * in_p));
  const __m128i b3 = _mm_loadl_epi64((const __m128i *)(in1 + 3 * in_p));
  const __m128i b4 = _mm_loadl_epi64((const __m128i *)(in1 + 4 * in_p));
  const __m128i b5 = _mm_loadl_epi64((const __m128i *)(in1 + 5 * in_p));
  const __m128i b6 = _mm_loadl_epi64((const __m128i *)(in1 + 6 * in_p));
  const __m128i b7 = _mm_loadl_epi64((const __m128i *)(in1 + 7 * in_p));

  // Stage 1: interleave the bytes of neighbouring rows.
  const __m128i a01 = _mm_unpacklo_epi8(a0, a1);
  const __m128i a23 = _mm_unpacklo_epi8(a2, a3);
  const __m128i a45 = _mm_unpacklo_epi8(a4, a5);
  const __m128i a67 = _mm_unpacklo_epi8(a6, a7);
  const __m128i b01 = _mm_unpacklo_epi8(b0, b1);
  const __m128i b23 = _mm_unpacklo_epi8(b2, b3);
  const __m128i b45 = _mm_unpacklo_epi8(b4, b5);
  const __m128i b67 = _mm_unpacklo_epi8(b6, b7);

  // Stage 2: interleave 16-bit pairs, giving four rows per 32-bit group.
  // "lo" covers input columns 0-3, "hi" covers columns 4-7.
  const __m128i a0123_lo = _mm_unpacklo_epi16(a01, a23);
  const __m128i a4567_lo = _mm_unpacklo_epi16(a45, a67);
  const __m128i b0123_lo = _mm_unpacklo_epi16(b01, b23);
  const __m128i b4567_lo = _mm_unpacklo_epi16(b45, b67);
  const __m128i a0123_hi = _mm_unpackhi_epi16(a01, a23);
  const __m128i a4567_hi = _mm_unpackhi_epi16(a45, a67);
  const __m128i b0123_hi = _mm_unpackhi_epi16(b01, b23);
  const __m128i b4567_hi = _mm_unpackhi_epi16(b45, b67);

  // Stage 3: interleave 32-bit groups; each register now holds two complete
  // 8-byte input columns.
  const __m128i a_c01 = _mm_unpacklo_epi32(a0123_lo, a4567_lo);
  const __m128i a_c23 = _mm_unpackhi_epi32(a0123_lo, a4567_lo);
  const __m128i a_c45 = _mm_unpacklo_epi32(a0123_hi, a4567_hi);
  const __m128i a_c67 = _mm_unpackhi_epi32(a0123_hi, a4567_hi);
  const __m128i b_c01 = _mm_unpacklo_epi32(b0123_lo, b4567_lo);
  const __m128i b_c23 = _mm_unpackhi_epi32(b0123_lo, b4567_lo);
  const __m128i b_c45 = _mm_unpacklo_epi32(b0123_hi, b4567_hi);
  const __m128i b_c67 = _mm_unpackhi_epi32(b0123_hi, b4567_hi);

  // Stage 4: pair each in0 column with the matching in1 column and store it
  // as one 16-byte output row.
  _mm_storeu_si128((__m128i *)(out + 0 * out_p),
                   _mm_unpacklo_epi64(a_c01, b_c01));
  _mm_storeu_si128((__m128i *)(out + 1 * out_p),
                   _mm_unpackhi_epi64(a_c01, b_c01));
  _mm_storeu_si128((__m128i *)(out + 2 * out_p),
                   _mm_unpacklo_epi64(a_c23, b_c23));
  _mm_storeu_si128((__m128i *)(out + 3 * out_p),
                   _mm_unpackhi_epi64(a_c23, b_c23));
  _mm_storeu_si128((__m128i *)(out + 4 * out_p),
                   _mm_unpacklo_epi64(a_c45, b_c45));
  _mm_storeu_si128((__m128i *)(out + 5 * out_p),
                   _mm_unpackhi_epi64(a_c45, b_c45));
  _mm_storeu_si128((__m128i *)(out + 6 * out_p),
                   _mm_unpacklo_epi64(a_c67, b_c67));
  _mm_storeu_si128((__m128i *)(out + 7 * out_p),
                   _mm_unpackhi_epi64(a_c67, b_c67));
}
1598 
transpose(unsigned char * src[],int in_p,unsigned char * dst[],int out_p,int num_8x8_to_transpose)1599 static INLINE void transpose(unsigned char *src[], int in_p,
1600                              unsigned char *dst[], int out_p,
1601                              int num_8x8_to_transpose) {
1602   int idx8x8 = 0;
1603   __m128i x0, x1, x2, x3, x4, x5, x6, x7;
1604   do {
1605     unsigned char *in = src[idx8x8];
1606     unsigned char *out = dst[idx8x8];
1607 
1608     x0 =
1609         _mm_loadl_epi64((__m128i *)(in + 0 * in_p));  // 00 01 02 03 04 05 06 07
1610     x1 =
1611         _mm_loadl_epi64((__m128i *)(in + 1 * in_p));  // 10 11 12 13 14 15 16 17
1612     // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
1613     x0 = _mm_unpacklo_epi8(x0, x1);
1614 
1615     x2 =
1616         _mm_loadl_epi64((__m128i *)(in + 2 * in_p));  // 20 21 22 23 24 25 26 27
1617     x3 =
1618         _mm_loadl_epi64((__m128i *)(in + 3 * in_p));  // 30 31 32 33 34 35 36 37
1619     // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
1620     x1 = _mm_unpacklo_epi8(x2, x3);
1621 
1622     x4 =
1623         _mm_loadl_epi64((__m128i *)(in + 4 * in_p));  // 40 41 42 43 44 45 46 47
1624     x5 =
1625         _mm_loadl_epi64((__m128i *)(in + 5 * in_p));  // 50 51 52 53 54 55 56 57
1626     // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
1627     x2 = _mm_unpacklo_epi8(x4, x5);
1628 
1629     x6 =
1630         _mm_loadl_epi64((__m128i *)(in + 6 * in_p));  // 60 61 62 63 64 65 66 67
1631     x7 =
1632         _mm_loadl_epi64((__m128i *)(in + 7 * in_p));  // 70 71 72 73 74 75 76 77
1633     // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
1634     x3 = _mm_unpacklo_epi8(x6, x7);
1635 
1636     // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
1637     x4 = _mm_unpacklo_epi16(x0, x1);
1638     // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
1639     x5 = _mm_unpacklo_epi16(x2, x3);
1640     // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
1641     x6 = _mm_unpacklo_epi32(x4, x5);
1642     mm_storelu(out + 0 * out_p, x6);  // 00 10 20 30 40 50 60 70
1643     mm_storehu(out + 1 * out_p, x6);  // 01 11 21 31 41 51 61 71
1644     // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
1645     x7 = _mm_unpackhi_epi32(x4, x5);
1646     mm_storelu(out + 2 * out_p, x7);  // 02 12 22 32 42 52 62 72
1647     mm_storehu(out + 3 * out_p, x7);  // 03 13 23 33 43 53 63 73
1648 
1649     // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
1650     x4 = _mm_unpackhi_epi16(x0, x1);
1651     // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
1652     x5 = _mm_unpackhi_epi16(x2, x3);
1653     // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
1654     x6 = _mm_unpacklo_epi32(x4, x5);
1655     mm_storelu(out + 4 * out_p, x6);  // 04 14 24 34 44 54 64 74
1656     mm_storehu(out + 5 * out_p, x6);  // 05 15 25 35 45 55 65 75
1657     // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
1658     x7 = _mm_unpackhi_epi32(x4, x5);
1659 
1660     mm_storelu(out + 6 * out_p, x7);  // 06 16 26 36 46 56 66 76
1661     mm_storehu(out + 7 * out_p, x7);  // 07 17 27 37 47 57 67 77
1662   } while (++idx8x8 < num_8x8_to_transpose);
1663 }
1664 
vpx_lpf_vertical_4_dual_sse2(uint8_t * s,int pitch,const uint8_t * blimit0,const uint8_t * limit0,const uint8_t * thresh0,const uint8_t * blimit1,const uint8_t * limit1,const uint8_t * thresh1)1665 void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0,
1666                                   const uint8_t *limit0, const uint8_t *thresh0,
1667                                   const uint8_t *blimit1, const uint8_t *limit1,
1668                                   const uint8_t *thresh1) {
1669   DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
1670   unsigned char *src[2];
1671   unsigned char *dst[2];
1672 
1673   // Transpose 8x16
1674   transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);
1675 
1676   // Loop filtering
1677   vpx_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
1678                                  blimit1, limit1, thresh1);
1679   src[0] = t_dst;
1680   src[1] = t_dst + 8;
1681   dst[0] = s - 4;
1682   dst[1] = s - 4 + pitch * 8;
1683 
1684   // Transpose back
1685   transpose(src, 16, dst, pitch, 2);
1686 }
1687 
vpx_lpf_vertical_8_sse2(unsigned char * s,int pitch,const unsigned char * blimit,const unsigned char * limit,const unsigned char * thresh)1688 void vpx_lpf_vertical_8_sse2(unsigned char *s, int pitch,
1689                              const unsigned char *blimit,
1690                              const unsigned char *limit,
1691                              const unsigned char *thresh) {
1692   DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 8]);
1693   unsigned char *src[1];
1694   unsigned char *dst[1];
1695 
1696   // Transpose 8x8
1697   src[0] = s - 4;
1698   dst[0] = t_dst;
1699 
1700   transpose(src, pitch, dst, 8, 1);
1701 
1702   // Loop filtering
1703   vpx_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh);
1704 
1705   src[0] = t_dst;
1706   dst[0] = s - 4;
1707 
1708   // Transpose back
1709   transpose(src, 8, dst, pitch, 1);
1710 }
1711 
vpx_lpf_vertical_8_dual_sse2(uint8_t * s,int pitch,const uint8_t * blimit0,const uint8_t * limit0,const uint8_t * thresh0,const uint8_t * blimit1,const uint8_t * limit1,const uint8_t * thresh1)1712 void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0,
1713                                   const uint8_t *limit0, const uint8_t *thresh0,
1714                                   const uint8_t *blimit1, const uint8_t *limit1,
1715                                   const uint8_t *thresh1) {
1716   DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
1717   unsigned char *src[2];
1718   unsigned char *dst[2];
1719 
1720   // Transpose 8x16
1721   transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);
1722 
1723   // Loop filtering
1724   vpx_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
1725                                  blimit1, limit1, thresh1);
1726   src[0] = t_dst;
1727   src[1] = t_dst + 8;
1728 
1729   dst[0] = s - 4;
1730   dst[1] = s - 4 + pitch * 8;
1731 
1732   // Transpose back
1733   transpose(src, 16, dst, pitch, 2);
1734 }
1735 
vpx_lpf_vertical_16_sse2(unsigned char * s,int pitch,const unsigned char * blimit,const unsigned char * limit,const unsigned char * thresh)1736 void vpx_lpf_vertical_16_sse2(unsigned char *s, int pitch,
1737                               const unsigned char *blimit,
1738                               const unsigned char *limit,
1739                               const unsigned char *thresh) {
1740   DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 16]);
1741   unsigned char *src[2];
1742   unsigned char *dst[2];
1743 
1744   src[0] = s - 8;
1745   src[1] = s;
1746   dst[0] = t_dst;
1747   dst[1] = t_dst + 8 * 8;
1748 
1749   // Transpose 16x8
1750   transpose(src, pitch, dst, 8, 2);
1751 
1752   // Loop filtering
1753   vpx_lpf_horizontal_16_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh);
1754 
1755   src[0] = t_dst;
1756   src[1] = t_dst + 8 * 8;
1757   dst[0] = s - 8;
1758   dst[1] = s;
1759 
1760   // Transpose back
1761   transpose(src, 8, dst, pitch, 2);
1762 }
1763 
vpx_lpf_vertical_16_dual_sse2(unsigned char * s,int pitch,const uint8_t * blimit,const uint8_t * limit,const uint8_t * thresh)1764 void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int pitch,
1765                                    const uint8_t *blimit, const uint8_t *limit,
1766                                    const uint8_t *thresh) {
1767   DECLARE_ALIGNED(16, unsigned char, t_dst[256]);
1768 
1769   // Transpose 16x16
1770   transpose8x16(s - 8, s - 8 + 8 * pitch, pitch, t_dst, 16);
1771   transpose8x16(s, s + 8 * pitch, pitch, t_dst + 8 * 16, 16);
1772 
1773   // Loop filtering
1774   vpx_lpf_horizontal_16_dual_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh);
1775 
1776   // Transpose back
1777   transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, pitch);
1778   transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * pitch, pitch);
1779 }
1780