1 /*
2  * Copyright (c) 2017, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
#include <string.h>
#include <tmmintrin.h>

#include "config/aom_dsp_rtcd.h"

#include "aom_dsp/intrapred_common.h"
17 
18 // -----------------------------------------------------------------------------
19 // PAETH_PRED
20 
21 // Return 8 16-bit pixels in one row
paeth_8x1_pred(const __m128i * left,const __m128i * top,const __m128i * topleft)22 static INLINE __m128i paeth_8x1_pred(const __m128i *left, const __m128i *top,
23                                      const __m128i *topleft) {
24   const __m128i base = _mm_sub_epi16(_mm_add_epi16(*top, *left), *topleft);
25 
26   __m128i pl = _mm_abs_epi16(_mm_sub_epi16(base, *left));
27   __m128i pt = _mm_abs_epi16(_mm_sub_epi16(base, *top));
28   __m128i ptl = _mm_abs_epi16(_mm_sub_epi16(base, *topleft));
29 
30   __m128i mask1 = _mm_cmpgt_epi16(pl, pt);
31   mask1 = _mm_or_si128(mask1, _mm_cmpgt_epi16(pl, ptl));
32   __m128i mask2 = _mm_cmpgt_epi16(pt, ptl);
33 
34   pl = _mm_andnot_si128(mask1, *left);
35 
36   ptl = _mm_and_si128(mask2, *topleft);
37   pt = _mm_andnot_si128(mask2, *top);
38   pt = _mm_or_si128(pt, ptl);
39   pt = _mm_and_si128(mask1, pt);
40 
41   return _mm_or_si128(pl, pt);
42 }
43 
aom_paeth_predictor_4x4_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)44 void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
45                                    const uint8_t *above, const uint8_t *left) {
46   __m128i l = _mm_loadl_epi64((const __m128i *)left);
47   const __m128i t = _mm_loadl_epi64((const __m128i *)above);
48   const __m128i zero = _mm_setzero_si128();
49   const __m128i t16 = _mm_unpacklo_epi8(t, zero);
50   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
51   __m128i rep = _mm_set1_epi16(0x8000);
52   const __m128i one = _mm_set1_epi16(1);
53 
54   int i;
55   for (i = 0; i < 4; ++i) {
56     const __m128i l16 = _mm_shuffle_epi8(l, rep);
57     const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
58 
59     *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
60     dst += stride;
61     rep = _mm_add_epi16(rep, one);
62   }
63 }
64 
aom_paeth_predictor_4x8_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)65 void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
66                                    const uint8_t *above, const uint8_t *left) {
67   __m128i l = _mm_loadl_epi64((const __m128i *)left);
68   const __m128i t = _mm_loadl_epi64((const __m128i *)above);
69   const __m128i zero = _mm_setzero_si128();
70   const __m128i t16 = _mm_unpacklo_epi8(t, zero);
71   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
72   __m128i rep = _mm_set1_epi16(0x8000);
73   const __m128i one = _mm_set1_epi16(1);
74 
75   int i;
76   for (i = 0; i < 8; ++i) {
77     const __m128i l16 = _mm_shuffle_epi8(l, rep);
78     const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
79 
80     *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
81     dst += stride;
82     rep = _mm_add_epi16(rep, one);
83   }
84 }
85 
aom_paeth_predictor_4x16_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)86 void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
87                                     const uint8_t *above, const uint8_t *left) {
88   __m128i l = _mm_load_si128((const __m128i *)left);
89   const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
90   const __m128i zero = _mm_setzero_si128();
91   const __m128i t16 = _mm_unpacklo_epi8(t, zero);
92   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
93   __m128i rep = _mm_set1_epi16(0x8000);
94   const __m128i one = _mm_set1_epi16(1);
95 
96   for (int i = 0; i < 16; ++i) {
97     const __m128i l16 = _mm_shuffle_epi8(l, rep);
98     const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
99 
100     *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
101     dst += stride;
102     rep = _mm_add_epi16(rep, one);
103   }
104 }
105 
aom_paeth_predictor_8x4_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)106 void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
107                                    const uint8_t *above, const uint8_t *left) {
108   __m128i l = _mm_loadl_epi64((const __m128i *)left);
109   const __m128i t = _mm_loadl_epi64((const __m128i *)above);
110   const __m128i zero = _mm_setzero_si128();
111   const __m128i t16 = _mm_unpacklo_epi8(t, zero);
112   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
113   __m128i rep = _mm_set1_epi16(0x8000);
114   const __m128i one = _mm_set1_epi16(1);
115 
116   int i;
117   for (i = 0; i < 4; ++i) {
118     const __m128i l16 = _mm_shuffle_epi8(l, rep);
119     const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
120 
121     _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
122     dst += stride;
123     rep = _mm_add_epi16(rep, one);
124   }
125 }
126 
aom_paeth_predictor_8x8_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)127 void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
128                                    const uint8_t *above, const uint8_t *left) {
129   __m128i l = _mm_loadl_epi64((const __m128i *)left);
130   const __m128i t = _mm_loadl_epi64((const __m128i *)above);
131   const __m128i zero = _mm_setzero_si128();
132   const __m128i t16 = _mm_unpacklo_epi8(t, zero);
133   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
134   __m128i rep = _mm_set1_epi16(0x8000);
135   const __m128i one = _mm_set1_epi16(1);
136 
137   int i;
138   for (i = 0; i < 8; ++i) {
139     const __m128i l16 = _mm_shuffle_epi8(l, rep);
140     const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
141 
142     _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
143     dst += stride;
144     rep = _mm_add_epi16(rep, one);
145   }
146 }
147 
aom_paeth_predictor_8x16_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)148 void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
149                                     const uint8_t *above, const uint8_t *left) {
150   __m128i l = _mm_load_si128((const __m128i *)left);
151   const __m128i t = _mm_loadl_epi64((const __m128i *)above);
152   const __m128i zero = _mm_setzero_si128();
153   const __m128i t16 = _mm_unpacklo_epi8(t, zero);
154   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
155   __m128i rep = _mm_set1_epi16(0x8000);
156   const __m128i one = _mm_set1_epi16(1);
157 
158   int i;
159   for (i = 0; i < 16; ++i) {
160     const __m128i l16 = _mm_shuffle_epi8(l, rep);
161     const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
162 
163     _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
164     dst += stride;
165     rep = _mm_add_epi16(rep, one);
166   }
167 }
168 
aom_paeth_predictor_8x32_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)169 void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
170                                     const uint8_t *above, const uint8_t *left) {
171   const __m128i t = _mm_loadl_epi64((const __m128i *)above);
172   const __m128i zero = _mm_setzero_si128();
173   const __m128i t16 = _mm_unpacklo_epi8(t, zero);
174   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
175   const __m128i one = _mm_set1_epi16(1);
176 
177   for (int j = 0; j < 2; ++j) {
178     const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
179     __m128i rep = _mm_set1_epi16(0x8000);
180     for (int i = 0; i < 16; ++i) {
181       const __m128i l16 = _mm_shuffle_epi8(l, rep);
182       const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
183 
184       _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
185       dst += stride;
186       rep = _mm_add_epi16(rep, one);
187     }
188   }
189 }
190 
191 // Return 16 8-bit pixels in one row
paeth_16x1_pred(const __m128i * left,const __m128i * top0,const __m128i * top1,const __m128i * topleft)192 static INLINE __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0,
193                                       const __m128i *top1,
194                                       const __m128i *topleft) {
195   const __m128i p0 = paeth_8x1_pred(left, top0, topleft);
196   const __m128i p1 = paeth_8x1_pred(left, top1, topleft);
197   return _mm_packus_epi16(p0, p1);
198 }
199 
aom_paeth_predictor_16x4_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)200 void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
201                                     const uint8_t *above, const uint8_t *left) {
202   __m128i l = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
203   const __m128i t = _mm_load_si128((const __m128i *)above);
204   const __m128i zero = _mm_setzero_si128();
205   const __m128i top0 = _mm_unpacklo_epi8(t, zero);
206   const __m128i top1 = _mm_unpackhi_epi8(t, zero);
207   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
208   __m128i rep = _mm_set1_epi16(0x8000);
209   const __m128i one = _mm_set1_epi16(1);
210 
211   for (int i = 0; i < 4; ++i) {
212     const __m128i l16 = _mm_shuffle_epi8(l, rep);
213     const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
214 
215     _mm_store_si128((__m128i *)dst, row);
216     dst += stride;
217     rep = _mm_add_epi16(rep, one);
218   }
219 }
220 
aom_paeth_predictor_16x8_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)221 void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
222                                     const uint8_t *above, const uint8_t *left) {
223   __m128i l = _mm_loadl_epi64((const __m128i *)left);
224   const __m128i t = _mm_load_si128((const __m128i *)above);
225   const __m128i zero = _mm_setzero_si128();
226   const __m128i top0 = _mm_unpacklo_epi8(t, zero);
227   const __m128i top1 = _mm_unpackhi_epi8(t, zero);
228   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
229   __m128i rep = _mm_set1_epi16(0x8000);
230   const __m128i one = _mm_set1_epi16(1);
231 
232   int i;
233   for (i = 0; i < 8; ++i) {
234     const __m128i l16 = _mm_shuffle_epi8(l, rep);
235     const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
236 
237     _mm_store_si128((__m128i *)dst, row);
238     dst += stride;
239     rep = _mm_add_epi16(rep, one);
240   }
241 }
242 
aom_paeth_predictor_16x16_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)243 void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
244                                      const uint8_t *above,
245                                      const uint8_t *left) {
246   __m128i l = _mm_load_si128((const __m128i *)left);
247   const __m128i t = _mm_load_si128((const __m128i *)above);
248   const __m128i zero = _mm_setzero_si128();
249   const __m128i top0 = _mm_unpacklo_epi8(t, zero);
250   const __m128i top1 = _mm_unpackhi_epi8(t, zero);
251   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
252   __m128i rep = _mm_set1_epi16(0x8000);
253   const __m128i one = _mm_set1_epi16(1);
254 
255   int i;
256   for (i = 0; i < 16; ++i) {
257     const __m128i l16 = _mm_shuffle_epi8(l, rep);
258     const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
259 
260     _mm_store_si128((__m128i *)dst, row);
261     dst += stride;
262     rep = _mm_add_epi16(rep, one);
263   }
264 }
265 
aom_paeth_predictor_16x32_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)266 void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
267                                      const uint8_t *above,
268                                      const uint8_t *left) {
269   __m128i l = _mm_load_si128((const __m128i *)left);
270   const __m128i t = _mm_load_si128((const __m128i *)above);
271   const __m128i zero = _mm_setzero_si128();
272   const __m128i top0 = _mm_unpacklo_epi8(t, zero);
273   const __m128i top1 = _mm_unpackhi_epi8(t, zero);
274   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
275   __m128i rep = _mm_set1_epi16(0x8000);
276   const __m128i one = _mm_set1_epi16(1);
277   __m128i l16;
278 
279   int i;
280   for (i = 0; i < 16; ++i) {
281     l16 = _mm_shuffle_epi8(l, rep);
282     const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
283 
284     _mm_store_si128((__m128i *)dst, row);
285     dst += stride;
286     rep = _mm_add_epi16(rep, one);
287   }
288 
289   l = _mm_load_si128((const __m128i *)(left + 16));
290   rep = _mm_set1_epi16(0x8000);
291   for (i = 0; i < 16; ++i) {
292     l16 = _mm_shuffle_epi8(l, rep);
293     const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
294 
295     _mm_store_si128((__m128i *)dst, row);
296     dst += stride;
297     rep = _mm_add_epi16(rep, one);
298   }
299 }
300 
aom_paeth_predictor_16x64_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)301 void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
302                                      const uint8_t *above,
303                                      const uint8_t *left) {
304   const __m128i t = _mm_load_si128((const __m128i *)above);
305   const __m128i zero = _mm_setzero_si128();
306   const __m128i top0 = _mm_unpacklo_epi8(t, zero);
307   const __m128i top1 = _mm_unpackhi_epi8(t, zero);
308   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
309   const __m128i one = _mm_set1_epi16(1);
310 
311   for (int j = 0; j < 4; ++j) {
312     const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
313     __m128i rep = _mm_set1_epi16(0x8000);
314     for (int i = 0; i < 16; ++i) {
315       const __m128i l16 = _mm_shuffle_epi8(l, rep);
316       const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
317       _mm_store_si128((__m128i *)dst, row);
318       dst += stride;
319       rep = _mm_add_epi16(rep, one);
320     }
321   }
322 }
323 
aom_paeth_predictor_32x8_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)324 void aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
325                                     const uint8_t *above, const uint8_t *left) {
326   const __m128i a = _mm_load_si128((const __m128i *)above);
327   const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
328   const __m128i zero = _mm_setzero_si128();
329   const __m128i al = _mm_unpacklo_epi8(a, zero);
330   const __m128i ah = _mm_unpackhi_epi8(a, zero);
331   const __m128i bl = _mm_unpacklo_epi8(b, zero);
332   const __m128i bh = _mm_unpackhi_epi8(b, zero);
333 
334   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
335   __m128i rep = _mm_set1_epi16(0x8000);
336   const __m128i one = _mm_set1_epi16(1);
337   const __m128i l = _mm_loadl_epi64((const __m128i *)left);
338   __m128i l16;
339 
340   for (int i = 0; i < 8; ++i) {
341     l16 = _mm_shuffle_epi8(l, rep);
342     const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
343     const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
344 
345     _mm_store_si128((__m128i *)dst, r32l);
346     _mm_store_si128((__m128i *)(dst + 16), r32h);
347     dst += stride;
348     rep = _mm_add_epi16(rep, one);
349   }
350 }
351 
aom_paeth_predictor_32x16_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)352 void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
353                                      const uint8_t *above,
354                                      const uint8_t *left) {
355   const __m128i a = _mm_load_si128((const __m128i *)above);
356   const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
357   const __m128i zero = _mm_setzero_si128();
358   const __m128i al = _mm_unpacklo_epi8(a, zero);
359   const __m128i ah = _mm_unpackhi_epi8(a, zero);
360   const __m128i bl = _mm_unpacklo_epi8(b, zero);
361   const __m128i bh = _mm_unpackhi_epi8(b, zero);
362 
363   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
364   __m128i rep = _mm_set1_epi16(0x8000);
365   const __m128i one = _mm_set1_epi16(1);
366   __m128i l = _mm_load_si128((const __m128i *)left);
367   __m128i l16;
368 
369   int i;
370   for (i = 0; i < 16; ++i) {
371     l16 = _mm_shuffle_epi8(l, rep);
372     const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
373     const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
374 
375     _mm_store_si128((__m128i *)dst, r32l);
376     _mm_store_si128((__m128i *)(dst + 16), r32h);
377     dst += stride;
378     rep = _mm_add_epi16(rep, one);
379   }
380 }
381 
aom_paeth_predictor_32x32_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)382 void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
383                                      const uint8_t *above,
384                                      const uint8_t *left) {
385   const __m128i a = _mm_load_si128((const __m128i *)above);
386   const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
387   const __m128i zero = _mm_setzero_si128();
388   const __m128i al = _mm_unpacklo_epi8(a, zero);
389   const __m128i ah = _mm_unpackhi_epi8(a, zero);
390   const __m128i bl = _mm_unpacklo_epi8(b, zero);
391   const __m128i bh = _mm_unpackhi_epi8(b, zero);
392 
393   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
394   __m128i rep = _mm_set1_epi16(0x8000);
395   const __m128i one = _mm_set1_epi16(1);
396   __m128i l = _mm_load_si128((const __m128i *)left);
397   __m128i l16;
398 
399   int i;
400   for (i = 0; i < 16; ++i) {
401     l16 = _mm_shuffle_epi8(l, rep);
402     const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
403     const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
404 
405     _mm_store_si128((__m128i *)dst, r32l);
406     _mm_store_si128((__m128i *)(dst + 16), r32h);
407     dst += stride;
408     rep = _mm_add_epi16(rep, one);
409   }
410 
411   rep = _mm_set1_epi16(0x8000);
412   l = _mm_load_si128((const __m128i *)(left + 16));
413   for (i = 0; i < 16; ++i) {
414     l16 = _mm_shuffle_epi8(l, rep);
415     const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
416     const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
417 
418     _mm_store_si128((__m128i *)dst, r32l);
419     _mm_store_si128((__m128i *)(dst + 16), r32h);
420     dst += stride;
421     rep = _mm_add_epi16(rep, one);
422   }
423 }
424 
aom_paeth_predictor_32x64_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)425 void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
426                                      const uint8_t *above,
427                                      const uint8_t *left) {
428   const __m128i a = _mm_load_si128((const __m128i *)above);
429   const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
430   const __m128i zero = _mm_setzero_si128();
431   const __m128i al = _mm_unpacklo_epi8(a, zero);
432   const __m128i ah = _mm_unpackhi_epi8(a, zero);
433   const __m128i bl = _mm_unpacklo_epi8(b, zero);
434   const __m128i bh = _mm_unpackhi_epi8(b, zero);
435 
436   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
437   const __m128i one = _mm_set1_epi16(1);
438   __m128i l16;
439 
440   int i, j;
441   for (j = 0; j < 4; ++j) {
442     const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
443     __m128i rep = _mm_set1_epi16(0x8000);
444     for (i = 0; i < 16; ++i) {
445       l16 = _mm_shuffle_epi8(l, rep);
446       const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
447       const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
448 
449       _mm_store_si128((__m128i *)dst, r32l);
450       _mm_store_si128((__m128i *)(dst + 16), r32h);
451       dst += stride;
452       rep = _mm_add_epi16(rep, one);
453     }
454   }
455 }
456 
aom_paeth_predictor_64x32_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)457 void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
458                                      const uint8_t *above,
459                                      const uint8_t *left) {
460   const __m128i a = _mm_load_si128((const __m128i *)above);
461   const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
462   const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
463   const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
464   const __m128i zero = _mm_setzero_si128();
465   const __m128i al = _mm_unpacklo_epi8(a, zero);
466   const __m128i ah = _mm_unpackhi_epi8(a, zero);
467   const __m128i bl = _mm_unpacklo_epi8(b, zero);
468   const __m128i bh = _mm_unpackhi_epi8(b, zero);
469   const __m128i cl = _mm_unpacklo_epi8(c, zero);
470   const __m128i ch = _mm_unpackhi_epi8(c, zero);
471   const __m128i dl = _mm_unpacklo_epi8(d, zero);
472   const __m128i dh = _mm_unpackhi_epi8(d, zero);
473 
474   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
475   const __m128i one = _mm_set1_epi16(1);
476   __m128i l16;
477 
478   int i, j;
479   for (j = 0; j < 2; ++j) {
480     const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
481     __m128i rep = _mm_set1_epi16(0x8000);
482     for (i = 0; i < 16; ++i) {
483       l16 = _mm_shuffle_epi8(l, rep);
484       const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
485       const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
486       const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
487       const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
488 
489       _mm_store_si128((__m128i *)dst, r0);
490       _mm_store_si128((__m128i *)(dst + 16), r1);
491       _mm_store_si128((__m128i *)(dst + 32), r2);
492       _mm_store_si128((__m128i *)(dst + 48), r3);
493       dst += stride;
494       rep = _mm_add_epi16(rep, one);
495     }
496   }
497 }
498 
aom_paeth_predictor_64x64_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)499 void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
500                                      const uint8_t *above,
501                                      const uint8_t *left) {
502   const __m128i a = _mm_load_si128((const __m128i *)above);
503   const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
504   const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
505   const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
506   const __m128i zero = _mm_setzero_si128();
507   const __m128i al = _mm_unpacklo_epi8(a, zero);
508   const __m128i ah = _mm_unpackhi_epi8(a, zero);
509   const __m128i bl = _mm_unpacklo_epi8(b, zero);
510   const __m128i bh = _mm_unpackhi_epi8(b, zero);
511   const __m128i cl = _mm_unpacklo_epi8(c, zero);
512   const __m128i ch = _mm_unpackhi_epi8(c, zero);
513   const __m128i dl = _mm_unpacklo_epi8(d, zero);
514   const __m128i dh = _mm_unpackhi_epi8(d, zero);
515 
516   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
517   const __m128i one = _mm_set1_epi16(1);
518   __m128i l16;
519 
520   int i, j;
521   for (j = 0; j < 4; ++j) {
522     const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
523     __m128i rep = _mm_set1_epi16(0x8000);
524     for (i = 0; i < 16; ++i) {
525       l16 = _mm_shuffle_epi8(l, rep);
526       const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
527       const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
528       const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
529       const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
530 
531       _mm_store_si128((__m128i *)dst, r0);
532       _mm_store_si128((__m128i *)(dst + 16), r1);
533       _mm_store_si128((__m128i *)(dst + 32), r2);
534       _mm_store_si128((__m128i *)(dst + 48), r3);
535       dst += stride;
536       rep = _mm_add_epi16(rep, one);
537     }
538   }
539 }
540 
aom_paeth_predictor_64x16_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)541 void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
542                                      const uint8_t *above,
543                                      const uint8_t *left) {
544   const __m128i a = _mm_load_si128((const __m128i *)above);
545   const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
546   const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
547   const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
548   const __m128i zero = _mm_setzero_si128();
549   const __m128i al = _mm_unpacklo_epi8(a, zero);
550   const __m128i ah = _mm_unpackhi_epi8(a, zero);
551   const __m128i bl = _mm_unpacklo_epi8(b, zero);
552   const __m128i bh = _mm_unpackhi_epi8(b, zero);
553   const __m128i cl = _mm_unpacklo_epi8(c, zero);
554   const __m128i ch = _mm_unpackhi_epi8(c, zero);
555   const __m128i dl = _mm_unpacklo_epi8(d, zero);
556   const __m128i dh = _mm_unpackhi_epi8(d, zero);
557 
558   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
559   const __m128i one = _mm_set1_epi16(1);
560   __m128i l16;
561 
562   int i;
563   const __m128i l = _mm_load_si128((const __m128i *)left);
564   __m128i rep = _mm_set1_epi16(0x8000);
565   for (i = 0; i < 16; ++i) {
566     l16 = _mm_shuffle_epi8(l, rep);
567     const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
568     const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
569     const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
570     const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
571 
572     _mm_store_si128((__m128i *)dst, r0);
573     _mm_store_si128((__m128i *)(dst + 16), r1);
574     _mm_store_si128((__m128i *)(dst + 32), r2);
575     _mm_store_si128((__m128i *)(dst + 48), r3);
576     dst += stride;
577     rep = _mm_add_epi16(rep, one);
578   }
579 }
580 
581 // -----------------------------------------------------------------------------
582 // SMOOTH_PRED
583 
584 // pixels[0]: above and below_pred interleave vector
585 // pixels[1]: left vector
586 // pixels[2]: right_pred vector
load_pixel_w4(const uint8_t * above,const uint8_t * left,int height,__m128i * pixels)587 static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
588                                  int height, __m128i *pixels) {
589   __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
590   if (height == 4)
591     pixels[1] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
592   else if (height == 8)
593     pixels[1] = _mm_loadl_epi64(((const __m128i *)left));
594   else
595     pixels[1] = _mm_loadu_si128(((const __m128i *)left));
596 
597   pixels[2] = _mm_set1_epi16((uint16_t)above[3]);
598 
599   const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
600   const __m128i zero = _mm_setzero_si128();
601   d = _mm_unpacklo_epi8(d, zero);
602   pixels[0] = _mm_unpacklo_epi16(d, bp);
603 }
604 
605 // weight_h[0]: weight_h vector
606 // weight_h[1]: scale - weight_h vector
607 // weight_h[2]: same as [0], second half for height = 16 only
608 // weight_h[3]: same as [1], second half for height = 16 only
609 // weight_w[0]: weights_w and scale - weights_w interleave vector
load_weight_w4(const uint8_t * weight_array,int height,__m128i * weight_h,__m128i * weight_w)610 static INLINE void load_weight_w4(const uint8_t *weight_array, int height,
611                                   __m128i *weight_h, __m128i *weight_w) {
612   const __m128i zero = _mm_setzero_si128();
613   const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
614   const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]);
615   weight_h[0] = _mm_unpacklo_epi8(t, zero);
616   weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
617   weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
618 
619   if (height == 8) {
620     const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]);
621     weight_h[0] = _mm_unpacklo_epi8(weight, zero);
622     weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
623   } else if (height == 16) {
624     const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
625     weight_h[0] = _mm_unpacklo_epi8(weight, zero);
626     weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
627     weight_h[2] = _mm_unpackhi_epi8(weight, zero);
628     weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
629   }
630 }
631 
smooth_pred_4xh(const __m128i * pixel,const __m128i * wh,const __m128i * ww,int h,uint8_t * dst,ptrdiff_t stride,int second_half)632 static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *wh,
633                                    const __m128i *ww, int h, uint8_t *dst,
634                                    ptrdiff_t stride, int second_half) {
635   const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
636   const __m128i one = _mm_set1_epi16(1);
637   const __m128i inc = _mm_set1_epi16(0x202);
638   const __m128i gat = _mm_set1_epi32(0xc080400);
639   __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000);
640   __m128i d = _mm_set1_epi16(0x100);
641 
642   for (int i = 0; i < h; ++i) {
643     const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
644     const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
645     const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
646     __m128i s = _mm_madd_epi16(pixel[0], wh_sc);
647 
648     __m128i b = _mm_shuffle_epi8(pixel[1], rep);
649     b = _mm_unpacklo_epi16(b, pixel[2]);
650     __m128i sum = _mm_madd_epi16(b, ww[0]);
651 
652     sum = _mm_add_epi32(s, sum);
653     sum = _mm_add_epi32(sum, round);
654     sum = _mm_srai_epi32(sum, 1 + sm_weight_log2_scale);
655 
656     sum = _mm_shuffle_epi8(sum, gat);
657     *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
658     dst += stride;
659 
660     rep = _mm_add_epi16(rep, one);
661     d = _mm_add_epi16(d, inc);
662   }
663 }
664 
aom_smooth_predictor_4x4_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)665 void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
666                                     const uint8_t *above, const uint8_t *left) {
667   __m128i pixels[3];
668   load_pixel_w4(above, left, 4, pixels);
669 
670   __m128i wh[4], ww[2];
671   load_weight_w4(sm_weight_arrays, 4, wh, ww);
672 
673   smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0);
674 }
675 
aom_smooth_predictor_4x8_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)676 void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
677                                     const uint8_t *above, const uint8_t *left) {
678   __m128i pixels[3];
679   load_pixel_w4(above, left, 8, pixels);
680 
681   __m128i wh[4], ww[2];
682   load_weight_w4(sm_weight_arrays, 8, wh, ww);
683 
684   smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
685 }
686 
aom_smooth_predictor_4x16_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)687 void aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
688                                      const uint8_t *above,
689                                      const uint8_t *left) {
690   __m128i pixels[3];
691   load_pixel_w4(above, left, 16, pixels);
692 
693   __m128i wh[4], ww[2];
694   load_weight_w4(sm_weight_arrays, 16, wh, ww);
695 
696   smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
697   dst += stride << 3;
698   smooth_pred_4xh(pixels, &wh[2], ww, 8, dst, stride, 1);
699 }
700 
701 // pixels[0]: above and below_pred interleave vector, first half
702 // pixels[1]: above and below_pred interleave vector, second half
703 // pixels[2]: left vector
704 // pixels[3]: right_pred vector
705 // pixels[4]: above and below_pred interleave vector, first half
706 // pixels[5]: above and below_pred interleave vector, second half
707 // pixels[6]: left vector + 16
708 // pixels[7]: right_pred vector
load_pixel_w8(const uint8_t * above,const uint8_t * left,int height,__m128i * pixels)709 static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left,
710                                  int height, __m128i *pixels) {
711   const __m128i zero = _mm_setzero_si128();
712   const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
713   __m128i d = _mm_loadl_epi64((const __m128i *)above);
714   d = _mm_unpacklo_epi8(d, zero);
715   pixels[0] = _mm_unpacklo_epi16(d, bp);
716   pixels[1] = _mm_unpackhi_epi16(d, bp);
717 
718   pixels[3] = _mm_set1_epi16((uint16_t)above[7]);
719 
720   if (height == 4) {
721     pixels[2] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
722   } else if (height == 8) {
723     pixels[2] = _mm_loadl_epi64((const __m128i *)left);
724   } else if (height == 16) {
725     pixels[2] = _mm_load_si128((const __m128i *)left);
726   } else {
727     pixels[2] = _mm_load_si128((const __m128i *)left);
728     pixels[4] = pixels[0];
729     pixels[5] = pixels[1];
730     pixels[6] = _mm_load_si128((const __m128i *)(left + 16));
731     pixels[7] = pixels[3];
732   }
733 }
734 
735 // weight_h[0]: weight_h vector
736 // weight_h[1]: scale - weight_h vector
737 // weight_h[2]: same as [0], offset 8
738 // weight_h[3]: same as [1], offset 8
739 // weight_h[4]: same as [0], offset 16
740 // weight_h[5]: same as [1], offset 16
741 // weight_h[6]: same as [0], offset 24
742 // weight_h[7]: same as [1], offset 24
743 // weight_w[0]: weights_w and scale - weights_w interleave vector, first half
744 // weight_w[1]: weights_w and scale - weights_w interleave vector, second half
load_weight_w8(const uint8_t * weight_array,int height,__m128i * weight_h,__m128i * weight_w)745 static INLINE void load_weight_w8(const uint8_t *weight_array, int height,
746                                   __m128i *weight_h, __m128i *weight_w) {
747   const __m128i zero = _mm_setzero_si128();
748   const int we_offset = height < 8 ? 4 : 8;
749   __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[we_offset]);
750   weight_h[0] = _mm_unpacklo_epi8(we, zero);
751   const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
752   weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
753 
754   if (height == 4) {
755     we = _mm_srli_si128(we, 4);
756     __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
757     __m128i tmp2 = _mm_sub_epi16(d, tmp1);
758     weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
759     weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
760   } else {
761     weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
762     weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
763   }
764 
765   if (height == 16) {
766     we = _mm_loadu_si128((const __m128i *)&weight_array[16]);
767     weight_h[0] = _mm_unpacklo_epi8(we, zero);
768     weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
769     weight_h[2] = _mm_unpackhi_epi8(we, zero);
770     weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
771   } else if (height == 32) {
772     const __m128i weight_lo =
773         _mm_loadu_si128((const __m128i *)&weight_array[32]);
774     weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
775     weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
776     weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
777     weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
778     const __m128i weight_hi =
779         _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
780     weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
781     weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
782     weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
783     weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
784   }
785 }
786 
smooth_pred_8xh(const __m128i * pixels,const __m128i * wh,const __m128i * ww,int h,uint8_t * dst,ptrdiff_t stride,int second_half)787 static INLINE void smooth_pred_8xh(const __m128i *pixels, const __m128i *wh,
788                                    const __m128i *ww, int h, uint8_t *dst,
789                                    ptrdiff_t stride, int second_half) {
790   const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
791   const __m128i one = _mm_set1_epi16(1);
792   const __m128i inc = _mm_set1_epi16(0x202);
793   const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
794 
795   __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000);
796   __m128i d = _mm_set1_epi16(0x100);
797 
798   int i;
799   for (i = 0; i < h; ++i) {
800     const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
801     const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
802     const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
803     __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
804     __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);
805 
806     __m128i b = _mm_shuffle_epi8(pixels[2], rep);
807     b = _mm_unpacklo_epi16(b, pixels[3]);
808     __m128i sum0 = _mm_madd_epi16(b, ww[0]);
809     __m128i sum1 = _mm_madd_epi16(b, ww[1]);
810 
811     s0 = _mm_add_epi32(s0, sum0);
812     s0 = _mm_add_epi32(s0, round);
813     s0 = _mm_srai_epi32(s0, 1 + sm_weight_log2_scale);
814 
815     s1 = _mm_add_epi32(s1, sum1);
816     s1 = _mm_add_epi32(s1, round);
817     s1 = _mm_srai_epi32(s1, 1 + sm_weight_log2_scale);
818 
819     sum0 = _mm_packus_epi16(s0, s1);
820     sum0 = _mm_shuffle_epi8(sum0, gat);
821     _mm_storel_epi64((__m128i *)dst, sum0);
822     dst += stride;
823 
824     rep = _mm_add_epi16(rep, one);
825     d = _mm_add_epi16(d, inc);
826   }
827 }
828 
aom_smooth_predictor_8x4_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)829 void aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
830                                     const uint8_t *above, const uint8_t *left) {
831   __m128i pixels[4];
832   load_pixel_w8(above, left, 4, pixels);
833 
834   __m128i wh[4], ww[2];
835   load_weight_w8(sm_weight_arrays, 4, wh, ww);
836 
837   smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0);
838 }
839 
aom_smooth_predictor_8x8_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)840 void aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
841                                     const uint8_t *above, const uint8_t *left) {
842   __m128i pixels[4];
843   load_pixel_w8(above, left, 8, pixels);
844 
845   __m128i wh[4], ww[2];
846   load_weight_w8(sm_weight_arrays, 8, wh, ww);
847 
848   smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
849 }
850 
aom_smooth_predictor_8x16_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)851 void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
852                                      const uint8_t *above,
853                                      const uint8_t *left) {
854   __m128i pixels[4];
855   load_pixel_w8(above, left, 16, pixels);
856 
857   __m128i wh[4], ww[2];
858   load_weight_w8(sm_weight_arrays, 16, wh, ww);
859 
860   smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
861   dst += stride << 3;
862   smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);
863 }
864 
aom_smooth_predictor_8x32_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)865 void aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
866                                      const uint8_t *above,
867                                      const uint8_t *left) {
868   __m128i pixels[8];
869   load_pixel_w8(above, left, 32, pixels);
870 
871   __m128i wh[8], ww[2];
872   load_weight_w8(sm_weight_arrays, 32, wh, ww);
873 
874   smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0);
875   dst += stride << 3;
876   smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1);
877   dst += stride << 3;
878   smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0);
879   dst += stride << 3;
880   smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1);
881 }
882 
smooth_predictor_wxh(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left,uint32_t bw,uint32_t bh)883 static INLINE void smooth_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
884                                         const uint8_t *above,
885                                         const uint8_t *left, uint32_t bw,
886                                         uint32_t bh) {
887   const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
888   const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
889   const __m128i zero = _mm_setzero_si128();
890   const __m128i scale_value =
891       _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
892   const __m128i bottom_left = _mm_cvtsi32_si128((uint32_t)left[bh - 1]);
893   const __m128i dup16 = _mm_set1_epi32(0x01000100);
894   const __m128i top_right =
895       _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)above[bw - 1]), dup16);
896   const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
897   const __m128i round = _mm_set1_epi32((uint16_t)(1 << sm_weight_log2_scale));
898 
899   for (uint32_t y = 0; y < bh; ++y) {
900     const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
901     const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
902     const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
903     __m128i pred_scaled_bl = _mm_mullo_epi16(scale_m_weights_y, bottom_left);
904     const __m128i wl_y =
905         _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0);
906     pred_scaled_bl = _mm_add_epi32(pred_scaled_bl, round);
907     pred_scaled_bl = _mm_shuffle_epi32(pred_scaled_bl, 0);
908 
909     for (uint32_t x = 0; x < bw; x += 8) {
910       const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x));
911       const __m128i weights_x =
912           _mm_loadl_epi64((const __m128i *)(sm_weights_w + x));
913       const __m128i tw_x = _mm_unpacklo_epi8(top_x, weights_x);
914       const __m128i tw_x_lo = _mm_unpacklo_epi8(tw_x, zero);
915       const __m128i tw_x_hi = _mm_unpackhi_epi8(tw_x, zero);
916 
917       __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y);
918       __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y);
919 
920       const __m128i scale_m_weights_x =
921           _mm_sub_epi16(scale_value, _mm_unpacklo_epi8(weights_x, zero));
922       const __m128i swxtr = _mm_mullo_epi16(scale_m_weights_x, top_right);
923       const __m128i swxtr_lo = _mm_unpacklo_epi16(swxtr, zero);
924       const __m128i swxtr_hi = _mm_unpackhi_epi16(swxtr, zero);
925 
926       pred_lo = _mm_add_epi32(pred_lo, pred_scaled_bl);
927       pred_hi = _mm_add_epi32(pred_hi, pred_scaled_bl);
928 
929       pred_lo = _mm_add_epi32(pred_lo, swxtr_lo);
930       pred_hi = _mm_add_epi32(pred_hi, swxtr_hi);
931 
932       pred_lo = _mm_srai_epi32(pred_lo, (1 + sm_weight_log2_scale));
933       pred_hi = _mm_srai_epi32(pred_hi, (1 + sm_weight_log2_scale));
934 
935       __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
936       pred = _mm_shuffle_epi8(pred, gat);
937       _mm_storel_epi64((__m128i *)(dst + x), pred);
938     }
939     dst += stride;
940   }
941 }
942 
// 16x4 SMOOTH predictor; forwards to smooth_predictor_wxh().
void aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const uint32_t block_w = 16;
  const uint32_t block_h = 4;
  smooth_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
948 
// 16x8 SMOOTH predictor; forwards to smooth_predictor_wxh().
void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const uint32_t block_w = 16;
  const uint32_t block_h = 8;
  smooth_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
954 
// 16x16 SMOOTH predictor; forwards to smooth_predictor_wxh().
void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  const uint32_t block_w = 16;
  const uint32_t block_h = 16;
  smooth_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
960 
// 16x32 SMOOTH predictor; forwards to smooth_predictor_wxh().
void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  const uint32_t block_w = 16;
  const uint32_t block_h = 32;
  smooth_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
966 
// 32x8 SMOOTH predictor; forwards to smooth_predictor_wxh().
void aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const uint32_t block_w = 32;
  const uint32_t block_h = 8;
  smooth_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
972 
// 32x16 SMOOTH predictor; forwards to smooth_predictor_wxh().
void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  const uint32_t block_w = 32;
  const uint32_t block_h = 16;
  smooth_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
978 
// 32x32 SMOOTH predictor; forwards to smooth_predictor_wxh().
void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  const uint32_t block_w = 32;
  const uint32_t block_h = 32;
  smooth_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
984 
// 32x64 SMOOTH predictor; forwards to smooth_predictor_wxh().
void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  const uint32_t block_w = 32;
  const uint32_t block_h = 64;
  smooth_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
990 
// 64x64 SMOOTH predictor; forwards to smooth_predictor_wxh().
void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  const uint32_t block_w = 64;
  const uint32_t block_h = 64;
  smooth_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
996 
// 64x32 SMOOTH predictor; forwards to smooth_predictor_wxh().
void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  const uint32_t block_w = 64;
  const uint32_t block_h = 32;
  smooth_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
1002 
// 64x16 SMOOTH predictor; forwards to smooth_predictor_wxh().
void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  const uint32_t block_w = 64;
  const uint32_t block_h = 16;
  smooth_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
1008 
// 16x64 SMOOTH predictor; forwards to smooth_predictor_wxh().
void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  const uint32_t block_w = 16;
  const uint32_t block_h = 64;
  smooth_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
1014 
1015 // -----------------------------------------------------------------------------
1016 // SMOOTH_V_PRED
1017 
1018 // pixels[0]: above and below_pred interleave vector
load_pixel_v_w4(const uint8_t * above,const uint8_t * left,int height,__m128i * pixels)1019 static INLINE void load_pixel_v_w4(const uint8_t *above, const uint8_t *left,
1020                                    int height, __m128i *pixels) {
1021   const __m128i zero = _mm_setzero_si128();
1022   __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
1023   const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
1024   d = _mm_unpacklo_epi8(d, zero);
1025   pixels[0] = _mm_unpacklo_epi16(d, bp);
1026 }
1027 
1028 // weights[0]: weights_h vector
1029 // weights[1]: scale - weights_h vector
load_weight_v_w4(const uint8_t * weight_array,int height,__m128i * weights)1030 static INLINE void load_weight_v_w4(const uint8_t *weight_array, int height,
1031                                     __m128i *weights) {
1032   const __m128i zero = _mm_setzero_si128();
1033   const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
1034 
1035   if (height == 4) {
1036     const __m128i weight =
1037         _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]);
1038     weights[0] = _mm_unpacklo_epi8(weight, zero);
1039     weights[1] = _mm_sub_epi16(d, weights[0]);
1040   } else if (height == 8) {
1041     const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]);
1042     weights[0] = _mm_unpacklo_epi8(weight, zero);
1043     weights[1] = _mm_sub_epi16(d, weights[0]);
1044   } else {
1045     const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
1046     weights[0] = _mm_unpacklo_epi8(weight, zero);
1047     weights[1] = _mm_sub_epi16(d, weights[0]);
1048     weights[2] = _mm_unpackhi_epi8(weight, zero);
1049     weights[3] = _mm_sub_epi16(d, weights[2]);
1050   }
1051 }
1052 
smooth_v_pred_4xh(const __m128i * pixel,const __m128i * weight,int h,uint8_t * dst,ptrdiff_t stride)1053 static INLINE void smooth_v_pred_4xh(const __m128i *pixel,
1054                                      const __m128i *weight, int h, uint8_t *dst,
1055                                      ptrdiff_t stride) {
1056   const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
1057   const __m128i inc = _mm_set1_epi16(0x202);
1058   const __m128i gat = _mm_set1_epi32(0xc080400);
1059   __m128i d = _mm_set1_epi16(0x100);
1060 
1061   for (int i = 0; i < h; ++i) {
1062     const __m128i wg_wg = _mm_shuffle_epi8(weight[0], d);
1063     const __m128i sc_sc = _mm_shuffle_epi8(weight[1], d);
1064     const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
1065     __m128i sum = _mm_madd_epi16(pixel[0], wh_sc);
1066     sum = _mm_add_epi32(sum, pred_round);
1067     sum = _mm_srai_epi32(sum, sm_weight_log2_scale);
1068     sum = _mm_shuffle_epi8(sum, gat);
1069     *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
1070     dst += stride;
1071     d = _mm_add_epi16(d, inc);
1072   }
1073 }
1074 
aom_smooth_v_predictor_4x4_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1075 void aom_smooth_v_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
1076                                       const uint8_t *above,
1077                                       const uint8_t *left) {
1078   __m128i pixels;
1079   load_pixel_v_w4(above, left, 4, &pixels);
1080 
1081   __m128i weights[2];
1082   load_weight_v_w4(sm_weight_arrays, 4, weights);
1083 
1084   smooth_v_pred_4xh(&pixels, weights, 4, dst, stride);
1085 }
1086 
aom_smooth_v_predictor_4x8_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1087 void aom_smooth_v_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
1088                                       const uint8_t *above,
1089                                       const uint8_t *left) {
1090   __m128i pixels;
1091   load_pixel_v_w4(above, left, 8, &pixels);
1092 
1093   __m128i weights[2];
1094   load_weight_v_w4(sm_weight_arrays, 8, weights);
1095 
1096   smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
1097 }
1098 
aom_smooth_v_predictor_4x16_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1099 void aom_smooth_v_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
1100                                        const uint8_t *above,
1101                                        const uint8_t *left) {
1102   __m128i pixels;
1103   load_pixel_v_w4(above, left, 16, &pixels);
1104 
1105   __m128i weights[4];
1106   load_weight_v_w4(sm_weight_arrays, 16, weights);
1107 
1108   smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
1109   dst += stride << 3;
1110   smooth_v_pred_4xh(&pixels, &weights[2], 8, dst, stride);
1111 }
1112 
1113 // pixels[0]: above and below_pred interleave vector, first half
1114 // pixels[1]: above and below_pred interleave vector, second half
load_pixel_v_w8(const uint8_t * above,const uint8_t * left,int height,__m128i * pixels)1115 static INLINE void load_pixel_v_w8(const uint8_t *above, const uint8_t *left,
1116                                    int height, __m128i *pixels) {
1117   const __m128i zero = _mm_setzero_si128();
1118   __m128i d = _mm_loadl_epi64((const __m128i *)above);
1119   const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
1120   d = _mm_unpacklo_epi8(d, zero);
1121   pixels[0] = _mm_unpacklo_epi16(d, bp);
1122   pixels[1] = _mm_unpackhi_epi16(d, bp);
1123 }
1124 
1125 // weight_h[0]: weight_h vector
1126 // weight_h[1]: scale - weight_h vector
1127 // weight_h[2]: same as [0], offset 8
1128 // weight_h[3]: same as [1], offset 8
1129 // weight_h[4]: same as [0], offset 16
1130 // weight_h[5]: same as [1], offset 16
1131 // weight_h[6]: same as [0], offset 24
1132 // weight_h[7]: same as [1], offset 24
load_weight_v_w8(const uint8_t * weight_array,int height,__m128i * weight_h)1133 static INLINE void load_weight_v_w8(const uint8_t *weight_array, int height,
1134                                     __m128i *weight_h) {
1135   const __m128i zero = _mm_setzero_si128();
1136   const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
1137 
1138   if (height < 16) {
1139     const int offset = height < 8 ? 4 : 8;
1140     const __m128i weight =
1141         _mm_loadu_si128((const __m128i *)&weight_array[offset]);
1142     weight_h[0] = _mm_unpacklo_epi8(weight, zero);
1143     weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
1144   } else if (height == 16) {
1145     const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
1146     weight_h[0] = _mm_unpacklo_epi8(weight, zero);
1147     weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
1148     weight_h[2] = _mm_unpackhi_epi8(weight, zero);
1149     weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
1150   } else {
1151     const __m128i weight_lo =
1152         _mm_loadu_si128((const __m128i *)&weight_array[32]);
1153     weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
1154     weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
1155     weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
1156     weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
1157     const __m128i weight_hi =
1158         _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
1159     weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
1160     weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
1161     weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
1162     weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
1163   }
1164 }
1165 
smooth_v_pred_8xh(const __m128i * pixels,const __m128i * wh,int h,uint8_t * dst,ptrdiff_t stride)1166 static INLINE void smooth_v_pred_8xh(const __m128i *pixels, const __m128i *wh,
1167                                      int h, uint8_t *dst, ptrdiff_t stride) {
1168   const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
1169   const __m128i inc = _mm_set1_epi16(0x202);
1170   const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
1171   __m128i d = _mm_set1_epi16(0x100);
1172 
1173   for (int i = 0; i < h; ++i) {
1174     const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
1175     const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
1176     const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
1177     __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
1178     __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);
1179 
1180     s0 = _mm_add_epi32(s0, pred_round);
1181     s0 = _mm_srai_epi32(s0, sm_weight_log2_scale);
1182 
1183     s1 = _mm_add_epi32(s1, pred_round);
1184     s1 = _mm_srai_epi32(s1, sm_weight_log2_scale);
1185 
1186     __m128i sum01 = _mm_packus_epi16(s0, s1);
1187     sum01 = _mm_shuffle_epi8(sum01, gat);
1188     _mm_storel_epi64((__m128i *)dst, sum01);
1189     dst += stride;
1190 
1191     d = _mm_add_epi16(d, inc);
1192   }
1193 }
1194 
aom_smooth_v_predictor_8x4_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1195 void aom_smooth_v_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
1196                                       const uint8_t *above,
1197                                       const uint8_t *left) {
1198   __m128i pixels[2];
1199   load_pixel_v_w8(above, left, 4, pixels);
1200 
1201   __m128i wh[2];
1202   load_weight_v_w8(sm_weight_arrays, 4, wh);
1203 
1204   smooth_v_pred_8xh(pixels, wh, 4, dst, stride);
1205 }
1206 
aom_smooth_v_predictor_8x8_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1207 void aom_smooth_v_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
1208                                       const uint8_t *above,
1209                                       const uint8_t *left) {
1210   __m128i pixels[2];
1211   load_pixel_v_w8(above, left, 8, pixels);
1212 
1213   __m128i wh[2];
1214   load_weight_v_w8(sm_weight_arrays, 8, wh);
1215 
1216   smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
1217 }
1218 
aom_smooth_v_predictor_8x16_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1219 void aom_smooth_v_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
1220                                        const uint8_t *above,
1221                                        const uint8_t *left) {
1222   __m128i pixels[2];
1223   load_pixel_v_w8(above, left, 16, pixels);
1224 
1225   __m128i wh[4];
1226   load_weight_v_w8(sm_weight_arrays, 16, wh);
1227 
1228   smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
1229   dst += stride << 3;
1230   smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride);
1231 }
1232 
aom_smooth_v_predictor_8x32_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1233 void aom_smooth_v_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
1234                                        const uint8_t *above,
1235                                        const uint8_t *left) {
1236   __m128i pixels[2];
1237   load_pixel_v_w8(above, left, 32, pixels);
1238 
1239   __m128i wh[8];
1240   load_weight_v_w8(sm_weight_arrays, 32, wh);
1241 
1242   smooth_v_pred_8xh(pixels, &wh[0], 8, dst, stride);
1243   dst += stride << 3;
1244   smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride);
1245   dst += stride << 3;
1246   smooth_v_pred_8xh(pixels, &wh[4], 8, dst, stride);
1247   dst += stride << 3;
1248   smooth_v_pred_8xh(pixels, &wh[6], 8, dst, stride);
1249 }
1250 
smooth_v_predictor_wxh(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left,uint32_t bw,uint32_t bh)1251 static INLINE void smooth_v_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
1252                                           const uint8_t *above,
1253                                           const uint8_t *left, uint32_t bw,
1254                                           uint32_t bh) {
1255   const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
1256   const __m128i zero = _mm_setzero_si128();
1257   const __m128i scale_value =
1258       _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
1259   const __m128i dup16 = _mm_set1_epi32(0x01000100);
1260   const __m128i bottom_left =
1261       _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)left[bh - 1]), dup16);
1262   const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
1263   const __m128i round =
1264       _mm_set1_epi32((uint16_t)(1 << (sm_weight_log2_scale - 1)));
1265 
1266   for (uint32_t y = 0; y < bh; ++y) {
1267     const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
1268     const __m128i scale_m_weights_y =
1269         _mm_shuffle_epi8(_mm_sub_epi16(scale_value, weights_y), dup16);
1270     const __m128i wl_y =
1271         _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, bottom_left), 0);
1272 
1273     for (uint32_t x = 0; x < bw; x += 8) {
1274       const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x));
1275       // 8 -> 16
1276       const __m128i tw_x = _mm_unpacklo_epi8(top_x, zero);
1277       const __m128i tw_x_lo = _mm_unpacklo_epi16(tw_x, scale_m_weights_y);
1278       const __m128i tw_x_hi = _mm_unpackhi_epi16(tw_x, scale_m_weights_y);
1279       // top_x * weights_y + scale_m_weights_y * bottom_left
1280       __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y);
1281       __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y);
1282 
1283       pred_lo = _mm_add_epi32(pred_lo, round);
1284       pred_hi = _mm_add_epi32(pred_hi, round);
1285       pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale);
1286       pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale);
1287 
1288       __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
1289       pred = _mm_shuffle_epi8(pred, gat);
1290       _mm_storel_epi64((__m128i *)(dst + x), pred);
1291     }
1292     dst += stride;
1293   }
1294 }
1295 
// 16x4 SMOOTH_V predictor; forwards to smooth_v_predictor_wxh().
void aom_smooth_v_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  const uint32_t block_w = 16;
  const uint32_t block_h = 4;
  smooth_v_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
1301 
// 16x8 SMOOTH_V predictor; forwards to smooth_v_predictor_wxh().
void aom_smooth_v_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  const uint32_t block_w = 16;
  const uint32_t block_h = 8;
  smooth_v_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
1307 
// 16x16 SMOOTH_V predictor; forwards to smooth_v_predictor_wxh().
void aom_smooth_v_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  const uint32_t block_w = 16;
  const uint32_t block_h = 16;
  smooth_v_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
1313 
// 16x32 SMOOTH_V predictor; forwards to smooth_v_predictor_wxh().
void aom_smooth_v_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  const uint32_t block_w = 16;
  const uint32_t block_h = 32;
  smooth_v_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
1319 
// 32x8 SMOOTH_V predictor; forwards to smooth_v_predictor_wxh().
void aom_smooth_v_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  const uint32_t block_w = 32;
  const uint32_t block_h = 8;
  smooth_v_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
1325 
// 32x16 SMOOTH_V predictor; forwards to smooth_v_predictor_wxh().
void aom_smooth_v_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  const uint32_t block_w = 32;
  const uint32_t block_h = 16;
  smooth_v_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
1331 
// SMOOTH_V predictor entry point for 32x32 blocks. Delegates to
// smooth_v_predictor_wxh() with the two dimensions from the function name
// (presumably width 32, then height 32).
void aom_smooth_v_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  enum { kBlockW = 32, kBlockH = 32 };
  smooth_v_predictor_wxh(dst, stride, above, left, kBlockW, kBlockH);
}
1337 
// SMOOTH_V predictor entry point for 32x64 blocks. Delegates to
// smooth_v_predictor_wxh() with the two dimensions from the function name
// (presumably width 32, then height 64).
void aom_smooth_v_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  enum { kBlockW = 32, kBlockH = 64 };
  smooth_v_predictor_wxh(dst, stride, above, left, kBlockW, kBlockH);
}
1343 
// SMOOTH_V predictor entry point for 64x64 blocks. Delegates to
// smooth_v_predictor_wxh() with the two dimensions from the function name
// (presumably width 64, then height 64).
void aom_smooth_v_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  enum { kBlockW = 64, kBlockH = 64 };
  smooth_v_predictor_wxh(dst, stride, above, left, kBlockW, kBlockH);
}
1349 
// SMOOTH_V predictor entry point for 64x32 blocks. Delegates to
// smooth_v_predictor_wxh() with the two dimensions from the function name
// (presumably width 64, then height 32).
void aom_smooth_v_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  enum { kBlockW = 64, kBlockH = 32 };
  smooth_v_predictor_wxh(dst, stride, above, left, kBlockW, kBlockH);
}
1355 
// SMOOTH_V predictor entry point for 64x16 blocks. Delegates to
// smooth_v_predictor_wxh() with the two dimensions from the function name
// (presumably width 64, then height 16).
void aom_smooth_v_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  enum { kBlockW = 64, kBlockH = 16 };
  smooth_v_predictor_wxh(dst, stride, above, left, kBlockW, kBlockH);
}
1361 
// SMOOTH_V predictor entry point for 16x64 blocks. Delegates to
// smooth_v_predictor_wxh() with the two dimensions from the function name
// (presumably width 16, then height 64).
void aom_smooth_v_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  enum { kBlockW = 16, kBlockH = 64 };
  smooth_v_predictor_wxh(dst, stride, above, left, kBlockW, kBlockH);
}
1367 
1368 // -----------------------------------------------------------------------------
1369 // SMOOTH_H_PRED
1370 
1371 // pixels[0]: left vector
1372 // pixels[1]: right_pred vector
load_pixel_h_w4(const uint8_t * above,const uint8_t * left,int height,__m128i * pixels)1373 static INLINE void load_pixel_h_w4(const uint8_t *above, const uint8_t *left,
1374                                    int height, __m128i *pixels) {
1375   if (height == 4)
1376     pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
1377   else if (height == 8)
1378     pixels[0] = _mm_loadl_epi64(((const __m128i *)left));
1379   else
1380     pixels[0] = _mm_loadu_si128(((const __m128i *)left));
1381   pixels[1] = _mm_set1_epi16((uint16_t)above[3]);
1382 }
1383 
1384 // weights[0]: weights_w and scale - weights_w interleave vector
load_weight_h_w4(const uint8_t * weight_array,int height,__m128i * weights)1385 static INLINE void load_weight_h_w4(const uint8_t *weight_array, int height,
1386                                     __m128i *weights) {
1387   (void)height;
1388   const __m128i t = _mm_loadu_si128((const __m128i *)&weight_array[4]);
1389   const __m128i zero = _mm_setzero_si128();
1390 
1391   const __m128i weights_0 = _mm_unpacklo_epi8(t, zero);
1392   const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
1393   const __m128i weights_1 = _mm_sub_epi16(d, weights_0);
1394   weights[0] = _mm_unpacklo_epi16(weights_0, weights_1);
1395 }
1396 
smooth_h_pred_4xh(const __m128i * pixel,const __m128i * weight,int h,uint8_t * dst,ptrdiff_t stride)1397 static INLINE void smooth_h_pred_4xh(const __m128i *pixel,
1398                                      const __m128i *weight, int h, uint8_t *dst,
1399                                      ptrdiff_t stride) {
1400   const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
1401   const __m128i one = _mm_set1_epi16(1);
1402   const __m128i gat = _mm_set1_epi32(0xc080400);
1403   __m128i rep = _mm_set1_epi16(0x8000);
1404 
1405   for (int i = 0; i < h; ++i) {
1406     __m128i b = _mm_shuffle_epi8(pixel[0], rep);
1407     b = _mm_unpacklo_epi16(b, pixel[1]);
1408     __m128i sum = _mm_madd_epi16(b, weight[0]);
1409 
1410     sum = _mm_add_epi32(sum, pred_round);
1411     sum = _mm_srai_epi32(sum, sm_weight_log2_scale);
1412 
1413     sum = _mm_shuffle_epi8(sum, gat);
1414     *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
1415     dst += stride;
1416 
1417     rep = _mm_add_epi16(rep, one);
1418   }
1419 }
1420 
aom_smooth_h_predictor_4x4_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1421 void aom_smooth_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
1422                                       const uint8_t *above,
1423                                       const uint8_t *left) {
1424   __m128i pixels[2];
1425   load_pixel_h_w4(above, left, 4, pixels);
1426 
1427   __m128i weights;
1428   load_weight_h_w4(sm_weight_arrays, 4, &weights);
1429 
1430   smooth_h_pred_4xh(pixels, &weights, 4, dst, stride);
1431 }
1432 
aom_smooth_h_predictor_4x8_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1433 void aom_smooth_h_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
1434                                       const uint8_t *above,
1435                                       const uint8_t *left) {
1436   __m128i pixels[2];
1437   load_pixel_h_w4(above, left, 8, pixels);
1438 
1439   __m128i weights;
1440   load_weight_h_w4(sm_weight_arrays, 8, &weights);
1441 
1442   smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
1443 }
1444 
aom_smooth_h_predictor_4x16_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1445 void aom_smooth_h_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
1446                                        const uint8_t *above,
1447                                        const uint8_t *left) {
1448   __m128i pixels[2];
1449   load_pixel_h_w4(above, left, 16, pixels);
1450 
1451   __m128i weights;
1452   load_weight_h_w4(sm_weight_arrays, 8, &weights);
1453 
1454   smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
1455   dst += stride << 3;
1456 
1457   pixels[0] = _mm_srli_si128(pixels[0], 8);
1458   smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
1459 }
1460 
1461 // pixels[0]: left vector
1462 // pixels[1]: right_pred vector
1463 // pixels[2]: left vector + 16
1464 // pixels[3]: right_pred vector
load_pixel_h_w8(const uint8_t * above,const uint8_t * left,int height,__m128i * pixels)1465 static INLINE void load_pixel_h_w8(const uint8_t *above, const uint8_t *left,
1466                                    int height, __m128i *pixels) {
1467   pixels[1] = _mm_set1_epi16((uint16_t)above[7]);
1468 
1469   if (height == 4) {
1470     pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
1471   } else if (height == 8) {
1472     pixels[0] = _mm_loadl_epi64((const __m128i *)left);
1473   } else if (height == 16) {
1474     pixels[0] = _mm_load_si128((const __m128i *)left);
1475   } else {
1476     pixels[0] = _mm_load_si128((const __m128i *)left);
1477     pixels[2] = _mm_load_si128((const __m128i *)(left + 16));
1478     pixels[3] = pixels[1];
1479   }
1480 }
1481 
1482 // weight_w[0]: weights_w and scale - weights_w interleave vector, first half
1483 // weight_w[1]: weights_w and scale - weights_w interleave vector, second half
load_weight_h_w8(const uint8_t * weight_array,int height,__m128i * weight_w)1484 static INLINE void load_weight_h_w8(const uint8_t *weight_array, int height,
1485                                     __m128i *weight_w) {
1486   (void)height;
1487   const __m128i zero = _mm_setzero_si128();
1488   const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
1489   const __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[8]);
1490   const __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
1491   const __m128i tmp2 = _mm_sub_epi16(d, tmp1);
1492   weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
1493   weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
1494 }
1495 
smooth_h_pred_8xh(const __m128i * pixels,const __m128i * ww,int h,uint8_t * dst,ptrdiff_t stride,int second_half)1496 static INLINE void smooth_h_pred_8xh(const __m128i *pixels, const __m128i *ww,
1497                                      int h, uint8_t *dst, ptrdiff_t stride,
1498                                      int second_half) {
1499   const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
1500   const __m128i one = _mm_set1_epi16(1);
1501   const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
1502   __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000);
1503 
1504   for (int i = 0; i < h; ++i) {
1505     __m128i b = _mm_shuffle_epi8(pixels[0], rep);
1506     b = _mm_unpacklo_epi16(b, pixels[1]);
1507     __m128i sum0 = _mm_madd_epi16(b, ww[0]);
1508     __m128i sum1 = _mm_madd_epi16(b, ww[1]);
1509 
1510     sum0 = _mm_add_epi32(sum0, pred_round);
1511     sum0 = _mm_srai_epi32(sum0, sm_weight_log2_scale);
1512 
1513     sum1 = _mm_add_epi32(sum1, pred_round);
1514     sum1 = _mm_srai_epi32(sum1, sm_weight_log2_scale);
1515 
1516     sum0 = _mm_packus_epi16(sum0, sum1);
1517     sum0 = _mm_shuffle_epi8(sum0, gat);
1518     _mm_storel_epi64((__m128i *)dst, sum0);
1519     dst += stride;
1520 
1521     rep = _mm_add_epi16(rep, one);
1522   }
1523 }
1524 
aom_smooth_h_predictor_8x4_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1525 void aom_smooth_h_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
1526                                       const uint8_t *above,
1527                                       const uint8_t *left) {
1528   __m128i pixels[2];
1529   load_pixel_h_w8(above, left, 4, pixels);
1530 
1531   __m128i ww[2];
1532   load_weight_h_w8(sm_weight_arrays, 4, ww);
1533 
1534   smooth_h_pred_8xh(pixels, ww, 4, dst, stride, 0);
1535 }
1536 
aom_smooth_h_predictor_8x8_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1537 void aom_smooth_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
1538                                       const uint8_t *above,
1539                                       const uint8_t *left) {
1540   __m128i pixels[2];
1541   load_pixel_h_w8(above, left, 8, pixels);
1542 
1543   __m128i ww[2];
1544   load_weight_h_w8(sm_weight_arrays, 8, ww);
1545 
1546   smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
1547 }
1548 
aom_smooth_h_predictor_8x16_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1549 void aom_smooth_h_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
1550                                        const uint8_t *above,
1551                                        const uint8_t *left) {
1552   __m128i pixels[2];
1553   load_pixel_h_w8(above, left, 16, pixels);
1554 
1555   __m128i ww[2];
1556   load_weight_h_w8(sm_weight_arrays, 16, ww);
1557 
1558   smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
1559   dst += stride << 3;
1560   smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 1);
1561 }
1562 
aom_smooth_h_predictor_8x32_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1563 void aom_smooth_h_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
1564                                        const uint8_t *above,
1565                                        const uint8_t *left) {
1566   __m128i pixels[4];
1567   load_pixel_h_w8(above, left, 32, pixels);
1568 
1569   __m128i ww[2];
1570   load_weight_h_w8(sm_weight_arrays, 32, ww);
1571 
1572   smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 0);
1573   dst += stride << 3;
1574   smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 1);
1575   dst += stride << 3;
1576   smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 0);
1577   dst += stride << 3;
1578   smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 1);
1579 }
1580 
smooth_h_predictor_wxh(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left,uint32_t bw,uint32_t bh)1581 static INLINE void smooth_h_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
1582                                           const uint8_t *above,
1583                                           const uint8_t *left, uint32_t bw,
1584                                           uint32_t bh) {
1585   const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
1586   const __m128i zero = _mm_setzero_si128();
1587   const __m128i scale_value =
1588       _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
1589   const __m128i top_right = _mm_cvtsi32_si128((uint32_t)above[bw - 1]);
1590   const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
1591   const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
1592 
1593   for (uint32_t y = 0; y < bh; ++y) {
1594     const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
1595     const __m128i tr_ly =
1596         _mm_shuffle_epi32(_mm_unpacklo_epi16(top_right, left_y), 0);
1597 
1598     for (uint32_t x = 0; x < bw; x += 8) {
1599       const __m128i weights_x =
1600           _mm_loadl_epi64((const __m128i *)(sm_weights_w + x));
1601       const __m128i weights_xw = _mm_unpacklo_epi8(weights_x, zero);
1602       const __m128i scale_m_weights_x = _mm_sub_epi16(scale_value, weights_xw);
1603       const __m128i wx_lo = _mm_unpacklo_epi16(scale_m_weights_x, weights_xw);
1604       const __m128i wx_hi = _mm_unpackhi_epi16(scale_m_weights_x, weights_xw);
1605       __m128i pred_lo = _mm_madd_epi16(wx_lo, tr_ly);
1606       __m128i pred_hi = _mm_madd_epi16(wx_hi, tr_ly);
1607 
1608       pred_lo = _mm_add_epi32(pred_lo, pred_round);
1609       pred_hi = _mm_add_epi32(pred_hi, pred_round);
1610 
1611       pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale);
1612       pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale);
1613 
1614       __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
1615       pred = _mm_shuffle_epi8(pred, gat);
1616       _mm_storel_epi64((__m128i *)(dst + x), pred);
1617     }
1618     dst += stride;
1619   }
1620 }
1621 
// SMOOTH_H predictor entry point for 16x4 blocks: runs the generic
// smooth_h_predictor_wxh() with bw = 16 and bh = 4.
void aom_smooth_h_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  enum { kBlockW = 16, kBlockH = 4 };
  smooth_h_predictor_wxh(dst, stride, above, left, kBlockW, kBlockH);
}
1627 
// SMOOTH_H predictor entry point for 16x8 blocks: runs the generic
// smooth_h_predictor_wxh() with bw = 16 and bh = 8.
void aom_smooth_h_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  enum { kBlockW = 16, kBlockH = 8 };
  smooth_h_predictor_wxh(dst, stride, above, left, kBlockW, kBlockH);
}
1633 
// SMOOTH_H predictor entry point for 16x16 blocks: runs the generic
// smooth_h_predictor_wxh() with bw = 16 and bh = 16.
void aom_smooth_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  enum { kBlockW = 16, kBlockH = 16 };
  smooth_h_predictor_wxh(dst, stride, above, left, kBlockW, kBlockH);
}
1639 
// SMOOTH_H predictor entry point for 16x32 blocks: runs the generic
// smooth_h_predictor_wxh() with bw = 16 and bh = 32.
void aom_smooth_h_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  enum { kBlockW = 16, kBlockH = 32 };
  smooth_h_predictor_wxh(dst, stride, above, left, kBlockW, kBlockH);
}
1645 
// SMOOTH_H predictor entry point for 16x64 blocks: runs the generic
// smooth_h_predictor_wxh() with bw = 16 and bh = 64.
void aom_smooth_h_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  enum { kBlockW = 16, kBlockH = 64 };
  smooth_h_predictor_wxh(dst, stride, above, left, kBlockW, kBlockH);
}
1651 
// SMOOTH_H predictor entry point for 32x8 blocks: runs the generic
// smooth_h_predictor_wxh() with bw = 32 and bh = 8.
void aom_smooth_h_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  enum { kBlockW = 32, kBlockH = 8 };
  smooth_h_predictor_wxh(dst, stride, above, left, kBlockW, kBlockH);
}
1657 
// SMOOTH_H predictor entry point for 32x16 blocks: runs the generic
// smooth_h_predictor_wxh() with bw = 32 and bh = 16.
void aom_smooth_h_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  enum { kBlockW = 32, kBlockH = 16 };
  smooth_h_predictor_wxh(dst, stride, above, left, kBlockW, kBlockH);
}
1663 
// SMOOTH_H predictor entry point for 32x32 blocks: runs the generic
// smooth_h_predictor_wxh() with bw = 32 and bh = 32.
void aom_smooth_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  enum { kBlockW = 32, kBlockH = 32 };
  smooth_h_predictor_wxh(dst, stride, above, left, kBlockW, kBlockH);
}
1669 
// SMOOTH_H predictor entry point for 32x64 blocks: runs the generic
// smooth_h_predictor_wxh() with bw = 32 and bh = 64.
void aom_smooth_h_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  enum { kBlockW = 32, kBlockH = 64 };
  smooth_h_predictor_wxh(dst, stride, above, left, kBlockW, kBlockH);
}
1675 
// SMOOTH_H predictor entry point for 64x64 blocks: runs the generic
// smooth_h_predictor_wxh() with bw = 64 and bh = 64.
void aom_smooth_h_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  enum { kBlockW = 64, kBlockH = 64 };
  smooth_h_predictor_wxh(dst, stride, above, left, kBlockW, kBlockH);
}
1681 
// SMOOTH_H predictor entry point for 64x32 blocks: runs the generic
// smooth_h_predictor_wxh() with bw = 64 and bh = 32.
void aom_smooth_h_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  enum { kBlockW = 64, kBlockH = 32 };
  smooth_h_predictor_wxh(dst, stride, above, left, kBlockW, kBlockH);
}
1687 
// SMOOTH_H predictor entry point for 64x16 blocks: runs the generic
// smooth_h_predictor_wxh() with bw = 64 and bh = 16.
void aom_smooth_h_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  enum { kBlockW = 64, kBlockH = 16 };
  smooth_h_predictor_wxh(dst, stride, above, left, kBlockW, kBlockH);
}
1693