• Home
  • History
  • Annotate
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2017, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
#include <string.h>
#include <tmmintrin.h>

#include "config/aom_dsp_rtcd.h"

#include "aom_dsp/intrapred_common.h"
17 
18 // -----------------------------------------------------------------------------
19 // PAETH_PRED
20 
21 // Return 8 16-bit pixels in one row
paeth_8x1_pred(const __m128i * left,const __m128i * top,const __m128i * topleft)22 static INLINE __m128i paeth_8x1_pred(const __m128i *left, const __m128i *top,
23                                      const __m128i *topleft) {
24   const __m128i base = _mm_sub_epi16(_mm_add_epi16(*top, *left), *topleft);
25 
26   __m128i pl = _mm_abs_epi16(_mm_sub_epi16(base, *left));
27   __m128i pt = _mm_abs_epi16(_mm_sub_epi16(base, *top));
28   __m128i ptl = _mm_abs_epi16(_mm_sub_epi16(base, *topleft));
29 
30   __m128i mask1 = _mm_cmpgt_epi16(pl, pt);
31   mask1 = _mm_or_si128(mask1, _mm_cmpgt_epi16(pl, ptl));
32   __m128i mask2 = _mm_cmpgt_epi16(pt, ptl);
33 
34   pl = _mm_andnot_si128(mask1, *left);
35 
36   ptl = _mm_and_si128(mask2, *topleft);
37   pt = _mm_andnot_si128(mask2, *top);
38   pt = _mm_or_si128(pt, ptl);
39   pt = _mm_and_si128(mask1, pt);
40 
41   return _mm_or_si128(pl, pt);
42 }
43 
// 4x4 Paeth intra predictor: each row broadcasts one left pixel and runs the
// 8-wide Paeth kernel against the 4 above pixels (only 4 lanes are stored).
void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  // rep selects (broadcasts) byte i of `l` into every 16-bit lane; the high
  // bit of the other shuffle-control byte zeroes the upper half of each lane.
  __m128i rep = _mm_set1_epi16((short)0x8000);
  const __m128i one = _mm_set1_epi16(1);

  for (int i = 0; i < 4; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    // Store 4 bytes with memcpy: writing through a casted uint32_t* violates
    // strict aliasing and assumes an alignment dst does not guarantee.
    const int row32 = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
    memcpy(dst, &row32, 4);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}
64 
// 4x8 Paeth intra predictor; see paeth_8x1_pred for the per-row kernel.
void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  // Shuffle control broadcasting left[i] into every 16-bit lane (see 4x4).
  __m128i rep = _mm_set1_epi16((short)0x8000);
  const __m128i one = _mm_set1_epi16(1);

  for (int i = 0; i < 8; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    // memcpy avoids the strict-aliasing/alignment UB of *(uint32_t *)dst.
    const int row32 = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
    memcpy(dst, &row32, 4);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}
85 
// 4x16 Paeth intra predictor; 16 left pixels, 4 above pixels.
void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_load_si128((const __m128i *)left);
  // Load the 4 above bytes with memcpy: dereferencing a casted uint32_t*
  // violates strict aliasing and may be misaligned.
  uint32_t above32;
  memcpy(&above32, above, 4);
  const __m128i t = _mm_cvtsi32_si128((int)above32);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16((short)0x8000);
  const __m128i one = _mm_set1_epi16(1);

  for (int i = 0; i < 16; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    // memcpy store for the same aliasing/alignment reason as the load above.
    const int row32 = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
    memcpy(dst, &row32, 4);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}
105 
// 8x4 Paeth intra predictor.
void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  const __m128i left8 = _mm_loadl_epi64((const __m128i *)left);
  const __m128i top8 = _mm_loadl_epi64((const __m128i *)above);
  const __m128i top16 = _mm_unpacklo_epi8(top8, _mm_setzero_si128());
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);
  // Shuffle control that broadcasts left[r] into every 16-bit lane.
  __m128i rep = _mm_set1_epi16((short)0x8000);

  for (int r = 0; r < 4; ++r) {
    const __m128i l16 = _mm_shuffle_epi8(left8, rep);
    const __m128i row = paeth_8x1_pred(&l16, &top16, &tl16);
    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}
126 
// 8x8 Paeth intra predictor.
void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  const __m128i left8 = _mm_loadl_epi64((const __m128i *)left);
  const __m128i top8 = _mm_loadl_epi64((const __m128i *)above);
  const __m128i top16 = _mm_unpacklo_epi8(top8, _mm_setzero_si128());
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);
  // Broadcast-control for left[r]; incremented by one each row.
  __m128i rep = _mm_set1_epi16((short)0x8000);

  for (int r = 0; r < 8; ++r) {
    const __m128i l16 = _mm_shuffle_epi8(left8, rep);
    const __m128i row = paeth_8x1_pred(&l16, &top16, &tl16);
    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}
147 
// 8x16 Paeth intra predictor.
void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m128i left16 = _mm_load_si128((const __m128i *)left);
  const __m128i top8 = _mm_loadl_epi64((const __m128i *)above);
  const __m128i top16 = _mm_unpacklo_epi8(top8, _mm_setzero_si128());
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);
  // Broadcast-control for left[r]; incremented by one each row.
  __m128i rep = _mm_set1_epi16((short)0x8000);

  for (int r = 0; r < 16; ++r) {
    const __m128i l16 = _mm_shuffle_epi8(left16, rep);
    const __m128i row = paeth_8x1_pred(&l16, &top16, &tl16);
    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}
168 
// 8x32 Paeth intra predictor: two 16-row passes, one per 16-byte left chunk.
void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m128i top8 = _mm_loadl_epi64((const __m128i *)above);
  const __m128i top16 = _mm_unpacklo_epi8(top8, _mm_setzero_si128());
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);

  for (int half = 0; half < 2; ++half) {
    const __m128i left16 = _mm_load_si128((const __m128i *)(left + half * 16));
    __m128i rep = _mm_set1_epi16((short)0x8000);
    for (int r = 0; r < 16; ++r) {
      const __m128i l16 = _mm_shuffle_epi8(left16, rep);
      const __m128i row = paeth_8x1_pred(&l16, &top16, &tl16);
      _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
      dst += stride;
      rep = _mm_add_epi16(rep, one);
    }
  }
}
190 
191 // Return 16 8-bit pixels in one row
paeth_16x1_pred(const __m128i * left,const __m128i * top0,const __m128i * top1,const __m128i * topleft)192 static INLINE __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0,
193                                       const __m128i *top1,
194                                       const __m128i *topleft) {
195   const __m128i p0 = paeth_8x1_pred(left, top0, topleft);
196   const __m128i p1 = paeth_8x1_pred(left, top1, topleft);
197   return _mm_packus_epi16(p0, p1);
198 }
199 
// 16x4 Paeth intra predictor.
void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  // Load the 4 left bytes with memcpy: dereferencing a casted uint32_t*
  // violates strict aliasing and may be misaligned.
  uint32_t left32;
  memcpy(&left32, left, 4);
  __m128i l = _mm_cvtsi32_si128((int)left32);
  const __m128i t = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  // Shuffle control broadcasting left[i] into every 16-bit lane.
  __m128i rep = _mm_set1_epi16((short)0x8000);
  const __m128i one = _mm_set1_epi16(1);

  for (int i = 0; i < 4; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}
220 
// 16x8 Paeth intra predictor.
void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m128i left8 = _mm_loadl_epi64((const __m128i *)left);
  const __m128i top = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top_lo = _mm_unpacklo_epi8(top, zero);
  const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);
  // Broadcast-control for left[r]; incremented by one each row.
  __m128i rep = _mm_set1_epi16((short)0x8000);

  for (int r = 0; r < 8; ++r) {
    const __m128i l16 = _mm_shuffle_epi8(left8, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top_lo, &top_hi, &tl16);
    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}
242 
// 16x16 Paeth intra predictor.
void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i left16 = _mm_load_si128((const __m128i *)left);
  const __m128i top = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top_lo = _mm_unpacklo_epi8(top, zero);
  const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);
  // Broadcast-control for left[r]; incremented by one each row.
  __m128i rep = _mm_set1_epi16((short)0x8000);

  for (int r = 0; r < 16; ++r) {
    const __m128i l16 = _mm_shuffle_epi8(left16, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top_lo, &top_hi, &tl16);
    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}
265 
// 16x32 Paeth intra predictor: two 16-row halves, one per left chunk.
void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i top = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top_lo = _mm_unpacklo_epi8(top, zero);
  const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);

  for (int half = 0; half < 2; ++half) {
    const __m128i left16 = _mm_load_si128((const __m128i *)(left + half * 16));
    __m128i rep = _mm_set1_epi16((short)0x8000);
    for (int r = 0; r < 16; ++r) {
      const __m128i l16 = _mm_shuffle_epi8(left16, rep);
      const __m128i row = paeth_16x1_pred(&l16, &top_lo, &top_hi, &tl16);
      _mm_store_si128((__m128i *)dst, row);
      dst += stride;
      rep = _mm_add_epi16(rep, one);
    }
  }
}
300 
// 16x64 Paeth intra predictor: four 16-row passes, one per left chunk.
void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i top = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top_lo = _mm_unpacklo_epi8(top, zero);
  const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);

  for (int chunk = 0; chunk < 4; ++chunk) {
    const __m128i left16 =
        _mm_load_si128((const __m128i *)(left + chunk * 16));
    __m128i rep = _mm_set1_epi16((short)0x8000);
    for (int r = 0; r < 16; ++r) {
      const __m128i l16 = _mm_shuffle_epi8(left16, rep);
      const __m128i row = paeth_16x1_pred(&l16, &top_lo, &top_hi, &tl16);
      _mm_store_si128((__m128i *)dst, row);
      dst += stride;
      rep = _mm_add_epi16(rep, one);
    }
  }
}
323 
// 32x8 Paeth intra predictor: two 16-pixel lanes of `above` per row.
void aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i t0 = _mm_load_si128((const __m128i *)above);
  const __m128i t1 = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i t0lo = _mm_unpacklo_epi8(t0, zero);
  const __m128i t0hi = _mm_unpackhi_epi8(t0, zero);
  const __m128i t1lo = _mm_unpacklo_epi8(t1, zero);
  const __m128i t1hi = _mm_unpackhi_epi8(t1, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  const __m128i left8 = _mm_loadl_epi64((const __m128i *)left);
  const __m128i one = _mm_set1_epi16(1);
  __m128i rep = _mm_set1_epi16((short)0x8000);

  for (int r = 0; r < 8; ++r) {
    const __m128i l16 = _mm_shuffle_epi8(left8, rep);
    const __m128i out_lo = paeth_16x1_pred(&l16, &t0lo, &t0hi, &tl16);
    const __m128i out_hi = paeth_16x1_pred(&l16, &t1lo, &t1hi, &tl16);
    _mm_store_si128((__m128i *)dst, out_lo);
    _mm_store_si128((__m128i *)(dst + 16), out_hi);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}
351 
// 32x16 Paeth intra predictor.
void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i t0 = _mm_load_si128((const __m128i *)above);
  const __m128i t1 = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i t0lo = _mm_unpacklo_epi8(t0, zero);
  const __m128i t0hi = _mm_unpackhi_epi8(t0, zero);
  const __m128i t1lo = _mm_unpacklo_epi8(t1, zero);
  const __m128i t1hi = _mm_unpackhi_epi8(t1, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  const __m128i left16 = _mm_load_si128((const __m128i *)left);
  const __m128i one = _mm_set1_epi16(1);
  __m128i rep = _mm_set1_epi16((short)0x8000);

  for (int r = 0; r < 16; ++r) {
    const __m128i l16 = _mm_shuffle_epi8(left16, rep);
    const __m128i out_lo = paeth_16x1_pred(&l16, &t0lo, &t0hi, &tl16);
    const __m128i out_hi = paeth_16x1_pred(&l16, &t1lo, &t1hi, &tl16);
    _mm_store_si128((__m128i *)dst, out_lo);
    _mm_store_si128((__m128i *)(dst + 16), out_hi);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}
381 
// 32x32 Paeth intra predictor: two 16-row halves, one per left chunk.
void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i t0 = _mm_load_si128((const __m128i *)above);
  const __m128i t1 = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i t0lo = _mm_unpacklo_epi8(t0, zero);
  const __m128i t0hi = _mm_unpackhi_epi8(t0, zero);
  const __m128i t1lo = _mm_unpacklo_epi8(t1, zero);
  const __m128i t1hi = _mm_unpackhi_epi8(t1, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);

  for (int half = 0; half < 2; ++half) {
    const __m128i left16 = _mm_load_si128((const __m128i *)(left + half * 16));
    __m128i rep = _mm_set1_epi16((short)0x8000);
    for (int r = 0; r < 16; ++r) {
      const __m128i l16 = _mm_shuffle_epi8(left16, rep);
      const __m128i out_lo = paeth_16x1_pred(&l16, &t0lo, &t0hi, &tl16);
      const __m128i out_hi = paeth_16x1_pred(&l16, &t1lo, &t1hi, &tl16);
      _mm_store_si128((__m128i *)dst, out_lo);
      _mm_store_si128((__m128i *)(dst + 16), out_hi);
      dst += stride;
      rep = _mm_add_epi16(rep, one);
    }
  }
}
424 
// 32x64 Paeth intra predictor: four 16-row passes, one per left chunk.
void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i t0 = _mm_load_si128((const __m128i *)above);
  const __m128i t1 = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i t0lo = _mm_unpacklo_epi8(t0, zero);
  const __m128i t0hi = _mm_unpackhi_epi8(t0, zero);
  const __m128i t1lo = _mm_unpacklo_epi8(t1, zero);
  const __m128i t1hi = _mm_unpackhi_epi8(t1, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);

  for (int chunk = 0; chunk < 4; ++chunk) {
    const __m128i left16 =
        _mm_load_si128((const __m128i *)(left + chunk * 16));
    __m128i rep = _mm_set1_epi16((short)0x8000);
    for (int r = 0; r < 16; ++r) {
      const __m128i l16 = _mm_shuffle_epi8(left16, rep);
      const __m128i out_lo = paeth_16x1_pred(&l16, &t0lo, &t0hi, &tl16);
      const __m128i out_hi = paeth_16x1_pred(&l16, &t1lo, &t1hi, &tl16);
      _mm_store_si128((__m128i *)dst, out_lo);
      _mm_store_si128((__m128i *)(dst + 16), out_hi);
      dst += stride;
      rep = _mm_add_epi16(rep, one);
    }
  }
}
456 
// 64x32 Paeth intra predictor: four 16-pixel lanes of `above` per row,
// two 16-row passes over the left column.
void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i t0 = _mm_load_si128((const __m128i *)above);
  const __m128i t1 = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i t2 = _mm_load_si128((const __m128i *)(above + 32));
  const __m128i t3 = _mm_load_si128((const __m128i *)(above + 48));
  const __m128i t0lo = _mm_unpacklo_epi8(t0, zero);
  const __m128i t0hi = _mm_unpackhi_epi8(t0, zero);
  const __m128i t1lo = _mm_unpacklo_epi8(t1, zero);
  const __m128i t1hi = _mm_unpackhi_epi8(t1, zero);
  const __m128i t2lo = _mm_unpacklo_epi8(t2, zero);
  const __m128i t2hi = _mm_unpackhi_epi8(t2, zero);
  const __m128i t3lo = _mm_unpacklo_epi8(t3, zero);
  const __m128i t3hi = _mm_unpackhi_epi8(t3, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);

  for (int chunk = 0; chunk < 2; ++chunk) {
    const __m128i left16 =
        _mm_load_si128((const __m128i *)(left + chunk * 16));
    __m128i rep = _mm_set1_epi16((short)0x8000);
    for (int r = 0; r < 16; ++r) {
      const __m128i l16 = _mm_shuffle_epi8(left16, rep);
      const __m128i o0 = paeth_16x1_pred(&l16, &t0lo, &t0hi, &tl16);
      const __m128i o1 = paeth_16x1_pred(&l16, &t1lo, &t1hi, &tl16);
      const __m128i o2 = paeth_16x1_pred(&l16, &t2lo, &t2hi, &tl16);
      const __m128i o3 = paeth_16x1_pred(&l16, &t3lo, &t3hi, &tl16);
      _mm_store_si128((__m128i *)dst, o0);
      _mm_store_si128((__m128i *)(dst + 16), o1);
      _mm_store_si128((__m128i *)(dst + 32), o2);
      _mm_store_si128((__m128i *)(dst + 48), o3);
      dst += stride;
      rep = _mm_add_epi16(rep, one);
    }
  }
}
498 
// 64x64 Paeth intra predictor: four 16-pixel lanes of `above` per row,
// four 16-row passes over the left column.
void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i t0 = _mm_load_si128((const __m128i *)above);
  const __m128i t1 = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i t2 = _mm_load_si128((const __m128i *)(above + 32));
  const __m128i t3 = _mm_load_si128((const __m128i *)(above + 48));
  const __m128i t0lo = _mm_unpacklo_epi8(t0, zero);
  const __m128i t0hi = _mm_unpackhi_epi8(t0, zero);
  const __m128i t1lo = _mm_unpacklo_epi8(t1, zero);
  const __m128i t1hi = _mm_unpackhi_epi8(t1, zero);
  const __m128i t2lo = _mm_unpacklo_epi8(t2, zero);
  const __m128i t2hi = _mm_unpackhi_epi8(t2, zero);
  const __m128i t3lo = _mm_unpacklo_epi8(t3, zero);
  const __m128i t3hi = _mm_unpackhi_epi8(t3, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);

  for (int chunk = 0; chunk < 4; ++chunk) {
    const __m128i left16 =
        _mm_load_si128((const __m128i *)(left + chunk * 16));
    __m128i rep = _mm_set1_epi16((short)0x8000);
    for (int r = 0; r < 16; ++r) {
      const __m128i l16 = _mm_shuffle_epi8(left16, rep);
      const __m128i o0 = paeth_16x1_pred(&l16, &t0lo, &t0hi, &tl16);
      const __m128i o1 = paeth_16x1_pred(&l16, &t1lo, &t1hi, &tl16);
      const __m128i o2 = paeth_16x1_pred(&l16, &t2lo, &t2hi, &tl16);
      const __m128i o3 = paeth_16x1_pred(&l16, &t3lo, &t3hi, &tl16);
      _mm_store_si128((__m128i *)dst, o0);
      _mm_store_si128((__m128i *)(dst + 16), o1);
      _mm_store_si128((__m128i *)(dst + 32), o2);
      _mm_store_si128((__m128i *)(dst + 48), o3);
      dst += stride;
      rep = _mm_add_epi16(rep, one);
    }
  }
}
540 
// 64x16 Paeth intra predictor: four 16-pixel lanes of `above`, one 16-row
// pass over the left column.
void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i t0 = _mm_load_si128((const __m128i *)above);
  const __m128i t1 = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i t2 = _mm_load_si128((const __m128i *)(above + 32));
  const __m128i t3 = _mm_load_si128((const __m128i *)(above + 48));
  const __m128i t0lo = _mm_unpacklo_epi8(t0, zero);
  const __m128i t0hi = _mm_unpackhi_epi8(t0, zero);
  const __m128i t1lo = _mm_unpacklo_epi8(t1, zero);
  const __m128i t1hi = _mm_unpackhi_epi8(t1, zero);
  const __m128i t2lo = _mm_unpacklo_epi8(t2, zero);
  const __m128i t2hi = _mm_unpackhi_epi8(t2, zero);
  const __m128i t3lo = _mm_unpacklo_epi8(t3, zero);
  const __m128i t3hi = _mm_unpackhi_epi8(t3, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);
  const __m128i left16 = _mm_load_si128((const __m128i *)left);
  __m128i rep = _mm_set1_epi16((short)0x8000);

  for (int r = 0; r < 16; ++r) {
    const __m128i l16 = _mm_shuffle_epi8(left16, rep);
    const __m128i o0 = paeth_16x1_pred(&l16, &t0lo, &t0hi, &tl16);
    const __m128i o1 = paeth_16x1_pred(&l16, &t1lo, &t1hi, &tl16);
    const __m128i o2 = paeth_16x1_pred(&l16, &t2lo, &t2hi, &tl16);
    const __m128i o3 = paeth_16x1_pred(&l16, &t3lo, &t3hi, &tl16);
    _mm_store_si128((__m128i *)dst, o0);
    _mm_store_si128((__m128i *)(dst + 16), o1);
    _mm_store_si128((__m128i *)(dst + 32), o2);
    _mm_store_si128((__m128i *)(dst + 48), o3);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}
580 
581 // -----------------------------------------------------------------------------
582 // SMOOTH_PRED
583 
584 // pixels[0]: above and below_pred interleave vector
585 // pixels[1]: left vector
586 // pixels[2]: right_pred vector
load_pixel_w4(const uint8_t * above,const uint8_t * left,int height,__m128i * pixels)587 static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
588                                  int height, __m128i *pixels) {
589   __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
590   if (height == 4)
591     pixels[1] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
592   else if (height == 8)
593     pixels[1] = _mm_loadl_epi64(((const __m128i *)left));
594   else
595     pixels[1] = _mm_loadu_si128(((const __m128i *)left));
596 
597   pixels[2] = _mm_set1_epi16((uint16_t)above[3]);
598 
599   const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
600   const __m128i zero = _mm_setzero_si128();
601   d = _mm_unpacklo_epi8(d, zero);
602   pixels[0] = _mm_unpacklo_epi16(d, bp);
603 }
604 
605 // weight_h[0]: weight_h vector
606 // weight_h[1]: scale - weight_h vector
607 // weight_h[2]: same as [0], second half for height = 16 only
608 // weight_h[3]: same as [1], second half for height = 16 only
609 // weight_w[0]: weights_w and scale - weights_w interleave vector
load_weight_w4(const uint8_t * weight_array,int height,__m128i * weight_h,__m128i * weight_w)610 static INLINE void load_weight_w4(const uint8_t *weight_array, int height,
611                                   __m128i *weight_h, __m128i *weight_w) {
612   const __m128i zero = _mm_setzero_si128();
613   const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
614   const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]);
615   weight_h[0] = _mm_unpacklo_epi8(t, zero);
616   weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
617   weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
618 
619   if (height == 8) {
620     const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]);
621     weight_h[0] = _mm_unpacklo_epi8(weight, zero);
622     weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
623   } else if (height == 16) {
624     const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
625     weight_h[0] = _mm_unpacklo_epi8(weight, zero);
626     weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
627     weight_h[2] = _mm_unpackhi_epi8(weight, zero);
628     weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
629   }
630 }
631 
smooth_pred_4xh(const __m128i * pixel,const __m128i * wh,const __m128i * ww,int h,uint8_t * dst,ptrdiff_t stride,int second_half)632 static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *wh,
633                                    const __m128i *ww, int h, uint8_t *dst,
634                                    ptrdiff_t stride, int second_half) {
635   const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
636   const __m128i one = _mm_set1_epi16(1);
637   const __m128i inc = _mm_set1_epi16(0x202);
638   const __m128i gat = _mm_set1_epi32(0xc080400);
639   __m128i rep = second_half ? _mm_set1_epi16((short)0x8008)
640                             : _mm_set1_epi16((short)0x8000);
641   __m128i d = _mm_set1_epi16(0x100);
642 
643   for (int i = 0; i < h; ++i) {
644     const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
645     const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
646     const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
647     __m128i s = _mm_madd_epi16(pixel[0], wh_sc);
648 
649     __m128i b = _mm_shuffle_epi8(pixel[1], rep);
650     b = _mm_unpacklo_epi16(b, pixel[2]);
651     __m128i sum = _mm_madd_epi16(b, ww[0]);
652 
653     sum = _mm_add_epi32(s, sum);
654     sum = _mm_add_epi32(sum, round);
655     sum = _mm_srai_epi32(sum, 1 + sm_weight_log2_scale);
656 
657     sum = _mm_shuffle_epi8(sum, gat);
658     *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
659     dst += stride;
660 
661     rep = _mm_add_epi16(rep, one);
662     d = _mm_add_epi16(d, inc);
663   }
664 }
665 
// 4x4 SMOOTH intra predictor: load pixels and weights, then emit 4 rows.
void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i pixels[3];
  __m128i wh[4], ww[2];

  load_pixel_w4(above, left, 4, pixels);
  load_weight_w4(sm_weight_arrays, 4, wh, ww);
  smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0);
}
676 
// 4x8 SMOOTH intra predictor: load pixels and weights, then emit 8 rows.
void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i pixels[3];
  __m128i wh[4], ww[2];

  load_pixel_w4(above, left, 8, pixels);
  load_weight_w4(sm_weight_arrays, 8, wh, ww);
  smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
}
687 
// 4x16 SMOOTH intra predictor: emitted as two 8-row halves; the second half
// uses weight_h[2..3] and the upper 8 left pixels (second_half = 1).
void aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m128i pixels[3];
  __m128i wh[4], ww[2];

  load_pixel_w4(above, left, 16, pixels);
  load_weight_w4(sm_weight_arrays, 16, wh, ww);

  smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
  smooth_pred_4xh(pixels, &wh[2], ww, 8, dst + (stride << 3), stride, 1);
}
701 
// Load and pre-interleave the w=8 source pixels for the SMOOTH predictor.
// pixels[0]: above and below_pred interleave vector, first half
// pixels[1]: above and below_pred interleave vector, second half
// pixels[2]: left vector
// pixels[3]: right_pred vector
// pixels[4]: above and below_pred interleave vector, first half
// pixels[5]: above and below_pred interleave vector, second half
// pixels[6]: left vector + 16
// pixels[7]: right_pred vector
static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left,
                                 int height, __m128i *pixels) {
  const __m128i zero = _mm_setzero_si128();
  // below_pred: the bottom-left pixel, replicated into every 16-bit lane.
  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
  __m128i d = _mm_loadl_epi64((const __m128i *)above);
  // Widen the 8 above pixels to 16 bits, then pair each with below_pred so a
  // single madd can compute the vertical blend later.
  d = _mm_unpacklo_epi8(d, zero);
  pixels[0] = _mm_unpacklo_epi16(d, bp);
  pixels[1] = _mm_unpackhi_epi16(d, bp);

  // right_pred: the top-right pixel (above[7]), replicated.
  pixels[3] = _mm_set1_epi16((uint16_t)above[7]);

  if (height == 4) {
    pixels[2] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
  } else if (height == 8) {
    pixels[2] = _mm_loadl_epi64((const __m128i *)left);
  } else if (height == 16) {
    // NOTE(review): aligned load — assumes `left` is 16-byte aligned for
    // heights >= 16; confirm against the callers' reference buffers.
    pixels[2] = _mm_load_si128((const __m128i *)left);
  } else {
    // height == 32: the above/right vectors are reused for the lower 16 rows;
    // only the left column needs a second load.
    pixels[2] = _mm_load_si128((const __m128i *)left);
    pixels[4] = pixels[0];
    pixels[5] = pixels[1];
    pixels[6] = _mm_load_si128((const __m128i *)(left + 16));
    pixels[7] = pixels[3];
  }
}
735 
// Build the height and width weight vectors for the w=8 SMOOTH predictor.
// Weights for a dimension of size n start at offset n in weight_array.
// weight_h[0]: weight_h vector
// weight_h[1]: scale - weight_h vector
// weight_h[2]: same as [0], offset 8
// weight_h[3]: same as [1], offset 8
// weight_h[4]: same as [0], offset 16
// weight_h[5]: same as [1], offset 16
// weight_h[6]: same as [0], offset 24
// weight_h[7]: same as [1], offset 24
// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
static INLINE void load_weight_w8(const uint8_t *weight_array, int height,
                                  __m128i *weight_h, __m128i *weight_w) {
  const __m128i zero = _mm_setzero_si128();
  const int we_offset = height < 8 ? 4 : 8;
  __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[we_offset]);
  weight_h[0] = _mm_unpacklo_epi8(we, zero);
  // d = scale (1 << sm_weight_log2_scale), the weight normalization constant.
  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
  weight_h[1] = _mm_sub_epi16(d, weight_h[0]);

  if (height == 4) {
    // The width-8 weights start 4 bytes past the height-4 weights (offset 8).
    we = _mm_srli_si128(we, 4);
    __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
    __m128i tmp2 = _mm_sub_epi16(d, tmp1);
    weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
    weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
  } else {
    // height >= 8: the width weights coincide with the first 8 height
    // weights already loaded above.
    weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
    weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
  }

  if (height == 16) {
    we = _mm_loadu_si128((const __m128i *)&weight_array[16]);
    weight_h[0] = _mm_unpacklo_epi8(we, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(we, zero);
    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
  } else if (height == 32) {
    // 32 height weights span two 16-byte loads.
    const __m128i weight_lo =
        _mm_loadu_si128((const __m128i *)&weight_array[32]);
    weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
    const __m128i weight_hi =
        _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
    weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
    weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
    weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
    weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
  }
}
787 
// Emit h rows of an 8-wide SMOOTH block from the vectors prepared by
// load_pixel_w8()/load_weight_w8().  second_half selects left pixels 8..15
// when a tall block is produced as consecutive 8-row pieces.
static INLINE void smooth_pred_8xh(const __m128i *pixels, const __m128i *wh,
                                   const __m128i *ww, int h, uint8_t *dst,
                                   ptrdiff_t stride, int second_half) {
  // The vertical and horizontal blends are summed, doubling the weight
  // scale, hence the rounding constant and the (1 + scale) shift below.
  const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
  const __m128i one = _mm_set1_epi16(1);
  // Per-row increment for the byte-pair shuffle index held in d.
  const __m128i inc = _mm_set1_epi16(0x202);
  // Shuffle mask gathering the low byte of each 16-bit lane into 8 bytes.
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);

  // rep broadcasts left pixel i (the 0x80 high byte zeroes the upper byte of
  // each 16-bit lane in the shuffle result); start at index 0 or 8.
  __m128i rep = second_half ? _mm_set1_epi16((short)0x8008)
                            : _mm_set1_epi16((short)0x8000);
  // d selects row i's 16-bit height weight (byte pair 2i, 2i+1).
  __m128i d = _mm_set1_epi16(0x100);

  int i;
  for (i = 0; i < h; ++i) {
    // Vertical blend: above * w_h + below_pred * (scale - w_h).
    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
    __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
    __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);

    // Horizontal blend: left[i] * w_w + right_pred * (scale - w_w).
    __m128i b = _mm_shuffle_epi8(pixels[2], rep);
    b = _mm_unpacklo_epi16(b, pixels[3]);
    __m128i sum0 = _mm_madd_epi16(b, ww[0]);
    __m128i sum1 = _mm_madd_epi16(b, ww[1]);

    // Combine both blends, round, and shift back to pixel range.
    s0 = _mm_add_epi32(s0, sum0);
    s0 = _mm_add_epi32(s0, round);
    s0 = _mm_srai_epi32(s0, 1 + sm_weight_log2_scale);

    s1 = _mm_add_epi32(s1, sum1);
    s1 = _mm_add_epi32(s1, round);
    s1 = _mm_srai_epi32(s1, 1 + sm_weight_log2_scale);

    sum0 = _mm_packus_epi16(s0, s1);
    sum0 = _mm_shuffle_epi8(sum0, gat);
    _mm_storel_epi64((__m128i *)dst, sum0);
    dst += stride;

    rep = _mm_add_epi16(rep, one);
    d = _mm_add_epi16(d, inc);
  }
}
830 
aom_smooth_predictor_8x4_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)831 void aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
832                                     const uint8_t *above, const uint8_t *left) {
833   __m128i pixels[4];
834   load_pixel_w8(above, left, 4, pixels);
835 
836   __m128i wh[4], ww[2];
837   load_weight_w8(sm_weight_arrays, 4, wh, ww);
838 
839   smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0);
840 }
841 
aom_smooth_predictor_8x8_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)842 void aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
843                                     const uint8_t *above, const uint8_t *left) {
844   __m128i pixels[4];
845   load_pixel_w8(above, left, 8, pixels);
846 
847   __m128i wh[4], ww[2];
848   load_weight_w8(sm_weight_arrays, 8, wh, ww);
849 
850   smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
851 }
852 
aom_smooth_predictor_8x16_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)853 void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
854                                      const uint8_t *above,
855                                      const uint8_t *left) {
856   __m128i pixels[4];
857   load_pixel_w8(above, left, 16, pixels);
858 
859   __m128i wh[4], ww[2];
860   load_weight_w8(sm_weight_arrays, 16, wh, ww);
861 
862   smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
863   dst += stride << 3;
864   smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);
865 }
866 
aom_smooth_predictor_8x32_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)867 void aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
868                                      const uint8_t *above,
869                                      const uint8_t *left) {
870   __m128i pixels[8];
871   load_pixel_w8(above, left, 32, pixels);
872 
873   __m128i wh[8], ww[2];
874   load_weight_w8(sm_weight_arrays, 32, wh, ww);
875 
876   smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0);
877   dst += stride << 3;
878   smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1);
879   dst += stride << 3;
880   smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0);
881   dst += stride << 3;
882   smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1);
883 }
884 
// Generic SMOOTH predictor for bw x bh blocks, 8 output pixels per inner
// iteration.  Each pixel sums a vertical blend (above[x] vs. bottom-left)
// and a horizontal blend (left[y] vs. top-right) using sm_weight_arrays
// weights, then rounds and shifts by (1 + sm_weight_log2_scale).
static INLINE void smooth_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left, uint32_t bw,
                                        uint32_t bh) {
  // Weights for a dimension of size n start at offset n in the table.
  const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
  const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
  const __m128i zero = _mm_setzero_si128();
  const __m128i scale_value =
      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
  const __m128i bottom_left = _mm_cvtsi32_si128((uint32_t)left[bh - 1]);
  // Shuffle mask replicating the low 16-bit lane across the register.
  const __m128i dup16 = _mm_set1_epi32(0x01000100);
  const __m128i top_right =
      _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)above[bw - 1]), dup16);
  // Gathers the low byte of each 16-bit lane into 8 contiguous bytes.
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
  const __m128i round = _mm_set1_epi32((uint16_t)(1 << sm_weight_log2_scale));

  for (uint32_t y = 0; y < bh; ++y) {
    const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
    const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
    const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
    // Row-invariant term: (scale - w_y) * bottom_left, plus rounding,
    // broadcast to all four 32-bit lanes.
    __m128i pred_scaled_bl = _mm_mullo_epi16(scale_m_weights_y, bottom_left);
    // wl_y interleaves (w_y, left[y]) for the madd below.
    const __m128i wl_y =
        _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0);
    pred_scaled_bl = _mm_add_epi32(pred_scaled_bl, round);
    pred_scaled_bl = _mm_shuffle_epi32(pred_scaled_bl, 0);

    for (uint32_t x = 0; x < bw; x += 8) {
      const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x));
      const __m128i weights_x =
          _mm_loadl_epi64((const __m128i *)(sm_weights_w + x));
      // Interleave (above[x], w_x) bytes and widen to 16 bits so one madd
      // yields above[x] * w_y + w_x * left[y] per pixel.
      const __m128i tw_x = _mm_unpacklo_epi8(top_x, weights_x);
      const __m128i tw_x_lo = _mm_unpacklo_epi8(tw_x, zero);
      const __m128i tw_x_hi = _mm_unpackhi_epi8(tw_x, zero);

      __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y);
      __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y);

      // Remaining horizontal term: (scale - w_x) * top_right, widened to 32
      // bits before the accumulation.
      const __m128i scale_m_weights_x =
          _mm_sub_epi16(scale_value, _mm_unpacklo_epi8(weights_x, zero));
      const __m128i swxtr = _mm_mullo_epi16(scale_m_weights_x, top_right);
      const __m128i swxtr_lo = _mm_unpacklo_epi16(swxtr, zero);
      const __m128i swxtr_hi = _mm_unpackhi_epi16(swxtr, zero);

      pred_lo = _mm_add_epi32(pred_lo, pred_scaled_bl);
      pred_hi = _mm_add_epi32(pred_hi, pred_scaled_bl);

      pred_lo = _mm_add_epi32(pred_lo, swxtr_lo);
      pred_hi = _mm_add_epi32(pred_hi, swxtr_hi);

      pred_lo = _mm_srai_epi32(pred_lo, (1 + sm_weight_log2_scale));
      pred_hi = _mm_srai_epi32(pred_hi, (1 + sm_weight_log2_scale));

      __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
      pred = _mm_shuffle_epi8(pred, gat);
      _mm_storel_epi64((__m128i *)(dst + x), pred);
    }
    dst += stride;
  }
}
944 
// All remaining SMOOTH sizes (width >= 16) dispatch to the generic kernel.
void aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 4);
}

void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 8);
}

void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 16);
}

void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 32);
}

void aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 32, 8);
}

void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 32, 16);
}

void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 32, 32);
}

void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 32, 64);
}

void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 64, 64);
}

void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 64, 32);
}

void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 64, 16);
}

void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 64);
}
1016 
1017 // -----------------------------------------------------------------------------
1018 // SMOOTH_V_PRED
1019 
// Load the w=4 source pixels for the SMOOTH_V predictor.
// pixels[0]: above and below_pred interleave vector
static INLINE void load_pixel_v_w4(const uint8_t *above, const uint8_t *left,
                                   int height, __m128i *pixels) {
  const __m128i zero = _mm_setzero_si128();
  __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
  // below_pred: the bottom-left pixel, replicated into every 16-bit lane.
  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
  // Widen the 4 above pixels and pair each with below_pred for madd.
  d = _mm_unpacklo_epi8(d, zero);
  pixels[0] = _mm_unpacklo_epi16(d, bp);
}
1029 
// Build the height weight vectors for the w=4 SMOOTH_V predictor.
// Weights for size n start at offset n in weight_array.
// weights[0]: weights_h vector
// weights[1]: scale - weights_h vector
// (height == 16 additionally fills weights[2]/[3] with the upper 8 rows.)
static INLINE void load_weight_v_w4(const uint8_t *weight_array, int height,
                                    __m128i *weights) {
  const __m128i zero = _mm_setzero_si128();
  // d = scale (1 << sm_weight_log2_scale).
  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));

  if (height == 4) {
    // Read the 4 bytes at offset 4.
    const __m128i weight =
        _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]);
    weights[0] = _mm_unpacklo_epi8(weight, zero);
    weights[1] = _mm_sub_epi16(d, weights[0]);
  } else if (height == 8) {
    const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]);
    weights[0] = _mm_unpacklo_epi8(weight, zero);
    weights[1] = _mm_sub_epi16(d, weights[0]);
  } else {
    const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
    weights[0] = _mm_unpacklo_epi8(weight, zero);
    weights[1] = _mm_sub_epi16(d, weights[0]);
    weights[2] = _mm_unpackhi_epi8(weight, zero);
    weights[3] = _mm_sub_epi16(d, weights[2]);
  }
}
1054 
// Emit h rows of a 4-wide SMOOTH_V block: each row blends the above row with
// below_pred as (above * w + below * (scale - w) + round) >> scale_log2.
static INLINE void smooth_v_pred_4xh(const __m128i *pixel,
                                     const __m128i *weight, int h, uint8_t *dst,
                                     ptrdiff_t stride) {
  // Single blend pass, so the shift is sm_weight_log2_scale (not +1).
  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
  // Per-row increment for the byte-pair shuffle index held in d.
  const __m128i inc = _mm_set1_epi16(0x202);
  // Gathers the low byte of each 32-bit result into 4 output bytes.
  const __m128i gat = _mm_set1_epi32(0xc080400);
  // d selects row i's 16-bit height weight (byte pair 2i, 2i+1).
  __m128i d = _mm_set1_epi16(0x100);

  for (int i = 0; i < h; ++i) {
    const __m128i wg_wg = _mm_shuffle_epi8(weight[0], d);
    const __m128i sc_sc = _mm_shuffle_epi8(weight[1], d);
    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
    __m128i sum = _mm_madd_epi16(pixel[0], wh_sc);
    sum = _mm_add_epi32(sum, pred_round);
    sum = _mm_srai_epi32(sum, sm_weight_log2_scale);
    sum = _mm_shuffle_epi8(sum, gat);
    *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
    dst += stride;
    d = _mm_add_epi16(d, inc);
  }
}
1076 
aom_smooth_v_predictor_4x4_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1077 void aom_smooth_v_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
1078                                       const uint8_t *above,
1079                                       const uint8_t *left) {
1080   __m128i pixels;
1081   load_pixel_v_w4(above, left, 4, &pixels);
1082 
1083   __m128i weights[2];
1084   load_weight_v_w4(sm_weight_arrays, 4, weights);
1085 
1086   smooth_v_pred_4xh(&pixels, weights, 4, dst, stride);
1087 }
1088 
aom_smooth_v_predictor_4x8_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1089 void aom_smooth_v_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
1090                                       const uint8_t *above,
1091                                       const uint8_t *left) {
1092   __m128i pixels;
1093   load_pixel_v_w4(above, left, 8, &pixels);
1094 
1095   __m128i weights[2];
1096   load_weight_v_w4(sm_weight_arrays, 8, weights);
1097 
1098   smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
1099 }
1100 
aom_smooth_v_predictor_4x16_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1101 void aom_smooth_v_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
1102                                        const uint8_t *above,
1103                                        const uint8_t *left) {
1104   __m128i pixels;
1105   load_pixel_v_w4(above, left, 16, &pixels);
1106 
1107   __m128i weights[4];
1108   load_weight_v_w4(sm_weight_arrays, 16, weights);
1109 
1110   smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
1111   dst += stride << 3;
1112   smooth_v_pred_4xh(&pixels, &weights[2], 8, dst, stride);
1113 }
1114 
// Load the w=8 source pixels for the SMOOTH_V predictor.
// pixels[0]: above and below_pred interleave vector, first half
// pixels[1]: above and below_pred interleave vector, second half
static INLINE void load_pixel_v_w8(const uint8_t *above, const uint8_t *left,
                                   int height, __m128i *pixels) {
  const __m128i zero = _mm_setzero_si128();
  __m128i d = _mm_loadl_epi64((const __m128i *)above);
  // below_pred: the bottom-left pixel, replicated into every 16-bit lane.
  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
  // Widen the 8 above pixels and pair each with below_pred for madd.
  d = _mm_unpacklo_epi8(d, zero);
  pixels[0] = _mm_unpacklo_epi16(d, bp);
  pixels[1] = _mm_unpackhi_epi16(d, bp);
}
1126 
// Build the height weight vectors for the w=8 SMOOTH_V predictor.
// Weights for size n start at offset n in weight_array.
// weight_h[0]: weight_h vector
// weight_h[1]: scale - weight_h vector
// weight_h[2]: same as [0], offset 8
// weight_h[3]: same as [1], offset 8
// weight_h[4]: same as [0], offset 16
// weight_h[5]: same as [1], offset 16
// weight_h[6]: same as [0], offset 24
// weight_h[7]: same as [1], offset 24
static INLINE void load_weight_v_w8(const uint8_t *weight_array, int height,
                                    __m128i *weight_h) {
  const __m128i zero = _mm_setzero_si128();
  // d = scale (1 << sm_weight_log2_scale).
  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));

  if (height < 16) {
    const int offset = height < 8 ? 4 : 8;
    const __m128i weight =
        _mm_loadu_si128((const __m128i *)&weight_array[offset]);
    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
  } else if (height == 16) {
    const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(weight, zero);
    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
  } else {
    // height == 32: the 32 weights span two 16-byte loads.
    const __m128i weight_lo =
        _mm_loadu_si128((const __m128i *)&weight_array[32]);
    weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
    const __m128i weight_hi =
        _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
    weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
    weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
    weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
    weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
  }
}
1167 
// Emit h rows of an 8-wide SMOOTH_V block from the prepared pixel/weight
// vectors: above * w_h + below_pred * (scale - w_h), rounded and scaled.
static INLINE void smooth_v_pred_8xh(const __m128i *pixels, const __m128i *wh,
                                     int h, uint8_t *dst, ptrdiff_t stride) {
  // Single blend pass, so the shift is sm_weight_log2_scale (not +1).
  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
  // Per-row increment for the byte-pair shuffle index held in d.
  const __m128i inc = _mm_set1_epi16(0x202);
  // Gathers the low byte of each 16-bit lane into 8 output bytes.
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
  // d selects row i's 16-bit height weight (byte pair 2i, 2i+1).
  __m128i d = _mm_set1_epi16(0x100);

  for (int i = 0; i < h; ++i) {
    // Interleave (w, scale - w) for row i so one madd does the blend.
    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
    __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
    __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);

    s0 = _mm_add_epi32(s0, pred_round);
    s0 = _mm_srai_epi32(s0, sm_weight_log2_scale);

    s1 = _mm_add_epi32(s1, pred_round);
    s1 = _mm_srai_epi32(s1, sm_weight_log2_scale);

    __m128i sum01 = _mm_packus_epi16(s0, s1);
    sum01 = _mm_shuffle_epi8(sum01, gat);
    _mm_storel_epi64((__m128i *)dst, sum01);
    dst += stride;

    d = _mm_add_epi16(d, inc);
  }
}
1196 
aom_smooth_v_predictor_8x4_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1197 void aom_smooth_v_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
1198                                       const uint8_t *above,
1199                                       const uint8_t *left) {
1200   __m128i pixels[2];
1201   load_pixel_v_w8(above, left, 4, pixels);
1202 
1203   __m128i wh[2];
1204   load_weight_v_w8(sm_weight_arrays, 4, wh);
1205 
1206   smooth_v_pred_8xh(pixels, wh, 4, dst, stride);
1207 }
1208 
aom_smooth_v_predictor_8x8_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1209 void aom_smooth_v_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
1210                                       const uint8_t *above,
1211                                       const uint8_t *left) {
1212   __m128i pixels[2];
1213   load_pixel_v_w8(above, left, 8, pixels);
1214 
1215   __m128i wh[2];
1216   load_weight_v_w8(sm_weight_arrays, 8, wh);
1217 
1218   smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
1219 }
1220 
aom_smooth_v_predictor_8x16_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1221 void aom_smooth_v_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
1222                                        const uint8_t *above,
1223                                        const uint8_t *left) {
1224   __m128i pixels[2];
1225   load_pixel_v_w8(above, left, 16, pixels);
1226 
1227   __m128i wh[4];
1228   load_weight_v_w8(sm_weight_arrays, 16, wh);
1229 
1230   smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
1231   dst += stride << 3;
1232   smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride);
1233 }
1234 
aom_smooth_v_predictor_8x32_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1235 void aom_smooth_v_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
1236                                        const uint8_t *above,
1237                                        const uint8_t *left) {
1238   __m128i pixels[2];
1239   load_pixel_v_w8(above, left, 32, pixels);
1240 
1241   __m128i wh[8];
1242   load_weight_v_w8(sm_weight_arrays, 32, wh);
1243 
1244   smooth_v_pred_8xh(pixels, &wh[0], 8, dst, stride);
1245   dst += stride << 3;
1246   smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride);
1247   dst += stride << 3;
1248   smooth_v_pred_8xh(pixels, &wh[4], 8, dst, stride);
1249   dst += stride << 3;
1250   smooth_v_pred_8xh(pixels, &wh[6], 8, dst, stride);
1251 }
1252 
// Generic SMOOTH_V predictor for bw x bh blocks, 8 output pixels per inner
// iteration: above[x] * w_y + bottom_left * (scale - w_y), rounded and
// shifted by sm_weight_log2_scale.
static INLINE void smooth_v_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *above,
                                          const uint8_t *left, uint32_t bw,
                                          uint32_t bh) {
  // Height weights for size bh start at offset bh in the table.
  const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
  const __m128i zero = _mm_setzero_si128();
  const __m128i scale_value =
      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
  // Shuffle mask replicating the low 16-bit lane across the register.
  const __m128i dup16 = _mm_set1_epi32(0x01000100);
  const __m128i bottom_left =
      _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)left[bh - 1]), dup16);
  // Gathers the low byte of each 16-bit lane into 8 contiguous bytes.
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
  const __m128i round =
      _mm_set1_epi32((uint16_t)(1 << (sm_weight_log2_scale - 1)));

  for (uint32_t y = 0; y < bh; ++y) {
    const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
    const __m128i scale_m_weights_y =
        _mm_shuffle_epi8(_mm_sub_epi16(scale_value, weights_y), dup16);
    // wl_y interleaves (w_y, bottom_left) for the madd below.
    const __m128i wl_y =
        _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, bottom_left), 0);

    for (uint32_t x = 0; x < bw; x += 8) {
      const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x));
      // 8 -> 16
      const __m128i tw_x = _mm_unpacklo_epi8(top_x, zero);
      const __m128i tw_x_lo = _mm_unpacklo_epi16(tw_x, scale_m_weights_y);
      const __m128i tw_x_hi = _mm_unpackhi_epi16(tw_x, scale_m_weights_y);
      // top_x * weights_y + scale_m_weights_y * bottom_left
      __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y);
      __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y);

      pred_lo = _mm_add_epi32(pred_lo, round);
      pred_hi = _mm_add_epi32(pred_hi, round);
      pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale);
      pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale);

      __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
      pred = _mm_shuffle_epi8(pred, gat);
      _mm_storel_epi64((__m128i *)(dst + x), pred);
    }
    dst += stride;
  }
}
1297 
aom_smooth_v_predictor_16x4_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1298 void aom_smooth_v_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
1299                                        const uint8_t *above,
1300                                        const uint8_t *left) {
1301   smooth_v_predictor_wxh(dst, stride, above, left, 16, 4);
1302 }
1303 
aom_smooth_v_predictor_16x8_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1304 void aom_smooth_v_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
1305                                        const uint8_t *above,
1306                                        const uint8_t *left) {
1307   smooth_v_predictor_wxh(dst, stride, above, left, 16, 8);
1308 }
1309 
aom_smooth_v_predictor_16x16_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1310 void aom_smooth_v_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
1311                                         const uint8_t *above,
1312                                         const uint8_t *left) {
1313   smooth_v_predictor_wxh(dst, stride, above, left, 16, 16);
1314 }
1315 
aom_smooth_v_predictor_16x32_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1316 void aom_smooth_v_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
1317                                         const uint8_t *above,
1318                                         const uint8_t *left) {
1319   smooth_v_predictor_wxh(dst, stride, above, left, 16, 32);
1320 }
1321 
// 32x8 SMOOTH_V prediction: delegate to the generic wxh kernel.
void aom_smooth_v_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 32, 8);
}
1327 
// 32x16 SMOOTH_V prediction: delegate to the generic wxh kernel.
void aom_smooth_v_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 32, 16);
}
1333 
// 32x32 SMOOTH_V prediction: delegate to the generic wxh kernel.
void aom_smooth_v_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 32, 32);
}
1339 
// 32x64 SMOOTH_V prediction: delegate to the generic wxh kernel.
void aom_smooth_v_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 32, 64);
}
1345 
// 64x64 SMOOTH_V prediction: delegate to the generic wxh kernel.
void aom_smooth_v_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 64, 64);
}
1351 
// 64x32 SMOOTH_V prediction: delegate to the generic wxh kernel.
void aom_smooth_v_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 64, 32);
}
1357 
// 64x16 SMOOTH_V prediction: delegate to the generic wxh kernel.
void aom_smooth_v_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 64, 16);
}
1363 
// 16x64 SMOOTH_V prediction: delegate to the generic wxh kernel.
void aom_smooth_v_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 16, 64);
}
1369 
1370 // -----------------------------------------------------------------------------
1371 // SMOOTH_H_PRED
1372 
1373 // pixels[0]: left vector
1374 // pixels[1]: right_pred vector
load_pixel_h_w4(const uint8_t * above,const uint8_t * left,int height,__m128i * pixels)1375 static INLINE void load_pixel_h_w4(const uint8_t *above, const uint8_t *left,
1376                                    int height, __m128i *pixels) {
1377   if (height == 4)
1378     pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
1379   else if (height == 8)
1380     pixels[0] = _mm_loadl_epi64(((const __m128i *)left));
1381   else
1382     pixels[0] = _mm_loadu_si128(((const __m128i *)left));
1383   pixels[1] = _mm_set1_epi16((uint16_t)above[3]);
1384 }
1385 
1386 // weights[0]: weights_w and scale - weights_w interleave vector
load_weight_h_w4(const uint8_t * weight_array,int height,__m128i * weights)1387 static INLINE void load_weight_h_w4(const uint8_t *weight_array, int height,
1388                                     __m128i *weights) {
1389   (void)height;
1390   const __m128i t = _mm_loadu_si128((const __m128i *)&weight_array[4]);
1391   const __m128i zero = _mm_setzero_si128();
1392 
1393   const __m128i weights_0 = _mm_unpacklo_epi8(t, zero);
1394   const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
1395   const __m128i weights_1 = _mm_sub_epi16(d, weights_0);
1396   weights[0] = _mm_unpacklo_epi16(weights_0, weights_1);
1397 }
1398 
// Compute h rows of width-4 SMOOTH_H prediction.
// pixel[0] holds the left-column bytes, pixel[1] the replicated top-right
// pixel; weight[0] holds interleaved (w, scale - w) pairs (see
// load_weight_h_w4).
static INLINE void smooth_h_pred_4xh(const __m128i *pixel,
                                     const __m128i *weight, int h, uint8_t *dst,
                                     ptrdiff_t stride) {
  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
  const __m128i one = _mm_set1_epi16(1);
  // pshufb control gathering byte 0 of each 32-bit sum into the low dword.
  const __m128i gat = _mm_set1_epi32(0xc080400);
  // pshufb control 0x8000 per 16-bit lane: low byte selects left[row], high
  // byte (0x80) zeroes — i.e. broadcast left[row] zero-extended to 16 bits.
  // Adding 1 each iteration advances the selected byte to the next row.
  __m128i rep = _mm_set1_epi16((short)0x8000);

  for (int i = 0; i < h; ++i) {
    __m128i b = _mm_shuffle_epi8(pixel[0], rep);
    // Interleave to (left, right) pairs; madd with (w, scale - w) gives
    // left * w + right * (scale - w) per 32-bit lane.
    b = _mm_unpacklo_epi16(b, pixel[1]);
    __m128i sum = _mm_madd_epi16(b, weight[0]);

    sum = _mm_add_epi32(sum, pred_round);
    sum = _mm_srai_epi32(sum, sm_weight_log2_scale);

    // Results are weighted averages of two 8-bit pixels, so byte 0 of each
    // lane carries the full value; pack the 4 output bytes and store.
    sum = _mm_shuffle_epi8(sum, gat);
    *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
    dst += stride;

    rep = _mm_add_epi16(rep, one);
  }
}
1422 
// 4x4 SMOOTH_H prediction.
void aom_smooth_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_h_w4(above, left, 4, pixels);

  __m128i weights;
  load_weight_h_w4(sm_weight_arrays, 4, &weights);

  smooth_h_pred_4xh(pixels, &weights, 4, dst, stride);
}
1434 
// 4x8 SMOOTH_H prediction.
void aom_smooth_h_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_h_w4(above, left, 8, pixels);

  __m128i weights;
  load_weight_h_w4(sm_weight_arrays, 8, &weights);

  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
}
1446 
// 4x16 SMOOTH_H prediction, done as two 4x8 passes: the kernel's pshufb row
// counter only reaches 8 rows per register half, so after the first 8 rows
// shift the next 8 left pixels down into the low half and run again.
// (The height argument to load_weight_h_w4 is ignored, so passing 8 is fine.)
void aom_smooth_h_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_h_w4(above, left, 16, pixels);

  __m128i weights;
  load_weight_h_w4(sm_weight_arrays, 8, &weights);

  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
  dst += stride << 3;  // advance 8 rows

  // Expose left[8..15] to the row-broadcast shuffle.
  pixels[0] = _mm_srli_si128(pixels[0], 8);
  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
}
1462 
1463 // pixels[0]: left vector
1464 // pixels[1]: right_pred vector
1465 // pixels[2]: left vector + 16
1466 // pixels[3]: right_pred vector
load_pixel_h_w8(const uint8_t * above,const uint8_t * left,int height,__m128i * pixels)1467 static INLINE void load_pixel_h_w8(const uint8_t *above, const uint8_t *left,
1468                                    int height, __m128i *pixels) {
1469   pixels[1] = _mm_set1_epi16((uint16_t)above[7]);
1470 
1471   if (height == 4) {
1472     pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
1473   } else if (height == 8) {
1474     pixels[0] = _mm_loadl_epi64((const __m128i *)left);
1475   } else if (height == 16) {
1476     pixels[0] = _mm_load_si128((const __m128i *)left);
1477   } else {
1478     pixels[0] = _mm_load_si128((const __m128i *)left);
1479     pixels[2] = _mm_load_si128((const __m128i *)(left + 16));
1480     pixels[3] = pixels[1];
1481   }
1482 }
1483 
1484 // weight_w[0]: weights_w and scale - weights_w interleave vector, first half
1485 // weight_w[1]: weights_w and scale - weights_w interleave vector, second half
load_weight_h_w8(const uint8_t * weight_array,int height,__m128i * weight_w)1486 static INLINE void load_weight_h_w8(const uint8_t *weight_array, int height,
1487                                     __m128i *weight_w) {
1488   (void)height;
1489   const __m128i zero = _mm_setzero_si128();
1490   const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
1491   const __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[8]);
1492   const __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
1493   const __m128i tmp2 = _mm_sub_epi16(d, tmp1);
1494   weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
1495   weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
1496 }
1497 
// Compute h rows of width-8 SMOOTH_H prediction.
// pixels[0] holds left-column bytes, pixels[1] the replicated top-right
// pixel; ww holds interleaved (w, scale - w) pairs (see load_weight_h_w8).
// second_half selects rows 8..15 of pixels[0] as the left source.
static INLINE void smooth_h_pred_8xh(const __m128i *pixels, const __m128i *ww,
                                     int h, uint8_t *dst, ptrdiff_t stride,
                                     int second_half) {
  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
  const __m128i one = _mm_set1_epi16(1);
  // pshufb control gathering the low byte of each 16-bit lane after packus.
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
  // Row-broadcast pshufb control: low byte selects left[row] (byte 0 or byte
  // 8 depending on second_half), high byte (0x80) zeroes. Adding 1 per
  // iteration advances to the next row's byte.
  __m128i rep = second_half ? _mm_set1_epi16((short)0x8008)
                            : _mm_set1_epi16((short)0x8000);

  for (int i = 0; i < h; ++i) {
    __m128i b = _mm_shuffle_epi8(pixels[0], rep);
    // (left, right) pairs madd'ed against (w, scale - w) pairs:
    // left * w + right * (scale - w) per 32-bit lane, 4 pixels per madd.
    b = _mm_unpacklo_epi16(b, pixels[1]);
    __m128i sum0 = _mm_madd_epi16(b, ww[0]);
    __m128i sum1 = _mm_madd_epi16(b, ww[1]);

    sum0 = _mm_add_epi32(sum0, pred_round);
    sum0 = _mm_srai_epi32(sum0, sm_weight_log2_scale);

    sum1 = _mm_add_epi32(sum1, pred_round);
    sum1 = _mm_srai_epi32(sum1, sm_weight_log2_scale);

    // Each 32-bit sum fits in a byte; packus leaves the values in the even
    // byte positions, and gat compacts them into the low 8 bytes.
    sum0 = _mm_packus_epi16(sum0, sum1);
    sum0 = _mm_shuffle_epi8(sum0, gat);
    _mm_storel_epi64((__m128i *)dst, sum0);
    dst += stride;

    rep = _mm_add_epi16(rep, one);
  }
}
1527 
// 8x4 SMOOTH_H prediction.
void aom_smooth_h_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_h_w8(above, left, 4, pixels);

  __m128i ww[2];
  load_weight_h_w8(sm_weight_arrays, 4, ww);

  smooth_h_pred_8xh(pixels, ww, 4, dst, stride, 0);
}
1539 
// 8x8 SMOOTH_H prediction.
void aom_smooth_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_h_w8(above, left, 8, pixels);

  __m128i ww[2];
  load_weight_h_w8(sm_weight_arrays, 8, ww);

  smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
}
1551 
// 8x16 SMOOTH_H prediction: two 8-row passes over the same left vector,
// the second pass reading bytes 8..15 (second_half = 1).
void aom_smooth_h_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_h_w8(above, left, 16, pixels);

  __m128i ww[2];
  load_weight_h_w8(sm_weight_arrays, 16, ww);

  smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
  dst += stride << 3;  // advance 8 rows
  smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 1);
}
1565 
// 8x32 SMOOTH_H prediction: four 8-row passes — rows 0..15 from pixels[0]
// (both halves), rows 16..31 from pixels[2] (both halves).
void aom_smooth_h_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  __m128i pixels[4];
  load_pixel_h_w8(above, left, 32, pixels);

  __m128i ww[2];
  load_weight_h_w8(sm_weight_arrays, 32, ww);

  smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 0);
  dst += stride << 3;  // advance 8 rows
  smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 1);
  dst += stride << 3;
  smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 1);
}
1583 
// Generic SMOOTH_H kernel for bw x bh blocks, 8 output pixels per inner
// iteration; bw must be a multiple of 8 (the x loop steps by 8 and stores
// 8 bytes). For each pixel: pred = (scale - w[x]) * top_right + w[x] * left[y],
// rounded and shifted back by sm_weight_log2_scale.
static INLINE void smooth_h_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *above,
                                          const uint8_t *left, uint32_t bw,
                                          uint32_t bh) {
  // The horizontal weight table for width bw starts at offset bw.
  const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
  const __m128i zero = _mm_setzero_si128();
  const __m128i scale_value =
      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
  const __m128i top_right = _mm_cvtsi32_si128((uint32_t)above[bw - 1]);
  // pshufb control gathering the low byte of each 16-bit lane after packus.
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));

  for (uint32_t y = 0; y < bh; ++y) {
    const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
    // Broadcast the 16-bit pair (top_right, left[y]) to all four 32-bit
    // lanes, ready for _mm_madd_epi16 against (scale - w, w) weight pairs.
    const __m128i tr_ly =
        _mm_shuffle_epi32(_mm_unpacklo_epi16(top_right, left_y), 0);

    for (uint32_t x = 0; x < bw; x += 8) {
      const __m128i weights_x =
          _mm_loadl_epi64((const __m128i *)(sm_weights_w + x));
      const __m128i weights_xw = _mm_unpacklo_epi8(weights_x, zero);
      const __m128i scale_m_weights_x = _mm_sub_epi16(scale_value, weights_xw);
      const __m128i wx_lo = _mm_unpacklo_epi16(scale_m_weights_x, weights_xw);
      const __m128i wx_hi = _mm_unpackhi_epi16(scale_m_weights_x, weights_xw);
      // madd: top_right * (scale - w[x]) + left[y] * w[x] per 32-bit lane.
      __m128i pred_lo = _mm_madd_epi16(wx_lo, tr_ly);
      __m128i pred_hi = _mm_madd_epi16(wx_hi, tr_ly);

      pred_lo = _mm_add_epi32(pred_lo, pred_round);
      pred_hi = _mm_add_epi32(pred_hi, pred_round);

      pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale);
      pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale);

      // Each sum fits in a byte; pack and compact into the low 8 bytes.
      __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
      pred = _mm_shuffle_epi8(pred, gat);
      _mm_storel_epi64((__m128i *)(dst + x), pred);
    }
    dst += stride;
  }
}
1624 
// 16x4 SMOOTH_H prediction: delegate to the generic wxh kernel.
void aom_smooth_h_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 16, 4);
}
1630 
// 16x8 SMOOTH_H prediction: delegate to the generic wxh kernel.
void aom_smooth_h_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 16, 8);
}
1636 
// 16x16 SMOOTH_H prediction: delegate to the generic wxh kernel.
void aom_smooth_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 16, 16);
}
1642 
// 16x32 SMOOTH_H prediction: delegate to the generic wxh kernel.
void aom_smooth_h_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 16, 32);
}
1648 
// 16x64 SMOOTH_H prediction: delegate to the generic wxh kernel.
void aom_smooth_h_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 16, 64);
}
1654 
// 32x8 SMOOTH_H prediction: delegate to the generic wxh kernel.
void aom_smooth_h_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 32, 8);
}
1660 
// 32x16 SMOOTH_H prediction: delegate to the generic wxh kernel.
void aom_smooth_h_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 32, 16);
}
1666 
// 32x32 SMOOTH_H prediction: delegate to the generic wxh kernel.
void aom_smooth_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 32, 32);
}
1672 
// 32x64 SMOOTH_H prediction: delegate to the generic wxh kernel.
void aom_smooth_h_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 32, 64);
}
1678 
// 64x64 SMOOTH_H prediction: delegate to the generic wxh kernel.
void aom_smooth_h_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 64, 64);
}
1684 
// 64x32 SMOOTH_H prediction: delegate to the generic wxh kernel.
void aom_smooth_h_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 64, 32);
}
1690 
// 64x16 SMOOTH_H prediction: delegate to the generic wxh kernel.
void aom_smooth_h_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 64, 16);
}
1696