1 /*
2 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
#include <string.h>
#include <tmmintrin.h>

#include "config/aom_dsp_rtcd.h"

#include "aom_dsp/intrapred_common.h"
17
18 // -----------------------------------------------------------------------------
19 // PAETH_PRED
20
21 // Return 8 16-bit pixels in one row
paeth_8x1_pred(const __m128i * left,const __m128i * top,const __m128i * topleft)22 static INLINE __m128i paeth_8x1_pred(const __m128i *left, const __m128i *top,
23 const __m128i *topleft) {
24 const __m128i base = _mm_sub_epi16(_mm_add_epi16(*top, *left), *topleft);
25
26 __m128i pl = _mm_abs_epi16(_mm_sub_epi16(base, *left));
27 __m128i pt = _mm_abs_epi16(_mm_sub_epi16(base, *top));
28 __m128i ptl = _mm_abs_epi16(_mm_sub_epi16(base, *topleft));
29
30 __m128i mask1 = _mm_cmpgt_epi16(pl, pt);
31 mask1 = _mm_or_si128(mask1, _mm_cmpgt_epi16(pl, ptl));
32 __m128i mask2 = _mm_cmpgt_epi16(pt, ptl);
33
34 pl = _mm_andnot_si128(mask1, *left);
35
36 ptl = _mm_and_si128(mask2, *topleft);
37 pt = _mm_andnot_si128(mask2, *top);
38 pt = _mm_or_si128(pt, ptl);
39 pt = _mm_and_si128(mask1, pt);
40
41 return _mm_or_si128(pl, pt);
42 }
43
aom_paeth_predictor_4x4_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)44 void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
45 const uint8_t *above, const uint8_t *left) {
46 __m128i l = _mm_loadl_epi64((const __m128i *)left);
47 const __m128i t = _mm_loadl_epi64((const __m128i *)above);
48 const __m128i zero = _mm_setzero_si128();
49 const __m128i t16 = _mm_unpacklo_epi8(t, zero);
50 const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
51 __m128i rep = _mm_set1_epi16((short)0x8000);
52 const __m128i one = _mm_set1_epi16(1);
53
54 int i;
55 for (i = 0; i < 4; ++i) {
56 const __m128i l16 = _mm_shuffle_epi8(l, rep);
57 const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
58
59 *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
60 dst += stride;
61 rep = _mm_add_epi16(rep, one);
62 }
63 }
64
aom_paeth_predictor_4x8_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)65 void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
66 const uint8_t *above, const uint8_t *left) {
67 __m128i l = _mm_loadl_epi64((const __m128i *)left);
68 const __m128i t = _mm_loadl_epi64((const __m128i *)above);
69 const __m128i zero = _mm_setzero_si128();
70 const __m128i t16 = _mm_unpacklo_epi8(t, zero);
71 const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
72 __m128i rep = _mm_set1_epi16((short)0x8000);
73 const __m128i one = _mm_set1_epi16(1);
74
75 int i;
76 for (i = 0; i < 8; ++i) {
77 const __m128i l16 = _mm_shuffle_epi8(l, rep);
78 const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
79
80 *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
81 dst += stride;
82 rep = _mm_add_epi16(rep, one);
83 }
84 }
85
aom_paeth_predictor_4x16_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)86 void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
87 const uint8_t *above, const uint8_t *left) {
88 __m128i l = _mm_load_si128((const __m128i *)left);
89 const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
90 const __m128i zero = _mm_setzero_si128();
91 const __m128i t16 = _mm_unpacklo_epi8(t, zero);
92 const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
93 __m128i rep = _mm_set1_epi16((short)0x8000);
94 const __m128i one = _mm_set1_epi16(1);
95
96 for (int i = 0; i < 16; ++i) {
97 const __m128i l16 = _mm_shuffle_epi8(l, rep);
98 const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
99
100 *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
101 dst += stride;
102 rep = _mm_add_epi16(rep, one);
103 }
104 }
105
aom_paeth_predictor_8x4_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)106 void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
107 const uint8_t *above, const uint8_t *left) {
108 __m128i l = _mm_loadl_epi64((const __m128i *)left);
109 const __m128i t = _mm_loadl_epi64((const __m128i *)above);
110 const __m128i zero = _mm_setzero_si128();
111 const __m128i t16 = _mm_unpacklo_epi8(t, zero);
112 const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
113 __m128i rep = _mm_set1_epi16((short)0x8000);
114 const __m128i one = _mm_set1_epi16(1);
115
116 int i;
117 for (i = 0; i < 4; ++i) {
118 const __m128i l16 = _mm_shuffle_epi8(l, rep);
119 const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
120
121 _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
122 dst += stride;
123 rep = _mm_add_epi16(rep, one);
124 }
125 }
126
aom_paeth_predictor_8x8_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)127 void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
128 const uint8_t *above, const uint8_t *left) {
129 __m128i l = _mm_loadl_epi64((const __m128i *)left);
130 const __m128i t = _mm_loadl_epi64((const __m128i *)above);
131 const __m128i zero = _mm_setzero_si128();
132 const __m128i t16 = _mm_unpacklo_epi8(t, zero);
133 const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
134 __m128i rep = _mm_set1_epi16((short)0x8000);
135 const __m128i one = _mm_set1_epi16(1);
136
137 int i;
138 for (i = 0; i < 8; ++i) {
139 const __m128i l16 = _mm_shuffle_epi8(l, rep);
140 const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
141
142 _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
143 dst += stride;
144 rep = _mm_add_epi16(rep, one);
145 }
146 }
147
aom_paeth_predictor_8x16_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)148 void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
149 const uint8_t *above, const uint8_t *left) {
150 __m128i l = _mm_load_si128((const __m128i *)left);
151 const __m128i t = _mm_loadl_epi64((const __m128i *)above);
152 const __m128i zero = _mm_setzero_si128();
153 const __m128i t16 = _mm_unpacklo_epi8(t, zero);
154 const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
155 __m128i rep = _mm_set1_epi16((short)0x8000);
156 const __m128i one = _mm_set1_epi16(1);
157
158 int i;
159 for (i = 0; i < 16; ++i) {
160 const __m128i l16 = _mm_shuffle_epi8(l, rep);
161 const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
162
163 _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
164 dst += stride;
165 rep = _mm_add_epi16(rep, one);
166 }
167 }
168
aom_paeth_predictor_8x32_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)169 void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
170 const uint8_t *above, const uint8_t *left) {
171 const __m128i t = _mm_loadl_epi64((const __m128i *)above);
172 const __m128i zero = _mm_setzero_si128();
173 const __m128i t16 = _mm_unpacklo_epi8(t, zero);
174 const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
175 const __m128i one = _mm_set1_epi16(1);
176
177 for (int j = 0; j < 2; ++j) {
178 const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
179 __m128i rep = _mm_set1_epi16((short)0x8000);
180 for (int i = 0; i < 16; ++i) {
181 const __m128i l16 = _mm_shuffle_epi8(l, rep);
182 const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
183
184 _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
185 dst += stride;
186 rep = _mm_add_epi16(rep, one);
187 }
188 }
189 }
190
191 // Return 16 8-bit pixels in one row
paeth_16x1_pred(const __m128i * left,const __m128i * top0,const __m128i * top1,const __m128i * topleft)192 static INLINE __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0,
193 const __m128i *top1,
194 const __m128i *topleft) {
195 const __m128i p0 = paeth_8x1_pred(left, top0, topleft);
196 const __m128i p1 = paeth_8x1_pred(left, top1, topleft);
197 return _mm_packus_epi16(p0, p1);
198 }
199
aom_paeth_predictor_16x4_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)200 void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
201 const uint8_t *above, const uint8_t *left) {
202 __m128i l = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
203 const __m128i t = _mm_load_si128((const __m128i *)above);
204 const __m128i zero = _mm_setzero_si128();
205 const __m128i top0 = _mm_unpacklo_epi8(t, zero);
206 const __m128i top1 = _mm_unpackhi_epi8(t, zero);
207 const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
208 __m128i rep = _mm_set1_epi16((short)0x8000);
209 const __m128i one = _mm_set1_epi16(1);
210
211 for (int i = 0; i < 4; ++i) {
212 const __m128i l16 = _mm_shuffle_epi8(l, rep);
213 const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
214
215 _mm_store_si128((__m128i *)dst, row);
216 dst += stride;
217 rep = _mm_add_epi16(rep, one);
218 }
219 }
220
aom_paeth_predictor_16x8_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)221 void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
222 const uint8_t *above, const uint8_t *left) {
223 __m128i l = _mm_loadl_epi64((const __m128i *)left);
224 const __m128i t = _mm_load_si128((const __m128i *)above);
225 const __m128i zero = _mm_setzero_si128();
226 const __m128i top0 = _mm_unpacklo_epi8(t, zero);
227 const __m128i top1 = _mm_unpackhi_epi8(t, zero);
228 const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
229 __m128i rep = _mm_set1_epi16((short)0x8000);
230 const __m128i one = _mm_set1_epi16(1);
231
232 int i;
233 for (i = 0; i < 8; ++i) {
234 const __m128i l16 = _mm_shuffle_epi8(l, rep);
235 const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
236
237 _mm_store_si128((__m128i *)dst, row);
238 dst += stride;
239 rep = _mm_add_epi16(rep, one);
240 }
241 }
242
aom_paeth_predictor_16x16_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)243 void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
244 const uint8_t *above,
245 const uint8_t *left) {
246 __m128i l = _mm_load_si128((const __m128i *)left);
247 const __m128i t = _mm_load_si128((const __m128i *)above);
248 const __m128i zero = _mm_setzero_si128();
249 const __m128i top0 = _mm_unpacklo_epi8(t, zero);
250 const __m128i top1 = _mm_unpackhi_epi8(t, zero);
251 const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
252 __m128i rep = _mm_set1_epi16((short)0x8000);
253 const __m128i one = _mm_set1_epi16(1);
254
255 int i;
256 for (i = 0; i < 16; ++i) {
257 const __m128i l16 = _mm_shuffle_epi8(l, rep);
258 const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
259
260 _mm_store_si128((__m128i *)dst, row);
261 dst += stride;
262 rep = _mm_add_epi16(rep, one);
263 }
264 }
265
aom_paeth_predictor_16x32_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)266 void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
267 const uint8_t *above,
268 const uint8_t *left) {
269 __m128i l = _mm_load_si128((const __m128i *)left);
270 const __m128i t = _mm_load_si128((const __m128i *)above);
271 const __m128i zero = _mm_setzero_si128();
272 const __m128i top0 = _mm_unpacklo_epi8(t, zero);
273 const __m128i top1 = _mm_unpackhi_epi8(t, zero);
274 const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
275 __m128i rep = _mm_set1_epi16((short)0x8000);
276 const __m128i one = _mm_set1_epi16(1);
277 __m128i l16;
278
279 int i;
280 for (i = 0; i < 16; ++i) {
281 l16 = _mm_shuffle_epi8(l, rep);
282 const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
283
284 _mm_store_si128((__m128i *)dst, row);
285 dst += stride;
286 rep = _mm_add_epi16(rep, one);
287 }
288
289 l = _mm_load_si128((const __m128i *)(left + 16));
290 rep = _mm_set1_epi16((short)0x8000);
291 for (i = 0; i < 16; ++i) {
292 l16 = _mm_shuffle_epi8(l, rep);
293 const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
294
295 _mm_store_si128((__m128i *)dst, row);
296 dst += stride;
297 rep = _mm_add_epi16(rep, one);
298 }
299 }
300
aom_paeth_predictor_16x64_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)301 void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
302 const uint8_t *above,
303 const uint8_t *left) {
304 const __m128i t = _mm_load_si128((const __m128i *)above);
305 const __m128i zero = _mm_setzero_si128();
306 const __m128i top0 = _mm_unpacklo_epi8(t, zero);
307 const __m128i top1 = _mm_unpackhi_epi8(t, zero);
308 const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
309 const __m128i one = _mm_set1_epi16(1);
310
311 for (int j = 0; j < 4; ++j) {
312 const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
313 __m128i rep = _mm_set1_epi16((short)0x8000);
314 for (int i = 0; i < 16; ++i) {
315 const __m128i l16 = _mm_shuffle_epi8(l, rep);
316 const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
317 _mm_store_si128((__m128i *)dst, row);
318 dst += stride;
319 rep = _mm_add_epi16(rep, one);
320 }
321 }
322 }
323
aom_paeth_predictor_32x8_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)324 void aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
325 const uint8_t *above, const uint8_t *left) {
326 const __m128i a = _mm_load_si128((const __m128i *)above);
327 const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
328 const __m128i zero = _mm_setzero_si128();
329 const __m128i al = _mm_unpacklo_epi8(a, zero);
330 const __m128i ah = _mm_unpackhi_epi8(a, zero);
331 const __m128i bl = _mm_unpacklo_epi8(b, zero);
332 const __m128i bh = _mm_unpackhi_epi8(b, zero);
333
334 const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
335 __m128i rep = _mm_set1_epi16((short)0x8000);
336 const __m128i one = _mm_set1_epi16(1);
337 const __m128i l = _mm_loadl_epi64((const __m128i *)left);
338 __m128i l16;
339
340 for (int i = 0; i < 8; ++i) {
341 l16 = _mm_shuffle_epi8(l, rep);
342 const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
343 const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
344
345 _mm_store_si128((__m128i *)dst, r32l);
346 _mm_store_si128((__m128i *)(dst + 16), r32h);
347 dst += stride;
348 rep = _mm_add_epi16(rep, one);
349 }
350 }
351
aom_paeth_predictor_32x16_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)352 void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
353 const uint8_t *above,
354 const uint8_t *left) {
355 const __m128i a = _mm_load_si128((const __m128i *)above);
356 const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
357 const __m128i zero = _mm_setzero_si128();
358 const __m128i al = _mm_unpacklo_epi8(a, zero);
359 const __m128i ah = _mm_unpackhi_epi8(a, zero);
360 const __m128i bl = _mm_unpacklo_epi8(b, zero);
361 const __m128i bh = _mm_unpackhi_epi8(b, zero);
362
363 const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
364 __m128i rep = _mm_set1_epi16((short)0x8000);
365 const __m128i one = _mm_set1_epi16(1);
366 __m128i l = _mm_load_si128((const __m128i *)left);
367 __m128i l16;
368
369 int i;
370 for (i = 0; i < 16; ++i) {
371 l16 = _mm_shuffle_epi8(l, rep);
372 const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
373 const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
374
375 _mm_store_si128((__m128i *)dst, r32l);
376 _mm_store_si128((__m128i *)(dst + 16), r32h);
377 dst += stride;
378 rep = _mm_add_epi16(rep, one);
379 }
380 }
381
aom_paeth_predictor_32x32_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)382 void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
383 const uint8_t *above,
384 const uint8_t *left) {
385 const __m128i a = _mm_load_si128((const __m128i *)above);
386 const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
387 const __m128i zero = _mm_setzero_si128();
388 const __m128i al = _mm_unpacklo_epi8(a, zero);
389 const __m128i ah = _mm_unpackhi_epi8(a, zero);
390 const __m128i bl = _mm_unpacklo_epi8(b, zero);
391 const __m128i bh = _mm_unpackhi_epi8(b, zero);
392
393 const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
394 __m128i rep = _mm_set1_epi16((short)0x8000);
395 const __m128i one = _mm_set1_epi16(1);
396 __m128i l = _mm_load_si128((const __m128i *)left);
397 __m128i l16;
398
399 int i;
400 for (i = 0; i < 16; ++i) {
401 l16 = _mm_shuffle_epi8(l, rep);
402 const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
403 const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
404
405 _mm_store_si128((__m128i *)dst, r32l);
406 _mm_store_si128((__m128i *)(dst + 16), r32h);
407 dst += stride;
408 rep = _mm_add_epi16(rep, one);
409 }
410
411 rep = _mm_set1_epi16((short)0x8000);
412 l = _mm_load_si128((const __m128i *)(left + 16));
413 for (i = 0; i < 16; ++i) {
414 l16 = _mm_shuffle_epi8(l, rep);
415 const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
416 const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
417
418 _mm_store_si128((__m128i *)dst, r32l);
419 _mm_store_si128((__m128i *)(dst + 16), r32h);
420 dst += stride;
421 rep = _mm_add_epi16(rep, one);
422 }
423 }
424
aom_paeth_predictor_32x64_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)425 void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
426 const uint8_t *above,
427 const uint8_t *left) {
428 const __m128i a = _mm_load_si128((const __m128i *)above);
429 const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
430 const __m128i zero = _mm_setzero_si128();
431 const __m128i al = _mm_unpacklo_epi8(a, zero);
432 const __m128i ah = _mm_unpackhi_epi8(a, zero);
433 const __m128i bl = _mm_unpacklo_epi8(b, zero);
434 const __m128i bh = _mm_unpackhi_epi8(b, zero);
435
436 const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
437 const __m128i one = _mm_set1_epi16(1);
438 __m128i l16;
439
440 int i, j;
441 for (j = 0; j < 4; ++j) {
442 const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
443 __m128i rep = _mm_set1_epi16((short)0x8000);
444 for (i = 0; i < 16; ++i) {
445 l16 = _mm_shuffle_epi8(l, rep);
446 const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
447 const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
448
449 _mm_store_si128((__m128i *)dst, r32l);
450 _mm_store_si128((__m128i *)(dst + 16), r32h);
451 dst += stride;
452 rep = _mm_add_epi16(rep, one);
453 }
454 }
455 }
456
aom_paeth_predictor_64x32_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)457 void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
458 const uint8_t *above,
459 const uint8_t *left) {
460 const __m128i a = _mm_load_si128((const __m128i *)above);
461 const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
462 const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
463 const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
464 const __m128i zero = _mm_setzero_si128();
465 const __m128i al = _mm_unpacklo_epi8(a, zero);
466 const __m128i ah = _mm_unpackhi_epi8(a, zero);
467 const __m128i bl = _mm_unpacklo_epi8(b, zero);
468 const __m128i bh = _mm_unpackhi_epi8(b, zero);
469 const __m128i cl = _mm_unpacklo_epi8(c, zero);
470 const __m128i ch = _mm_unpackhi_epi8(c, zero);
471 const __m128i dl = _mm_unpacklo_epi8(d, zero);
472 const __m128i dh = _mm_unpackhi_epi8(d, zero);
473
474 const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
475 const __m128i one = _mm_set1_epi16(1);
476 __m128i l16;
477
478 int i, j;
479 for (j = 0; j < 2; ++j) {
480 const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
481 __m128i rep = _mm_set1_epi16((short)0x8000);
482 for (i = 0; i < 16; ++i) {
483 l16 = _mm_shuffle_epi8(l, rep);
484 const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
485 const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
486 const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
487 const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
488
489 _mm_store_si128((__m128i *)dst, r0);
490 _mm_store_si128((__m128i *)(dst + 16), r1);
491 _mm_store_si128((__m128i *)(dst + 32), r2);
492 _mm_store_si128((__m128i *)(dst + 48), r3);
493 dst += stride;
494 rep = _mm_add_epi16(rep, one);
495 }
496 }
497 }
498
aom_paeth_predictor_64x64_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)499 void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
500 const uint8_t *above,
501 const uint8_t *left) {
502 const __m128i a = _mm_load_si128((const __m128i *)above);
503 const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
504 const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
505 const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
506 const __m128i zero = _mm_setzero_si128();
507 const __m128i al = _mm_unpacklo_epi8(a, zero);
508 const __m128i ah = _mm_unpackhi_epi8(a, zero);
509 const __m128i bl = _mm_unpacklo_epi8(b, zero);
510 const __m128i bh = _mm_unpackhi_epi8(b, zero);
511 const __m128i cl = _mm_unpacklo_epi8(c, zero);
512 const __m128i ch = _mm_unpackhi_epi8(c, zero);
513 const __m128i dl = _mm_unpacklo_epi8(d, zero);
514 const __m128i dh = _mm_unpackhi_epi8(d, zero);
515
516 const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
517 const __m128i one = _mm_set1_epi16(1);
518 __m128i l16;
519
520 int i, j;
521 for (j = 0; j < 4; ++j) {
522 const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
523 __m128i rep = _mm_set1_epi16((short)0x8000);
524 for (i = 0; i < 16; ++i) {
525 l16 = _mm_shuffle_epi8(l, rep);
526 const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
527 const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
528 const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
529 const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
530
531 _mm_store_si128((__m128i *)dst, r0);
532 _mm_store_si128((__m128i *)(dst + 16), r1);
533 _mm_store_si128((__m128i *)(dst + 32), r2);
534 _mm_store_si128((__m128i *)(dst + 48), r3);
535 dst += stride;
536 rep = _mm_add_epi16(rep, one);
537 }
538 }
539 }
540
aom_paeth_predictor_64x16_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)541 void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
542 const uint8_t *above,
543 const uint8_t *left) {
544 const __m128i a = _mm_load_si128((const __m128i *)above);
545 const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
546 const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
547 const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
548 const __m128i zero = _mm_setzero_si128();
549 const __m128i al = _mm_unpacklo_epi8(a, zero);
550 const __m128i ah = _mm_unpackhi_epi8(a, zero);
551 const __m128i bl = _mm_unpacklo_epi8(b, zero);
552 const __m128i bh = _mm_unpackhi_epi8(b, zero);
553 const __m128i cl = _mm_unpacklo_epi8(c, zero);
554 const __m128i ch = _mm_unpackhi_epi8(c, zero);
555 const __m128i dl = _mm_unpacklo_epi8(d, zero);
556 const __m128i dh = _mm_unpackhi_epi8(d, zero);
557
558 const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
559 const __m128i one = _mm_set1_epi16(1);
560 __m128i l16;
561
562 int i;
563 const __m128i l = _mm_load_si128((const __m128i *)left);
564 __m128i rep = _mm_set1_epi16((short)0x8000);
565 for (i = 0; i < 16; ++i) {
566 l16 = _mm_shuffle_epi8(l, rep);
567 const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
568 const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
569 const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
570 const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
571
572 _mm_store_si128((__m128i *)dst, r0);
573 _mm_store_si128((__m128i *)(dst + 16), r1);
574 _mm_store_si128((__m128i *)(dst + 32), r2);
575 _mm_store_si128((__m128i *)(dst + 48), r3);
576 dst += stride;
577 rep = _mm_add_epi16(rep, one);
578 }
579 }
580
581 // -----------------------------------------------------------------------------
582 // SMOOTH_PRED
583
584 // pixels[0]: above and below_pred interleave vector
585 // pixels[1]: left vector
586 // pixels[2]: right_pred vector
load_pixel_w4(const uint8_t * above,const uint8_t * left,int height,__m128i * pixels)587 static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
588 int height, __m128i *pixels) {
589 __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
590 if (height == 4)
591 pixels[1] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
592 else if (height == 8)
593 pixels[1] = _mm_loadl_epi64(((const __m128i *)left));
594 else
595 pixels[1] = _mm_loadu_si128(((const __m128i *)left));
596
597 pixels[2] = _mm_set1_epi16((uint16_t)above[3]);
598
599 const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
600 const __m128i zero = _mm_setzero_si128();
601 d = _mm_unpacklo_epi8(d, zero);
602 pixels[0] = _mm_unpacklo_epi16(d, bp);
603 }
604
605 // weight_h[0]: weight_h vector
606 // weight_h[1]: scale - weight_h vector
607 // weight_h[2]: same as [0], second half for height = 16 only
608 // weight_h[3]: same as [1], second half for height = 16 only
609 // weight_w[0]: weights_w and scale - weights_w interleave vector
load_weight_w4(const uint8_t * weight_array,int height,__m128i * weight_h,__m128i * weight_w)610 static INLINE void load_weight_w4(const uint8_t *weight_array, int height,
611 __m128i *weight_h, __m128i *weight_w) {
612 const __m128i zero = _mm_setzero_si128();
613 const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
614 const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]);
615 weight_h[0] = _mm_unpacklo_epi8(t, zero);
616 weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
617 weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
618
619 if (height == 8) {
620 const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]);
621 weight_h[0] = _mm_unpacklo_epi8(weight, zero);
622 weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
623 } else if (height == 16) {
624 const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
625 weight_h[0] = _mm_unpacklo_epi8(weight, zero);
626 weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
627 weight_h[2] = _mm_unpackhi_epi8(weight, zero);
628 weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
629 }
630 }
631
smooth_pred_4xh(const __m128i * pixel,const __m128i * wh,const __m128i * ww,int h,uint8_t * dst,ptrdiff_t stride,int second_half)632 static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *wh,
633 const __m128i *ww, int h, uint8_t *dst,
634 ptrdiff_t stride, int second_half) {
635 const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
636 const __m128i one = _mm_set1_epi16(1);
637 const __m128i inc = _mm_set1_epi16(0x202);
638 const __m128i gat = _mm_set1_epi32(0xc080400);
639 __m128i rep = second_half ? _mm_set1_epi16((short)0x8008)
640 : _mm_set1_epi16((short)0x8000);
641 __m128i d = _mm_set1_epi16(0x100);
642
643 for (int i = 0; i < h; ++i) {
644 const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
645 const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
646 const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
647 __m128i s = _mm_madd_epi16(pixel[0], wh_sc);
648
649 __m128i b = _mm_shuffle_epi8(pixel[1], rep);
650 b = _mm_unpacklo_epi16(b, pixel[2]);
651 __m128i sum = _mm_madd_epi16(b, ww[0]);
652
653 sum = _mm_add_epi32(s, sum);
654 sum = _mm_add_epi32(sum, round);
655 sum = _mm_srai_epi32(sum, 1 + sm_weight_log2_scale);
656
657 sum = _mm_shuffle_epi8(sum, gat);
658 *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
659 dst += stride;
660
661 rep = _mm_add_epi16(rep, one);
662 d = _mm_add_epi16(d, inc);
663 }
664 }
665
aom_smooth_predictor_4x4_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)666 void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
667 const uint8_t *above, const uint8_t *left) {
668 __m128i pixels[3];
669 load_pixel_w4(above, left, 4, pixels);
670
671 __m128i wh[4], ww[2];
672 load_weight_w4(sm_weight_arrays, 4, wh, ww);
673
674 smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0);
675 }
676
aom_smooth_predictor_4x8_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)677 void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
678 const uint8_t *above, const uint8_t *left) {
679 __m128i pixels[3];
680 load_pixel_w4(above, left, 8, pixels);
681
682 __m128i wh[4], ww[2];
683 load_weight_w4(sm_weight_arrays, 8, wh, ww);
684
685 smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
686 }
687
aom_smooth_predictor_4x16_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)688 void aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
689 const uint8_t *above,
690 const uint8_t *left) {
691 __m128i pixels[3];
692 load_pixel_w4(above, left, 16, pixels);
693
694 __m128i wh[4], ww[2];
695 load_weight_w4(sm_weight_arrays, 16, wh, ww);
696
697 smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
698 dst += stride << 3;
699 smooth_pred_4xh(pixels, &wh[2], ww, 8, dst, stride, 1);
700 }
701
702 // pixels[0]: above and below_pred interleave vector, first half
703 // pixels[1]: above and below_pred interleave vector, second half
704 // pixels[2]: left vector
705 // pixels[3]: right_pred vector
706 // pixels[4]: above and below_pred interleave vector, first half
707 // pixels[5]: above and below_pred interleave vector, second half
708 // pixels[6]: left vector + 16
709 // pixels[7]: right_pred vector
load_pixel_w8(const uint8_t * above,const uint8_t * left,int height,__m128i * pixels)710 static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left,
711 int height, __m128i *pixels) {
712 const __m128i zero = _mm_setzero_si128();
713 const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
714 __m128i d = _mm_loadl_epi64((const __m128i *)above);
715 d = _mm_unpacklo_epi8(d, zero);
716 pixels[0] = _mm_unpacklo_epi16(d, bp);
717 pixels[1] = _mm_unpackhi_epi16(d, bp);
718
719 pixels[3] = _mm_set1_epi16((uint16_t)above[7]);
720
721 if (height == 4) {
722 pixels[2] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
723 } else if (height == 8) {
724 pixels[2] = _mm_loadl_epi64((const __m128i *)left);
725 } else if (height == 16) {
726 pixels[2] = _mm_load_si128((const __m128i *)left);
727 } else {
728 pixels[2] = _mm_load_si128((const __m128i *)left);
729 pixels[4] = pixels[0];
730 pixels[5] = pixels[1];
731 pixels[6] = _mm_load_si128((const __m128i *)(left + 16));
732 pixels[7] = pixels[3];
733 }
734 }
735
// Load the smooth-prediction weight vectors needed by an 8-wide predictor of
// the given height.  All weights are fixed-point with scale
// (1 << sm_weight_log2_scale).
// weight_h[0]: weight_h vector
// weight_h[1]: scale - weight_h vector
// weight_h[2]: same as [0], offset 8
// weight_h[3]: same as [1], offset 8
// weight_h[4]: same as [0], offset 16
// weight_h[5]: same as [1], offset 16
// weight_h[6]: same as [0], offset 24
// weight_h[7]: same as [1], offset 24
// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
static INLINE void load_weight_w8(const uint8_t *weight_array, int height,
                                  __m128i *weight_h, __m128i *weight_w) {
  const __m128i zero = _mm_setzero_si128();
  // The weight table for block dimension n starts at offset n.
  const int we_offset = height < 8 ? 4 : 8;
  __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[we_offset]);
  weight_h[0] = _mm_unpacklo_epi8(we, zero);  // zero-extend bytes to 16 bits
  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
  weight_h[1] = _mm_sub_epi16(d, weight_h[0]);

  if (height == 4) {
    // The load above started at offset 4, so the width-8 weights (offset 8)
    // begin 4 bytes further into the same vector.
    we = _mm_srli_si128(we, 4);
    __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
    __m128i tmp2 = _mm_sub_epi16(d, tmp1);
    weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
    weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
  } else {
    // Width weights equal the already-loaded height weights; interleave
    // (w, scale - w) pairs for use with _mm_madd_epi16.
    weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
    weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
  }

  if (height == 16) {
    we = _mm_loadu_si128((const __m128i *)&weight_array[16]);
    weight_h[0] = _mm_unpacklo_epi8(we, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(we, zero);
    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
  } else if (height == 32) {
    // 32 rows need four (weight, scale - weight) pairs of 8 rows each.
    const __m128i weight_lo =
        _mm_loadu_si128((const __m128i *)&weight_array[32]);
    weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
    const __m128i weight_hi =
        _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
    weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
    weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
    weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
    weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
  }
}
787
// Compute up to 8 rows of an 8-wide SMOOTH block.  pixels[0]/pixels[1] hold
// (above, bottom_left) 16-bit pairs for columns 0-3 / 4-7, pixels[2] the
// left column bytes and pixels[3] the broadcast top-right pixel.  wh holds
// the per-row weights and their complements; ww the interleaved
// (w_x, scale - w_x) pairs.  second_half selects left pixels 8..15 of
// pixels[2] when a tall block is rendered in 8-row passes.
static INLINE void smooth_pred_8xh(const __m128i *pixels, const __m128i *wh,
                                   const __m128i *ww, int h, uint8_t *dst,
                                   ptrdiff_t stride, int second_half) {
  // Rounding bias for the final >> (1 + sm_weight_log2_scale).
  const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
  const __m128i one = _mm_set1_epi16(1);
  // Adding 0x202 to each lane of d advances the shuffle to the next 16-bit
  // element of the weight vectors.
  const __m128i inc = _mm_set1_epi16(0x202);
  // Gathers the low byte of each 16-bit lane after packing.
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);

  // rep broadcasts left pixel i out of pixels[2]; 0x8008 starts at element 8
  // for the second 8-row pass (high byte 0x80 shuffles in zero).
  __m128i rep = second_half ? _mm_set1_epi16((short)0x8008)
                            : _mm_set1_epi16((short)0x8000);
  __m128i d = _mm_set1_epi16(0x100);

  int i;
  for (i = 0; i < h; ++i) {
    // Vertical part: above * w_y + bottom_left * (scale - w_y).
    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
    __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
    __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);

    // Horizontal part: left[i] * w_x + top_right * (scale - w_x).
    __m128i b = _mm_shuffle_epi8(pixels[2], rep);
    b = _mm_unpacklo_epi16(b, pixels[3]);
    __m128i sum0 = _mm_madd_epi16(b, ww[0]);
    __m128i sum1 = _mm_madd_epi16(b, ww[1]);

    // Average the two parts: extra +1 in the shift halves the sum.
    s0 = _mm_add_epi32(s0, sum0);
    s0 = _mm_add_epi32(s0, round);
    s0 = _mm_srai_epi32(s0, 1 + sm_weight_log2_scale);

    s1 = _mm_add_epi32(s1, sum1);
    s1 = _mm_add_epi32(s1, round);
    s1 = _mm_srai_epi32(s1, 1 + sm_weight_log2_scale);

    sum0 = _mm_packus_epi16(s0, s1);
    sum0 = _mm_shuffle_epi8(sum0, gat);
    _mm_storel_epi64((__m128i *)dst, sum0);
    dst += stride;

    rep = _mm_add_epi16(rep, one);
    d = _mm_add_epi16(d, inc);
  }
}
830
aom_smooth_predictor_8x4_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)831 void aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
832 const uint8_t *above, const uint8_t *left) {
833 __m128i pixels[4];
834 load_pixel_w8(above, left, 4, pixels);
835
836 __m128i wh[4], ww[2];
837 load_weight_w8(sm_weight_arrays, 4, wh, ww);
838
839 smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0);
840 }
841
aom_smooth_predictor_8x8_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)842 void aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
843 const uint8_t *above, const uint8_t *left) {
844 __m128i pixels[4];
845 load_pixel_w8(above, left, 8, pixels);
846
847 __m128i wh[4], ww[2];
848 load_weight_w8(sm_weight_arrays, 8, wh, ww);
849
850 smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
851 }
852
aom_smooth_predictor_8x16_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)853 void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
854 const uint8_t *above,
855 const uint8_t *left) {
856 __m128i pixels[4];
857 load_pixel_w8(above, left, 16, pixels);
858
859 __m128i wh[4], ww[2];
860 load_weight_w8(sm_weight_arrays, 16, wh, ww);
861
862 smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
863 dst += stride << 3;
864 smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);
865 }
866
aom_smooth_predictor_8x32_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)867 void aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
868 const uint8_t *above,
869 const uint8_t *left) {
870 __m128i pixels[8];
871 load_pixel_w8(above, left, 32, pixels);
872
873 __m128i wh[8], ww[2];
874 load_weight_w8(sm_weight_arrays, 32, wh, ww);
875
876 smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0);
877 dst += stride << 3;
878 smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1);
879 dst += stride << 3;
880 smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0);
881 dst += stride << 3;
882 smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1);
883 }
884
// Generic bw x bh SMOOTH predictor: every output pixel averages a vertical
// interpolation (above[x] vs. bottom-left) with a horizontal interpolation
// (left[y] vs. top-right), each weighted from sm_weight_arrays.  Processes
// 8 columns per inner iteration, so bw must be a multiple of 8 here.
static INLINE void smooth_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left, uint32_t bw,
                                        uint32_t bh) {
  // The weight table for block dimension n starts at offset n.
  const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
  const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
  const __m128i zero = _mm_setzero_si128();
  const __m128i scale_value =
      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
  const __m128i bottom_left = _mm_cvtsi32_si128((uint32_t)left[bh - 1]);
  // Shuffle mask broadcasting byte pair 0 to all eight 16-bit lanes.
  const __m128i dup16 = _mm_set1_epi32(0x01000100);
  const __m128i top_right =
      _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)above[bw - 1]), dup16);
  // Gathers the low byte of each 16-bit lane after packing.
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
  // Rounding bias for the final >> (1 + sm_weight_log2_scale).
  const __m128i round = _mm_set1_epi32((uint16_t)(1 << sm_weight_log2_scale));

  for (uint32_t y = 0; y < bh; ++y) {
    const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
    const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
    const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
    // (scale - w_y) * bottom_left is constant across this row.
    __m128i pred_scaled_bl = _mm_mullo_epi16(scale_m_weights_y, bottom_left);
    // (w_y, left[y]) pairs for the madd below.
    const __m128i wl_y =
        _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0);
    pred_scaled_bl = _mm_add_epi32(pred_scaled_bl, round);
    pred_scaled_bl = _mm_shuffle_epi32(pred_scaled_bl, 0);

    for (uint32_t x = 0; x < bw; x += 8) {
      const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x));
      const __m128i weights_x =
          _mm_loadl_epi64((const __m128i *)(sm_weights_w + x));
      // Interleave (above[x], w_x) bytes so a single madd with wl_y yields
      // above[x] * w_y + w_x * left[y] per 32-bit lane.
      const __m128i tw_x = _mm_unpacklo_epi8(top_x, weights_x);
      const __m128i tw_x_lo = _mm_unpacklo_epi8(tw_x, zero);
      const __m128i tw_x_hi = _mm_unpackhi_epi8(tw_x, zero);

      __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y);
      __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y);

      // (scale - w_x) * top_right, widened to 32 bits.
      const __m128i scale_m_weights_x =
          _mm_sub_epi16(scale_value, _mm_unpacklo_epi8(weights_x, zero));
      const __m128i swxtr = _mm_mullo_epi16(scale_m_weights_x, top_right);
      const __m128i swxtr_lo = _mm_unpacklo_epi16(swxtr, zero);
      const __m128i swxtr_hi = _mm_unpackhi_epi16(swxtr, zero);

      pred_lo = _mm_add_epi32(pred_lo, pred_scaled_bl);
      pred_hi = _mm_add_epi32(pred_hi, pred_scaled_bl);

      pred_lo = _mm_add_epi32(pred_lo, swxtr_lo);
      pred_hi = _mm_add_epi32(pred_hi, swxtr_hi);

      // Extra +1 in the shift halves the combined sum (the average).
      pred_lo = _mm_srai_epi32(pred_lo, (1 + sm_weight_log2_scale));
      pred_hi = _mm_srai_epi32(pred_hi, (1 + sm_weight_log2_scale));

      __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
      pred = _mm_shuffle_epi8(pred, gat);
      _mm_storel_epi64((__m128i *)(dst + x), pred);
    }
    dst += stride;
  }
}
944
// SMOOTH predictor entry points: one thin wrapper per supported block size,
// all dispatching to smooth_predictor_wxh.
void aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 4);
}

void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 8);
}

void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 16);
}

void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 32);
}

void aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 32, 8);
}

void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 32, 16);
}

void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 32, 32);
}

void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 32, 64);
}

void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 64, 64);
}

void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 64, 32);
}

void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 64, 16);
}

void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 64);
}
1016
1017 // -----------------------------------------------------------------------------
1018 // SMOOTH_V_PRED
1019
1020 // pixels[0]: above and below_pred interleave vector
load_pixel_v_w4(const uint8_t * above,const uint8_t * left,int height,__m128i * pixels)1021 static INLINE void load_pixel_v_w4(const uint8_t *above, const uint8_t *left,
1022 int height, __m128i *pixels) {
1023 const __m128i zero = _mm_setzero_si128();
1024 __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
1025 const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
1026 d = _mm_unpacklo_epi8(d, zero);
1027 pixels[0] = _mm_unpacklo_epi16(d, bp);
1028 }
1029
1030 // weights[0]: weights_h vector
1031 // weights[1]: scale - weights_h vector
load_weight_v_w4(const uint8_t * weight_array,int height,__m128i * weights)1032 static INLINE void load_weight_v_w4(const uint8_t *weight_array, int height,
1033 __m128i *weights) {
1034 const __m128i zero = _mm_setzero_si128();
1035 const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
1036
1037 if (height == 4) {
1038 const __m128i weight =
1039 _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]);
1040 weights[0] = _mm_unpacklo_epi8(weight, zero);
1041 weights[1] = _mm_sub_epi16(d, weights[0]);
1042 } else if (height == 8) {
1043 const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]);
1044 weights[0] = _mm_unpacklo_epi8(weight, zero);
1045 weights[1] = _mm_sub_epi16(d, weights[0]);
1046 } else {
1047 const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
1048 weights[0] = _mm_unpacklo_epi8(weight, zero);
1049 weights[1] = _mm_sub_epi16(d, weights[0]);
1050 weights[2] = _mm_unpackhi_epi8(weight, zero);
1051 weights[3] = _mm_sub_epi16(d, weights[2]);
1052 }
1053 }
1054
// Write h rows of a 4-wide SMOOTH_V block.  pixel[0] holds (above,
// bottom_left) 16-bit pairs; weight[0]/weight[1] the per-row weights and
// their complements.
static INLINE void smooth_v_pred_4xh(const __m128i *pixel,
                                     const __m128i *weight, int h, uint8_t *dst,
                                     ptrdiff_t stride) {
  // Rounding bias for the final >> sm_weight_log2_scale.
  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
  // Adding 0x202 to d steps the shuffle to the next 16-bit weight element.
  const __m128i inc = _mm_set1_epi16(0x202);
  // Picks byte 0 of each 32-bit lane to form the 4 output pixels.
  const __m128i gat = _mm_set1_epi32(0xc080400);
  __m128i d = _mm_set1_epi16(0x100);

  for (int i = 0; i < h; ++i) {
    // above * w_y + bottom_left * (scale - w_y), rounded and descaled.
    const __m128i wg_wg = _mm_shuffle_epi8(weight[0], d);
    const __m128i sc_sc = _mm_shuffle_epi8(weight[1], d);
    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
    __m128i sum = _mm_madd_epi16(pixel[0], wh_sc);
    sum = _mm_add_epi32(sum, pred_round);
    sum = _mm_srai_epi32(sum, sm_weight_log2_scale);
    sum = _mm_shuffle_epi8(sum, gat);
    *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
    dst += stride;
    d = _mm_add_epi16(d, inc);
  }
}
1076
aom_smooth_v_predictor_4x4_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1077 void aom_smooth_v_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
1078 const uint8_t *above,
1079 const uint8_t *left) {
1080 __m128i pixels;
1081 load_pixel_v_w4(above, left, 4, &pixels);
1082
1083 __m128i weights[2];
1084 load_weight_v_w4(sm_weight_arrays, 4, weights);
1085
1086 smooth_v_pred_4xh(&pixels, weights, 4, dst, stride);
1087 }
1088
aom_smooth_v_predictor_4x8_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1089 void aom_smooth_v_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
1090 const uint8_t *above,
1091 const uint8_t *left) {
1092 __m128i pixels;
1093 load_pixel_v_w4(above, left, 8, &pixels);
1094
1095 __m128i weights[2];
1096 load_weight_v_w4(sm_weight_arrays, 8, weights);
1097
1098 smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
1099 }
1100
aom_smooth_v_predictor_4x16_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1101 void aom_smooth_v_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
1102 const uint8_t *above,
1103 const uint8_t *left) {
1104 __m128i pixels;
1105 load_pixel_v_w4(above, left, 16, &pixels);
1106
1107 __m128i weights[4];
1108 load_weight_v_w4(sm_weight_arrays, 16, weights);
1109
1110 smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
1111 dst += stride << 3;
1112 smooth_v_pred_4xh(&pixels, &weights[2], 8, dst, stride);
1113 }
1114
1115 // pixels[0]: above and below_pred interleave vector, first half
1116 // pixels[1]: above and below_pred interleave vector, second half
load_pixel_v_w8(const uint8_t * above,const uint8_t * left,int height,__m128i * pixels)1117 static INLINE void load_pixel_v_w8(const uint8_t *above, const uint8_t *left,
1118 int height, __m128i *pixels) {
1119 const __m128i zero = _mm_setzero_si128();
1120 __m128i d = _mm_loadl_epi64((const __m128i *)above);
1121 const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
1122 d = _mm_unpacklo_epi8(d, zero);
1123 pixels[0] = _mm_unpacklo_epi16(d, bp);
1124 pixels[1] = _mm_unpackhi_epi16(d, bp);
1125 }
1126
// weight_h[0]: weight_h vector
// weight_h[1]: scale - weight_h vector
// weight_h[2]: same as [0], offset 8
// weight_h[3]: same as [1], offset 8
// weight_h[4]: same as [0], offset 16
// weight_h[5]: same as [1], offset 16
// weight_h[6]: same as [0], offset 24
// weight_h[7]: same as [1], offset 24
static INLINE void load_weight_v_w8(const uint8_t *weight_array, int height,
                                    __m128i *weight_h) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));

  if (height < 16) {
    // Weights for block dimension n start at offset n in the table.
    const int offset = height < 8 ? 4 : 8;
    const __m128i weight =
        _mm_loadu_si128((const __m128i *)&weight_array[offset]);
    weight_h[0] = _mm_unpacklo_epi8(weight, zero);  // widen bytes to 16 bits
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
  } else if (height == 16) {
    const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(weight, zero);
    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
  } else {
    // height == 32: four weight/complement pairs of 8 rows each.
    const __m128i weight_lo =
        _mm_loadu_si128((const __m128i *)&weight_array[32]);
    weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
    const __m128i weight_hi =
        _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
    weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
    weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
    weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
    weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
  }
}
1167
// Write h rows of an 8-wide SMOOTH_V block.  pixels[0]/pixels[1] hold the
// (above, bottom_left) 16-bit pairs for columns 0-3 and 4-7; wh[0]/wh[1]
// the per-row weights and their complements.
static INLINE void smooth_v_pred_8xh(const __m128i *pixels, const __m128i *wh,
                                     int h, uint8_t *dst, ptrdiff_t stride) {
  // Rounding bias for the final >> sm_weight_log2_scale.
  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
  // Adding 0x202 to d steps the shuffle to the next 16-bit weight element.
  const __m128i inc = _mm_set1_epi16(0x202);
  // Gathers the low byte of each 16-bit lane after packing.
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
  __m128i d = _mm_set1_epi16(0x100);

  for (int i = 0; i < h; ++i) {
    // above * w_y + bottom_left * (scale - w_y) per column.
    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
    __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
    __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);

    s0 = _mm_add_epi32(s0, pred_round);
    s0 = _mm_srai_epi32(s0, sm_weight_log2_scale);

    s1 = _mm_add_epi32(s1, pred_round);
    s1 = _mm_srai_epi32(s1, sm_weight_log2_scale);

    __m128i sum01 = _mm_packus_epi16(s0, s1);
    sum01 = _mm_shuffle_epi8(sum01, gat);
    _mm_storel_epi64((__m128i *)dst, sum01);
    dst += stride;

    d = _mm_add_epi16(d, inc);
  }
}
1196
aom_smooth_v_predictor_8x4_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1197 void aom_smooth_v_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
1198 const uint8_t *above,
1199 const uint8_t *left) {
1200 __m128i pixels[2];
1201 load_pixel_v_w8(above, left, 4, pixels);
1202
1203 __m128i wh[2];
1204 load_weight_v_w8(sm_weight_arrays, 4, wh);
1205
1206 smooth_v_pred_8xh(pixels, wh, 4, dst, stride);
1207 }
1208
aom_smooth_v_predictor_8x8_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1209 void aom_smooth_v_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
1210 const uint8_t *above,
1211 const uint8_t *left) {
1212 __m128i pixels[2];
1213 load_pixel_v_w8(above, left, 8, pixels);
1214
1215 __m128i wh[2];
1216 load_weight_v_w8(sm_weight_arrays, 8, wh);
1217
1218 smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
1219 }
1220
aom_smooth_v_predictor_8x16_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1221 void aom_smooth_v_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
1222 const uint8_t *above,
1223 const uint8_t *left) {
1224 __m128i pixels[2];
1225 load_pixel_v_w8(above, left, 16, pixels);
1226
1227 __m128i wh[4];
1228 load_weight_v_w8(sm_weight_arrays, 16, wh);
1229
1230 smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
1231 dst += stride << 3;
1232 smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride);
1233 }
1234
aom_smooth_v_predictor_8x32_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1235 void aom_smooth_v_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
1236 const uint8_t *above,
1237 const uint8_t *left) {
1238 __m128i pixels[2];
1239 load_pixel_v_w8(above, left, 32, pixels);
1240
1241 __m128i wh[8];
1242 load_weight_v_w8(sm_weight_arrays, 32, wh);
1243
1244 smooth_v_pred_8xh(pixels, &wh[0], 8, dst, stride);
1245 dst += stride << 3;
1246 smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride);
1247 dst += stride << 3;
1248 smooth_v_pred_8xh(pixels, &wh[4], 8, dst, stride);
1249 dst += stride << 3;
1250 smooth_v_pred_8xh(pixels, &wh[6], 8, dst, stride);
1251 }
1252
// Generic bw x bh SMOOTH_V predictor: each pixel interpolates above[x] with
// the bottom-left pixel using the row weight.  Processes 8 columns per
// inner iteration, so bw must be a multiple of 8 here.
static INLINE void smooth_v_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *above,
                                          const uint8_t *left, uint32_t bw,
                                          uint32_t bh) {
  // Weights for block dimension n start at offset n in the table.
  const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
  const __m128i zero = _mm_setzero_si128();
  const __m128i scale_value =
      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
  // Shuffle mask broadcasting byte pair 0 to all eight 16-bit lanes.
  const __m128i dup16 = _mm_set1_epi32(0x01000100);
  const __m128i bottom_left =
      _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)left[bh - 1]), dup16);
  // Gathers the low byte of each 16-bit lane after packing.
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
  // Rounding bias for the final >> sm_weight_log2_scale.
  const __m128i round =
      _mm_set1_epi32((uint16_t)(1 << (sm_weight_log2_scale - 1)));

  for (uint32_t y = 0; y < bh; ++y) {
    const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
    const __m128i scale_m_weights_y =
        _mm_shuffle_epi8(_mm_sub_epi16(scale_value, weights_y), dup16);
    // (w_y, bottom_left) pairs for the madd below.
    const __m128i wl_y =
        _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, bottom_left), 0);

    for (uint32_t x = 0; x < bw; x += 8) {
      const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x));
      // 8 -> 16
      const __m128i tw_x = _mm_unpacklo_epi8(top_x, zero);
      const __m128i tw_x_lo = _mm_unpacklo_epi16(tw_x, scale_m_weights_y);
      const __m128i tw_x_hi = _mm_unpackhi_epi16(tw_x, scale_m_weights_y);
      // top_x * weights_y + scale_m_weights_y * bottom_left
      __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y);
      __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y);

      pred_lo = _mm_add_epi32(pred_lo, round);
      pred_hi = _mm_add_epi32(pred_hi, round);
      pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale);
      pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale);

      __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
      pred = _mm_shuffle_epi8(pred, gat);
      _mm_storel_epi64((__m128i *)(dst + x), pred);
    }
    dst += stride;
  }
}
1297
// SMOOTH_V predictor entry points: one thin wrapper per supported block
// size, all dispatching to smooth_v_predictor_wxh.
void aom_smooth_v_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 16, 4);
}

void aom_smooth_v_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 16, 8);
}

void aom_smooth_v_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 16, 16);
}

void aom_smooth_v_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 16, 32);
}

void aom_smooth_v_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 32, 8);
}

void aom_smooth_v_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 32, 16);
}

void aom_smooth_v_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 32, 32);
}

void aom_smooth_v_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 32, 64);
}

void aom_smooth_v_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 64, 64);
}

void aom_smooth_v_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 64, 32);
}

void aom_smooth_v_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 64, 16);
}

void aom_smooth_v_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 16, 64);
}
1369
1370 // -----------------------------------------------------------------------------
1371 // SMOOTH_H_PRED
1372
1373 // pixels[0]: left vector
1374 // pixels[1]: right_pred vector
load_pixel_h_w4(const uint8_t * above,const uint8_t * left,int height,__m128i * pixels)1375 static INLINE void load_pixel_h_w4(const uint8_t *above, const uint8_t *left,
1376 int height, __m128i *pixels) {
1377 if (height == 4)
1378 pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
1379 else if (height == 8)
1380 pixels[0] = _mm_loadl_epi64(((const __m128i *)left));
1381 else
1382 pixels[0] = _mm_loadu_si128(((const __m128i *)left));
1383 pixels[1] = _mm_set1_epi16((uint16_t)above[3]);
1384 }
1385
1386 // weights[0]: weights_w and scale - weights_w interleave vector
load_weight_h_w4(const uint8_t * weight_array,int height,__m128i * weights)1387 static INLINE void load_weight_h_w4(const uint8_t *weight_array, int height,
1388 __m128i *weights) {
1389 (void)height;
1390 const __m128i t = _mm_loadu_si128((const __m128i *)&weight_array[4]);
1391 const __m128i zero = _mm_setzero_si128();
1392
1393 const __m128i weights_0 = _mm_unpacklo_epi8(t, zero);
1394 const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
1395 const __m128i weights_1 = _mm_sub_epi16(d, weights_0);
1396 weights[0] = _mm_unpacklo_epi16(weights_0, weights_1);
1397 }
1398
// Write h rows of a 4-wide SMOOTH_H block.  pixel[0] holds the left-column
// bytes, pixel[1] the broadcast top-right pixel, weight[0] the interleaved
// (w_x, scale - w_x) pairs.
static INLINE void smooth_h_pred_4xh(const __m128i *pixel,
                                     const __m128i *weight, int h, uint8_t *dst,
                                     ptrdiff_t stride) {
  // Rounding bias for the final >> sm_weight_log2_scale.
  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
  const __m128i one = _mm_set1_epi16(1);
  // Picks byte 0 of each 32-bit lane to form the 4 output pixels.
  const __m128i gat = _mm_set1_epi32(0xc080400);
  // rep broadcasts left pixel i (high byte 0x80 shuffles in zero); adding
  // one per row steps to the next left pixel.
  __m128i rep = _mm_set1_epi16((short)0x8000);

  for (int i = 0; i < h; ++i) {
    // left[i] * w_x + top_right * (scale - w_x), rounded and descaled.
    __m128i b = _mm_shuffle_epi8(pixel[0], rep);
    b = _mm_unpacklo_epi16(b, pixel[1]);
    __m128i sum = _mm_madd_epi16(b, weight[0]);

    sum = _mm_add_epi32(sum, pred_round);
    sum = _mm_srai_epi32(sum, sm_weight_log2_scale);

    sum = _mm_shuffle_epi8(sum, gat);
    *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
    dst += stride;

    rep = _mm_add_epi16(rep, one);
  }
}
1422
aom_smooth_h_predictor_4x4_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1423 void aom_smooth_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
1424 const uint8_t *above,
1425 const uint8_t *left) {
1426 __m128i pixels[2];
1427 load_pixel_h_w4(above, left, 4, pixels);
1428
1429 __m128i weights;
1430 load_weight_h_w4(sm_weight_arrays, 4, &weights);
1431
1432 smooth_h_pred_4xh(pixels, &weights, 4, dst, stride);
1433 }
1434
aom_smooth_h_predictor_4x8_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1435 void aom_smooth_h_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
1436 const uint8_t *above,
1437 const uint8_t *left) {
1438 __m128i pixels[2];
1439 load_pixel_h_w4(above, left, 8, pixels);
1440
1441 __m128i weights;
1442 load_weight_h_w4(sm_weight_arrays, 8, &weights);
1443
1444 smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
1445 }
1446
aom_smooth_h_predictor_4x16_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1447 void aom_smooth_h_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
1448 const uint8_t *above,
1449 const uint8_t *left) {
1450 __m128i pixels[2];
1451 load_pixel_h_w4(above, left, 16, pixels);
1452
1453 __m128i weights;
1454 load_weight_h_w4(sm_weight_arrays, 8, &weights);
1455
1456 smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
1457 dst += stride << 3;
1458
1459 pixels[0] = _mm_srli_si128(pixels[0], 8);
1460 smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
1461 }
1462
// pixels[0]: left vector
// pixels[1]: right_pred vector (top-right pixel in every 16-bit lane)
// pixels[2]: left vector + 16 (height 32 only)
// pixels[3]: right_pred vector (height 32 only)
static INLINE void load_pixel_h_w8(const uint8_t *above, const uint8_t *left,
                                   int height, __m128i *pixels) {
  // Broadcast the top-right reference pixel to all eight 16-bit lanes.
  pixels[1] = _mm_set1_epi16((uint16_t)above[7]);

  switch (height) {
    case 4:
      pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
      break;
    case 8:
      pixels[0] = _mm_loadl_epi64((const __m128i *)left);
      break;
    case 16:
      // Aligned load: assumes `left` is 16-byte aligned, as provided by the
      // callers' prediction buffers -- TODO(review): confirm.
      pixels[0] = _mm_load_si128((const __m128i *)left);
      break;
    default:  // height 32: two 16-pixel halves of the left column.
      pixels[0] = _mm_load_si128((const __m128i *)left);
      pixels[2] = _mm_load_si128((const __m128i *)(left + 16));
      pixels[3] = pixels[1];
      break;
  }
}
1483
1484 // weight_w[0]: weights_w and scale - weights_w interleave vector, first half
1485 // weight_w[1]: weights_w and scale - weights_w interleave vector, second half
load_weight_h_w8(const uint8_t * weight_array,int height,__m128i * weight_w)1486 static INLINE void load_weight_h_w8(const uint8_t *weight_array, int height,
1487 __m128i *weight_w) {
1488 (void)height;
1489 const __m128i zero = _mm_setzero_si128();
1490 const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
1491 const __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[8]);
1492 const __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
1493 const __m128i tmp2 = _mm_sub_epi16(d, tmp1);
1494 weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
1495 weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
1496 }
1497
// Emit h rows of an 8-wide SMOOTH_H prediction.
// pixels[0]: up to 16 left-column pixels packed as bytes.
// pixels[1]: top-right reference pixel replicated into every 16-bit lane.
// ww[0]/ww[1]: interleaved (weight, scale - weight) 16-bit pairs for the 8
// columns (low 4 columns in ww[0], high 4 in ww[1]).
// second_half: start from byte 8 of pixels[0] so one 16-byte left vector can
// feed two consecutive 8-row passes.
static INLINE void smooth_h_pred_8xh(const __m128i *pixels, const __m128i *ww,
                                     int h, uint8_t *dst, ptrdiff_t stride,
                                     int second_half) {
  // Rounding term applied before the shift by sm_weight_log2_scale.
  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
  const __m128i one = _mm_set1_epi16(1);
  // After the 32->16 pack below, results sit at even byte positions; this
  // shuffle gathers bytes 0,2,...,14 into the low 8 bytes.
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
  // pshufb control: each 16-bit lane is 0x80NN, so the shuffle broadcasts
  // byte NN of pixels[0] (the current row's left pixel) into the low byte of
  // every lane and zeroes the high byte. Adding 1 per row advances NN.
  __m128i rep = second_half ? _mm_set1_epi16((short)0x8008)
                            : _mm_set1_epi16((short)0x8000);

  for (int i = 0; i < h; ++i) {
    // b: (left, top_right) 16-bit pairs for each of the 8 columns.
    __m128i b = _mm_shuffle_epi8(pixels[0], rep);
    b = _mm_unpacklo_epi16(b, pixels[1]);
    // Per column: left * weight + top_right * (scale - weight).
    __m128i sum0 = _mm_madd_epi16(b, ww[0]);
    __m128i sum1 = _mm_madd_epi16(b, ww[1]);

    sum0 = _mm_add_epi32(sum0, pred_round);
    sum0 = _mm_srai_epi32(sum0, sm_weight_log2_scale);

    sum1 = _mm_add_epi32(sum1, pred_round);
    sum1 = _mm_srai_epi32(sum1, sm_weight_log2_scale);

    // Results fit in 8 bits, so the 32-bit lanes can be packed as if they
    // were 16-bit values, then compacted with gat.
    sum0 = _mm_packus_epi16(sum0, sum1);
    sum0 = _mm_shuffle_epi8(sum0, gat);
    _mm_storel_epi64((__m128i *)dst, sum0);
    dst += stride;

    rep = _mm_add_epi16(rep, one);
  }
}
1527
aom_smooth_h_predictor_8x4_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1528 void aom_smooth_h_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
1529 const uint8_t *above,
1530 const uint8_t *left) {
1531 __m128i pixels[2];
1532 load_pixel_h_w8(above, left, 4, pixels);
1533
1534 __m128i ww[2];
1535 load_weight_h_w8(sm_weight_arrays, 4, ww);
1536
1537 smooth_h_pred_8xh(pixels, ww, 4, dst, stride, 0);
1538 }
1539
aom_smooth_h_predictor_8x8_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1540 void aom_smooth_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
1541 const uint8_t *above,
1542 const uint8_t *left) {
1543 __m128i pixels[2];
1544 load_pixel_h_w8(above, left, 8, pixels);
1545
1546 __m128i ww[2];
1547 load_weight_h_w8(sm_weight_arrays, 8, ww);
1548
1549 smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
1550 }
1551
aom_smooth_h_predictor_8x16_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1552 void aom_smooth_h_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
1553 const uint8_t *above,
1554 const uint8_t *left) {
1555 __m128i pixels[2];
1556 load_pixel_h_w8(above, left, 16, pixels);
1557
1558 __m128i ww[2];
1559 load_weight_h_w8(sm_weight_arrays, 16, ww);
1560
1561 smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
1562 dst += stride << 3;
1563 smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 1);
1564 }
1565
aom_smooth_h_predictor_8x32_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1566 void aom_smooth_h_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
1567 const uint8_t *above,
1568 const uint8_t *left) {
1569 __m128i pixels[4];
1570 load_pixel_h_w8(above, left, 32, pixels);
1571
1572 __m128i ww[2];
1573 load_weight_h_w8(sm_weight_arrays, 32, ww);
1574
1575 smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 0);
1576 dst += stride << 3;
1577 smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 1);
1578 dst += stride << 3;
1579 smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 0);
1580 dst += stride << 3;
1581 smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 1);
1582 }
1583
// Generic SMOOTH_H predictor for wide blocks (bw a multiple of 8).
// Each output pixel blends the row's left pixel with the top-right reference
// pixel using per-column weights from sm_weight_arrays.
static INLINE void smooth_h_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *above,
                                          const uint8_t *left, uint32_t bw,
                                          uint32_t bh) {
  // The weight table for a given block width starts at offset bw.
  const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
  const __m128i zero = _mm_setzero_si128();
  // scale = 1 << sm_weight_log2_scale, replicated per 16-bit lane.
  const __m128i scale_value =
      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
  const __m128i top_right = _mm_cvtsi32_si128((uint32_t)above[bw - 1]);
  // Gathers the 8 result bytes from even positions after the 32->16 pack.
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
  // Rounding term applied before the shift by sm_weight_log2_scale.
  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));

  for (uint32_t y = 0; y < bh; ++y) {
    const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
    // (top_right, left[y]) 16-bit pair broadcast to all four 32-bit lanes.
    const __m128i tr_ly =
        _mm_shuffle_epi32(_mm_unpacklo_epi16(top_right, left_y), 0);

    // Process 8 columns per iteration.
    for (uint32_t x = 0; x < bw; x += 8) {
      const __m128i weights_x =
          _mm_loadl_epi64((const __m128i *)(sm_weights_w + x));
      const __m128i weights_xw = _mm_unpacklo_epi8(weights_x, zero);
      const __m128i scale_m_weights_x = _mm_sub_epi16(scale_value, weights_xw);
      const __m128i wx_lo = _mm_unpacklo_epi16(scale_m_weights_x, weights_xw);
      const __m128i wx_hi = _mm_unpackhi_epi16(scale_m_weights_x, weights_xw);
      // Per column: top_right * (scale - w) + left[y] * w.
      __m128i pred_lo = _mm_madd_epi16(wx_lo, tr_ly);
      __m128i pred_hi = _mm_madd_epi16(wx_hi, tr_ly);

      pred_lo = _mm_add_epi32(pred_lo, pred_round);
      pred_hi = _mm_add_epi32(pred_hi, pred_round);

      pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale);
      pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale);

      // Results fit in 8 bits; pack 32-bit lanes as 16-bit and compact.
      __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
      pred = _mm_shuffle_epi8(pred, gat);
      _mm_storel_epi64((__m128i *)(dst + x), pred);
    }
    dst += stride;
  }
}
1624
// 16x4 SMOOTH_H predictor (SSSE3) via the generic wxh kernel.
void aom_smooth_h_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 16, 4);
}
1630
// 16x8 SMOOTH_H predictor (SSSE3) via the generic wxh kernel.
void aom_smooth_h_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 16, 8);
}
1636
// 16x16 SMOOTH_H predictor (SSSE3) via the generic wxh kernel.
void aom_smooth_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 16, 16);
}
1642
// 16x32 SMOOTH_H predictor (SSSE3) via the generic wxh kernel.
void aom_smooth_h_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 16, 32);
}
1648
// 16x64 SMOOTH_H predictor (SSSE3) via the generic wxh kernel.
void aom_smooth_h_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 16, 64);
}
1654
// 32x8 SMOOTH_H predictor (SSSE3) via the generic wxh kernel.
void aom_smooth_h_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 32, 8);
}
1660
// 32x16 SMOOTH_H predictor (SSSE3) via the generic wxh kernel.
void aom_smooth_h_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 32, 16);
}
1666
// 32x32 SMOOTH_H predictor (SSSE3) via the generic wxh kernel.
void aom_smooth_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 32, 32);
}
1672
// 32x64 SMOOTH_H predictor (SSSE3) via the generic wxh kernel.
void aom_smooth_h_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 32, 64);
}
1678
// 64x64 SMOOTH_H predictor (SSSE3) via the generic wxh kernel.
void aom_smooth_h_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 64, 64);
}
1684
// 64x32 SMOOTH_H predictor (SSSE3) via the generic wxh kernel.
void aom_smooth_h_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 64, 32);
}
1690
// 64x16 SMOOTH_H predictor (SSSE3) via the generic wxh kernel.
void aom_smooth_h_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 64, 16);
}
1696