/*
 * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/ppc/types_vsx.h"

void vpx_v_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d = vec_vsx_ld(0, above);
  int i;
  (void)left;

  for (i = 0; i < 16; i++, dst += stride) {
    vec_vsx_st(d, 0, dst);
  }
}

void vpx_v_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vec_vsx_ld(0, above);
  const uint8x16_t d1 = vec_vsx_ld(16, above);
  int i;
  (void)left;

  for (i = 0; i < 32; i++, dst += stride) {
    vec_vsx_st(d0, 0, dst);
    vec_vsx_st(d1, 16, dst);
  }
}
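// The 4x4 predictors below do full 16-byte stores: mask4 keeps the first
// 4 bytes from the prediction and takes the remaining 12 bytes from the data
// already loaded from dst, so those bytes are rewritten unchanged.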
static const uint32x4_t mask4 = { 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };

void vpx_h_predictor_4x4_vsx(uint8_t *dst, ptrdiff_t stride,
                             const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d = vec_vsx_ld(0, left);
  const uint8x16_t v0 = vec_splat(d, 0);
  const uint8x16_t v1 = vec_splat(d, 1);
  const uint8x16_t v2 = vec_splat(d, 2);
  const uint8x16_t v3 = vec_splat(d, 3);

  (void)above;

  vec_vsx_st(vec_sel(v0, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
  dst += stride;
  vec_vsx_st(vec_sel(v1, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
  dst += stride;
  vec_vsx_st(vec_sel(v2, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
  dst += stride;
  vec_vsx_st(vec_sel(v3, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
}

void vpx_h_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
                             const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d = vec_vsx_ld(0, left);
  const uint8x16_t v0 = vec_splat(d, 0);
  const uint8x16_t v1 = vec_splat(d, 1);
  const uint8x16_t v2 = vec_splat(d, 2);
  const uint8x16_t v3 = vec_splat(d, 3);

  const uint8x16_t v4 = vec_splat(d, 4);
  const uint8x16_t v5 = vec_splat(d, 5);
  const uint8x16_t v6 = vec_splat(d, 6);
  const uint8x16_t v7 = vec_splat(d, 7);

  (void)above;

  vec_vsx_st(xxpermdi(v0, vec_vsx_ld(0, dst), 1), 0, dst);
  dst += stride;
  vec_vsx_st(xxpermdi(v1, vec_vsx_ld(0, dst), 1), 0, dst);
  dst += stride;
  vec_vsx_st(xxpermdi(v2, vec_vsx_ld(0, dst), 1), 0, dst);
  dst += stride;
  vec_vsx_st(xxpermdi(v3, vec_vsx_ld(0, dst), 1), 0, dst);
  dst += stride;
  vec_vsx_st(xxpermdi(v4, vec_vsx_ld(0, dst), 1), 0, dst);
  dst += stride;
  vec_vsx_st(xxpermdi(v5, vec_vsx_ld(0, dst), 1), 0, dst);
  dst += stride;
  vec_vsx_st(xxpermdi(v6, vec_vsx_ld(0, dst), 1), 0, dst);
  dst += stride;
  vec_vsx_st(xxpermdi(v7, vec_vsx_ld(0, dst), 1), 0, dst);
}

void vpx_h_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d = vec_vsx_ld(0, left);
  const uint8x16_t v0 = vec_splat(d, 0);
  const uint8x16_t v1 = vec_splat(d, 1);
  const uint8x16_t v2 = vec_splat(d, 2);
  const uint8x16_t v3 = vec_splat(d, 3);

  const uint8x16_t v4 = vec_splat(d, 4);
  const uint8x16_t v5 = vec_splat(d, 5);
  const uint8x16_t v6 = vec_splat(d, 6);
  const uint8x16_t v7 = vec_splat(d, 7);

  const uint8x16_t v8 = vec_splat(d, 8);
  const uint8x16_t v9 = vec_splat(d, 9);
  const uint8x16_t v10 = vec_splat(d, 10);
  const uint8x16_t v11 = vec_splat(d, 11);

  const uint8x16_t v12 = vec_splat(d, 12);
  const uint8x16_t v13 = vec_splat(d, 13);
  const uint8x16_t v14 = vec_splat(d, 14);
  const uint8x16_t v15 = vec_splat(d, 15);

  (void)above;

  vec_vsx_st(v0, 0, dst);
  dst += stride;
  vec_vsx_st(v1, 0, dst);
  dst += stride;
  vec_vsx_st(v2, 0, dst);
  dst += stride;
  vec_vsx_st(v3, 0, dst);
  dst += stride;
  vec_vsx_st(v4, 0, dst);
  dst += stride;
  vec_vsx_st(v5, 0, dst);
  dst += stride;
  vec_vsx_st(v6, 0, dst);
  dst += stride;
  vec_vsx_st(v7, 0, dst);
  dst += stride;
  vec_vsx_st(v8, 0, dst);
  dst += stride;
  vec_vsx_st(v9, 0, dst);
  dst += stride;
  vec_vsx_st(v10, 0, dst);
  dst += stride;
  vec_vsx_st(v11, 0, dst);
  dst += stride;
  vec_vsx_st(v12, 0, dst);
  dst += stride;
  vec_vsx_st(v13, 0, dst);
  dst += stride;
  vec_vsx_st(v14, 0, dst);
  dst += stride;
  vec_vsx_st(v15, 0, dst);
}
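// Store one 32-byte row of the 32x32 horizontal predictor and advance dst to
// the next row.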
#define H_PREDICTOR_32(v) \
  vec_vsx_st(v, 0, dst);  \
  vec_vsx_st(v, 16, dst); \
  dst += stride

void vpx_h_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vec_vsx_ld(0, left);
  const uint8x16_t d1 = vec_vsx_ld(16, left);

  const uint8x16_t v0_0 = vec_splat(d0, 0);
  const uint8x16_t v1_0 = vec_splat(d0, 1);
  const uint8x16_t v2_0 = vec_splat(d0, 2);
  const uint8x16_t v3_0 = vec_splat(d0, 3);
  const uint8x16_t v4_0 = vec_splat(d0, 4);
  const uint8x16_t v5_0 = vec_splat(d0, 5);
  const uint8x16_t v6_0 = vec_splat(d0, 6);
  const uint8x16_t v7_0 = vec_splat(d0, 7);
  const uint8x16_t v8_0 = vec_splat(d0, 8);
  const uint8x16_t v9_0 = vec_splat(d0, 9);
  const uint8x16_t v10_0 = vec_splat(d0, 10);
  const uint8x16_t v11_0 = vec_splat(d0, 11);
  const uint8x16_t v12_0 = vec_splat(d0, 12);
  const uint8x16_t v13_0 = vec_splat(d0, 13);
  const uint8x16_t v14_0 = vec_splat(d0, 14);
  const uint8x16_t v15_0 = vec_splat(d0, 15);

  const uint8x16_t v0_1 = vec_splat(d1, 0);
  const uint8x16_t v1_1 = vec_splat(d1, 1);
  const uint8x16_t v2_1 = vec_splat(d1, 2);
  const uint8x16_t v3_1 = vec_splat(d1, 3);
  const uint8x16_t v4_1 = vec_splat(d1, 4);
  const uint8x16_t v5_1 = vec_splat(d1, 5);
  const uint8x16_t v6_1 = vec_splat(d1, 6);
  const uint8x16_t v7_1 = vec_splat(d1, 7);
  const uint8x16_t v8_1 = vec_splat(d1, 8);
  const uint8x16_t v9_1 = vec_splat(d1, 9);
  const uint8x16_t v10_1 = vec_splat(d1, 10);
  const uint8x16_t v11_1 = vec_splat(d1, 11);
  const uint8x16_t v12_1 = vec_splat(d1, 12);
  const uint8x16_t v13_1 = vec_splat(d1, 13);
  const uint8x16_t v14_1 = vec_splat(d1, 14);
  const uint8x16_t v15_1 = vec_splat(d1, 15);

  (void)above;

  H_PREDICTOR_32(v0_0);
  H_PREDICTOR_32(v1_0);
  H_PREDICTOR_32(v2_0);
  H_PREDICTOR_32(v3_0);

  H_PREDICTOR_32(v4_0);
  H_PREDICTOR_32(v5_0);
  H_PREDICTOR_32(v6_0);
  H_PREDICTOR_32(v7_0);

  H_PREDICTOR_32(v8_0);
  H_PREDICTOR_32(v9_0);
  H_PREDICTOR_32(v10_0);
  H_PREDICTOR_32(v11_0);

  H_PREDICTOR_32(v12_0);
  H_PREDICTOR_32(v13_0);
  H_PREDICTOR_32(v14_0);
  H_PREDICTOR_32(v15_0);

  H_PREDICTOR_32(v0_1);
  H_PREDICTOR_32(v1_1);
  H_PREDICTOR_32(v2_1);
  H_PREDICTOR_32(v3_1);

  H_PREDICTOR_32(v4_1);
  H_PREDICTOR_32(v5_1);
  H_PREDICTOR_32(v6_1);
  H_PREDICTOR_32(v7_1);

  H_PREDICTOR_32(v8_1);
  H_PREDICTOR_32(v9_1);
  H_PREDICTOR_32(v10_1);
  H_PREDICTOR_32(v11_1);

  H_PREDICTOR_32(v12_1);
  H_PREDICTOR_32(v13_1);
  H_PREDICTOR_32(v14_1);
  H_PREDICTOR_32(v15_1);
}
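// TM ("TrueMotion") prediction: each output pixel is left[row] + above[col] -
// above[-1], computed in 16 bits and clamped to [0, 255] by the saturating
// pack back to 8 bits.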
void vpx_tm_predictor_4x4_vsx(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
  const int16x8_t l = unpack_to_s16_h(vec_vsx_ld(0, left));
  const int16x8_t a = unpack_to_s16_h(vec_vsx_ld(0, above));
  int16x8_t tmp, val;
  uint8x16_t d;

  d = vec_vsx_ld(0, dst);
  tmp = unpack_to_s16_l(d);
  val = vec_sub(vec_add(vec_splat(l, 0), a), tl);
  vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
  dst += stride;

  d = vec_vsx_ld(0, dst);
  tmp = unpack_to_s16_l(d);
  val = vec_sub(vec_add(vec_splat(l, 1), a), tl);
  vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
  dst += stride;

  d = vec_vsx_ld(0, dst);
  tmp = unpack_to_s16_l(d);
  val = vec_sub(vec_add(vec_splat(l, 2), a), tl);
  vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
  dst += stride;

  d = vec_vsx_ld(0, dst);
  tmp = unpack_to_s16_l(d);
  val = vec_sub(vec_add(vec_splat(l, 3), a), tl);
  vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
}

void vpx_tm_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
  const int16x8_t l = unpack_to_s16_h(vec_vsx_ld(0, left));
  const int16x8_t a = unpack_to_s16_h(vec_vsx_ld(0, above));
  int16x8_t tmp, val;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 0), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 1), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 2), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 3), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 4), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 5), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 6), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 7), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
}

static void tm_predictor_16x8(uint8_t *dst, const ptrdiff_t stride, int16x8_t l,
                              int16x8_t ah, int16x8_t al, int16x8_t tl) {
  int16x8_t vh, vl, ls;

  ls = vec_splat(l, 0);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 1);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 2);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 3);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 4);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 5);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 6);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 7);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
}

void vpx_tm_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
  const uint8x16_t l = vec_vsx_ld(0, left);
  const int16x8_t lh = unpack_to_s16_h(l);
  const int16x8_t ll = unpack_to_s16_l(l);
  const uint8x16_t a = vec_vsx_ld(0, above);
  const int16x8_t ah = unpack_to_s16_h(a);
  const int16x8_t al = unpack_to_s16_l(a);

  tm_predictor_16x8(dst, stride, lh, ah, al, tl);

  dst += stride * 8;

  tm_predictor_16x8(dst, stride, ll, ah, al, tl);
}

static INLINE void tm_predictor_32x1(uint8_t *dst, const int16x8_t ls,
                                     const int16x8_t a0h, const int16x8_t a0l,
                                     const int16x8_t a1h, const int16x8_t a1l,
                                     const int16x8_t tl) {
  int16x8_t vh, vl;

  vh = vec_sub(vec_add(ls, a0h), tl);
  vl = vec_sub(vec_add(ls, a0l), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  vh = vec_sub(vec_add(ls, a1h), tl);
  vl = vec_sub(vec_add(ls, a1l), tl);
  vec_vsx_st(vec_packsu(vh, vl), 16, dst);
}

static void tm_predictor_32x8(uint8_t *dst, const ptrdiff_t stride,
                              const int16x8_t l, const uint8x16_t a0,
                              const uint8x16_t a1, const int16x8_t tl) {
  const int16x8_t a0h = unpack_to_s16_h(a0);
  const int16x8_t a0l = unpack_to_s16_l(a0);
  const int16x8_t a1h = unpack_to_s16_h(a1);
  const int16x8_t a1l = unpack_to_s16_l(a1);

  tm_predictor_32x1(dst, vec_splat(l, 0), a0h, a0l, a1h, a1l, tl);
  dst += stride;

  tm_predictor_32x1(dst, vec_splat(l, 1), a0h, a0l, a1h, a1l, tl);
  dst += stride;

  tm_predictor_32x1(dst, vec_splat(l, 2), a0h, a0l, a1h, a1l, tl);
  dst += stride;

  tm_predictor_32x1(dst, vec_splat(l, 3), a0h, a0l, a1h, a1l, tl);
  dst += stride;

  tm_predictor_32x1(dst, vec_splat(l, 4), a0h, a0l, a1h, a1l, tl);
  dst += stride;

  tm_predictor_32x1(dst, vec_splat(l, 5), a0h, a0l, a1h, a1l, tl);
  dst += stride;

  tm_predictor_32x1(dst, vec_splat(l, 6), a0h, a0l, a1h, a1l, tl);
  dst += stride;

  tm_predictor_32x1(dst, vec_splat(l, 7), a0h, a0l, a1h, a1l, tl);
}

void vpx_tm_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
  const uint8x16_t l0 = vec_vsx_ld(0, left);
  const uint8x16_t l1 = vec_vsx_ld(16, left);
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t a1 = vec_vsx_ld(16, above);

  tm_predictor_32x8(dst, stride, unpack_to_s16_h(l0), a0, a1, tl);
  dst += stride * 8;

  tm_predictor_32x8(dst, stride, unpack_to_s16_l(l0), a0, a1, tl);
  dst += stride * 8;

  tm_predictor_32x8(dst, stride, unpack_to_s16_h(l1), a0, a1, tl);
  dst += stride * 8;

  tm_predictor_32x8(dst, stride, unpack_to_s16_l(l1), a0, a1, tl);
}

static INLINE void dc_fill_predictor_8x8(uint8_t *dst, const ptrdiff_t stride,
                                         const uint8x16_t val) {
  int i;

  for (i = 0; i < 8; i++, dst += stride) {
    const uint8x16_t d = vec_vsx_ld(0, dst);
    vec_vsx_st(xxpermdi(val, d, 1), 0, dst);
  }
}

static INLINE void dc_fill_predictor_16x16(uint8_t *dst, const ptrdiff_t stride,
                                           const uint8x16_t val) {
  int i;

  for (i = 0; i < 16; i++, dst += stride) {
    vec_vsx_st(val, 0, dst);
  }
}

void vpx_dc_128_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const uint8x16_t v128 = vec_sl(vec_splat_u8(1), vec_splat_u8(7));
  (void)above;
  (void)left;

  dc_fill_predictor_16x16(dst, stride, v128);
}

static INLINE void dc_fill_predictor_32x32(uint8_t *dst, const ptrdiff_t stride,
                                           const uint8x16_t val) {
  int i;

  for (i = 0; i < 32; i++, dst += stride) {
    vec_vsx_st(val, 0, dst);
    vec_vsx_st(val, 16, dst);
  }
}

void vpx_dc_128_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const uint8x16_t v128 = vec_sl(vec_splat_u8(1), vec_splat_u8(7));
  (void)above;
  (void)left;

  dc_fill_predictor_32x32(dst, stride, v128);
}
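// Rounded average of 16 8-bit samples, (sum + 8) >> 4, replicated into every
// byte lane. Used by the 16x16 DC_LEFT and DC_TOP predictors.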
static uint8x16_t avg16(const uint8_t *values) {
  const int32x4_t sum4s =
      (int32x4_t)vec_sum4s(vec_vsx_ld(0, values), vec_splat_u32(0));
  const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, vec_splat_s32(8));
  const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(4));

  return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
                   3);
}

void vpx_dc_left_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;

  dc_fill_predictor_16x16(dst, stride, avg16(left));
}

void vpx_dc_top_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;

  dc_fill_predictor_16x16(dst, stride, avg16(above));
}

static uint8x16_t avg32(const uint8_t *values) {
  const uint8x16_t v0 = vec_vsx_ld(0, values);
  const uint8x16_t v1 = vec_vsx_ld(16, values);
  const int32x4_t v16 = vec_sl(vec_splat_s32(1), vec_splat_u32(4));
  const int32x4_t sum4s =
      (int32x4_t)vec_sum4s(v0, vec_sum4s(v1, vec_splat_u32(0)));
  const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v16);
  const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(5));

  return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
                   3);
}

void vpx_dc_left_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;

  dc_fill_predictor_32x32(dst, stride, avg32(left));
}

void vpx_dc_top_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;

  dc_fill_predictor_32x32(dst, stride, avg32(above));
}
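// Rounded average of 8 'above' and 8 'left' samples (16 values in total) for
// the 8x8 DC predictor; the unused upper partial sums are masked to zero
// before the final reduction.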
static uint8x16_t dc_avg8(const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t l0 = vec_vsx_ld(0, left);
  const int32x4_t sum4s =
      (int32x4_t)vec_sum4s(l0, vec_sum4s(a0, vec_splat_u32(0)));
  const int32x4_t sum4s8 = xxpermdi(sum4s, vec_splat_s32(0), 1);
  const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s8, vec_splat_s32(8));
  const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(4));

  return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
                   3);
}

static uint8x16_t dc_avg16(const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t l0 = vec_vsx_ld(0, left);
  const int32x4_t v16 = vec_sl(vec_splat_s32(1), vec_splat_u32(4));
  const int32x4_t sum4s =
      (int32x4_t)vec_sum4s(l0, vec_sum4s(a0, vec_splat_u32(0)));
  const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v16);
  const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(5));

  return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
                   3);
}

void vpx_dc_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  dc_fill_predictor_8x8(dst, stride, dc_avg8(above, left));
}

void vpx_dc_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  dc_fill_predictor_16x16(dst, stride, dc_avg16(above, left));
}

static uint8x16_t dc_avg32(const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t a1 = vec_vsx_ld(16, above);
  const uint8x16_t l0 = vec_vsx_ld(0, left);
  const uint8x16_t l1 = vec_vsx_ld(16, left);
  const int32x4_t v32 = vec_sl(vec_splat_s32(1), vec_splat_u32(5));
  const uint32x4_t a_sum = vec_sum4s(a0, vec_sum4s(a1, vec_splat_u32(0)));
  const int32x4_t sum4s = (int32x4_t)vec_sum4s(l0, vec_sum4s(l1, a_sum));
  const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v32);
  const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(6));

  return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
                   3);
}

void vpx_dc_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  dc_fill_predictor_32x32(dst, stride, dc_avg32(above, left));
}
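// Three-tap rounded average, (a + 2 * b + c + 2) >> 2, computed without
// overflowing 8 bits: (a & c) + ((a ^ c) >> 1) is floor((a + c) / 2), and
// vec_avg adds b with rounding.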
static uint8x16_t avg3(const uint8x16_t a, const uint8x16_t b,
                       const uint8x16_t c) {
  const uint8x16_t ac =
      vec_adds(vec_and(a, c), vec_sr(vec_xor(a, c), vec_splat_u8(1)));

  return vec_avg(ac, b);
}

// Workaround vec_sld/vec_xxsldi/vec_lsdoi being missing or broken.
static const uint8x16_t sl1 = { 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8,
                                0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x10 };
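// D45/D63 prediction: the 'above' row is filtered with avg3 (and vec_avg for
// the half-step D63 rows), then shifted one byte to the left per output row
// so the texture propagates along the diagonal.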
void vpx_d45_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t af = vec_vsx_ld(0, above);
  const uint8x16_t above_right = vec_splat(af, 7);
  const uint8x16_t a = xxpermdi(af, above_right, 1);
  const uint8x16_t b = vec_perm(a, above_right, sl1);
  const uint8x16_t c = vec_perm(b, above_right, sl1);
  uint8x16_t row = avg3(a, b, c);
  int i;
  (void)left;

  for (i = 0; i < 8; i++) {
    const uint8x16_t d = vec_vsx_ld(0, dst);
    vec_vsx_st(xxpermdi(row, d, 1), 0, dst);
    dst += stride;
    row = vec_perm(row, above_right, sl1);
  }
}

void vpx_d45_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a = vec_vsx_ld(0, above);
  const uint8x16_t above_right = vec_splat(a, 15);
  const uint8x16_t b = vec_perm(a, above_right, sl1);
  const uint8x16_t c = vec_perm(b, above_right, sl1);
  uint8x16_t row = avg3(a, b, c);
  int i;
  (void)left;

  for (i = 0; i < 16; i++) {
    vec_vsx_st(row, 0, dst);
    dst += stride;
    row = vec_perm(row, above_right, sl1);
  }
}

void vpx_d45_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t a1 = vec_vsx_ld(16, above);
  const uint8x16_t above_right = vec_splat(a1, 15);
  const uint8x16_t b0 = vec_perm(a0, a1, sl1);
  const uint8x16_t b1 = vec_perm(a1, above_right, sl1);
  const uint8x16_t c0 = vec_perm(b0, b1, sl1);
  const uint8x16_t c1 = vec_perm(b1, above_right, sl1);
  uint8x16_t row0 = avg3(a0, b0, c0);
  uint8x16_t row1 = avg3(a1, b1, c1);
  int i;
  (void)left;

  for (i = 0; i < 32; i++) {
    vec_vsx_st(row0, 0, dst);
    vec_vsx_st(row1, 16, dst);
    dst += stride;
    row0 = vec_perm(row0, row1, sl1);
    row1 = vec_perm(row1, above_right, sl1);
  }
}

void vpx_d63_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t af = vec_vsx_ld(0, above);
  const uint8x16_t above_right = vec_splat(af, 9);
  const uint8x16_t a = xxpermdi(af, above_right, 1);
  const uint8x16_t b = vec_perm(a, above_right, sl1);
  const uint8x16_t c = vec_perm(b, above_right, sl1);
  uint8x16_t row0 = vec_avg(a, b);
  uint8x16_t row1 = avg3(a, b, c);
  int i;
  (void)left;

  for (i = 0; i < 4; i++) {
    const uint8x16_t d0 = vec_vsx_ld(0, dst);
    const uint8x16_t d1 = vec_vsx_ld(0, dst + stride);
    vec_vsx_st(xxpermdi(row0, d0, 1), 0, dst);
    vec_vsx_st(xxpermdi(row1, d1, 1), 0, dst + stride);
    dst += stride * 2;
    row0 = vec_perm(row0, above_right, sl1);
    row1 = vec_perm(row1, above_right, sl1);
  }
}

void vpx_d63_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t a1 = vec_vsx_ld(16, above);
  const uint8x16_t above_right = vec_splat(a1, 0);
  const uint8x16_t b = vec_perm(a0, above_right, sl1);
  const uint8x16_t c = vec_perm(b, above_right, sl1);
  uint8x16_t row0 = vec_avg(a0, b);
  uint8x16_t row1 = avg3(a0, b, c);
  int i;
  (void)left;

  for (i = 0; i < 8; i++) {
    vec_vsx_st(row0, 0, dst);
    vec_vsx_st(row1, 0, dst + stride);
    dst += stride * 2;
    row0 = vec_perm(row0, above_right, sl1);
    row1 = vec_perm(row1, above_right, sl1);
  }
}

void vpx_d63_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t a1 = vec_vsx_ld(16, above);
  const uint8x16_t a2 = vec_vsx_ld(32, above);
  const uint8x16_t above_right = vec_splat(a2, 0);
  const uint8x16_t b0 = vec_perm(a0, a1, sl1);
  const uint8x16_t b1 = vec_perm(a1, above_right, sl1);
  const uint8x16_t c0 = vec_perm(b0, b1, sl1);
  const uint8x16_t c1 = vec_perm(b1, above_right, sl1);
  uint8x16_t row0_0 = vec_avg(a0, b0);
  uint8x16_t row0_1 = vec_avg(a1, b1);
  uint8x16_t row1_0 = avg3(a0, b0, c0);
  uint8x16_t row1_1 = avg3(a1, b1, c1);
  int i;
  (void)left;

  for (i = 0; i < 16; i++) {
    vec_vsx_st(row0_0, 0, dst);
    vec_vsx_st(row0_1, 16, dst);
    vec_vsx_st(row1_0, 0, dst + stride);
    vec_vsx_st(row1_1, 16, dst + stride);
    dst += stride * 2;
    row0_0 = vec_perm(row0_0, row0_1, sl1);
    row0_1 = vec_perm(row0_1, above_right, sl1);
    row1_0 = vec_perm(row1_0, row1_1, sl1);
    row1_1 = vec_perm(row1_1, above_right, sl1);
  }
}