1 /*
2  *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "./vpx_dsp_rtcd.h"
12 #include "vpx_dsp/ppc/types_vsx.h"
13 
vpx_v_predictor_16x16_vsx(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)14 void vpx_v_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
15                                const uint8_t *above, const uint8_t *left) {
16   const uint8x16_t d = vec_vsx_ld(0, above);
17   int i;
18   (void)left;
19 
20   for (i = 0; i < 16; i++, dst += stride) {
21     vec_vsx_st(d, 0, dst);
22   }
23 }
24 
vpx_v_predictor_32x32_vsx(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)25 void vpx_v_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
26                                const uint8_t *above, const uint8_t *left) {
27   const uint8x16_t d0 = vec_vsx_ld(0, above);
28   const uint8x16_t d1 = vec_vsx_ld(16, above);
29   int i;
30   (void)left;
31 
32   for (i = 0; i < 32; i++, dst += stride) {
33     vec_vsx_st(d0, 0, dst);
34     vec_vsx_st(d1, 16, dst);
35   }
36 }
37 
38 static const uint32x4_t mask4 = { 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
39 
vpx_h_predictor_4x4_vsx(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)40 void vpx_h_predictor_4x4_vsx(uint8_t *dst, ptrdiff_t stride,
41                              const uint8_t *above, const uint8_t *left) {
42   const uint8x16_t d = vec_vsx_ld(0, left);
43   const uint8x16_t v0 = vec_splat(d, 0);
44   const uint8x16_t v1 = vec_splat(d, 1);
45   const uint8x16_t v2 = vec_splat(d, 2);
46   const uint8x16_t v3 = vec_splat(d, 3);
47 
48   (void)above;
49 
50   vec_vsx_st(vec_sel(v0, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
51   dst += stride;
52   vec_vsx_st(vec_sel(v1, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
53   dst += stride;
54   vec_vsx_st(vec_sel(v2, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
55   dst += stride;
56   vec_vsx_st(vec_sel(v3, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
57 }
58 
vpx_h_predictor_8x8_vsx(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)59 void vpx_h_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
60                              const uint8_t *above, const uint8_t *left) {
61   const uint8x16_t d = vec_vsx_ld(0, left);
62   const uint8x16_t v0 = vec_splat(d, 0);
63   const uint8x16_t v1 = vec_splat(d, 1);
64   const uint8x16_t v2 = vec_splat(d, 2);
65   const uint8x16_t v3 = vec_splat(d, 3);
66 
67   const uint8x16_t v4 = vec_splat(d, 4);
68   const uint8x16_t v5 = vec_splat(d, 5);
69   const uint8x16_t v6 = vec_splat(d, 6);
70   const uint8x16_t v7 = vec_splat(d, 7);
71 
72   (void)above;
73 
74   vec_vsx_st(xxpermdi(v0, vec_vsx_ld(0, dst), 1), 0, dst);
75   dst += stride;
76   vec_vsx_st(xxpermdi(v1, vec_vsx_ld(0, dst), 1), 0, dst);
77   dst += stride;
78   vec_vsx_st(xxpermdi(v2, vec_vsx_ld(0, dst), 1), 0, dst);
79   dst += stride;
80   vec_vsx_st(xxpermdi(v3, vec_vsx_ld(0, dst), 1), 0, dst);
81   dst += stride;
82   vec_vsx_st(xxpermdi(v4, vec_vsx_ld(0, dst), 1), 0, dst);
83   dst += stride;
84   vec_vsx_st(xxpermdi(v5, vec_vsx_ld(0, dst), 1), 0, dst);
85   dst += stride;
86   vec_vsx_st(xxpermdi(v6, vec_vsx_ld(0, dst), 1), 0, dst);
87   dst += stride;
88   vec_vsx_st(xxpermdi(v7, vec_vsx_ld(0, dst), 1), 0, dst);
89 }
90 
vpx_h_predictor_16x16_vsx(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)91 void vpx_h_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
92                                const uint8_t *above, const uint8_t *left) {
93   const uint8x16_t d = vec_vsx_ld(0, left);
94   const uint8x16_t v0 = vec_splat(d, 0);
95   const uint8x16_t v1 = vec_splat(d, 1);
96   const uint8x16_t v2 = vec_splat(d, 2);
97   const uint8x16_t v3 = vec_splat(d, 3);
98 
99   const uint8x16_t v4 = vec_splat(d, 4);
100   const uint8x16_t v5 = vec_splat(d, 5);
101   const uint8x16_t v6 = vec_splat(d, 6);
102   const uint8x16_t v7 = vec_splat(d, 7);
103 
104   const uint8x16_t v8 = vec_splat(d, 8);
105   const uint8x16_t v9 = vec_splat(d, 9);
106   const uint8x16_t v10 = vec_splat(d, 10);
107   const uint8x16_t v11 = vec_splat(d, 11);
108 
109   const uint8x16_t v12 = vec_splat(d, 12);
110   const uint8x16_t v13 = vec_splat(d, 13);
111   const uint8x16_t v14 = vec_splat(d, 14);
112   const uint8x16_t v15 = vec_splat(d, 15);
113 
114   (void)above;
115 
116   vec_vsx_st(v0, 0, dst);
117   dst += stride;
118   vec_vsx_st(v1, 0, dst);
119   dst += stride;
120   vec_vsx_st(v2, 0, dst);
121   dst += stride;
122   vec_vsx_st(v3, 0, dst);
123   dst += stride;
124   vec_vsx_st(v4, 0, dst);
125   dst += stride;
126   vec_vsx_st(v5, 0, dst);
127   dst += stride;
128   vec_vsx_st(v6, 0, dst);
129   dst += stride;
130   vec_vsx_st(v7, 0, dst);
131   dst += stride;
132   vec_vsx_st(v8, 0, dst);
133   dst += stride;
134   vec_vsx_st(v9, 0, dst);
135   dst += stride;
136   vec_vsx_st(v10, 0, dst);
137   dst += stride;
138   vec_vsx_st(v11, 0, dst);
139   dst += stride;
140   vec_vsx_st(v12, 0, dst);
141   dst += stride;
142   vec_vsx_st(v13, 0, dst);
143   dst += stride;
144   vec_vsx_st(v14, 0, dst);
145   dst += stride;
146   vec_vsx_st(v15, 0, dst);
147 }
148 
// Stores the vector `v` at byte offsets 0 and 16 of the current row and then
// advances `dst` by `stride`. `dst` and `stride` must be in scope at the point
// of use. `v` is expanded twice, so it should be free of side effects.
// Wrapped in do/while (0) so the expansion is a single statement and stays
// correct as the body of an unbraced if/else or loop.
#define H_PREDICTOR_32(v)     \
  do {                        \
    vec_vsx_st((v), 0, dst);  \
    vec_vsx_st((v), 16, dst); \
    dst += stride;            \
  } while (0)
153 
vpx_h_predictor_32x32_vsx(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)154 void vpx_h_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
155                                const uint8_t *above, const uint8_t *left) {
156   const uint8x16_t d0 = vec_vsx_ld(0, left);
157   const uint8x16_t d1 = vec_vsx_ld(16, left);
158 
159   const uint8x16_t v0_0 = vec_splat(d0, 0);
160   const uint8x16_t v1_0 = vec_splat(d0, 1);
161   const uint8x16_t v2_0 = vec_splat(d0, 2);
162   const uint8x16_t v3_0 = vec_splat(d0, 3);
163   const uint8x16_t v4_0 = vec_splat(d0, 4);
164   const uint8x16_t v5_0 = vec_splat(d0, 5);
165   const uint8x16_t v6_0 = vec_splat(d0, 6);
166   const uint8x16_t v7_0 = vec_splat(d0, 7);
167   const uint8x16_t v8_0 = vec_splat(d0, 8);
168   const uint8x16_t v9_0 = vec_splat(d0, 9);
169   const uint8x16_t v10_0 = vec_splat(d0, 10);
170   const uint8x16_t v11_0 = vec_splat(d0, 11);
171   const uint8x16_t v12_0 = vec_splat(d0, 12);
172   const uint8x16_t v13_0 = vec_splat(d0, 13);
173   const uint8x16_t v14_0 = vec_splat(d0, 14);
174   const uint8x16_t v15_0 = vec_splat(d0, 15);
175 
176   const uint8x16_t v0_1 = vec_splat(d1, 0);
177   const uint8x16_t v1_1 = vec_splat(d1, 1);
178   const uint8x16_t v2_1 = vec_splat(d1, 2);
179   const uint8x16_t v3_1 = vec_splat(d1, 3);
180   const uint8x16_t v4_1 = vec_splat(d1, 4);
181   const uint8x16_t v5_1 = vec_splat(d1, 5);
182   const uint8x16_t v6_1 = vec_splat(d1, 6);
183   const uint8x16_t v7_1 = vec_splat(d1, 7);
184   const uint8x16_t v8_1 = vec_splat(d1, 8);
185   const uint8x16_t v9_1 = vec_splat(d1, 9);
186   const uint8x16_t v10_1 = vec_splat(d1, 10);
187   const uint8x16_t v11_1 = vec_splat(d1, 11);
188   const uint8x16_t v12_1 = vec_splat(d1, 12);
189   const uint8x16_t v13_1 = vec_splat(d1, 13);
190   const uint8x16_t v14_1 = vec_splat(d1, 14);
191   const uint8x16_t v15_1 = vec_splat(d1, 15);
192 
193   (void)above;
194 
195   H_PREDICTOR_32(v0_0);
196   H_PREDICTOR_32(v1_0);
197   H_PREDICTOR_32(v2_0);
198   H_PREDICTOR_32(v3_0);
199 
200   H_PREDICTOR_32(v4_0);
201   H_PREDICTOR_32(v5_0);
202   H_PREDICTOR_32(v6_0);
203   H_PREDICTOR_32(v7_0);
204 
205   H_PREDICTOR_32(v8_0);
206   H_PREDICTOR_32(v9_0);
207   H_PREDICTOR_32(v10_0);
208   H_PREDICTOR_32(v11_0);
209 
210   H_PREDICTOR_32(v12_0);
211   H_PREDICTOR_32(v13_0);
212   H_PREDICTOR_32(v14_0);
213   H_PREDICTOR_32(v15_0);
214 
215   H_PREDICTOR_32(v0_1);
216   H_PREDICTOR_32(v1_1);
217   H_PREDICTOR_32(v2_1);
218   H_PREDICTOR_32(v3_1);
219 
220   H_PREDICTOR_32(v4_1);
221   H_PREDICTOR_32(v5_1);
222   H_PREDICTOR_32(v6_1);
223   H_PREDICTOR_32(v7_1);
224 
225   H_PREDICTOR_32(v8_1);
226   H_PREDICTOR_32(v9_1);
227   H_PREDICTOR_32(v10_1);
228   H_PREDICTOR_32(v11_1);
229 
230   H_PREDICTOR_32(v12_1);
231   H_PREDICTOR_32(v13_1);
232   H_PREDICTOR_32(v14_1);
233   H_PREDICTOR_32(v15_1);
234 }
235 
vpx_tm_predictor_4x4_vsx(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)236 void vpx_tm_predictor_4x4_vsx(uint8_t *dst, ptrdiff_t stride,
237                               const uint8_t *above, const uint8_t *left) {
238   const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
239   const int16x8_t l = unpack_to_s16_h(vec_vsx_ld(0, left));
240   const int16x8_t a = unpack_to_s16_h(vec_vsx_ld(0, above));
241   int16x8_t tmp, val;
242   uint8x16_t d;
243 
244   d = vec_vsx_ld(0, dst);
245   tmp = unpack_to_s16_l(d);
246   val = vec_sub(vec_add(vec_splat(l, 0), a), tl);
247   vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
248   dst += stride;
249 
250   d = vec_vsx_ld(0, dst);
251   tmp = unpack_to_s16_l(d);
252   val = vec_sub(vec_add(vec_splat(l, 1), a), tl);
253   vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
254   dst += stride;
255 
256   d = vec_vsx_ld(0, dst);
257   tmp = unpack_to_s16_l(d);
258   val = vec_sub(vec_add(vec_splat(l, 2), a), tl);
259   vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
260   dst += stride;
261 
262   d = vec_vsx_ld(0, dst);
263   tmp = unpack_to_s16_l(d);
264   val = vec_sub(vec_add(vec_splat(l, 3), a), tl);
265   vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
266 }
267 
vpx_tm_predictor_8x8_vsx(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)268 void vpx_tm_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
269                               const uint8_t *above, const uint8_t *left) {
270   const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
271   const int16x8_t l = unpack_to_s16_h(vec_vsx_ld(0, left));
272   const int16x8_t a = unpack_to_s16_h(vec_vsx_ld(0, above));
273   int16x8_t tmp, val;
274 
275   tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
276   val = vec_sub(vec_add(vec_splat(l, 0), a), tl);
277   vec_vsx_st(vec_packsu(val, tmp), 0, dst);
278   dst += stride;
279 
280   tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
281   val = vec_sub(vec_add(vec_splat(l, 1), a), tl);
282   vec_vsx_st(vec_packsu(val, tmp), 0, dst);
283   dst += stride;
284 
285   tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
286   val = vec_sub(vec_add(vec_splat(l, 2), a), tl);
287   vec_vsx_st(vec_packsu(val, tmp), 0, dst);
288   dst += stride;
289 
290   tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
291   val = vec_sub(vec_add(vec_splat(l, 3), a), tl);
292   vec_vsx_st(vec_packsu(val, tmp), 0, dst);
293   dst += stride;
294 
295   tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
296   val = vec_sub(vec_add(vec_splat(l, 4), a), tl);
297   vec_vsx_st(vec_packsu(val, tmp), 0, dst);
298   dst += stride;
299 
300   tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
301   val = vec_sub(vec_add(vec_splat(l, 5), a), tl);
302   vec_vsx_st(vec_packsu(val, tmp), 0, dst);
303   dst += stride;
304 
305   tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
306   val = vec_sub(vec_add(vec_splat(l, 6), a), tl);
307   vec_vsx_st(vec_packsu(val, tmp), 0, dst);
308   dst += stride;
309 
310   tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
311   val = vec_sub(vec_add(vec_splat(l, 7), a), tl);
312   vec_vsx_st(vec_packsu(val, tmp), 0, dst);
313 }
314 
tm_predictor_16x8(uint8_t * dst,const ptrdiff_t stride,int16x8_t l,int16x8_t ah,int16x8_t al,int16x8_t tl)315 static void tm_predictor_16x8(uint8_t *dst, const ptrdiff_t stride, int16x8_t l,
316                               int16x8_t ah, int16x8_t al, int16x8_t tl) {
317   int16x8_t vh, vl, ls;
318 
319   ls = vec_splat(l, 0);
320   vh = vec_sub(vec_add(ls, ah), tl);
321   vl = vec_sub(vec_add(ls, al), tl);
322   vec_vsx_st(vec_packsu(vh, vl), 0, dst);
323   dst += stride;
324 
325   ls = vec_splat(l, 1);
326   vh = vec_sub(vec_add(ls, ah), tl);
327   vl = vec_sub(vec_add(ls, al), tl);
328   vec_vsx_st(vec_packsu(vh, vl), 0, dst);
329   dst += stride;
330 
331   ls = vec_splat(l, 2);
332   vh = vec_sub(vec_add(ls, ah), tl);
333   vl = vec_sub(vec_add(ls, al), tl);
334   vec_vsx_st(vec_packsu(vh, vl), 0, dst);
335   dst += stride;
336 
337   ls = vec_splat(l, 3);
338   vh = vec_sub(vec_add(ls, ah), tl);
339   vl = vec_sub(vec_add(ls, al), tl);
340   vec_vsx_st(vec_packsu(vh, vl), 0, dst);
341   dst += stride;
342 
343   ls = vec_splat(l, 4);
344   vh = vec_sub(vec_add(ls, ah), tl);
345   vl = vec_sub(vec_add(ls, al), tl);
346   vec_vsx_st(vec_packsu(vh, vl), 0, dst);
347   dst += stride;
348 
349   ls = vec_splat(l, 5);
350   vh = vec_sub(vec_add(ls, ah), tl);
351   vl = vec_sub(vec_add(ls, al), tl);
352   vec_vsx_st(vec_packsu(vh, vl), 0, dst);
353   dst += stride;
354 
355   ls = vec_splat(l, 6);
356   vh = vec_sub(vec_add(ls, ah), tl);
357   vl = vec_sub(vec_add(ls, al), tl);
358   vec_vsx_st(vec_packsu(vh, vl), 0, dst);
359   dst += stride;
360 
361   ls = vec_splat(l, 7);
362   vh = vec_sub(vec_add(ls, ah), tl);
363   vl = vec_sub(vec_add(ls, al), tl);
364   vec_vsx_st(vec_packsu(vh, vl), 0, dst);
365 }
366 
vpx_tm_predictor_16x16_vsx(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)367 void vpx_tm_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
368                                 const uint8_t *above, const uint8_t *left) {
369   const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
370   const uint8x16_t l = vec_vsx_ld(0, left);
371   const int16x8_t lh = unpack_to_s16_h(l);
372   const int16x8_t ll = unpack_to_s16_l(l);
373   const uint8x16_t a = vec_vsx_ld(0, above);
374   const int16x8_t ah = unpack_to_s16_h(a);
375   const int16x8_t al = unpack_to_s16_l(a);
376 
377   tm_predictor_16x8(dst, stride, lh, ah, al, tl);
378 
379   dst += stride * 8;
380 
381   tm_predictor_16x8(dst, stride, ll, ah, al, tl);
382 }
383 
tm_predictor_32x1(uint8_t * dst,const int16x8_t ls,const int16x8_t a0h,const int16x8_t a0l,const int16x8_t a1h,const int16x8_t a1l,const int16x8_t tl)384 static INLINE void tm_predictor_32x1(uint8_t *dst, const int16x8_t ls,
385                                      const int16x8_t a0h, const int16x8_t a0l,
386                                      const int16x8_t a1h, const int16x8_t a1l,
387                                      const int16x8_t tl) {
388   int16x8_t vh, vl;
389 
390   vh = vec_sub(vec_add(ls, a0h), tl);
391   vl = vec_sub(vec_add(ls, a0l), tl);
392   vec_vsx_st(vec_packsu(vh, vl), 0, dst);
393   vh = vec_sub(vec_add(ls, a1h), tl);
394   vl = vec_sub(vec_add(ls, a1l), tl);
395   vec_vsx_st(vec_packsu(vh, vl), 16, dst);
396 }
397 
// Writes eight consecutive 32-byte TM rows starting at `dst`. The two above
// vectors are widened to 16-bit halves once; row r then uses element r of `l`
// (splatted) as the left term.
static void tm_predictor_32x8(uint8_t *dst, const ptrdiff_t stride,
                              const int16x8_t l, const uint8x16_t a0,
                              const uint8x16_t a1, const int16x8_t tl) {
  const int16x8_t top0_h = unpack_to_s16_h(a0);
  const int16x8_t top0_l = unpack_to_s16_l(a0);
  const int16x8_t top1_h = unpack_to_s16_h(a1);
  const int16x8_t top1_l = unpack_to_s16_l(a1);

  tm_predictor_32x1(dst, vec_splat(l, 0), top0_h, top0_l, top1_h, top1_l, tl);
  dst += stride;
  tm_predictor_32x1(dst, vec_splat(l, 1), top0_h, top0_l, top1_h, top1_l, tl);
  dst += stride;
  tm_predictor_32x1(dst, vec_splat(l, 2), top0_h, top0_l, top1_h, top1_l, tl);
  dst += stride;
  tm_predictor_32x1(dst, vec_splat(l, 3), top0_h, top0_l, top1_h, top1_l, tl);
  dst += stride;
  tm_predictor_32x1(dst, vec_splat(l, 4), top0_h, top0_l, top1_h, top1_l, tl);
  dst += stride;
  tm_predictor_32x1(dst, vec_splat(l, 5), top0_h, top0_l, top1_h, top1_l, tl);
  dst += stride;
  tm_predictor_32x1(dst, vec_splat(l, 6), top0_h, top0_l, top1_h, top1_l, tl);
  dst += stride;
  tm_predictor_32x1(dst, vec_splat(l, 7), top0_h, top0_l, top1_h, top1_l, tl);
}
429 
vpx_tm_predictor_32x32_vsx(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)430 void vpx_tm_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
431                                 const uint8_t *above, const uint8_t *left) {
432   const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
433   const uint8x16_t l0 = vec_vsx_ld(0, left);
434   const uint8x16_t l1 = vec_vsx_ld(16, left);
435   const uint8x16_t a0 = vec_vsx_ld(0, above);
436   const uint8x16_t a1 = vec_vsx_ld(16, above);
437 
438   tm_predictor_32x8(dst, stride, unpack_to_s16_h(l0), a0, a1, tl);
439   dst += stride * 8;
440 
441   tm_predictor_32x8(dst, stride, unpack_to_s16_l(l0), a0, a1, tl);
442   dst += stride * 8;
443 
444   tm_predictor_32x8(dst, stride, unpack_to_s16_h(l1), a0, a1, tl);
445   dst += stride * 8;
446 
447   tm_predictor_32x8(dst, stride, unpack_to_s16_l(l1), a0, a1, tl);
448 }
449 
dc_fill_predictor_8x8(uint8_t * dst,const ptrdiff_t stride,const uint8x16_t val)450 static INLINE void dc_fill_predictor_8x8(uint8_t *dst, const ptrdiff_t stride,
451                                          const uint8x16_t val) {
452   int i;
453 
454   for (i = 0; i < 8; i++, dst += stride) {
455     const uint8x16_t d = vec_vsx_ld(0, dst);
456     vec_vsx_st(xxpermdi(val, d, 1), 0, dst);
457   }
458 }
459 
dc_fill_predictor_16x16(uint8_t * dst,const ptrdiff_t stride,const uint8x16_t val)460 static INLINE void dc_fill_predictor_16x16(uint8_t *dst, const ptrdiff_t stride,
461                                            const uint8x16_t val) {
462   int i;
463 
464   for (i = 0; i < 16; i++, dst += stride) {
465     vec_vsx_st(val, 0, dst);
466   }
467 }
468 
vpx_dc_128_predictor_16x16_vsx(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)469 void vpx_dc_128_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
470                                     const uint8_t *above, const uint8_t *left) {
471   const uint8x16_t v128 = vec_sl(vec_splat_u8(1), vec_splat_u8(7));
472   (void)above;
473   (void)left;
474 
475   dc_fill_predictor_16x16(dst, stride, v128);
476 }
477 
dc_fill_predictor_32x32(uint8_t * dst,const ptrdiff_t stride,const uint8x16_t val)478 static INLINE void dc_fill_predictor_32x32(uint8_t *dst, const ptrdiff_t stride,
479                                            const uint8x16_t val) {
480   int i;
481 
482   for (i = 0; i < 32; i++, dst += stride) {
483     vec_vsx_st(val, 0, dst);
484     vec_vsx_st(val, 16, dst);
485   }
486 }
487 
vpx_dc_128_predictor_32x32_vsx(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)488 void vpx_dc_128_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
489                                     const uint8_t *above, const uint8_t *left) {
490   const uint8x16_t v128 = vec_sl(vec_splat_u8(1), vec_splat_u8(7));
491   (void)above;
492   (void)left;
493 
494   dc_fill_predictor_32x32(dst, stride, v128);
495 }
496 
avg16(const uint8_t * values)497 static uint8x16_t avg16(const uint8_t *values) {
498   const int32x4_t sum4s =
499       (int32x4_t)vec_sum4s(vec_vsx_ld(0, values), vec_splat_u32(0));
500   const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, vec_splat_s32(8));
501   const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(4));
502 
503   return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
504                    3);
505 }
506 
vpx_dc_left_predictor_16x16_vsx(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)507 void vpx_dc_left_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
508                                      const uint8_t *above,
509                                      const uint8_t *left) {
510   (void)above;
511 
512   dc_fill_predictor_16x16(dst, stride, avg16(left));
513 }
514 
vpx_dc_top_predictor_16x16_vsx(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)515 void vpx_dc_top_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
516                                     const uint8_t *above, const uint8_t *left) {
517   (void)left;
518 
519   dc_fill_predictor_16x16(dst, stride, avg16(above));
520 }
521 
avg32(const uint8_t * values)522 static uint8x16_t avg32(const uint8_t *values) {
523   const uint8x16_t v0 = vec_vsx_ld(0, values);
524   const uint8x16_t v1 = vec_vsx_ld(16, values);
525   const int32x4_t v16 = vec_sl(vec_splat_s32(1), vec_splat_u32(4));
526   const int32x4_t sum4s =
527       (int32x4_t)vec_sum4s(v0, vec_sum4s(v1, vec_splat_u32(0)));
528   const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v16);
529   const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(5));
530 
531   return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
532                    3);
533 }
534 
vpx_dc_left_predictor_32x32_vsx(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)535 void vpx_dc_left_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
536                                      const uint8_t *above,
537                                      const uint8_t *left) {
538   (void)above;
539 
540   dc_fill_predictor_32x32(dst, stride, avg32(left));
541 }
542 
vpx_dc_top_predictor_32x32_vsx(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)543 void vpx_dc_top_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
544                                     const uint8_t *above, const uint8_t *left) {
545   (void)left;
546 
547   dc_fill_predictor_32x32(dst, stride, avg32(above));
548 }
549 
dc_avg8(const uint8_t * above,const uint8_t * left)550 static uint8x16_t dc_avg8(const uint8_t *above, const uint8_t *left) {
551   const uint8x16_t a0 = vec_vsx_ld(0, above);
552   const uint8x16_t l0 = vec_vsx_ld(0, left);
553   const int32x4_t sum4s =
554       (int32x4_t)vec_sum4s(l0, vec_sum4s(a0, vec_splat_u32(0)));
555   const int32x4_t sum4s8 = xxpermdi(sum4s, vec_splat_s32(0), 1);
556   const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s8, vec_splat_s32(8));
557   const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(4));
558 
559   return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
560                    3);
561 }
562 
dc_avg16(const uint8_t * above,const uint8_t * left)563 static uint8x16_t dc_avg16(const uint8_t *above, const uint8_t *left) {
564   const uint8x16_t a0 = vec_vsx_ld(0, above);
565   const uint8x16_t l0 = vec_vsx_ld(0, left);
566   const int32x4_t v16 = vec_sl(vec_splat_s32(1), vec_splat_u32(4));
567   const int32x4_t sum4s =
568       (int32x4_t)vec_sum4s(l0, vec_sum4s(a0, vec_splat_u32(0)));
569   const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v16);
570   const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(5));
571 
572   return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
573                    3);
574 }
575 
vpx_dc_predictor_8x8_vsx(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)576 void vpx_dc_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
577                               const uint8_t *above, const uint8_t *left) {
578   dc_fill_predictor_8x8(dst, stride, dc_avg8(above, left));
579 }
580 
vpx_dc_predictor_16x16_vsx(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)581 void vpx_dc_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
582                                 const uint8_t *above, const uint8_t *left) {
583   dc_fill_predictor_16x16(dst, stride, dc_avg16(above, left));
584 }
585 
dc_avg32(const uint8_t * above,const uint8_t * left)586 static uint8x16_t dc_avg32(const uint8_t *above, const uint8_t *left) {
587   const uint8x16_t a0 = vec_vsx_ld(0, above);
588   const uint8x16_t a1 = vec_vsx_ld(16, above);
589   const uint8x16_t l0 = vec_vsx_ld(0, left);
590   const uint8x16_t l1 = vec_vsx_ld(16, left);
591   const int32x4_t v32 = vec_sl(vec_splat_s32(1), vec_splat_u32(5));
592   const uint32x4_t a_sum = vec_sum4s(a0, vec_sum4s(a1, vec_splat_u32(0)));
593   const int32x4_t sum4s = (int32x4_t)vec_sum4s(l0, vec_sum4s(l1, a_sum));
594   const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v32);
595   const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(6));
596 
597   return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
598                    3);
599 }
600 
vpx_dc_predictor_32x32_vsx(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)601 void vpx_dc_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
602                                 const uint8_t *above, const uint8_t *left) {
603   dc_fill_predictor_32x32(dst, stride, dc_avg32(above, left));
604 }
605 
// Three-input average used by the directional predictors. First
// (a & c) + ((a ^ c) >> 1) is formed per byte, which equals the
// rounded-down mean of a and c; that value is then combined with b through
// vec_avg().
static uint8x16_t avg3(const uint8x16_t a, const uint8x16_t b,
                       const uint8x16_t c) {
  const uint8x16_t shared_bits = vec_and(a, c);
  const uint8x16_t half_diff = vec_sr(vec_xor(a, c), vec_splat_u8(1));
  const uint8x16_t mean_ac = vec_adds(shared_bits, half_diff);

  return vec_avg(mean_ac, b);
}
613 
614 // Workaround vec_sld/vec_xxsldi/vec_lsdoi being missing or broken.
615 static const uint8x16_t sl1 = { 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8,
616                                 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x10 };
617 
vpx_d45_predictor_8x8_vsx(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)618 void vpx_d45_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
619                                const uint8_t *above, const uint8_t *left) {
620   const uint8x16_t af = vec_vsx_ld(0, above);
621   const uint8x16_t above_right = vec_splat(af, 7);
622   const uint8x16_t a = xxpermdi(af, above_right, 1);
623   const uint8x16_t b = vec_perm(a, above_right, sl1);
624   const uint8x16_t c = vec_perm(b, above_right, sl1);
625   uint8x16_t row = avg3(a, b, c);
626   int i;
627   (void)left;
628 
629   for (i = 0; i < 8; i++) {
630     const uint8x16_t d = vec_vsx_ld(0, dst);
631     vec_vsx_st(xxpermdi(row, d, 1), 0, dst);
632     dst += stride;
633     row = vec_perm(row, above_right, sl1);
634   }
635 }
636 
vpx_d45_predictor_16x16_vsx(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)637 void vpx_d45_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
638                                  const uint8_t *above, const uint8_t *left) {
639   const uint8x16_t a = vec_vsx_ld(0, above);
640   const uint8x16_t above_right = vec_splat(a, 15);
641   const uint8x16_t b = vec_perm(a, above_right, sl1);
642   const uint8x16_t c = vec_perm(b, above_right, sl1);
643   uint8x16_t row = avg3(a, b, c);
644   int i;
645   (void)left;
646 
647   for (i = 0; i < 16; i++) {
648     vec_vsx_st(row, 0, dst);
649     dst += stride;
650     row = vec_perm(row, above_right, sl1);
651   }
652 }
653 
vpx_d45_predictor_32x32_vsx(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)654 void vpx_d45_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
655                                  const uint8_t *above, const uint8_t *left) {
656   const uint8x16_t a0 = vec_vsx_ld(0, above);
657   const uint8x16_t a1 = vec_vsx_ld(16, above);
658   const uint8x16_t above_right = vec_splat(a1, 15);
659   const uint8x16_t b0 = vec_perm(a0, a1, sl1);
660   const uint8x16_t b1 = vec_perm(a1, above_right, sl1);
661   const uint8x16_t c0 = vec_perm(b0, b1, sl1);
662   const uint8x16_t c1 = vec_perm(b1, above_right, sl1);
663   uint8x16_t row0 = avg3(a0, b0, c0);
664   uint8x16_t row1 = avg3(a1, b1, c1);
665   int i;
666   (void)left;
667 
668   for (i = 0; i < 32; i++) {
669     vec_vsx_st(row0, 0, dst);
670     vec_vsx_st(row1, 16, dst);
671     dst += stride;
672     row0 = vec_perm(row0, row1, sl1);
673     row1 = vec_perm(row1, above_right, sl1);
674   }
675 }
676 
vpx_d63_predictor_8x8_vsx(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)677 void vpx_d63_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
678                                const uint8_t *above, const uint8_t *left) {
679   const uint8x16_t af = vec_vsx_ld(0, above);
680   const uint8x16_t above_right = vec_splat(af, 9);
681   const uint8x16_t a = xxpermdi(af, above_right, 1);
682   const uint8x16_t b = vec_perm(a, above_right, sl1);
683   const uint8x16_t c = vec_perm(b, above_right, sl1);
684   uint8x16_t row0 = vec_avg(a, b);
685   uint8x16_t row1 = avg3(a, b, c);
686   int i;
687   (void)left;
688 
689   for (i = 0; i < 4; i++) {
690     const uint8x16_t d0 = vec_vsx_ld(0, dst);
691     const uint8x16_t d1 = vec_vsx_ld(0, dst + stride);
692     vec_vsx_st(xxpermdi(row0, d0, 1), 0, dst);
693     vec_vsx_st(xxpermdi(row1, d1, 1), 0, dst + stride);
694     dst += stride * 2;
695     row0 = vec_perm(row0, above_right, sl1);
696     row1 = vec_perm(row1, above_right, sl1);
697   }
698 }
699 
vpx_d63_predictor_16x16_vsx(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)700 void vpx_d63_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
701                                  const uint8_t *above, const uint8_t *left) {
702   const uint8x16_t a0 = vec_vsx_ld(0, above);
703   const uint8x16_t a1 = vec_vsx_ld(16, above);
704   const uint8x16_t above_right = vec_splat(a1, 0);
705   const uint8x16_t b = vec_perm(a0, above_right, sl1);
706   const uint8x16_t c = vec_perm(b, above_right, sl1);
707   uint8x16_t row0 = vec_avg(a0, b);
708   uint8x16_t row1 = avg3(a0, b, c);
709   int i;
710   (void)left;
711 
712   for (i = 0; i < 8; i++) {
713     vec_vsx_st(row0, 0, dst);
714     vec_vsx_st(row1, 0, dst + stride);
715     dst += stride * 2;
716     row0 = vec_perm(row0, above_right, sl1);
717     row1 = vec_perm(row1, above_right, sl1);
718   }
719 }
720 
vpx_d63_predictor_32x32_vsx(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)721 void vpx_d63_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
722                                  const uint8_t *above, const uint8_t *left) {
723   const uint8x16_t a0 = vec_vsx_ld(0, above);
724   const uint8x16_t a1 = vec_vsx_ld(16, above);
725   const uint8x16_t a2 = vec_vsx_ld(32, above);
726   const uint8x16_t above_right = vec_splat(a2, 0);
727   const uint8x16_t b0 = vec_perm(a0, a1, sl1);
728   const uint8x16_t b1 = vec_perm(a1, above_right, sl1);
729   const uint8x16_t c0 = vec_perm(b0, b1, sl1);
730   const uint8x16_t c1 = vec_perm(b1, above_right, sl1);
731   uint8x16_t row0_0 = vec_avg(a0, b0);
732   uint8x16_t row0_1 = vec_avg(a1, b1);
733   uint8x16_t row1_0 = avg3(a0, b0, c0);
734   uint8x16_t row1_1 = avg3(a1, b1, c1);
735   int i;
736   (void)left;
737 
738   for (i = 0; i < 16; i++) {
739     vec_vsx_st(row0_0, 0, dst);
740     vec_vsx_st(row0_1, 16, dst);
741     vec_vsx_st(row1_0, 0, dst + stride);
742     vec_vsx_st(row1_1, 16, dst + stride);
743     dst += stride * 2;
744     row0_0 = vec_perm(row0_0, row0_1, sl1);
745     row0_1 = vec_perm(row0_1, above_right, sl1);
746     row1_0 = vec_perm(row1_0, row1_1, sl1);
747     row1_1 = vec_perm(row1_1, above_right, sl1);
748   }
749 }
750