1 
2 #include <arm_neon.h>
3 
4 
// Names of the six matrix procs defined in this file. Each one is built by
// MAKENAME, which is not defined here and must be supplied before this file
// is included.
#define SCALE_NOFILTER_NAME     MAKENAME(_nofilter_scale)
#define SCALE_FILTER_NAME       MAKENAME(_filter_scale)
#define AFFINE_NOFILTER_NAME    MAKENAME(_nofilter_affine)
#define AFFINE_FILTER_NAME      MAKENAME(_filter_affine)
#define PERSP_NOFILTER_NAME     MAKENAME(_nofilter_persp)
#define PERSP_FILTER_NAME       MAKENAME(_filter_persp)

// Names of the scalar and 4-wide helpers that pack filter coordinates.
#define PACK_FILTER_X_NAME  MAKENAME(_pack_filter_x)
#define PACK_FILTER_Y_NAME  MAKENAME(_pack_filter_y)
#define PACK_FILTER_X4_NAME MAKENAME(_pack_filter_x4)
#define PACK_FILTER_Y4_NAME MAKENAME(_pack_filter_y4)

// If no PREAMBLE was provided, every PREAMBLE hook expands to nothing:
// no per-proc setup statement and no extra parameters/arguments on the
// pack-filter helpers.
#if !defined(PREAMBLE)
    #define PREAMBLE(state)
    #define PREAMBLE_PARAM_X
    #define PREAMBLE_PARAM_Y
    #define PREAMBLE_ARG_X
    #define PREAMBLE_ARG_Y
#endif
24 
SCALE_NOFILTER_NAME(const SkBitmapProcState & s,uint32_t xy[],int count,int x,int y)25 static void SCALE_NOFILTER_NAME(const SkBitmapProcState& s,
26                                 uint32_t xy[], int count, int x, int y) {
27     SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
28                              SkMatrix::kScale_Mask)) == 0);
29 
30     PREAMBLE(s);
31 
32     // we store y, x, x, x, x, x
33     const unsigned maxX = s.fBitmap->width() - 1;
34     SkFractionalInt fx;
35     {
36         SkPoint pt;
37         s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
38                                  SkIntToScalar(y) + SK_ScalarHalf, &pt);
39         fx = SkScalarToFractionalInt(pt.fY);
40         const unsigned maxY = s.fBitmap->height() - 1;
41         *xy++ = TILEY_PROCF(SkFractionalIntToFixed(fx), maxY);
42         fx = SkScalarToFractionalInt(pt.fX);
43     }
44 
45     if (0 == maxX) {
46         // all of the following X values must be 0
47         memset(xy, 0, count * sizeof(uint16_t));
48         return;
49     }
50 
51     const SkFractionalInt dx = s.fInvSxFractionalInt;
52 
53 #ifdef CHECK_FOR_DECAL
54     // test if we don't need to apply the tile proc
55     if (can_truncate_to_fixed_for_decal(fx, dx, count, maxX)) {
56         decal_nofilter_scale_neon(xy, SkFractionalIntToFixed(fx),
57                              SkFractionalIntToFixed(dx), count);
58         return;
59     }
60 #endif
61 
62     if (count >= 8) {
63         SkFractionalInt dx2 = dx+dx;
64         SkFractionalInt dx4 = dx2+dx2;
65         SkFractionalInt dx8 = dx4+dx4;
66 
67         // now build fx/fx+dx/fx+2dx/fx+3dx
68         SkFractionalInt fx1, fx2, fx3;
69         int32x4_t lbase, hbase;
70         int16_t *dst16 = (int16_t *)xy;
71 
72         fx1 = fx+dx;
73         fx2 = fx1+dx;
74         fx3 = fx2+dx;
75 
76         lbase = vdupq_n_s32(SkFractionalIntToFixed(fx));
77         lbase = vsetq_lane_s32(SkFractionalIntToFixed(fx1), lbase, 1);
78         lbase = vsetq_lane_s32(SkFractionalIntToFixed(fx2), lbase, 2);
79         lbase = vsetq_lane_s32(SkFractionalIntToFixed(fx3), lbase, 3);
80         hbase = vaddq_s32(lbase, vdupq_n_s32(SkFractionalIntToFixed(dx4)));
81 
82         // store & bump
83         while (count >= 8) {
84 
85             int16x8_t fx8;
86 
87             fx8 = TILEX_PROCF_NEON8(lbase, hbase, maxX);
88 
89             vst1q_s16(dst16, fx8);
90 
91             // but preserving base & on to the next
92             lbase = vaddq_s32 (lbase, vdupq_n_s32(SkFractionalIntToFixed(dx8)));
93             hbase = vaddq_s32 (hbase, vdupq_n_s32(SkFractionalIntToFixed(dx8)));
94             dst16 += 8;
95             count -= 8;
96             fx += dx8;
97         };
98         xy = (uint32_t *) dst16;
99     }
100 
101     uint16_t* xx = (uint16_t*)xy;
102     for (int i = count; i > 0; --i) {
103         *xx++ = TILEX_PROCF(SkFractionalIntToFixed(fx), maxX);
104         fx += dx;
105     }
106 }
107 
AFFINE_NOFILTER_NAME(const SkBitmapProcState & s,uint32_t xy[],int count,int x,int y)108 static void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s,
109                                  uint32_t xy[], int count, int x, int y) {
110     SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
111     SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
112                              SkMatrix::kScale_Mask |
113                              SkMatrix::kAffine_Mask)) == 0);
114 
115     PREAMBLE(s);
116     SkPoint srcPt;
117     s.fInvProc(s.fInvMatrix,
118                SkIntToScalar(x) + SK_ScalarHalf,
119                SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
120 
121     SkFractionalInt fx = SkScalarToFractionalInt(srcPt.fX);
122     SkFractionalInt fy = SkScalarToFractionalInt(srcPt.fY);
123     SkFractionalInt dx = s.fInvSxFractionalInt;
124     SkFractionalInt dy = s.fInvKyFractionalInt;
125     int maxX = s.fBitmap->width() - 1;
126     int maxY = s.fBitmap->height() - 1;
127 
128     if (count >= 8) {
129         SkFractionalInt dx4 = dx * 4;
130         SkFractionalInt dy4 = dy * 4;
131         SkFractionalInt dx8 = dx * 8;
132         SkFractionalInt dy8 = dy * 8;
133 
134         int32x4_t xbase, ybase;
135         int32x4_t x2base, y2base;
136         int16_t *dst16 = (int16_t *) xy;
137 
138         // now build fx, fx+dx, fx+2dx, fx+3dx
139         xbase = vdupq_n_s32(SkFractionalIntToFixed(fx));
140         xbase = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx), xbase, 1);
141         xbase = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx), xbase, 2);
142         xbase = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx+dx), xbase, 3);
143 
144         // same for fy
145         ybase = vdupq_n_s32(SkFractionalIntToFixed(fy));
146         ybase = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy), ybase, 1);
147         ybase = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy+dy), ybase, 2);
148         ybase = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy+dy+dy), ybase, 3);
149 
150         x2base = vaddq_s32(xbase, vdupq_n_s32(SkFractionalIntToFixed(dx4)));
151         y2base = vaddq_s32(ybase, vdupq_n_s32(SkFractionalIntToFixed(dy4)));
152 
153         // store & bump
154         do {
155             int16x8x2_t hi16;
156 
157             hi16.val[0] = TILEX_PROCF_NEON8(xbase, x2base, maxX);
158             hi16.val[1] = TILEY_PROCF_NEON8(ybase, y2base, maxY);
159 
160             vst2q_s16(dst16, hi16);
161 
162             // moving base and on to the next
163             xbase = vaddq_s32(xbase, vdupq_n_s32(SkFractionalIntToFixed(dx8)));
164             ybase = vaddq_s32(ybase, vdupq_n_s32(SkFractionalIntToFixed(dy8)));
165             x2base = vaddq_s32(x2base, vdupq_n_s32(SkFractionalIntToFixed(dx8)));
166             y2base = vaddq_s32(y2base, vdupq_n_s32(SkFractionalIntToFixed(dy8)));
167 
168             dst16 += 16; // 8x32 aka 16x16
169             count -= 8;
170             fx += dx8;
171             fy += dy8;
172         } while (count >= 8);
173         xy = (uint32_t *) dst16;
174     }
175 
176     for (int i = count; i > 0; --i) {
177         *xy++ = (TILEY_PROCF(SkFractionalIntToFixed(fy), maxY) << 16) |
178                  TILEX_PROCF(SkFractionalIntToFixed(fx), maxX);
179         fx += dx; fy += dy;
180     }
181 }
182 
PERSP_NOFILTER_NAME(const SkBitmapProcState & s,uint32_t * SK_RESTRICT xy,int count,int x,int y)183 static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s,
184                                 uint32_t* SK_RESTRICT xy,
185                                 int count, int x, int y) {
186     SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask);
187 
188     PREAMBLE(s);
189     // max{X,Y} are int here, but later shown/assumed to fit in 16 bits
190     int maxX = s.fBitmap->width() - 1;
191     int maxY = s.fBitmap->height() - 1;
192 
193     SkPerspIter iter(s.fInvMatrix,
194                      SkIntToScalar(x) + SK_ScalarHalf,
195                      SkIntToScalar(y) + SK_ScalarHalf, count);
196 
197     while ((count = iter.next()) != 0) {
198         const SkFixed* SK_RESTRICT srcXY = iter.getXY();
199 
200         if (count >= 8) {
201             int32_t *mysrc = (int32_t *) srcXY;
202             int16_t *mydst = (int16_t *) xy;
203             do {
204                 int16x8x2_t hi16;
205                 int32x4x2_t xy1, xy2;
206 
207                 xy1 = vld2q_s32(mysrc);
208                 xy2 = vld2q_s32(mysrc+8);
209 
210                 hi16.val[0] = TILEX_PROCF_NEON8(xy1.val[0], xy2.val[0], maxX);
211                 hi16.val[1] = TILEY_PROCF_NEON8(xy1.val[1], xy2.val[1], maxY);
212 
213                 vst2q_s16(mydst, hi16);
214 
215                 count -= 8;  // 8 iterations
216                 mysrc += 16; // 16 longs
217                 mydst += 16; // 16 shorts, aka 8 longs
218             } while (count >= 8);
219             // get xy and srcXY fixed up
220             srcXY = (const SkFixed *) mysrc;
221             xy = (uint32_t *) mydst;
222         }
223 
224         while (--count >= 0) {
225             *xy++ = (TILEY_PROCF(srcXY[1], maxY) << 16) |
226                      TILEX_PROCF(srcXY[0], maxX);
227             srcXY += 2;
228         }
229     }
230 }
231 
PACK_FILTER_Y_NAME(SkFixed f,unsigned max,SkFixed one PREAMBLE_PARAM_Y)232 static inline uint32_t PACK_FILTER_Y_NAME(SkFixed f, unsigned max,
233                                           SkFixed one PREAMBLE_PARAM_Y) {
234     unsigned i = TILEY_PROCF(f, max);
235     i = (i << 4) | TILEY_LOW_BITS(f, max);
236     return (i << 14) | (TILEY_PROCF((f + one), max));
237 }
238 
PACK_FILTER_X_NAME(SkFixed f,unsigned max,SkFixed one PREAMBLE_PARAM_X)239 static inline uint32_t PACK_FILTER_X_NAME(SkFixed f, unsigned max,
240                                           SkFixed one PREAMBLE_PARAM_X) {
241     unsigned i = TILEX_PROCF(f, max);
242     i = (i << 4) | TILEX_LOW_BITS(f, max);
243     return (i << 14) | (TILEX_PROCF((f + one), max));
244 }
245 
PACK_FILTER_X4_NAME(int32x4_t f,unsigned max,SkFixed one PREAMBLE_PARAM_X)246 static inline int32x4_t PACK_FILTER_X4_NAME(int32x4_t f, unsigned max,
247                                           SkFixed one PREAMBLE_PARAM_X) {
248     int32x4_t ret, res, wide_one;
249 
250     // Prepare constants
251     wide_one = vdupq_n_s32(one);
252 
253     // Step 1
254     res = TILEX_PROCF_NEON4(f, max);
255 
256     // Step 2
257     ret = TILEX_LOW_BITS_NEON4(f, max);
258     ret = vsliq_n_s32(ret, res, 4);
259 
260     // Step 3
261     res = TILEX_PROCF_NEON4(f + wide_one, max);
262     ret = vorrq_s32(vshlq_n_s32(ret, 14), res);
263 
264     return ret;
265 }
266 
PACK_FILTER_Y4_NAME(int32x4_t f,unsigned max,SkFixed one PREAMBLE_PARAM_X)267 static inline int32x4_t PACK_FILTER_Y4_NAME(int32x4_t f, unsigned max,
268                                           SkFixed one PREAMBLE_PARAM_X) {
269     int32x4_t ret, res, wide_one;
270 
271     // Prepare constants
272     wide_one = vdupq_n_s32(one);
273 
274     // Step 1
275     res = TILEY_PROCF_NEON4(f, max);
276 
277     // Step 2
278     ret = TILEY_LOW_BITS_NEON4(f, max);
279     ret = vsliq_n_s32(ret, res, 4);
280 
281     // Step 3
282     res = TILEY_PROCF_NEON4(f + wide_one, max);
283     ret = vorrq_s32(vshlq_n_s32(ret, 14), res);
284 
285     return ret;
286 }
287 
SCALE_FILTER_NAME(const SkBitmapProcState & s,uint32_t xy[],int count,int x,int y)288 static void SCALE_FILTER_NAME(const SkBitmapProcState& s,
289                               uint32_t xy[], int count, int x, int y) {
290     SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
291                              SkMatrix::kScale_Mask)) == 0);
292     SkASSERT(s.fInvKy == 0);
293 
294     PREAMBLE(s);
295 
296     const unsigned maxX = s.fBitmap->width() - 1;
297     const SkFixed one = s.fFilterOneX;
298     const SkFractionalInt dx = s.fInvSxFractionalInt;
299     SkFractionalInt fx;
300 
301     {
302         SkPoint pt;
303         s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
304                                  SkIntToScalar(y) + SK_ScalarHalf, &pt);
305         const SkFixed fy = SkScalarToFixed(pt.fY) - (s.fFilterOneY >> 1);
306         const unsigned maxY = s.fBitmap->height() - 1;
307         // compute our two Y values up front
308         *xy++ = PACK_FILTER_Y_NAME(fy, maxY, s.fFilterOneY PREAMBLE_ARG_Y);
309         // now initialize fx
310         fx = SkScalarToFractionalInt(pt.fX) - (SkFixedToFractionalInt(one) >> 1);
311     }
312 
313 #ifdef CHECK_FOR_DECAL
314     // test if we don't need to apply the tile proc
315     if (can_truncate_to_fixed_for_decal(fx, dx, count, maxX)) {
316         decal_filter_scale_neon(xy, SkFractionalIntToFixed(fx),
317                              SkFractionalIntToFixed(dx), count);
318         return;
319     }
320 #endif
321     {
322 
323     if (count >= 4) {
324         int32x4_t wide_fx;
325 
326         wide_fx = vdupq_n_s32(SkFractionalIntToFixed(fx));
327         wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx), wide_fx, 1);
328         wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx), wide_fx, 2);
329         wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx+dx), wide_fx, 3);
330 
331         while (count >= 4) {
332             int32x4_t res;
333 
334             res = PACK_FILTER_X4_NAME(wide_fx, maxX, one PREAMBLE_ARG_X);
335 
336             vst1q_u32(xy, vreinterpretq_u32_s32(res));
337 
338             wide_fx += vdupq_n_s32(SkFractionalIntToFixed(dx+dx+dx+dx));
339             fx += dx+dx+dx+dx;
340             xy += 4;
341             count -= 4;
342         }
343     }
344 
345     while (--count >= 0) {
346         *xy++ = PACK_FILTER_X_NAME(SkFractionalIntToFixed(fx), maxX, one PREAMBLE_ARG_X);
347         fx += dx;
348     }
349 
350     }
351 }
352 
AFFINE_FILTER_NAME(const SkBitmapProcState & s,uint32_t xy[],int count,int x,int y)353 static void AFFINE_FILTER_NAME(const SkBitmapProcState& s,
354                                uint32_t xy[], int count, int x, int y) {
355     SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
356     SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
357                              SkMatrix::kScale_Mask |
358                              SkMatrix::kAffine_Mask)) == 0);
359 
360     PREAMBLE(s);
361     SkPoint srcPt;
362     s.fInvProc(s.fInvMatrix,
363                SkIntToScalar(x) + SK_ScalarHalf,
364                SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
365 
366     SkFixed oneX = s.fFilterOneX;
367     SkFixed oneY = s.fFilterOneY;
368     SkFixed fx = SkScalarToFixed(srcPt.fX) - (oneX >> 1);
369     SkFixed fy = SkScalarToFixed(srcPt.fY) - (oneY >> 1);
370     SkFixed dx = s.fInvSx;
371     SkFixed dy = s.fInvKy;
372     unsigned maxX = s.fBitmap->width() - 1;
373     unsigned maxY = s.fBitmap->height() - 1;
374 
375     if (count >= 4) {
376         int32x4_t wide_fy, wide_fx;
377 
378         wide_fx = vdupq_n_s32(fx);
379         wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1);
380         wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2);
381         wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3);
382 
383         wide_fy = vdupq_n_s32(fy);
384         wide_fy = vsetq_lane_s32(fy+dy, wide_fy, 1);
385         wide_fy = vsetq_lane_s32(fy+dy+dy, wide_fy, 2);
386         wide_fy = vsetq_lane_s32(fy+dy+dy+dy, wide_fy, 3);
387 
388         while (count >= 4) {
389             int32x4x2_t vxy;
390 
391             // do the X side, then the Y side, then interleave them
392             vxy.val[0] = PACK_FILTER_Y4_NAME(wide_fy, maxY, oneY PREAMBLE_ARG_Y);
393             vxy.val[1] = PACK_FILTER_X4_NAME(wide_fx, maxX, oneX PREAMBLE_ARG_X);
394 
395             // interleave as YXYXYXYX as part of the storing
396             vst2q_s32((int32_t*)xy, vxy);
397 
398             // prepare next iteration
399             wide_fx += vdupq_n_s32(dx+dx+dx+dx);
400             fx += dx + dx + dx + dx;
401             wide_fy += vdupq_n_s32(dy+dy+dy+dy);
402             fy += dy+dy+dy+dy;
403             xy += 8; // 4 x's, 4 y's
404             count -= 4;
405         }
406     }
407 
408     while (--count >= 0) {
409         // NB: writing Y/X
410         *xy++ = PACK_FILTER_Y_NAME(fy, maxY, oneY PREAMBLE_ARG_Y);
411         fy += dy;
412         *xy++ = PACK_FILTER_X_NAME(fx, maxX, oneX PREAMBLE_ARG_X);
413         fx += dx;
414     }
415 }
416 
PERSP_FILTER_NAME(const SkBitmapProcState & s,uint32_t * SK_RESTRICT xy,int count,int x,int y)417 static void PERSP_FILTER_NAME(const SkBitmapProcState& s,
418                               uint32_t* SK_RESTRICT xy, int count,
419                               int x, int y) {
420     SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask);
421 
422     PREAMBLE(s);
423     unsigned maxX = s.fBitmap->width() - 1;
424     unsigned maxY = s.fBitmap->height() - 1;
425     SkFixed oneX = s.fFilterOneX;
426     SkFixed oneY = s.fFilterOneY;
427 
428     SkPerspIter iter(s.fInvMatrix,
429                      SkIntToScalar(x) + SK_ScalarHalf,
430                      SkIntToScalar(y) + SK_ScalarHalf, count);
431 
432     while ((count = iter.next()) != 0) {
433         const SkFixed* SK_RESTRICT srcXY = iter.getXY();
434 
435         while (count >= 4) {
436             int32x4_t wide_x, wide_y;
437             int32x4x2_t vxy, vresyx;
438 
439             // load src:  x-y-x-y-x-y-x-y
440             vxy = vld2q_s32(srcXY);
441 
442             // do the X side, then the Y side, then interleave them
443             wide_x = vsubq_s32(vxy.val[0], vdupq_n_s32(oneX>>1));
444             wide_y = vsubq_s32(vxy.val[1], vdupq_n_s32(oneY>>1));
445 
446             vresyx.val[0] = PACK_FILTER_Y4_NAME(wide_y, maxY, oneY PREAMBLE_ARG_Y);
447             vresyx.val[1] = PACK_FILTER_X4_NAME(wide_x, maxX, oneX PREAMBLE_ARG_X);
448 
449             // store interleaved as y-x-y-x-y-x-y-x (NB != read order)
450             vst2q_s32((int32_t*)xy, vresyx);
451 
452             // on to the next iteration
453             srcXY += 2*4;
454             count -= 4;
455             xy += 2*4;
456         }
457 
458         while (--count >= 0) {
459             // NB: we read x/y, we write y/x
460             *xy++ = PACK_FILTER_Y_NAME(srcXY[1] - (oneY >> 1), maxY,
461                                        oneY PREAMBLE_ARG_Y);
462             *xy++ = PACK_FILTER_X_NAME(srcXY[0] - (oneX >> 1), maxX,
463                                        oneX PREAMBLE_ARG_X);
464             srcXY += 2;
465         }
466     }
467 }
468 
469 const SkBitmapProcState::MatrixProc MAKENAME(_Procs)[] = {
470     SCALE_NOFILTER_NAME,
471     SCALE_FILTER_NAME,
472     AFFINE_NOFILTER_NAME,
473     AFFINE_FILTER_NAME,
474     PERSP_NOFILTER_NAME,
475     PERSP_FILTER_NAME
476 };
477 
// Clean up every macro this file consumed or defined, so the file can be
// included again with a different configuration.

// NEON tile macros (supplied by the includer).
#undef TILEX_PROCF_NEON8
#undef TILEY_PROCF_NEON8
#undef TILEX_PROCF_NEON4
#undef TILEY_PROCF_NEON4
#undef TILEX_LOW_BITS_NEON4
#undef TILEY_LOW_BITS_NEON4

// Scalar tile macros and naming macro (supplied by the includer).
#undef MAKENAME
#undef TILEX_PROCF
#undef TILEY_PROCF
#ifdef CHECK_FOR_DECAL
    #undef CHECK_FOR_DECAL
#endif

// Proc names defined at the top of this file.
#undef SCALE_NOFILTER_NAME
#undef SCALE_FILTER_NAME
#undef AFFINE_NOFILTER_NAME
#undef AFFINE_FILTER_NAME
#undef PERSP_NOFILTER_NAME
#undef PERSP_FILTER_NAME

// Pack-filter helper names defined at the top of this file. These were
// previously left defined, unlike every other *_NAME macro.
#undef PACK_FILTER_X_NAME
#undef PACK_FILTER_Y_NAME
#undef PACK_FILTER_X4_NAME
#undef PACK_FILTER_Y4_NAME

// PREAMBLE hooks (defaulted at the top of this file when not supplied).
#undef PREAMBLE
#undef PREAMBLE_PARAM_X
#undef PREAMBLE_PARAM_Y
#undef PREAMBLE_ARG_X
#undef PREAMBLE_ARG_Y

#undef TILEX_LOW_BITS
#undef TILEY_LOW_BITS
507