1
2 #include <arm_neon.h>
3
4
// Names of the six matrix procs defined in this file. Each one is built with
// MAKENAME, which is not defined here (it is #undef'd at the end of the file).
#define SCALE_NOFILTER_NAME MAKENAME(_nofilter_scale)
#define SCALE_FILTER_NAME MAKENAME(_filter_scale)
#define AFFINE_NOFILTER_NAME MAKENAME(_nofilter_affine)
#define AFFINE_FILTER_NAME MAKENAME(_filter_affine)
#define PERSP_NOFILTER_NAME MAKENAME(_nofilter_persp)
#define PERSP_FILTER_NAME MAKENAME(_filter_persp)

// Names of the helpers that pack filter coordinates: scalar versions (X/Y)
// and four-lane NEON versions (X4/Y4).
#define PACK_FILTER_X_NAME MAKENAME(_pack_filter_x)
#define PACK_FILTER_Y_NAME MAKENAME(_pack_filter_y)
#define PACK_FILTER_X4_NAME MAKENAME(_pack_filter_x4)
#define PACK_FILTER_Y4_NAME MAKENAME(_pack_filter_y4)

// If PREAMBLE has not been defined before this point, the preamble hooks all
// expand to nothing: no extra statement in the procs, and no extra
// parameter/argument on the pack helpers.
#ifndef PREAMBLE
#define PREAMBLE(state)
#define PREAMBLE_PARAM_X
#define PREAMBLE_PARAM_Y
#define PREAMBLE_ARG_X
#define PREAMBLE_ARG_Y
#endif
24
SCALE_NOFILTER_NAME(const SkBitmapProcState & s,uint32_t xy[],int count,int x,int y)25 static void SCALE_NOFILTER_NAME(const SkBitmapProcState& s,
26 uint32_t xy[], int count, int x, int y) {
27 SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
28 SkMatrix::kScale_Mask)) == 0);
29
30 PREAMBLE(s);
31
32 // we store y, x, x, x, x, x
33 const unsigned maxX = s.fBitmap->width() - 1;
34 SkFractionalInt fx;
35 {
36 SkPoint pt;
37 s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
38 SkIntToScalar(y) + SK_ScalarHalf, &pt);
39 fx = SkScalarToFractionalInt(pt.fY);
40 const unsigned maxY = s.fBitmap->height() - 1;
41 *xy++ = TILEY_PROCF(SkFractionalIntToFixed(fx), maxY);
42 fx = SkScalarToFractionalInt(pt.fX);
43 }
44
45 if (0 == maxX) {
46 // all of the following X values must be 0
47 memset(xy, 0, count * sizeof(uint16_t));
48 return;
49 }
50
51 const SkFractionalInt dx = s.fInvSxFractionalInt;
52
53 #ifdef CHECK_FOR_DECAL
54 // test if we don't need to apply the tile proc
55 if (can_truncate_to_fixed_for_decal(fx, dx, count, maxX)) {
56 decal_nofilter_scale_neon(xy, SkFractionalIntToFixed(fx),
57 SkFractionalIntToFixed(dx), count);
58 return;
59 }
60 #endif
61
62 if (count >= 8) {
63 SkFractionalInt dx2 = dx+dx;
64 SkFractionalInt dx4 = dx2+dx2;
65 SkFractionalInt dx8 = dx4+dx4;
66
67 // now build fx/fx+dx/fx+2dx/fx+3dx
68 SkFractionalInt fx1, fx2, fx3;
69 int32x4_t lbase, hbase;
70 int16_t *dst16 = (int16_t *)xy;
71
72 fx1 = fx+dx;
73 fx2 = fx1+dx;
74 fx3 = fx2+dx;
75
76 lbase = vdupq_n_s32(SkFractionalIntToFixed(fx));
77 lbase = vsetq_lane_s32(SkFractionalIntToFixed(fx1), lbase, 1);
78 lbase = vsetq_lane_s32(SkFractionalIntToFixed(fx2), lbase, 2);
79 lbase = vsetq_lane_s32(SkFractionalIntToFixed(fx3), lbase, 3);
80 hbase = vaddq_s32(lbase, vdupq_n_s32(SkFractionalIntToFixed(dx4)));
81
82 // store & bump
83 while (count >= 8) {
84
85 int16x8_t fx8;
86
87 fx8 = TILEX_PROCF_NEON8(lbase, hbase, maxX);
88
89 vst1q_s16(dst16, fx8);
90
91 // but preserving base & on to the next
92 lbase = vaddq_s32 (lbase, vdupq_n_s32(SkFractionalIntToFixed(dx8)));
93 hbase = vaddq_s32 (hbase, vdupq_n_s32(SkFractionalIntToFixed(dx8)));
94 dst16 += 8;
95 count -= 8;
96 fx += dx8;
97 };
98 xy = (uint32_t *) dst16;
99 }
100
101 uint16_t* xx = (uint16_t*)xy;
102 for (int i = count; i > 0; --i) {
103 *xx++ = TILEX_PROCF(SkFractionalIntToFixed(fx), maxX);
104 fx += dx;
105 }
106 }
107
AFFINE_NOFILTER_NAME(const SkBitmapProcState & s,uint32_t xy[],int count,int x,int y)108 static void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s,
109 uint32_t xy[], int count, int x, int y) {
110 SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
111 SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
112 SkMatrix::kScale_Mask |
113 SkMatrix::kAffine_Mask)) == 0);
114
115 PREAMBLE(s);
116 SkPoint srcPt;
117 s.fInvProc(s.fInvMatrix,
118 SkIntToScalar(x) + SK_ScalarHalf,
119 SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
120
121 SkFractionalInt fx = SkScalarToFractionalInt(srcPt.fX);
122 SkFractionalInt fy = SkScalarToFractionalInt(srcPt.fY);
123 SkFractionalInt dx = s.fInvSxFractionalInt;
124 SkFractionalInt dy = s.fInvKyFractionalInt;
125 int maxX = s.fBitmap->width() - 1;
126 int maxY = s.fBitmap->height() - 1;
127
128 if (count >= 8) {
129 SkFractionalInt dx4 = dx * 4;
130 SkFractionalInt dy4 = dy * 4;
131 SkFractionalInt dx8 = dx * 8;
132 SkFractionalInt dy8 = dy * 8;
133
134 int32x4_t xbase, ybase;
135 int32x4_t x2base, y2base;
136 int16_t *dst16 = (int16_t *) xy;
137
138 // now build fx, fx+dx, fx+2dx, fx+3dx
139 xbase = vdupq_n_s32(SkFractionalIntToFixed(fx));
140 xbase = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx), xbase, 1);
141 xbase = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx), xbase, 2);
142 xbase = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx+dx), xbase, 3);
143
144 // same for fy
145 ybase = vdupq_n_s32(SkFractionalIntToFixed(fy));
146 ybase = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy), ybase, 1);
147 ybase = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy+dy), ybase, 2);
148 ybase = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy+dy+dy), ybase, 3);
149
150 x2base = vaddq_s32(xbase, vdupq_n_s32(SkFractionalIntToFixed(dx4)));
151 y2base = vaddq_s32(ybase, vdupq_n_s32(SkFractionalIntToFixed(dy4)));
152
153 // store & bump
154 do {
155 int16x8x2_t hi16;
156
157 hi16.val[0] = TILEX_PROCF_NEON8(xbase, x2base, maxX);
158 hi16.val[1] = TILEY_PROCF_NEON8(ybase, y2base, maxY);
159
160 vst2q_s16(dst16, hi16);
161
162 // moving base and on to the next
163 xbase = vaddq_s32(xbase, vdupq_n_s32(SkFractionalIntToFixed(dx8)));
164 ybase = vaddq_s32(ybase, vdupq_n_s32(SkFractionalIntToFixed(dy8)));
165 x2base = vaddq_s32(x2base, vdupq_n_s32(SkFractionalIntToFixed(dx8)));
166 y2base = vaddq_s32(y2base, vdupq_n_s32(SkFractionalIntToFixed(dy8)));
167
168 dst16 += 16; // 8x32 aka 16x16
169 count -= 8;
170 fx += dx8;
171 fy += dy8;
172 } while (count >= 8);
173 xy = (uint32_t *) dst16;
174 }
175
176 for (int i = count; i > 0; --i) {
177 *xy++ = (TILEY_PROCF(SkFractionalIntToFixed(fy), maxY) << 16) |
178 TILEX_PROCF(SkFractionalIntToFixed(fx), maxX);
179 fx += dx; fy += dy;
180 }
181 }
182
PERSP_NOFILTER_NAME(const SkBitmapProcState & s,uint32_t * SK_RESTRICT xy,int count,int x,int y)183 static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s,
184 uint32_t* SK_RESTRICT xy,
185 int count, int x, int y) {
186 SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask);
187
188 PREAMBLE(s);
189 // max{X,Y} are int here, but later shown/assumed to fit in 16 bits
190 int maxX = s.fBitmap->width() - 1;
191 int maxY = s.fBitmap->height() - 1;
192
193 SkPerspIter iter(s.fInvMatrix,
194 SkIntToScalar(x) + SK_ScalarHalf,
195 SkIntToScalar(y) + SK_ScalarHalf, count);
196
197 while ((count = iter.next()) != 0) {
198 const SkFixed* SK_RESTRICT srcXY = iter.getXY();
199
200 if (count >= 8) {
201 int32_t *mysrc = (int32_t *) srcXY;
202 int16_t *mydst = (int16_t *) xy;
203 do {
204 int16x8x2_t hi16;
205 int32x4x2_t xy1, xy2;
206
207 xy1 = vld2q_s32(mysrc);
208 xy2 = vld2q_s32(mysrc+8);
209
210 hi16.val[0] = TILEX_PROCF_NEON8(xy1.val[0], xy2.val[0], maxX);
211 hi16.val[1] = TILEY_PROCF_NEON8(xy1.val[1], xy2.val[1], maxY);
212
213 vst2q_s16(mydst, hi16);
214
215 count -= 8; // 8 iterations
216 mysrc += 16; // 16 longs
217 mydst += 16; // 16 shorts, aka 8 longs
218 } while (count >= 8);
219 // get xy and srcXY fixed up
220 srcXY = (const SkFixed *) mysrc;
221 xy = (uint32_t *) mydst;
222 }
223
224 while (--count >= 0) {
225 *xy++ = (TILEY_PROCF(srcXY[1], maxY) << 16) |
226 TILEX_PROCF(srcXY[0], maxX);
227 srcXY += 2;
228 }
229 }
230 }
231
PACK_FILTER_Y_NAME(SkFixed f,unsigned max,SkFixed one PREAMBLE_PARAM_Y)232 static inline uint32_t PACK_FILTER_Y_NAME(SkFixed f, unsigned max,
233 SkFixed one PREAMBLE_PARAM_Y) {
234 unsigned i = TILEY_PROCF(f, max);
235 i = (i << 4) | TILEY_LOW_BITS(f, max);
236 return (i << 14) | (TILEY_PROCF((f + one), max));
237 }
238
PACK_FILTER_X_NAME(SkFixed f,unsigned max,SkFixed one PREAMBLE_PARAM_X)239 static inline uint32_t PACK_FILTER_X_NAME(SkFixed f, unsigned max,
240 SkFixed one PREAMBLE_PARAM_X) {
241 unsigned i = TILEX_PROCF(f, max);
242 i = (i << 4) | TILEX_LOW_BITS(f, max);
243 return (i << 14) | (TILEX_PROCF((f + one), max));
244 }
245
PACK_FILTER_X4_NAME(int32x4_t f,unsigned max,SkFixed one PREAMBLE_PARAM_X)246 static inline int32x4_t PACK_FILTER_X4_NAME(int32x4_t f, unsigned max,
247 SkFixed one PREAMBLE_PARAM_X) {
248 int32x4_t ret, res, wide_one;
249
250 // Prepare constants
251 wide_one = vdupq_n_s32(one);
252
253 // Step 1
254 res = TILEX_PROCF_NEON4(f, max);
255
256 // Step 2
257 ret = TILEX_LOW_BITS_NEON4(f, max);
258 ret = vsliq_n_s32(ret, res, 4);
259
260 // Step 3
261 res = TILEX_PROCF_NEON4(f + wide_one, max);
262 ret = vorrq_s32(vshlq_n_s32(ret, 14), res);
263
264 return ret;
265 }
266
PACK_FILTER_Y4_NAME(int32x4_t f,unsigned max,SkFixed one PREAMBLE_PARAM_X)267 static inline int32x4_t PACK_FILTER_Y4_NAME(int32x4_t f, unsigned max,
268 SkFixed one PREAMBLE_PARAM_X) {
269 int32x4_t ret, res, wide_one;
270
271 // Prepare constants
272 wide_one = vdupq_n_s32(one);
273
274 // Step 1
275 res = TILEY_PROCF_NEON4(f, max);
276
277 // Step 2
278 ret = TILEY_LOW_BITS_NEON4(f, max);
279 ret = vsliq_n_s32(ret, res, 4);
280
281 // Step 3
282 res = TILEY_PROCF_NEON4(f + wide_one, max);
283 ret = vorrq_s32(vshlq_n_s32(ret, 14), res);
284
285 return ret;
286 }
287
SCALE_FILTER_NAME(const SkBitmapProcState & s,uint32_t xy[],int count,int x,int y)288 static void SCALE_FILTER_NAME(const SkBitmapProcState& s,
289 uint32_t xy[], int count, int x, int y) {
290 SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
291 SkMatrix::kScale_Mask)) == 0);
292 SkASSERT(s.fInvKy == 0);
293
294 PREAMBLE(s);
295
296 const unsigned maxX = s.fBitmap->width() - 1;
297 const SkFixed one = s.fFilterOneX;
298 const SkFractionalInt dx = s.fInvSxFractionalInt;
299 SkFractionalInt fx;
300
301 {
302 SkPoint pt;
303 s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
304 SkIntToScalar(y) + SK_ScalarHalf, &pt);
305 const SkFixed fy = SkScalarToFixed(pt.fY) - (s.fFilterOneY >> 1);
306 const unsigned maxY = s.fBitmap->height() - 1;
307 // compute our two Y values up front
308 *xy++ = PACK_FILTER_Y_NAME(fy, maxY, s.fFilterOneY PREAMBLE_ARG_Y);
309 // now initialize fx
310 fx = SkScalarToFractionalInt(pt.fX) - (SkFixedToFractionalInt(one) >> 1);
311 }
312
313 #ifdef CHECK_FOR_DECAL
314 // test if we don't need to apply the tile proc
315 if (can_truncate_to_fixed_for_decal(fx, dx, count, maxX)) {
316 decal_filter_scale_neon(xy, SkFractionalIntToFixed(fx),
317 SkFractionalIntToFixed(dx), count);
318 return;
319 }
320 #endif
321 {
322
323 if (count >= 4) {
324 int32x4_t wide_fx;
325
326 wide_fx = vdupq_n_s32(SkFractionalIntToFixed(fx));
327 wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx), wide_fx, 1);
328 wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx), wide_fx, 2);
329 wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx+dx), wide_fx, 3);
330
331 while (count >= 4) {
332 int32x4_t res;
333
334 res = PACK_FILTER_X4_NAME(wide_fx, maxX, one PREAMBLE_ARG_X);
335
336 vst1q_u32(xy, vreinterpretq_u32_s32(res));
337
338 wide_fx += vdupq_n_s32(SkFractionalIntToFixed(dx+dx+dx+dx));
339 fx += dx+dx+dx+dx;
340 xy += 4;
341 count -= 4;
342 }
343 }
344
345 while (--count >= 0) {
346 *xy++ = PACK_FILTER_X_NAME(SkFractionalIntToFixed(fx), maxX, one PREAMBLE_ARG_X);
347 fx += dx;
348 }
349
350 }
351 }
352
AFFINE_FILTER_NAME(const SkBitmapProcState & s,uint32_t xy[],int count,int x,int y)353 static void AFFINE_FILTER_NAME(const SkBitmapProcState& s,
354 uint32_t xy[], int count, int x, int y) {
355 SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
356 SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
357 SkMatrix::kScale_Mask |
358 SkMatrix::kAffine_Mask)) == 0);
359
360 PREAMBLE(s);
361 SkPoint srcPt;
362 s.fInvProc(s.fInvMatrix,
363 SkIntToScalar(x) + SK_ScalarHalf,
364 SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
365
366 SkFixed oneX = s.fFilterOneX;
367 SkFixed oneY = s.fFilterOneY;
368 SkFixed fx = SkScalarToFixed(srcPt.fX) - (oneX >> 1);
369 SkFixed fy = SkScalarToFixed(srcPt.fY) - (oneY >> 1);
370 SkFixed dx = s.fInvSx;
371 SkFixed dy = s.fInvKy;
372 unsigned maxX = s.fBitmap->width() - 1;
373 unsigned maxY = s.fBitmap->height() - 1;
374
375 if (count >= 4) {
376 int32x4_t wide_fy, wide_fx;
377
378 wide_fx = vdupq_n_s32(fx);
379 wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1);
380 wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2);
381 wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3);
382
383 wide_fy = vdupq_n_s32(fy);
384 wide_fy = vsetq_lane_s32(fy+dy, wide_fy, 1);
385 wide_fy = vsetq_lane_s32(fy+dy+dy, wide_fy, 2);
386 wide_fy = vsetq_lane_s32(fy+dy+dy+dy, wide_fy, 3);
387
388 while (count >= 4) {
389 int32x4x2_t vxy;
390
391 // do the X side, then the Y side, then interleave them
392 vxy.val[0] = PACK_FILTER_Y4_NAME(wide_fy, maxY, oneY PREAMBLE_ARG_Y);
393 vxy.val[1] = PACK_FILTER_X4_NAME(wide_fx, maxX, oneX PREAMBLE_ARG_X);
394
395 // interleave as YXYXYXYX as part of the storing
396 vst2q_s32((int32_t*)xy, vxy);
397
398 // prepare next iteration
399 wide_fx += vdupq_n_s32(dx+dx+dx+dx);
400 fx += dx + dx + dx + dx;
401 wide_fy += vdupq_n_s32(dy+dy+dy+dy);
402 fy += dy+dy+dy+dy;
403 xy += 8; // 4 x's, 4 y's
404 count -= 4;
405 }
406 }
407
408 while (--count >= 0) {
409 // NB: writing Y/X
410 *xy++ = PACK_FILTER_Y_NAME(fy, maxY, oneY PREAMBLE_ARG_Y);
411 fy += dy;
412 *xy++ = PACK_FILTER_X_NAME(fx, maxX, oneX PREAMBLE_ARG_X);
413 fx += dx;
414 }
415 }
416
PERSP_FILTER_NAME(const SkBitmapProcState & s,uint32_t * SK_RESTRICT xy,int count,int x,int y)417 static void PERSP_FILTER_NAME(const SkBitmapProcState& s,
418 uint32_t* SK_RESTRICT xy, int count,
419 int x, int y) {
420 SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask);
421
422 PREAMBLE(s);
423 unsigned maxX = s.fBitmap->width() - 1;
424 unsigned maxY = s.fBitmap->height() - 1;
425 SkFixed oneX = s.fFilterOneX;
426 SkFixed oneY = s.fFilterOneY;
427
428 SkPerspIter iter(s.fInvMatrix,
429 SkIntToScalar(x) + SK_ScalarHalf,
430 SkIntToScalar(y) + SK_ScalarHalf, count);
431
432 while ((count = iter.next()) != 0) {
433 const SkFixed* SK_RESTRICT srcXY = iter.getXY();
434
435 while (count >= 4) {
436 int32x4_t wide_x, wide_y;
437 int32x4x2_t vxy, vresyx;
438
439 // load src: x-y-x-y-x-y-x-y
440 vxy = vld2q_s32(srcXY);
441
442 // do the X side, then the Y side, then interleave them
443 wide_x = vsubq_s32(vxy.val[0], vdupq_n_s32(oneX>>1));
444 wide_y = vsubq_s32(vxy.val[1], vdupq_n_s32(oneY>>1));
445
446 vresyx.val[0] = PACK_FILTER_Y4_NAME(wide_y, maxY, oneY PREAMBLE_ARG_Y);
447 vresyx.val[1] = PACK_FILTER_X4_NAME(wide_x, maxX, oneX PREAMBLE_ARG_X);
448
449 // store interleaved as y-x-y-x-y-x-y-x (NB != read order)
450 vst2q_s32((int32_t*)xy, vresyx);
451
452 // on to the next iteration
453 srcXY += 2*4;
454 count -= 4;
455 xy += 2*4;
456 }
457
458 while (--count >= 0) {
459 // NB: we read x/y, we write y/x
460 *xy++ = PACK_FILTER_Y_NAME(srcXY[1] - (oneY >> 1), maxY,
461 oneY PREAMBLE_ARG_Y);
462 *xy++ = PACK_FILTER_X_NAME(srcXY[0] - (oneX >> 1), maxX,
463 oneX PREAMBLE_ARG_X);
464 srcXY += 2;
465 }
466 }
467 }
468
// Table of the six matrix procs defined above, in the order
// scale, affine, perspective -- each as nofilter then filter.
// NOTE(review): the indexing scheme used by callers is not visible in this
// file; confirm before reordering entries.
const SkBitmapProcState::MatrixProc MAKENAME(_Procs)[] = {
    SCALE_NOFILTER_NAME,
    SCALE_FILTER_NAME,
    AFFINE_NOFILTER_NAME,
    AFFINE_FILTER_NAME,
    PERSP_NOFILTER_NAME,
    PERSP_FILTER_NAME
};
477
// Clean up every macro this file consumed or defined, so the file can be
// included again with a different MAKENAME / tiling configuration.

// NEON tiling macros
#undef TILEX_PROCF_NEON8
#undef TILEY_PROCF_NEON8
#undef TILEX_PROCF_NEON4
#undef TILEY_PROCF_NEON4
#undef TILEX_LOW_BITS_NEON4
#undef TILEY_LOW_BITS_NEON4

// naming and scalar tiling macros
#undef MAKENAME
#undef TILEX_PROCF
#undef TILEY_PROCF
#ifdef CHECK_FOR_DECAL
#undef CHECK_FOR_DECAL
#endif

// proc names defined at the top of this file
#undef SCALE_NOFILTER_NAME
#undef SCALE_FILTER_NAME
#undef AFFINE_NOFILTER_NAME
#undef AFFINE_FILTER_NAME
#undef PERSP_NOFILTER_NAME
#undef PERSP_FILTER_NAME

// pack-helper names defined at the top of this file
#undef PACK_FILTER_X_NAME
#undef PACK_FILTER_Y_NAME
#undef PACK_FILTER_X4_NAME
#undef PACK_FILTER_Y4_NAME

// preamble hooks
#undef PREAMBLE
#undef PREAMBLE_PARAM_X
#undef PREAMBLE_PARAM_Y
#undef PREAMBLE_ARG_X
#undef PREAMBLE_ARG_Y

#undef TILEX_LOW_BITS
#undef TILEY_LOW_BITS
507