1 /*
2 * Copyright 2012 The Android Open Source Project
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 #include "SkBlitRow_opts_arm_neon.h"
9
10 #include "SkBlitMask.h"
11 #include "SkBlitRow.h"
12 #include "SkColorPriv.h"
13 #include "SkDither.h"
14 #include "SkMathPriv.h"
15 #include "SkUtils.h"
16
17 #include "SkColor_opts_neon.h"
18 #include <arm_neon.h>
19
20 #ifdef SK_CPU_ARM64
21 static inline uint8x8x4_t sk_vld4_u8_arm64_3(const SkPMColor* SK_RESTRICT & src) {
22 uint8x8x4_t vsrc;
23 uint8x8_t vsrc_0, vsrc_1, vsrc_2;
24
25 asm (
26 "ld4 {v0.8b - v3.8b}, [%[src]], #32 \t\n"
27 "mov %[vsrc0].8b, v0.8b \t\n"
28 "mov %[vsrc1].8b, v1.8b \t\n"
29 "mov %[vsrc2].8b, v2.8b \t\n"
30 : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
31 [vsrc2] "=w" (vsrc_2), [src] "+&r" (src)
32 : : "v0", "v1", "v2", "v3"
33 );
34
35 vsrc.val[0] = vsrc_0;
36 vsrc.val[1] = vsrc_1;
37 vsrc.val[2] = vsrc_2;
38
39 return vsrc;
40 }
41
42 static inline uint8x8x4_t sk_vld4_u8_arm64_4(const SkPMColor* SK_RESTRICT & src) {
43 uint8x8x4_t vsrc;
44 uint8x8_t vsrc_0, vsrc_1, vsrc_2, vsrc_3;
45
46 asm (
47 "ld4 {v0.8b - v3.8b}, [%[src]], #32 \t\n"
48 "mov %[vsrc0].8b, v0.8b \t\n"
49 "mov %[vsrc1].8b, v1.8b \t\n"
50 "mov %[vsrc2].8b, v2.8b \t\n"
51 "mov %[vsrc3].8b, v3.8b \t\n"
52 : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
53 [vsrc2] "=w" (vsrc_2), [vsrc3] "=w" (vsrc_3),
54 [src] "+&r" (src)
55 : : "v0", "v1", "v2", "v3"
56 );
57
58 vsrc.val[0] = vsrc_0;
59 vsrc.val[1] = vsrc_1;
60 vsrc.val[2] = vsrc_2;
61 vsrc.val[3] = vsrc_3;
62
63 return vsrc;
64 }
65 #endif
66
67 void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
68 const SkPMColor* SK_RESTRICT src, int count,
69 U8CPU alpha, int /*x*/, int /*y*/) {
70 SkASSERT(255 == alpha);
71
72 while (count >= 8) {
73 uint8x8x4_t vsrc;
74 uint16x8_t vdst;
75
76 // Load
77 #ifdef SK_CPU_ARM64
78 vsrc = sk_vld4_u8_arm64_3(src);
79 #else
80 vsrc = vld4_u8((uint8_t*)src);
81 src += 8;
82 #endif
83
84 // Convert src to 565
85 vdst = SkPixel32ToPixel16_neon8(vsrc);
86
87 // Store
88 vst1q_u16(dst, vdst);
89
90 // Prepare next iteration
91 dst += 8;
92 count -= 8;
93 };
94
95 // Leftovers
96 while (count > 0) {
97 SkPMColor c = *src++;
98 SkPMColorAssert(c);
99 *dst = SkPixel32ToPixel16_ToU16(c);
100 dst++;
101 count--;
102 };
103 }
104
105 void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
106 const SkPMColor* SK_RESTRICT src, int count,
107 U8CPU alpha, int /*x*/, int /*y*/) {
108 SkASSERT(255 > alpha);
109
110 uint16x8_t vmask_blue, vscale;
111
112 // prepare constants
113 vscale = vdupq_n_u16(SkAlpha255To256(alpha));
114 vmask_blue = vmovq_n_u16(0x1F);
115
116 while (count >= 8) {
117 uint8x8x4_t vsrc;
118 uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
119 uint16x8_t vres_r, vres_g, vres_b;
120
121 // Load src
122 #ifdef SK_CPU_ARM64
123 vsrc = sk_vld4_u8_arm64_3(src);
124 #else
125 {
126 register uint8x8_t d0 asm("d0");
127 register uint8x8_t d1 asm("d1");
128 register uint8x8_t d2 asm("d2");
129 register uint8x8_t d3 asm("d3");
130
131 asm (
132 "vld4.8 {d0-d3},[%[src]]!"
133 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
134 :
135 );
136 vsrc.val[0] = d0;
137 vsrc.val[1] = d1;
138 vsrc.val[2] = d2;
139 }
140 #endif
141
142 // Load and unpack dst
143 vdst = vld1q_u16(dst);
144 vdst_g = vshlq_n_u16(vdst, 5); // shift green to top of lanes
145 vdst_b = vandq_u16(vdst, vmask_blue); // extract blue
146 vdst_r = vshrq_n_u16(vdst, 6+5); // extract red
147 vdst_g = vshrq_n_u16(vdst_g, 5+5); // extract green
148
149 // Shift src to 565 range
150 vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 3);
151 vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 2);
152 vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 3);
153
154 // Scale src - dst
155 vres_r = vmovl_u8(vsrc.val[NEON_R]) - vdst_r;
156 vres_g = vmovl_u8(vsrc.val[NEON_G]) - vdst_g;
157 vres_b = vmovl_u8(vsrc.val[NEON_B]) - vdst_b;
158
159 vres_r = vshrq_n_u16(vres_r * vscale, 8);
160 vres_g = vshrq_n_u16(vres_g * vscale, 8);
161 vres_b = vshrq_n_u16(vres_b * vscale, 8);
162
163 vres_r += vdst_r;
164 vres_g += vdst_g;
165 vres_b += vdst_b;
166
167 // Combine
168 vres_b = vsliq_n_u16(vres_b, vres_g, 5); // insert green into blue
169 vres_b = vsliq_n_u16(vres_b, vres_r, 6+5); // insert red into green/blue
170
171 // Store
172 vst1q_u16(dst, vres_b);
173 dst += 8;
174 count -= 8;
175 }
176 if (count > 0) {
177 int scale = SkAlpha255To256(alpha);
178 do {
179 SkPMColor c = *src++;
180 SkPMColorAssert(c);
181 uint16_t d = *dst;
182 *dst++ = SkPackRGB16(
183 SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale),
184 SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale),
185 SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale));
186 } while (--count != 0);
187 }
188 }
189
190 #ifdef SK_CPU_ARM32
191 void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
192 const SkPMColor* SK_RESTRICT src, int count,
193 U8CPU alpha, int /*x*/, int /*y*/) {
194 SkASSERT(255 == alpha);
195
196 if (count >= 8) {
197 uint16_t* SK_RESTRICT keep_dst = 0;
198
199 asm volatile (
200 "ands ip, %[count], #7 \n\t"
201 "vmov.u8 d31, #1<<7 \n\t"
202 "vld1.16 {q12}, [%[dst]] \n\t"
203 "vld4.8 {d0-d3}, [%[src]] \n\t"
204 // Thumb does not support the standard ARM conditional
205 // instructions but instead requires the 'it' instruction
206 // to signal conditional execution
207 "it eq \n\t"
208 "moveq ip, #8 \n\t"
209 "mov %[keep_dst], %[dst] \n\t"
210
211 "add %[src], %[src], ip, LSL#2 \n\t"
212 "add %[dst], %[dst], ip, LSL#1 \n\t"
213 "subs %[count], %[count], ip \n\t"
214 "b 9f \n\t"
215 // LOOP
216 "2: \n\t"
217
218 "vld1.16 {q12}, [%[dst]]! \n\t"
219 "vld4.8 {d0-d3}, [%[src]]! \n\t"
220 "vst1.16 {q10}, [%[keep_dst]] \n\t"
221 "sub %[keep_dst], %[dst], #8*2 \n\t"
222 "subs %[count], %[count], #8 \n\t"
223 "9: \n\t"
224 "pld [%[dst],#32] \n\t"
225 // expand 0565 q12 to 8888 {d4-d7}
226 "vmovn.u16 d4, q12 \n\t"
227 "vshr.u16 q11, q12, #5 \n\t"
228 "vshr.u16 q10, q12, #6+5 \n\t"
229 "vmovn.u16 d5, q11 \n\t"
230 "vmovn.u16 d6, q10 \n\t"
231 "vshl.u8 d4, d4, #3 \n\t"
232 "vshl.u8 d5, d5, #2 \n\t"
233 "vshl.u8 d6, d6, #3 \n\t"
234
235 "vmovl.u8 q14, d31 \n\t"
236 "vmovl.u8 q13, d31 \n\t"
237 "vmovl.u8 q12, d31 \n\t"
238
239 // duplicate in 4/2/1 & 8pix vsns
240 "vmvn.8 d30, d3 \n\t"
241 "vmlal.u8 q14, d30, d6 \n\t"
242 "vmlal.u8 q13, d30, d5 \n\t"
243 "vmlal.u8 q12, d30, d4 \n\t"
244 "vshr.u16 q8, q14, #5 \n\t"
245 "vshr.u16 q9, q13, #6 \n\t"
246 "vaddhn.u16 d6, q14, q8 \n\t"
247 "vshr.u16 q8, q12, #5 \n\t"
248 "vaddhn.u16 d5, q13, q9 \n\t"
249 "vaddhn.u16 d4, q12, q8 \n\t"
250 // intentionally don't calculate alpha
251 // result in d4-d6
252
253 #ifdef SK_PMCOLOR_IS_RGBA
254 "vqadd.u8 d6, d6, d0 \n\t"
255 "vqadd.u8 d5, d5, d1 \n\t"
256 "vqadd.u8 d4, d4, d2 \n\t"
257 #else
258 "vqadd.u8 d6, d6, d2 \n\t"
259 "vqadd.u8 d5, d5, d1 \n\t"
260 "vqadd.u8 d4, d4, d0 \n\t"
261 #endif
262
263 // pack 8888 {d4-d6} to 0565 q10
264 "vshll.u8 q10, d6, #8 \n\t"
265 "vshll.u8 q3, d5, #8 \n\t"
266 "vshll.u8 q2, d4, #8 \n\t"
267 "vsri.u16 q10, q3, #5 \n\t"
268 "vsri.u16 q10, q2, #11 \n\t"
269
270 "bne 2b \n\t"
271
272 "1: \n\t"
273 "vst1.16 {q10}, [%[keep_dst]] \n\t"
274 : [count] "+r" (count)
275 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
276 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
277 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
278 "d30","d31"
279 );
280 }
281 else
282 { // handle count < 8
283 uint16_t* SK_RESTRICT keep_dst = 0;
284
285 asm volatile (
286 "vmov.u8 d31, #1<<7 \n\t"
287 "mov %[keep_dst], %[dst] \n\t"
288
289 "tst %[count], #4 \n\t"
290 "beq 14f \n\t"
291 "vld1.16 {d25}, [%[dst]]! \n\t"
292 "vld1.32 {q1}, [%[src]]! \n\t"
293
294 "14: \n\t"
295 "tst %[count], #2 \n\t"
296 "beq 12f \n\t"
297 "vld1.32 {d24[1]}, [%[dst]]! \n\t"
298 "vld1.32 {d1}, [%[src]]! \n\t"
299
300 "12: \n\t"
301 "tst %[count], #1 \n\t"
302 "beq 11f \n\t"
303 "vld1.16 {d24[1]}, [%[dst]]! \n\t"
304 "vld1.32 {d0[1]}, [%[src]]! \n\t"
305
306 "11: \n\t"
307 // unzips achieve the same as a vld4 operation
308 "vuzp.u16 q0, q1 \n\t"
309 "vuzp.u8 d0, d1 \n\t"
310 "vuzp.u8 d2, d3 \n\t"
311 // expand 0565 q12 to 8888 {d4-d7}
312 "vmovn.u16 d4, q12 \n\t"
313 "vshr.u16 q11, q12, #5 \n\t"
314 "vshr.u16 q10, q12, #6+5 \n\t"
315 "vmovn.u16 d5, q11 \n\t"
316 "vmovn.u16 d6, q10 \n\t"
317 "vshl.u8 d4, d4, #3 \n\t"
318 "vshl.u8 d5, d5, #2 \n\t"
319 "vshl.u8 d6, d6, #3 \n\t"
320
321 "vmovl.u8 q14, d31 \n\t"
322 "vmovl.u8 q13, d31 \n\t"
323 "vmovl.u8 q12, d31 \n\t"
324
325 // duplicate in 4/2/1 & 8pix vsns
326 "vmvn.8 d30, d3 \n\t"
327 "vmlal.u8 q14, d30, d6 \n\t"
328 "vmlal.u8 q13, d30, d5 \n\t"
329 "vmlal.u8 q12, d30, d4 \n\t"
330 "vshr.u16 q8, q14, #5 \n\t"
331 "vshr.u16 q9, q13, #6 \n\t"
332 "vaddhn.u16 d6, q14, q8 \n\t"
333 "vshr.u16 q8, q12, #5 \n\t"
334 "vaddhn.u16 d5, q13, q9 \n\t"
335 "vaddhn.u16 d4, q12, q8 \n\t"
336 // intentionally don't calculate alpha
337 // result in d4-d6
338
339 #ifdef SK_PMCOLOR_IS_RGBA
340 "vqadd.u8 d6, d6, d0 \n\t"
341 "vqadd.u8 d5, d5, d1 \n\t"
342 "vqadd.u8 d4, d4, d2 \n\t"
343 #else
344 "vqadd.u8 d6, d6, d2 \n\t"
345 "vqadd.u8 d5, d5, d1 \n\t"
346 "vqadd.u8 d4, d4, d0 \n\t"
347 #endif
348
349 // pack 8888 {d4-d6} to 0565 q10
350 "vshll.u8 q10, d6, #8 \n\t"
351 "vshll.u8 q3, d5, #8 \n\t"
352 "vshll.u8 q2, d4, #8 \n\t"
353 "vsri.u16 q10, q3, #5 \n\t"
354 "vsri.u16 q10, q2, #11 \n\t"
355
356 // store
357 "tst %[count], #4 \n\t"
358 "beq 24f \n\t"
359 "vst1.16 {d21}, [%[keep_dst]]! \n\t"
360
361 "24: \n\t"
362 "tst %[count], #2 \n\t"
363 "beq 22f \n\t"
364 "vst1.32 {d20[1]}, [%[keep_dst]]! \n\t"
365
366 "22: \n\t"
367 "tst %[count], #1 \n\t"
368 "beq 21f \n\t"
369 "vst1.16 {d20[1]}, [%[keep_dst]]! \n\t"
370
371 "21: \n\t"
372 : [count] "+r" (count)
373 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
374 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
375 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
376 "d30","d31"
377 );
378 }
379 }
380
381 #else // #ifdef SK_CPU_ARM32
382
383 void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
384 const SkPMColor* SK_RESTRICT src, int count,
385 U8CPU alpha, int /*x*/, int /*y*/) {
386 SkASSERT(255 == alpha);
387
388 if (count >= 16) {
389 asm (
390 "movi v4.8h, #0x80 \t\n"
391
392 "1: \t\n"
393 "sub %w[count], %w[count], #16 \t\n"
394 "ld1 {v16.8h-v17.8h}, [%[dst]] \t\n"
395 "ld4 {v0.16b-v3.16b}, [%[src]], #64 \t\n"
396 "prfm pldl1keep, [%[src],#512] \t\n"
397 "prfm pldl1keep, [%[dst],#256] \t\n"
398 "ushr v20.8h, v17.8h, #5 \t\n"
399 "ushr v31.8h, v16.8h, #5 \t\n"
400 "xtn v6.8b, v31.8h \t\n"
401 "xtn2 v6.16b, v20.8h \t\n"
402 "ushr v20.8h, v17.8h, #11 \t\n"
403 "shl v19.16b, v6.16b, #2 \t\n"
404 "ushr v31.8h, v16.8h, #11 \t\n"
405 "xtn v22.8b, v31.8h \t\n"
406 "xtn2 v22.16b, v20.8h \t\n"
407 "shl v18.16b, v22.16b, #3 \t\n"
408 "mvn v3.16b, v3.16b \t\n"
409 "xtn v16.8b, v16.8h \t\n"
410 "mov v7.16b, v4.16b \t\n"
411 "xtn2 v16.16b, v17.8h \t\n"
412 "umlal v7.8h, v3.8b, v19.8b \t\n"
413 "shl v16.16b, v16.16b, #3 \t\n"
414 "mov v22.16b, v4.16b \t\n"
415 "ushr v24.8h, v7.8h, #6 \t\n"
416 "umlal v22.8h, v3.8b, v18.8b \t\n"
417 "ushr v20.8h, v22.8h, #5 \t\n"
418 "addhn v20.8b, v22.8h, v20.8h \t\n"
419 "cmp %w[count], #16 \t\n"
420 "mov v6.16b, v4.16b \t\n"
421 "mov v5.16b, v4.16b \t\n"
422 "umlal v6.8h, v3.8b, v16.8b \t\n"
423 "umlal2 v5.8h, v3.16b, v19.16b \t\n"
424 "mov v17.16b, v4.16b \t\n"
425 "ushr v19.8h, v6.8h, #5 \t\n"
426 "umlal2 v17.8h, v3.16b, v18.16b \t\n"
427 "addhn v7.8b, v7.8h, v24.8h \t\n"
428 "ushr v18.8h, v5.8h, #6 \t\n"
429 "ushr v21.8h, v17.8h, #5 \t\n"
430 "addhn2 v7.16b, v5.8h, v18.8h \t\n"
431 "addhn2 v20.16b, v17.8h, v21.8h \t\n"
432 "mov v22.16b, v4.16b \t\n"
433 "addhn v6.8b, v6.8h, v19.8h \t\n"
434 "umlal2 v22.8h, v3.16b, v16.16b \t\n"
435 "ushr v5.8h, v22.8h, #5 \t\n"
436 "addhn2 v6.16b, v22.8h, v5.8h \t\n"
437 "uqadd v7.16b, v1.16b, v7.16b \t\n"
438 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
439 "uqadd v20.16b, v2.16b, v20.16b \t\n"
440 "uqadd v6.16b, v0.16b, v6.16b \t\n"
441 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
442 "uqadd v20.16b, v0.16b, v20.16b \t\n"
443 "uqadd v6.16b, v2.16b, v6.16b \t\n"
444 #else
445 #error "This function only supports BGRA and RGBA."
446 #endif
447 "shll v22.8h, v20.8b, #8 \t\n"
448 "shll v5.8h, v7.8b, #8 \t\n"
449 "sri v22.8h, v5.8h, #5 \t\n"
450 "shll v17.8h, v6.8b, #8 \t\n"
451 "shll2 v23.8h, v20.16b, #8 \t\n"
452 "shll2 v7.8h, v7.16b, #8 \t\n"
453 "sri v22.8h, v17.8h, #11 \t\n"
454 "sri v23.8h, v7.8h, #5 \t\n"
455 "shll2 v6.8h, v6.16b, #8 \t\n"
456 "st1 {v22.8h}, [%[dst]], #16 \t\n"
457 "sri v23.8h, v6.8h, #11 \t\n"
458 "st1 {v23.8h}, [%[dst]], #16 \t\n"
459 "b.ge 1b \t\n"
460 : [dst] "+&r" (dst), [src] "+&r" (src), [count] "+&r" (count)
461 :: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
462 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
463 "v31"
464 );
465 }
466 // Leftovers
467 if (count > 0) {
468 do {
469 SkPMColor c = *src++;
470 SkPMColorAssert(c);
471 if (c) {
472 *dst = SkSrcOver32To16(c, *dst);
473 }
474 dst += 1;
475 } while (--count != 0);
476 }
477 }
478 #endif // #ifdef SK_CPU_ARM32
479
480 static uint32_t pmcolor_to_expand16(SkPMColor c) {
481 unsigned r = SkGetPackedR32(c);
482 unsigned g = SkGetPackedG32(c);
483 unsigned b = SkGetPackedB32(c);
484 return (g << 24) | (r << 13) | (b << 2);
485 }
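
// Illustrative scalar sketch (the helper name is ours, not Skia API, and it is
// not used by the blitters below): in the expanded g:11 | r:10 | x:1 | b:10
// layout produced above, every 565 channel has 5 bits of headroom for the
// 5-bit dst scale, so a single add and shift blend all three channels at once.
// This mirrors the per-pixel work of the preamble/tail loops in
// Color32A_D565_neon() below.
static inline uint16_t blend32_16_pixel_sketch(uint16_t d, uint32_t src_expand,
                                               unsigned scale) {
    uint32_t dst_expand = SkExpand_rgb_16(d) * scale;  // scale is (256 - sa) >> 3
    return (uint16_t)SkCompact_rgb_16((src_expand + dst_expand) >> 5);
}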
486
487 void Color32A_D565_neon(uint16_t dst[], SkPMColor src, int count, int x, int y) {
488 uint32_t src_expand;
489 unsigned scale;
490 uint16x8_t vmask_blue;
491
492 if (count <= 0) return;
493 SkASSERT(((size_t)dst & 0x01) == 0);
494
495     /*
496      * This preamble code makes dst 8-byte aligned before the vector
497      * loads and stores that follow.
498      */
499 src_expand = pmcolor_to_expand16(src);
500 scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;
501
502 #define DST_ALIGN 8
503
504     /*
505      * preamble_size is in bytes; the preamble loop below advances dst by one
506      * 16-bit pixel (2 bytes) per iteration.
507      */
507 int preamble_size = (DST_ALIGN - (size_t)dst) & (DST_ALIGN - 1);
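    // For example (illustrative): if dst is 6 bytes past an 8-byte boundary,
    // preamble_size is 2, so exactly one 565 pixel is blended by the scalar
    // preamble loop before the aligned vector loop starts.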
508
509 for (int i = 0; i < preamble_size; i+=2, dst++) {
510 uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;
511 *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);
512 if (--count == 0)
513 break;
514 }
515
516 int count16 = 0;
517 count16 = count >> 4;
518 vmask_blue = vmovq_n_u16(SK_B16_MASK);
519
520 if (count16) {
521 uint16x8_t wide_sr;
522 uint16x8_t wide_sg;
523 uint16x8_t wide_sb;
524 uint16x8_t wide_256_sa;
525
526 unsigned sr = SkGetPackedR32(src);
527 unsigned sg = SkGetPackedG32(src);
528 unsigned sb = SkGetPackedB32(src);
529 unsigned sa = SkGetPackedA32(src);
530
531         // Operation: dst_rgb = src_rgb + ((256 - src_a) >> 3) x dst_rgb
532         // sr is 8-bit, dr is 5-bit; multiplying dr by ((256-sa)>>3) effectively shifts it
533         // left by 5 bits, so shift sr left by 2 bits to line up the MSBs: 8 + 2 = 5 + 5
534 wide_sr = vshlq_n_u16(vmovl_u8(vdup_n_u8(sr)), 2); // widen and src_red shift
535
536         // sg is 8-bit, dg is 6-bit; multiplying dg by ((256-sa)>>3) effectively shifts it
537         // left by 5 bits, so shift sg left by 3 bits to line up the MSBs: 8 + 3 = 6 + 5
538 wide_sg = vshlq_n_u16(vmovl_u8(vdup_n_u8(sg)), 3); // widen and src_grn shift
539
540         // sb is 8-bit, db is 5-bit; multiplying db by ((256-sa)>>3) effectively shifts it
541         // left by 5 bits, so shift sb left by 2 bits to line up the MSBs: 8 + 2 = 5 + 5
542 wide_sb = vshlq_n_u16(vmovl_u8(vdup_n_u8(sb)), 2); // widen and src blu shift
543
544 wide_256_sa =
545 vshrq_n_u16(vsubw_u8(vdupq_n_u16(256), vdup_n_u8(sa)), 3); // (256 - sa) >> 3
546
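        // Per red lane, the loop below therefore computes (scalar sketch):
        //   res_r5 = ((sr << 2) + ((256 - sa) >> 3) * dr5) >> 5
        // i.e. dst_r = src_r + dst_r * (256 - sa) / 256 in 5-bit precision
        // (blue is identical; green uses << 3 and 6-bit precision).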
547 while (count16-- > 0) {
548 uint16x8_t vdst1, vdst1_r, vdst1_g, vdst1_b;
549 uint16x8_t vdst2, vdst2_r, vdst2_g, vdst2_b;
550 vdst1 = vld1q_u16(dst);
551 dst += 8;
552 vdst2 = vld1q_u16(dst);
553 dst -= 8; //to store dst again.
554
555 vdst1_g = vshlq_n_u16(vdst1, SK_R16_BITS); // shift green to top of lanes
556 vdst1_b = vdst1 & vmask_blue; // extract blue
557 vdst1_r = vshrq_n_u16(vdst1, SK_R16_SHIFT); // extract red
558 vdst1_g = vshrq_n_u16(vdst1_g, SK_R16_BITS + SK_B16_BITS); // extract green
559
560 vdst2_g = vshlq_n_u16(vdst2, SK_R16_BITS); // shift green to top of lanes
561 vdst2_b = vdst2 & vmask_blue; // extract blue
562 vdst2_r = vshrq_n_u16(vdst2, SK_R16_SHIFT); // extract red
563 vdst2_g = vshrq_n_u16(vdst2_g, SK_R16_BITS + SK_B16_BITS); // extract green
564
565 vdst1_r = vmlaq_u16(wide_sr, wide_256_sa, vdst1_r); // sr + (256-sa) x dr1
566 vdst1_g = vmlaq_u16(wide_sg, wide_256_sa, vdst1_g); // sg + (256-sa) x dg1
567 vdst1_b = vmlaq_u16(wide_sb, wide_256_sa, vdst1_b); // sb + (256-sa) x db1
568
569 vdst2_r = vmlaq_u16(wide_sr, wide_256_sa, vdst2_r); // sr + (256-sa) x dr2
570 vdst2_g = vmlaq_u16(wide_sg, wide_256_sa, vdst2_g); // sg + (256-sa) x dg2
571 vdst2_b = vmlaq_u16(wide_sb, wide_256_sa, vdst2_b); // sb + (256-sa) x db2
572
573 vdst1_r = vshrq_n_u16(vdst1_r, 5); // 5-bit right shift for 5-bit red
574 vdst1_g = vshrq_n_u16(vdst1_g, 5); // 5-bit right shift for 6-bit green
575 vdst1_b = vshrq_n_u16(vdst1_b, 5); // 5-bit right shift for 5-bit blue
576
577 vdst1 = vsliq_n_u16(vdst1_b, vdst1_g, SK_G16_SHIFT); // insert green into blue
578 vdst1 = vsliq_n_u16(vdst1, vdst1_r, SK_R16_SHIFT); // insert red into green/blue
579
580 vdst2_r = vshrq_n_u16(vdst2_r, 5); // 5-bit right shift for 5-bit red
581 vdst2_g = vshrq_n_u16(vdst2_g, 5); // 5-bit right shift for 6-bit green
582 vdst2_b = vshrq_n_u16(vdst2_b, 5); // 5-bit right shift for 5-bit blue
583
584 vdst2 = vsliq_n_u16(vdst2_b, vdst2_g, SK_G16_SHIFT); // insert green into blue
585 vdst2 = vsliq_n_u16(vdst2, vdst2_r, SK_R16_SHIFT); // insert red into green/blue
586
587 vst1q_u16(dst, vdst1);
588 dst += 8;
589 vst1q_u16(dst, vdst2);
590 dst += 8;
591 }
592 }
593
594 count &= 0xF;
595 if (count > 0) {
596 do {
597 uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;
598 *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);
599 dst += 1;
600 } while (--count != 0);
601 }
602 }
603
604 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
605 prod += vdupq_n_u16(128);
606 prod += vshrq_n_u16(prod, 8);
607 return vshrq_n_u16(prod, 8);
608 }
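
// Scalar equivalent of the lane-wise operation above (illustrative sketch only;
// the helper name is ours and it is not used below): the usual rounded
// divide-by-255, (x + 128 + ((x + 128) >> 8)) >> 8.
static inline unsigned SkDiv255Round_scalar_sketch(unsigned prod) {
    unsigned tmp = prod + 128;
    return (tmp + (tmp >> 8)) >> 8;
}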
609
610 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
611 const SkPMColor* SK_RESTRICT src, int count,
612 U8CPU alpha, int /*x*/, int /*y*/) {
613 SkASSERT(255 > alpha);
614
615 /* This code implements a Neon version of S32A_D565_Blend. The results have
616 * a few mismatches compared to the original code. These mismatches never
617 * exceed 1.
618 */
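    /* The mismatches come from the default path below dividing by 256 with
     * rounding (vrshrq_n_u16(x, 8)) instead of dividing by 255 exactly; the
     * exact but slower variant using SkDiv255Round_neon8() is kept under
     * S32A_D565_BLEND_EXACT.
     */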
619
620 if (count >= 8) {
621 uint16x8_t valpha_max, vmask_blue;
622 uint8x8_t valpha;
623
624 // prepare constants
625 valpha_max = vmovq_n_u16(255);
626 valpha = vdup_n_u8(alpha);
627 vmask_blue = vmovq_n_u16(SK_B16_MASK);
628
629 do {
630 uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
631 uint16x8_t vres_a, vres_r, vres_g, vres_b;
632 uint8x8x4_t vsrc;
633
634 // load pixels
635 vdst = vld1q_u16(dst);
636 #ifdef SK_CPU_ARM64
637 vsrc = sk_vld4_u8_arm64_4(src);
638 #elif (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
639 asm (
640 "vld4.u8 %h[vsrc], [%[src]]!"
641 : [vsrc] "=w" (vsrc), [src] "+&r" (src)
642 : :
643 );
644 #else
645 register uint8x8_t d0 asm("d0");
646 register uint8x8_t d1 asm("d1");
647 register uint8x8_t d2 asm("d2");
648 register uint8x8_t d3 asm("d3");
649
650 asm volatile (
651 "vld4.u8 {d0-d3},[%[src]]!;"
652 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
653 [src] "+&r" (src)
654 : :
655 );
656 vsrc.val[0] = d0;
657 vsrc.val[1] = d1;
658 vsrc.val[2] = d2;
659 vsrc.val[3] = d3;
660 #endif
661
662
663 // deinterleave dst
664 vdst_g = vshlq_n_u16(vdst, SK_R16_BITS); // shift green to top of lanes
665 vdst_b = vdst & vmask_blue; // extract blue
666 vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT); // extract red
667 vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green
668
669 // shift src to 565
670 vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS);
671 vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS);
672 vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS);
673
674 // calc src * src_scale
675 vres_a = vmull_u8(vsrc.val[NEON_A], valpha);
676 vres_r = vmull_u8(vsrc.val[NEON_R], valpha);
677 vres_g = vmull_u8(vsrc.val[NEON_G], valpha);
678 vres_b = vmull_u8(vsrc.val[NEON_B], valpha);
679
680 // prepare dst_scale
681 vres_a = SkDiv255Round_neon8(vres_a);
682 vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255
683
684 // add dst * dst_scale to previous result
685 vres_r = vmlaq_u16(vres_r, vdst_r, vres_a);
686 vres_g = vmlaq_u16(vres_g, vdst_g, vres_a);
687 vres_b = vmlaq_u16(vres_b, vdst_b, vres_a);
688
689 #ifdef S32A_D565_BLEND_EXACT
690 // It is possible to get exact results with this but it is slow,
691 // even slower than C code in some cases
692 vres_r = SkDiv255Round_neon8(vres_r);
693 vres_g = SkDiv255Round_neon8(vres_g);
694 vres_b = SkDiv255Round_neon8(vres_b);
695 #else
696 vres_r = vrshrq_n_u16(vres_r, 8);
697 vres_g = vrshrq_n_u16(vres_g, 8);
698 vres_b = vrshrq_n_u16(vres_b, 8);
699 #endif
700 // pack result
701 vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green into blue
702 vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red into green/blue
703
704 // store
705 vst1q_u16(dst, vres_b);
706 dst += 8;
707 count -= 8;
708 } while (count >= 8);
709 }
710
711 // leftovers
712 while (count-- > 0) {
713 SkPMColor sc = *src++;
714 if (sc) {
715 uint16_t dc = *dst;
716 unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
717 unsigned dr = (SkPacked32ToR16(sc) * alpha) + (SkGetPackedR16(dc) * dst_scale);
718 unsigned dg = (SkPacked32ToG16(sc) * alpha) + (SkGetPackedG16(dc) * dst_scale);
719 unsigned db = (SkPacked32ToB16(sc) * alpha) + (SkGetPackedB16(dc) * dst_scale);
720 *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
721 }
722 dst += 1;
723 }
724 }
725
726 /* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
727 * each dither value is spaced out into byte lanes, and repeated
728 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
729 * start of each row.
730 */
731 static const uint8_t gDitherMatrix_Neon[48] = {
732 0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
733 6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
734 1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
735 7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
736
737 };
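
// Row/offset selection used by the dither blitters below (for illustration):
// &gDitherMatrix_Neon[(y & 3) * 12 + (x & 3)] picks the 12-byte row for this
// scanline and starts it at the current x phase, so a single 8-byte vld1_u8
// yields one dither value per pixel. For example, x = 2 and y = 1 load
// {7, 3, 6, 2, 7, 3, 6, 2}.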
738
739 void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
740 int count, U8CPU alpha, int x, int y)
741 {
742
743 SkASSERT(255 > alpha);
744
745 // rescale alpha to range 1 - 256
746 int scale = SkAlpha255To256(alpha);
747
748 if (count >= 8) {
749 /* select row and offset for dither array */
750 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
751
752 uint8x8_t vdither = vld1_u8(dstart); // load dither values
753 uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values
754
755 int16x8_t vscale = vdupq_n_s16(scale); // duplicate scale into neon reg
756 uint16x8_t vmask_b = vdupq_n_u16(0x1F); // set up blue mask
757
758 do {
759
760 uint8x8x4_t vsrc;
761 uint8x8_t vsrc_r, vsrc_g, vsrc_b;
762 uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
763 uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
764 uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;
765 uint16x8_t vdst;
766 uint16x8_t vdst_r, vdst_g, vdst_b;
767 int16x8_t vres_r, vres_g, vres_b;
768 int8x8_t vres8_r, vres8_g, vres8_b;
769
770 // Load source and add dither
771 #ifdef SK_CPU_ARM64
772 vsrc = sk_vld4_u8_arm64_3(src);
773 #else
774 {
775 register uint8x8_t d0 asm("d0");
776 register uint8x8_t d1 asm("d1");
777 register uint8x8_t d2 asm("d2");
778 register uint8x8_t d3 asm("d3");
779
780 asm (
781 "vld4.8 {d0-d3},[%[src]]! "
782 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
783 :
784 );
785 vsrc.val[0] = d0;
786 vsrc.val[1] = d1;
787 vsrc.val[2] = d2;
788 }
789 #endif
790 vsrc_r = vsrc.val[NEON_R];
791 vsrc_g = vsrc.val[NEON_G];
792 vsrc_b = vsrc.val[NEON_B];
793
794 vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
795 vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
796 vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5
797
798 vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen
799 vsrc_dit_r = vaddl_u8(vsrc_r, vdither); // add in dither to red and widen
800 vsrc_dit_b = vaddl_u8(vsrc_b, vdither); // add in dither to blue and widen
801
802 vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r); // sub shifted red from result
803 vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g); // sub shifted green from result
804 vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b); // sub shifted blue from result
805
806 vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3);
807 vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2);
808 vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3);
809
810 // Load dst and unpack
811 vdst = vld1q_u16(dst);
812 vdst_g = vshrq_n_u16(vdst, 5); // shift down to get green
813 vdst_r = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract red
814 vdst_b = vandq_u16(vdst, vmask_b); // mask to get blue
815
816 // subtract dst from src and widen
817 vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r));
818 vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g));
819 vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b));
820
821 // multiply diffs by scale and shift
822 vres_r = vmulq_s16(vres_r, vscale);
823 vres_g = vmulq_s16(vres_g, vscale);
824 vres_b = vmulq_s16(vres_b, vscale);
825
826 vres8_r = vshrn_n_s16(vres_r, 8);
827 vres8_g = vshrn_n_s16(vres_g, 8);
828 vres8_b = vshrn_n_s16(vres_b, 8);
829
830 // add dst to result
831 vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r);
832 vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g);
833 vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b);
834
835 // put result into 565 format
836 vres_b = vsliq_n_s16(vres_b, vres_g, 5); // shift up green and insert into blue
837 vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and insert into blue
838
839 // Store result
840 vst1q_u16(dst, vreinterpretq_u16_s16(vres_b));
841
842 // Next iteration
843 dst += 8;
844 count -= 8;
845
846 } while (count >= 8);
847 }
848
849 // Leftovers
850 if (count > 0) {
851 int scale = SkAlpha255To256(alpha);
852 DITHER_565_SCAN(y);
853 do {
854 SkPMColor c = *src++;
855 SkPMColorAssert(c);
856
857 int dither = DITHER_VALUE(x);
858 int sr = SkGetPackedR32(c);
859 int sg = SkGetPackedG32(c);
860 int sb = SkGetPackedB32(c);
861 sr = SkDITHER_R32To565(sr, dither);
862 sg = SkDITHER_G32To565(sg, dither);
863 sb = SkDITHER_B32To565(sb, dither);
864
865 uint16_t d = *dst;
866 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
867 SkAlphaBlend(sg, SkGetPackedG16(d), scale),
868 SkAlphaBlend(sb, SkGetPackedB16(d), scale));
869 DITHER_INC_X(x);
870 } while (--count != 0);
871 }
872 }
873
874 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
875 const SkPMColor* SK_RESTRICT src,
876 int count, U8CPU alpha) {
877
878 SkASSERT(255 == alpha);
879 if (count > 0) {
880
881
882 uint8x8_t alpha_mask;
883
884 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
885 alpha_mask = vld1_u8(alpha_mask_setup);
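        /* vtbl1_u8(src_raw, alpha_mask) below copies byte 3 of the first pixel
         * and byte 7 (i.e. byte 3 of the second pixel) across all four lanes of
         * the respective pixel -- the alpha bytes when alpha occupies the top
         * byte of each SkPMColor. */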
886
887 /* do the NEON unrolled code */
888 #define UNROLL 4
889 while (count >= UNROLL) {
890 uint8x8_t src_raw, dst_raw, dst_final;
891 uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
892
893             /* The two prefetches below may make the code slightly
894 * slower for small values of count but are worth having
895 * in the general case.
896 */
897 __builtin_prefetch(src+32);
898 __builtin_prefetch(dst+32);
899
900 /* get the source */
901 src_raw = vreinterpret_u8_u32(vld1_u32(src));
902 #if UNROLL > 2
903 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
904 #endif
905
906 /* get and hold the dst too */
907 dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
908 #if UNROLL > 2
909 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
910 #endif
911
912 /* 1st and 2nd bits of the unrolling */
913 {
914 uint8x8_t dst_cooked;
915 uint16x8_t dst_wide;
916 uint8x8_t alpha_narrow;
917 uint16x8_t alpha_wide;
918
919 /* get the alphas spread out properly */
920 alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
921 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
922
923 /* spread the dest */
924 dst_wide = vmovl_u8(dst_raw);
925
926 /* alpha mul the dest */
927 dst_wide = vmulq_u16 (dst_wide, alpha_wide);
928 dst_cooked = vshrn_n_u16(dst_wide, 8);
929
930 /* sum -- ignoring any byte lane overflows */
931 dst_final = vadd_u8(src_raw, dst_cooked);
932 }
933
934 #if UNROLL > 2
935 /* the 3rd and 4th bits of our unrolling */
936 {
937 uint8x8_t dst_cooked;
938 uint16x8_t dst_wide;
939 uint8x8_t alpha_narrow;
940 uint16x8_t alpha_wide;
941
942 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
943 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
944
945 /* spread the dest */
946 dst_wide = vmovl_u8(dst_raw_2);
947
948 /* alpha mul the dest */
949 dst_wide = vmulq_u16 (dst_wide, alpha_wide);
950 dst_cooked = vshrn_n_u16(dst_wide, 8);
951
952 /* sum -- ignoring any byte lane overflows */
953 dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
954 }
955 #endif
956
957 vst1_u32(dst, vreinterpret_u32_u8(dst_final));
958 #if UNROLL > 2
959 vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
960 #endif
961
962 src += UNROLL;
963 dst += UNROLL;
964 count -= UNROLL;
965 }
966 #undef UNROLL
967
968 /* do any residual iterations */
969 while (--count >= 0) {
970 *dst = SkPMSrcOver(*src, *dst);
971 src += 1;
972 dst += 1;
973 }
974 }
975 }
976
977 void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
978 const SkPMColor* SK_RESTRICT src,
979 int count, U8CPU alpha) {
980 SkASSERT(255 == alpha);
981
982 if (count <= 0)
983 return;
984
985 /* Use these to check if src is transparent or opaque */
986 const unsigned int ALPHA_OPAQ = 0xFF000000;
987 const unsigned int ALPHA_TRANS = 0x00FFFFFF;
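    /* This proc is only selected when alpha occupies the top byte of each
     * SkPMColor (see the proc table at the bottom of this file), so a whole
     * pixel compares >= ALPHA_OPAQ exactly when its alpha is 0xFF and
     * <= ALPHA_TRANS exactly when its alpha is 0. */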
988
989 #define UNROLL 4
990 const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
991 const SkPMColor* SK_RESTRICT src_temp = src;
992
993 /* set up the NEON variables */
994 uint8x8_t alpha_mask;
995 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
996 alpha_mask = vld1_u8(alpha_mask_setup);
997
998 uint8x8_t src_raw, dst_raw, dst_final;
999 uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
1000 uint8x8_t dst_cooked;
1001 uint16x8_t dst_wide;
1002 uint8x8_t alpha_narrow;
1003 uint16x8_t alpha_wide;
1004
1005 /* choose the first processing type */
1006 if( src >= src_end)
1007 goto TAIL;
1008 if(*src <= ALPHA_TRANS)
1009 goto ALPHA_0;
1010 if(*src >= ALPHA_OPAQ)
1011 goto ALPHA_255;
1012 /* fall-thru */
1013
1014 ALPHA_1_TO_254:
1015 do {
1016
1017 /* get the source */
1018 src_raw = vreinterpret_u8_u32(vld1_u32(src));
1019 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
1020
1021 /* get and hold the dst too */
1022 dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
1023 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
1024
1025
1026 /* get the alphas spread out properly */
1027 alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
1028        /* SkAlpha255To256(a) is a+1 (vs. a + (a>>7)), so the dst scale */
1029        /* (255 - a) + 1 collapses to 256 - a, computed here with vsubw_u8 */
1030 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
1031
1032 /* spread the dest */
1033 dst_wide = vmovl_u8(dst_raw);
1034
1035 /* alpha mul the dest */
1036 dst_wide = vmulq_u16 (dst_wide, alpha_wide);
1037 dst_cooked = vshrn_n_u16(dst_wide, 8);
1038
1039 /* sum -- ignoring any byte lane overflows */
1040 dst_final = vadd_u8(src_raw, dst_cooked);
1041
1042 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
1043        /* SkAlpha255To256(a) is a+1 (vs. a + (a>>7)), so the dst scale */
1044        /* (255 - a) + 1 collapses to 256 - a, computed here with vsubw_u8 */
1045 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
1046
1047 /* spread the dest */
1048 dst_wide = vmovl_u8(dst_raw_2);
1049
1050 /* alpha mul the dest */
1051 dst_wide = vmulq_u16 (dst_wide, alpha_wide);
1052 dst_cooked = vshrn_n_u16(dst_wide, 8);
1053
1054 /* sum -- ignoring any byte lane overflows */
1055 dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
1056
1057 vst1_u32(dst, vreinterpret_u32_u8(dst_final));
1058 vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
1059
1060 src += UNROLL;
1061 dst += UNROLL;
1062
1063 /* if 2 of the next pixels aren't between 1 and 254
1064 it might make sense to go to the optimized loops */
1065 if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
1066 break;
1067
1068 } while(src < src_end);
1069
1070 if (src >= src_end)
1071 goto TAIL;
1072
1073 if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
1074 goto ALPHA_255;
1075
1076 /*fall-thru*/
1077
1078 ALPHA_0:
1079
1080 /*In this state, we know the current alpha is 0 and
1081 we optimize for the next alpha also being zero. */
1082 src_temp = src; //so we don't have to increment dst every time
1083 do {
1084 if(*(++src) > ALPHA_TRANS)
1085 break;
1086 if(*(++src) > ALPHA_TRANS)
1087 break;
1088 if(*(++src) > ALPHA_TRANS)
1089 break;
1090 if(*(++src) > ALPHA_TRANS)
1091 break;
1092 } while(src < src_end);
1093
1094 dst += (src - src_temp);
1095
1096 /* no longer alpha 0, so determine where to go next. */
1097 if( src >= src_end)
1098 goto TAIL;
1099 if(*src >= ALPHA_OPAQ)
1100 goto ALPHA_255;
1101 else
1102 goto ALPHA_1_TO_254;
1103
1104 ALPHA_255:
1105 while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
1106 dst[0]=src[0];
1107 dst[1]=src[1];
1108 dst[2]=src[2];
1109 dst[3]=src[3];
1110 src+=UNROLL;
1111 dst+=UNROLL;
1112 if(src >= src_end)
1113 goto TAIL;
1114 }
1115
1116 //Handle remainder.
1117 if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
1118 if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
1119 if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
1120 }
1121 }
1122
1123 if( src >= src_end)
1124 goto TAIL;
1125 if(*src <= ALPHA_TRANS)
1126 goto ALPHA_0;
1127 else
1128 goto ALPHA_1_TO_254;
1129
1130 TAIL:
1131 /* do any residual iterations */
1132 src_end += UNROLL + 1; //goto the real end
1133 while(src != src_end) {
1134 if( *src != 0 ) {
1135 if( *src >= ALPHA_OPAQ ) {
1136 *dst = *src;
1137 }
1138 else {
1139 *dst = SkPMSrcOver(*src, *dst);
1140 }
1141 }
1142 src++;
1143 dst++;
1144 }
1145
1146 #undef UNROLL
1147 return;
1148 }
1149
1150 /* Neon version of S32_Blend_BlitRow32()
1151 * portable version is in src/core/SkBlitRow_D32.cpp
1152 */
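/* Per channel, the vector loop below computes (scalar sketch):
 *   res = ((src * src_scale) >> 8) + ((dst * dst_scale) >> 8)
 * with src_scale = SkAlpha255To256(alpha) and dst_scale = 256 - src_scale,
 * as set up at the top of the function.
 */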
1153 void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
1154 const SkPMColor* SK_RESTRICT src,
1155 int count, U8CPU alpha) {
1156 SkASSERT(alpha <= 255);
1157
1158 if (count <= 0) {
1159 return;
1160 }
1161
1162 uint16_t src_scale = SkAlpha255To256(alpha);
1163 uint16_t dst_scale = 256 - src_scale;
1164
1165 while (count >= 2) {
1166 uint8x8_t vsrc, vdst, vres;
1167 uint16x8_t vsrc_wide, vdst_wide;
1168
1169 /* These commented prefetches are a big win for count
1170 * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4.
1171 * They also hurt a little (<5%) on an A15
1172 */
1173 //__builtin_prefetch(src+32);
1174 //__builtin_prefetch(dst+32);
1175
1176 // Load
1177 vsrc = vreinterpret_u8_u32(vld1_u32(src));
1178 vdst = vreinterpret_u8_u32(vld1_u32(dst));
1179
1180 // Process src
1181 vsrc_wide = vmovl_u8(vsrc);
1182 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
1183
1184 // Process dst
1185 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
1186
1187 // Combine
1188 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
1189
1190 // Store
1191 vst1_u32(dst, vreinterpret_u32_u8(vres));
1192
1193 src += 2;
1194 dst += 2;
1195 count -= 2;
1196 }
1197
1198 if (count == 1) {
1199 uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
1200 uint16x8_t vsrc_wide, vdst_wide;
1201
1202 // Load
1203 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
1204 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
1205
1206 // Process
1207 vsrc_wide = vmovl_u8(vsrc);
1208 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
1209 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
1210 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
1211
1212 // Store
1213 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
1214 }
1215 }
1216
1217 #ifdef SK_CPU_ARM32
1218 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
1219 const SkPMColor* SK_RESTRICT src,
1220 int count, U8CPU alpha) {
1221
1222 SkASSERT(255 >= alpha);
1223
1224 if (count <= 0) {
1225 return;
1226 }
1227
1228 unsigned alpha256 = SkAlpha255To256(alpha);
1229
1230 // First deal with odd counts
1231 if (count & 1) {
1232 uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
1233 uint16x8_t vdst_wide, vsrc_wide;
1234 unsigned dst_scale;
1235
1236 // Load
1237 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
1238 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
1239
1240 // Calc dst_scale
1241 dst_scale = vget_lane_u8(vsrc, 3);
1242 dst_scale *= alpha256;
1243 dst_scale >>= 8;
1244 dst_scale = 256 - dst_scale;
1245
1246 // Process src
1247 vsrc_wide = vmovl_u8(vsrc);
1248 vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);
1249
1250 // Process dst
1251 vdst_wide = vmovl_u8(vdst);
1252 vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);
1253
1254 // Combine
1255 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
1256
1257 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
1258 dst++;
1259 src++;
1260 count--;
1261 }
1262
1263 if (count) {
1264 uint8x8_t alpha_mask;
1265 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
1266 alpha_mask = vld1_u8(alpha_mask_setup);
1267
1268 do {
1269
1270 uint8x8_t vsrc, vdst, vres, vsrc_alphas;
1271 uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;
1272
1273 __builtin_prefetch(src+32);
1274 __builtin_prefetch(dst+32);
1275
1276 // Load
1277 vsrc = vreinterpret_u8_u32(vld1_u32(src));
1278 vdst = vreinterpret_u8_u32(vld1_u32(dst));
1279
1280 // Prepare src_scale
1281 vsrc_scale = vdupq_n_u16(alpha256);
1282
1283 // Calc dst_scale
1284 vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
1285 vdst_scale = vmovl_u8(vsrc_alphas);
1286 vdst_scale *= vsrc_scale;
1287 vdst_scale = vshrq_n_u16(vdst_scale, 8);
1288 vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);
1289
1290 // Process src
1291 vsrc_wide = vmovl_u8(vsrc);
1292 vsrc_wide *= vsrc_scale;
1293
1294 // Process dst
1295 vdst_wide = vmovl_u8(vdst);
1296 vdst_wide *= vdst_scale;
1297
1298 // Combine
1299 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
1300
1301 vst1_u32(dst, vreinterpret_u32_u8(vres));
1302
1303 src += 2;
1304 dst += 2;
1305 count -= 2;
1306 } while(count);
1307 }
1308 }
1309
1310 ///////////////////////////////////////////////////////////////////////////////
1311
1312 #endif // #ifdef SK_CPU_ARM32
1313
1314 void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
1315 const SkPMColor* SK_RESTRICT src,
1316 int count, U8CPU alpha, int x, int y) {
1317 SkASSERT(255 == alpha);
1318
1319 #define UNROLL 8
1320
1321 if (count >= UNROLL) {
1322
1323 uint8x8_t dbase;
1324 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
1325 dbase = vld1_u8(dstart);
1326
1327 do {
1328 uint8x8x4_t vsrc;
1329 uint8x8_t sr, sg, sb, sa, d;
1330 uint16x8_t dst8, scale8, alpha8;
1331 uint16x8_t dst_r, dst_g, dst_b;
1332
1333 #ifdef SK_CPU_ARM64
1334 vsrc = sk_vld4_u8_arm64_4(src);
1335 #else
1336 {
1337 register uint8x8_t d0 asm("d0");
1338 register uint8x8_t d1 asm("d1");
1339 register uint8x8_t d2 asm("d2");
1340 register uint8x8_t d3 asm("d3");
1341
1342 asm ("vld4.8 {d0-d3},[%[src]]! "
1343 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src)
1344 :
1345 );
1346 vsrc.val[0] = d0;
1347 vsrc.val[1] = d1;
1348 vsrc.val[2] = d2;
1349 vsrc.val[3] = d3;
1350 }
1351 #endif
1352 sa = vsrc.val[NEON_A];
1353 sr = vsrc.val[NEON_R];
1354 sg = vsrc.val[NEON_G];
1355 sb = vsrc.val[NEON_B];
1356
1357 /* calculate 'd', which will be 0..7
1358 * dbase[] is 0..7; alpha is 0..256; 16 bits suffice
1359 */
1360 alpha8 = vmovl_u8(dbase);
1361 alpha8 = vmlal_u8(alpha8, sa, dbase);
1362 d = vshrn_n_u16(alpha8, 8); // narrowing too
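            // i.e. per lane d = (dbase * (sa + 1)) >> 8, so the 0..7 dither value
            // is scaled down by the source alpha before being applied to r/g/b
            // below (mirroring SkAlphaMul(dither, SkAlpha255To256(a)) in the
            // scalar tail of this function).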
1363
1364 // sr = sr - (sr>>5) + d
1365 /* watching for 8-bit overflow. d is 0..7; risky range of
1366 * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
1367 * safe as long as we do ((sr-sr>>5) + d)
1368 */
1369 sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1370 sr = vadd_u8(sr, d);
1371
1372 // sb = sb - (sb>>5) + d
1373 sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1374 sb = vadd_u8(sb, d);
1375
1376 // sg = sg - (sg>>6) + d>>1; similar logic for overflows
1377 sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1378 sg = vadd_u8(sg, vshr_n_u8(d,1));
1379
1380 // need to pick up 8 dst's -- at 16 bits each, 128 bits
1381 dst8 = vld1q_u16(dst);
1382 dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK));
1383 dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16_BITS);
1384 dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT); // clearing hi bits
1385
1386 // blend
1387 scale8 = vsubw_u8(vdupq_n_u16(256), sa);
1388
1389 // combine the addq and mul, save 3 insns
1390 scale8 = vshrq_n_u16(scale8, 3);
1391 dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
1392 dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
1393 dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
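            // Per channel (scalar sketch): dst_c = (src_c << 2, or << 3 for green)
            //                                      + dst_c * ((256 - sa) >> 3);
            // the >> 5 during repacking below divides the scaled result back into
            // 565 range.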
1394
1395 // repack to store
1396 dst8 = vshrq_n_u16(dst_b, 5);
1397 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
1398 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);
1399
1400 vst1q_u16(dst, dst8);
1401
1402 dst += UNROLL;
1403 count -= UNROLL;
1404 // skip x += UNROLL, since it's unchanged mod-4
1405 } while (count >= UNROLL);
1406 }
1407 #undef UNROLL
1408
1409 // residuals
1410 if (count > 0) {
1411 DITHER_565_SCAN(y);
1412 do {
1413 SkPMColor c = *src++;
1414 SkPMColorAssert(c);
1415 if (c) {
1416 unsigned a = SkGetPackedA32(c);
1417
1418                 // dither and alpha are just temporary variables to work around
1419                 // an ICE in debug builds.
1420 unsigned dither = DITHER_VALUE(x);
1421 unsigned alpha = SkAlpha255To256(a);
1422 int d = SkAlphaMul(dither, alpha);
1423
1424 unsigned sr = SkGetPackedR32(c);
1425 unsigned sg = SkGetPackedG32(c);
1426 unsigned sb = SkGetPackedB32(c);
1427 sr = SkDITHER_R32_FOR_565(sr, d);
1428 sg = SkDITHER_G32_FOR_565(sg, d);
1429 sb = SkDITHER_B32_FOR_565(sb, d);
1430
1431 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1432 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1433 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1434 // now src and dst expanded are in g:11 r:10 x:1 b:10
1435 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1436 }
1437 dst += 1;
1438 DITHER_INC_X(x);
1439 } while (--count != 0);
1440 }
1441 }
1442
1443 ///////////////////////////////////////////////////////////////////////////////
1444
1445 void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
1446 const SkPMColor* SK_RESTRICT src,
1447 int count, U8CPU alpha, int x, int y) {
1448 SkASSERT(255 == alpha);
1449
1450 #define UNROLL 8
1451 if (count >= UNROLL) {
1452 uint8x8_t d;
1453 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
1454 d = vld1_u8(dstart);
1455
1456 while (count >= UNROLL) {
1457 uint8x8_t sr, sg, sb;
1458 uint16x8_t dr, dg, db;
1459 uint16x8_t dst8;
1460 uint8x8x4_t vsrc;
1461
1462 #ifdef SK_CPU_ARM64
1463 vsrc = sk_vld4_u8_arm64_3(src);
1464 #else
1465 {
1466 register uint8x8_t d0 asm("d0");
1467 register uint8x8_t d1 asm("d1");
1468 register uint8x8_t d2 asm("d2");
1469 register uint8x8_t d3 asm("d3");
1470
1471 asm (
1472 "vld4.8 {d0-d3},[%[src]]! "
1473 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
1474 :
1475 );
1476 vsrc.val[0] = d0;
1477 vsrc.val[1] = d1;
1478 vsrc.val[2] = d2;
1479 }
1480 #endif
1481 sr = vsrc.val[NEON_R];
1482 sg = vsrc.val[NEON_G];
1483 sb = vsrc.val[NEON_B];
1484
1485 /* XXX: if we want to prefetch, hide it in the above asm()
1486 * using the gcc __builtin_prefetch(), the prefetch will
1487 * fall to the bottom of the loop -- it won't stick up
1488 * at the top of the loop, just after the vld4.
1489 */
1490
1491 // sr = sr - (sr>>5) + d
1492 sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1493 dr = vaddl_u8(sr, d);
1494
1495 // sb = sb - (sb>>5) + d
1496 sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1497 db = vaddl_u8(sb, d);
1498
1499 // sg = sg - (sg>>6) + d>>1; similar logic for overflows
1500 sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1501 dg = vaddl_u8(sg, vshr_n_u8(d, 1));
1502
1503 // pack high bits of each into 565 format (rgb, b is lsb)
1504 dst8 = vshrq_n_u16(db, 3);
1505 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
1506 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);
1507
1508 // store it
1509 vst1q_u16(dst, dst8);
1510
1511 dst += UNROLL;
1512 // we don't need to increment src as the asm above has already done it
1513 count -= UNROLL;
1514 x += UNROLL; // probably superfluous
1515 }
1516 }
1517 #undef UNROLL
1518
1519 // residuals
1520 if (count > 0) {
1521 DITHER_565_SCAN(y);
1522 do {
1523 SkPMColor c = *src++;
1524 SkPMColorAssert(c);
1525 SkASSERT(SkGetPackedA32(c) == 255);
1526
1527 unsigned dither = DITHER_VALUE(x);
1528 *dst++ = SkDitherRGB32To565(c, dither);
1529 DITHER_INC_X(x);
1530 } while (--count != 0);
1531 }
1532 }
1533
1534 ///////////////////////////////////////////////////////////////////////////////
1535
1536 const SkBlitRow::Proc16 sk_blitrow_platform_565_procs_arm_neon[] = {
1537 // no dither
1538 S32_D565_Opaque_neon,
1539 S32_D565_Blend_neon,
1540 S32A_D565_Opaque_neon,
1541 #if 0
1542 S32A_D565_Blend_neon,
1543 #else
1544 nullptr, // https://code.google.com/p/skia/issues/detail?id=2797
1545 #endif
1546
1547 // dither
1548 S32_D565_Opaque_Dither_neon,
1549 S32_D565_Blend_Dither_neon,
1550 S32A_D565_Opaque_Dither_neon,
1551 nullptr, // S32A_D565_Blend_Dither
1552 };
1553
1554 const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm_neon[] = {
1555 Color32A_D565_neon, // Color32_D565,
1556 Color32A_D565_neon, // Color32A_D565,
1557 Color32A_D565_neon, // Color32_D565_Dither,
1558 Color32A_D565_neon, // Color32A_D565_Dither
1559 };
1560
1561 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
1562 nullptr, // S32_Opaque,
1563 S32_Blend_BlitRow32_neon, // S32_Blend,
1564 /*
1565  * We have two choices for S32A_Opaque procs. One reads the src alpha
1566 * value and attempts to optimize accordingly. The optimization is
1567 * sensitive to the source content and is not a win in all cases. For
1568 * example, if there are a lot of transitions between the alpha states,
1569 * the performance will almost certainly be worse. However, for many
1570 * common cases the performance is equivalent or better than the standard
1571 * case where we do not inspect the src alpha.
1572 */
1573 #if SK_A32_SHIFT == 24
1574     // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor
1575 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,
1576 #else
1577 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,
1578 #endif
1579 #ifdef SK_CPU_ARM32
1580 S32A_Blend_BlitRow32_neon // S32A_Blend
1581 #else
1582 nullptr
1583 #endif
1584 };
1585