1 /* 2 * Copyright 2015 Google Inc. 3 * 4 * Use of this source code is governed by a BSD-style license that can be 5 * found in the LICENSE file. 6 */ 7 8 #ifndef SkBlitMask_opts_DEFINED 9 #define SkBlitMask_opts_DEFINED 10 11 #include "Sk4px.h" 12 13 namespace SK_OPTS_NS { 14 15 #if defined(SK_ARM_HAS_NEON) 16 // The Sk4px versions below will work fine with NEON, but we have had many indications 17 // that it doesn't perform as well as this NEON-specific code. TODO(mtklein): why? 18 19 #define NEON_A (SK_A32_SHIFT / 8) 20 #define NEON_R (SK_R32_SHIFT / 8) 21 #define NEON_G (SK_G32_SHIFT / 8) 22 #define NEON_B (SK_B32_SHIFT / 8) 23 24 static inline uint16x8_t SkAlpha255To256_neon8(uint8x8_t alpha) { 25 return vaddw_u8(vdupq_n_u16(1), alpha); 26 } 27 28 static inline uint8x8_t SkAlphaMul_neon8(uint8x8_t color, uint16x8_t scale) { 29 return vshrn_n_u16(vmovl_u8(color) * scale, 8); 30 } 31 32 static inline uint8x8x4_t SkAlphaMulQ_neon8(uint8x8x4_t color, uint16x8_t scale) { 33 uint8x8x4_t ret; 34 35 ret.val[0] = SkAlphaMul_neon8(color.val[0], scale); 36 ret.val[1] = SkAlphaMul_neon8(color.val[1], scale); 37 ret.val[2] = SkAlphaMul_neon8(color.val[2], scale); 38 ret.val[3] = SkAlphaMul_neon8(color.val[3], scale); 39 40 return ret; 41 } 42 43 44 template <bool isColor> 45 static void D32_A8_Opaque_Color_neon(void* SK_RESTRICT dst, size_t dstRB, 46 const void* SK_RESTRICT maskPtr, size_t maskRB, 47 SkColor color, int width, int height) { 48 SkPMColor pmc = SkPreMultiplyColor(color); 49 SkPMColor* SK_RESTRICT device = (SkPMColor*)dst; 50 const uint8_t* SK_RESTRICT mask = (const uint8_t*)maskPtr; 51 uint8x8x4_t vpmc; 52 53 maskRB -= width; 54 dstRB -= (width << 2); 55 56 if (width >= 8) { 57 vpmc.val[NEON_A] = vdup_n_u8(SkGetPackedA32(pmc)); 58 vpmc.val[NEON_R] = vdup_n_u8(SkGetPackedR32(pmc)); 59 vpmc.val[NEON_G] = vdup_n_u8(SkGetPackedG32(pmc)); 60 vpmc.val[NEON_B] = vdup_n_u8(SkGetPackedB32(pmc)); 61 } 62 do { 63 int w = width; 64 while (w >= 8) { 65 uint8x8_t vmask = vld1_u8(mask); 66 uint16x8_t vscale, vmask256 = SkAlpha255To256_neon8(vmask); 67 if (isColor) { 68 vscale = vsubw_u8(vdupq_n_u16(256), 69 SkAlphaMul_neon8(vpmc.val[NEON_A], vmask256)); 70 } else { 71 vscale = vsubw_u8(vdupq_n_u16(256), vmask); 72 } 73 uint8x8x4_t vdev = vld4_u8((uint8_t*)device); 74 75 vdev.val[NEON_A] = SkAlphaMul_neon8(vpmc.val[NEON_A], vmask256) 76 + SkAlphaMul_neon8(vdev.val[NEON_A], vscale); 77 vdev.val[NEON_R] = SkAlphaMul_neon8(vpmc.val[NEON_R], vmask256) 78 + SkAlphaMul_neon8(vdev.val[NEON_R], vscale); 79 vdev.val[NEON_G] = SkAlphaMul_neon8(vpmc.val[NEON_G], vmask256) 80 + SkAlphaMul_neon8(vdev.val[NEON_G], vscale); 81 vdev.val[NEON_B] = SkAlphaMul_neon8(vpmc.val[NEON_B], vmask256) 82 + SkAlphaMul_neon8(vdev.val[NEON_B], vscale); 83 84 vst4_u8((uint8_t*)device, vdev); 85 86 mask += 8; 87 device += 8; 88 w -= 8; 89 } 90 91 while (w--) { 92 unsigned aa = *mask++; 93 if (isColor) { 94 *device = SkBlendARGB32(pmc, *device, aa); 95 } else { 96 *device = SkAlphaMulQ(pmc, SkAlpha255To256(aa)) 97 + SkAlphaMulQ(*device, SkAlpha255To256(255 - aa)); 98 } 99 device += 1; 100 }; 101 102 device = (uint32_t*)((char*)device + dstRB); 103 mask += maskRB; 104 105 } while (--height != 0); 106 } 107 108 static void blit_mask_d32_a8_general(SkPMColor* dst, size_t dstRB, 109 const SkAlpha* mask, size_t maskRB, 110 SkColor color, int w, int h) { 111 D32_A8_Opaque_Color_neon<true>(dst, dstRB, mask, maskRB, color, w, h); 112 } 113 114 // As above, but made slightly simpler by requiring that color is opaque. 115 static void blit_mask_d32_a8_opaque(SkPMColor* dst, size_t dstRB, 116 const SkAlpha* mask, size_t maskRB, 117 SkColor color, int w, int h) { 118 D32_A8_Opaque_Color_neon<false>(dst, dstRB, mask, maskRB, color, w, h); 119 } 120 121 // Same as _opaque, but assumes color == SK_ColorBLACK, a very common and even simpler case. 122 static void blit_mask_d32_a8_black(SkPMColor* dst, size_t dstRB, 123 const SkAlpha* maskPtr, size_t maskRB, 124 int width, int height) { 125 SkPMColor* SK_RESTRICT device = (SkPMColor*)dst; 126 const uint8_t* SK_RESTRICT mask = (const uint8_t*)maskPtr; 127 128 maskRB -= width; 129 dstRB -= (width << 2); 130 do { 131 int w = width; 132 while (w >= 8) { 133 uint8x8_t vmask = vld1_u8(mask); 134 uint16x8_t vscale = vsubw_u8(vdupq_n_u16(256), vmask); 135 uint8x8x4_t vdevice = vld4_u8((uint8_t*)device); 136 137 vdevice = SkAlphaMulQ_neon8(vdevice, vscale); 138 vdevice.val[NEON_A] += vmask; 139 140 vst4_u8((uint8_t*)device, vdevice); 141 142 mask += 8; 143 device += 8; 144 w -= 8; 145 } 146 while (w-- > 0) { 147 unsigned aa = *mask++; 148 *device = (aa << SK_A32_SHIFT) 149 + SkAlphaMulQ(*device, SkAlpha255To256(255 - aa)); 150 device += 1; 151 }; 152 device = (uint32_t*)((char*)device + dstRB); 153 mask += maskRB; 154 } while (--height != 0); 155 } 156 157 #else 158 static void blit_mask_d32_a8_general(SkPMColor* dst, size_t dstRB, 159 const SkAlpha* mask, size_t maskRB, 160 SkColor color, int w, int h) { 161 auto s = Sk4px::DupPMColor(SkPreMultiplyColor(color)); 162 auto fn = [&](const Sk4px& d, const Sk4px& aa) { 163 // = (s + d(1-sa))aa + d(1-aa) 164 // = s*aa + d(1-sa*aa) 165 auto left = s.approxMulDiv255(aa), 166 right = d.approxMulDiv255(left.alphas().inv()); 167 return left + right; // This does not overflow (exhaustively checked). 168 }; 169 while (h --> 0) { 170 Sk4px::MapDstAlpha(w, dst, mask, fn); 171 dst += dstRB / sizeof(*dst); 172 mask += maskRB / sizeof(*mask); 173 } 174 } 175 176 // As above, but made slightly simpler by requiring that color is opaque. 177 static void blit_mask_d32_a8_opaque(SkPMColor* dst, size_t dstRB, 178 const SkAlpha* mask, size_t maskRB, 179 SkColor color, int w, int h) { 180 SkASSERT(SkColorGetA(color) == 0xFF); 181 auto s = Sk4px::DupPMColor(SkPreMultiplyColor(color)); 182 auto fn = [&](const Sk4px& d, const Sk4px& aa) { 183 // = (s + d(1-sa))aa + d(1-aa) 184 // = s*aa + d(1-sa*aa) 185 // ~~~> 186 // = s*aa + d(1-aa) 187 return s.approxMulDiv255(aa) + d.approxMulDiv255(aa.inv()); 188 }; 189 while (h --> 0) { 190 Sk4px::MapDstAlpha(w, dst, mask, fn); 191 dst += dstRB / sizeof(*dst); 192 mask += maskRB / sizeof(*mask); 193 } 194 } 195 196 // Same as _opaque, but assumes color == SK_ColorBLACK, a very common and even simpler case. 197 static void blit_mask_d32_a8_black(SkPMColor* dst, size_t dstRB, 198 const SkAlpha* mask, size_t maskRB, 199 int w, int h) { 200 auto fn = [](const Sk4px& d, const Sk4px& aa) { 201 // = (s + d(1-sa))aa + d(1-aa) 202 // = s*aa + d(1-sa*aa) 203 // ~~~> 204 // a = 1*aa + d(1-1*aa) = aa + d(1-aa) 205 // c = 0*aa + d(1-1*aa) = d(1-aa) 206 return Sk4px(Sk16b(aa) & Sk16b(0,0,0,255, 0,0,0,255, 0,0,0,255, 0,0,0,255)) 207 + d.approxMulDiv255(aa.inv()); 208 }; 209 while (h --> 0) { 210 Sk4px::MapDstAlpha(w, dst, mask, fn); 211 dst += dstRB / sizeof(*dst); 212 mask += maskRB / sizeof(*mask); 213 } 214 } 215 #endif 216 217 /*not static*/ inline void blit_mask_d32_a8(SkPMColor* dst, size_t dstRB, 218 const SkAlpha* mask, size_t maskRB, 219 SkColor color, int w, int h) { 220 if (color == SK_ColorBLACK) { 221 blit_mask_d32_a8_black(dst, dstRB, mask, maskRB, w, h); 222 } else if (SkColorGetA(color) == 0xFF) { 223 blit_mask_d32_a8_opaque(dst, dstRB, mask, maskRB, color, w, h); 224 } else { 225 blit_mask_d32_a8_general(dst, dstRB, mask, maskRB, color, w, h); 226 } 227 } 228 229 } // SK_OPTS_NS 230 231 #endif//SkBlitMask_opts_DEFINED 232