/*
 * Copyright 2012 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "SkBlitRow_opts_arm_neon.h"

#include "SkBlitMask.h"
#include "SkBlitRow.h"
#include "SkColorData.h"
#include "SkDither.h"
#include "SkMathPriv.h"
#include "SkUtils.h"

#include "SkColor_opts_neon.h"
#include <arm_neon.h>

/* Neon version of S32_Blend_BlitRow32()
 * portable version is in src/core/SkBlitRow_D32.cpp
 */
void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                              const SkPMColor* SK_RESTRICT src,
                              int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);

    if (count <= 0) {
        return;
    }

    uint16_t src_scale = SkAlpha255To256(alpha);
    uint16_t dst_scale = 256 - src_scale;

    while (count >= 2) {
        uint8x8_t vsrc, vdst, vres;
        uint16x8_t vsrc_wide, vdst_wide;

        /* These commented prefetches are a big win for count
         * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4.
         * They also hurt a little (<5%) on an A15
         */
        //__builtin_prefetch(src+32);
        //__builtin_prefetch(dst+32);

        // Load
        vsrc = vreinterpret_u8_u32(vld1_u32(src));
        vdst = vreinterpret_u8_u32(vld1_u32(dst));

        // Process src
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));

        // Process dst
        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));

        // Combine
        vdst_wide += vsrc_wide;
        vres = vshrn_n_u16(vdst_wide, 8);

        // Store
        vst1_u32(dst, vreinterpret_u32_u8(vres));

        src += 2;
        dst += 2;
        count -= 2;
    }

    if (count == 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vsrc_wide, vdst_wide;

        // Load
        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Process
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
        vdst_wide += vsrc_wide;
        vres = vshrn_n_u16(vdst_wide, 8);

        // Store
        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
    }
}
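
/* For reference, the per-channel math performed by the loop above is
 *
 *     result = (src * src_scale + dst * dst_scale) >> 8
 *
 * with src_scale = SkAlpha255To256(alpha) and dst_scale = 256 - src_scale.
 * Each NEON iteration handles two pixels (eight 8-bit channels); the widening
 * multiplies keep every intermediate within 16 bits, since
 * src*src_scale + dst*dst_scale <= 255*256 < 65536.
 */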

#ifdef SK_CPU_ARM32
void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                               const SkPMColor* SK_RESTRICT src,
                               int count, U8CPU alpha) {

    SkASSERT(255 > alpha);

    if (count <= 0) {
        return;
    }

    unsigned alpha256 = SkAlpha255To256(alpha);

    // First deal with odd counts
    if (count & 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vdst_wide, vsrc_wide;
        unsigned dst_scale;

        // Load
        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Calc dst_scale
        dst_scale = vget_lane_u8(vsrc, 3);
        dst_scale = SkAlphaMulInv256(dst_scale, alpha256);

        // Process src
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);

        // Process dst
        vdst_wide = vmovl_u8(vdst);
        vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);

        // Combine
        vdst_wide += vsrc_wide;
        vres = vshrn_n_u16(vdst_wide, 8);

        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
        dst++;
        src++;
        count--;
    }

    if (count) {
        uint8x8_t alpha_mask;
        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
        alpha_mask = vld1_u8(alpha_mask_setup);

        do {

            uint8x8_t vsrc, vdst, vres, vsrc_alphas;
            uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;

            __builtin_prefetch(src+32);
            __builtin_prefetch(dst+32);

            // Load
            vsrc = vreinterpret_u8_u32(vld1_u32(src));
            vdst = vreinterpret_u8_u32(vld1_u32(dst));

            // Prepare src_scale
            vsrc_scale = vdupq_n_u16(alpha256);

            // Calc dst_scale
            vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
            vdst_scale = vmovl_u8(vsrc_alphas);
            // Calculate SkAlphaMulInv256(vdst_scale, vsrc_scale).
            // A 16-bit lane would overflow if we used 0xFFFF here,
            // so use an approximation with 0xFF00 that is off by 1,
            // and add back 1 after to get the correct value.
            // This is valid if alpha256 <= 255.
            vdst_scale = vmlsq_u16(vdupq_n_u16(0xFF00), vdst_scale, vsrc_scale);
            vdst_scale = vsraq_n_u16(vdst_scale, vdst_scale, 8);
            vdst_scale = vsraq_n_u16(vdupq_n_u16(1), vdst_scale, 8);

            // Process src
            vsrc_wide = vmovl_u8(vsrc);
            vsrc_wide *= vsrc_scale;

            // Process dst
            vdst_wide = vmovl_u8(vdst);
            vdst_wide *= vdst_scale;

            // Combine
            vdst_wide += vsrc_wide;
            vres = vshrn_n_u16(vdst_wide, 8);

            vst1_u32(dst, vreinterpret_u32_u8(vres));

            src += 2;
            dst += 2;
            count -= 2;
        } while (count);
    }
}

///////////////////////////////////////////////////////////////////////////////

#endif // #ifdef SK_CPU_ARM32

///////////////////////////////////////////////////////////////////////////////

const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
    nullptr,                     // S32_Opaque,
    S32_Blend_BlitRow32_neon,    // S32_Blend,
    nullptr,                     // Ported to SkOpts
#ifdef SK_CPU_ARM32
    S32A_Blend_BlitRow32_neon    // S32A_Blend
#else
    nullptr
#endif
};
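
/* The table is expected to be indexed by SkBlitRow's 32-bit blit flags, so the
 * slots correspond to: no blending, global alpha, per-pixel alpha (ported to
 * SkOpts), and global + per-pixel alpha. nullptr entries defer to the
 * portable (or SkOpts) implementations.
 */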