/*
 * Copyright 2018 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef SkBitmapProcState_opts_DEFINED
#define SkBitmapProcState_opts_DEFINED

#include "SkBitmapProcState.h"

// SkBitmapProcState optimized Shader, Sample, or Matrix procs.
//
// Only S32_alpha_D32_filter_DX exploits instructions beyond
// our common baseline SSE2/NEON instruction sets, so that's
// all that lives here.
//
// The rest are scattershot at the moment but I want to get them
// all migrated to be normal code inside SkBitmapProcState.cpp.

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
    #include <immintrin.h>
#elif defined(SK_ARM_HAS_NEON)
    #include <arm_neon.h>
#endif

namespace SK_OPTS_NS {

// This same basic packing scheme is used throughout the file.
static void decode_packed_coordinates_and_weight(uint32_t packed, int* v0, int* v1, int* w) {
    // The top 14 bits are the integer coordinate x0 or y0.
    *v0 = packed >> 18;

    // The bottom 14 bits are the integer coordinate x1 or y1.
    *v1 = packed & 0x3fff;

    // The middle 4 bits are the interpolating factor between the two, i.e. the weight for v1.
    *w = (packed >> 14) & 0xf;
}
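// For concreteness (an illustrative note, not from the original): a packed word
// is (v0 << 18) | (w << 14) | v1, so v0=3, w=9, v1=4 packs to 0x000e4004, and
// decoding recovers
//     *v0 = 0xe4004 >> 18         = 3,
//     *v1 = 0xe4004 & 0x3fff      = 4,
//     *w  = (0xe4004 >> 14) & 0xf = 9.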
#if 1 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3

    // As above, 4x.
    static void decode_packed_coordinates_and_weight(__m128i packed,
                                                     int v0[4], int v1[4], __m128i* w) {
        _mm_storeu_si128((__m128i*)v0, _mm_srli_epi32(packed, 18));
        _mm_storeu_si128((__m128i*)v1, _mm_and_si128 (packed, _mm_set1_epi32(0x3fff)));
        *w = _mm_and_si128(_mm_srli_epi32(packed, 14), _mm_set1_epi32(0xf));
    }

    // This is the crux of the SSSE3 implementation,
    // interpolating in X for up to two output pixels (A and B) using _mm_maddubs_epi16().
    static inline __m128i interpolate_in_x(uint32_t A0, uint32_t A1,
                                           uint32_t B0, uint32_t B1,
                                           const __m128i& interlaced_x_weights) {
        // _mm_maddubs_epi16() is a little idiosyncratic, but very helpful as the core of a lerp.
        //
        // It takes two arguments interlaced byte-wise:
        //    - first  arg: [ x,y, ... 7 more pairs of 8-bit values ...]
        //    - second arg: [ z,w, ... 7 more pairs of 8-bit values ...]
        // and returns 8 16-bit values: [ x*z + y*w, ... 7 more 16-bit values ... ].
        //
        // That's why we go to all this trouble to make interlaced_x_weights,
        // and here we're interlacing A0 with A1, B0 with B1 to match.

        __m128i interlaced_A = _mm_unpacklo_epi8(_mm_cvtsi32_si128(A0), _mm_cvtsi32_si128(A1)),
                interlaced_B = _mm_unpacklo_epi8(_mm_cvtsi32_si128(B0), _mm_cvtsi32_si128(B1));

        return _mm_maddubs_epi16(_mm_unpacklo_epi64(interlaced_A, interlaced_B),
                                 interlaced_x_weights);
    }

    // Interpolate {A0..A3} --> output pixel A, and {B0..B3} --> output pixel B.
    // Returns two pixels, with each channel in a 16-bit lane of the __m128i.
    static inline __m128i interpolate_in_x_and_y(uint32_t A0, uint32_t A1,
                                                 uint32_t A2, uint32_t A3,
                                                 uint32_t B0, uint32_t B1,
                                                 uint32_t B2, uint32_t B3,
                                                 const __m128i& interlaced_x_weights,
                                                 int wy) {
        // The stored Y weight wy is for y1, and y0 gets a weight 16-wy.
        const __m128i wy1 = _mm_set1_epi16(wy),
                      wy0 = _mm_sub_epi16(_mm_set1_epi16(16), wy1);

        // First interpolate in X,
        // leaving the values in 16-bit lanes scaled up by those [0,16] interlaced_x_weights.
        __m128i row0 = interpolate_in_x(A0,A1, B0,B1, interlaced_x_weights),
                row1 = interpolate_in_x(A2,A3, B2,B3, interlaced_x_weights);

        // Interpolate in Y across the two rows,
        // then scale everything down by the maximum total weight 16x16 = 256.
        return _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(row0, wy0),
                                            _mm_mullo_epi16(row1, wy1)), 8);
    }
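    // A quick numeric check of that math (illustrative, not from the original):
    // for one channel with A0=100, A1=200, A2=50, A3=250 and wx=4, wy=8, the X
    // pass gives row0 = 100*12 + 200*4 = 2000 and row1 = 50*12 + 250*4 = 1600,
    // i.e. the exact lerps 125 and 100 scaled up by 16.  The Y pass then gives
    // (2000*8 + 1600*8) >> 8 = 112, matching the true bilerp value 112.5 up to
    // the truncating shift.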
    /*not static*/ inline
    void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
                                 const uint32_t* xy, int count, uint32_t* colors) {
        SkASSERT(count > 0 && colors != nullptr);
        SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
        SkASSERT(kN32_SkColorType == s.fPixmap.colorType());

        int alpha = s.fAlphaScale;

        // Return (px * s.fAlphaScale) / 256.  (s.fAlphaScale is in [0,256].)
        auto scale_by_alpha = [alpha](const __m128i& px) {
            return alpha == 256 ? px
                                : _mm_srli_epi16(_mm_mullo_epi16(px, _mm_set1_epi16(alpha)), 8);
        };

        // We're in _DX_ mode here, so we're only varying in X.
        // That means the first entry of xy is our constant pair of Y coordinates and weight in Y.
        // All the other entries in xy will be pairs of X coordinates and the X weight.
        int y0, y1, wy;
        decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

        auto row0 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes()),
             row1 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes());

        while (count >= 4) {
            // We can really get going, loading 4 X pairs at a time to produce 4 output pixels.
            const __m128i xx = _mm_loadu_si128((const __m128i*)xy);

            int x0[4],
                x1[4];
            __m128i wx;
            decode_packed_coordinates_and_weight(xx, x0, x1, &wx);

            // Splat out each x weight wx four times (one for each pixel channel) as wx1,
            // and sixteen minus that as the weight for x0, wx0.
            __m128i wx1 = _mm_shuffle_epi8(wx, _mm_setr_epi8(0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12)),
                    wx0 = _mm_sub_epi8(_mm_set1_epi8(16), wx1);

            // We need to interlace wx0 and wx1 for _mm_maddubs_epi16().
            __m128i interlaced_x_weights_AB = _mm_unpacklo_epi8(wx0,wx1),
                    interlaced_x_weights_CD = _mm_unpackhi_epi8(wx0,wx1);

            // interpolate_in_x_and_y() can produce two output pixels (A and B) at a time
            // from eight input pixels {A0..A3} and {B0..B3}, arranged in a 2x2 grid for each.
            __m128i AB = interpolate_in_x_and_y(row0[x0[0]], row0[x1[0]],
                                                row1[x0[0]], row1[x1[0]],
                                                row0[x0[1]], row0[x1[1]],
                                                row1[x0[1]], row1[x1[1]],
                                                interlaced_x_weights_AB, wy);

            // Once more with the other half of the x-weights for two more pixels C,D.
            __m128i CD = interpolate_in_x_and_y(row0[x0[2]], row0[x1[2]],
                                                row1[x0[2]], row1[x1[2]],
                                                row0[x0[3]], row0[x1[3]],
                                                row1[x0[3]], row1[x1[3]],
                                                interlaced_x_weights_CD, wy);

            // Scale by alpha, pack back together to 8-bit lanes, and write out four pixels!
            _mm_storeu_si128((__m128i*)colors, _mm_packus_epi16(scale_by_alpha(AB),
                                                                scale_by_alpha(CD)));
            xy     += 4;
            colors += 4;
            count  -= 4;
        }

        while (count --> 0) {
            // This is exactly the same flow as the count >= 4 loop above, but writing one pixel.
            int x0, x1, wx;
            decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);

            // As above, splat out wx four times as wx1, and sixteen minus that as wx0.
            __m128i wx1 = _mm_set1_epi8(wx),    // This splats it out 16 times, but that's fine.
                    wx0 = _mm_sub_epi8(_mm_set1_epi8(16), wx1);

            __m128i interlaced_x_weights_A = _mm_unpacklo_epi8(wx0, wx1);

            __m128i A = interpolate_in_x_and_y(row0[x0], row0[x1],
                                               row1[x0], row1[x1],
                                               0, 0,
                                               0, 0,
                                               interlaced_x_weights_A, wy);

            *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(scale_by_alpha(A), _mm_setzero_si128()));
        }
    }


#elif 1 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2

    // TODO(mtklein): clean up this code, use decode_packed_coordinates_and_weight(), etc.

    /*not static*/ inline
    void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
                                 const uint32_t* xy, int count, uint32_t* colors) {
        SkASSERT(count > 0 && colors != nullptr);
        SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
        SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
        SkASSERT(s.fAlphaScale <= 256);

        int y0, y1, wy;
        decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

        auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
             row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );

        // We'll put one pixel in the low 4 16-bit lanes to line up with wy,
        // and another in the upper 4 16-bit lanes to line up with 16 - wy.
        const __m128i allY = _mm_unpacklo_epi64(_mm_set1_epi16(   wy),
                                                _mm_set1_epi16(16-wy));

        while (count --> 0) {
            int x0, x1, wx;
            decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);

            // Load the 4 pixels we're interpolating.
            const __m128i a00 = _mm_cvtsi32_si128(row0[x0]),
                          a01 = _mm_cvtsi32_si128(row0[x1]),
                          a10 = _mm_cvtsi32_si128(row1[x0]),
                          a11 = _mm_cvtsi32_si128(row1[x1]);

            // Line up low-x pixels a00 and a10 with allY.
            __m128i a00a10 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(a10, a00),
                                               _mm_setzero_si128());

            // Scale by allY and 16-wx.
            a00a10 = _mm_mullo_epi16(a00a10, allY);
            a00a10 = _mm_mullo_epi16(a00a10, _mm_set1_epi16(16-wx));


            // Line up high-x pixels a01 and a11 with allY.
            __m128i a01a11 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(a11, a01),
                                               _mm_setzero_si128());

            // Scale by allY and wx.
            a01a11 = _mm_mullo_epi16(a01a11, allY);
            a01a11 = _mm_mullo_epi16(a01a11, _mm_set1_epi16(wx));


            // Add the two intermediates, summing across in one direction.
            __m128i halves = _mm_add_epi16(a00a10, a01a11);

            // Add the two halves to each other to sum in the other direction.
            __m128i sum = _mm_add_epi16(halves, _mm_srli_si128(halves, 8));

            // Get back to [0,255] by dividing by maximum weight 16x16 = 256.
            sum = _mm_srli_epi16(sum, 8);

            if (s.fAlphaScale < 256) {
                // Scale by alpha, which is in [0,256].
                sum = _mm_mullo_epi16(sum, _mm_set1_epi16(s.fAlphaScale));
                sum = _mm_srli_epi16(sum, 8);
            }

            // Pack back into 8-bit values and store.
            *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(sum, _mm_setzero_si128()));
        }
    }
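    // To recap the trick above (an annotation, not from the original): after the
    // two multiplies, lanes 0-3 of a00a10 hold a10's channels scaled by wy*(16-wx)
    // and lanes 4-7 hold a00's scaled by (16-wy)*(16-wx); a01a11 likewise holds
    // a11 and a01 scaled by wy*wx and (16-wy)*wx.  Folding the high half onto the
    // low half therefore leaves each channel's full bilinear sum, with total
    // weight 16*16 = 256, ready for the shift down by 8.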
#else

    // The NEON code only actually differs from the portable code in the
    // filtering step after we've loaded all four pixels we want to bilerp.

    #if defined(SK_ARM_HAS_NEON)
        static void filter_and_scale_by_alpha(unsigned x, unsigned y,
                                              SkPMColor a00, SkPMColor a01,
                                              SkPMColor a10, SkPMColor a11,
                                              SkPMColor *dst,
                                              uint16_t scale) {
            uint8x8_t vy, vconst16_8, v16_y, vres;
            uint16x4_t vx, vconst16_16, v16_x, tmp, vscale;
            uint32x2_t va0, va1;
            uint16x8_t tmp1, tmp2;

            vy = vdup_n_u8(y);                // duplicate y into vy
            vconst16_8 = vmov_n_u8(16);       // set up constant in vconst16_8
            v16_y = vsub_u8(vconst16_8, vy);  // v16_y = 16-y

            va0 = vdup_n_u32(a00);            // duplicate a00
            va1 = vdup_n_u32(a10);            // duplicate a10
            va0 = vset_lane_u32(a01, va0, 1); // set top to a01
            va1 = vset_lane_u32(a11, va1, 1); // set top to a11

            tmp1 = vmull_u8(vreinterpret_u8_u32(va0), v16_y); // tmp1 = [a01|a00] * (16-y)
            tmp2 = vmull_u8(vreinterpret_u8_u32(va1), vy);    // tmp2 = [a11|a10] * y

            vx = vdup_n_u16(x);                // duplicate x into vx
            vconst16_16 = vmov_n_u16(16);      // set up constant in vconst16_16
            v16_x = vsub_u16(vconst16_16, vx); // v16_x = 16-x

            tmp = vmul_u16(vget_high_u16(tmp1), vx);        // tmp  = a01 * x
            tmp = vmla_u16(tmp, vget_high_u16(tmp2), vx);   // tmp += a11 * x
            tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x); // tmp += a00 * (16-x)
            tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x); // tmp += a10 * (16-x)

            if (scale < 256) {
                vscale = vdup_n_u16(scale);  // duplicate scale
                tmp = vshr_n_u16(tmp, 8);    // shift down result by 8
                tmp = vmul_u16(tmp, vscale); // multiply result by scale
            }

            vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16(0)), 8); // shift down result by 8
            vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);         // store result
        }
    #else
        static void filter_and_scale_by_alpha(unsigned x, unsigned y,
                                              SkPMColor a00, SkPMColor a01,
                                              SkPMColor a10, SkPMColor a11,
                                              SkPMColor* dstColor,
                                              unsigned alphaScale) {
            SkASSERT((unsigned)x <= 0xF);
            SkASSERT((unsigned)y <= 0xF);
            SkASSERT(alphaScale <= 256);

            int xy = x * y;
            const uint32_t mask = 0xFF00FF;

            int scale = 256 - 16*y - 16*x + xy;
            uint32_t lo = (a00 & mask) * scale;
            uint32_t hi = ((a00 >> 8) & mask) * scale;

            scale = 16*x - xy;
            lo += (a01 & mask) * scale;
            hi += ((a01 >> 8) & mask) * scale;

            scale = 16*y - xy;
            lo += (a10 & mask) * scale;
            hi += ((a10 >> 8) & mask) * scale;

            lo += (a11 & mask) * xy;
            hi += ((a11 >> 8) & mask) * xy;

            if (alphaScale < 256) {
                lo = ((lo >> 8) & mask) * alphaScale;
                hi = ((hi >> 8) & mask) * alphaScale;
            }

            *dstColor = ((lo >> 8) & mask) | (hi & ~mask);
        }
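        // A worked check of those weights (illustrative, not in the original):
        // they come out to (16-x)(16-y), x(16-y), y(16-x), and xy, which sum to
        // 256; for x=4, y=8 the four scales are 96, 32, 96, and 32.  And since
        // each byte times a total weight of 256 stays below 2^16, the masked
        // R and B (or G and A) products in lo and hi never collide, which is
        // what lets one 32-bit multiply filter two channels at once.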
    #endif


    /*not static*/ inline
    void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
                                 const uint32_t* xy, int count, SkPMColor* colors) {
        SkASSERT(count > 0 && colors != nullptr);
        SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
        SkASSERT(4 == s.fPixmap.info().bytesPerPixel());
        SkASSERT(s.fAlphaScale <= 256);

        int y0, y1, wy;
        decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

        auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
             row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );

        while (count --> 0) {
            int x0, x1, wx;
            decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);

            filter_and_scale_by_alpha(wx, wy,
                                      row0[x0], row0[x1],
                                      row1[x0], row1[x1],
                                      colors++,
                                      s.fAlphaScale);
        }
    }

#endif

}  // namespace SK_OPTS_NS

#endif