/*
 * Copyright 2015 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef SkBlitRow_opts_DEFINED
#define SkBlitRow_opts_DEFINED

#include "Sk4px.h"
#include "SkColorData.h"
#include "SkMSAN.h"

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
    #include "SkColor_opts_SSE2.h"
    #include <immintrin.h>
#endif

namespace SK_OPTS_NS {

// Color32 uses the blend_256_round_alt algorithm from tests/BlendTest.cpp.
// It's not quite perfect, but it's never wrong in the interesting edge cases,
// and it's quite a bit faster than blend_perfect.
//
// blend_256_round_alt is our currently blessed algorithm.  Please use it or an analogous one.
static inline
void blit_row_color32(SkPMColor* dst, const SkPMColor* src, int count, SkPMColor color) {
    unsigned invA = 255 - SkGetPackedA32(color);
    invA += invA >> 7;
    SkASSERT(invA < 256);  // We should have already handled alpha == 0 externally.

    Sk16h colorHighAndRound = Sk4px::DupPMColor(color).widenHi() + Sk16h(128);
    Sk16b invA_16x(invA);

    Sk4px::MapSrc(count, dst, src, [&](const Sk4px& src4) -> Sk4px {
        return (src4 * invA_16x).addNarrowHi(colorHighAndRound);
    });
}
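
// For reference only: a scalar sketch of the per-channel math above. This helper (the name is
// illustrative, not part of Skia's API) is not used by the SIMD path. Given the biased invA from
// blit_row_color32, each output channel is (src_c * invA + (color_c << 8) + 128) >> 8, which is
// what (src4 * invA_16x).addNarrowHi(colorHighAndRound) computes 16 channels at a time.
static inline uint8_t blit_row_color32_channel_sketch(uint8_t src_c, uint8_t color_c,
                                                      unsigned invA) {
    return uint8_t((unsigned(src_c) * invA + (unsigned(color_c) << 8) + 128) >> 8);
}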

#if defined(SK_ARM_HAS_NEON)

// Return a uint8x8_t value, r, computed as r[i] = SkMulDiv255Round(x[i], y[i]), where r[i], x[i],
// y[i] are the i-th lanes of the corresponding NEON vectors.
static inline uint8x8_t SkMulDiv255Round_neon8(uint8x8_t x, uint8x8_t y) {
    uint16x8_t prod = vmull_u8(x, y);
    return vraddhn_u16(prod, vrshrq_n_u16(prod, 8));
}
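
// In scalar terms each lane computes round(x*y / 255): vrshrq_n_u16(prod, 8) is (prod + 128) >> 8,
// and vraddhn_u16 adds that back into prod and takes the rounded high byte, i.e.
// (prod + ((prod + 128) >> 8) + 128) >> 8.  Worked example: x = 200, y = 100 gives prod = 20000
// and a result of 78 == round(20000 / 255.0).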

// The implementations of SkPMSrcOver below perform alpha blending consistently with
// SkMulDiv255Round. They compute the color components (numbers in the interval [0, 255]) as:
//
//   result_i = src_i + rint(g(src_alpha, dst_i))
//
// where g(x, y) = ((255.0 - x) * y) / 255.0 and rint rounds to the nearest integer.

// In this variant of SkPMSrcOver each NEON register, dst.val[i], src.val[i], contains the value
// of the same color component for 8 consecutive pixels. The result of this function follows the
// same convention.
static inline uint8x8x4_t SkPMSrcOver_neon8(uint8x8x4_t dst, uint8x8x4_t src) {
    uint8x8_t nalphas = vmvn_u8(src.val[3]);
    uint8x8x4_t result;
    result.val[0] = vadd_u8(src.val[0], SkMulDiv255Round_neon8(nalphas,  dst.val[0]));
    result.val[1] = vadd_u8(src.val[1], SkMulDiv255Round_neon8(nalphas,  dst.val[1]));
    result.val[2] = vadd_u8(src.val[2], SkMulDiv255Round_neon8(nalphas,  dst.val[2]));
    result.val[3] = vadd_u8(src.val[3], SkMulDiv255Round_neon8(nalphas,  dst.val[3]));
    return result;
}
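
// Worked example of the formula above for a single lane: with src_alpha = 64, src_red = 32 and
// dst_red = 200, result_red = 32 + rint((255 - 64) * 200 / 255.0) = 32 + 150 = 182.  vmvn_u8
// supplies the (255 - src_alpha) term for all 8 lanes at once.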

// In this variant of SkPMSrcOver dst and src contain the color components of two consecutive
// pixels. The return value follows the same convention.
static inline uint8x8_t SkPMSrcOver_neon2(uint8x8_t dst, uint8x8_t src) {
    // The table-lookup indices 3,3,3,3,7,7,7,7 broadcast the byte holding each pixel's alpha
    // across that pixel's four lanes.
    const uint8x8_t alpha_indices = vcreate_u8(0x0707070703030303);
    uint8x8_t nalphas = vmvn_u8(vtbl1_u8(src, alpha_indices));
    return vadd_u8(src, SkMulDiv255Round_neon8(nalphas, dst));
}

#endif

/*not static*/ inline
void blit_row_s32a_opaque(SkPMColor* dst, const SkPMColor* src, int len, U8CPU alpha) {
    SkASSERT(alpha == 0xFF);
    sk_msan_assert_initialized(src, src+len);

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
    while (len >= 16) {
        // Load 16 source pixels.
        auto s0 = _mm_loadu_si128((const __m128i*)(src) + 0),
             s1 = _mm_loadu_si128((const __m128i*)(src) + 1),
             s2 = _mm_loadu_si128((const __m128i*)(src) + 2),
             s3 = _mm_loadu_si128((const __m128i*)(src) + 3);

        const auto alphaMask = _mm_set1_epi32(0xFF000000);

        auto ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
        // _mm_testz_si128(a, mask) is true when (a & mask) == 0, so this checks that every
        // source alpha byte in the group is zero.
        if (_mm_testz_si128(ORed, alphaMask)) {
            // All 16 source pixels are transparent.  Nothing to do.
            src += 16;
            dst += 16;
            len -= 16;
            continue;
        }

        auto d0 = (__m128i*)(dst) + 0,
             d1 = (__m128i*)(dst) + 1,
             d2 = (__m128i*)(dst) + 2,
             d3 = (__m128i*)(dst) + 3;

        auto ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
        // _mm_testc_si128(a, mask) is true when (~a & mask) == 0, so this checks that every
        // source alpha byte in the group is 0xFF.
        if (_mm_testc_si128(ANDed, alphaMask)) {
            // All 16 source pixels are opaque.  SrcOver becomes Src.
            _mm_storeu_si128(d0, s0);
            _mm_storeu_si128(d1, s1);
            _mm_storeu_si128(d2, s2);
            _mm_storeu_si128(d3, s3);
            src += 16;
            dst += 16;
            len -= 16;
            continue;
        }

        // TODO: This math is wrong.
        // Do SrcOver.
        _mm_storeu_si128(d0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(d0)));
        _mm_storeu_si128(d1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(d1)));
        _mm_storeu_si128(d2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(d2)));
        _mm_storeu_si128(d3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(d3)));
        src += 16;
        dst += 16;
        len -= 16;
    }

#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
    while (len >= 16) {
        // Load 16 source pixels.
        auto s0 = _mm_loadu_si128((const __m128i*)(src) + 0),
             s1 = _mm_loadu_si128((const __m128i*)(src) + 1),
             s2 = _mm_loadu_si128((const __m128i*)(src) + 2),
             s3 = _mm_loadu_si128((const __m128i*)(src) + 3);

        const auto alphaMask = _mm_set1_epi32(0xFF000000);

        auto ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
        // Without SSE4.1's ptest, compare the ORed alpha bytes against zero and require all
        // 16 byte-compares to succeed.
        if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_and_si128(ORed, alphaMask),
                                                       _mm_setzero_si128()))) {
            // All 16 source pixels are transparent.  Nothing to do.
            src += 16;
            dst += 16;
            len -= 16;
            continue;
        }

        auto d0 = (__m128i*)(dst) + 0,
             d1 = (__m128i*)(dst) + 1,
             d2 = (__m128i*)(dst) + 2,
             d3 = (__m128i*)(dst) + 3;

        auto ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
        // Likewise, check that every source alpha byte in the group is 0xFF.
        if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_and_si128(ANDed, alphaMask),
                                                       alphaMask))) {
            // All 16 source pixels are opaque.  SrcOver becomes Src.
            _mm_storeu_si128(d0, s0);
            _mm_storeu_si128(d1, s1);
            _mm_storeu_si128(d2, s2);
            _mm_storeu_si128(d3, s3);
            src += 16;
            dst += 16;
            len -= 16;
            continue;
        }

        // TODO: This math is wrong.
        // Do SrcOver.
        _mm_storeu_si128(d0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(d0)));
        _mm_storeu_si128(d1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(d1)));
        _mm_storeu_si128(d2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(d2)));
        _mm_storeu_si128(d3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(d3)));

        src += 16;
        dst += 16;
        len -= 16;
    }

#elif defined(SK_ARM_HAS_NEON)
    // Do 8 pixels at a time. A version that handled 16 pixels at a time was also tested, but it
    // underperformed on some of the platforms under test for inputs with frequent transitions of
    // alpha (corresponding to changes of the conditions [~]alphas_u64 == 0 below). It may be
    // worth revisiting the situation in the future.
    while (len >= 8) {
        // Load 8 pixels in 4 NEON registers. src_col.val[i] will contain the same color component
        // for 8 consecutive pixels (e.g. src_col.val[3] will contain all alpha components of 8
        // pixels).
        uint8x8x4_t src_col = vld4_u8(reinterpret_cast<const uint8_t*>(src));
        src += 8;
        len -= 8;

        // We now detect 2 special cases: the first occurs when all alphas are zero (the 8 pixels
        // are all transparent), the second when all alphas are fully set (they are all opaque).
        uint8x8_t alphas = src_col.val[3];
        uint64_t alphas_u64 = vget_lane_u64(vreinterpret_u64_u8(alphas), 0);
        if (alphas_u64 == 0) {
            // All pixels transparent.
            dst += 8;
            continue;
        }

        if (~alphas_u64 == 0) {
            // All pixels opaque.
            vst4_u8(reinterpret_cast<uint8_t*>(dst), src_col);
            dst += 8;
            continue;
        }

        uint8x8x4_t dst_col = vld4_u8(reinterpret_cast<uint8_t*>(dst));
        vst4_u8(reinterpret_cast<uint8_t*>(dst), SkPMSrcOver_neon8(dst_col, src_col));
        dst += 8;
    }

    // Deal with leftover pixels.
    for (; len >= 2; len -= 2, src += 2, dst += 2) {
        uint8x8_t src2 = vld1_u8(reinterpret_cast<const uint8_t*>(src));
        uint8x8_t dst2 = vld1_u8(reinterpret_cast<const uint8_t*>(dst));
        vst1_u8(reinterpret_cast<uint8_t*>(dst), SkPMSrcOver_neon2(dst2, src2));
    }

    if (len != 0) {
        uint8x8_t result = SkPMSrcOver_neon2(vcreate_u8(*dst), vcreate_u8(*src));
        vst1_lane_u32(dst, vreinterpret_u32_u8(result), 0);
    }
    return;
#endif

    while (len-- > 0) {
        // This 0xFF000000 is not semantically necessary, but for compatibility
        // with chromium:611002 we need to keep it until we figure out where
        // the non-premultiplied src values (like 0x00FFFFFF) are coming from.
        // TODO(mtklein): sort this out and assert *src is premul here.
        if (*src & 0xFF000000) {
            *dst = (*src >= 0xFF000000) ? *src : SkPMSrcOver(*src, *dst);
        }
        src++;
        dst++;
    }
}
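
// Example usage (a sketch; real callers typically reach this routine through Skia's runtime
// CPU-dispatch machinery rather than by calling it directly):
//   SkPMColor dst[64], src[64];
//   ... fill both with premultiplied colors ...
//   SK_OPTS_NS::blit_row_s32a_opaque(dst, src, 64, 0xFF);   // alpha must be 0xFF here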

}  // SK_OPTS_NS

#endif  // SkBlitRow_opts_DEFINED