/*
 * Copyright 2015 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef SkBlitRow_opts_DEFINED
#define SkBlitRow_opts_DEFINED

#include "SkColorData.h"
#include "SkMSAN.h"

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
    #include <immintrin.h>

    static inline __m128i SkPMSrcOver_SSE2(const __m128i& src, const __m128i& dst) {
        auto SkAlphaMulQ_SSE2 = [](const __m128i& c, const __m128i& scale) {
            const __m128i mask = _mm_set1_epi32(0xFF00FF);
            __m128i s = _mm_or_si128(_mm_slli_epi32(scale, 16), scale);

            // uint32_t rb = ((c & mask) * scale) >> 8
            __m128i rb = _mm_and_si128(mask, c);
            rb = _mm_mullo_epi16(rb, s);
            rb = _mm_srli_epi16(rb, 8);

            // uint32_t ag = ((c >> 8) & mask) * scale
            __m128i ag = _mm_srli_epi16(c, 8);
            ag = _mm_mullo_epi16(ag, s);

            // (rb & mask) | (ag & ~mask)
            ag = _mm_andnot_si128(mask, ag);
            return _mm_or_si128(rb, ag);
        };
        return _mm_add_epi32(src,
                             SkAlphaMulQ_SSE2(dst, _mm_sub_epi32(_mm_set1_epi32(256),
                                                                 _mm_srli_epi32(src, 24))));
    }
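
    // For reference, a scalar sketch of the per-pixel math above.  It mirrors
    // the SkAlphaMulQ-style blend with scale = 256 - srcA; the name and the
    // function itself are illustrative only, not part of Skia's API.
    static inline uint32_t SkPMSrcOver_portable_sketch(uint32_t src, uint32_t dst) {
        uint32_t scale = 256 - (src >> 24);                   // 256-based inverse alpha
        uint32_t mask  = 0x00FF00FF;
        uint32_t rb = (((dst & mask) * scale) >> 8) & mask;   // red and blue channels
        uint32_t ag = (((dst >> 8) & mask) * scale) & ~mask;  // alpha and green channels
        return src + (rb | ag);
    }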
#endif

namespace SK_OPTS_NS {

#if defined(SK_ARM_HAS_NEON)

// Return a uint8x8_t value, r, computed as r[i] = SkMulDiv255Round(x[i], y[i]), where r[i], x[i],
// y[i] are the i-th lanes of the corresponding NEON vectors.
static inline uint8x8_t SkMulDiv255Round_neon8(uint8x8_t x, uint8x8_t y) {
    uint16x8_t prod = vmull_u8(x, y);
    return vraddhn_u16(prod, vrshrq_n_u16(prod, 8));
}
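
// Why the two rounding instructions above reproduce SkMulDiv255Round:
// vrshrq_n_u16(prod, 8) is a rounding shift, (prod + 128) >> 8, and
// vraddhn_u16(a, b) is a rounding narrowing add that keeps the high byte,
// (a + b + 128) >> 8.  Chained, they compute
//
//   (prod + ((prod + 128) >> 8) + 128) >> 8
//
// which, with tmp = prod + 128, is exactly the scalar (tmp + (tmp >> 8)) >> 8
// used by SkDiv255Round.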

// The implementations of SkPMSrcOver below perform alpha blending consistently with
// SkMulDiv255Round. They compute the color components (numbers in the interval [0, 255]) as:
//
//   result_i = src_i + rint(g(src_alpha, dst_i))
//
// where g(x, y) = ((255.0 - x) * y) / 255.0 and rint rounds to the nearest integer.
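//
// For example, with src_i = 100, src_alpha = 128, dst_i = 200:
//   g(128, 200) = (255 - 128) * 200 / 255 = 25400 / 255 = 99.6..., rint -> 100,
//   result_i = 100 + 100 = 200.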

// In this variant of SkPMSrcOver each NEON register, dst.val[i], src.val[i], contains the value
// of the same color component for 8 consecutive pixels. The result of this function follows the
// same convention.
static inline uint8x8x4_t SkPMSrcOver_neon8(uint8x8x4_t dst, uint8x8x4_t src) {
    uint8x8_t nalphas = vmvn_u8(src.val[3]);
    uint8x8x4_t result;
    result.val[0] = vadd_u8(src.val[0], SkMulDiv255Round_neon8(nalphas,  dst.val[0]));
    result.val[1] = vadd_u8(src.val[1], SkMulDiv255Round_neon8(nalphas,  dst.val[1]));
    result.val[2] = vadd_u8(src.val[2], SkMulDiv255Round_neon8(nalphas,  dst.val[2]));
    result.val[3] = vadd_u8(src.val[3], SkMulDiv255Round_neon8(nalphas,  dst.val[3]));
    return result;
}

// In this variant of SkPMSrcOver dst and src contain the color components of two consecutive
// pixels. The return value follows the same convention.
static inline uint8x8_t SkPMSrcOver_neon2(uint8x8_t dst, uint8x8_t src) {
    const uint8x8_t alpha_indices = vcreate_u8(0x0707070703030303);
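    // Bytes 3 and 7 of src hold the two pixels' alphas.  The table lookup
    // below uses indices {3,3,3,3,7,7,7,7} to broadcast each pixel's alpha
    // across all four of that pixel's byte lanes.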
    uint8x8_t nalphas = vmvn_u8(vtbl1_u8(src, alpha_indices));
    return vadd_u8(src, SkMulDiv255Round_neon8(nalphas, dst));
}

#endif

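// Blit one row: composite len premultiplied 32-bit src pixels over dst with
// SrcOver.  This entry point is specialized for full (0xFF) global alpha, as
// the assert below enforces.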
/*not static*/ inline
void blit_row_s32a_opaque(SkPMColor* dst, const SkPMColor* src, int len, U8CPU alpha) {
    SkASSERT(alpha == 0xFF);
    sk_msan_assert_initialized(src, src+len);

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
    while (len >= 16) {
        // Load 16 source pixels.
        auto s0 = _mm_loadu_si128((const __m128i*)(src) + 0),
             s1 = _mm_loadu_si128((const __m128i*)(src) + 1),
             s2 = _mm_loadu_si128((const __m128i*)(src) + 2),
             s3 = _mm_loadu_si128((const __m128i*)(src) + 3);

        const auto alphaMask = _mm_set1_epi32(0xFF000000);

        auto ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
        if (_mm_testz_si128(ORed, alphaMask)) {
            // All 16 source pixels are transparent.  Nothing to do.
            src += 16;
            dst += 16;
            len -= 16;
            continue;
        }

        auto d0 = (__m128i*)(dst) + 0,
             d1 = (__m128i*)(dst) + 1,
             d2 = (__m128i*)(dst) + 2,
             d3 = (__m128i*)(dst) + 3;

        auto ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
        if (_mm_testc_si128(ANDed, alphaMask)) {
            // All 16 source pixels are opaque.  SrcOver becomes Src.
            _mm_storeu_si128(d0, s0);
            _mm_storeu_si128(d1, s1);
            _mm_storeu_si128(d2, s2);
            _mm_storeu_si128(d3, s3);
            src += 16;
            dst += 16;
            len -= 16;
            continue;
        }

        // TODO: This math is wrong: SkPMSrcOver_SSE2 scales dst by (256 - srcA)
        // with a truncating shift, so it can differ by 1 from the rounded
        // SkMulDiv255Round blend used on the NEON paths.
        // Do SrcOver.
        _mm_storeu_si128(d0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(d0)));
        _mm_storeu_si128(d1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(d1)));
        _mm_storeu_si128(d2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(d2)));
        _mm_storeu_si128(d3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(d3)));
        src += 16;
        dst += 16;
        len -= 16;
    }

#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
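    // Same structure as the SSE4.1 loop above, but the PTEST-based checks
    // (_mm_testz_si128/_mm_testc_si128) are replaced with SSE2-compatible
    // compare + movemask sequences.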
    while (len >= 16) {
        // Load 16 source pixels.
        auto s0 = _mm_loadu_si128((const __m128i*)(src) + 0),
             s1 = _mm_loadu_si128((const __m128i*)(src) + 1),
             s2 = _mm_loadu_si128((const __m128i*)(src) + 2),
             s3 = _mm_loadu_si128((const __m128i*)(src) + 3);

        const auto alphaMask = _mm_set1_epi32(0xFF000000);

        auto ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
        if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_and_si128(ORed, alphaMask),
                                                       _mm_setzero_si128()))) {
            // All 16 source pixels are transparent.  Nothing to do.
            src += 16;
            dst += 16;
            len -= 16;
            continue;
        }

        auto d0 = (__m128i*)(dst) + 0,
             d1 = (__m128i*)(dst) + 1,
             d2 = (__m128i*)(dst) + 2,
             d3 = (__m128i*)(dst) + 3;

        auto ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
        if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_and_si128(ANDed, alphaMask),
                                                       alphaMask))) {
            // All 16 source pixels are opaque.  SrcOver becomes Src.
            _mm_storeu_si128(d0, s0);
            _mm_storeu_si128(d1, s1);
            _mm_storeu_si128(d2, s2);
            _mm_storeu_si128(d3, s3);
            src += 16;
            dst += 16;
            len -= 16;
            continue;
        }

        // TODO: This math is wrong (see the note in the SSE4.1 path above).
        // Do SrcOver.
        _mm_storeu_si128(d0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(d0)));
        _mm_storeu_si128(d1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(d1)));
        _mm_storeu_si128(d2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(d2)));
        _mm_storeu_si128(d3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(d3)));

        src += 16;
        dst += 16;
        len -= 16;
    }

#elif defined(SK_ARM_HAS_NEON)
    // Process 8 pixels at a time. A 16-pixel-at-a-time version of this code was also tested, but
    // it underperformed on some of the platforms under test for inputs with frequent transitions
    // of alpha (corresponding to changes of the conditions [~]alphas_u64 == 0 below). It may be
    // worth revisiting the situation in the future.
    while (len >= 8) {
        // Load 8 pixels in 4 NEON registers. src_col.val[i] will contain the same color component
        // for 8 consecutive pixels (e.g. src_col.val[3] will contain all alpha components of 8
        // pixels).
        uint8x8x4_t src_col = vld4_u8(reinterpret_cast<const uint8_t*>(src));
        src += 8;
        len -= 8;

        // We now detect 2 special cases: the first occurs when all alphas are zero (the 8 pixels
        // are all transparent), the second when all alphas are fully set (they are all opaque).
        uint8x8_t alphas = src_col.val[3];
        uint64_t alphas_u64 = vget_lane_u64(vreinterpret_u64_u8(alphas), 0);
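        // Moving the 8 alpha bytes into a single 64-bit scalar lets both tests
        // run as plain integer compares: all-transparent is alphas_u64 == 0 and
        // all-opaque is alphas_u64 == ~0 (written ~alphas_u64 == 0 below).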
        if (alphas_u64 == 0) {
            // All pixels transparent.
            dst += 8;
            continue;
        }

        if (~alphas_u64 == 0) {
            // All pixels opaque.
            vst4_u8(reinterpret_cast<uint8_t*>(dst), src_col);
            dst += 8;
            continue;
        }

        uint8x8x4_t dst_col = vld4_u8(reinterpret_cast<uint8_t*>(dst));
        vst4_u8(reinterpret_cast<uint8_t*>(dst), SkPMSrcOver_neon8(dst_col, src_col));
        dst += 8;
    }

    // Deal with leftover pixels.
    for (; len >= 2; len -= 2, src += 2, dst += 2) {
        uint8x8_t src2 = vld1_u8(reinterpret_cast<const uint8_t*>(src));
        uint8x8_t dst2 = vld1_u8(reinterpret_cast<const uint8_t*>(dst));
        vst1_u8(reinterpret_cast<uint8_t*>(dst), SkPMSrcOver_neon2(dst2, src2));
    }

    if (len != 0) {
        // One pixel left.  vcreate_u8 zero-fills the upper lanes, so only
        // lane 0 of the blended result is meaningful, and it is all we store.
        uint8x8_t result = SkPMSrcOver_neon2(vcreate_u8(*dst), vcreate_u8(*src));
        vst1_lane_u32(dst, vreinterpret_u32_u8(result), 0);
    }
    return;
#endif

    while (len-- > 0) {
        // This 0xFF000000 is not semantically necessary, but for compatibility
        // with chromium:611002 we need to keep it until we figure out where
        // the non-premultiplied src values (like 0x00FFFFFF) are coming from.
        // TODO(mtklein): sort this out and assert *src is premul here.
        if (*src & 0xFF000000) {
            *dst = (*src >= 0xFF000000) ? *src : SkPMSrcOver(*src, *dst);
        }
        src++;
        dst++;
    }
}
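
// Usage sketch (illustrative; in Skia this function is normally reached through
// the SkOpts dispatch tables rather than called directly):
//
//   SkPMColor* dstRow = ...;        // one row of premultiplied dst pixels
//   const SkPMColor* srcRow = ...;  // one row of premultiplied src pixels
//   SK_OPTS_NS::blit_row_s32a_opaque(dstRow, srcRow, rowWidthInPixels, 0xFF);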

}  // SK_OPTS_NS

#endif // SkBlitRow_opts_DEFINED