1 /*
2  * Copyright 2015 Google Inc.
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #ifndef SkBlurImageFilter_opts_DEFINED
9 #define SkBlurImageFilter_opts_DEFINED
10 
11 #include "SkColorPriv.h"
12 #include "SkTypes.h"
13 
14 namespace SK_OPTS_NS {
15 
// Axis along which a blur pass walks a buffer. In box_blur, kX advances one pixel per step along
// the blur axis, while kY advances by a whole stride (srcStride when reading, height when writing).
enum class BlurDirection { kX, kY };
17 
18 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
19 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
20 // ARGB -> 000A 000R 000G 000B
expand(SkPMColor p)21 static inline __m128i expand(SkPMColor p) {
22     return _mm_cvtepu8_epi32(_mm_cvtsi32_si128(p));
23 };
24 // Axxx Rxxx Gxxx Bxxx -> ARGB
repack(__m128i p)25 static inline SkPMColor repack(__m128i p) {
26     const char _ = ~0;  // Don't care what ends up in these bytes.  This zeros them.
27     p = _mm_shuffle_epi8(p, _mm_set_epi8(_,_,_,_, _,_,_,_, _,_,_,_, 15,11,7,3));
28     return _mm_cvtsi128_si32(p);
29 };
// On SSE4.1 and later the native 32-bit low multiply is available, so mullo_epi32 just aliases it.
#define mullo_epi32 _mm_mullo_epi32
31 
32 #else
// Widen each byte of a packed pixel into its own 32-bit lane: ARGB -> 000A 000R 000G 000B
static inline __m128i expand(int p) {
    const __m128i zero  = _mm_setzero_si128();
    const __m128i bytes = _mm_cvtsi32_si128(p);             // 0000 0000 0000 ARGB
    const __m128i words = _mm_unpacklo_epi8(bytes, zero);   // 0000 0000 0A0R 0G0B
    return _mm_unpacklo_epi16(words, zero);                 // 000A 000R 000G 000B
}
40 // Axxx Rxxx Gxxx Bxxx -> ARGB
repack(__m128i p)41 static inline SkPMColor repack(__m128i p) {
42     p = _mm_srli_epi32(p, 24);  // 000A 000R 000G 000B
43     p = _mm_packs_epi32(p, p);  // xxxx xxxx 0A0R 0G0B
44     p = _mm_packus_epi16(p, p); // xxxx xxxx xxxx ARGB
45     return _mm_cvtsi128_si32(p);
46 };
47 
// _mm_mullo_epi32 is not available before SSE4.1, so emulate it with two 32x32->64 bit multiplies.
static inline __m128i mullo_epi32(__m128i a, __m128i b) {
    // _mm_mul_epu32 only multiplies lanes 0 and 2, so shift lanes 1 and 3 down for a second pass.
    const __m128i evens = _mm_mul_epu32(a, b);
    const __m128i odds  = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4));
    // Move the low 32 bits of each 64-bit product into the two low lanes, then interleave them.
    const __m128i evensLo = _mm_shuffle_epi32(evens, _MM_SHUFFLE(0,0,2,0));
    const __m128i oddsLo  = _mm_shuffle_epi32(odds,  _MM_SHUFFLE(0,0,2,0));
    return _mm_unpacklo_epi32(evensLo, oddsLo);
}
56 #endif
// Building blocks used by box_blur on SSE2 and later. The running channel sums live in a single
// __m128i, one 32-bit lane per channel.
// Fixed-point reciprocal of the kernel size with 24 fractional bits.
#define INIT_SCALE const __m128i scale = _mm_set1_epi32((1 << 24) / kernelSize);
// One half in the same fixed-point format, added before the top byte is taken so results round.
#define INIT_HALF const __m128i half = _mm_set1_epi32(1 << 23);
#define INIT_SUMS __m128i sum = _mm_setzero_si128();
// Add / remove one pixel's channels to / from the running sums.
#define INCREMENT_SUMS(c) sum = _mm_add_epi32(sum, expand(c))
#define DECREMENT_SUMS(c) sum = _mm_sub_epi32(sum, expand(c))
// Writes sum * scale + half to *dptr; repack keeps only the top byte of each 32-bit lane.
#define STORE_SUMS \
    auto result = mullo_epi32(sum, scale); \
    result = _mm_add_epi32(result, half); \
    *dptr = repack(result);
// Expands to nothing: there is no two-rows-at-a-time path in this branch.
#define DOUBLE_ROW_OPTIMIZATION
67 
68 #elif defined(SK_ARM_HAS_NEON)
69 
// Scales the eight 16-bit channel sums (two pixels) and stores the two resulting pixels.
// For a kX destination the pixels go to dptr and dptr + width; otherwise they are written as
// eight contiguous bytes at dptr.
// The multiply is done on signed 16-bit lanes, so both operands must stay below 1 << 15:
// sums of at most 255 * 127 and a scale of (1 << 15) / kernelSize with kernelSize >= 2 do.
// val = (sum * scale * 2 + 0x8000) >> 16
#define STORE_SUMS_DOUBLE \
    uint16x8_t resultPixels = vreinterpretq_u16_s16(vqrdmulhq_s16( \
        vreinterpretq_s16_u16(sum), vreinterpretq_s16_u16(scale))); \
    if (dstDirection == BlurDirection::kX) { \
        uint32x2_t px2 = vreinterpret_u32_u8(vmovn_u16(resultPixels)); \
        vst1_lane_u32(dptr +     0, px2, 0); \
        vst1_lane_u32(dptr + width, px2, 1); \
    } else { \
        vst1_u8((uint8_t*)dptr, vmovn_u16(resultPixels)); \
    }

// Widen the eight bytes returned by load_2_pixels(p) to 16 bits and add them to / subtract them
// from the running sums. Both macros expect `sum` and `load_2_pixels` to be in scope.
#define INCREMENT_SUMS_DOUBLE(p) sum = vaddw_u8(sum, load_2_pixels(p))
#define DECREMENT_SUMS_DOUBLE(p) sum = vsubw_u8(sum, load_2_pixels(p))
84 
// Fast path for kernel sizes between 2 and 127, working on two rows at a time.
//
// Starting at srcBounds.top(), rows are consumed in pairs for as long as at least two remain
// before srcBounds.bottom(). The eight 16-bit channel sums for both rows share one vector.
// *src and *dst are advanced by two rows for every pair processed. The return value is the index
// of the first row that was not processed here, so the caller can finish whatever is left.
template<BlurDirection srcDirection, BlurDirection dstDirection>
int box_blur_double(const SkPMColor** src, int srcStride, const SkIRect& srcBounds, SkPMColor** dst, int kernelSize,
                     int leftOffset, int rightOffset, int width, int height) {
    // Load 2 pixels from adjacent rows.
    auto load_2_pixels = [&](const SkPMColor* s) {
        if (srcDirection == BlurDirection::kX) {
            // 10% faster by adding these 2 prefetches
            SK_PREFETCH(s + 16);
            SK_PREFETCH(s + 16 + srcStride);
            // The two rows are srcStride apart, so each pixel is loaded into its own lane.
            auto one = vld1_lane_u32(s +         0, vdup_n_u32(0), 0),
                 two = vld1_lane_u32(s + srcStride,           one, 1);
            return vreinterpret_u8_u32(two);
        } else {
            // For a kY source the two rows are adjacent in memory (srcStrideY is 1), so a single
            // 8-byte load fetches both pixels.
            return vld1_u8((uint8_t*)s);
        }
    };
    int left = srcBounds.left();
    int right = srcBounds.right();
    int top = srcBounds.top();
    int bottom = srcBounds.bottom();
    // Output columns at which source pixels start / stop being added to the sums (increment*)
    // and start / stop being subtracted from them (decrement*).
    int incrementStart = SkMax32(left - rightOffset - 1, left - right);
    int incrementEnd = SkMax32(right - rightOffset - 1, 0);
    int decrementStart = SkMin32(left + leftOffset, width);
    int decrementEnd = SkMin32(right + leftOffset, width);
    // "X" strides step along the blur axis, "Y" strides step from one line to the next.
    const int srcStrideX = srcDirection == BlurDirection::kX ? 1 : srcStride;
    const int dstStrideX = dstDirection == BlurDirection::kX ? 1 : height;
    const int srcStrideY = srcDirection == BlurDirection::kX ? srcStride : 1;
    const int dstStrideY = dstDirection == BlurDirection::kX ? width : 1;
    const uint16x8_t scale = vdupq_n_u16((1 << 15) / kernelSize);

    for (; bottom - top >= 2; top += 2) {
        uint16x8_t sum = vdupq_n_u16(0);
        const SkPMColor* lptr = *src;
        const SkPMColor* rptr = *src;
        SkPMColor* dptr = *dst;
        int x;
        // When incrementStart is negative, accumulate the pixels added before column 0.
        for (x = incrementStart; x < 0; ++x) {
            INCREMENT_SUMS_DOUBLE(rptr);
            rptr += srcStrideX;
        }
        // Clear to zero when sampling to the left of our domain. "sum" is zero here because we
        // initialized it above, and the preceding loop has no effect in this case.
        for (x = 0; x < incrementStart; ++x) {
            STORE_SUMS_DOUBLE
            dptr += dstStrideX;
        }
        // Pixels are only being added.
        for (; x < decrementStart && x < incrementEnd; ++x) {
            STORE_SUMS_DOUBLE
            dptr += dstStrideX;
            INCREMENT_SUMS_DOUBLE(rptr);
            rptr += srcStrideX;
        }
        // One pixel is added and one is removed per step.
        for (x = decrementStart; x < incrementEnd; ++x) {
            STORE_SUMS_DOUBLE
            dptr += dstStrideX;
            INCREMENT_SUMS_DOUBLE(rptr);
            rptr += srcStrideX;
            DECREMENT_SUMS_DOUBLE(lptr);
            lptr += srcStrideX;
        }
        // Nothing is added or removed, so the sums stay constant.
        for (x = incrementEnd; x < decrementStart; ++x) {
            STORE_SUMS_DOUBLE
            dptr += dstStrideX;
        }
        // Pixels are only being removed.
        for (; x < decrementEnd; ++x) {
            STORE_SUMS_DOUBLE
            dptr += dstStrideX;
            DECREMENT_SUMS_DOUBLE(lptr);
            lptr += srcStrideX;
        }
        // Clear to zero when sampling to the right of our domain. "sum" is zero here because we
        // added on then subtracted off all of the pixels, leaving zero.
        for (; x < width; ++x) {
            STORE_SUMS_DOUBLE
            dptr += dstStrideX;
        }
        *src += srcStrideY * 2;
        *dst += dstStrideY * 2;
    }
    return top;
}
167 
// Widen each byte of a packed pixel to 16 bits: ARGB -> 0A0R 0G0B
static inline uint16x4_t expand(SkPMColor p) {
    const uint8x8_t bytes = vreinterpret_u8_u32(vdup_n_u32(p));  // ARGB ARGB
    const uint16x8_t wide = vmovl_u8(bytes);                     // 0A0R 0G0B 0A0R 0G0B
    return vget_low_u16(wide);                                   // 0A0R 0G0B
}
172 
// Building blocks used by box_blur on NEON. The running channel sums live in one uint32x4_t,
// one 32-bit lane per channel.
// Fixed-point reciprocal of the kernel size with 24 fractional bits.
#define INIT_SCALE const uint32x4_t scale = vdupq_n_u32((1 << 24) / kernelSize);
// One half in the same fixed-point format, added before narrowing so results round.
#define INIT_HALF const uint32x4_t half = vdupq_n_u32(1 << 23);
#define INIT_SUMS uint32x4_t sum = vdupq_n_u32(0);
// Add / remove one pixel's channels to / from the running sums. Like the SSE and scalar
// variants these carry no trailing semicolon; every call site supplies its own.
#define INCREMENT_SUMS(c) sum = vaddw_u16(sum, expand(c))
#define DECREMENT_SUMS(c) sum = vsubw_u16(sum, expand(c))

// Computes half + sum * scale, then narrows by 16 and by 8 bits (24 in total) so that only the
// top byte of each lane is left, and stores the four bytes as one pixel at dptr.
#define STORE_SUMS \
    uint32x4_t result = vmlaq_u32(half, sum, scale); \
    uint16x4_t result16 = vqshrn_n_u32(result, 16); \
    uint8x8_t result8 = vqshrn_n_u16(vcombine_u16(result16, result16), 8); \
    vst1_lane_u32(dptr, vreinterpret_u32_u8(result8), 0);

// Hands pairs of rows to box_blur_double when the kernel size is in its supported range.
// box_blur_double advances src and dst itself; top becomes the first row it left unprocessed.
#define DOUBLE_ROW_OPTIMIZATION \
    if (1 < kernelSize && kernelSize < 128) { \
        top = box_blur_double<srcDirection, dstDirection>(&src, srcStride, srcBounds, &dst, \
                                                          kernelSize, leftOffset, rightOffset, \
                                                          width, height); \
    }
191 
192 #else  // Neither NEON nor >=SSE2.
193 
// Portable building blocks used by box_blur when neither SSE2 nor NEON is available. Each
// channel has its own integer running sum.
// Fixed-point reciprocal of the kernel size with 24 fractional bits.
#define INIT_SCALE uint32_t scale = (1 << 24) / kernelSize;
// One half in the same fixed-point format, added before the >> 24 so results round.
#define INIT_HALF  uint32_t half = 1 << 23;
#define INIT_SUMS int sumA = 0, sumR = 0, sumG = 0, sumB = 0;
// Add one pixel's channels to the running sums. Expands to several statements, so it must be
// used inside a braced block.
#define INCREMENT_SUMS(c) \
    sumA += SkGetPackedA32(c); \
    sumR += SkGetPackedR32(c); \
    sumG += SkGetPackedG32(c); \
    sumB += SkGetPackedB32(c)
// Remove one pixel's channels from the running sums. Same multi-statement caveat as above.
#define DECREMENT_SUMS(c) \
    sumA -= SkGetPackedA32(c); \
    sumR -= SkGetPackedR32(c); \
    sumG -= SkGetPackedG32(c); \
    sumB -= SkGetPackedB32(c)
// Writes the rounded per-channel averages (sum * scale + half) >> 24 to *dptr as a packed pixel.
#define STORE_SUMS \
    *dptr = SkPackARGB32((sumA * scale + half) >> 24, \
                         (sumR * scale + half) >> 24, \
                         (sumG * scale + half) >> 24, \
                         (sumB * scale + half) >> 24);
// Expands to nothing: there is no two-rows-at-a-time path in this branch.
#define DOUBLE_ROW_OPTIMIZATION
213 
214 #endif
215 
// Prefetches the pixel at rptr, but only for a kY source, where consecutive reads are srcStride
// pixels apart rather than adjacent.
#define PREFETCH_RPTR \
    if (srcDirection == BlurDirection::kY) { \
        SK_PREFETCH(rptr); \
    }
220 
221 template<BlurDirection srcDirection, BlurDirection dstDirection>
box_blur(const SkPMColor * src,int srcStride,const SkIRect & srcBounds,SkPMColor * dst,int kernelSize,int leftOffset,int rightOffset,int width,int height)222 static void box_blur(const SkPMColor* src, int srcStride, const SkIRect& srcBounds, SkPMColor* dst,
223                      int kernelSize, int leftOffset, int rightOffset, int width, int height) {
224     int left = srcBounds.left();
225     int right = srcBounds.right();
226     int top = srcBounds.top();
227     int bottom = srcBounds.bottom();
228     int incrementStart = SkMax32(left - rightOffset - 1, left - right);
229     int incrementEnd = SkMax32(right - rightOffset - 1, 0);
230     int decrementStart = SkMin32(left + leftOffset, width);
231     int decrementEnd = SkMin32(right + leftOffset, width);
232     int srcStrideX = srcDirection == BlurDirection::kX ? 1 : srcStride;
233     int dstStrideX = dstDirection == BlurDirection::kX ? 1 : height;
234     int srcStrideY = srcDirection == BlurDirection::kX ? srcStride : 1;
235     int dstStrideY = dstDirection == BlurDirection::kX ? width : 1;
236     INIT_SCALE
237     INIT_HALF
238 
239     // Clear to zero when sampling above our domain.
240     for (int y = 0; y < top; y++) {
241         SkColor* dptr = dst;
242         for (int x = 0; x < width; ++x) {
243             *dptr = 0;
244             dptr += dstStrideX;
245         }
246         dst += dstStrideY;
247     }
248 
249     DOUBLE_ROW_OPTIMIZATION
250 
251     for (int y = top; y < bottom; ++y) {
252         INIT_SUMS
253         const SkPMColor* lptr = src;
254         const SkPMColor* rptr = src;
255         SkColor* dptr = dst;
256         int x;
257         for (x = incrementStart; x < 0; ++x) {
258             INCREMENT_SUMS(*rptr);
259             rptr += srcStrideX;
260             PREFETCH_RPTR
261         }
262         // Clear to zero when sampling to the left of our domain.
263         for (x = 0; x < incrementStart; ++x) {
264             *dptr = 0;
265             dptr += dstStrideX;
266         }
267         for (; x < decrementStart && x < incrementEnd; ++x) {
268             STORE_SUMS
269             dptr += dstStrideX;
270             INCREMENT_SUMS(*rptr);
271             rptr += srcStrideX;
272             PREFETCH_RPTR
273         }
274         for (x = decrementStart; x < incrementEnd; ++x) {
275             STORE_SUMS
276             dptr += dstStrideX;
277             INCREMENT_SUMS(*rptr);
278             rptr += srcStrideX;
279             PREFETCH_RPTR
280             DECREMENT_SUMS(*lptr);
281             lptr += srcStrideX;
282         }
283         for (x = incrementEnd; x < decrementStart; ++x) {
284             STORE_SUMS
285             dptr += dstStrideX;
286         }
287         for (; x < decrementEnd; ++x) {
288             STORE_SUMS
289             dptr += dstStrideX;
290             DECREMENT_SUMS(*lptr);
291             lptr += srcStrideX;
292         }
293         // Clear to zero when sampling to the right of our domain.
294         for (; x < width; ++x) {
295             *dptr = 0;
296             dptr += dstStrideX;
297         }
298         src += srcStrideY;
299         dst += dstStrideY;
300     }
301     // Clear to zero when sampling below our domain.
302     for (int y = bottom; y < height; ++y) {
303         SkColor* dptr = dst;
304         for (int x = 0; x < width; ++x) {
305             *dptr = 0;
306             dptr += dstStrideX;
307         }
308         dst += dstStrideY;
309     }
310 }
311 
312 static auto box_blur_xx = &box_blur<BlurDirection::kX, BlurDirection::kX>,
313             box_blur_xy = &box_blur<BlurDirection::kX, BlurDirection::kY>,
314             box_blur_yx = &box_blur<BlurDirection::kY, BlurDirection::kX>;
315 
316 }  // namespace SK_OPTS_NS
317 
318 #endif
319