1 /*
2  * Copyright 2012 Google Inc.
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "SkBitmapProcState.h"
9 #include "SkBitmapProcState_filter.h"
10 #include "SkColorPriv.h"
11 #include "SkFilterProc.h"
12 #include "SkPaint.h"
13 #include "SkShader.h"   // for tilemodes
14 #include "SkUtilsArm.h"
15 
16 // Required to ensure the table is part of the final binary.
17 extern const SkBitmapProcState::SampleProc32 gSkBitmapProcStateSample32_neon[];
18 
19 #define   NAME_WRAP(x)  x ## _neon
20 #include "SkBitmapProcState_filter_neon.h"
21 #include "SkBitmapProcState_procs.h"
22 
// NEON sampler lookup table. Entry order is significant: the chooser in
// SkBitmapProcState indexes this table directly, so do not reorder.
// Within each source-format group of 8 the order is
// {nofilter, filter} x {DXDY, DX} x {opaque, alpha}.
const SkBitmapProcState::SampleProc32 gSkBitmapProcStateSample32_neon[] = {
    // 8888 (S32) source.
    S32_opaque_D32_nofilter_DXDY_neon,
    S32_alpha_D32_nofilter_DXDY_neon,
    S32_opaque_D32_nofilter_DX_neon,
    S32_alpha_D32_nofilter_DX_neon,
    S32_opaque_D32_filter_DXDY_neon,
    S32_alpha_D32_filter_DXDY_neon,
    S32_opaque_D32_filter_DX_neon,
    S32_alpha_D32_filter_DX_neon,

    // 565 (S16) source.
    S16_opaque_D32_nofilter_DXDY_neon,
    S16_alpha_D32_nofilter_DXDY_neon,
    S16_opaque_D32_nofilter_DX_neon,
    S16_alpha_D32_nofilter_DX_neon,
    S16_opaque_D32_filter_DXDY_neon,
    S16_alpha_D32_filter_DXDY_neon,
    S16_opaque_D32_filter_DX_neon,
    S16_alpha_D32_filter_DX_neon,

    // Indexed-8 (SI8) source.
    SI8_opaque_D32_nofilter_DXDY_neon,
    SI8_alpha_D32_nofilter_DXDY_neon,
    SI8_opaque_D32_nofilter_DX_neon,
    SI8_alpha_D32_nofilter_DX_neon,
    SI8_opaque_D32_filter_DXDY_neon,
    SI8_alpha_D32_filter_DXDY_neon,
    SI8_opaque_D32_filter_DX_neon,
    SI8_alpha_D32_filter_DX_neon,

    // 4444 source.
    S4444_opaque_D32_nofilter_DXDY_neon,
    S4444_alpha_D32_nofilter_DXDY_neon,
    S4444_opaque_D32_nofilter_DX_neon,
    S4444_alpha_D32_nofilter_DX_neon,
    S4444_opaque_D32_filter_DXDY_neon,
    S4444_alpha_D32_filter_DXDY_neon,
    S4444_opaque_D32_filter_DX_neon,
    S4444_alpha_D32_filter_DX_neon,

    // A8 treats alpha/opaque the same (equally efficient), so the alpha
    // variant fills both slots of each pair.
    SA8_alpha_D32_nofilter_DXDY_neon,
    SA8_alpha_D32_nofilter_DXDY_neon,
    SA8_alpha_D32_nofilter_DX_neon,
    SA8_alpha_D32_nofilter_DX_neon,
    SA8_alpha_D32_filter_DXDY_neon,
    SA8_alpha_D32_filter_DXDY_neon,
    SA8_alpha_D32_filter_DX_neon,
    SA8_alpha_D32_filter_DX_neon,

    // todo: possibly specialize on opaqueness
    SG8_alpha_D32_nofilter_DXDY_neon,
    SG8_alpha_D32_nofilter_DXDY_neon,
    SG8_alpha_D32_nofilter_DX_neon,
    SG8_alpha_D32_nofilter_DX_neon,
    SG8_alpha_D32_filter_DXDY_neon,
    SG8_alpha_D32_filter_DXDY_neon,
    SG8_alpha_D32_filter_DX_neon,
    SG8_alpha_D32_filter_DX_neon,
};
80 
81 ///////////////////////////////////////////////////////////////////////////////
82 
83 #include <arm_neon.h>
84 #include "SkConvolver.h"
85 
// Convolves horizontally along a single row. The row data is given in
// |srcData| and continues for the numValues() of the filter. Writes one
// RGBA pixel (4 bytes) per output value to |outRow|.
// |hasAlpha| is accepted for signature parity with the platform proc table
// but is unused here: alpha is convolved exactly like the color channels.
void convolveHorizontally_neon(const unsigned char* srcData,
                               const SkConvolutionFilter1D& filter,
                               unsigned char* outRow,
                               bool hasAlpha) {
    // vtbl1_u8 index vectors that splat the i-th 16-bit coefficient of a
    // loaded coefficient vector across all four lanes. These are
    // loop-invariant, so hoist them out of the per-pixel loop.
    const uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100);
    const uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302);
    const uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504);
    const uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706);

    // Loop over each pixel on this row in the output image.
    int numValues = filter.numValues();
    for (int outX = 0; outX < numValues; outX++) {
        // Get the filter that determines the current output pixel.
        int filterOffset, filterLength;
        const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
            filter.FilterForValue(outX, &filterOffset, &filterLength);

        // Compute the first pixel in this row that the filter affects. It will
        // touch |filterLength| pixels (4 bytes each) after this.
        const unsigned char* rowToFilter = &srcData[filterOffset * 4];

        // Apply the filter to the row to get the destination pixel in |accum|.
        int32x4_t accum = vdupq_n_s32(0);
        for (int filterX = 0; filterX < filterLength >> 2; filterX++) {
            // Load 4 coefficients and splat each one across a 4-lane vector.
            int16x4_t coeffs, coeff0, coeff1, coeff2, coeff3;
            coeffs = vld1_s16(filterValues);
            coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask0));
            coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask1));
            coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask2));
            coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask3));

            // Load 4 pixels (16 bytes) and widen to 16 bits per channel.
            uint8x16_t pixels = vld1q_u8(rowToFilter);
            int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));
            int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));

            // Multiply-accumulate each pixel against its coefficient.
            // vmlal_s16 fuses the vmull + add pair and avoids the
            // non-portable GCC vector `+=` extension; the accumulated
            // values are identical.
            accum = vmlal_s16(accum, vget_low_s16(p01_16), coeff0);
            accum = vmlal_s16(accum, vget_high_s16(p01_16), coeff1);
            accum = vmlal_s16(accum, vget_low_s16(p23_16), coeff2);
            accum = vmlal_s16(accum, vget_high_s16(p23_16), coeff3);

            // Advance the pointers
            rowToFilter += 16;
            filterValues += 4;
        }

        // Handle the 1-3 remaining coefficients. The extra lanes of the
        // over-loaded coefficient vector are zeroed with |mask| so they
        // cannot contribute to |accum|.
        int r = filterLength & 3;
        if (r) {
            // mask[0] is unused: r is in [1, 3] here.
            static const uint16_t mask[4][4] = {
                {0, 0, 0, 0},
                {0xFFFF, 0, 0, 0},
                {0xFFFF, 0xFFFF, 0, 0},
                {0xFFFF, 0xFFFF, 0xFFFF, 0}
            };
            uint16x4_t coeffs;
            int16x4_t coeff0, coeff1, coeff2;
            coeffs = vld1_u16(reinterpret_cast<const uint16_t*>(filterValues));
            // vand_u16 replaces the non-portable GCC vector `&=` extension.
            coeffs = vand_u16(coeffs, vld1_u16(&mask[r][0]));
            coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), coeff_mask0));
            coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), coeff_mask1));
            coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), coeff_mask2));

            // Load pixels and multiply-accumulate. Reading a full 16 bytes
            // may read up to 3 pixels past the row; fExtraHorizontalReads
            // (set to 3 in platformConvolutionProcs_arm_neon) tells callers
            // to pad for this.
            uint8x16_t pixels = vld1q_u8(rowToFilter);
            int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));
            int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));
            accum = vmlal_s16(accum, vget_low_s16(p01_16), coeff0);
            accum = vmlal_s16(accum, vget_high_s16(p01_16), coeff1);
            accum = vmlal_s16(accum, vget_low_s16(p23_16), coeff2);
        }

        // Bring this value back in range. All of the filter scaling factors
        // are in fixed point with kShiftBits bits of fractional part.
        accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits);

        // Pack to 8 bits per channel with saturation and store the new pixel.
        int16x4_t accum16 = vqmovn_s32(accum);
        uint8x8_t accum8 = vqmovun_s16(vcombine_s16(accum16, accum16));
        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpret_u32_u8(accum8), 0);
        outRow += 4;
    }
}
183 
// Clamps the alpha byte of each 32-bit pixel in |accum8| so that it is at
// least the maximum of that pixel's three color channels (the original
// inline comment: "make sure the value of alpha channel is always larger
// than maximum value of color channels").
static inline uint8x16_t clampAlphaToColors_neon(uint8x16_t accum8) {
    // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
    uint8x16_t shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 8));
    // Per-byte max; the low byte of each pixel now holds max(r, g).
    uint8x16_t maxRGB = vmaxq_u8(shifted, accum8);
    // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
    shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 16));
    // Low byte of each pixel now holds max(r, g, b).
    maxRGB = vmaxq_u8(shifted, maxRGB);
    // Move the max into the alpha byte:
    // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
    maxRGB = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(maxRGB), 24));
    // Alpha becomes max(alpha, max(r, g, b)); other bytes are unchanged.
    return vmaxq_u8(maxRGB, accum8);
}

// Does vertical convolution to produce one output row. The filter values and
// length are given in the first two parameters. These are applied to each
// of the rows pointed to in the |sourceDataRows| array, with each row
// being |pixelWidth| wide.
//
// The output must have room for |pixelWidth * 4| bytes.
template<bool hasAlpha>
void convolveVertically_neon(const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
                             int filterLength,
                             unsigned char* const* sourceDataRows,
                             int pixelWidth,
                             unsigned char* outRow) {
    // Widest multiple of 4 pixels we can process with full 16-byte stores.
    int width = pixelWidth & ~3;

    int32x4_t accum0, accum1, accum2, accum3;
    int16x4_t coeff16;

    // Output four pixels per iteration (16 bytes).
    for (int outX = 0; outX < width; outX += 4) {

        // Accumulated result for each pixel. 32 bits per RGBA channel.
        accum0 = accum1 = accum2 = accum3 = vdupq_n_s32(0);

        // Convolve with one filter coefficient per iteration.
        for (int filterY = 0; filterY < filterLength; filterY++) {

            // Duplicate the filter coefficient 4 times.
            // [16] cj cj cj cj
            coeff16 = vdup_n_s16(filterValues[filterY]);

            // Load four pixels (16 bytes) together.
            // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
            uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][outX << 2]);

            // Widen to 16 bits per channel.
            int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src8)));
            int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src8)));

            // vmlal_s16 fuses the vmull + add pair and avoids the
            // non-portable GCC vector `+=` extension.
            accum0 = vmlal_s16(accum0, vget_low_s16(src16_01), coeff16);
            accum1 = vmlal_s16(accum1, vget_high_s16(src16_01), coeff16);
            accum2 = vmlal_s16(accum2, vget_low_s16(src16_23), coeff16);
            accum3 = vmlal_s16(accum3, vget_high_s16(src16_23), coeff16);
        }

        // Shift right for fixed point implementation.
        accum0 = vshrq_n_s32(accum0, SkConvolutionFilter1D::kShiftBits);
        accum1 = vshrq_n_s32(accum1, SkConvolutionFilter1D::kShiftBits);
        accum2 = vshrq_n_s32(accum2, SkConvolutionFilter1D::kShiftBits);
        accum3 = vshrq_n_s32(accum3, SkConvolutionFilter1D::kShiftBits);

        // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
        // [16] a1 b1 g1 r1 a0 b0 g0 r0
        int16x8_t accum16_0 = vcombine_s16(vqmovn_s32(accum0), vqmovn_s32(accum1));
        // [16] a3 b3 g3 r3 a2 b2 g2 r2
        int16x8_t accum16_1 = vcombine_s16(vqmovn_s32(accum2), vqmovn_s32(accum3));

        // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
        // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
        uint8x16_t accum8 = vcombine_u8(vqmovun_s16(accum16_0), vqmovun_s16(accum16_1));

        if (hasAlpha) {
            // Keep alpha >= max(r, g, b) for every pixel.
            accum8 = clampAlphaToColors_neon(accum8);
        } else {
            // Set value of alpha channels to 0xFF (vorrq_u32 instead of the
            // non-portable GCC vector `|` extension).
            accum8 = vreinterpretq_u8_u32(
                vorrq_u32(vreinterpretq_u32_u8(accum8), vdupq_n_u32(0xFF000000)));
        }

        // Store the convolution result (16 bytes) and advance the pixel pointers.
        vst1q_u8(outRow, accum8);
        outRow += 16;
    }

    // Process the leftovers when the width of the output is not divisible
    // by 4, that is at most 3 pixels.
    int r = pixelWidth & 3;
    if (r) {

        accum0 = accum1 = accum2 = vdupq_n_s32(0);

        for (int filterY = 0; filterY < filterLength; ++filterY) {
            coeff16 = vdup_n_s16(filterValues[filterY]);

            // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
            // NOTE(review): loads 16 bytes for at most 3 pixels — assumes
            // each source row is padded past |pixelWidth|.
            uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][width << 2]);

            int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src8)));
            int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src8)));

            // Only three accumulators are needed for at most 3 pixels.
            accum0 = vmlal_s16(accum0, vget_low_s16(src16_01), coeff16);
            accum1 = vmlal_s16(accum1, vget_high_s16(src16_01), coeff16);
            accum2 = vmlal_s16(accum2, vget_low_s16(src16_23), coeff16);
        }

        accum0 = vshrq_n_s32(accum0, SkConvolutionFilter1D::kShiftBits);
        accum1 = vshrq_n_s32(accum1, SkConvolutionFilter1D::kShiftBits);
        accum2 = vshrq_n_s32(accum2, SkConvolutionFilter1D::kShiftBits);

        int16x8_t accum16_0 = vcombine_s16(vqmovn_s32(accum0), vqmovn_s32(accum1));
        // Third pixel duplicated into the unused fourth slot.
        int16x8_t accum16_1 = vcombine_s16(vqmovn_s32(accum2), vqmovn_s32(accum2));

        uint8x16_t accum8 = vcombine_u8(vqmovun_s16(accum16_0), vqmovun_s16(accum16_1));

        if (hasAlpha) {
            // Keep alpha >= max(r, g, b) for every pixel.
            accum8 = clampAlphaToColors_neon(accum8);
        } else {
            // Set value of alpha channels to 0xFF.
            accum8 = vreinterpretq_u8_u32(
                vorrq_u32(vreinterpretq_u32_u8(accum8), vdupq_n_u32(0xFF000000)));
        }

        // Store only the |r| pixels that belong to the output row.
        switch (r) {
        case 1:
            vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpretq_u32_u8(accum8), 0);
            break;
        case 2:
            vst1_u32(reinterpret_cast<uint32_t*>(outRow),
                     vreinterpret_u32_u8(vget_low_u8(accum8)));
            break;
        case 3:
            vst1_u32(reinterpret_cast<uint32_t*>(outRow),
                     vreinterpret_u32_u8(vget_low_u8(accum8)));
            vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow+8), vreinterpretq_u32_u8(accum8), 2);
            break;
        }
    }
}
343 
// Runtime entry point for vertical convolution: selects the template
// instantiation that matches whether the source has an alpha channel and
// forwards all arguments to it.
void convolveVertically_neon(const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
                             int filterLength,
                             unsigned char* const* sourceDataRows,
                             int pixelWidth,
                             unsigned char* outRow,
                             bool sourceHasAlpha) {
    auto proc = sourceHasAlpha ? &convolveVertically_neon<true>
                               : &convolveVertically_neon<false>;
    proc(filterValues, filterLength, sourceDataRows, pixelWidth, outRow);
}
360 
// Convolves horizontally along four rows at once. The row data is given in
// |srcData| and continues for the numValues() of the filter.
// The algorithm is almost same as |ConvolveHorizontally_SSE2|. Please
// refer to that function for detailed comments.
// |outRowBytes| is unused here; it is part of the platform-proc signature.
void convolve4RowsHorizontally_neon(const unsigned char* srcData[4],
                                    const SkConvolutionFilter1D& filter,
                                    unsigned char* outRow[4],
                                    size_t outRowBytes) {

    // vtbl1_u8 index vectors that splat the i-th 16-bit coefficient of a
    // loaded coefficient vector across all four lanes.
    uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100);
    uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302);
    uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504);
    uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706);
    int num_values = filter.numValues();

    int filterOffset, filterLength;
    // |mask| will be used to decimate all extra filter coefficients that are
    // loaded by SIMD when |filterLength| is not divisible by 4.
    // mask[0] is not used in following algorithm.
    static const uint16_t mask[4][4] = {
        {0, 0, 0, 0},
        {0xFFFF, 0, 0, 0},
        {0xFFFF, 0xFFFF, 0, 0},
        {0xFFFF, 0xFFFF, 0xFFFF, 0}
    };

// Loads 16 bytes (4 pixels) from |src|, widens them to 16 bits per channel,
// and multiply-accumulates the four pixels against coeff0..coeff3 into
// |accum|. vmlal_s16 replaces the non-portable GCC vector `+=` extension.
#define ITERATION(src, accum)                                        \
    do {                                                             \
        uint8x16_t pixels = vld1q_u8(src);                           \
        int16x8_t p01_16 =                                           \
            vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));    \
        int16x8_t p23_16 =                                           \
            vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));   \
        accum = vmlal_s16(accum, vget_low_s16(p01_16), coeff0);      \
        accum = vmlal_s16(accum, vget_high_s16(p01_16), coeff1);     \
        accum = vmlal_s16(accum, vget_low_s16(p23_16), coeff2);      \
        accum = vmlal_s16(accum, vget_high_s16(p23_16), coeff3);     \
    } while (0)

// Shifts |accum| back into range (kShiftBits fractional bits), narrows with
// signed then unsigned saturation, and packs one output pixel into |res|.
#define PACK_RESULT(accum, res)                                         \
        accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits);  \
        accum16 = vqmovn_s32(accum);                                    \
        res = vqmovun_s16(vcombine_s16(accum16, accum16));

    // Output one pixel each iteration, calculating all channels (RGBA) together.
    for (int outX = 0; outX < num_values; outX++) {

        const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
            filter.FilterForValue(outX, &filterOffset, &filterLength);

        // Four accumulators, one per source row. 32 bits per RGBA channel.
        int32x4_t accum0 = vdupq_n_s32(0);
        int32x4_t accum1 = vdupq_n_s32(0);
        int32x4_t accum2 = vdupq_n_s32(0);
        int32x4_t accum3 = vdupq_n_s32(0);

        // Byte offset of the first pixel this filter touches (4 bytes/pixel).
        int start = (filterOffset << 2);

        // We will load and accumulate with four coefficients per iteration.
        for (int filter_x = 0; filter_x < (filterLength >> 2); filter_x++) {
            int16x4_t coeffs, coeff0, coeff1, coeff2, coeff3;

            coeffs = vld1_s16(filterValues);
            coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask0));
            coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask1));
            coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask2));
            coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask3));

            ITERATION(srcData[0] + start, accum0);
            ITERATION(srcData[1] + start, accum1);
            ITERATION(srcData[2] + start, accum2);
            ITERATION(srcData[3] + start, accum3);

            start += 16;
            filterValues += 4;
        }

        // Handle the 1-3 remaining coefficients; extra lanes are zeroed.
        int r = filterLength & 3;
        if (r) {
            int16x4_t coeffs, coeff0, coeff1, coeff2, coeff3;
            coeffs = vld1_s16(filterValues);
            // vand_s16 replaces the non-portable GCC vector `&=` extension.
            coeffs = vand_s16(coeffs, vreinterpret_s16_u16(vld1_u16(&mask[r][0])));
            coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask0));
            coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask1));
            coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask2));
            coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask3));

            ITERATION(srcData[0] + start, accum0);
            ITERATION(srcData[1] + start, accum1);
            ITERATION(srcData[2] + start, accum2);
            ITERATION(srcData[3] + start, accum3);
        }

        int16x4_t accum16;
        uint8x8_t res0, res1, res2, res3;

        PACK_RESULT(accum0, res0);
        PACK_RESULT(accum1, res1);
        PACK_RESULT(accum2, res2);
        PACK_RESULT(accum3, res3);

        // Store one pixel per row and advance each output pointer.
        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[0]), vreinterpret_u32_u8(res0), 0);
        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[1]), vreinterpret_u32_u8(res1), 0);
        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[2]), vreinterpret_u32_u8(res2), 0);
        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[3]), vreinterpret_u32_u8(res3), 0);
        outRow[0] += 4;
        outRow[1] += 4;
        outRow[2] += 4;
        outRow[3] += 4;
    }

// Keep the helper macros local to this function.
#undef ITERATION
#undef PACK_RESULT
}
481 
applySIMDPadding_neon(SkConvolutionFilter1D * filter)482 void applySIMDPadding_neon(SkConvolutionFilter1D *filter) {
483     // Padding |paddingCount| of more dummy coefficients after the coefficients
484     // of last filter to prevent SIMD instructions which load 8 or 16 bytes
485     // together to access invalid memory areas. We are not trying to align the
486     // coefficients right now due to the opaqueness of <vector> implementation.
487     // This has to be done after all |AddFilter| calls.
488     for (int i = 0; i < 8; ++i) {
489         filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFixed>(0));
490     }
491 }
492 
platformConvolutionProcs_arm_neon(SkConvolutionProcs * procs)493 void platformConvolutionProcs_arm_neon(SkConvolutionProcs* procs) {
494     procs->fExtraHorizontalReads = 3;
495     procs->fConvolveVertically = &convolveVertically_neon;
496     procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_neon;
497     procs->fConvolveHorizontally = &convolveHorizontally_neon;
498     procs->fApplySIMDPadding = &applySIMDPadding_neon;
499 }
500