1 /*
2 * Copyright 2012 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 #include "SkBitmapProcState.h"
9 #include "SkBitmapProcState_filter.h"
10 #include "SkColorPriv.h"
11 #include "SkFilterProc.h"
12 #include "SkPaint.h"
13 #include "SkShader.h" // for tilemodes
14 #include "SkUtilsArm.h"
15
16 // Required to ensure the table is part of the final binary.
17 extern const SkBitmapProcState::SampleProc32 gSkBitmapProcStateSample32_neon[];
18
19 #define NAME_WRAP(x) x ## _neon
20 #include "SkBitmapProcState_filter_neon.h"
21 #include "SkBitmapProcState_procs.h"
22
23 const SkBitmapProcState::SampleProc32 gSkBitmapProcStateSample32_neon[] = {
24 S32_opaque_D32_nofilter_DXDY_neon,
25 S32_alpha_D32_nofilter_DXDY_neon,
26 S32_opaque_D32_nofilter_DX_neon,
27 S32_alpha_D32_nofilter_DX_neon,
28 S32_opaque_D32_filter_DXDY_neon,
29 S32_alpha_D32_filter_DXDY_neon,
30 S32_opaque_D32_filter_DX_neon,
31 S32_alpha_D32_filter_DX_neon,
32
33 S16_opaque_D32_nofilter_DXDY_neon,
34 S16_alpha_D32_nofilter_DXDY_neon,
35 S16_opaque_D32_nofilter_DX_neon,
36 S16_alpha_D32_nofilter_DX_neon,
37 S16_opaque_D32_filter_DXDY_neon,
38 S16_alpha_D32_filter_DXDY_neon,
39 S16_opaque_D32_filter_DX_neon,
40 S16_alpha_D32_filter_DX_neon,
41
42 SI8_opaque_D32_nofilter_DXDY_neon,
43 SI8_alpha_D32_nofilter_DXDY_neon,
44 SI8_opaque_D32_nofilter_DX_neon,
45 SI8_alpha_D32_nofilter_DX_neon,
46 SI8_opaque_D32_filter_DXDY_neon,
47 SI8_alpha_D32_filter_DXDY_neon,
48 SI8_opaque_D32_filter_DX_neon,
49 SI8_alpha_D32_filter_DX_neon,
50
51 S4444_opaque_D32_nofilter_DXDY_neon,
52 S4444_alpha_D32_nofilter_DXDY_neon,
53 S4444_opaque_D32_nofilter_DX_neon,
54 S4444_alpha_D32_nofilter_DX_neon,
55 S4444_opaque_D32_filter_DXDY_neon,
56 S4444_alpha_D32_filter_DXDY_neon,
57 S4444_opaque_D32_filter_DX_neon,
58 S4444_alpha_D32_filter_DX_neon,
59
60 // A8 treats alpha/opauqe the same (equally efficient)
61 SA8_alpha_D32_nofilter_DXDY_neon,
62 SA8_alpha_D32_nofilter_DXDY_neon,
63 SA8_alpha_D32_nofilter_DX_neon,
64 SA8_alpha_D32_nofilter_DX_neon,
65 SA8_alpha_D32_filter_DXDY_neon,
66 SA8_alpha_D32_filter_DXDY_neon,
67 SA8_alpha_D32_filter_DX_neon,
68 SA8_alpha_D32_filter_DX_neon,
69
70 // todo: possibly specialize on opaqueness
71 SG8_alpha_D32_nofilter_DXDY_neon,
72 SG8_alpha_D32_nofilter_DXDY_neon,
73 SG8_alpha_D32_nofilter_DX_neon,
74 SG8_alpha_D32_nofilter_DX_neon,
75 SG8_alpha_D32_filter_DXDY_neon,
76 SG8_alpha_D32_filter_DXDY_neon,
77 SG8_alpha_D32_filter_DX_neon,
78 SG8_alpha_D32_filter_DX_neon,
79 };
80
81 ///////////////////////////////////////////////////////////////////////////////
82
83 #include <arm_neon.h>
84 #include "SkConvolver.h"
85
// Convolves horizontally along a single row. The row data is given in
// |srcData|; one output pixel (4 bytes) is produced for each of the
// numValues() entries of |filter| and written consecutively to |outRow|.
//
// |hasAlpha| is accepted for signature compatibility but is not consulted:
// all four channels of every pixel are convolved the same way.
//
// NOTE(review): the tail path below always loads four coefficients and 16
// bytes of pixels even when fewer than four taps remain. Presumably
// applySIMDPadding_neon() and SkConvolutionProcs::fExtraHorizontalReads exist
// to keep those over-reads inside valid memory -- confirm against the callers.
void convolveHorizontally_neon(const unsigned char* srcData,
                               const SkConvolutionFilter1D& filter,
                               unsigned char* outRow,
                               bool hasAlpha) {
    // Table-lookup indices that broadcast 16-bit coefficient k of a
    // 4-coefficient vector into all four lanes. They never change, so they
    // are built once instead of once per output pixel.
    const uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100ULL);
    const uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302ULL);
    const uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504ULL);
    const uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706ULL);

    // kTailMask[r] keeps the first |r| coefficients and zeroes the rest, so
    // that coefficients loaded past the end of a filter contribute nothing.
    // kTailMask[0] is never used.
    static const uint16_t kTailMask[4][4] = {
        {0,      0,      0,      0},
        {0xFFFF, 0,      0,      0},
        {0xFFFF, 0xFFFF, 0,      0},
        {0xFFFF, 0xFFFF, 0xFFFF, 0}
    };

    // Loop over each pixel on this row in the output image.
    const int numValues = filter.numValues();
    for (int outX = 0; outX < numValues; outX++) {
        // Get the filter that determines the current output pixel.
        int filterOffset, filterLength;
        const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
            filter.FilterForValue(outX, &filterOffset, &filterLength);

        // Compute the first pixel in this row that the filter affects. It will
        // touch |filterLength| pixels (4 bytes each) after this.
        const unsigned char* rowToFilter = &srcData[filterOffset * 4];

        // Apply the filter to the row to get the destination pixel in |accum|
        // (one 32-bit lane per channel).
        int32x4_t accum = vdupq_n_s32(0);

        // Main part: four taps (four pixels, 16 bytes) per iteration.
        for (int filterX = 0; filterX < (filterLength >> 2); filterX++) {
            // Load 4 coefficients and broadcast each one across a vector.
            const uint8x8_t coeffs = vreinterpret_u8_s16(vld1_s16(filterValues));
            const int16x4_t coeff0 = vreinterpret_s16_u8(vtbl1_u8(coeffs, coeff_mask0));
            const int16x4_t coeff1 = vreinterpret_s16_u8(vtbl1_u8(coeffs, coeff_mask1));
            const int16x4_t coeff2 = vreinterpret_s16_u8(vtbl1_u8(coeffs, coeff_mask2));
            const int16x4_t coeff3 = vreinterpret_s16_u8(vtbl1_u8(coeffs, coeff_mask3));

            // Load 4 pixels and widen every channel to 16 bits.
            const uint8x16_t pixels = vld1q_u8(rowToFilter);
            const int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));
            const int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));

            // accum += pixel[k] * coeff[k], widening to 32 bits.
            accum = vmlal_s16(accum, vget_low_s16(p01_16), coeff0);
            accum = vmlal_s16(accum, vget_high_s16(p01_16), coeff1);
            accum = vmlal_s16(accum, vget_low_s16(p23_16), coeff2);
            accum = vmlal_s16(accum, vget_high_s16(p23_16), coeff3);

            // Advance the pointers
            rowToFilter += 16;
            filterValues += 4;
        }

        // Tail: one to three remaining taps.
        const int r = filterLength & 3;
        if (r) {
            // Load four coefficients and zero the ones beyond the filter end.
            uint16x4_t tail = vld1_u16(reinterpret_cast<const uint16_t*>(filterValues));
            tail = vand_u16(tail, vld1_u16(&kTailMask[r][0]));
            const uint8x8_t coeffs = vreinterpret_u8_u16(tail);
            const int16x4_t coeff0 = vreinterpret_s16_u8(vtbl1_u8(coeffs, coeff_mask0));
            const int16x4_t coeff1 = vreinterpret_s16_u8(vtbl1_u8(coeffs, coeff_mask1));
            const int16x4_t coeff2 = vreinterpret_s16_u8(vtbl1_u8(coeffs, coeff_mask2));

            // Load pixels and calc. The fourth pixel is never needed because
            // at most three taps remain.
            const uint8x16_t pixels = vld1q_u8(rowToFilter);
            const int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));
            const int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));

            accum = vmlal_s16(accum, vget_low_s16(p01_16), coeff0);
            accum = vmlal_s16(accum, vget_high_s16(p01_16), coeff1);
            accum = vmlal_s16(accum, vget_low_s16(p23_16), coeff2);
        }

        // Bring this value back in range. All of the filter scaling factors
        // are in fixed point with kShiftBits bits of fractional part.
        accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits);

        // Pack (32 -> 16 bits with signed saturation, then 16 -> 8 bits with
        // unsigned saturation) and store the new pixel.
        const int16x4_t accum16 = vqmovn_s32(accum);
        const uint8x8_t accum8 = vqmovun_s16(vcombine_s16(accum16, accum16));
        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpret_u32_u8(accum8), 0);
        outRow += 4;
    }
}
183
// Shifts four 32-bit-per-channel accumulators (one pixel each) right by
// kShiftBits to drop the fixed-point fraction, then narrows them to four
// packed 8-bit-per-channel pixels: 32 -> 16 bits with signed saturation,
// 16 -> 8 bits with unsigned saturation.
static inline uint8x16_t shiftAndPack4_neon(int32x4_t accum0, int32x4_t accum1,
                                            int32x4_t accum2, int32x4_t accum3) {
    accum0 = vshrq_n_s32(accum0, SkConvolutionFilter1D::kShiftBits);
    accum1 = vshrq_n_s32(accum1, SkConvolutionFilter1D::kShiftBits);
    accum2 = vshrq_n_s32(accum2, SkConvolutionFilter1D::kShiftBits);
    accum3 = vshrq_n_s32(accum3, SkConvolutionFilter1D::kShiftBits);

    // [16] p1 p0 and [16] p3 p2, four channels each.
    const int16x8_t lo16 = vcombine_s16(vqmovn_s32(accum0), vqmovn_s32(accum1));
    const int16x8_t hi16 = vcombine_s16(vqmovn_s32(accum2), vqmovn_s32(accum3));

    return vcombine_u8(vqmovun_s16(lo16), vqmovun_s16(hi16));
}

// Post-processes the top byte (alpha) of each of four packed 32-bit pixels.
//   hasAlpha == true : alpha becomes max(alpha, byte0, byte1, byte2), so it is
//                      never smaller than any of the pixel's other channels.
//   hasAlpha == false: alpha is forced to 0xFF.
// The three lower bytes of every pixel are returned unchanged.
template<bool hasAlpha>
static inline uint8x16_t finalizeAlpha_neon(uint8x16_t pixels) {
    const uint32x4_t pixels32 = vreinterpretq_u32_u8(pixels);
    if (hasAlpha) {
        // Bring byte1, then byte2, down to the byte0 position and take the
        // byte-wise max; only the lowest byte of each 32-bit lane matters.
        uint8x16_t maxColor = vmaxq_u8(vreinterpretq_u8_u32(vshrq_n_u32(pixels32, 8)), pixels);
        maxColor = vmaxq_u8(vreinterpretq_u8_u32(vshrq_n_u32(pixels32, 16)), maxColor);
        // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
        maxColor = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(maxColor), 24));
        // max(0, x) leaves the color bytes as they were.
        return vmaxq_u8(maxColor, pixels);
    }
    return vreinterpretq_u8_u32(vorrq_u32(pixels32, vdupq_n_u32(0xFF000000)));
}

// Does vertical convolution to produce one output row. The filter values and
// length are given in the first two parameters. These are applied to each
// of the rows pointed to in the |sourceDataRows| array, with each row
// being |pixelWidth| wide.
//
// The output must have room for |pixelWidth * 4| bytes.
//
// NOTE(review): when |pixelWidth| is not a multiple of four, the leftover
// path still loads 16 bytes from every source row, i.e. up to three pixels
// past |pixelWidth|. Presumably the row buffers are padded accordingly --
// confirm against the caller.
template<bool hasAlpha>
void convolveVertically_neon(const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
                             int filterLength,
                             unsigned char* const* sourceDataRows,
                             int pixelWidth,
                             unsigned char* outRow) {
    // Number of pixels handled by the four-at-a-time loop.
    const int width = pixelWidth & ~3;

    // Output four pixels per iteration (16 bytes).
    for (int outX = 0; outX < width; outX += 4) {
        // Accumulated result for each pixel. 32 bits per RGBA channel.
        int32x4_t accum0 = vdupq_n_s32(0);
        int32x4_t accum1 = accum0;
        int32x4_t accum2 = accum0;
        int32x4_t accum3 = accum0;

        // Convolve with one filter coefficient per iteration.
        for (int filterY = 0; filterY < filterLength; filterY++) {
            // Duplicate the filter coefficient 4 times.
            const int16x4_t coeff16 = vdup_n_s16(filterValues[filterY]);

            // Load four pixels (16 bytes) together and widen to 16 bits.
            const uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][outX << 2]);
            const int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src8)));
            const int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src8)));

            accum0 = vmlal_s16(accum0, vget_low_s16(src16_01), coeff16);
            accum1 = vmlal_s16(accum1, vget_high_s16(src16_01), coeff16);
            accum2 = vmlal_s16(accum2, vget_low_s16(src16_23), coeff16);
            accum3 = vmlal_s16(accum3, vget_high_s16(src16_23), coeff16);
        }

        const uint8x16_t accum8 =
            finalizeAlpha_neon<hasAlpha>(shiftAndPack4_neon(accum0, accum1, accum2, accum3));

        // Store the convolution result (16 bytes) and advance the pixel pointers.
        vst1q_u8(outRow, accum8);
        outRow += 16;
    }

    // Process the leftovers when the width of the output is not divisible
    // by 4, that is at most 3 pixels.
    const int r = pixelWidth & 3;
    if (r) {
        int32x4_t accum0 = vdupq_n_s32(0);
        int32x4_t accum1 = accum0;
        int32x4_t accum2 = accum0;

        for (int filterY = 0; filterY < filterLength; ++filterY) {
            const int16x4_t coeff16 = vdup_n_s16(filterValues[filterY]);

            const uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][width << 2]);
            const int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src8)));
            const int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src8)));

            accum0 = vmlal_s16(accum0, vget_low_s16(src16_01), coeff16);
            accum1 = vmlal_s16(accum1, vget_high_s16(src16_01), coeff16);
            accum2 = vmlal_s16(accum2, vget_low_s16(src16_23), coeff16);
        }

        // Only three pixels are ever needed here; the third accumulator is
        // passed twice just to fill the fourth (never stored) slot.
        const uint8x16_t accum8 =
            finalizeAlpha_neon<hasAlpha>(shiftAndPack4_neon(accum0, accum1, accum2, accum2));

        // Store exactly |r| pixels.
        const uint32x4_t out32 = vreinterpretq_u32_u8(accum8);
        uint32_t* dst = reinterpret_cast<uint32_t*>(outRow);
        switch (r) {
            case 1:
                vst1q_lane_u32(dst, out32, 0);
                break;
            case 2:
                vst1_u32(dst, vget_low_u32(out32));
                break;
            case 3:
                vst1_u32(dst, vget_low_u32(out32));
                vst1q_lane_u32(dst + 2, out32, 2);
                break;
        }
    }
}
343
convolveVertically_neon(const SkConvolutionFilter1D::ConvolutionFixed * filterValues,int filterLength,unsigned char * const * sourceDataRows,int pixelWidth,unsigned char * outRow,bool sourceHasAlpha)344 void convolveVertically_neon(const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
345 int filterLength,
346 unsigned char* const* sourceDataRows,
347 int pixelWidth,
348 unsigned char* outRow,
349 bool sourceHasAlpha) {
350 if (sourceHasAlpha) {
351 convolveVertically_neon<true>(filterValues, filterLength,
352 sourceDataRows, pixelWidth,
353 outRow);
354 } else {
355 convolveVertically_neon<false>(filterValues, filterLength,
356 sourceDataRows, pixelWidth,
357 outRow);
358 }
359 }
360
// Loads four consecutive 4-byte pixels starting at |src|, multiplies pixel k
// by |coeffK| (a coefficient broadcast across all four channel lanes) and
// returns |accum| plus the four widened products.
static inline int32x4_t accumulate4Pixels_neon(const unsigned char* src,
                                               int16x4_t coeff0, int16x4_t coeff1,
                                               int16x4_t coeff2, int16x4_t coeff3,
                                               int32x4_t accum) {
    const uint8x16_t pixels = vld1q_u8(src);
    const int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));
    const int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));
    accum = vmlal_s16(accum, vget_low_s16(p01_16), coeff0);
    accum = vmlal_s16(accum, vget_high_s16(p01_16), coeff1);
    accum = vmlal_s16(accum, vget_low_s16(p23_16), coeff2);
    accum = vmlal_s16(accum, vget_high_s16(p23_16), coeff3);
    return accum;
}

// Drops the kShiftBits fixed-point fraction from |accum| and narrows it to
// 8 bits per channel (32 -> 16 bits with signed saturation, 16 -> 8 bits with
// unsigned saturation). The pixel ends up in the low 32 bits of the result.
static inline uint8x8_t packPixel_neon(int32x4_t accum) {
    accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits);
    const int16x4_t accum16 = vqmovn_s32(accum);
    return vqmovun_s16(vcombine_s16(accum16, accum16));
}

// Convolves horizontally along four rows at once. The row data is given in
// |srcData|; one output pixel per row is produced for each of the numValues()
// entries of |filter| and written through |outRow|. The four pointers stored
// in |outRow| are advanced as pixels are written, so on return each one
// points just past the last pixel written to its row.
//
// |outRowBytes| is accepted for signature compatibility but is not used.
//
// The original authors note that the algorithm is almost the same as
// |ConvolveHorizontally_SSE2| and refer to that function for detailed
// comments.
//
// NOTE(review): the tail path always loads four coefficients and 16 bytes of
// pixels per row even when fewer than four taps remain. Presumably
// applySIMDPadding_neon() and SkConvolutionProcs::fExtraHorizontalReads exist
// to keep those over-reads inside valid memory -- confirm against the callers.
void convolve4RowsHorizontally_neon(const unsigned char* srcData[4],
                                    const SkConvolutionFilter1D& filter,
                                    unsigned char* outRow[4],
                                    size_t outRowBytes) {
    // Table-lookup indices that broadcast 16-bit coefficient k of a
    // 4-coefficient vector into all four lanes.
    const uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100ULL);
    const uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302ULL);
    const uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504ULL);
    const uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706ULL);

    // |mask| is used to zero the extra filter coefficients that are loaded by
    // SIMD when |filterLength| is not divisible by 4.
    // mask[0] is not used in the following algorithm.
    const uint16_t mask[4][4] = {
        {0,      0,      0,      0},
        {0xFFFF, 0,      0,      0},
        {0xFFFF, 0xFFFF, 0,      0},
        {0xFFFF, 0xFFFF, 0xFFFF, 0}
    };

    // Output one pixel per row each iteration, calculating all channels
    // (RGBA) together.
    const int numValues = filter.numValues();
    for (int outX = 0; outX < numValues; outX++) {
        int filterOffset, filterLength;
        const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
            filter.FilterForValue(outX, &filterOffset, &filterLength);

        // One accumulator per row: four pixels in a column per iteration.
        int32x4_t accum0 = vdupq_n_s32(0);
        int32x4_t accum1 = vdupq_n_s32(0);
        int32x4_t accum2 = vdupq_n_s32(0);
        int32x4_t accum3 = vdupq_n_s32(0);

        // Byte offset, within each row, of the first pixel the filter touches.
        int start = (filterOffset << 2);

        // We will load and accumulate with four coefficients per iteration.
        for (int filterX = 0; filterX < (filterLength >> 2); filterX++) {
            const uint8x8_t coeffs = vreinterpret_u8_s16(vld1_s16(filterValues));
            const int16x4_t coeff0 = vreinterpret_s16_u8(vtbl1_u8(coeffs, coeff_mask0));
            const int16x4_t coeff1 = vreinterpret_s16_u8(vtbl1_u8(coeffs, coeff_mask1));
            const int16x4_t coeff2 = vreinterpret_s16_u8(vtbl1_u8(coeffs, coeff_mask2));
            const int16x4_t coeff3 = vreinterpret_s16_u8(vtbl1_u8(coeffs, coeff_mask3));

            accum0 = accumulate4Pixels_neon(srcData[0] + start,
                                            coeff0, coeff1, coeff2, coeff3, accum0);
            accum1 = accumulate4Pixels_neon(srcData[1] + start,
                                            coeff0, coeff1, coeff2, coeff3, accum1);
            accum2 = accumulate4Pixels_neon(srcData[2] + start,
                                            coeff0, coeff1, coeff2, coeff3, accum2);
            accum3 = accumulate4Pixels_neon(srcData[3] + start,
                                            coeff0, coeff1, coeff2, coeff3, accum3);

            start += 16;
            filterValues += 4;
        }

        // Tail: one to three remaining taps. Coefficients past the end of the
        // filter are masked to zero so the extra pixels contribute nothing.
        const int r = filterLength & 3;
        if (r) {
            const int16x4_t masked = vand_s16(vld1_s16(filterValues),
                                              vreinterpret_s16_u16(vld1_u16(&mask[r][0])));
            const uint8x8_t coeffs = vreinterpret_u8_s16(masked);
            const int16x4_t coeff0 = vreinterpret_s16_u8(vtbl1_u8(coeffs, coeff_mask0));
            const int16x4_t coeff1 = vreinterpret_s16_u8(vtbl1_u8(coeffs, coeff_mask1));
            const int16x4_t coeff2 = vreinterpret_s16_u8(vtbl1_u8(coeffs, coeff_mask2));
            const int16x4_t coeff3 = vreinterpret_s16_u8(vtbl1_u8(coeffs, coeff_mask3));

            accum0 = accumulate4Pixels_neon(srcData[0] + start,
                                            coeff0, coeff1, coeff2, coeff3, accum0);
            accum1 = accumulate4Pixels_neon(srcData[1] + start,
                                            coeff0, coeff1, coeff2, coeff3, accum1);
            accum2 = accumulate4Pixels_neon(srcData[2] + start,
                                            coeff0, coeff1, coeff2, coeff3, accum2);
            accum3 = accumulate4Pixels_neon(srcData[3] + start,
                                            coeff0, coeff1, coeff2, coeff3, accum3);
        }

        // Pack each row's pixel and store it.
        const uint8x8_t res0 = packPixel_neon(accum0);
        const uint8x8_t res1 = packPixel_neon(accum1);
        const uint8x8_t res2 = packPixel_neon(accum2);
        const uint8x8_t res3 = packPixel_neon(accum3);

        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[0]), vreinterpret_u32_u8(res0), 0);
        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[1]), vreinterpret_u32_u8(res1), 0);
        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[2]), vreinterpret_u32_u8(res2), 0);
        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[3]), vreinterpret_u32_u8(res3), 0);
        outRow[0] += 4;
        outRow[1] += 4;
        outRow[2] += 4;
        outRow[3] += 4;
    }
}
481
applySIMDPadding_neon(SkConvolutionFilter1D * filter)482 void applySIMDPadding_neon(SkConvolutionFilter1D *filter) {
483 // Padding |paddingCount| of more dummy coefficients after the coefficients
484 // of last filter to prevent SIMD instructions which load 8 or 16 bytes
485 // together to access invalid memory areas. We are not trying to align the
486 // coefficients right now due to the opaqueness of <vector> implementation.
487 // This has to be done after all |AddFilter| calls.
488 for (int i = 0; i < 8; ++i) {
489 filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFixed>(0));
490 }
491 }
492
platformConvolutionProcs_arm_neon(SkConvolutionProcs * procs)493 void platformConvolutionProcs_arm_neon(SkConvolutionProcs* procs) {
494 procs->fExtraHorizontalReads = 3;
495 procs->fConvolveVertically = &convolveVertically_neon;
496 procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_neon;
497 procs->fConvolveHorizontally = &convolveHorizontally_neon;
498 procs->fApplySIMDPadding = &applySIMDPadding_neon;
499 }
500