/*
 * Copyright 2013 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include <emmintrin.h>
#include "SkBitmap.h"
#include "SkBitmapFilter_opts_SSE2.h"
#include "SkBitmapProcState.h"
#include "SkColor.h"
#include "SkColorPriv.h"
#include "SkConvolver.h"
#include "SkShader.h"
#include "SkUnPreMultiply.h"

#if 0
static inline void print128i(__m128i value) {
    int *v = (int*) &value;
    printf("% .11d % .11d % .11d % .11d\n", v[0], v[1], v[2], v[3]);
}

static inline void print128i_16(__m128i value) {
    short *v = (short*) &value;
    printf("% .5d % .5d % .5d % .5d % .5d % .5d % .5d % .5d\n", v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
}

static inline void print128i_8(__m128i value) {
    unsigned char *v = (unsigned char*) &value;
    printf("%.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u\n",
           v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7],
           v[8], v[9], v[10], v[11], v[12], v[13], v[14], v[15]
           );
}

static inline void print128f(__m128 value) {
    float *f = (float*) &value;
    printf("%3.4f %3.4f %3.4f %3.4f\n", f[0], f[1], f[2], f[3]);
}
#endif

// Convolves horizontally along a single row. The row data is given in
// |src_data| and continues for the num_values() of the filter.
void convolveHorizontally_SSE2(const unsigned char* src_data,
                               const SkConvolutionFilter1D& filter,
                               unsigned char* out_row,
                               bool /*has_alpha*/) {
    int num_values = filter.numValues();

    int filter_offset, filter_length;
    __m128i zero = _mm_setzero_si128();
    __m128i mask[4];
    // |mask| is used to zero out the extra filter coefficients that the SIMD
    // loads pick up when |filter_length| is not divisible by 4.
    // mask[0] is not used in the following algorithm.
    mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
    mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
    mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);

    // Output one pixel each iteration, calculating all channels (RGBA) together.
    for (int out_x = 0; out_x < num_values; out_x++) {
        const SkConvolutionFilter1D::ConvolutionFixed* filter_values =
            filter.FilterForValue(out_x, &filter_offset, &filter_length);

        __m128i accum = _mm_setzero_si128();

        // Compute the first pixel in this row that the filter affects. It will
        // touch |filter_length| pixels (4 bytes each) after this.
        const __m128i* row_to_filter =
            reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]);

        // We will load and accumulate with four coefficients per iteration.
        for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) {

            // Load 4 coefficients => duplicate 1st and 2nd of them for all channels.
            __m128i coeff, coeff16;
            // [16] xx xx xx xx c3 c2 c1 c0
            coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
            // [16] xx xx xx xx c1 c1 c0 c0
            coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
            // [16] c1 c1 c1 c1 c0 c0 c0 c0
            coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);

            // Load four pixels => unpack the first two pixels to 16 bits =>
            // multiply with coefficients => accumulate the convolution result.
            // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
            __m128i src8 = _mm_loadu_si128(row_to_filter);
            // [16] a1 b1 g1 r1 a0 b0 g0 r0
            __m128i src16 = _mm_unpacklo_epi8(src8, zero);
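            // _mm_mullo_epi16/_mm_mulhi_epi16 give the low/high 16 bits of each
            // signed 16x16 product; interleaving them below with unpacklo/unpackhi
            // rebuilds the full 32-bit products (pixel 0, then pixel 1) so they
            // can be accumulated without overflow.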
            __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
            __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
            // [32]  a0*c0 b0*c0 g0*c0 r0*c0
            __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
            accum = _mm_add_epi32(accum, t);
            // [32]  a1*c1 b1*c1 g1*c1 r1*c1
            t = _mm_unpackhi_epi16(mul_lo, mul_hi);
            accum = _mm_add_epi32(accum, t);

            // Duplicate 3rd and 4th coefficients for all channels =>
            // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients
            // => accumulate the convolution results.
            // [16] xx xx xx xx c3 c3 c2 c2
            coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
            // [16] c3 c3 c3 c3 c2 c2 c2 c2
            coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
            // [16] a3 b3 g3 r3 a2 b2 g2 r2
            src16 = _mm_unpackhi_epi8(src8, zero);
            mul_hi = _mm_mulhi_epi16(src16, coeff16);
            mul_lo = _mm_mullo_epi16(src16, coeff16);
            // [32]  a2*c2 b2*c2 g2*c2 r2*c2
            t = _mm_unpacklo_epi16(mul_lo, mul_hi);
            accum = _mm_add_epi32(accum, t);
            // [32]  a3*c3 b3*c3 g3*c3 r3*c3
            t = _mm_unpackhi_epi16(mul_lo, mul_hi);
            accum = _mm_add_epi32(accum, t);

            // Advance the pixel and coefficients pointers.
            row_to_filter += 1;
            filter_values += 4;
        }

        // When |filter_length| is not divisible by 4, we need to mask out the
        // extra filter coefficients that were loaded. Other than that, the
        // algorithm is the same as above, except that the 4th pixel will always
        // be absent.
        int r = filter_length&3;
        if (r) {
            // Note: filter_values must be padded to align_up(filter_offset, 8).
            __m128i coeff, coeff16;
            coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
            // Mask out extra filter taps.
            coeff = _mm_and_si128(coeff, mask[r]);
            coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
            coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);

            // Note: the line buffer must be padded to align_up(filter_offset, 16).
            // We resolve this by using the C version for the last horizontal line.
            __m128i src8 = _mm_loadu_si128(row_to_filter);
            __m128i src16 = _mm_unpacklo_epi8(src8, zero);
            __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
            __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
            __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
            accum = _mm_add_epi32(accum, t);
            t = _mm_unpackhi_epi16(mul_lo, mul_hi);
            accum = _mm_add_epi32(accum, t);

            src16 = _mm_unpackhi_epi8(src8, zero);
            coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
            coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
            mul_hi = _mm_mulhi_epi16(src16, coeff16);
            mul_lo = _mm_mullo_epi16(src16, coeff16);
            t = _mm_unpacklo_epi16(mul_lo, mul_hi);
            accum = _mm_add_epi32(accum, t);
        }

        // Shift right for fixed point implementation.
        accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits);

        // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
        accum = _mm_packs_epi32(accum, zero);
        // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
        accum = _mm_packus_epi16(accum, zero);

        // Store the pixel value of 32 bits.
        *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum);
        out_row += 4;
    }
}

// Convolves horizontally along four rows. The row data is given in
// |src_data| and continues for the num_values() of the filter.
// The algorithm is almost the same as |convolveHorizontally_SSE2|. Please
// refer to that function for detailed comments.
void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4],
                                    const SkConvolutionFilter1D& filter,
                                    unsigned char* out_row[4],
                                    size_t outRowBytes) {
    SkDEBUGCODE(const unsigned char* out_row_0_start = out_row[0];)

    int num_values = filter.numValues();

    int filter_offset, filter_length;
    __m128i zero = _mm_setzero_si128();
    __m128i mask[4];
    // |mask| is used to zero out the extra filter coefficients that the SIMD
    // loads pick up when |filter_length| is not divisible by 4.
    // mask[0] is not used in the following algorithm.
    mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
    mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
    mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);

    // Output one pixel of each row per iteration, calculating all channels
    // (RGBA) together.
    for (int out_x = 0; out_x < num_values; out_x++) {
        const SkConvolutionFilter1D::ConvolutionFixed* filter_values =
            filter.FilterForValue(out_x, &filter_offset, &filter_length);

        // Four pixels in a column (one per row) are accumulated per iteration.
        __m128i accum0 = _mm_setzero_si128();
        __m128i accum1 = _mm_setzero_si128();
        __m128i accum2 = _mm_setzero_si128();
        __m128i accum3 = _mm_setzero_si128();
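        // |filter_offset| is measured in pixels; shifting left by two converts
        // it to a byte offset (four bytes per RGBA pixel).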
        int start = (filter_offset<<2);
        // We will load and accumulate with four coefficients per iteration.
        for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) {
            __m128i coeff, coeff16lo, coeff16hi;
            // [16] xx xx xx xx c3 c2 c1 c0
            coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
            // [16] xx xx xx xx c1 c1 c0 c0
            coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
            // [16] c1 c1 c1 c1 c0 c0 c0 c0
            coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
            // [16] xx xx xx xx c3 c3 c2 c2
            coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
            // [16] c3 c3 c3 c3 c2 c2 c2 c2
            coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);

            __m128i src8, src16, mul_hi, mul_lo, t;

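// ITERATION convolves one row for the current group of four taps: it loads
// four pixels from |src|, multiplies pixels 0-1 by coeff16lo and pixels 2-3
// by coeff16hi, and adds the resulting 32-bit products into |accum|.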
#define ITERATION(src, accum)                                                \
            src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));   \
            src16 = _mm_unpacklo_epi8(src8, zero);                           \
            mul_hi = _mm_mulhi_epi16(src16, coeff16lo);                      \
            mul_lo = _mm_mullo_epi16(src16, coeff16lo);                      \
            t = _mm_unpacklo_epi16(mul_lo, mul_hi);                          \
            accum = _mm_add_epi32(accum, t);                                 \
            t = _mm_unpackhi_epi16(mul_lo, mul_hi);                          \
            accum = _mm_add_epi32(accum, t);                                 \
            src16 = _mm_unpackhi_epi8(src8, zero);                           \
            mul_hi = _mm_mulhi_epi16(src16, coeff16hi);                      \
            mul_lo = _mm_mullo_epi16(src16, coeff16hi);                      \
            t = _mm_unpacklo_epi16(mul_lo, mul_hi);                          \
            accum = _mm_add_epi32(accum, t);                                 \
            t = _mm_unpackhi_epi16(mul_lo, mul_hi);                          \
            accum = _mm_add_epi32(accum, t)

            ITERATION(src_data[0] + start, accum0);
            ITERATION(src_data[1] + start, accum1);
            ITERATION(src_data[2] + start, accum2);
            ITERATION(src_data[3] + start, accum3);

            start += 16;
            filter_values += 4;
        }

        int r = filter_length & 3;
        if (r) {
            // Note: filter_values must be padded to align_up(filter_offset, 8);
            __m128i coeff;
            coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
            // Mask out extra filter taps.
            coeff = _mm_and_si128(coeff, mask[r]);

            __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
            /* c1 c1 c1 c1 c0 c0 c0 c0 */
            coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
            __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
            coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);

            __m128i src8, src16, mul_hi, mul_lo, t;

            ITERATION(src_data[0] + start, accum0);
            ITERATION(src_data[1] + start, accum1);
            ITERATION(src_data[2] + start, accum2);
            ITERATION(src_data[3] + start, accum3);
        }

        accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
        accum0 = _mm_packs_epi32(accum0, zero);
        accum0 = _mm_packus_epi16(accum0, zero);
        accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
        accum1 = _mm_packs_epi32(accum1, zero);
        accum1 = _mm_packus_epi16(accum1, zero);
        accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
        accum2 = _mm_packs_epi32(accum2, zero);
        accum2 = _mm_packus_epi16(accum2, zero);
        accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits);
        accum3 = _mm_packs_epi32(accum3, zero);
        accum3 = _mm_packus_epi16(accum3, zero);

        // We seem to be running off the edge here (chromium:491660).
        SkASSERT(((size_t)out_row[0] - (size_t)out_row_0_start) < outRowBytes);

        *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0);
        *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1);
        *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2);
        *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3);

        out_row[0] += 4;
        out_row[1] += 4;
        out_row[2] += 4;
        out_row[3] += 4;
    }
}

// Does vertical convolution to produce one output row. The filter values and
// length are given in the first two parameters. These are applied to each
// of the rows pointed to in the |source_data_rows| array, with each row
// being |pixel_width| wide.
//
// The output must have room for |pixel_width * 4| bytes.
template<bool has_alpha>
void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filter_values,
                             int filter_length,
                             unsigned char* const* source_data_rows,
                             int pixel_width,
                             unsigned char* out_row) {
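    // |width| is |pixel_width| rounded down to a multiple of four; the main
    // loop below writes four pixels at a time and the 1-3 leftover pixels are
    // handled afterwards.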
    int width = pixel_width & ~3;

    __m128i zero = _mm_setzero_si128();
    __m128i accum0, accum1, accum2, accum3, coeff16;
    const __m128i* src;
    // Output four pixels per iteration (16 bytes).
    for (int out_x = 0; out_x < width; out_x += 4) {

        // Accumulated result for each pixel. 32 bits per RGBA channel.
        accum0 = _mm_setzero_si128();
        accum1 = _mm_setzero_si128();
        accum2 = _mm_setzero_si128();
        accum3 = _mm_setzero_si128();

        // Convolve with one filter coefficient per iteration.
        for (int filter_y = 0; filter_y < filter_length; filter_y++) {

            // Duplicate the filter coefficient 8 times.
            // [16] cj cj cj cj cj cj cj cj
            coeff16 = _mm_set1_epi16(filter_values[filter_y]);

            // Load four pixels (16 bytes) together.
            // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
            src = reinterpret_cast<const __m128i*>(
                &source_data_rows[filter_y][out_x << 2]);
            __m128i src8 = _mm_loadu_si128(src);

            // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channel =>
            // multiply with current coefficient => accumulate the result.
            // [16] a1 b1 g1 r1 a0 b0 g0 r0
            __m128i src16 = _mm_unpacklo_epi8(src8, zero);
            __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
            __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
            // [32] a0 b0 g0 r0
            __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
            accum0 = _mm_add_epi32(accum0, t);
            // [32] a1 b1 g1 r1
            t = _mm_unpackhi_epi16(mul_lo, mul_hi);
            accum1 = _mm_add_epi32(accum1, t);

            // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channel =>
            // multiply with current coefficient => accumulate the result.
            // [16] a3 b3 g3 r3 a2 b2 g2 r2
            src16 = _mm_unpackhi_epi8(src8, zero);
            mul_hi = _mm_mulhi_epi16(src16, coeff16);
            mul_lo = _mm_mullo_epi16(src16, coeff16);
            // [32] a2 b2 g2 r2
            t = _mm_unpacklo_epi16(mul_lo, mul_hi);
            accum2 = _mm_add_epi32(accum2, t);
            // [32] a3 b3 g3 r3
            t = _mm_unpackhi_epi16(mul_lo, mul_hi);
            accum3 = _mm_add_epi32(accum3, t);
        }

        // Shift right for fixed point implementation.
        accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
        accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
        accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
        accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits);

        // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
        // [16] a1 b1 g1 r1 a0 b0 g0 r0
        accum0 = _mm_packs_epi32(accum0, accum1);
        // [16] a3 b3 g3 r3 a2 b2 g2 r2
        accum2 = _mm_packs_epi32(accum2, accum3);

        // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
        // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
        accum0 = _mm_packus_epi16(accum0, accum2);

        if (has_alpha) {
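            // The pixels are treated as premultiplied, so no color channel may
            // exceed alpha; filter overshoot can break that invariant, hence
            // clamp alpha up to max(r, g, b) for each pixel below.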
            // Compute the max(ri, gi, bi) for each pixel.
            // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
            __m128i a = _mm_srli_epi32(accum0, 8);
            // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
            __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
            // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
            a = _mm_srli_epi32(accum0, 16);
            // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
            b = _mm_max_epu8(a, b);  // Max of r and g and b.
            // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
            b = _mm_slli_epi32(b, 24);

            // Make sure the alpha channel is never smaller than the maximum
            // value of the color channels.
            accum0 = _mm_max_epu8(b, accum0);
        } else {
            // Set value of alpha channels to 0xFF.
            __m128i mask = _mm_set1_epi32(0xff000000);
            accum0 = _mm_or_si128(accum0, mask);
        }

        // Store the convolution result (16 bytes) and advance the pixel pointers.
        _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0);
        out_row += 16;
    }

    // When the width of the output is not divisible by 4, we need to store one
    // pixel (4 bytes) at a time, and the fourth pixel is always absent.
    if (pixel_width & 3) {
        accum0 = _mm_setzero_si128();
        accum1 = _mm_setzero_si128();
        accum2 = _mm_setzero_si128();
        for (int filter_y = 0; filter_y < filter_length; ++filter_y) {
            coeff16 = _mm_set1_epi16(filter_values[filter_y]);
            // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
            src = reinterpret_cast<const __m128i*>(
                &source_data_rows[filter_y][width<<2]);
            __m128i src8 = _mm_loadu_si128(src);
            // [16] a1 b1 g1 r1 a0 b0 g0 r0
            __m128i src16 = _mm_unpacklo_epi8(src8, zero);
            __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
            __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
            // [32] a0 b0 g0 r0
            __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
            accum0 = _mm_add_epi32(accum0, t);
            // [32] a1 b1 g1 r1
            t = _mm_unpackhi_epi16(mul_lo, mul_hi);
            accum1 = _mm_add_epi32(accum1, t);
            // [16] a3 b3 g3 r3 a2 b2 g2 r2
            src16 = _mm_unpackhi_epi8(src8, zero);
            mul_hi = _mm_mulhi_epi16(src16, coeff16);
            mul_lo = _mm_mullo_epi16(src16, coeff16);
            // [32] a2 b2 g2 r2
            t = _mm_unpacklo_epi16(mul_lo, mul_hi);
            accum2 = _mm_add_epi32(accum2, t);
        }

        accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
        accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
        accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
        // [16] a1 b1 g1 r1 a0 b0 g0 r0
        accum0 = _mm_packs_epi32(accum0, accum1);
        // [16] a3 b3 g3 r3 a2 b2 g2 r2
        accum2 = _mm_packs_epi32(accum2, zero);
        // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
        accum0 = _mm_packus_epi16(accum0, accum2);
        if (has_alpha) {
            // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
            __m128i a = _mm_srli_epi32(accum0, 8);
            // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
            __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
            // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
            a = _mm_srli_epi32(accum0, 16);
            // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
            b = _mm_max_epu8(a, b);  // Max of r and g and b.
            // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
            b = _mm_slli_epi32(b, 24);
            accum0 = _mm_max_epu8(b, accum0);
        } else {
            __m128i mask = _mm_set1_epi32(0xff000000);
            accum0 = _mm_or_si128(accum0, mask);
        }

        for (int out_x = width; out_x < pixel_width; out_x++) {
            *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0);
            accum0 = _mm_srli_si128(accum0, 4);
            out_row += 4;
        }
    }
}

void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filter_values,
                             int filter_length,
                             unsigned char* const* source_data_rows,
                             int pixel_width,
                             unsigned char* out_row,
                             bool has_alpha) {
    if (has_alpha) {
        convolveVertically_SSE2<true>(filter_values,
                                      filter_length,
                                      source_data_rows,
                                      pixel_width,
                                      out_row);
    } else {
        convolveVertically_SSE2<false>(filter_values,
                                       filter_length,
                                       source_data_rows,
                                       pixel_width,
                                       out_row);
    }
}

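// A minimal usage sketch from the caller's side (an assumption, not code in
// this file): build the filter with addFilter() calls, then pad it once with
// applySIMDPadding_SSE2() before running the convolve*_SSE2 routines above.
//
//   SkConvolutionFilter1D filterX;
//   // ... one filterX.addFilter(...) call per output value ...
//   applySIMDPadding_SSE2(&filterX);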
void applySIMDPadding_SSE2(SkConvolutionFilter1D *filter) {
    // Append |paddingCount| more dummy coefficients after the coefficients of
    // the last filter so that SIMD instructions, which load 8 or 16 bytes at a
    // time, never read invalid memory. We are not trying to align the
    // coefficients right now because the <vector> implementation is opaque.
    // This has to be done after all |addFilter| calls.
    for (int i = 0; i < 8; ++i) {
        filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFixed>(0));
    }
}