/*
 * Copyright 2013 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include <emmintrin.h>
#include "SkBitmap.h"
#include "SkBitmapFilter_opts_SSE2.h"
#include "SkBitmapProcState.h"
#include "SkColor.h"
#include "SkColorPriv.h"
#include "SkConvolver.h"
#include "SkShader.h"
#include "SkUnPreMultiply.h"

#if 0
static inline void print128i(__m128i value) {
    int *v = (int*) &value;
    printf("% .11d % .11d % .11d % .11d\n", v[0], v[1], v[2], v[3]);
}

static inline void print128i_16(__m128i value) {
    short *v = (short*) &value;
    printf("% .5d % .5d % .5d % .5d % .5d % .5d % .5d % .5d\n", v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
}

static inline void print128i_8(__m128i value) {
    unsigned char *v = (unsigned char*) &value;
    printf("%.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u\n",
           v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7],
           v[8], v[9], v[10], v[11], v[12], v[13], v[14], v[15]
    );
}

static inline void print128f(__m128 value) {
    float *f = (float*) &value;
    printf("%3.4f %3.4f %3.4f %3.4f\n", f[0], f[1], f[2], f[3]);
}
#endif

// Convolves horizontally along a single row. The row data is given in
// |src_data| and continues for the numValues() of the filter.
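//
// For reference, a scalar sketch of what is computed for each output pixel
// out_x (illustrative only; clampTo8 is a hypothetical helper):
//
//   filterValues = filter.FilterForValue(out_x, &filter_offset, &filter_length);
//   for (int c = 0; c < 4; c++) {    // one RGBA channel per pass
//       int sum = 0;
//       for (int i = 0; i < filter_length; i++) {
//           sum += filterValues[i] * src_data[(filter_offset + i) * 4 + c];
//       }
//       out_row[out_x * 4 + c] = clampTo8(sum >> SkConvolutionFilter1D::kShiftBits);
//   }
//
// The SSE2 code below produces the same result, but computes all four
// channels of a pixel at once and consumes four coefficients per iteration.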
void convolveHorizontally_SSE2(const unsigned char* src_data,
                               const SkConvolutionFilter1D& filter,
                               unsigned char* out_row,
                               bool /*has_alpha*/) {
    int num_values = filter.numValues();

    int filter_offset, filter_length;
    __m128i zero = _mm_setzero_si128();
    __m128i mask[4];
    // |mask| will be used to decimate all extra filter coefficients that are
    // loaded by SIMD when |filter_length| is not divisible by 4.
    // mask[0] is not used in the following algorithm.
    mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
    mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
    mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
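    // For example, when filter_length & 3 == 2, mask[2] keeps only the two
    // 16-bit lanes holding c0 and c1 and zeroes the rest of the loaded group.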

    // Output one pixel each iteration, calculating all channels (RGBA) together.
    for (int out_x = 0; out_x < num_values; out_x++) {
        const SkConvolutionFilter1D::ConvolutionFixed* filter_values =
            filter.FilterForValue(out_x, &filter_offset, &filter_length);

        __m128i accum = _mm_setzero_si128();

        // Compute the first pixel in this row that the filter affects. It will
        // touch |filter_length| pixels (4 bytes each) after this.
        const __m128i* row_to_filter =
            reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]);

        // We will load and accumulate with four coefficients per iteration.
        for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) {

            // Load 4 coefficients => duplicate 1st and 2nd of them for all channels.
            __m128i coeff, coeff16;
            // [16] xx xx xx xx c3 c2 c1 c0
            coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
            // [16] xx xx xx xx c1 c1 c0 c0
            coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
            // [16] c1 c1 c1 c1 c0 c0 c0 c0
            coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);

            // Load four pixels => unpack the first two pixels to 16 bits =>
            // multiply with coefficients => accumulate the convolution result.
            // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
            __m128i src8 = _mm_loadu_si128(row_to_filter);
            // [16] a1 b1 g1 r1 a0 b0 g0 r0
            __m128i src16 = _mm_unpacklo_epi8(src8, zero);
            __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
            __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
            // [32] a0*c0 b0*c0 g0*c0 r0*c0
            __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
            accum = _mm_add_epi32(accum, t);
            // [32] a1*c1 b1*c1 g1*c1 r1*c1
            t = _mm_unpackhi_epi16(mul_lo, mul_hi);
            accum = _mm_add_epi32(accum, t);

            // Duplicate 3rd and 4th coefficients for all channels =>
            // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients
            // => accumulate the convolution results.
            // [16] xx xx xx xx c3 c3 c2 c2
            coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
            // [16] c3 c3 c3 c3 c2 c2 c2 c2
            coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
            // [16] a3 b3 g3 r3 a2 b2 g2 r2
            src16 = _mm_unpackhi_epi8(src8, zero);
            mul_hi = _mm_mulhi_epi16(src16, coeff16);
            mul_lo = _mm_mullo_epi16(src16, coeff16);
            // [32] a2*c2 b2*c2 g2*c2 r2*c2
            t = _mm_unpacklo_epi16(mul_lo, mul_hi);
            accum = _mm_add_epi32(accum, t);
            // [32] a3*c3 b3*c3 g3*c3 r3*c3
            t = _mm_unpackhi_epi16(mul_lo, mul_hi);
            accum = _mm_add_epi32(accum, t);

            // Advance the pixel and coefficients pointers.
            row_to_filter += 1;
            filter_values += 4;
        }

        // When |filter_length| is not divisible by 4, we need to decimate some of
        // the filter coefficients that were loaded incorrectly to zero; other than
        // that the algorithm is the same as above, except that the 4th pixel will
        // always be absent.
        int r = filter_length & 3;
        if (r) {
            // Note: filter_values must be padded to align_up(filter_offset, 8).
            __m128i coeff, coeff16;
            coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
            // Mask out extra filter taps.
            coeff = _mm_and_si128(coeff, mask[r]);
            coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
            coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);

            // Note: line buffer must be padded to align_up(filter_offset, 16).
            // We resolve this by using the C version for the last horizontal line.
            __m128i src8 = _mm_loadu_si128(row_to_filter);
            __m128i src16 = _mm_unpacklo_epi8(src8, zero);
            __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
            __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
            __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
            accum = _mm_add_epi32(accum, t);
            t = _mm_unpackhi_epi16(mul_lo, mul_hi);
            accum = _mm_add_epi32(accum, t);

            src16 = _mm_unpackhi_epi8(src8, zero);
            coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
            coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
            mul_hi = _mm_mulhi_epi16(src16, coeff16);
            mul_lo = _mm_mullo_epi16(src16, coeff16);
            t = _mm_unpacklo_epi16(mul_lo, mul_hi);
            accum = _mm_add_epi32(accum, t);
        }

        // Shift right for fixed point implementation.
        accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits);
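        // Worked example (assuming kShiftBits were 14, i.e. coefficients stored
        // as value * (1 << 14)): a filter tap of 0.25 would be stored as 4096,
        // so a source byte of 200 contributes 200 * 4096 = 819200 to |accum|,
        // and the shift above turns that back into 200 * 0.25 = 50.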

        // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
        accum = _mm_packs_epi32(accum, zero);
        // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
        accum = _mm_packus_epi16(accum, zero);

        // Store the pixel value of 32 bits.
        *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum);
        out_row += 4;
    }
}

// Convolves horizontally along four rows. The row data is given in
// |src_data| and continues for the numValues() of the filter.
// The algorithm is almost the same as |convolveHorizontally_SSE2|. Please
// refer to that function for detailed comments.
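// Convolving four rows at a time lets the expanded coefficient registers
// (coeff16lo/coeff16hi below) be built once per group of four taps and then
// reused for all four rows.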
void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4],
                                    const SkConvolutionFilter1D& filter,
                                    unsigned char* out_row[4],
                                    size_t outRowBytes) {
    SkDEBUGCODE(const unsigned char* out_row_0_start = out_row[0];)

    int num_values = filter.numValues();

    int filter_offset, filter_length;
    __m128i zero = _mm_setzero_si128();
    __m128i mask[4];
    // |mask| will be used to decimate all extra filter coefficients that are
    // loaded by SIMD when |filter_length| is not divisible by 4.
    // mask[0] is not used in the following algorithm.
    mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
    mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
    mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);

    // Output one pixel each iteration, calculating all channels (RGBA) together.
    for (int out_x = 0; out_x < num_values; out_x++) {
        const SkConvolutionFilter1D::ConvolutionFixed* filter_values =
            filter.FilterForValue(out_x, &filter_offset, &filter_length);

        // Four pixels in a column per iteration.
        __m128i accum0 = _mm_setzero_si128();
        __m128i accum1 = _mm_setzero_si128();
        __m128i accum2 = _mm_setzero_si128();
        __m128i accum3 = _mm_setzero_si128();
        int start = (filter_offset << 2);
        // We will load and accumulate with four coefficients per iteration.
        for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) {
            __m128i coeff, coeff16lo, coeff16hi;
            // [16] xx xx xx xx c3 c2 c1 c0
            coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
            // [16] xx xx xx xx c1 c1 c0 c0
            coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
            // [16] c1 c1 c1 c1 c0 c0 c0 c0
            coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
            // [16] xx xx xx xx c3 c3 c2 c2
            coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
            // [16] c3 c3 c3 c3 c2 c2 c2 c2
            coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);

            __m128i src8, src16, mul_hi, mul_lo, t;

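// ITERATION performs one horizontal accumulation step for a single row: it
// loads four pixels from |src|, multiplies the low pixel pair by coeff16lo and
// the high pair by coeff16hi (widening the 16x16 products to 32 bits via
// mulhi/mullo + unpack), and adds the resulting 32-bit RGBA sums into |accum|.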
#define ITERATION(src, accum) \
    src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)); \
    src16 = _mm_unpacklo_epi8(src8, zero); \
    mul_hi = _mm_mulhi_epi16(src16, coeff16lo); \
    mul_lo = _mm_mullo_epi16(src16, coeff16lo); \
    t = _mm_unpacklo_epi16(mul_lo, mul_hi); \
    accum = _mm_add_epi32(accum, t); \
    t = _mm_unpackhi_epi16(mul_lo, mul_hi); \
    accum = _mm_add_epi32(accum, t); \
    src16 = _mm_unpackhi_epi8(src8, zero); \
    mul_hi = _mm_mulhi_epi16(src16, coeff16hi); \
    mul_lo = _mm_mullo_epi16(src16, coeff16hi); \
    t = _mm_unpacklo_epi16(mul_lo, mul_hi); \
    accum = _mm_add_epi32(accum, t); \
    t = _mm_unpackhi_epi16(mul_lo, mul_hi); \
    accum = _mm_add_epi32(accum, t)

            ITERATION(src_data[0] + start, accum0);
            ITERATION(src_data[1] + start, accum1);
            ITERATION(src_data[2] + start, accum2);
            ITERATION(src_data[3] + start, accum3);

            start += 16;
            filter_values += 4;
        }

        int r = filter_length & 3;
        if (r) {
            // Note: filter_values must be padded to align_up(filter_offset, 8);
            __m128i coeff;
            coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
            // Mask out extra filter taps.
            coeff = _mm_and_si128(coeff, mask[r]);

            __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
            /* c1 c1 c1 c1 c0 c0 c0 c0 */
            coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
            __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
            coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);

            __m128i src8, src16, mul_hi, mul_lo, t;

            ITERATION(src_data[0] + start, accum0);
            ITERATION(src_data[1] + start, accum1);
            ITERATION(src_data[2] + start, accum2);
            ITERATION(src_data[3] + start, accum3);
        }

        accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
        accum0 = _mm_packs_epi32(accum0, zero);
        accum0 = _mm_packus_epi16(accum0, zero);
        accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
        accum1 = _mm_packs_epi32(accum1, zero);
        accum1 = _mm_packus_epi16(accum1, zero);
        accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
        accum2 = _mm_packs_epi32(accum2, zero);
        accum2 = _mm_packus_epi16(accum2, zero);
        accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits);
        accum3 = _mm_packs_epi32(accum3, zero);
        accum3 = _mm_packus_epi16(accum3, zero);

        // We seem to be running off the edge here (chromium:491660).
        SkASSERT(((size_t)out_row[0] - (size_t)out_row_0_start) < outRowBytes);

        *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0);
        *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1);
        *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2);
        *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3);

        out_row[0] += 4;
        out_row[1] += 4;
        out_row[2] += 4;
        out_row[3] += 4;
    }
}

// Does vertical convolution to produce one output row. The filter values and
// length are given in the first two parameters. These are applied to each
// of the rows pointed to in the |source_data_rows| array, with each row
// being |pixel_width| wide.
//
// The output must have room for |pixel_width * 4| bytes.
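//
// For reference, a scalar sketch of what this routine computes (illustrative
// only; clampTo8 is a hypothetical helper):
//
//   for (int b = 0; b < pixel_width * 4; b++) {   // every byte of the row
//       int sum = 0;
//       for (int i = 0; i < filter_length; i++)
//           sum += filter_values[i] * source_data_rows[i][b];
//       out_row[b] = clampTo8(sum >> SkConvolutionFilter1D::kShiftBits);
//   }
//
// plus the alpha fix-up performed at the end of the main loop below.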
template<bool has_alpha>
void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filter_values,
                             int filter_length,
                             unsigned char* const* source_data_rows,
                             int pixel_width,
                             unsigned char* out_row) {
    int width = pixel_width & ~3;

    __m128i zero = _mm_setzero_si128();
    __m128i accum0, accum1, accum2, accum3, coeff16;
    const __m128i* src;
    // Output four pixels per iteration (16 bytes).
    for (int out_x = 0; out_x < width; out_x += 4) {

        // Accumulated result for each pixel. 32 bits per RGBA channel.
        accum0 = _mm_setzero_si128();
        accum1 = _mm_setzero_si128();
        accum2 = _mm_setzero_si128();
        accum3 = _mm_setzero_si128();

        // Convolve with one filter coefficient per iteration.
        for (int filter_y = 0; filter_y < filter_length; filter_y++) {

            // Duplicate the filter coefficient 8 times.
            // [16] cj cj cj cj cj cj cj cj
            coeff16 = _mm_set1_epi16(filter_values[filter_y]);

            // Load four pixels (16 bytes) together.
            // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
            src = reinterpret_cast<const __m128i*>(
                &source_data_rows[filter_y][out_x << 2]);
            __m128i src8 = _mm_loadu_si128(src);

            // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channel =>
            // multiply with current coefficient => accumulate the result.
            // [16] a1 b1 g1 r1 a0 b0 g0 r0
            __m128i src16 = _mm_unpacklo_epi8(src8, zero);
            __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
            __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
            // [32] a0 b0 g0 r0
            __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
            accum0 = _mm_add_epi32(accum0, t);
            // [32] a1 b1 g1 r1
            t = _mm_unpackhi_epi16(mul_lo, mul_hi);
            accum1 = _mm_add_epi32(accum1, t);

            // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channel =>
            // multiply with current coefficient => accumulate the result.
            // [16] a3 b3 g3 r3 a2 b2 g2 r2
            src16 = _mm_unpackhi_epi8(src8, zero);
            mul_hi = _mm_mulhi_epi16(src16, coeff16);
            mul_lo = _mm_mullo_epi16(src16, coeff16);
            // [32] a2 b2 g2 r2
            t = _mm_unpacklo_epi16(mul_lo, mul_hi);
            accum2 = _mm_add_epi32(accum2, t);
            // [32] a3 b3 g3 r3
            t = _mm_unpackhi_epi16(mul_lo, mul_hi);
            accum3 = _mm_add_epi32(accum3, t);
        }

        // Shift right for fixed point implementation.
        accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
        accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
        accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
        accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits);

        // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
        // [16] a1 b1 g1 r1 a0 b0 g0 r0
        accum0 = _mm_packs_epi32(accum0, accum1);
        // [16] a3 b3 g3 r3 a2 b2 g2 r2
        accum2 = _mm_packs_epi32(accum2, accum3);

        // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
        // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
        accum0 = _mm_packus_epi16(accum0, accum2);

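        // The convolution (which can overshoot when the filter has negative
        // lobes) may leave a color channel larger than alpha. Skia colors are
        // premultiplied (each of r, g, b must be <= a), so when the source has
        // real alpha we clamp alpha up to max(r, g, b) below to restore that
        // invariant; otherwise alpha is simply forced to 0xFF.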
        if (has_alpha) {
            // Compute the max(ri, gi, bi) for each pixel.
            // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
            __m128i a = _mm_srli_epi32(accum0, 8);
            // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
            __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
            // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
            a = _mm_srli_epi32(accum0, 16);
            // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
            b = _mm_max_epu8(a, b);  // Max of r and g and b.
            // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
            b = _mm_slli_epi32(b, 24);

            // Make sure the value of the alpha channel is at least as large as
            // the maximum of the color channels.
            accum0 = _mm_max_epu8(b, accum0);
        } else {
            // Set value of alpha channels to 0xFF.
            __m128i mask = _mm_set1_epi32(0xff000000);
            accum0 = _mm_or_si128(accum0, mask);
        }

        // Store the convolution result (16 bytes) and advance the pixel pointers.
        _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0);
        out_row += 16;
    }

    // When the width of the output is not divisible by 4, we need to save one
    // pixel (4 bytes) at a time, and the fourth pixel is always absent.
    if (pixel_width & 3) {
        accum0 = _mm_setzero_si128();
        accum1 = _mm_setzero_si128();
        accum2 = _mm_setzero_si128();
        for (int filter_y = 0; filter_y < filter_length; ++filter_y) {
            coeff16 = _mm_set1_epi16(filter_values[filter_y]);
            // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
            src = reinterpret_cast<const __m128i*>(
                &source_data_rows[filter_y][width << 2]);
            __m128i src8 = _mm_loadu_si128(src);
            // [16] a1 b1 g1 r1 a0 b0 g0 r0
            __m128i src16 = _mm_unpacklo_epi8(src8, zero);
            __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
            __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
            // [32] a0 b0 g0 r0
            __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
            accum0 = _mm_add_epi32(accum0, t);
            // [32] a1 b1 g1 r1
            t = _mm_unpackhi_epi16(mul_lo, mul_hi);
            accum1 = _mm_add_epi32(accum1, t);
            // [16] a3 b3 g3 r3 a2 b2 g2 r2
            src16 = _mm_unpackhi_epi8(src8, zero);
            mul_hi = _mm_mulhi_epi16(src16, coeff16);
            mul_lo = _mm_mullo_epi16(src16, coeff16);
            // [32] a2 b2 g2 r2
            t = _mm_unpacklo_epi16(mul_lo, mul_hi);
            accum2 = _mm_add_epi32(accum2, t);
        }

        accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
        accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
        accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
        // [16] a1 b1 g1 r1 a0 b0 g0 r0
        accum0 = _mm_packs_epi32(accum0, accum1);
        // [16] a3 b3 g3 r3 a2 b2 g2 r2
        accum2 = _mm_packs_epi32(accum2, zero);
        // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
        accum0 = _mm_packus_epi16(accum0, accum2);
        if (has_alpha) {
            // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
            __m128i a = _mm_srli_epi32(accum0, 8);
            // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
            __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
            // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
            a = _mm_srli_epi32(accum0, 16);
            // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
            b = _mm_max_epu8(a, b);  // Max of r and g and b.
            // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
            b = _mm_slli_epi32(b, 24);
            accum0 = _mm_max_epu8(b, accum0);
        } else {
            __m128i mask = _mm_set1_epi32(0xff000000);
            accum0 = _mm_or_si128(accum0, mask);
        }

        for (int out_x = width; out_x < pixel_width; out_x++) {
            *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0);
            accum0 = _mm_srli_si128(accum0, 4);
            out_row += 4;
        }
    }
}

void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filter_values,
                             int filter_length,
                             unsigned char* const* source_data_rows,
                             int pixel_width,
                             unsigned char* out_row,
                             bool has_alpha) {
    if (has_alpha) {
        convolveVertically_SSE2<true>(filter_values,
                                      filter_length,
                                      source_data_rows,
                                      pixel_width,
                                      out_row);
    } else {
        convolveVertically_SSE2<false>(filter_values,
                                       filter_length,
                                       source_data_rows,
                                       pixel_width,
                                       out_row);
    }
}

void applySIMDPadding_SSE2(SkConvolutionFilter1D* filter) {
    // Pad eight more dummy coefficients after the coefficients of the last
    // filter to prevent the SIMD instructions, which load 8 or 16 bytes at a
    // time, from reading past the end of valid memory. We are not trying to
    // align the coefficients right now due to the opaqueness of the <vector>
    // implementation. This has to be done after all |AddFilter| calls.
    for (int i = 0; i < 8; ++i) {
        filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFixed>(0));
    }
}