1 /*
2  * Copyright (C) 2011 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <stdint.h>
18 #include <x86intrin.h>
19 
20 /* Unsigned-extend the four packed 8-bit integers in the low 32 bits into packed 32-bit integers */
21 static inline __m128i cvtepu8_epi32(__m128i x) {
22 #if defined(__SSE4_1__)
23     return _mm_cvtepu8_epi32(x);
24 #elif defined(__SSSE3__)
25     const __m128i M8to32 = _mm_set_epi32(0xffffff03, 0xffffff02, 0xffffff01, 0xffffff00);
26     x = _mm_shuffle_epi8(x, M8to32);
27     return x;
28 #else
29 #   error "Require at least SSSE3"
30 #endif
31 }
32 
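/* Pack signed 32-bit integers into unsigned 16-bit with saturation.
 * The SSSE3 fallback clamps each lane to [0, 0xffff] (negative lanes are
 * zeroed, lanes above 0xffff are forced to all-ones so their low halves read
 * as 0xffff) and then gathers the low 16 bits of both inputs with a byte
 * shuffle. */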
33 static inline __m128i packus_epi32(__m128i lo, __m128i hi) {
34 #if defined(__SSE4_1__)
35     return _mm_packus_epi32(lo, hi);
36 #elif defined(__SSSE3__)
37     const __m128i C0 = _mm_set_epi32(0x0000, 0x0000, 0x0000, 0x0000);
38     const __m128i C1 = _mm_set_epi32(0xffff, 0xffff, 0xffff, 0xffff);
39     const __m128i M32to16L = _mm_set_epi32(0xffffffff, 0xffffffff, 0x0d0c0908, 0x05040100);
40     const __m128i M32to16H = _mm_set_epi32(0x0d0c0908, 0x05040100, 0xffffffff, 0xffffffff);
41     lo = _mm_and_si128(lo, _mm_cmpgt_epi32(lo, C0));
42     lo = _mm_or_si128(lo, _mm_cmpgt_epi32(lo, C1));
43     hi = _mm_and_si128(hi, _mm_cmpgt_epi32(hi, C0));
44     hi = _mm_or_si128(hi, _mm_cmpgt_epi32(hi, C1));
45     return _mm_or_si128(_mm_shuffle_epi8(lo, M32to16L),
46                         _mm_shuffle_epi8(hi, M32to16H));
47 #else
48 #   error "Require at least SSSE3"
49 #endif
50 }
51 
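/* Multiply packed 32-bit integers, keeping the low 32 bits of each product.
 * The SSSE3 fallback performs two widening _mm_mul_epu32 multiplies (even and
 * odd lanes) and merges the low halves of the 64-bit products. */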
52 static inline __m128i mullo_epi32(__m128i x, __m128i y) {
53 #if defined(__SSE4_1__)
54     return _mm_mullo_epi32(x, y);
55 #elif defined(__SSSE3__)
56     const __m128i Meven = _mm_set_epi32(0x00000000, 0xffffffff, 0x00000000, 0xffffffff);
57     __m128i even = _mm_mul_epu32(x, y);
58     __m128i odd = _mm_mul_epu32(_mm_srli_si128(x, 4),
59                                 _mm_srli_si128(y, 4));
60     even = _mm_and_si128(even, Meven);
61     odd = _mm_and_si128(odd, Meven);
62     return _mm_or_si128(even, _mm_slli_si128(odd, 4));
63 #else
64 #   error "Require at least SSSE3"
65 #endif
66 }
67 
68 /* 'mask' must be packed 8-bit lanes, each either 0x00 or 0xff */
69 static inline __m128i blendv_epi8(__m128i x, __m128i y, __m128i mask) {
70 #if defined(__SSE4_1__)
71     return _mm_blendv_epi8(x, y, mask);
72 #elif defined(__SSSE3__)
73     return _mm_or_si128(_mm_andnot_si128(mask, x), _mm_and_si128(y, mask));
74 #else
75 #   error "Require at least SSSE3"
76 #endif
77 }
78 
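/* 3x3 convolution over 8-bit RGBA pixels.  'coef' holds nine 16-bit
 * coefficients, loaded in adjacent pairs into c0..c8; each iteration produces
 * two output pixels (8 bytes).  Neighboring pixels are zero-extended to
 * 16 bits, multiply-accumulated in pairs with _mm_madd_epi16, shifted right
 * by 8 (the coefficients appear to be 8-bit fixed point), and packed back to
 * unsigned bytes with saturation. */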
79 void rsdIntrinsicConvolve3x3_K(void *dst,
80                                const void *y0, const void *y1, const void *y2,
81                                const short *coef, uint32_t count) {
82     __m128i x;
83     __m128i c0, c2, c4, c6, c8;
84     __m128i r0, r1, r2;
85     __m128i p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11;
86     __m128i o0, o1;
87     uint32_t i;
88 
89     x = _mm_loadl_epi64((const __m128i *)(coef+0));
90     c0 = _mm_shuffle_epi32(x, 0x00);
91     c2 = _mm_shuffle_epi32(x, 0x55);
92     x = _mm_loadl_epi64((const __m128i *)(coef+4));
93     c4 = _mm_shuffle_epi32(x, 0x00);
94     c6 = _mm_shuffle_epi32(x, 0x55);
95     x = _mm_loadl_epi64((const __m128i *)(coef+8));
96     c8 = _mm_shuffle_epi32(x, 0x00);
97 
98     for (i = 0; i < count; ++i) {
99 
100         p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0)), _mm_setzero_si128());
101         p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+1)), _mm_setzero_si128());
102         p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+2)), _mm_setzero_si128());
103         p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+3)), _mm_setzero_si128());
104         p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1)), _mm_setzero_si128());
105         p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+1)), _mm_setzero_si128());
106         p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+2)), _mm_setzero_si128());
107         p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+3)), _mm_setzero_si128());
108         p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2)), _mm_setzero_si128());
109         p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+1)), _mm_setzero_si128());
110         p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+2)), _mm_setzero_si128());
111         p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+3)), _mm_setzero_si128());
112 
113         o0 = _mm_madd_epi16(_mm_unpacklo_epi16(p0, p1), c0);
114         o1 = _mm_madd_epi16(_mm_unpacklo_epi16(p1, p2), c0);
115 
116         o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p2, p4), c2));
117         o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p3, p5), c2));
118 
119         o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p5, p6), c4));
120         o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p6, p7), c4));
121 
122         o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p8, p9), c6));
123         o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p9, p10), c6));
124 
125         o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p10, _mm_setzero_si128()), c8));
126         o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p11, _mm_setzero_si128()), c8));
127 
128         o0 = _mm_srai_epi32(o0, 8);
129         o1 = _mm_srai_epi32(o1, 8);
130 
131         o0 = packus_epi32(o0, o1);
132         o0 = _mm_packus_epi16(o0, o0);
133         _mm_storel_epi64((__m128i *)dst, o0);
134 
135         y0 = (const char *)y0 + 8;
136         y1 = (const char *)y1 + 8;
137         y2 = (const char *)y2 + 8;
138         dst = (char *)dst + 8;
139     }
140 }
141 
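/* 4x4 color matrix applied to 8-bit RGBA pixels, four pixels per iteration.
 * Mxy/Mzw de-interleave each pixel into 16-bit (R,G) and (B,A) pairs, the
 * coefficient pairs (interleaved into c0 and c2) are applied with
 * _mm_madd_epi16, the sums are shifted right by 8 (fixed-point coefficients),
 * packed back to bytes, and T4x4 restores the interleaved RGBA byte order. */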
142 void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src,
143                                   const short *coef, uint32_t count) {
144     const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
145                                       14, 10, 6, 2,
146                                       13,  9, 5, 1,
147                                       12,  8, 4, 0);
148 
149     const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
150     const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
151     __m128i c0, c1, c2, c3;
152     __m128i i4, o4;
153     __m128i xy, zw;
154     __m128i x2, y2, z2, w2;
155     uint32_t i;
156 
157     c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
158     c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
159     c0 = _mm_unpacklo_epi16(c0, c1);
160 
161     c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
162     c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
163     c2 = _mm_unpacklo_epi16(c2, c3);
164 
165     for (i = 0; i < count; ++i) {
166         i4 = _mm_loadu_si128((const __m128i *)src);
167         xy = _mm_shuffle_epi8(i4, Mxy);
168         zw = _mm_shuffle_epi8(i4, Mzw);
169 
170         x2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x00));
171         y2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x55));
172         z2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xaa));
173         w2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xff));
174 
175         x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x00)));
176         y2 = _mm_add_epi32(y2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x55)));
177         z2 = _mm_add_epi32(z2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xaa)));
178         w2 = _mm_add_epi32(w2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xff)));
179 
180         x2 = _mm_srai_epi32(x2, 8);
181         y2 = _mm_srai_epi32(y2, 8);
182         z2 = _mm_srai_epi32(z2, 8);
183         w2 = _mm_srai_epi32(w2, 8);
184 
185         x2 = packus_epi32(x2, y2);
186         z2 = packus_epi32(z2, w2);
187         o4 = _mm_packus_epi16(x2, z2);
188 
189         o4 = _mm_shuffle_epi8(o4, T4x4);
190         _mm_storeu_si128((__m128i *)dst, o4);
191 
192         src = (const char *)src + 16;
193         dst = (char *)dst + 16;
194     }
195 }
196 
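/* Same as the 4x4 color matrix above, except that the alpha channel is not
 * transformed: w2 recovers the original alpha from the (B,A) lanes with a
 * 16-bit shift, so alpha is passed through unchanged. */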
197 void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src,
198                                   const short *coef, uint32_t count) {
199     const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
200                                       14, 10, 6, 2,
201                                       13,  9, 5, 1,
202                                       12,  8, 4, 0);
203 
204     const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
205     const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
206 
207     __m128i c0, c1, c2, c3;
208     __m128i i4, o4;
209     __m128i xy, zw;
210     __m128i x2, y2, z2, w2;
211     uint32_t i;
212 
213     c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
214     c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
215     c0 = _mm_unpacklo_epi16(c0, c1);
216 
217     c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
218     c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
219     c2 = _mm_unpacklo_epi16(c2, c3);
220 
221     for (i = 0; i < count; ++i) {
222         i4 = _mm_loadu_si128((const __m128i *)src);
223         xy = _mm_shuffle_epi8(i4, Mxy);
224         zw = _mm_shuffle_epi8(i4, Mzw);
225 
226         x2 =  _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x00));
227         y2 =  _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x55));
228         z2 =  _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xaa));
229 
230         x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x00)));
231         y2 = _mm_add_epi32(y2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x55)));
232         z2 = _mm_add_epi32(z2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xaa)));
233 
234         x2 = _mm_srai_epi32(x2, 8);
235         y2 = _mm_srai_epi32(y2, 8);
236         z2 = _mm_srai_epi32(z2, 8);
237         w2 = _mm_srli_epi32(zw, 16);
238 
239         x2 = packus_epi32(x2, y2);
240         z2 = packus_epi32(z2, w2);
241         o4 = _mm_packus_epi16(x2, z2);
242 
243         o4 = _mm_shuffle_epi8(o4, T4x4);
244         _mm_storeu_si128((__m128i *)dst, o4);
245 
246         src = (const char *)src + 16;
247         dst = (char *)dst + 16;
248     }
249 }
250 
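/* "Dot" variant of the color matrix: a single coefficient per input channel
 * (coef[0], coef[4], coef[8], coef[12]) is broadcast, one dot product is
 * computed per pixel and written to R, G and B, and alpha is passed through
 * as in the 3x3 variant. */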
251 void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src,
252                                   const short *coef, uint32_t count) {
253     const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
254                                       14, 10, 6, 2,
255                                       13,  9, 5, 1,
256                                       12,  8, 4, 0);
257     const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
258     const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
259     __m128i c0, c1, c2, c3;
260     __m128i i4, o4;
261     __m128i xy, zw;
262     __m128i x2, y2, z2, w2;
263     uint32_t i;
264 
265     c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
266     c0 = _mm_shufflelo_epi16(c0, 0);
267     c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
268     c1 = _mm_shufflelo_epi16(c1, 0);
269     c0 = _mm_unpacklo_epi16(c0, c1);
270 
271     c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
272     c2 = _mm_shufflelo_epi16(c2, 0);
273     c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
274     c3 = _mm_shufflelo_epi16(c3, 0);
275     c2 = _mm_unpacklo_epi16(c2, c3);
276 
277     for (i = 0; i < count; ++i) {
278         i4 = _mm_loadu_si128((const __m128i *)src);
279 
280         xy = _mm_shuffle_epi8(i4, Mxy);
281         zw = _mm_shuffle_epi8(i4, Mzw);
282 
283         x2 =  _mm_madd_epi16(xy, c0);
284         x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, c2));
285 
286         x2 = _mm_srai_epi32(x2, 8);
287         y2 = x2;
288         z2 = x2;
289         w2 = _mm_srli_epi32(zw, 16);
290 
291         x2 = packus_epi32(x2, y2);
292         z2 = packus_epi32(z2, w2);
293         o4 = _mm_packus_epi16(x2, z2);
294 
295         o4 = _mm_shuffle_epi8(o4, T4x4);
296         _mm_storeu_si128((__m128i *)dst, o4);
297 
298         src = (const char *)src + 16;
299         dst = (char *)dst + 16;
300     }
301 }
302 
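/* Vertical blur pass: for two adjacent RGBA pixels at column x1, accumulate
 * 'rct' rows (separated by 'stride' bytes), each weighted by the float kernel
 * coefficient in 'gptr', and store the two accumulated pixels as eight
 * unclamped floats. */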
303 void rsdIntrinsicBlurVFU4_K(void *dst,
304                           const void *pin, int stride, const void *gptr,
305                           int rct, int x1, int x2) {
306     const char *pi;
307     __m128i pi0, pi1;
308     __m128 pf0, pf1;
309     __m128 bp0, bp1;
310     __m128 x;
311     int r;
312 
313     for (; x1 < x2; x1 += 2) {
314         pi = (const char *)pin + (x1 << 2);
315         bp0 = _mm_setzero_ps();
316         bp1 = _mm_setzero_ps();
317 
318         for (r = 0; r < rct; ++r) {
319             x = _mm_load_ss((const float *)gptr + r);
320             x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));
321 
322             pi0 = _mm_cvtsi32_si128(*(const int *)pi);
323             pi1 = _mm_cvtsi32_si128(*((const int *)pi + 1));
324 
325             pf0 = _mm_cvtepi32_ps(cvtepu8_epi32(pi0));
326             pf1 = _mm_cvtepi32_ps(cvtepu8_epi32(pi1));
327 
328             bp0 = _mm_add_ps(bp0, _mm_mul_ps(pf0, x));
329             bp1 = _mm_add_ps(bp1, _mm_mul_ps(pf1, x));
330 
331             pi += stride;
332         }
333 
334         _mm_storeu_ps((float *)dst, bp0);
335         _mm_storeu_ps((float *)dst + 4, bp1);
336         dst = (char *)dst + 32;
337     }
338 }
339 
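/* Horizontal blur pass over the float output of the vertical pass: one RGBA
 * pixel per iteration, with the tap loop unrolled by two.  The accumulated
 * floats are converted back to integers and Mu8 gathers one byte per channel
 * for the 8-bit RGBA result. */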
340 void rsdIntrinsicBlurHFU4_K(void *dst,
341                           const void *pin, const void *gptr,
342                           int rct, int x1, int x2) {
343     const __m128i Mu8 = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0c080400);
344     const float *pi;
345     __m128 pf, x, y;
346     __m128i o;
347     int r;
348 
349     for (; x1 < x2; ++x1) {
350         /* rct is defined as 2*r+1 by the caller */
351         x = _mm_load_ss((const float *)gptr);
352         x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));
353 
354         pi = (const float *)pin + (x1 << 2);
355         pf = _mm_mul_ps(x, _mm_load_ps(pi));
356 
357         for (r = 1; r < rct; r += 2) {
358             x = _mm_load_ss((const float *)gptr + r);
359             y = _mm_load_ss((const float *)gptr + r + 1);
360             x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));
361             y = _mm_shuffle_ps(y, y, _MM_SHUFFLE(0, 0, 0, 0));
362 
363             pf = _mm_add_ps(pf, _mm_mul_ps(x, _mm_load_ps(pi + (r << 2))));
364             pf = _mm_add_ps(pf, _mm_mul_ps(y, _mm_load_ps(pi + (r << 2) + 4)));
365         }
366 
367         o = _mm_cvtps_epi32(pf);
368         *(int *)dst = _mm_cvtsi128_si32(_mm_shuffle_epi8(o, Mu8));
369         dst = (char *)dst + 4;
370     }
371 }
372 
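/* Single-channel (U8) horizontal blur: four output pixels per iteration.  The
 * tap loop is unrolled by four; _mm_alignr_epi8 (with casts between the float
 * and integer vector types) slides the eight-float window by one element per
 * tap. */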
373 void rsdIntrinsicBlurHFU1_K(void *dst,
374                           const void *pin, const void *gptr,
375                           int rct, int x1, int x2) {
376     const __m128i Mu8 = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0c080400);
377     const float *pi;
378     __m128 pf, g0, g1, g2, g3, gx, p0, p1;
379     __m128i o;
380     int r;
381 
382     for (; x1 < x2; x1+=4) {
383         g0 = _mm_load_ss((const float *)gptr);
384         g0 = _mm_shuffle_ps(g0, g0, _MM_SHUFFLE(0, 0, 0, 0));
385 
386         pi = (const float *)pin + x1;
387         pf = _mm_mul_ps(g0, _mm_loadu_ps(pi));
388 
389         for (r = 1; r < rct; r += 4) {
390             gx = _mm_loadu_ps((const float *)gptr + r);
391             p0 = _mm_loadu_ps(pi + r);
392             p1 = _mm_loadu_ps(pi + r + 4);
393 
394             g0 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(0, 0, 0, 0));
395             pf = _mm_add_ps(pf, _mm_mul_ps(g0, p0));
396             g1 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(1, 1, 1, 1));
397             pf = _mm_add_ps(pf, _mm_mul_ps(g1, _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(p1), _mm_castps_si128(p0), 4))));
398             g2 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(2, 2, 2, 2));
399             pf = _mm_add_ps(pf, _mm_mul_ps(g2, _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(p1), _mm_castps_si128(p0), 8))));
400             g3 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(3, 3, 3, 3));
401             pf = _mm_add_ps(pf, _mm_mul_ps(g3, _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(p1), _mm_castps_si128(p0), 12))));
402         }
403 
404         o = _mm_cvtps_epi32(pf);
405         *(int *)dst = _mm_cvtsi128_si32(_mm_shuffle_epi8(o, Mu8));
406         dst = (char *)dst + 4;
407     }
408 }
409 
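/* YUV (interleaved chroma) to RGBA conversion, four pixels per iteration.
 * 'param' supplies integer coefficients scaled by 256; the values noted in
 * the comments below (298, 409, -100, 516, -208, biases 16/128) match the
 * usual BT.601 conversion, i.e. per pixel (a scalar sketch, not part of the
 * kernel):
 *
 *     R = (298 * (Y - 16)                    + 409 * (V - 128) + 128) >> 8;
 *     G = (298 * (Y - 16) - 100 * (U - 128)  - 208 * (V - 128) + 128) >> 8;
 *     B = (298 * (Y - 16) + 516 * (U - 128)                    + 128) >> 8;
 *
 * biasUV (128) doubles as the rounding term added before the shift; the
 * saturating packs clamp to [0, 255] and alpha is set to 255. */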
410 void rsdIntrinsicYuv_K(void *dst,
411                        const unsigned char *pY, const unsigned char *pUV,
412                        uint32_t count, const short *param) {
413     __m128i biasY, biasUV;
414     __m128i c0, c1, c2, c3, c4;
415 
416     biasY = _mm_set1_epi32(param[8]);   /*  16 */
417     biasUV = _mm_set1_epi32(param[16]); /* 128 */
418 
419     c0 = _mm_set1_epi32(param[0]);  /*  298 */
420     c1 = _mm_set1_epi32(param[1]);  /*  409 */
421     c2 = _mm_set1_epi32(param[2]);  /* -100 */
422     c3 = _mm_set1_epi32(param[3]);  /*  516 */
423     c4 = _mm_set1_epi32(param[4]);  /* -208 */
424 
425     __m128i Y, UV, U, V, R, G, B, A;
426 
427     A = _mm_set1_epi32(255);
428     uint32_t i;
429 
430     for (i = 0; i < (count << 1); ++i) {
431         Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
432         UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV));
433 
434         Y = _mm_sub_epi32(Y, biasY);
435         UV = _mm_sub_epi32(UV, biasUV);
436 
437         U = _mm_shuffle_epi32(UV, 0xf5);
438         V = _mm_shuffle_epi32(UV, 0xa0);
439 
440         Y = mullo_epi32(Y, c0);
441 
442         R = _mm_add_epi32(Y, mullo_epi32(V, c1));
443         R = _mm_add_epi32(R, biasUV);
444         R = _mm_srai_epi32(R, 8);
445 
446         G = _mm_add_epi32(Y, mullo_epi32(U, c2));
447         G = _mm_add_epi32(G, mullo_epi32(V, c4));
448         G = _mm_add_epi32(G, biasUV);
449         G = _mm_srai_epi32(G, 8);
450 
451         B = _mm_add_epi32(Y, mullo_epi32(U, c3));
452         B = _mm_add_epi32(B, biasUV);
453         B = _mm_srai_epi32(B, 8);
454 
455         __m128i y1, y2, y3, y4;
456 
457         y1 = packus_epi32(R, G);
458         y2 = packus_epi32(B, A);
459         y3 = _mm_packus_epi16(y1, y2);
460         const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
461                                           14, 10, 6, 2,
462                                           13,  9, 5, 1,
463                                           12,  8, 4, 0);
464         y4 = _mm_shuffle_epi8(y3, T4x4);
465         _mm_storeu_si128((__m128i *)dst, y4);
466         pY += 4;
467         pUV += 4;
468         dst = (__m128i *)dst + 1;
469     }
470 }
471 
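/* Same conversion as rsdIntrinsicYuv_K above, but with the two interleaved
 * chroma bytes read in the opposite order (U and V taken from the swapped
 * lanes of pUV), presumably covering the other NV12/NV21 byte order. */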
472 void rsdIntrinsicYuvR_K(void *dst,
473                        const unsigned char *pY, const unsigned char *pUV,
474                        uint32_t count, const short *param) {
475     __m128i biasY, biasUV;
476     __m128i c0, c1, c2, c3, c4;
477 
478     biasY = _mm_set1_epi32(param[8]);   /*  16 */
479     biasUV = _mm_set1_epi32(param[16]); /* 128 */
480 
481     c0 = _mm_set1_epi32(param[0]);  /*  298 */
482     c1 = _mm_set1_epi32(param[1]);  /*  409 */
483     c2 = _mm_set1_epi32(param[2]);  /* -100 */
484     c3 = _mm_set1_epi32(param[3]);  /*  516 */
485     c4 = _mm_set1_epi32(param[4]);  /* -208 */
486 
487     __m128i Y, UV, U, V, R, G, B, A;
488 
489     A = _mm_set1_epi32(255);
490     uint32_t i;
491 
492     for (i = 0; i < (count << 1); ++i) {
493         Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
494         UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV));
495 
496         Y = _mm_sub_epi32(Y, biasY);
497         UV = _mm_sub_epi32(UV, biasUV);
498 
499         V = _mm_shuffle_epi32(UV, 0xf5);
500         U = _mm_shuffle_epi32(UV, 0xa0);
501 
502         Y = mullo_epi32(Y, c0);
503 
504         R = _mm_add_epi32(Y, mullo_epi32(V, c1));
505         R = _mm_add_epi32(R, biasUV);
506         R = _mm_srai_epi32(R, 8);
507 
508         G = _mm_add_epi32(Y, mullo_epi32(U, c2));
509         G = _mm_add_epi32(G, mullo_epi32(V, c4));
510         G = _mm_add_epi32(G, biasUV);
511         G = _mm_srai_epi32(G, 8);
512 
513         B = _mm_add_epi32(Y, mullo_epi32(U, c3));
514         B = _mm_add_epi32(B, biasUV);
515         B = _mm_srai_epi32(B, 8);
516 
517         __m128i y1, y2, y3, y4;
518 
519         y1 = packus_epi32(R, G);
520         y2 = packus_epi32(B, A);
521         y3 = _mm_packus_epi16(y1, y2);
522         const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
523                                           14, 10, 6, 2,
524                                           13,  9, 5, 1,
525                                           12,  8, 4, 0);
526         y4 = _mm_shuffle_epi8(y3, T4x4);
527         _mm_storeu_si128((__m128i *)dst, y4);
528         pY += 4;
529         pUV += 4;
530         dst = (__m128i *)dst + 1;
531     }
532 }
533 
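/* Same conversion again, but for planar chroma: U and V are read from two
 * separate planes (pU, pV) instead of one interleaved pUV plane. */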
534 void rsdIntrinsicYuv2_K(void *dst,
535                        const unsigned char *pY, const unsigned char *pU,
536                        const unsigned char *pV, uint32_t count, const short *param) {
537     __m128i biasY, biasUV;
538     __m128i c0, c1, c2, c3, c4;
539 
540     biasY = _mm_set1_epi32(param[8]);   /*  16 */
541     biasUV = _mm_set1_epi32(param[16]); /* 128 */
542 
543     c0 = _mm_set1_epi32(param[0]);  /*  298 */
544     c1 = _mm_set1_epi32(param[1]);  /*  409 */
545     c2 = _mm_set1_epi32(param[2]);  /* -100 */
546     c3 = _mm_set1_epi32(param[3]);  /*  516 */
547     c4 = _mm_set1_epi32(param[4]);  /* -208 */
548 
549     __m128i Y, U, V, R, G, B, A;
550 
551     A = _mm_set1_epi32(255);
552     uint32_t i;
553 
554     for (i = 0; i < (count << 1); ++i) {
555         Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
556         U = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pU));
557         V = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pV));
558 
559         Y = _mm_sub_epi32(Y, biasY);
560         U = _mm_sub_epi32(U, biasUV);
561         V = _mm_sub_epi32(V, biasUV);
562 
563         Y = mullo_epi32(Y, c0);
564 
565         R = _mm_add_epi32(Y, mullo_epi32(V, c1));
566         R = _mm_add_epi32(R, biasUV);
567         R = _mm_srai_epi32(R, 8);
568 
569         G = _mm_add_epi32(Y, mullo_epi32(U, c2));
570         G = _mm_add_epi32(G, mullo_epi32(V, c4));
571         G = _mm_add_epi32(G, biasUV);
572         G = _mm_srai_epi32(G, 8);
573 
574         B = _mm_add_epi32(Y, mullo_epi32(U, c3));
575         B = _mm_add_epi32(B, biasUV);
576         B = _mm_srai_epi32(B, 8);
577 
578         __m128i y1, y2, y3, y4;
579 
580         y1 = packus_epi32(R, G);
581         y2 = packus_epi32(B, A);
582         y3 = _mm_packus_epi16(y1, y2);
583         const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
584                                           14, 10, 6, 2,
585                                           13,  9, 5, 1,
586                                           12,  8, 4, 0);
587         y4 = _mm_shuffle_epi8(y3, T4x4);
588         _mm_storeu_si128((__m128i *)dst, y4);
589         pY += 4;
590         pU += 4;
591         pV += 4;
592         dst = (__m128i *)dst + 1;
593     }
594 }
595 
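/* 5x5 convolution over 8-bit RGBA pixels.  The 25 coefficients are loaded in
 * pairs into c0..c24 and each iteration produces four output pixels
 * (16 bytes), accumulating 13 _mm_madd_epi16 terms per pixel before the
 * fixed-point shift and the saturating packs. */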
596 void rsdIntrinsicConvolve5x5_K(void *dst, const void *y0, const void *y1,
597                                const void *y2, const void *y3, const void *y4,
598                                const short *coef, uint32_t count) {
599     __m128i x;
600     __m128i c0, c2, c4, c6, c8, c10, c12;
601     __m128i c14, c16, c18, c20, c22, c24;
602     __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9;
603     __m128i p0,  p1,  p2,  p3,  p4,  p5,  p6,  p7;
604     __m128i p8,  p9, p10, p11, p12, p13, p14, p15;
605     __m128i p16, p17, p18, p19, p20, p21, p22, p23;
606     __m128i p24, p25, p26, p27, p28, p29, p30, p31;
607     __m128i p32, p33, p34, p35, p36, p37, p38, p39;
608     __m128i o0, o1, o2, o3;
609     uint32_t i;
610 
611     x = _mm_loadl_epi64((const __m128i *)(coef+0));
612     c0  = _mm_shuffle_epi32(x, 0x00);
613     c2  = _mm_shuffle_epi32(x, 0x55);
614 
615     x = _mm_loadl_epi64((const __m128i *)(coef+4));
616     c4  = _mm_shuffle_epi32(x, 0x00);
617     c6  = _mm_shuffle_epi32(x, 0x55);
618 
619     x = _mm_loadl_epi64((const __m128i *)(coef+8));
620     c8  = _mm_shuffle_epi32(x, 0x00);
621     c10  = _mm_shuffle_epi32(x, 0x55);
622 
623     x = _mm_loadl_epi64((const __m128i *)(coef+12));
624     c12  = _mm_shuffle_epi32(x, 0x00);
625     c14  = _mm_shuffle_epi32(x, 0x55);
626 
627     x = _mm_loadl_epi64((const __m128i *)(coef+16));
628     c16  = _mm_shuffle_epi32(x, 0x00);
629     c18  = _mm_shuffle_epi32(x, 0x55);
630 
631     x = _mm_loadl_epi64((const __m128i *)(coef+20));
632     c20  = _mm_shuffle_epi32(x, 0x00);
633     c22  = _mm_shuffle_epi32(x, 0x55);
634 
635     x = _mm_loadl_epi64((const __m128i *)(coef+24));
636     c24  = _mm_shuffle_epi32(x, 0x00);
637 
638     for (i = 0; i < count; ++i) {
639 
640         p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int32_t *)y0), _mm_setzero_si128());
641         p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+1)), _mm_setzero_si128());
642         p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+2)), _mm_setzero_si128());
643         p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+3)), _mm_setzero_si128());
644         p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+4)), _mm_setzero_si128());
645         p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+5)), _mm_setzero_si128());
646         p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+6)), _mm_setzero_si128());
647         p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+7)), _mm_setzero_si128());
648 
649         p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1)), _mm_setzero_si128());
650         p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+1)), _mm_setzero_si128());
651         p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+2)), _mm_setzero_si128());
652         p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+3)), _mm_setzero_si128());
653         p12 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+4)), _mm_setzero_si128());
654         p13 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+5)), _mm_setzero_si128());
655         p14 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+6)), _mm_setzero_si128());
656         p15 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+7)), _mm_setzero_si128());
657 
658         p16 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2)), _mm_setzero_si128());
659         p17 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+1)), _mm_setzero_si128());
660         p18 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+2)), _mm_setzero_si128());
661         p19 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+3)), _mm_setzero_si128());
662         p20 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+4)), _mm_setzero_si128());
663         p21 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+5)), _mm_setzero_si128());
664         p22 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+6)), _mm_setzero_si128());
665         p23 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+7)), _mm_setzero_si128());
666 
667         p24 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3)), _mm_setzero_si128());
668         p25 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+1)), _mm_setzero_si128());
669         p26 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+2)), _mm_setzero_si128());
670         p27 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+3)), _mm_setzero_si128());
671         p28 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+4)), _mm_setzero_si128());
672         p29 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+5)), _mm_setzero_si128());
673         p30 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+6)), _mm_setzero_si128());
674         p31 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+7)), _mm_setzero_si128());
675 
676         p32 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4)), _mm_setzero_si128());
677         p33 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+1)), _mm_setzero_si128());
678         p34 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+2)), _mm_setzero_si128());
679         p35 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+3)), _mm_setzero_si128());
680         p36 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+4)), _mm_setzero_si128());
681         p37 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+5)), _mm_setzero_si128());
682         p38 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+6)), _mm_setzero_si128());
683         p39 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+7)), _mm_setzero_si128());
684 
685         o0 =                   _mm_madd_epi16( _mm_unpacklo_epi16(p0, p1),  c0);
686         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p2, p3),  c2));
687         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p4, p8),  c4));
688         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p9,p10),  c6));
689         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p11, p12),  c8));
690         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p16, p17), c10));
691         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p18, p19), c12));
692         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p20, p24), c14));
693         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p25,p26), c16));
694         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p27, p28), c18));
695         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p32, p33), c20));
696         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p34, p35), c22));
697         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p36, _mm_setzero_si128()), c24));
698         o0 = _mm_srai_epi32(o0, 8);
699 
700         o1 =                   _mm_madd_epi16( _mm_unpacklo_epi16(p1, p2),  c0);
701         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p3,p4),  c2));
702         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p5, p9),  c4));
703         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p10,p11),  c6));
704         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p12,p13),  c8));
705         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p17,p18), c10));
706         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p19,p20), c12));
707         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p21,p25), c14));
708         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p26, p27), c16));
709         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p28, p29), c18));
710         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p33, p34), c20));
711         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p35, p36), c22));
712         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p37, _mm_setzero_si128()), c24));
713         o1 = _mm_srai_epi32(o1, 8);
714 
715         o2 =                   _mm_madd_epi16( _mm_unpacklo_epi16(p2,p3),  c0);
716         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p4, p5),  c2));
717         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p6, p10),  c4));
718         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p11, p12),  c6));
719         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p13, p14),  c8));
720         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p18, p19), c10));
721         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p20, p21), c12));
722         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p22, p26), c14));
723         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p27, p28), c16));
724         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p29, p30), c18));
725         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p34, p35), c20));
726         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p36, p37), c22));
727         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p38, _mm_setzero_si128()), c24));
728         o2 = _mm_srai_epi32(o2, 8);
729 
730         o3 =                   _mm_madd_epi16( _mm_unpacklo_epi16(p3,p4),  c0);
731         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p5, p6),  c2));
732         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p7, p11),  c4));
733         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p12, p13),  c6));
734         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p14, p15),  c8));
735         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p19, p20), c10));
736         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p21, p22), c12));
737         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p23, p27), c14));
738         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p28, p29), c16));
739         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p30, p31), c18));
740         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p35, p36), c20));
741         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p37,p38), c22));
742         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p39, _mm_setzero_si128()), c24));
743         o3 = _mm_srai_epi32(o3, 8);
744 
745         o0 = packus_epi32(o0, o1);
746         o2 = packus_epi32(o2, o3);
747         o0 = _mm_packus_epi16(o0, o2);
748         _mm_storeu_si128((__m128i *)dst, o0);
749 
750         y0 = (const char *)y0 + 16;
751         y1 = (const char *)y1 + 16;
752         y2 = (const char *)y2 + 16;
753         y3 = (const char *)y3 + 16;
754         y4 = (const char *)y4 + 16;
755         dst = (char *)dst + 16;
756     }
757 }
758 
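/* The blend kernels below process 'count8' groups of eight RGBA pixels (two
 * 16-byte vectors per group).  Each one widens bytes to 16 bits, broadcasts
 * the relevant alpha with the shufflelo/shufflehi pair, and approximates
 * x * a / 255 as (x * a) >> 8; the arithmetic matches the premultiplied-alpha
 * forms of the Porter-Duff operators.
 *
 * Source-over, per channel (a scalar sketch of the vector math):
 *
 *     out = src + ((dst * (255 - srcA)) >> 8);
 */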
759 void rsdIntrinsicBlendSrcOver_K(void *dst, const void *src, uint32_t count8) {
760     __m128i all1s, ina, ins;
761     __m128i in0, in1, out0, out1;
762     __m128i t0, t1, t2, t3;
763     uint32_t i;
764 
765     all1s = _mm_set1_epi16(255);
766 
767     for (i = 0; i < count8; ++i) {
768         in0 = _mm_loadu_si128((const __m128i *)src);
769         in1 = _mm_loadu_si128((const __m128i *)src + 1);
770         out0 = _mm_loadu_si128((const __m128i *)dst);
771         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
772 
773         ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
774         ina = _mm_shufflelo_epi16(ins, 0xFF);
775         ina = _mm_shufflehi_epi16(ina, 0xFF);
776         t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
777         t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, ina));
778         t0 = _mm_srli_epi16(t0, 8);
779         t0 = _mm_add_epi16(t0, ins);
780 
781         ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
782         ina = _mm_shufflelo_epi16(ins, 0xFF);
783         ina = _mm_shufflehi_epi16(ina, 0xFF);
784         t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
785         t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, ina));
786         t1 = _mm_srli_epi16(t1, 8);
787         t1 = _mm_add_epi16(t1, ins);
788 
789         ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
790         ina = _mm_shufflelo_epi16(ins, 0xFF);
791         ina = _mm_shufflehi_epi16(ina, 0xFF);
792         t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
793         t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, ina));
794         t2 = _mm_srli_epi16(t2, 8);
795         t2 = _mm_add_epi16(t2, ins);
796 
797         ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
798         ina = _mm_shufflelo_epi16(ins, 0xFF);
799         ina = _mm_shufflehi_epi16(ina, 0xFF);
800         t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
801         t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, ina));
802         t3 = _mm_srli_epi16(t3, 8);
803         t3 = _mm_add_epi16(t3, ins);
804 
805         t0 = _mm_packus_epi16(t0, t1);
806         t2 = _mm_packus_epi16(t2, t3);
807         _mm_storeu_si128((__m128i *)dst, t0);
808         _mm_storeu_si128((__m128i *)dst + 1, t2);
809 
810         src = (const __m128i *)src + 2;
811         dst = (__m128i *)dst + 2;
812     }
813 }
814 
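/* Destination-over: out = dst + ((src * (255 - dstA)) >> 8). */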
815 void rsdIntrinsicBlendDstOver_K(void *dst, const void *src, uint32_t count8) {
816     __m128i all1s, outa, outs;
817     __m128i in0, in1, out0, out1;
818     __m128i t0, t1, t2, t3;
819     uint32_t i;
820 
821     all1s = _mm_set1_epi16(255);
822 
823     for (i = 0; i < count8; ++i) {
824         in0 = _mm_loadu_si128((const __m128i *)src);
825         in1 = _mm_loadu_si128((const __m128i *)src + 1);
826         out0 = _mm_loadu_si128((const __m128i *)dst);
827         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
828 
829 
830         outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
831         outa = _mm_shufflelo_epi16(outs, 0xFF);
832         outa = _mm_shufflehi_epi16(outa, 0xFF);
833         t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
834         t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, outa));
835         t0 = _mm_srli_epi16(t0, 8);
836         t0 = _mm_add_epi16(t0, outs);
837 
838         outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
839         outa = _mm_shufflelo_epi16(outs, 0xFF);
840         outa = _mm_shufflehi_epi16(outa, 0xFF);
841         t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
842         t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, outa));
843         t1 = _mm_srli_epi16(t1, 8);
844         t1 = _mm_add_epi16(t1, outs);
845 
846         outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
847         outa = _mm_shufflelo_epi16(outs, 0xFF);
848         outa = _mm_shufflehi_epi16(outa, 0xFF);
849         t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
850         t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, outa));
851         t2 = _mm_srli_epi16(t2, 8);
852         t2 = _mm_add_epi16(t2, outs);
853 
854         outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
855         outa = _mm_shufflelo_epi16(outs, 0xFF);
856         outa = _mm_shufflehi_epi16(outa, 0xFF);
857         t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
858         t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, outa));
859         t3 = _mm_srli_epi16(t3, 8);
860         t3 = _mm_add_epi16(t3, outs);
861 
862         t0 = _mm_packus_epi16(t0, t1);
863         t2 = _mm_packus_epi16(t2, t3);
864         _mm_storeu_si128((__m128i *)dst, t0);
865         _mm_storeu_si128((__m128i *)dst + 1, t2);
866 
867         src = (const __m128i *)src + 2;
868         dst = (__m128i *)dst + 2;
869     }
870 }
871 
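/* Source-in: out = (src * dstA) >> 8. */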
872 void rsdIntrinsicBlendSrcIn_K(void *dst, const void *src, uint32_t count8) {
873     __m128i outa;
874     __m128i in0, in1, out0, out1;
875     __m128i t0, t1, t2, t3;
876     uint32_t i;
877 
878     for (i = 0; i < count8; ++i) {
879         in0 = _mm_loadu_si128((const __m128i *)src);
880         in1 = _mm_loadu_si128((const __m128i *)src + 1);
881         out0 = _mm_loadu_si128((const __m128i *)dst);
882         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
883 
884         outa = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
885         outa = _mm_shufflelo_epi16(outa, 0xFF);
886         outa = _mm_shufflehi_epi16(outa, 0xFF);
887         t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
888         t0 = _mm_mullo_epi16(t0, outa);
889         t0 = _mm_srli_epi16(t0, 8);
890 
891         outa = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
892         outa = _mm_shufflelo_epi16(outa, 0xFF);
893         outa = _mm_shufflehi_epi16(outa, 0xFF);
894         t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
895         t1 = _mm_mullo_epi16(t1, outa);
896         t1 = _mm_srli_epi16(t1, 8);
897 
898         outa = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
899         outa = _mm_shufflelo_epi16(outa, 0xFF);
900         outa = _mm_shufflehi_epi16(outa, 0xFF);
901         t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
902         t2 = _mm_mullo_epi16(t2, outa);
903         t2 = _mm_srli_epi16(t2, 8);
904 
905         outa = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
906         outa = _mm_shufflelo_epi16(outa, 0xFF);
907         outa = _mm_shufflehi_epi16(outa, 0xFF);
908         t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
909         t3 = _mm_mullo_epi16(t3, outa);
910         t3 = _mm_srli_epi16(t3, 8);
911 
912         t0 = _mm_packus_epi16(t0, t1);
913         t2 = _mm_packus_epi16(t2, t3);
914         _mm_storeu_si128((__m128i *)dst, t0);
915         _mm_storeu_si128((__m128i *)dst + 1, t2);
916 
917         src = (const __m128i *)src + 2;
918         dst = (__m128i *)dst + 2;
919     }
920 }
921 
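/* Destination-in: out = (dst * srcA) >> 8. */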
922 void rsdIntrinsicBlendDstIn_K(void *dst, const void *src, uint32_t count8) {
923     __m128i ina;
924     __m128i in0, in1, out0, out1;
925     __m128i t0, t1, t2, t3;
926     uint32_t i;
927 
928     for (i = 0; i < count8; ++i) {
929         in0 = _mm_loadu_si128((const __m128i *)src);
930         in1 = _mm_loadu_si128((const __m128i *)src + 1);
931         out0 = _mm_loadu_si128((const __m128i *)dst);
932         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
933 
934         ina = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
935         ina = _mm_shufflelo_epi16(ina, 0xFF);
936         ina = _mm_shufflehi_epi16(ina, 0xFF);
937         t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
938         t0 = _mm_mullo_epi16(t0, ina);
939         t0 = _mm_srli_epi16(t0, 8);
940 
941         ina = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
942         ina = _mm_shufflelo_epi16(ina, 0xFF);
943         ina = _mm_shufflehi_epi16(ina, 0xFF);
944         t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
945         t1 = _mm_mullo_epi16(t1, ina);
946         t1 = _mm_srli_epi16(t1, 8);
947 
948         ina = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
949         ina = _mm_shufflelo_epi16(ina, 0xFF);
950         ina = _mm_shufflehi_epi16(ina, 0xFF);
951         t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
952         t2 = _mm_mullo_epi16(t2, ina);
953         t2 = _mm_srli_epi16(t2, 8);
954 
955         ina = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
956         ina = _mm_shufflelo_epi16(ina, 0xFF);
957         ina = _mm_shufflehi_epi16(ina, 0xFF);
958         t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
959         t3 = _mm_mullo_epi16(t3, ina);
960         t3 = _mm_srli_epi16(t3, 8);
961 
962         t0 = _mm_packus_epi16(t0, t1);
963         t2 = _mm_packus_epi16(t2, t3);
964         _mm_storeu_si128((__m128i *)dst, t0);
965         _mm_storeu_si128((__m128i *)dst + 1, t2);
966 
967         src = (const __m128i *)src + 2;
968         dst = (__m128i *)dst + 2;
969     }
970 }
971 
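/* Source-out: out = (src * (255 - dstA)) >> 8. */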
972 void rsdIntrinsicBlendSrcOut_K(void *dst, const void *src, uint32_t count8) {
973     __m128i all1s, outa;
974     __m128i in0, in1, out0, out1;
975     __m128i t0, t1, t2, t3;
976     uint32_t i;
977 
978     all1s = _mm_set1_epi16(255);
979 
980     for (i = 0; i < count8; ++i) {
981         in0 = _mm_loadu_si128((const __m128i *)src);
982         in1 = _mm_loadu_si128((const __m128i *)src + 1);
983         out0 = _mm_loadu_si128((const __m128i *)dst);
984         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
985 
986         outa = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
987         outa = _mm_shufflelo_epi16(outa, 0xFF);
988         outa = _mm_shufflehi_epi16(outa, 0xFF);
989         t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
990         t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, outa));
991         t0 = _mm_srli_epi16(t0, 8);
992 
993         outa = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
994         outa = _mm_shufflelo_epi16(outa, 0xFF);
995         outa = _mm_shufflehi_epi16(outa, 0xFF);
996         t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
997         t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, outa));
998         t1 = _mm_srli_epi16(t1, 8);
999 
1000         outa = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
1001         outa = _mm_shufflelo_epi16(outa, 0xFF);
1002         outa = _mm_shufflehi_epi16(outa, 0xFF);
1003         t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
1004         t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, outa));
1005         t2 = _mm_srli_epi16(t2, 8);
1006 
1007         outa = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
1008         outa = _mm_shufflelo_epi16(outa, 0xFF);
1009         outa = _mm_shufflehi_epi16(outa, 0xFF);
1010         t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
1011         t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, outa));
1012         t3 = _mm_srli_epi16(t3, 8);
1013 
1014         t0 = _mm_packus_epi16(t0, t1);
1015         t2 = _mm_packus_epi16(t2, t3);
1016         _mm_storeu_si128((__m128i *)dst, t0);
1017         _mm_storeu_si128((__m128i *)dst + 1, t2);
1018 
1019         src = (const __m128i *)src + 2;
1020         dst = (__m128i *)dst + 2;
1021     }
1022 }
1023 
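/* Destination-out: out = (dst * (255 - srcA)) >> 8. */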
1024 void rsdIntrinsicBlendDstOut_K(void *dst, const void *src, uint32_t count8) {
1025     __m128i all1s, ina;
1026     __m128i in0, in1, out0, out1;
1027     __m128i t0, t1, t2, t3;
1028     uint32_t i;
1029 
1030     all1s = _mm_set1_epi16(255);
1031 
1032     for (i = 0; i < count8; ++i) {
1033         in0 = _mm_loadu_si128((const __m128i *)src);
1034         in1 = _mm_loadu_si128((const __m128i *)src + 1);
1035         out0 = _mm_loadu_si128((const __m128i *)dst);
1036         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1037 
1038         ina = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
1039         ina = _mm_shufflelo_epi16(ina, 0xFF);
1040         ina = _mm_shufflehi_epi16(ina, 0xFF);
1041         t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
1042         t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, ina));
1043         t0 = _mm_srli_epi16(t0, 8);
1044 
1045         ina = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
1046         ina = _mm_shufflelo_epi16(ina, 0xFF);
1047         ina = _mm_shufflehi_epi16(ina, 0xFF);
1048         t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
1049         t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, ina));
1050         t1 = _mm_srli_epi16(t1, 8);
1051 
1052         ina = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
1053         ina = _mm_shufflelo_epi16(ina, 0xFF);
1054         ina = _mm_shufflehi_epi16(ina, 0xFF);
1055         t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
1056         t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, ina));
1057         t2 = _mm_srli_epi16(t2, 8);
1058 
1059         ina = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
1060         ina = _mm_shufflelo_epi16(ina, 0xFF);
1061         ina = _mm_shufflehi_epi16(ina, 0xFF);
1062         t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
1063         t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, ina));
1064         t3 = _mm_srli_epi16(t3, 8);
1065 
1066         t0 = _mm_packus_epi16(t0, t1);
1067         t2 = _mm_packus_epi16(t2, t3);
1068         _mm_storeu_si128((__m128i *)dst, t0);
1069         _mm_storeu_si128((__m128i *)dst + 1, t2);
1070 
1071         src = (const __m128i *)src + 2;
1072         dst = (__m128i *)dst + 2;
1073     }
1074 }
1075 
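/* Source-atop: out = (src * dstA + dst * (255 - srcA)) >> 8 for the color
 * channels; M0001 selects the alpha byte of each pixel so the destination
 * alpha is kept. */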
1076 void rsdIntrinsicBlendSrcAtop_K(void *dst, const void *src, uint32_t count8) {
1077     const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000);
1078     __m128i all1s, ina, outa, ins, outs;
1079     __m128i in0, in1, out0, out1;
1080     __m128i t0, t1, t2, t3;
1081     uint32_t i;
1082 
1083     all1s = _mm_set1_epi16(255);
1084 
1085     for (i = 0; i < count8; ++i) {
1086         in0 = _mm_loadu_si128((const __m128i *)src);
1087         in1 = _mm_loadu_si128((const __m128i *)src + 1);
1088         out0 = _mm_loadu_si128((const __m128i *)dst);
1089         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1090 
1091         ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
1092         ina = _mm_shufflelo_epi16(ins, 0xFF);
1093         ina = _mm_shufflehi_epi16(ina, 0xFF);
1094         outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
1095         outa = _mm_shufflelo_epi16(outs, 0xFF);
1096         outa = _mm_shufflehi_epi16(outa, 0xFF);
1097         t0 = _mm_sub_epi16(all1s, ina);
1098         t0 = _mm_mullo_epi16(t0, outs);
1099         t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(outa, ins));
1100         t0 = _mm_srli_epi16(t0, 8);
1101 
1102         ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
1103         ina = _mm_shufflelo_epi16(ins, 0xFF);
1104         ina = _mm_shufflehi_epi16(ina, 0xFF);
1105         outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
1106         outa = _mm_shufflelo_epi16(outs, 0xFF);
1107         outa = _mm_shufflehi_epi16(outa, 0xFF);
1108         t1 = _mm_sub_epi16(all1s, ina);
1109         t1 = _mm_mullo_epi16(t1, outs);
1110         t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(outa, ins));
1111         t1 = _mm_srli_epi16(t1, 8);
1112 
1113         ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
1114         ina = _mm_shufflelo_epi16(ins, 0xFF);
1115         ina = _mm_shufflehi_epi16(ina, 0xFF);
1116         outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
1117         outa = _mm_shufflelo_epi16(outs, 0xFF);
1118         outa = _mm_shufflehi_epi16(outa, 0xFF);
1119         t2 = _mm_sub_epi16(all1s, ina);
1120         t2 = _mm_mullo_epi16(t2, outs);
1121         t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(outa, ins));
1122         t2 = _mm_srli_epi16(t2, 8);
1123 
1124         ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
1125         ina = _mm_shufflelo_epi16(ins, 0xFF);
1126         ina = _mm_shufflehi_epi16(ina, 0xFF);
1127         outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
1128         outa = _mm_shufflelo_epi16(outs, 0xFF);
1129         outa = _mm_shufflehi_epi16(outa, 0xFF);
1130         t3 = _mm_sub_epi16(all1s, ina);
1131         t3 = _mm_mullo_epi16(t3, outs);
1132         t3 = _mm_adds_epu16(t3, _mm_mullo_epi16(outa, ins));
1133         t3 = _mm_srli_epi16(t3, 8);
1134 
1135         t0 = _mm_packus_epi16(t0, t1);
1136         t0 = blendv_epi8(t0, out0, M0001);
1137         t2 = _mm_packus_epi16(t2, t3);
1138         t2 = blendv_epi8(t2, out1, M0001);
1139         _mm_storeu_si128((__m128i *)dst, t0);
1140         _mm_storeu_si128((__m128i *)dst + 1, t2);
1141 
1142         src = (const __m128i *)src + 2;
1143         dst = (__m128i *)dst + 2;
1144     }
1145 }
1146 
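/* Destination-atop: out = (dst * srcA + src * (255 - dstA)) >> 8 for the
 * color channels, with the alpha byte again taken from the destination via
 * M0001. */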
1147 void rsdIntrinsicBlendDstAtop_K(void *dst, const void *src, uint32_t count8) {
1148     const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000);
1149     __m128i all1s, ina, ins, outa, outs;
1150     __m128i in0, in1, out0, out1;
1151     __m128i t0, t1, t2, t3;
1152     uint32_t i;
1153 
1154     all1s = _mm_set1_epi16(255);
1155 
1156     for (i = 0; i < count8; ++i) {
1157         in0 = _mm_loadu_si128((const __m128i *)src);
1158         in1 = _mm_loadu_si128((const __m128i *)src + 1);
1159         out0 = _mm_loadu_si128((const __m128i *)dst);
1160         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1161 
1162         ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
1163         ina = _mm_shufflelo_epi16(ins, 0xFF);
1164         ina = _mm_shufflehi_epi16(ina, 0xFF);
1165         outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
1166         outa = _mm_shufflelo_epi16(outs, 0xFF);
1167         outa = _mm_shufflehi_epi16(outa, 0xFF);
1168         t0 = _mm_sub_epi16(all1s, outa);
1169         t0 = _mm_mullo_epi16(t0, ins);
1170         t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(ina, outs));
1171         t0 = _mm_srli_epi16(t0, 8);
1172 
1173         ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
1174         ina = _mm_shufflelo_epi16(ins, 0xFF);
1175         ina = _mm_shufflehi_epi16(ina, 0xFF);
1176         outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
1177         outa = _mm_shufflelo_epi16(outs, 0xFF);
1178         outa = _mm_shufflehi_epi16(outa, 0xFF);
1179         t1 = _mm_sub_epi16(all1s, outa);
1180         t1 = _mm_mullo_epi16(t1, ins);
1181         t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(ina, outs));
1182         t1 = _mm_srli_epi16(t1, 8);
1183 
1184         ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
1185         ina = _mm_shufflelo_epi16(ins, 0xFF);
1186         ina = _mm_shufflehi_epi16(ina, 0xFF);
1187         outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
1188         outa = _mm_shufflelo_epi16(outs, 0xFF);
1189         outa = _mm_shufflehi_epi16(outa, 0xFF);
1190         t2 = _mm_sub_epi16(all1s, outa);
1191         t2 = _mm_mullo_epi16(t2, ins);
1192         t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(ina, outs));
1193         t2 = _mm_srli_epi16(t2, 8);
1194 
1195         ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
1196         ina = _mm_shufflelo_epi16(ins, 0xFF);
1197         ina = _mm_shufflehi_epi16(ina, 0xFF);
1198         outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
1199         outa = _mm_shufflelo_epi16(outs, 0xFF);
1200         outa = _mm_shufflehi_epi16(outa, 0xFF);
1201         t3 = _mm_sub_epi16(all1s, outa);
1202         t3 = _mm_mullo_epi16(t3, ins);
1203         t3 = _mm_adds_epu16(t3, _mm_mullo_epi16(ina, outs));
1204         t3 = _mm_srli_epi16(t3, 8);
1205 
1206         t0 = _mm_packus_epi16(t0, t1);
1207         t0 = blendv_epi8(t0, out0, M0001);
1208         t2 = _mm_packus_epi16(t2, t3);
1209         t2 = blendv_epi8(t2, out1, M0001);
1210         _mm_storeu_si128((__m128i *)dst, t0);
1211         _mm_storeu_si128((__m128i *)dst + 1, t2);
1212 
1213         src = (const __m128i *)src + 2;
1214         dst = (__m128i *)dst + 2;
1215     }
1216 }
1217 
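/* XOR: a bitwise XOR of source and destination pixels, not the Porter-Duff
 * XOR operator. */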
1218 void rsdIntrinsicBlendXor_K(void *dst, const void *src, uint32_t count8) {
1219     __m128i in0, in1, out0, out1;
1220     uint32_t i;
1221 
1222     for (i = 0; i < count8; ++i) {
1223         in0 = _mm_loadu_si128((const __m128i *)src);
1224         in1 = _mm_loadu_si128((const __m128i *)src + 1);
1225         out0 = _mm_loadu_si128((const __m128i *)dst);
1226         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1227 
1228         out0 = _mm_xor_si128(out0, in0);
1229         out1 = _mm_xor_si128(out1, in1);
1230 
1231         _mm_storeu_si128((__m128i *)dst, out0);
1232         _mm_storeu_si128((__m128i *)dst + 1, out1);
1233 
1234         src = (const __m128i *)src + 2;
1235         dst = (__m128i *)dst + 2;
1236     }
1237 }
1238 
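/* Multiply: out = (src * dst) >> 8 per channel. */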
1239 void rsdIntrinsicBlendMultiply_K(void *dst, const void *src, uint32_t count8) {
1240     __m128i in0, in1, out0, out1;
1241     __m128i t0, t1, t2, t3;
1242     uint32_t i;
1243 
1244     for (i = 0; i < count8; ++i) {
1245         in0 = _mm_loadu_si128((const __m128i *)src);
1246         in1 = _mm_loadu_si128((const __m128i *)src + 1);
1247         out0 = _mm_loadu_si128((const __m128i *)dst);
1248         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1249 
1250         t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
1251         t0 = _mm_mullo_epi16(t0, _mm_unpacklo_epi8(out0, _mm_setzero_si128()));
1252         t0 = _mm_srli_epi16(t0, 8);
1253 
1254         t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
1255         t1 = _mm_mullo_epi16(t1, _mm_unpackhi_epi8(out0, _mm_setzero_si128()));
1256         t1 = _mm_srli_epi16(t1, 8);
1257 
1258         t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
1259         t2 = _mm_mullo_epi16(t2, _mm_unpacklo_epi8(out1, _mm_setzero_si128()));
1260         t2 = _mm_srli_epi16(t2, 8);
1261 
1262         t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
1263         t3 = _mm_mullo_epi16(t3, _mm_unpackhi_epi8(out1, _mm_setzero_si128()));
1264         t3 = _mm_srli_epi16(t3, 8);
1265 
1266         t0 = _mm_packus_epi16(t0, t1);
1267         t2 = _mm_packus_epi16(t2, t3);
1268         _mm_storeu_si128((__m128i *)dst, t0);
1269         _mm_storeu_si128((__m128i *)dst + 1, t2);
1270 
1271         src = (const __m128i *)src + 2;
1272         dst = (__m128i *)dst + 2;
1273     }
1274 }
1275 
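/* Add: per-channel saturating add of source into destination. */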
1276 void rsdIntrinsicBlendAdd_K(void *dst, const void *src, uint32_t count8) {
1277     __m128i in0, in1, out0, out1;
1278     uint32_t i;
1279 
1280     for (i = 0; i < count8; ++i) {
1281         in0 = _mm_loadu_si128((const __m128i *)src);
1282         in1 = _mm_loadu_si128((const __m128i *)src + 1);
1283         out0 = _mm_loadu_si128((const __m128i *)dst);
1284         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1285 
1286         out0 = _mm_adds_epu8(out0, in0);
1287         out1 = _mm_adds_epu8(out1, in1);
1288 
1289         _mm_storeu_si128((__m128i *)dst, out0);
1290         _mm_storeu_si128((__m128i *)dst + 1, out1);
1291 
1292         src = (const __m128i *)src + 2;
1293         dst = (__m128i *)dst + 2;
1294     }
1295 }
1296 
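/* Subtract: per-channel saturating subtract of source from destination. */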
1297 void rsdIntrinsicBlendSub_K(void *dst, const void *src, uint32_t count8) {
1298     __m128i in0, in1, out0, out1;
1299     uint32_t i;
1300 
1301     for (i = 0; i < count8; ++i) {
1302         in0 = _mm_loadu_si128((const __m128i *)src);
1303         in1 = _mm_loadu_si128((const __m128i *)src + 1);
1304         out0 = _mm_loadu_si128((const __m128i *)dst);
1305         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1306 
1307         out0 = _mm_subs_epu8(out0, in0);
1308         out1 = _mm_subs_epu8(out1, in1);
1309 
1310         _mm_storeu_si128((__m128i *)dst, out0);
1311         _mm_storeu_si128((__m128i *)dst + 1, out1);
1312 
1313         src = (const __m128i *)src + 2;
1314         dst = (__m128i *)dst + 2;
1315     }
1316 }
1317