1 /*
2 * Copyright (C) 2011 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <stdint.h>
18 #include <x86intrin.h>
19
/* Zero-extend the four packed 8-bit integers in the LSBs into packed 32-bit integers */
cvtepu8_epi32(__m128i x)21 static inline __m128i cvtepu8_epi32(__m128i x) {
22 #if defined(__SSE4_1__)
23 return _mm_cvtepu8_epi32(x);
24 #elif defined(__SSSE3__)
25 const __m128i M8to32 = _mm_set_epi32(0xffffff03, 0xffffff02, 0xffffff01, 0xffffff00);
26 x = _mm_shuffle_epi8(x, M8to32);
27 return x;
28 #else
29 # error "Require at least SSSE3"
30 #endif
31 }
32
/* Pack eight signed 32-bit lanes (lo then hi) into eight unsigned 16-bit
 * lanes with unsigned saturation; emulates SSE4.1 _mm_packus_epi32 on
 * SSSE3-only targets. */
static inline __m128i packus_epi32(__m128i lo, __m128i hi) {
#if defined(__SSE4_1__)
    return _mm_packus_epi32(lo, hi);
#elif defined(__SSSE3__)
    const __m128i C0 = _mm_set_epi32(0x0000, 0x0000, 0x0000, 0x0000);
    const __m128i C1 = _mm_set_epi32(0xffff, 0xffff, 0xffff, 0xffff);
    /* Shuffle masks gathering the low 16 bits of each dword into the low
     * (L) or high (H) 64 bits of the result; 0xff entries yield zero. */
    const __m128i M32to16L = _mm_set_epi32(0xffffffff, 0xffffffff, 0x0d0c0908, 0x05040100);
    const __m128i M32to16H = _mm_set_epi32(0x0d0c0908, 0x05040100, 0xffffffff, 0xffffffff);
    /* Clamp below: lanes <= 0 are zeroed (cmpgt mask is 0 there)... */
    lo = _mm_and_si128(lo, _mm_cmpgt_epi32(lo, C0));
    /* ...then clamp above: lanes > 0xffff become all-ones, so their low
     * 16 bits pack to 0xffff. */
    lo = _mm_or_si128(lo, _mm_cmpgt_epi32(lo, C1));
    hi = _mm_and_si128(hi, _mm_cmpgt_epi32(hi, C0));
    hi = _mm_or_si128(hi, _mm_cmpgt_epi32(hi, C1));
    return _mm_or_si128(_mm_shuffle_epi8(lo, M32to16L),
                        _mm_shuffle_epi8(hi, M32to16H));
#else
# error "Require at least SSSE3"
#endif
}
51
mullo_epi32(__m128i x,__m128i y)52 static inline __m128i mullo_epi32(__m128i x, __m128i y) {
53 #if defined(__SSE4_1__)
54 return _mm_mullo_epi32(x, y);
55 #elif defined(__SSSE3__)
56 const __m128i Meven = _mm_set_epi32(0x00000000, 0xffffffff, 0x00000000, 0xffffffff);
57 __m128i even = _mm_mul_epu32(x, y);
58 __m128i odd = _mm_mul_epu32(_mm_srli_si128(x, 4),
59 _mm_srli_si128(y, 4));
60 even = _mm_and_si128(even, Meven);
61 odd = _mm_and_si128(odd, Meven);
62 return _mm_or_si128(even, _mm_slli_si128(odd, 4));
63 #else
64 # error "Require at least SSSE3"
65 #endif
66 }
67
/* 'mask' must be packed 8-bit values, each either 0x00 or 0xff */
blendv_epi8(__m128i x,__m128i y,__m128i mask)69 static inline __m128i blendv_epi8(__m128i x, __m128i y, __m128i mask) {
70 #if defined(__SSE4_1__)
71 return _mm_blendv_epi8(x, y, mask);
72 #elif defined(__SSSE3__)
73 return _mm_or_si128(_mm_andnot_si128(mask, x), _mm_and_si128(y, mask));
74 #else
75 # error "Require at least SSSE3"
76 #endif
77 }
78
rsdIntrinsicConvolve3x3_K(void * dst,const void * y0,const void * y1,const void * y2,const short * coef,uint32_t count)79 extern "C" void rsdIntrinsicConvolve3x3_K(void *dst, const void *y0,
80 const void *y1, const void *y2,
81 const short *coef, uint32_t count) {
82 __m128i x;
83 __m128i c0, c2, c4, c6, c8;
84 __m128i r0, r1, r2;
85 __m128i p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11;
86 __m128i o0, o1;
87 uint32_t i;
88
89 x = _mm_loadl_epi64((const __m128i *)(coef+0));
90 c0 = _mm_shuffle_epi32(x, 0x00);
91 c2 = _mm_shuffle_epi32(x, 0x55);
92 x = _mm_loadl_epi64((const __m128i *)(coef+4));
93 c4 = _mm_shuffle_epi32(x, 0x00);
94 c6 = _mm_shuffle_epi32(x, 0x55);
95 x = _mm_loadl_epi64((const __m128i *)(coef+8));
96 c8 = _mm_shuffle_epi32(x, 0x00);
97
98 for (i = 0; i < count; ++i) {
99
100 p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0)), _mm_setzero_si128());
101 p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+1)), _mm_setzero_si128());
102 p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+2)), _mm_setzero_si128());
103 p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+3)), _mm_setzero_si128());
104 p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1)), _mm_setzero_si128());
105 p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+1)), _mm_setzero_si128());
106 p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+2)), _mm_setzero_si128());
107 p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+3)), _mm_setzero_si128());
108 p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2)), _mm_setzero_si128());
109 p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+1)), _mm_setzero_si128());
110 p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+2)), _mm_setzero_si128());
111 p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+3)), _mm_setzero_si128());
112
113 o0 = _mm_madd_epi16(_mm_unpacklo_epi16(p0, p1), c0);
114 o1 = _mm_madd_epi16(_mm_unpacklo_epi16(p1, p2), c0);
115
116 o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p2, p4), c2));
117 o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p3, p5), c2));
118
119 o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p5, p6), c4));
120 o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p6, p7), c4));
121
122 o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p8, p9), c6));
123 o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p9, p10), c6));
124
125 o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p10, _mm_setzero_si128()), c8));
126 o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p11, _mm_setzero_si128()), c8));
127
128 o0 = _mm_srai_epi32(o0, 8);
129 o1 = _mm_srai_epi32(o1, 8);
130
131 o0 = packus_epi32(o0, o1);
132 o0 = _mm_packus_epi16(o0, o0);
133 _mm_storel_epi64((__m128i *)dst, o0);
134
135 y0 = (const char *)y0 + 8;
136 y1 = (const char *)y1 + 8;
137 y2 = (const char *)y2 + 8;
138 dst = (char *)dst + 8;
139 }
140 }
141
rsdIntrinsicColorMatrix4x4_K(void * dst,const void * src,const short * coef,uint32_t count)142 void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src,
143 const short *coef, uint32_t count) {
144 const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
145 14, 10, 6, 2,
146 13, 9, 5, 1,
147 12, 8, 4, 0);
148
149 const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
150 const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
151 __m128i c0, c1, c2, c3;
152 __m128i i4, o4;
153 __m128i xy, zw;
154 __m128i x2, y2, z2, w2;
155 uint32_t i;
156
157 c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
158 c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
159 c0 = _mm_unpacklo_epi16(c0, c1);
160
161 c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
162 c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
163 c2 = _mm_unpacklo_epi16(c2, c3);
164
165 for (i = 0; i < count; ++i) {
166 i4 = _mm_load_si128((const __m128i *)src);
167 xy = _mm_shuffle_epi8(i4, Mxy);
168 zw = _mm_shuffle_epi8(i4, Mzw);
169
170 x2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x00));
171 y2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x55));
172 z2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xaa));
173 w2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xff));
174
175 x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x00)));
176 y2 = _mm_add_epi32(y2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x55)));
177 z2 = _mm_add_epi32(z2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xaa)));
178 w2 = _mm_add_epi32(w2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xff)));
179
180 x2 = _mm_srai_epi32(x2, 8);
181 y2 = _mm_srai_epi32(y2, 8);
182 z2 = _mm_srai_epi32(z2, 8);
183 w2 = _mm_srai_epi32(w2, 8);
184
185 x2 = packus_epi32(x2, y2);
186 z2 = packus_epi32(z2, w2);
187 o4 = _mm_packus_epi16(x2, z2);
188
189 o4 = _mm_shuffle_epi8(o4, T4x4);
190 _mm_storeu_si128((__m128i *)dst, o4);
191
192 src = (const char *)src + 16;
193 dst = (char *)dst + 16;
194 }
195 }
196
/*
 * Apply a 3x3 color matrix (Q8 fixed-point coefficients) to the RGB
 * channels of RGBA8888 pixels, passing the alpha channel through
 * unchanged.  Four pixels (16 bytes) per iteration.
 */
void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src,
                                  const short *coef, uint32_t count) {
    /* Transpose of 4x4 bytes: regroups channel-planar words back to RGBA. */
    const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
                                      14, 10, 6, 2,
                                      13, 9, 5, 1,
                                      12, 8, 4, 0);

    /* Gather the x/y (resp. z/w) bytes of each pixel into 16-bit lanes. */
    const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
    const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);

    __m128i c0, c1, c2, c3;
    __m128i i4, o4;
    __m128i xy, zw;
    __m128i x2, y2, z2, w2;
    uint32_t i;

    /* Interleave matrix rows pairwise so madd computes x*c + y*c' per dword. */
    c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
    c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
    c0 = _mm_unpacklo_epi16(c0, c1);

    c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
    c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
    c2 = _mm_unpacklo_epi16(c2, c3);

    for (i = 0; i < count; ++i) {
        i4 = _mm_loadu_si128((const __m128i *)src);
        xy = _mm_shuffle_epi8(i4, Mxy);
        zw = _mm_shuffle_epi8(i4, Mzw);

        x2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x00));
        y2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x55));
        z2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xaa));

        x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x00)));
        y2 = _mm_add_epi32(y2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x55)));
        z2 = _mm_add_epi32(z2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xaa)));

        /* Q8 fixed point -> integer for the three transformed channels. */
        x2 = _mm_srai_epi32(x2, 8);
        y2 = _mm_srai_epi32(y2, 8);
        z2 = _mm_srai_epi32(z2, 8);
        /* Each zw dword holds (w << 16) | z, so shifting right by 16
         * recovers the original alpha values (pass-through). */
        w2 = _mm_srli_epi32(zw, 16);

        /* Saturate and repack channel-planar results into RGBA order. */
        x2 = packus_epi32(x2, y2);
        z2 = packus_epi32(z2, w2);
        o4 = _mm_packus_epi16(x2, z2);

        o4 = _mm_shuffle_epi8(o4, T4x4);
        _mm_storeu_si128((__m128i *)dst, o4);

        src = (const char *)src + 16;
        dst = (char *)dst + 16;
    }
}
250
/*
 * Dot-product color matrix: each output r, g and b is the same dot product
 * of the input pixel with the first matrix column (coef[0], coef[4],
 * coef[8], coef[12] in Q8), and alpha is passed through unchanged.  Four
 * pixels (16 bytes) per iteration.
 */
void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src,
                                  const short *coef, uint32_t count) {
    /* Transpose of 4x4 bytes: regroups channel-planar words back to RGBA. */
    const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
                                      14, 10, 6, 2,
                                      13, 9, 5, 1,
                                      12, 8, 4, 0);
    /* Gather the x/y (resp. z/w) bytes of each pixel into 16-bit lanes. */
    const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
    const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
    __m128i c0, c1, c2, c3;
    __m128i i4, o4;
    __m128i xy, zw;
    __m128i x2, y2, z2, w2;
    uint32_t i;

    /* Broadcast coef[0]/coef[4] (and coef[8]/coef[12]) across the low words
     * and interleave, so one madd yields x*coef[0] + y*coef[4] per dword. */
    c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
    c0 = _mm_shufflelo_epi16(c0, 0);
    c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
    c1 = _mm_shufflelo_epi16(c1, 0);
    c0 = _mm_unpacklo_epi16(c0, c1);

    c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
    c2 = _mm_shufflelo_epi16(c2, 0);
    c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
    c3 = _mm_shufflelo_epi16(c3, 0);
    c2 = _mm_unpacklo_epi16(c2, c3);

    for (i = 0; i < count; ++i) {
        i4 = _mm_loadu_si128((const __m128i *)src);

        xy = _mm_shuffle_epi8(i4, Mxy);
        zw = _mm_shuffle_epi8(i4, Mzw);

        x2 = _mm_madd_epi16(xy, c0);
        x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, c2));

        /* Q8 -> integer; replicate the dot product into g and b. */
        x2 = _mm_srai_epi32(x2, 8);
        y2 = x2;
        z2 = x2;
        /* Each zw dword holds (w << 16) | z; shift recovers alpha. */
        w2 = _mm_srli_epi32(zw, 16);

        /* Saturate and repack channel-planar results into RGBA order. */
        x2 = packus_epi32(x2, y2);
        z2 = packus_epi32(z2, w2);
        o4 = _mm_packus_epi16(x2, z2);

        o4 = _mm_shuffle_epi8(o4, T4x4);
        _mm_storeu_si128((__m128i *)dst, o4);

        src = (const char *)src + 16;
        dst = (char *)dst + 16;
    }
}
302
/*
 * Vertical blur pass: for each pair of adjacent RGBA8888 pixels in columns
 * [x1, x2), accumulate 'rct' rows (stride bytes apart) weighted by the
 * float kernel at gptr, and write the two unpacked float4 sums to dst
 * (32 bytes per pixel pair).
 */
void rsdIntrinsicBlurVFU4_K(void *dst,
                            const void *pin, int stride, const void *gptr,
                            int rct, int x1, int x2) {
    const char *pi;
    __m128i pi0, pi1;
    __m128 pf0, pf1;
    __m128 bp0, bp1;
    __m128 x;
    int r;

    for (; x1 < x2; x1 += 2) {
        /* x1 indexes pixels; << 2 converts to a byte offset (4 B/pixel). */
        pi = (const char *)pin + (x1 << 2);
        bp0 = _mm_setzero_ps();
        bp1 = _mm_setzero_ps();

        for (r = 0; r < rct; ++r) {
            /* Broadcast the r-th kernel weight to all four lanes. */
            x = _mm_load_ss((const float *)gptr + r);
            x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));

            /* Load one pixel each for the two columns being blurred. */
            pi0 = _mm_cvtsi32_si128(*(const int *)pi);
            pi1 = _mm_cvtsi32_si128(*((const int *)pi + 1));

            /* u8x4 -> f32x4. */
            pf0 = _mm_cvtepi32_ps(cvtepu8_epi32(pi0));
            pf1 = _mm_cvtepi32_ps(cvtepu8_epi32(pi1));

            bp0 = _mm_add_ps(bp0, _mm_mul_ps(pf0, x));
            bp1 = _mm_add_ps(bp1, _mm_mul_ps(pf1, x));

            pi += stride;
        }

        _mm_storeu_ps((float *)dst, bp0);
        _mm_storeu_ps((float *)dst + 4, bp1);
        dst = (char *)dst + 32;
    }
}
339
/*
 * Horizontal blur pass over float4 intermediate data: for each output
 * pixel, sum rct taps of 4-float pixels weighted by the kernel at gptr,
 * round to integers, and pack back to 4 bytes of RGBA.
 * NOTE(review): _mm_cvtps_epi32 rounds but does not clamp; presumably the
 * kernel weights keep the sum within [0, 255] -- confirm with caller.
 */
void rsdIntrinsicBlurHFU4_K(void *dst,
                            const void *pin, const void *gptr,
                            int rct, int x1, int x2) {
    /* Picks the low byte of each dword into the low 32 bits of the result. */
    const __m128i Mu8 = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0c080400);
    const float *pi;
    __m128 pf, x, y;
    __m128i o;
    int r;

    for (; x1 < x2; ++x1) {
        /* rct is defined as 2*r+1 by the caller, so tap 0 is handled here
         * and the loop below consumes the remaining even number in pairs. */
        x = _mm_load_ss((const float *)gptr);
        x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));

        /* x1 indexes pixels; << 2 converts to a float offset (4 floats/px). */
        pi = (const float *)pin + (x1 << 2);
        pf = _mm_mul_ps(x, _mm_load_ps(pi));

        for (r = 1; r < rct; r += 2) {
            /* Broadcast two adjacent kernel weights and accumulate the
             * matching two pixel taps. */
            x = _mm_load_ss((const float *)gptr + r);
            y = _mm_load_ss((const float *)gptr + r + 1);
            x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));
            y = _mm_shuffle_ps(y, y, _MM_SHUFFLE(0, 0, 0, 0));

            pf = _mm_add_ps(pf, _mm_mul_ps(x, _mm_load_ps(pi + (r << 2))));
            pf = _mm_add_ps(pf, _mm_mul_ps(y, _mm_load_ps(pi + (r << 2) + 4)));
        }

        /* Round to int32, then gather the low byte of each channel. */
        o = _mm_cvtps_epi32(pf);
        *(int *)dst = _mm_cvtsi128_si32(_mm_shuffle_epi8(o, Mu8));
        dst = (char *)dst + 4;
    }
}
372
/*
 * Horizontal blur pass over single-channel float data, producing four
 * u8 outputs per iteration.  The inner loop consumes four kernel taps at
 * a time, using byte-alignr to form the four shifted input windows.
 * Fix: _mm_alignr_epi8 takes and returns __m128i, but was being fed the
 * __m128 values p0/p1 and its result passed to _mm_mul_ps -- that only
 * compiles under GCC/Clang lax vector conversions.  Added the (free)
 * _mm_castps_si128 / _mm_castsi128_ps reinterpret casts.
 * NOTE(review): the loop reads gptr[r..r+3] and pi[r..r+7]; presumably the
 * caller guarantees rct-1 is a multiple of 4 and pads the buffers -- confirm.
 */
void rsdIntrinsicBlurHFU1_K(void *dst,
                            const void *pin, const void *gptr,
                            int rct, int x1, int x2) {
    /* Picks the low byte of each dword into the low 32 bits of the result. */
    const __m128i Mu8 = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0c080400);
    const float *pi;
    __m128 pf, g0, g1, g2, g3, gx, p0, p1;
    __m128i o;
    int r;

    for (; x1 < x2; x1 += 4) {
        /* Tap 0 (kernel center-relative start) handled outside the loop. */
        g0 = _mm_load_ss((const float *)gptr);
        g0 = _mm_shuffle_ps(g0, g0, _MM_SHUFFLE(0, 0, 0, 0));

        pi = (const float *)pin + x1;
        pf = _mm_mul_ps(g0, _mm_loadu_ps(pi));

        for (r = 1; r < rct; r += 4) {
            /* Four kernel weights and eight input samples cover four
             * successive one-float shifts of the window. */
            gx = _mm_loadu_ps((const float *)gptr + r);
            p0 = _mm_loadu_ps(pi + r);
            p1 = _mm_loadu_ps(pi + r + 4);

            g0 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(0, 0, 0, 0));
            pf = _mm_add_ps(pf, _mm_mul_ps(g0, p0));
            g1 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(1, 1, 1, 1));
            pf = _mm_add_ps(pf, _mm_mul_ps(g1, _mm_castsi128_ps(
                    _mm_alignr_epi8(_mm_castps_si128(p1), _mm_castps_si128(p0), 4))));
            g2 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(2, 2, 2, 2));
            pf = _mm_add_ps(pf, _mm_mul_ps(g2, _mm_castsi128_ps(
                    _mm_alignr_epi8(_mm_castps_si128(p1), _mm_castps_si128(p0), 8))));
            g3 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(3, 3, 3, 3));
            pf = _mm_add_ps(pf, _mm_mul_ps(g3, _mm_castsi128_ps(
                    _mm_alignr_epi8(_mm_castps_si128(p1), _mm_castps_si128(p0), 12))));
        }

        /* Round to int32, then gather the low byte of each lane. */
        o = _mm_cvtps_epi32(pf);
        *(int *)dst = _mm_cvtsi128_si32(_mm_shuffle_epi8(o, Mu8));
        dst = (char *)dst + 4;
    }
}
409
/*
 * YUV -> RGBA8888 conversion with interleaved chroma: each iteration reads
 * 4 Y bytes and 4 interleaved chroma bytes and writes 4 RGBA pixels;
 * count<<1 iterations total.  param holds the Q8 conversion coefficients
 * and biases (typical BT.601 values noted inline below).
 * NOTE(review): the 0xf5/0xa0 shuffles take U from the odd chroma bytes
 * and V from the even ones (V,U,V,U order); rsdIntrinsicYuvR_K handles the
 * opposite order -- confirm which corresponds to NV21 vs NV12 with caller.
 */
void rsdIntrinsicYuv_K(void *dst,
                       const unsigned char *pY, const unsigned char *pUV,
                       uint32_t count, const short *param) {
    __m128i biasY, biasUV;
    __m128i c0, c1, c2, c3, c4;

    biasY = _mm_set1_epi32(param[8]); /* 16 */
    biasUV = _mm_set1_epi32(param[16]); /* 128 */

    c0 = _mm_set1_epi32(param[0]); /* 298 */
    c1 = _mm_set1_epi32(param[1]); /* 409 */
    c2 = _mm_set1_epi32(param[2]); /* -100 */
    c3 = _mm_set1_epi32(param[3]); /* 516 */
    c4 = _mm_set1_epi32(param[4]); /* -208 */

    __m128i Y, UV, U, V, R, G, B, A;

    A = _mm_set1_epi32(255);
    uint32_t i;

    for (i = 0; i < (count << 1); ++i) {
        /* Zero-extend 4 luma and 4 chroma bytes to 32-bit lanes. */
        Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
        UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV));

        Y = _mm_sub_epi32(Y, biasY);
        UV = _mm_sub_epi32(UV, biasUV);

        /* Duplicate each chroma sample across its two luma samples. */
        U = _mm_shuffle_epi32(UV, 0xf5);
        V = _mm_shuffle_epi32(UV, 0xa0);

        Y = mullo_epi32(Y, c0);

        /* biasUV (128) doubles as the +0.5 rounding term before >>8. */
        R = _mm_add_epi32(Y, mullo_epi32(V, c1));
        R = _mm_add_epi32(R, biasUV);
        R = _mm_srai_epi32(R, 8);

        G = _mm_add_epi32(Y, mullo_epi32(U, c2));
        G = _mm_add_epi32(G, mullo_epi32(V, c4));
        G = _mm_add_epi32(G, biasUV);
        G = _mm_srai_epi32(G, 8);

        B = _mm_add_epi32(Y, mullo_epi32(U, c3));
        B = _mm_add_epi32(B, biasUV);
        B = _mm_srai_epi32(B, 8);

        __m128i y1, y2, y3, y4;

        /* Saturate each channel and interleave back to RGBA byte order. */
        y1 = packus_epi32(R, G);
        y2 = packus_epi32(B, A);
        y3 = _mm_packus_epi16(y1, y2);
        const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
                                          14, 10, 6, 2,
                                          13, 9, 5, 1,
                                          12, 8, 4, 0);
        y4 = _mm_shuffle_epi8(y3, T4x4);
        _mm_storeu_si128((__m128i *)dst, y4);
        pY += 4;
        pUV += 4;
        dst = (__m128i *)dst + 1;
    }
}
471
/*
 * YUV -> RGBA8888 conversion, identical to rsdIntrinsicYuv_K except the
 * chroma byte order is reversed: here V comes from the odd bytes and U
 * from the even bytes of pUV.  Each iteration converts 4 pixels;
 * count<<1 iterations total.  param holds Q8 coefficients and biases.
 */
void rsdIntrinsicYuvR_K(void *dst,
                        const unsigned char *pY, const unsigned char *pUV,
                        uint32_t count, const short *param) {
    __m128i biasY, biasUV;
    __m128i c0, c1, c2, c3, c4;

    biasY = _mm_set1_epi32(param[8]); /* 16 */
    biasUV = _mm_set1_epi32(param[16]); /* 128 */

    c0 = _mm_set1_epi32(param[0]); /* 298 */
    c1 = _mm_set1_epi32(param[1]); /* 409 */
    c2 = _mm_set1_epi32(param[2]); /* -100 */
    c3 = _mm_set1_epi32(param[3]); /* 516 */
    c4 = _mm_set1_epi32(param[4]); /* -208 */

    __m128i Y, UV, U, V, R, G, B, A;

    A = _mm_set1_epi32(255);
    uint32_t i;

    for (i = 0; i < (count << 1); ++i) {
        /* Zero-extend 4 luma and 4 chroma bytes to 32-bit lanes. */
        Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
        UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV));

        Y = _mm_sub_epi32(Y, biasY);
        UV = _mm_sub_epi32(UV, biasUV);

        /* Duplicate each chroma sample across its two luma samples;
         * note the U/V assignments are swapped vs. rsdIntrinsicYuv_K. */
        V = _mm_shuffle_epi32(UV, 0xf5);
        U = _mm_shuffle_epi32(UV, 0xa0);

        Y = mullo_epi32(Y, c0);

        /* biasUV (128) doubles as the +0.5 rounding term before >>8. */
        R = _mm_add_epi32(Y, mullo_epi32(V, c1));
        R = _mm_add_epi32(R, biasUV);
        R = _mm_srai_epi32(R, 8);

        G = _mm_add_epi32(Y, mullo_epi32(U, c2));
        G = _mm_add_epi32(G, mullo_epi32(V, c4));
        G = _mm_add_epi32(G, biasUV);
        G = _mm_srai_epi32(G, 8);

        B = _mm_add_epi32(Y, mullo_epi32(U, c3));
        B = _mm_add_epi32(B, biasUV);
        B = _mm_srai_epi32(B, 8);

        __m128i y1, y2, y3, y4;

        /* Saturate each channel and interleave back to RGBA byte order. */
        y1 = packus_epi32(R, G);
        y2 = packus_epi32(B, A);
        y3 = _mm_packus_epi16(y1, y2);
        const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
                                          14, 10, 6, 2,
                                          13, 9, 5, 1,
                                          12, 8, 4, 0);
        y4 = _mm_shuffle_epi8(y3, T4x4);
        _mm_storeu_si128((__m128i *)dst, y4);
        pY += 4;
        pUV += 4;
        dst = (__m128i *)dst + 1;
    }
}
533
/*
 * Planar YUV -> RGBA8888 conversion: U and V arrive in separate planes
 * (pU, pV) rather than interleaved.  Each iteration reads 4 bytes from
 * each plane and writes 4 RGBA pixels; count<<1 iterations total.
 * param holds the same Q8 coefficients and biases as the other Yuv kernels.
 */
void rsdIntrinsicYuv2_K(void *dst,
                        const unsigned char *pY, const unsigned char *pU,
                        const unsigned char *pV, uint32_t count, const short *param) {
    __m128i biasY, biasUV;
    __m128i c0, c1, c2, c3, c4;

    biasY = _mm_set1_epi32(param[8]); /* 16 */
    biasUV = _mm_set1_epi32(param[16]); /* 128 */

    c0 = _mm_set1_epi32(param[0]); /* 298 */
    c1 = _mm_set1_epi32(param[1]); /* 409 */
    c2 = _mm_set1_epi32(param[2]); /* -100 */
    c3 = _mm_set1_epi32(param[3]); /* 516 */
    c4 = _mm_set1_epi32(param[4]); /* -208 */

    __m128i Y, U, V, R, G, B, A;

    A = _mm_set1_epi32(255);
    uint32_t i;

    for (i = 0; i < (count << 1); ++i) {
        /* Zero-extend 4 bytes from each plane to 32-bit lanes. */
        Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
        U = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pU));
        V = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pV));

        Y = _mm_sub_epi32(Y, biasY);
        U = _mm_sub_epi32(U, biasUV);
        V = _mm_sub_epi32(V, biasUV);

        Y = mullo_epi32(Y, c0);

        /* biasUV (128) doubles as the +0.5 rounding term before >>8. */
        R = _mm_add_epi32(Y, mullo_epi32(V, c1));
        R = _mm_add_epi32(R, biasUV);
        R = _mm_srai_epi32(R, 8);

        G = _mm_add_epi32(Y, mullo_epi32(U, c2));
        G = _mm_add_epi32(G, mullo_epi32(V, c4));
        G = _mm_add_epi32(G, biasUV);
        G = _mm_srai_epi32(G, 8);

        B = _mm_add_epi32(Y, mullo_epi32(U, c3));
        B = _mm_add_epi32(B, biasUV);
        B = _mm_srai_epi32(B, 8);

        __m128i y1, y2, y3, y4;

        /* Saturate each channel and interleave back to RGBA byte order. */
        y1 = packus_epi32(R, G);
        y2 = packus_epi32(B, A);
        y3 = _mm_packus_epi16(y1, y2);
        const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
                                          14, 10, 6, 2,
                                          13, 9, 5, 1,
                                          12, 8, 4, 0);
        y4 = _mm_shuffle_epi8(y3, T4x4);
        _mm_storeu_si128((__m128i *)dst, y4);
        pY += 4;
        pU += 4;
        pV += 4;
        dst = (__m128i *)dst + 1;
    }
}
595
rsdIntrinsicConvolve5x5_K(void * dst,const void * y0,const void * y1,const void * y2,const void * y3,const void * y4,const short * coef,uint32_t count)596 extern "C" void rsdIntrinsicConvolve5x5_K(void *dst, const void *y0,
597 const void *y1, const void *y2,
598 const void *y3, const void *y4,
599 const short *coef, uint32_t count) {
600 __m128i x;
601 __m128i c0, c2, c4, c6, c8, c10, c12;
602 __m128i c14, c16, c18, c20, c22, c24;
603 __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9;
604 __m128i p0, p1, p2, p3, p4, p5, p6, p7;
605 __m128i p8, p9, p10, p11, p12, p13, p14, p15;
606 __m128i p16, p17, p18, p19, p20, p21, p22, p23;
607 __m128i p24, p25, p26, p27, p28, p29, p30, p31;
608 __m128i p32, p33, p34, p35, p36, p37, p38, p39;
609 __m128i o0, o1, o2, o3;
610 uint32_t i;
611
612 x = _mm_loadl_epi64((const __m128i *)(coef+0));
613 c0 = _mm_shuffle_epi32(x, 0x00);
614 c2 = _mm_shuffle_epi32(x, 0x55);
615
616 x = _mm_loadl_epi64((const __m128i *)(coef+4));
617 c4 = _mm_shuffle_epi32(x, 0x00);
618 c6 = _mm_shuffle_epi32(x, 0x55);
619
620 x = _mm_loadl_epi64((const __m128i *)(coef+8));
621 c8 = _mm_shuffle_epi32(x, 0x00);
622 c10 = _mm_shuffle_epi32(x, 0x55);
623
624 x = _mm_loadl_epi64((const __m128i *)(coef+12));
625 c12 = _mm_shuffle_epi32(x, 0x00);
626 c14 = _mm_shuffle_epi32(x, 0x55);
627
628 x = _mm_loadl_epi64((const __m128i *)(coef+16));
629 c16 = _mm_shuffle_epi32(x, 0x00);
630 c18 = _mm_shuffle_epi32(x, 0x55);
631
632 x = _mm_loadl_epi64((const __m128i *)(coef+20));
633 c20 = _mm_shuffle_epi32(x, 0x00);
634 c22 = _mm_shuffle_epi32(x, 0x55);
635
636 x = _mm_loadl_epi64((const __m128i *)(coef+24));
637 c24 = _mm_shuffle_epi32(x, 0x00);
638
639 for (i = 0; i < count; ++i) {
640
641 p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int32_t *)y0), _mm_setzero_si128());
642 p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+1)), _mm_setzero_si128());
643 p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+2)), _mm_setzero_si128());
644 p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+3)), _mm_setzero_si128());
645 p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+4)), _mm_setzero_si128());
646 p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+5)), _mm_setzero_si128());
647 p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+6)), _mm_setzero_si128());
648 p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+7)), _mm_setzero_si128());
649
650 p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1)), _mm_setzero_si128());
651 p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+1)), _mm_setzero_si128());
652 p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+2)), _mm_setzero_si128());
653 p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+3)), _mm_setzero_si128());
654 p12 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+4)), _mm_setzero_si128());
655 p13 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+5)), _mm_setzero_si128());
656 p14 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+6)), _mm_setzero_si128());
657 p15 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+7)), _mm_setzero_si128());
658
659 p16 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2)), _mm_setzero_si128());
660 p17 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+1)), _mm_setzero_si128());
661 p18 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+2)), _mm_setzero_si128());
662 p19 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+3)), _mm_setzero_si128());
663 p20 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+4)), _mm_setzero_si128());
664 p21 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+5)), _mm_setzero_si128());
665 p22 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+6)), _mm_setzero_si128());
666 p23 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+7)), _mm_setzero_si128());
667
668 p24 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3)), _mm_setzero_si128());
669 p25 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+1)), _mm_setzero_si128());
670 p26 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+2)), _mm_setzero_si128());
671 p27 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+3)), _mm_setzero_si128());
672 p28 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+4)), _mm_setzero_si128());
673 p29 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+5)), _mm_setzero_si128());
674 p30 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+6)), _mm_setzero_si128());
675 p31 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+7)), _mm_setzero_si128());
676
677 p32 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4)), _mm_setzero_si128());
678 p33 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+1)), _mm_setzero_si128());
679 p34 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+2)), _mm_setzero_si128());
680 p35 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+3)), _mm_setzero_si128());
681 p36 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+4)), _mm_setzero_si128());
682 p37 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+5)), _mm_setzero_si128());
683 p38 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+6)), _mm_setzero_si128());
684 p39 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+7)), _mm_setzero_si128());
685
686 o0 = _mm_madd_epi16( _mm_unpacklo_epi16(p0, p1), c0);
687 o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p2, p3), c2));
688 o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p4, p8), c4));
689 o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p9,p10), c6));
690 o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p11, p12), c8));
691 o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p16, p17), c10));
692 o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p18, p19), c12));
693 o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p20, p24), c14));
694 o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p25,p26), c16));
695 o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p27, p28), c18));
696 o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p32, p33), c20));
697 o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p34, p35), c22));
698 o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p36, _mm_setzero_si128()), c24));
699 o0 = _mm_srai_epi32(o0, 8);
700
701 o1 = _mm_madd_epi16( _mm_unpacklo_epi16(p1, p2), c0);
702 o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p3,p4), c2));
703 o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p5, p9), c4));
704 o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p10,p11), c6));
705 o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p12,p13), c8));
706 o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p17,p18), c10));
707 o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p19,p20), c12));
708 o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p21,p25), c14));
709 o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p26, p27), c16));
710 o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p28, p29), c18));
711 o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p33, p34), c20));
712 o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p35, p36), c22));
713 o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p37, _mm_setzero_si128()), c24));
714 o1 = _mm_srai_epi32(o1, 8);
715
716 o2 = _mm_madd_epi16( _mm_unpacklo_epi16(p2,p3), c0);
717 o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p4, p5), c2));
718 o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p6, p10), c4));
719 o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p11, p12), c6));
720 o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p13, p14), c8));
721 o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p18, p19), c10));
722 o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p20, p21), c12));
723 o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p22, p26), c14));
724 o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p27, p28), c16));
725 o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p29, p30), c18));
726 o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p34, p35), c20));
727 o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p36, p37), c22));
728 o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p38, _mm_setzero_si128()), c24));
729 o2 = _mm_srai_epi32(o2, 8);
730
731 o3 = _mm_madd_epi16( _mm_unpacklo_epi16(p3,p4), c0);
732 o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p5, p6), c2));
733 o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p7, p11), c4));
734 o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p12, p13), c6));
735 o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p14, p15), c8));
736 o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p19, p20), c10));
737 o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p21, p22), c12));
738 o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p23, p27), c14));
739 o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p28, p29), c16));
740 o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p30, p31), c18));
741 o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p35, p36), c20));
742 o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p37,p38), c22));
743 o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p39, _mm_setzero_si128()), c24));
744 o3 = _mm_srai_epi32(o3, 8);
745
746 o0 = packus_epi32(o0, o1);
747 o2 = packus_epi32(o2, o3);
748 o0 = _mm_packus_epi16(o0, o2);
749 _mm_storeu_si128((__m128i *)dst, o0);
750
751 y0 = (const char *)y0 + 16;
752 y1 = (const char *)y1 + 16;
753 y2 = (const char *)y2 + 16;
754 y3 = (const char *)y3 + 16;
755 y4 = (const char *)y4 + 16;
756 dst = (char *)dst + 16;
757 }
758 }
759
/*
 * Porter-Duff "source over" blend on premultiplied RGBA8888:
 * dst = src + dst * (255 - src.a) >> 8, per channel.  Processes eight
 * pixels (two 16-byte vectors) per iteration, count8 iterations total.
 * NOTE(review): >>8 approximates /255, matching the other kernels here.
 */
void rsdIntrinsicBlendSrcOver_K(void *dst, const void *src, uint32_t count8) {
    __m128i all1s, ina, ins;
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    all1s = _mm_set1_epi16(255);

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        /* Two source pixels as 16-bit lanes; replicate each pixel's alpha
         * (lane 3/7) across its four channels. */
        ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
        t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, ina));
        t0 = _mm_srai_epi16(t0, 8);
        t0 = _mm_add_epi16(t0, ins);

        ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
        t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, ina));
        t1 = _mm_srai_epi16(t1, 8);
        t1 = _mm_add_epi16(t1, ins);

        ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
        t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, ina));
        t2 = _mm_srai_epi16(t2, 8);
        t2 = _mm_add_epi16(t2, ins);

        ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
        t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, ina));
        t3 = _mm_srai_epi16(t3, 8);
        t3 = _mm_add_epi16(t3, ins);

        /* Saturate the 16-bit results back to bytes and store. */
        t0 = _mm_packus_epi16(t0, t1);
        t2 = _mm_packus_epi16(t2, t3);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}
815
/*
 * Porter-Duff "destination over": dst = dst + src * (255 - dst.a) >> 8,
 * per channel, on premultiplied RGBA8888 pixels.  Processes 8 pixels
 * (two 16-byte vectors) per iteration; count8 is the number of
 * 8-pixel groups.
 *
 * Fix: the weighted src product can reach 255*255 = 0xFE01, which the
 * previous arithmetic shift (_mm_srai_epi16) sign-extended, so packus
 * clamped the channel to 0 (an opaque source over a transparent
 * destination produced black).  _mm_srli_epi16 keeps the product
 * unsigned, matching the srli used by the Multiply/Atop kernels.
 */
void rsdIntrinsicBlendDstOver_K(void *dst, const void *src, uint32_t count8) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i all255 = _mm_set1_epi16(255);
    const __m128i *sp = (const __m128i *)src;
    __m128i *dp = (__m128i *)dst;
    uint32_t i;

    for (i = 0; i < count8; ++i) {
        __m128i packed[2];
        int j, h;
        for (j = 0; j < 2; ++j) {
            __m128i s = _mm_loadu_si128(sp + j);
            __m128i d = _mm_loadu_si128(dp + j);
            __m128i half[2];
            for (h = 0; h < 2; ++h) {
                /* Widen one 4-pixel half (low then high) to 16-bit lanes. */
                __m128i sw = h ? _mm_unpackhi_epi8(s, zero) : _mm_unpacklo_epi8(s, zero);
                __m128i dw = h ? _mm_unpackhi_epi8(d, zero) : _mm_unpacklo_epi8(d, zero);
                /* Broadcast each pixel's alpha (word 3 of each 64-bit half). */
                __m128i da = _mm_shufflehi_epi16(_mm_shufflelo_epi16(dw, 0xFF), 0xFF);
                __m128i r = _mm_mullo_epi16(sw, _mm_sub_epi16(all255, da));
                r = _mm_srli_epi16(r, 8);  /* logical shift: product is unsigned */
                half[h] = _mm_add_epi16(r, dw);
            }
            packed[j] = _mm_packus_epi16(half[0], half[1]);
        }
        _mm_storeu_si128(dp, packed[0]);
        _mm_storeu_si128(dp + 1, packed[1]);
        sp += 2;
        dp += 2;
    }
}
872
/*
 * Porter-Duff "source in": dst = src * dst.a >> 8, per channel, on
 * premultiplied RGBA8888 pixels.  Processes 8 pixels (two 16-byte
 * vectors) per iteration; count8 is the number of 8-pixel groups.
 *
 * Fix: the product can reach 255*255 = 0xFE01, which the previous
 * arithmetic shift (_mm_srai_epi16) sign-extended, so packus clamped
 * the channel to 0 instead of 254.  _mm_srli_epi16 treats the product
 * as unsigned, matching the srli used by the Multiply/Atop kernels.
 */
void rsdIntrinsicBlendSrcIn_K(void *dst, const void *src, uint32_t count8) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i *sp = (const __m128i *)src;
    __m128i *dp = (__m128i *)dst;
    uint32_t i;

    for (i = 0; i < count8; ++i) {
        __m128i packed[2];
        int j, h;
        for (j = 0; j < 2; ++j) {
            __m128i s = _mm_loadu_si128(sp + j);
            __m128i d = _mm_loadu_si128(dp + j);
            __m128i half[2];
            for (h = 0; h < 2; ++h) {
                /* Widen one 4-pixel half (low then high) to 16-bit lanes. */
                __m128i sw = h ? _mm_unpackhi_epi8(s, zero) : _mm_unpacklo_epi8(s, zero);
                __m128i dw = h ? _mm_unpackhi_epi8(d, zero) : _mm_unpacklo_epi8(d, zero);
                /* Broadcast each pixel's alpha (word 3 of each 64-bit half). */
                __m128i da = _mm_shufflehi_epi16(_mm_shufflelo_epi16(dw, 0xFF), 0xFF);
                __m128i r = _mm_mullo_epi16(sw, da);
                half[h] = _mm_srli_epi16(r, 8);  /* logical shift: product is unsigned */
            }
            packed[j] = _mm_packus_epi16(half[0], half[1]);
        }
        _mm_storeu_si128(dp, packed[0]);
        _mm_storeu_si128(dp + 1, packed[1]);
        sp += 2;
        dp += 2;
    }
}
922
/*
 * Porter-Duff "destination in": dst = dst * src.a >> 8, per channel,
 * on premultiplied RGBA8888 pixels.  Processes 8 pixels (two 16-byte
 * vectors) per iteration; count8 is the number of 8-pixel groups.
 *
 * Fix: the product can reach 255*255 = 0xFE01, which the previous
 * arithmetic shift (_mm_srai_epi16) sign-extended, so packus clamped
 * the channel to 0 instead of 254.  _mm_srli_epi16 treats the product
 * as unsigned, matching the srli used by the Multiply/Atop kernels.
 */
void rsdIntrinsicBlendDstIn_K(void *dst, const void *src, uint32_t count8) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i *sp = (const __m128i *)src;
    __m128i *dp = (__m128i *)dst;
    uint32_t i;

    for (i = 0; i < count8; ++i) {
        __m128i packed[2];
        int j, h;
        for (j = 0; j < 2; ++j) {
            __m128i s = _mm_loadu_si128(sp + j);
            __m128i d = _mm_loadu_si128(dp + j);
            __m128i half[2];
            for (h = 0; h < 2; ++h) {
                /* Widen one 4-pixel half (low then high) to 16-bit lanes. */
                __m128i sw = h ? _mm_unpackhi_epi8(s, zero) : _mm_unpacklo_epi8(s, zero);
                __m128i dw = h ? _mm_unpackhi_epi8(d, zero) : _mm_unpacklo_epi8(d, zero);
                /* Broadcast each pixel's alpha (word 3 of each 64-bit half). */
                __m128i sa = _mm_shufflehi_epi16(_mm_shufflelo_epi16(sw, 0xFF), 0xFF);
                __m128i r = _mm_mullo_epi16(dw, sa);
                half[h] = _mm_srli_epi16(r, 8);  /* logical shift: product is unsigned */
            }
            packed[j] = _mm_packus_epi16(half[0], half[1]);
        }
        _mm_storeu_si128(dp, packed[0]);
        _mm_storeu_si128(dp + 1, packed[1]);
        sp += 2;
        dp += 2;
    }
}
972
/*
 * Porter-Duff "source out": dst = src * (255 - dst.a) >> 8, per
 * channel, on premultiplied RGBA8888 pixels.  Processes 8 pixels
 * (two 16-byte vectors) per iteration; count8 is the number of
 * 8-pixel groups.
 *
 * Fix: the product can reach 255*255 = 0xFE01, which the previous
 * arithmetic shift (_mm_srai_epi16) sign-extended, so packus clamped
 * the channel to 0 instead of 254.  _mm_srli_epi16 treats the product
 * as unsigned, matching the srli used by the Multiply/Atop kernels.
 */
void rsdIntrinsicBlendSrcOut_K(void *dst, const void *src, uint32_t count8) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i all255 = _mm_set1_epi16(255);
    const __m128i *sp = (const __m128i *)src;
    __m128i *dp = (__m128i *)dst;
    uint32_t i;

    for (i = 0; i < count8; ++i) {
        __m128i packed[2];
        int j, h;
        for (j = 0; j < 2; ++j) {
            __m128i s = _mm_loadu_si128(sp + j);
            __m128i d = _mm_loadu_si128(dp + j);
            __m128i half[2];
            for (h = 0; h < 2; ++h) {
                /* Widen one 4-pixel half (low then high) to 16-bit lanes. */
                __m128i sw = h ? _mm_unpackhi_epi8(s, zero) : _mm_unpacklo_epi8(s, zero);
                __m128i dw = h ? _mm_unpackhi_epi8(d, zero) : _mm_unpacklo_epi8(d, zero);
                /* Broadcast each pixel's alpha (word 3 of each 64-bit half). */
                __m128i da = _mm_shufflehi_epi16(_mm_shufflelo_epi16(dw, 0xFF), 0xFF);
                __m128i r = _mm_mullo_epi16(sw, _mm_sub_epi16(all255, da));
                half[h] = _mm_srli_epi16(r, 8);  /* logical shift: product is unsigned */
            }
            packed[j] = _mm_packus_epi16(half[0], half[1]);
        }
        _mm_storeu_si128(dp, packed[0]);
        _mm_storeu_si128(dp + 1, packed[1]);
        sp += 2;
        dp += 2;
    }
}
1024
/*
 * Porter-Duff "destination out": dst = dst * (255 - src.a) >> 8, per
 * channel, on premultiplied RGBA8888 pixels.  Processes 8 pixels
 * (two 16-byte vectors) per iteration; count8 is the number of
 * 8-pixel groups.
 *
 * Fix: the product can reach 255*255 = 0xFE01, which the previous
 * arithmetic shift (_mm_srai_epi16) sign-extended, so packus clamped
 * the channel to 0 instead of 254 (a transparent source erased an
 * opaque destination).  _mm_srli_epi16 treats the product as unsigned,
 * matching the srli used by the Multiply/Atop kernels.
 */
void rsdIntrinsicBlendDstOut_K(void *dst, const void *src, uint32_t count8) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i all255 = _mm_set1_epi16(255);
    const __m128i *sp = (const __m128i *)src;
    __m128i *dp = (__m128i *)dst;
    uint32_t i;

    for (i = 0; i < count8; ++i) {
        __m128i packed[2];
        int j, h;
        for (j = 0; j < 2; ++j) {
            __m128i s = _mm_loadu_si128(sp + j);
            __m128i d = _mm_loadu_si128(dp + j);
            __m128i half[2];
            for (h = 0; h < 2; ++h) {
                /* Widen one 4-pixel half (low then high) to 16-bit lanes. */
                __m128i sw = h ? _mm_unpackhi_epi8(s, zero) : _mm_unpacklo_epi8(s, zero);
                __m128i dw = h ? _mm_unpackhi_epi8(d, zero) : _mm_unpacklo_epi8(d, zero);
                /* Broadcast each pixel's alpha (word 3 of each 64-bit half). */
                __m128i sa = _mm_shufflehi_epi16(_mm_shufflelo_epi16(sw, 0xFF), 0xFF);
                __m128i r = _mm_mullo_epi16(dw, _mm_sub_epi16(all255, sa));
                half[h] = _mm_srli_epi16(r, 8);  /* logical shift: product is unsigned */
            }
            packed[j] = _mm_packus_epi16(half[0], half[1]);
        }
        _mm_storeu_si128(dp, packed[0]);
        _mm_storeu_si128(dp + 1, packed[1]);
        sp += 2;
        dp += 2;
    }
}
1076
rsdIntrinsicBlendSrcAtop_K(void * dst,const void * src,uint32_t count8)1077 void rsdIntrinsicBlendSrcAtop_K(void *dst, const void *src, uint32_t count8) {
1078 const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000);
1079 __m128i all1s, ina, outa, ins, outs;
1080 __m128i in0, in1, out0, out1;
1081 __m128i t0, t1, t2, t3;
1082 uint32_t i;
1083
1084 all1s = _mm_set1_epi16(255);
1085
1086 for (i = 0; i < count8; ++i) {
1087 in0 = _mm_loadu_si128((const __m128i *)src);
1088 in1 = _mm_loadu_si128((const __m128i *)src + 1);
1089 out0 = _mm_loadu_si128((const __m128i *)dst);
1090 out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1091
1092 ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
1093 ina = _mm_shufflelo_epi16(ins, 0xFF);
1094 ina = _mm_shufflehi_epi16(ina, 0xFF);
1095 outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
1096 outa = _mm_shufflelo_epi16(outs, 0xFF);
1097 outa = _mm_shufflehi_epi16(outa, 0xFF);
1098 t0 = _mm_sub_epi16(all1s, ina);
1099 t0 = _mm_mullo_epi16(t0, outs);
1100 t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(outa, ins));
1101 t0 = _mm_srli_epi16(t0, 8);
1102
1103 ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
1104 ina = _mm_shufflelo_epi16(ins, 0xFF);
1105 ina = _mm_shufflehi_epi16(ina, 0xFF);
1106 outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
1107 outa = _mm_shufflelo_epi16(outs, 0xFF);
1108 outa = _mm_shufflehi_epi16(outa, 0xFF);
1109 t1 = _mm_sub_epi16(all1s, ina);
1110 t1 = _mm_mullo_epi16(t1, outs);
1111 t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(outa, ins));
1112 t1 = _mm_srli_epi16(t1, 8);
1113
1114 ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
1115 ina = _mm_shufflelo_epi16(ins, 0xFF);
1116 ina = _mm_shufflehi_epi16(ina, 0xFF);
1117 outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
1118 outa = _mm_shufflelo_epi16(outs, 0xFF);
1119 outa = _mm_shufflehi_epi16(outa, 0xFF);
1120 t2 = _mm_sub_epi16(all1s, ina);
1121 t2 = _mm_mullo_epi16(t2, outs);
1122 t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(outa, ins));
1123 t2 = _mm_srli_epi16(t2, 8);
1124
1125 ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
1126 ina = _mm_shufflelo_epi16(ins, 0xFF);
1127 ina = _mm_shufflehi_epi16(ina, 0xFF);
1128 outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
1129 outa = _mm_shufflelo_epi16(outs, 0xFF);
1130 outa = _mm_shufflehi_epi16(outa, 0xFF);
1131 t3 = _mm_sub_epi16(all1s, ina);
1132 t3 = _mm_mullo_epi16(t3, outs);
1133 t3 = _mm_adds_epu16(t3, _mm_mullo_epi16(outa, ins));
1134 t3 = _mm_srli_epi16(t3, 8);
1135
1136 t0 = _mm_packus_epi16(t0, t1);
1137 t0 = blendv_epi8(t0, out0, M0001);
1138 t2 = _mm_packus_epi16(t2, t3);
1139 t2 = blendv_epi8(t2, out1, M0001);
1140 _mm_storeu_si128((__m128i *)dst, t0);
1141 _mm_storeu_si128((__m128i *)dst + 1, t2);
1142
1143 src = (const __m128i *)src + 2;
1144 dst = (__m128i *)dst + 2;
1145 }
1146 }
1147
rsdIntrinsicBlendDstAtop_K(void * dst,const void * src,uint32_t count8)1148 void rsdIntrinsicBlendDstAtop_K(void *dst, const void *src, uint32_t count8) {
1149 const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000);
1150 __m128i all1s, ina, ins, outa, outs;
1151 __m128i in0, in1, out0, out1;
1152 __m128i t0, t1, t2, t3;
1153 uint32_t i;
1154
1155 all1s = _mm_set1_epi16(255);
1156
1157 for (i = 0; i < count8; ++i) {
1158 in0 = _mm_loadu_si128((const __m128i *)src);
1159 in1 = _mm_loadu_si128((const __m128i *)src + 1);
1160 out0 = _mm_loadu_si128((const __m128i *)dst);
1161 out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1162
1163 ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
1164 ina = _mm_shufflelo_epi16(ins, 0xFF);
1165 ina = _mm_shufflehi_epi16(ina, 0xFF);
1166 outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
1167 outa = _mm_shufflelo_epi16(outs, 0xFF);
1168 outa = _mm_shufflehi_epi16(outa, 0xFF);
1169 t0 = _mm_sub_epi16(all1s, outa);
1170 t0 = _mm_mullo_epi16(t0, ins);
1171 t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(ina, outs));
1172 t0 = _mm_srli_epi16(t0, 8);
1173
1174 ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
1175 ina = _mm_shufflelo_epi16(ins, 0xFF);
1176 ina = _mm_shufflehi_epi16(ina, 0xFF);
1177 outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
1178 outa = _mm_shufflelo_epi16(outs, 0xFF);
1179 outa = _mm_shufflehi_epi16(outa, 0xFF);
1180 t1 = _mm_sub_epi16(all1s, outa);
1181 t1 = _mm_mullo_epi16(t1, ins);
1182 t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(ina, outs));
1183 t1 = _mm_srli_epi16(t1, 8);
1184
1185 ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
1186 ina = _mm_shufflelo_epi16(ins, 0xFF);
1187 ina = _mm_shufflehi_epi16(ina, 0xFF);
1188 outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
1189 outa = _mm_shufflelo_epi16(outs, 0xFF);
1190 outa = _mm_shufflehi_epi16(outa, 0xFF);
1191 t2 = _mm_sub_epi16(all1s, outa);
1192 t2 = _mm_mullo_epi16(t2, ins);
1193 t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(ina, outs));
1194 t2 = _mm_srli_epi16(t2, 8);
1195
1196 ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
1197 ina = _mm_shufflelo_epi16(ins, 0xFF);
1198 ina = _mm_shufflehi_epi16(ina, 0xFF);
1199 outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
1200 outa = _mm_shufflelo_epi16(outs, 0xFF);
1201 outa = _mm_shufflehi_epi16(outa, 0xFF);
1202 t3 = _mm_sub_epi16(all1s, outa);
1203 t3 = _mm_mullo_epi16(t3, ins);
1204 t3 = _mm_adds_epu16(t3, _mm_mullo_epi16(ina, outs));
1205 t3 = _mm_srli_epi16(t3, 8);
1206
1207 t0 = _mm_packus_epi16(t0, t1);
1208 t0 = blendv_epi8(t0, out0, M0001);
1209 t2 = _mm_packus_epi16(t2, t3);
1210 t2 = blendv_epi8(t2, out1, M0001);
1211 _mm_storeu_si128((__m128i *)dst, t0);
1212 _mm_storeu_si128((__m128i *)dst + 1, t2);
1213
1214 src = (const __m128i *)src + 2;
1215 dst = (__m128i *)dst + 2;
1216 }
1217 }
1218
/*
 * Bitwise XOR blend: dst ^= src, byte for byte, over 8 RGBA pixels
 * (two 16-byte vectors) per iteration.  count8 counts 8-pixel groups.
 */
void rsdIntrinsicBlendXor_K(void *dst, const void *src, uint32_t count8) {
    const __m128i *sp = (const __m128i *)src;
    __m128i *dp = (__m128i *)dst;

    while (count8--) {
        __m128i s0 = _mm_loadu_si128(sp);
        __m128i s1 = _mm_loadu_si128(sp + 1);
        __m128i d0 = _mm_loadu_si128(dp);
        __m128i d1 = _mm_loadu_si128(dp + 1);
        _mm_storeu_si128(dp, _mm_xor_si128(d0, s0));
        _mm_storeu_si128(dp + 1, _mm_xor_si128(d1, s1));
        sp += 2;
        dp += 2;
    }
}
1239
/*
 * Multiply blend: dst = (dst * src) >> 8 per channel, over 8 RGBA
 * pixels (two 16-byte vectors) per iteration.  count8 counts 8-pixel
 * groups.  Bytes are widened to 16-bit lanes so the 8-bit products
 * fit, then the halves are repacked with unsigned saturation.
 */
void rsdIntrinsicBlendMultiply_K(void *dst, const void *src, uint32_t count8) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i *sp = (const __m128i *)src;
    __m128i *dp = (__m128i *)dst;

    while (count8--) {
        __m128i packed[2];
        int j;
        for (j = 0; j < 2; ++j) {
            __m128i s = _mm_loadu_si128(sp + j);
            __m128i d = _mm_loadu_si128(dp + j);
            __m128i lo = _mm_srli_epi16(
                _mm_mullo_epi16(_mm_unpacklo_epi8(s, zero), _mm_unpacklo_epi8(d, zero)), 8);
            __m128i hi = _mm_srli_epi16(
                _mm_mullo_epi16(_mm_unpackhi_epi8(s, zero), _mm_unpackhi_epi8(d, zero)), 8);
            packed[j] = _mm_packus_epi16(lo, hi);
        }
        _mm_storeu_si128(dp, packed[0]);
        _mm_storeu_si128(dp + 1, packed[1]);
        sp += 2;
        dp += 2;
    }
}
1276
/*
 * Additive blend: dst = saturate(dst + src) per byte (unsigned,
 * clamped at 255), over 8 RGBA pixels (two 16-byte vectors) per
 * iteration.  count8 counts 8-pixel groups.
 */
void rsdIntrinsicBlendAdd_K(void *dst, const void *src, uint32_t count8) {
    const __m128i *sp = (const __m128i *)src;
    __m128i *dp = (__m128i *)dst;

    while (count8--) {
        __m128i s0 = _mm_loadu_si128(sp);
        __m128i s1 = _mm_loadu_si128(sp + 1);
        __m128i d0 = _mm_loadu_si128(dp);
        __m128i d1 = _mm_loadu_si128(dp + 1);
        _mm_storeu_si128(dp, _mm_adds_epu8(d0, s0));
        _mm_storeu_si128(dp + 1, _mm_adds_epu8(d1, s1));
        sp += 2;
        dp += 2;
    }
}
1297
/*
 * Subtractive blend: dst = saturate(dst - src) per byte (unsigned,
 * clamped at 0), over 8 RGBA pixels (two 16-byte vectors) per
 * iteration.  count8 counts 8-pixel groups.
 */
void rsdIntrinsicBlendSub_K(void *dst, const void *src, uint32_t count8) {
    const __m128i *sp = (const __m128i *)src;
    __m128i *dp = (__m128i *)dst;

    while (count8--) {
        __m128i s0 = _mm_loadu_si128(sp);
        __m128i s1 = _mm_loadu_si128(sp + 1);
        __m128i d0 = _mm_loadu_si128(dp);
        __m128i d1 = _mm_loadu_si128(dp + 1);
        _mm_storeu_si128(dp, _mm_subs_epu8(d0, s0));
        _mm_storeu_si128(dp + 1, _mm_subs_epu8(d1, s1));
        sp += 2;
        dp += 2;
    }
}
1318