/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "SkJumper.h"
#include <string.h>

#define SI static inline

template <typename T, typename P>
SI T unaligned_load(const P* p) {
    T v;
    memcpy(&v, p, sizeof(v));
    return v;
}

template <typename Dst, typename Src>
SI Dst bit_cast(const Src& src) {
    static_assert(sizeof(Dst) == sizeof(Src), "");
    return unaligned_load<Dst>(&src);
}

// A couple functions for embedding constants directly into code,
// so that no .const or .literal4 section is created.
SI int C(int x) {
#if defined(JUMPER) && defined(__x86_64__)
    // Move x-the-compile-time-constant as a literal into x-the-register.
    asm("mov %1, %0" : "=r"(x) : "i"(x));
#endif
    return x;
}
SI float C(float f) {
    int x = C(unaligned_load<int>(&f));
    return unaligned_load<float>(&x);
}
SI int   operator "" _i(unsigned long long int i) { return C(  (int)i); }
SI float operator "" _f(           long double f) { return C((float)f); }
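// For example, 0xff_i and 255.0_f in the stages below expand to C(0xff) and C(255.0f),
// so those constants arrive in registers as immediates rather than as loads from a
// constant pool.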

// Not all constants can be generated using C() or _i/_f.  We read the rest from this struct.
using K = const SkJumper_constants;

#if !defined(JUMPER)
    // This path should lead to portable code that can be compiled directly into Skia.
    // (All other paths are compiled offline by Clang into SkJumper_generated.h.)
    #include <math.h>

    using F   = float;
    using I32 = int32_t;
    using U32 = uint32_t;
    using U16 = uint16_t;
    using U8  = uint8_t;

    SI F   mad(F f, F m, F a)   { return f*m+a; }
    SI F   min(F a, F b)        { return fminf(a,b); }
    SI F   max(F a, F b)        { return fmaxf(a,b); }
    SI F   abs_  (F v)          { return fabsf(v); }
    SI F   floor_(F v)          { return floorf(v); }
    SI F   rcp   (F v)          { return 1.0f / v; }
    SI F   rsqrt (F v)          { return 1.0f / sqrtf(v); }
    SI U32 round (F v, F scale) { return (uint32_t)lrintf(v*scale); }
    SI U16 pack(U32 v)          { return (U16)v; }
    SI U8  pack(U16 v)          { return  (U8)v; }

    SI F if_then_else(I32 c, F t, F e) { return c ? t : e; }

    SI F gather(const float* p, U32 ix) { return p[ix]; }

    #define WRAP(name) sk_##name

#elif defined(__aarch64__)
    #include <arm_neon.h>

    // Since we know we're using Clang, we can use its vector extensions.
    using F   = float    __attribute__((ext_vector_type(4)));
    using I32 =  int32_t __attribute__((ext_vector_type(4)));
    using U32 = uint32_t __attribute__((ext_vector_type(4)));
    using U16 = uint16_t __attribute__((ext_vector_type(4)));
    using U8  = uint8_t  __attribute__((ext_vector_type(4)));

    // We polyfill a few routines that Clang doesn't build into ext_vector_types.
    SI F   mad(F f, F m, F a)   { return vfmaq_f32(a,f,m); }
    SI F   min(F a, F b)        { return vminq_f32(a,b); }
    SI F   max(F a, F b)        { return vmaxq_f32(a,b); }
    SI F   abs_  (F v)          { return vabsq_f32(v); }
    SI F   floor_(F v)          { return vrndmq_f32(v); }
    SI F   rcp   (F v)          { auto e = vrecpeq_f32 (v); return vrecpsq_f32 (v,e  ) * e; }
    SI F   rsqrt (F v)          { auto e = vrsqrteq_f32(v); return vrsqrtsq_f32(v,e*e) * e; }
    SI U32 round (F v, F scale) { return vcvtnq_u32_f32(v*scale); }
    SI U16 pack(U32 v)          { return __builtin_convertvector(v, U16); }
    SI U8  pack(U16 v)          { return __builtin_convertvector(v,  U8); }

    SI F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); }

    SI F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; }

    #define WRAP(name) sk_##name##_aarch64

#elif defined(__arm__)
    #if defined(__thumb2__) || !defined(__ARM_ARCH_7A__) || !defined(__ARM_VFPV4__)
        #error On ARMv7, compile with -march=armv7-a -mfpu=neon-vfpv4, without -mthumb.
    #endif
    #include <arm_neon.h>

    // We can pass {s0-s15} as arguments under AAPCS-VFP.  We'll slice that as 8 d-registers.
    using F   = float    __attribute__((ext_vector_type(2)));
    using I32 =  int32_t __attribute__((ext_vector_type(2)));
    using U32 = uint32_t __attribute__((ext_vector_type(2)));
    using U16 = uint16_t __attribute__((ext_vector_type(2)));
    using U8  = uint8_t  __attribute__((ext_vector_type(2)));

    SI F   mad(F f, F m, F a)  { return vfma_f32(a,f,m); }
    SI F   min(F a, F b)       { return vmin_f32(a,b); }
    SI F   max(F a, F b)       { return vmax_f32(a,b); }
    SI F   abs_ (F v)          { return vabs_f32(v); }
    SI F   rcp  (F v)          { auto e = vrecpe_f32 (v); return vrecps_f32 (v,e  ) * e; }
    SI F   rsqrt(F v)          { auto e = vrsqrte_f32(v); return vrsqrts_f32(v,e*e) * e; }
    SI U32 round(F v, F scale) { return vcvt_u32_f32(mad(v,scale,0.5f)); }
    SI U16 pack(U32 v)         { return __builtin_convertvector(v, U16); }
    SI U8  pack(U16 v)         { return __builtin_convertvector(v,  U8); }

    SI F if_then_else(I32 c, F t, F e) { return vbsl_f32((U32)c,t,e); }

    SI F floor_(F v) {
        F roundtrip = vcvt_f32_s32(vcvt_s32_f32(v));
        return roundtrip - if_then_else(roundtrip > v, 1.0_f, 0);
    }
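    // floor_() above: for v = -1.5 the int round-trip truncates toward zero to -1.0,
    // which is greater than v, so subtracting 1.0 lands on the true floor, -2.0.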

    SI F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]]}; }

    #define WRAP(name) sk_##name##_vfp4

#elif defined(__AVX__)
    #include <immintrin.h>

    // These are __m256 and __m256i, but friendlier and strongly-typed.
    using F   = float    __attribute__((ext_vector_type(8)));
    using I32 =  int32_t __attribute__((ext_vector_type(8)));
    using U32 = uint32_t __attribute__((ext_vector_type(8)));
    using U16 = uint16_t __attribute__((ext_vector_type(8)));
    using U8  = uint8_t  __attribute__((ext_vector_type(8)));

    SI F mad(F f, F m, F a)  {
    #if defined(__FMA__)
        return _mm256_fmadd_ps(f,m,a);
    #else
        return f*m+a;
    #endif
    }

    SI F   min(F a, F b)        { return _mm256_min_ps(a,b);    }
    SI F   max(F a, F b)        { return _mm256_max_ps(a,b);    }
    SI F   abs_  (F v)          { return _mm256_and_ps(v, 0-v); }
    SI F   floor_(F v)          { return _mm256_floor_ps(v);    }
    SI F   rcp   (F v)          { return _mm256_rcp_ps  (v);    }
    SI F   rsqrt (F v)          { return _mm256_rsqrt_ps(v);    }
    SI U32 round (F v, F scale) { return _mm256_cvtps_epi32(v*scale); }

    SI U16 pack(U32 v) {
        return _mm_packus_epi32(_mm256_extractf128_si256(v, 0),
                                _mm256_extractf128_si256(v, 1));
    }
    SI U8 pack(U16 v) {
        auto r = _mm_packus_epi16(v,v);
        return unaligned_load<U8>(&r);
    }

    SI F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }

    SI F gather(const float* p, U32 ix) {
    #if defined(__AVX2__)
        return _mm256_i32gather_ps(p, ix, 4);
    #else
        return { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
                 p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]], };
    #endif
    }

    #if defined(__AVX2__) && defined(__F16C__) && defined(__FMA__)
        #define WRAP(name) sk_##name##_hsw
    #else
        #define WRAP(name) sk_##name##_avx
    #endif

#elif defined(__SSE2__)
    #include <immintrin.h>

    using F   = float    __attribute__((ext_vector_type(4)));
    using I32 =  int32_t __attribute__((ext_vector_type(4)));
    using U32 = uint32_t __attribute__((ext_vector_type(4)));
    using U16 = uint16_t __attribute__((ext_vector_type(4)));
    using U8  = uint8_t  __attribute__((ext_vector_type(4)));

    SI F   mad(F f, F m, F a)  { return f*m+a;              }
    SI F   min(F a, F b)       { return _mm_min_ps(a,b);    }
    SI F   max(F a, F b)       { return _mm_max_ps(a,b);    }
    SI F   abs_(F v)           { return _mm_and_ps(v, 0-v); }
    SI F   rcp  (F v)          { return _mm_rcp_ps  (v);    }
    SI F   rsqrt(F v)          { return _mm_rsqrt_ps(v);    }
    SI U32 round(F v, F scale) { return _mm_cvtps_epi32(v*scale); }

    SI U16 pack(U32 v) {
    #if defined(__SSE4_1__)
        auto p = _mm_packus_epi32(v,v);
    #else
        // Sign extend so that _mm_packs_epi32() does the pack we want.
        auto p = _mm_srai_epi32(_mm_slli_epi32(v, 16), 16);
        p = _mm_packs_epi32(p,p);
    #endif
        return unaligned_load<U16>(&p);  // We have two copies.  Return (the lower) one.
    }
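    // In the SSE2 branch above, sign-extending the low 16 bits keeps every 16-bit
    // pattern inside int16 range, so _mm_packs_epi32()'s signed saturation never clips
    // and the packed low 16 bits match what _mm_packus_epi32() would have produced.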
    SI U8 pack(U16 v) {
        __m128i r;
        memcpy(&r, &v, sizeof(v));
        r = _mm_packus_epi16(r,r);
        return unaligned_load<U8>(&r);
    }

    SI F if_then_else(I32 c, F t, F e) {
        return _mm_or_ps(_mm_and_ps(c, t), _mm_andnot_ps(c, e));
    }

    SI F floor_(F v) {
    #if defined(__SSE4_1__)
        return _mm_floor_ps(v);
    #else
        F roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(v));
        return roundtrip - if_then_else(roundtrip > v, 1.0_f, 0);
    #endif
    }

    SI F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; }

    #if defined(__SSE4_1__)
        #define WRAP(name) sk_##name##_sse41
    #else
        #define WRAP(name) sk_##name##_sse2
    #endif
#endif

static const size_t kStride = sizeof(F) / sizeof(float);
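// That's 8 pixels per pipeline step in AVX+, 4 in SSE and NEON, 2 on ARMv7, and 1 in portable code.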

// We need to be careful with casts.
// (F)x means cast x to float in the portable path, but bit_cast x to float in the others.
// These named casts and bit_cast() are always what they seem to be.
#if defined(JUMPER)
    SI F   cast  (U32 v) { return __builtin_convertvector((I32)v,   F); }
    SI U32 expand(U16 v) { return __builtin_convertvector(     v, U32); }
    SI U32 expand(U8  v) { return __builtin_convertvector(     v, U32); }
#else
    SI F   cast  (U32 v) { return   (F)v; }
    SI U32 expand(U16 v) { return (U32)v; }
    SI U32 expand(U8  v) { return (U32)v; }
#endif

template <typename V, typename T>
SI V load(const T* src, size_t tail) {
#if defined(JUMPER)
    __builtin_assume(tail < kStride);
    if (__builtin_expect(tail, 0)) {
        V v{};  // Any inactive lanes are zeroed.
        switch (tail-1) {
            case 6: v[6] = src[6];
            case 5: v[5] = src[5];
            case 4: v[4] = src[4];
            case 3: v[3] = src[3];
            case 2: v[2] = src[2];
            case 1: v[1] = src[1];
            case 0: v[0] = src[0];
        }
        return v;
    }
#endif
    return unaligned_load<V>(src);
}

template <typename V, typename T>
SI void store(T* dst, V v, size_t tail) {
#if defined(JUMPER)
    __builtin_assume(tail < kStride);
    if (__builtin_expect(tail, 0)) {
        switch (tail-1) {
            case 6: dst[6] = v[6];
            case 5: dst[5] = v[5];
            case 4: dst[4] = v[4];
            case 3: dst[3] = v[3];
            case 2: dst[2] = v[2];
            case 1: dst[1] = v[1];
            case 0: dst[0] = v[0];
        }
        return;
    }
#endif
    memcpy(dst, &v, sizeof(v));
}
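// The switches in load() and store() deliberately fall through: starting at case tail-1
// copies elements tail-1 down to 0, so e.g. tail == 3 moves exactly lanes [0..2].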

#if 1 && defined(JUMPER) && defined(__AVX__)
    template <>
    inline U8 load(const uint8_t* src, size_t tail) {
        if (__builtin_expect(tail, 0)) {
            uint64_t v = 0;
            size_t shift = 0;
            #pragma nounroll
            while (tail --> 0) {
                v |= (uint64_t)*src++ << shift;
                shift += 8;
            }
            return unaligned_load<U8>(&v);
        }
        return unaligned_load<U8>(src);
    }
#endif

#if 1 && defined(JUMPER) && defined(__AVX2__)
    SI U32 mask(size_t tail) {
        // It's easiest to build the mask as 8 8-bit values, either 0x00 or 0xff.
        // Start fully on, then shift away lanes from the top until we've got our mask.
        uint64_t mask = 0xffffffffffffffff >> 8*(kStride-tail);

        // Sign-extend each mask lane to its full width, 0x00000000 or 0xffffffff.
        return _mm256_cvtepi8_epi32(_mm_cvtsi64_si128((int64_t)mask));
    }
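    // e.g. with kStride == 8 and tail == 3, mask is 0x0000000000ffffff, which
    // sign-extends to {~0,~0,~0, 0,0,0,0,0}: only the first three lanes stay active.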

    template <>
    inline U32 load(const uint32_t* src, size_t tail) {
        __builtin_assume(tail < kStride);
        if (__builtin_expect(tail, 0)) {
            return _mm256_maskload_epi32((const int*)src, mask(tail));
        }
        return unaligned_load<U32>(src);
    }

    template <>
    inline void store(uint32_t* dst, U32 v, size_t tail) {
        __builtin_assume(tail < kStride);
        if (__builtin_expect(tail, 0)) {
            return _mm256_maskstore_epi32((int*)dst, mask(tail), v);
        }
        memcpy(dst, &v, sizeof(v));
    }
#endif


SI F lerp(F from, F to, F t) {
    return mad(to-from, t, from);
}

SI void from_565(U16 _565, F* r, F* g, F* b) {
    U32 wide = expand(_565);
    *r = cast(wide & C(31<<11)) * C(1.0f / (31<<11));
    *g = cast(wide & C(63<< 5)) * C(1.0f / (63<< 5));
    *b = cast(wide & C(31<< 0)) * C(1.0f / (31<< 0));
}
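// A 565 pixel packs as rrrrrggg gggbbbbb; masking each channel in place and scaling
// by 1.0f over the same shifted maximum maps it to [0,1] without separate shifts.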

// Sometimes we want to work with 4 floats directly, regardless of the depth of the F vector.
#if defined(JUMPER)
    using F4 = float __attribute__((ext_vector_type(4)));
#else
    struct F4 {
        float vals[4];
        float operator[](int i) const { return vals[i]; }
    };
#endif

SI void* load_and_inc(void**& program) {
#if defined(__GNUC__) && defined(__x86_64__)
    // Passing program as the second Stage argument makes it likely that it's in %rsi,
    // so this is usually a single instruction *program++.
    void* rax;
    asm("lodsq" : "=a"(rax), "+S"(program));  // Write-only %rax, read-write %rsi.
    return rax;
    // When a Stage uses its ctx pointer, this optimization typically cuts an instruction:
    //    mov    (%rsi), %rcx     // ctx  = program[0]
    //    ...
    //    mov    0x8(%rsi), %rax  // next = program[1]
    //    add    $0x10, %rsi      // program += 2
    //    jmpq   *%rax            // JUMP!
    // becomes
    //    lods   %ds:(%rsi),%rax  // ctx  = *program++;
    //    ...
    //    lods   %ds:(%rsi),%rax  // next = *program++;
    //    jmpq   *%rax            // JUMP!
    //
    // When a Stage doesn't use its ctx pointer, it's 3 instructions either way,
    // but using lodsq (a 2-byte instruction) tends to trim a few bytes.
#else
    // On ARM *program++ compiles into a single instruction without any handholding.
    return *program++;
#endif
}

// Doesn't do anything unless you resolve it, either by casting to a pointer or calling load().
// This makes it free in stages that have no context pointer to load (i.e. built with nullptr).
struct LazyCtx {
    void*   ptr;
    void**& program;

    explicit LazyCtx(void**& p) : ptr(nullptr), program(p) {}

    template <typename T>
    operator T*() {
        if (!ptr) { ptr = load_and_inc(program); }
        return (T*)ptr;
    }

    template <typename T>
    T load() {
        if (!ptr) { ptr = load_and_inc(program); }
        return unaligned_load<T>(ptr);
    }
};
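// A program is an interlaced array of Stage function pointers and context pointers,
// e.g. (purely illustrative, not a real pipeline):
//    { &WRAP(constant_color), &color, &WRAP(store_8888), &dst_ptr, &WRAP(just_return) }.
// Each stage pulls what it needs via load_and_inc()/LazyCtx and tail-calls the next pointer.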

#if defined(JUMPER) && defined(__AVX__)
    // There's a big cost to switch between SSE and AVX+, so we do a little
    // extra work to handle even the jagged <kStride tail in AVX+ mode.
    using Stage = void(size_t x, void** program, K* k, size_t tail, F,F,F,F, F,F,F,F);

    #if defined(JUMPER) && defined(WIN)
    __attribute__((ms_abi))
    #endif
    extern "C" size_t WRAP(start_pipeline)(size_t x, void** program, K* k, size_t limit) {
        F v{};
        auto start = (Stage*)load_and_inc(program);
        while (x + kStride <= limit) {
            start(x,program,k,0,    v,v,v,v, v,v,v,v);
            x += kStride;
        }
        if (size_t tail = limit - x) {
            start(x,program,k,tail, v,v,v,v, v,v,v,v);
        }
        return limit;
    }

    #define STAGE(name)                                                          \
        SI void name##_k(size_t x, LazyCtx ctx, K* k, size_t tail,               \
                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);    \
        extern "C" void WRAP(name)(size_t x, void** program, K* k, size_t tail,  \
                                   F r, F g, F b, F a, F dr, F dg, F db, F da) { \
            LazyCtx ctx(program);                                                \
            name##_k(x,ctx,k,tail, r,g,b,a, dr,dg,db,da);                        \
            auto next = (Stage*)load_and_inc(program);                           \
            next(x,program,k,tail, r,g,b,a, dr,dg,db,da);                        \
        }                                                                        \
        SI void name##_k(size_t x, LazyCtx ctx, K* k, size_t tail,               \
                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)

#else
    // Other instruction sets (SSE, NEON, portable) can fall back on narrower
    // pipelines cheaply, which frees us to always assume tail==0.

    // Stages tail call between each other by following program,
    // an interlaced sequence of Stage pointers and context pointers.
    using Stage = void(size_t x, void** program, K* k, F,F,F,F, F,F,F,F);

    #if defined(JUMPER) && defined(WIN)
    __attribute__((ms_abi))
    #endif
    extern "C" size_t WRAP(start_pipeline)(size_t x, void** program, K* k, size_t limit) {
        F v{};
        auto start = (Stage*)load_and_inc(program);
        while (x + kStride <= limit) {
            start(x,program,k, v,v,v,v, v,v,v,v);
            x += kStride;
        }
        return x;
    }

    #define STAGE(name)                                                          \
        SI void name##_k(size_t x, LazyCtx ctx, K* k, size_t tail,               \
                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);    \
        extern "C" void WRAP(name)(size_t x, void** program, K* k,               \
                                   F r, F g, F b, F a, F dr, F dg, F db, F da) { \
            LazyCtx ctx(program);                                                \
            name##_k(x,ctx,k,0, r,g,b,a, dr,dg,db,da);                           \
            auto next = (Stage*)load_and_inc(program);                           \
            next(x,program,k, r,g,b,a, dr,dg,db,da);                             \
        }                                                                        \
        SI void name##_k(size_t x, LazyCtx ctx, K* k, size_t tail,               \
                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
#endif

// Ends the chain of tail calls, returning back up to start_pipeline (and from there to the caller).
extern "C" void WRAP(just_return)(size_t, void**, K*, F,F,F,F, F,F,F,F) {}

// We can now define Stages!

// Some things to keep in mind while writing Stages:
//    - do not branch;                                           (i.e. avoid jmp)
//    - do not call functions that don't inline;                 (i.e. avoid call, ret)
//    - do not use constant literals other than 0, ~0 and 0.0f.  (i.e. avoid rip relative addressing)
//
// Some things that should work fine:
//    - 0, ~0, and 0.0f;
//    - arithmetic;
//    - functions of F and U32 that we've defined above;
//    - temporary values;
//    - lambdas;
//    - memcpy() with a compile-time constant size argument.

STAGE(seed_shader) {
    auto y = *(const int*)ctx;

    // It's important for speed to explicitly cast(x) and cast(y),
    // which has the effect of splatting them to vectors before converting to floats.
    // On Intel this breaks a data dependency on previous loop iterations' registers.
    r = cast(x) + 0.5_f + unaligned_load<F>(k->iota);
    g = cast(y) + 0.5_f;
    b = 1.0_f;
    a = 0;
    dr = dg = db = da = 0;
}

STAGE(constant_color) {
    auto rgba = ctx.load<F4>();
    r = rgba[0];
    g = rgba[1];
    b = rgba[2];
    a = rgba[3];
}

STAGE(clear) {
    r = g = b = a = 0;
}

STAGE(plus_) {
    r = r + dr;
    g = g + dg;
    b = b + db;
    a = a + da;
}

STAGE(srcover) {
    auto A = C(1.0f) - a;
    r = mad(dr, A, r);
    g = mad(dg, A, g);
    b = mad(db, A, b);
    a = mad(da, A, a);
}
STAGE(dstover) {
    auto DA = 1.0_f - da;
    r = mad(r, DA, dr);
    g = mad(g, DA, dg);
    b = mad(b, DA, db);
    a = mad(a, DA, da);
}

STAGE(clamp_0) {
    r = max(r, 0);
    g = max(g, 0);
    b = max(b, 0);
    a = max(a, 0);
}

STAGE(clamp_1) {
    r = min(r, 1.0_f);
    g = min(g, 1.0_f);
    b = min(b, 1.0_f);
    a = min(a, 1.0_f);
}

STAGE(clamp_a) {
    a = min(a, 1.0_f);
    r = min(r, a);
    g = min(g, a);
    b = min(b, a);
}

STAGE(set_rgb) {
    auto rgb = (const float*)ctx;
    r = rgb[0];
    g = rgb[1];
    b = rgb[2];
}
STAGE(swap_rb) {
    auto tmp = r;
    r = b;
    b = tmp;
}

STAGE(swap) {
    auto swap = [](F& v, F& dv) {
        auto tmp = v;
        v = dv;
        dv = tmp;
    };
    swap(r, dr);
    swap(g, dg);
    swap(b, db);
    swap(a, da);
}
STAGE(move_src_dst) {
    dr = r;
    dg = g;
    db = b;
    da = a;
}
STAGE(move_dst_src) {
    r = dr;
    g = dg;
    b = db;
    a = da;
}

STAGE(premul) {
    r = r * a;
    g = g * a;
    b = b * a;
}
STAGE(unpremul) {
    auto scale = if_then_else(a == 0, 0, 1.0_f / a);
    r = r * scale;
    g = g * scale;
    b = b * scale;
}

STAGE(from_srgb) {
    auto fn = [&](F s) {
        auto lo = s * C(1/12.92f);
        auto hi = mad(s*s, mad(s, 0.3000_f, 0.6975_f), 0.0025_f);
        return if_then_else(s < 0.055_f, lo, hi);
    };
    r = fn(r);
    g = fn(g);
    b = fn(b);
}
STAGE(to_srgb) {
    auto fn = [&](F l) {
        F sqrt = rcp  (rsqrt(l)),
          ftrt = rsqrt(rsqrt(l));
        auto lo = l * 12.46_f;
        auto hi = min(1.0_f, mad(0.411192_f, ftrt,
                             mad(0.689206_f, sqrt, -0.0988_f)));
        return if_then_else(l < 0.0043_f, lo, hi);
    };
    r = fn(r);
    g = fn(g);
    b = fn(b);
}
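// Both fn()s above are fitted approximations of the sRGB transfer curve (a linear toe
// spliced onto a roughly 2.4-power segment); to_srgb builds l^(1/2) and l^(1/4) from
// rcp()/rsqrt() to approximate the 1/2.4 power without a real pow().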

STAGE(scale_1_float) {
    auto c = *(const float*)ctx;

    r = r * c;
    g = g * c;
    b = b * c;
    a = a * c;
}
STAGE(scale_u8) {
    auto ptr = *(const uint8_t**)ctx + x;

    auto scales = load<U8>(ptr, tail);
    auto c = cast(expand(scales)) * C(1/255.0f);

    r = r * c;
    g = g * c;
    b = b * c;
    a = a * c;
}

STAGE(lerp_1_float) {
    auto c = *(const float*)ctx;

    r = lerp(dr, r, c);
    g = lerp(dg, g, c);
    b = lerp(db, b, c);
    a = lerp(da, a, c);
}
STAGE(lerp_u8) {
    auto ptr = *(const uint8_t**)ctx + x;

    auto scales = load<U8>(ptr, tail);
    auto c = cast(expand(scales)) * C(1/255.0f);

    r = lerp(dr, r, c);
    g = lerp(dg, g, c);
    b = lerp(db, b, c);
    a = lerp(da, a, c);
}
STAGE(lerp_565) {
    auto ptr = *(const uint16_t**)ctx + x;

    F cr,cg,cb;
    from_565(load<U16>(ptr, tail), &cr, &cg, &cb);

    r = lerp(dr, r, cr);
    g = lerp(dg, g, cg);
    b = lerp(db, b, cb);
    a = 1.0_f;
}

STAGE(load_tables) {
    struct Ctx {
        const uint32_t* src;
        const float *r, *g, *b;
    };
    auto c = (const Ctx*)ctx;

    auto px = load<U32>(c->src + x, tail);
    r = gather(c->r, (px      ) & 0xff_i);
    g = gather(c->g, (px >>  8) & 0xff_i);
    b = gather(c->b, (px >> 16) & 0xff_i);
    a = cast(        (px >> 24)) * C(1/255.0f);
}

STAGE(load_a8) {
    auto ptr = *(const uint8_t**)ctx + x;

    r = g = b = 0.0f;
    a = cast(expand(load<U8>(ptr, tail))) * C(1/255.0f);
}
STAGE(store_a8) {
    auto ptr = *(uint8_t**)ctx + x;

    U8 packed = pack(pack(round(a, 255.0_f)));
    store(ptr, packed, tail);
}

STAGE(load_565) {
    auto ptr = *(const uint16_t**)ctx + x;

    from_565(load<U16>(ptr, tail), &r,&g,&b);
    a = 1.0_f;
}
STAGE(store_565) {
    auto ptr = *(uint16_t**)ctx + x;

    U16 px = pack( round(r, 31.0_f) << 11
                 | round(g, 63.0_f) <<  5
                 | round(b, 31.0_f)       );
    store(ptr, px, tail);
}

STAGE(load_8888) {
    auto ptr = *(const uint32_t**)ctx + x;

    auto px = load<U32>(ptr, tail);
    r = cast((px      ) & 0xff_i) * C(1/255.0f);
    g = cast((px >>  8) & 0xff_i) * C(1/255.0f);
    b = cast((px >> 16) & 0xff_i) * C(1/255.0f);
    a = cast((px >> 24)         ) * C(1/255.0f);
}

STAGE(store_8888) {
    auto ptr = *(uint32_t**)ctx + x;

    U32 px = round(r, 255.0_f)
           | round(g, 255.0_f) <<  8
           | round(b, 255.0_f) << 16
           | round(a, 255.0_f) << 24;
    store(ptr, px, tail);
}

STAGE(load_f16) {
    auto ptr = *(const uint64_t**)ctx + x;

#if !defined(JUMPER)
    auto half_to_float = [&](int16_t h) {
        if (h < 0x0400) { h = 0; }            // Flush denorm and negative to zero.
        return bit_cast<F>(h << 13)           // Line up the mantissa,
             * bit_cast<F>(U32(0x77800000));  // then fix up the exponent.
    };
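    // e.g. the half 1.0 is 0x3c00; shifted up 13 bits that's 0x07800000, which as a
    // float is 2^-112, and multiplying by bit_cast<F>(U32(0x77800000)) == 2^112
    // rescales the biased exponent so the result is exactly 1.0f.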
    auto rgba = (const int16_t*)ptr;
    r = half_to_float(rgba[0]);
    g = half_to_float(rgba[1]);
    b = half_to_float(rgba[2]);
    a = half_to_float(rgba[3]);
#elif defined(__aarch64__)
    auto halfs = vld4_f16((const float16_t*)ptr);
    r = vcvt_f32_f16(halfs.val[0]);
    g = vcvt_f32_f16(halfs.val[1]);
    b = vcvt_f32_f16(halfs.val[2]);
    a = vcvt_f32_f16(halfs.val[3]);
#elif defined(__arm__)
    auto rb_ga = vld2_f16((const float16_t*)ptr);
    auto rb = vcvt_f32_f16(rb_ga.val[0]),
         ga = vcvt_f32_f16(rb_ga.val[1]);
    r = {rb[0], rb[2]};
    g = {ga[0], ga[2]};
    b = {rb[1], rb[3]};
    a = {ga[1], ga[3]};
#elif defined(__AVX2__) && defined(__FMA__) && defined(__F16C__)
    __m128i _01, _23, _45, _67;
    if (__builtin_expect(tail,0)) {
        auto src = (const double*)ptr;
        _01 = _23 = _45 = _67 = _mm_setzero_si128();
        if (tail > 0) { _01 = _mm_loadl_pd(_01, src+0); }
        if (tail > 1) { _01 = _mm_loadh_pd(_01, src+1); }
        if (tail > 2) { _23 = _mm_loadl_pd(_23, src+2); }
        if (tail > 3) { _23 = _mm_loadh_pd(_23, src+3); }
        if (tail > 4) { _45 = _mm_loadl_pd(_45, src+4); }
        if (tail > 5) { _45 = _mm_loadh_pd(_45, src+5); }
        if (tail > 6) { _67 = _mm_loadl_pd(_67, src+6); }
    } else {
        _01 = _mm_loadu_si128(((__m128i*)ptr) + 0);
        _23 = _mm_loadu_si128(((__m128i*)ptr) + 1);
        _45 = _mm_loadu_si128(((__m128i*)ptr) + 2);
        _67 = _mm_loadu_si128(((__m128i*)ptr) + 3);
    }

    auto _02 = _mm_unpacklo_epi16(_01, _23),  // r0 r2 g0 g2 b0 b2 a0 a2
         _13 = _mm_unpackhi_epi16(_01, _23),  // r1 r3 g1 g3 b1 b3 a1 a3
         _46 = _mm_unpacklo_epi16(_45, _67),
         _57 = _mm_unpackhi_epi16(_45, _67);

    auto rg0123 = _mm_unpacklo_epi16(_02, _13),  // r0 r1 r2 r3 g0 g1 g2 g3
         ba0123 = _mm_unpackhi_epi16(_02, _13),  // b0 b1 b2 b3 a0 a1 a2 a3
         rg4567 = _mm_unpacklo_epi16(_46, _57),
         ba4567 = _mm_unpackhi_epi16(_46, _57);

    r = _mm256_cvtph_ps(_mm_unpacklo_epi64(rg0123, rg4567));
    g = _mm256_cvtph_ps(_mm_unpackhi_epi64(rg0123, rg4567));
    b = _mm256_cvtph_ps(_mm_unpacklo_epi64(ba0123, ba4567));
    a = _mm256_cvtph_ps(_mm_unpackhi_epi64(ba0123, ba4567));
#elif defined(__AVX__)
    __m128i _01, _23, _45, _67;
    if (__builtin_expect(tail,0)) {
        auto src = (const double*)ptr;
        _01 = _23 = _45 = _67 = _mm_setzero_si128();
        if (tail > 0) { _01 = _mm_loadl_pd(_01, src+0); }
        if (tail > 1) { _01 = _mm_loadh_pd(_01, src+1); }
        if (tail > 2) { _23 = _mm_loadl_pd(_23, src+2); }
        if (tail > 3) { _23 = _mm_loadh_pd(_23, src+3); }
        if (tail > 4) { _45 = _mm_loadl_pd(_45, src+4); }
        if (tail > 5) { _45 = _mm_loadh_pd(_45, src+5); }
        if (tail > 6) { _67 = _mm_loadl_pd(_67, src+6); }
    } else {
        _01 = _mm_loadu_si128(((__m128i*)ptr) + 0);
        _23 = _mm_loadu_si128(((__m128i*)ptr) + 1);
        _45 = _mm_loadu_si128(((__m128i*)ptr) + 2);
        _67 = _mm_loadu_si128(((__m128i*)ptr) + 3);
    }

    auto _02 = _mm_unpacklo_epi16(_01, _23),  // r0 r2 g0 g2 b0 b2 a0 a2
         _13 = _mm_unpackhi_epi16(_01, _23),  // r1 r3 g1 g3 b1 b3 a1 a3
         _46 = _mm_unpacklo_epi16(_45, _67),
         _57 = _mm_unpackhi_epi16(_45, _67);

    auto rg0123 = _mm_unpacklo_epi16(_02, _13),  // r0 r1 r2 r3 g0 g1 g2 g3
         ba0123 = _mm_unpackhi_epi16(_02, _13),  // b0 b1 b2 b3 a0 a1 a2 a3
         rg4567 = _mm_unpacklo_epi16(_46, _57),
         ba4567 = _mm_unpackhi_epi16(_46, _57);

    // half_to_float() slows down ~10x for denorm inputs, so we flush them to zero.
    // With a signed comparison this conveniently also flushes negative half floats to zero.
    auto ftz = [](__m128i v) {
        return _mm_andnot_si128(_mm_cmplt_epi16(v, _mm_set1_epi32(0x04000400_i)), v);
    };
    rg0123 = ftz(rg0123);
    ba0123 = ftz(ba0123);
    rg4567 = ftz(rg4567);
    ba4567 = ftz(ba4567);

    U32 R = _mm256_setr_m128i(_mm_unpacklo_epi16(rg0123, _mm_setzero_si128()),
                              _mm_unpacklo_epi16(rg4567, _mm_setzero_si128())),
        G = _mm256_setr_m128i(_mm_unpackhi_epi16(rg0123, _mm_setzero_si128()),
                              _mm_unpackhi_epi16(rg4567, _mm_setzero_si128())),
        B = _mm256_setr_m128i(_mm_unpacklo_epi16(ba0123, _mm_setzero_si128()),
                              _mm_unpacklo_epi16(ba4567, _mm_setzero_si128())),
        A = _mm256_setr_m128i(_mm_unpackhi_epi16(ba0123, _mm_setzero_si128()),
                              _mm_unpackhi_epi16(ba4567, _mm_setzero_si128()));

    auto half_to_float = [&](U32 h) {
        return bit_cast<F>(h << 13)             // Line up the mantissa,
             * bit_cast<F>(U32(0x77800000_i));  // then fix up the exponent.
    };

    r = half_to_float(R);
    g = half_to_float(G);
    b = half_to_float(B);
    a = half_to_float(A);

#elif defined(__SSE2__)
    auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0),
         _23 = _mm_loadu_si128(((__m128i*)ptr) + 1);

    auto _02 = _mm_unpacklo_epi16(_01, _23),  // r0 r2 g0 g2 b0 b2 a0 a2
         _13 = _mm_unpackhi_epi16(_01, _23);  // r1 r3 g1 g3 b1 b3 a1 a3

    auto rg = _mm_unpacklo_epi16(_02, _13),  // r0 r1 r2 r3 g0 g1 g2 g3
         ba = _mm_unpackhi_epi16(_02, _13);  // b0 b1 b2 b3 a0 a1 a2 a3

    // Same deal as AVX, flush denorms and negatives to zero.
    auto ftz = [](__m128i v) {
        return _mm_andnot_si128(_mm_cmplt_epi16(v, _mm_set1_epi32(0x04000400_i)), v);
    };
    rg = ftz(rg);
    ba = ftz(ba);

    auto half_to_float = [&](U32 h) {
        return bit_cast<F>(h << 13)             // Line up the mantissa,
             * bit_cast<F>(U32(0x77800000_i));  // then fix up the exponent.
    };

    r = half_to_float(_mm_unpacklo_epi16(rg, _mm_setzero_si128()));
    g = half_to_float(_mm_unpackhi_epi16(rg, _mm_setzero_si128()));
    b = half_to_float(_mm_unpacklo_epi16(ba, _mm_setzero_si128()));
    a = half_to_float(_mm_unpackhi_epi16(ba, _mm_setzero_si128()));
#endif
}

STAGE(store_f16) {
    auto ptr = *(uint64_t**)ctx + x;

#if !defined(JUMPER)
    auto float_to_half = [&](F f) {
        return bit_cast<U32>(f * bit_cast<F>(U32(0x07800000_i)))  // Fix up the exponent,
            >> 13;                                                 // then line up the mantissa.
    };
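    // This is load_f16's trick in reverse: multiplying by 2^-112 squeezes the float's
    // exponent into half's 5-bit range, then the mantissa shifts down 13 bits into place.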
    auto rgba = (int16_t*)ptr;
    rgba[0] = float_to_half(r);
    rgba[1] = float_to_half(g);
    rgba[2] = float_to_half(b);
    rgba[3] = float_to_half(a);
#elif defined(__aarch64__)
    float16x4x4_t halfs = {{
        vcvt_f16_f32(r),
        vcvt_f16_f32(g),
        vcvt_f16_f32(b),
        vcvt_f16_f32(a),
    }};
    vst4_f16((float16_t*)ptr, halfs);
#elif defined(__arm__)
    float16x4x2_t rb_ga = {{
        vcvt_f16_f32(float32x4_t{r[0], b[0], r[1], b[1]}),
        vcvt_f16_f32(float32x4_t{g[0], a[0], g[1], a[1]}),
    }};
    vst2_f16((float16_t*)ptr, rb_ga);
#elif defined(__AVX2__) && defined(__FMA__) && defined(__F16C__)
    auto R = _mm256_cvtps_ph(r, _MM_FROUND_CUR_DIRECTION),
         G = _mm256_cvtps_ph(g, _MM_FROUND_CUR_DIRECTION),
         B = _mm256_cvtps_ph(b, _MM_FROUND_CUR_DIRECTION),
         A = _mm256_cvtps_ph(a, _MM_FROUND_CUR_DIRECTION);

    auto rg0123 = _mm_unpacklo_epi16(R, G),  // r0 g0 r1 g1 r2 g2 r3 g3
         rg4567 = _mm_unpackhi_epi16(R, G),  // r4 g4 r5 g5 r6 g6 r7 g7
         ba0123 = _mm_unpacklo_epi16(B, A),
         ba4567 = _mm_unpackhi_epi16(B, A);

    auto _01 = _mm_unpacklo_epi32(rg0123, ba0123),
         _23 = _mm_unpackhi_epi32(rg0123, ba0123),
         _45 = _mm_unpacklo_epi32(rg4567, ba4567),
         _67 = _mm_unpackhi_epi32(rg4567, ba4567);

    if (__builtin_expect(tail,0)) {
        auto dst = (double*)ptr;
        if (tail > 0) { _mm_storel_pd(dst+0, _01); }
        if (tail > 1) { _mm_storeh_pd(dst+1, _01); }
        if (tail > 2) { _mm_storel_pd(dst+2, _23); }
        if (tail > 3) { _mm_storeh_pd(dst+3, _23); }
        if (tail > 4) { _mm_storel_pd(dst+4, _45); }
        if (tail > 5) { _mm_storeh_pd(dst+5, _45); }
        if (tail > 6) { _mm_storel_pd(dst+6, _67); }
    } else {
        _mm_storeu_si128((__m128i*)ptr + 0, _01);
        _mm_storeu_si128((__m128i*)ptr + 1, _23);
        _mm_storeu_si128((__m128i*)ptr + 2, _45);
        _mm_storeu_si128((__m128i*)ptr + 3, _67);
    }
#elif defined(__AVX__)
    auto float_to_half = [&](F f) {
        return bit_cast<U32>(f * bit_cast<F>(U32(0x07800000_i)))  // Fix up the exponent,
            >> 13;                                                 // then line up the mantissa.
    };
    U32 R = float_to_half(r),
        G = float_to_half(g),
        B = float_to_half(b),
        A = float_to_half(a);
    auto r0123 = _mm256_extractf128_si256(R, 0),
         r4567 = _mm256_extractf128_si256(R, 1),
         g0123 = _mm256_extractf128_si256(G, 0),
         g4567 = _mm256_extractf128_si256(G, 1),
         b0123 = _mm256_extractf128_si256(B, 0),
         b4567 = _mm256_extractf128_si256(B, 1),
         a0123 = _mm256_extractf128_si256(A, 0),
         a4567 = _mm256_extractf128_si256(A, 1);
    auto rg0123 = r0123 | _mm_slli_si128(g0123,2),
         rg4567 = r4567 | _mm_slli_si128(g4567,2),
         ba0123 = b0123 | _mm_slli_si128(a0123,2),
         ba4567 = b4567 | _mm_slli_si128(a4567,2);

    auto _01 = _mm_unpacklo_epi32(rg0123, ba0123),
         _23 = _mm_unpackhi_epi32(rg0123, ba0123),
         _45 = _mm_unpacklo_epi32(rg4567, ba4567),
         _67 = _mm_unpackhi_epi32(rg4567, ba4567);

    if (__builtin_expect(tail,0)) {
        auto dst = (double*)ptr;
        if (tail > 0) { _mm_storel_pd(dst+0, _01); }
        if (tail > 1) { _mm_storeh_pd(dst+1, _01); }
        if (tail > 2) { _mm_storel_pd(dst+2, _23); }
        if (tail > 3) { _mm_storeh_pd(dst+3, _23); }
        if (tail > 4) { _mm_storel_pd(dst+4, _45); }
        if (tail > 5) { _mm_storeh_pd(dst+5, _45); }
        if (tail > 6) { _mm_storel_pd(dst+6, _67); }
    } else {
        _mm_storeu_si128((__m128i*)ptr + 0, _01);
        _mm_storeu_si128((__m128i*)ptr + 1, _23);
        _mm_storeu_si128((__m128i*)ptr + 2, _45);
        _mm_storeu_si128((__m128i*)ptr + 3, _67);
    }
#elif defined(__SSE2__)
    auto float_to_half = [&](F f) {
        return bit_cast<U32>(f * bit_cast<F>(U32(0x07800000_i)))  // Fix up the exponent,
            >> 13;                                                 // then line up the mantissa.
    };
    U32 R = float_to_half(r),
        G = float_to_half(g),
        B = float_to_half(b),
        A = float_to_half(a);
    U32 rg = R | _mm_slli_si128(G,2),
        ba = B | _mm_slli_si128(A,2);
    _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg, ba));
    _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg, ba));
#endif
}

STAGE(store_f32) {
    auto ptr = *(float**)ctx + 4*x;

#if !defined(JUMPER)
    ptr[0] = r;
    ptr[1] = g;
    ptr[2] = b;
    ptr[3] = a;
#elif defined(__aarch64__)
    vst4q_f32(ptr, (float32x4x4_t{{r,g,b,a}}));
#elif defined(__arm__)
    vst4_f32(ptr, (float32x2x4_t{{r,g,b,a}}));
#elif defined(__AVX__)
    F rg0145 = _mm256_unpacklo_ps(r, g),  // r0 g0 r1 g1 | r4 g4 r5 g5
      rg2367 = _mm256_unpackhi_ps(r, g),  // r2 ...      | r6 ...
      ba0145 = _mm256_unpacklo_ps(b, a),  // b0 a0 b1 a1 | b4 a4 b5 a5
      ba2367 = _mm256_unpackhi_ps(b, a);  // b2 ...      | b6 ...

    F _04 = _mm256_unpacklo_pd(rg0145, ba0145),  // r0 g0 b0 a0 | r4 g4 b4 a4
      _15 = _mm256_unpackhi_pd(rg0145, ba0145),  // r1 ...      | r5 ...
      _26 = _mm256_unpacklo_pd(rg2367, ba2367),  // r2 ...      | r6 ...
      _37 = _mm256_unpackhi_pd(rg2367, ba2367);  // r3 ...      | r7 ...

    if (__builtin_expect(tail, 0)) {
        if (tail > 0) { _mm_storeu_ps(ptr+ 0, _mm256_extractf128_ps(_04, 0)); }
        if (tail > 1) { _mm_storeu_ps(ptr+ 4, _mm256_extractf128_ps(_15, 0)); }
        if (tail > 2) { _mm_storeu_ps(ptr+ 8, _mm256_extractf128_ps(_26, 0)); }
        if (tail > 3) { _mm_storeu_ps(ptr+12, _mm256_extractf128_ps(_37, 0)); }
        if (tail > 4) { _mm_storeu_ps(ptr+16, _mm256_extractf128_ps(_04, 1)); }
        if (tail > 5) { _mm_storeu_ps(ptr+20, _mm256_extractf128_ps(_15, 1)); }
        if (tail > 6) { _mm_storeu_ps(ptr+24, _mm256_extractf128_ps(_26, 1)); }
    } else {
        F _01 = _mm256_permute2f128_ps(_04, _15, 32),  // 32 == 0010 0000 == lo, lo
          _23 = _mm256_permute2f128_ps(_26, _37, 32),
          _45 = _mm256_permute2f128_ps(_04, _15, 49),  // 49 == 0011 0001 == hi, hi
          _67 = _mm256_permute2f128_ps(_26, _37, 49);
        _mm256_storeu_ps(ptr+ 0, _01);
        _mm256_storeu_ps(ptr+ 8, _23);
        _mm256_storeu_ps(ptr+16, _45);
        _mm256_storeu_ps(ptr+24, _67);
    }
#elif defined(__SSE2__)
    auto v0 = r, v1 = g, v2 = b, v3 = a;
    _MM_TRANSPOSE4_PS(v0, v1, v2, v3);
    memcpy(ptr+ 0, &v0, sizeof(v0));
    memcpy(ptr+ 4, &v1, sizeof(v1));
    memcpy(ptr+ 8, &v2, sizeof(v2));
    memcpy(ptr+12, &v3, sizeof(v3));
#endif
}

SI F ulp_before(F v) {
    return bit_cast<F>(bit_cast<U32>(v) + U32(0xffffffff));
}
SI F clamp(F v, float limit) {
    v = max(0, v);
    return min(v, ulp_before(limit));
}
SI F repeat(F v, float limit) {
    v = v - floor_(v/limit)*limit;
    return min(v, ulp_before(limit));
}
SI F mirror(F v, float limit) {
    v = abs_( (v-limit) - (limit+limit)*floor_((v-limit)/(limit+limit)) - limit );
    return min(v, ulp_before(limit));
}
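// ulp_before(limit) is the largest float strictly less than limit, so a clamped,
// repeated, or mirrored coordinate always lands in [0, limit) and never indexes one
// pixel past the end.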
STAGE(clamp_x)  { r = clamp (r, *(const float*)ctx); }
STAGE(clamp_y)  { g = clamp (g, *(const float*)ctx); }
STAGE(repeat_x) { r = repeat(r, *(const float*)ctx); }
STAGE(repeat_y) { g = repeat(g, *(const float*)ctx); }
STAGE(mirror_x) { r = mirror(r, *(const float*)ctx); }
STAGE(mirror_y) { g = mirror(g, *(const float*)ctx); }

STAGE(luminance_to_alpha) {
    a = r*0.2126_f + g*0.7152_f + b*0.0722_f;
    r = g = b = 0;
}

STAGE(matrix_2x3) {
    auto m = (const float*)ctx;

    auto R = mad(r,m[0], mad(g,m[2], m[4])),
         G = mad(r,m[1], mad(g,m[3], m[5]));
    r = R;
    g = G;
}
STAGE(matrix_3x4) {
    auto m = (const float*)ctx;

    auto R = mad(r,m[0], mad(g,m[3], mad(b,m[6], m[ 9]))),
         G = mad(r,m[1], mad(g,m[4], mad(b,m[7], m[10]))),
         B = mad(r,m[2], mad(g,m[5], mad(b,m[8], m[11])));
    r = R;
    g = G;
    b = B;
}
STAGE(matrix_4x5) {
    auto m = (const float*)ctx;

    auto R = mad(r,m[0], mad(g,m[4], mad(b,m[ 8], mad(a,m[12], m[16])))),
         G = mad(r,m[1], mad(g,m[5], mad(b,m[ 9], mad(a,m[13], m[17])))),
         B = mad(r,m[2], mad(g,m[6], mad(b,m[10], mad(a,m[14], m[18])))),
         A = mad(r,m[3], mad(g,m[7], mad(b,m[11], mad(a,m[15], m[19]))));
    r = R;
    g = G;
    b = B;
    a = A;
}
STAGE(matrix_perspective) {
    // N.B. Unlike the other matrix_ stages, this matrix is row-major.
    auto m = (const float*)ctx;

    auto R = mad(r,m[0], mad(g,m[1], m[2])),
         G = mad(r,m[3], mad(g,m[4], m[5])),
         Z = mad(r,m[6], mad(g,m[7], m[8]));
    r = R * rcp(Z);
    g = G * rcp(Z);
}

STAGE(linear_gradient_2stops) {
    struct Ctx { F4 c0, dc; };
    auto c = ctx.load<Ctx>();

    auto t = r;
    r = mad(t, c.dc[0], c.c0[0]);
    g = mad(t, c.dc[1], c.c0[1]);
    b = mad(t, c.dc[2], c.c0[2]);
    a = mad(t, c.dc[3], c.c0[3]);
}