1 /*
2 * Copyright 2015 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 #ifndef SkNx_neon_DEFINED
9 #define SkNx_neon_DEFINED
10
11 #include <arm_neon.h>
12
13 namespace { // NOLINT(google-build-namespaces)
14
15 // ARMv8 has vrndm(q)_f32 to floor floats. Here we emulate it:
16 // - roundtrip through integers via truncation
17 // - subtract 1 if that's too big (possible for negative values).
// This restricts the domain of our inputs to a maximum somewhere around 2^31.  Seems plenty big.
// Four-lane floor emulation: truncate through int32, convert back, then
// subtract 1.0f from every lane whose truncated value ended up above the input.
AI static float32x4_t emulate_vrndmq_f32(float32x4_t v) {
    const float32x4_t truncated = vcvtq_f32_s32(vcvtq_s32_f32(v));
    // Per-lane mask, set where truncation produced something greater than v.
    const uint32x4_t overshot = vcgtq_f32(truncated, v);
    // ANDing the mask with the bit pattern of 1.0f gives 1.0f or 0.0f per lane.
    const uint32x4_t oneBits = vreinterpretq_u32_f32(vdupq_n_f32(1));
    const float32x4_t correction = vreinterpretq_f32_u32(vandq_u32(overshot, oneBits));
    return vsubq_f32(truncated, correction);
}
// Two-lane floor emulation: truncate through int32, convert back, then
// subtract 1.0f from every lane whose truncated value ended up above the input.
AI static float32x2_t emulate_vrndm_f32(float32x2_t v) {
    const float32x2_t truncated = vcvt_f32_s32(vcvt_s32_f32(v));
    // Per-lane mask, set where truncation produced something greater than v.
    const uint32x2_t overshot = vcgt_f32(truncated, v);
    // ANDing the mask with the bit pattern of 1.0f gives 1.0f or 0.0f per lane.
    const uint32x2_t oneBits = vreinterpret_u32_f32(vdup_n_f32(1));
    const float32x2_t correction = vreinterpret_f32_u32(vand_u32(overshot, oneBits));
    return vsub_f32(truncated, correction);
}
29
// SkNx specialization for two floats stored in a NEON float32x2_t.
// The comparison operators return the NEON compare result reinterpreted as
// floats; allTrue(), anyTrue() and thenElse() reinterpret it back to uint32.
template <>
class SkNx<2, float> {
public:
    AI SkNx(float32x2_t vec) : fVec(vec) {}

    // Default construction does not initialize fVec.
    AI SkNx() {}
    // Broadcasts val to both lanes.
    AI SkNx(float val) : fVec(vdup_n_f32(val)) {}
    // Lane 0 is a, lane 1 is b.
    AI SkNx(float a, float b) : fVec((float32x2_t) { a, b }) {}

    // Load / store two consecutive floats at ptr.
    AI static SkNx Load(const void* ptr) {
        return vld1_f32(static_cast<const float*>(ptr));
    }
    AI void store(void* ptr) const {
        vst1_f32(static_cast<float*>(ptr), fVec);
    }

    // vld2 load: *x receives val[0] and *y receives val[1] of the result.
    AI static void Load2(const void* ptr, SkNx* x, SkNx* y) {
        const float32x2x2_t loaded = vld2_f32(static_cast<const float*>(ptr));
        *x = loaded.val[0];
        *y = loaded.val[1];
    }

    // vst2 store of a and b to dst.
    AI static void Store2(void* dst, const SkNx& a, const SkNx& b) {
        float32x2x2_t packed;
        packed.val[0] = a.fVec;
        packed.val[1] = b.fVec;
        vst2_f32(static_cast<float*>(dst), packed);
    }

    // vst3 store of a, b and c to dst.
    AI static void Store3(void* dst, const SkNx& a, const SkNx& b, const SkNx& c) {
        float32x2x3_t packed;
        packed.val[0] = a.fVec;
        packed.val[1] = b.fVec;
        packed.val[2] = c.fVec;
        vst3_f32(static_cast<float*>(dst), packed);
    }

    // vst4 store of a, b, c and d to dst.
    AI static void Store4(void* dst, const SkNx& a, const SkNx& b, const SkNx& c, const SkNx& d) {
        float32x2x4_t packed;
        packed.val[0] = a.fVec;
        packed.val[1] = b.fVec;
        packed.val[2] = c.fVec;
        packed.val[3] = d.fVec;
        vst4_f32(static_cast<float*>(dst), packed);
    }

    // Approximate reciprocal: vrecpe estimate refined by a single vrecps step.
    AI SkNx invert() const {
        const float32x2_t estimate = vrecpe_f32(fVec);
        const float32x2_t step = vrecps_f32(estimate, fVec);
        return vmul_f32(step, estimate);
    }

    AI SkNx operator - () const { return vneg_f32(fVec); }

    AI SkNx operator + (const SkNx& o) const { return vadd_f32(fVec, o.fVec); }
    AI SkNx operator - (const SkNx& o) const { return vsub_f32(fVec, o.fVec); }
    AI SkNx operator * (const SkNx& o) const { return vmul_f32(fVec, o.fVec); }
    // Uses vdiv on ARM64; elsewhere multiplies by a reciprocal estimate of o
    // that has been refined with two vrecps steps.
    AI SkNx operator / (const SkNx& o) const {
    #ifdef SK_CPU_ARM64
        return vdiv_f32(fVec, o.fVec);
    #else
        float32x2_t recip = vrecpe_f32(o.fVec);
        recip = vmul_f32(vrecps_f32(recip, o.fVec), recip);
        recip = vmul_f32(vrecps_f32(recip, o.fVec), recip);
        return vmul_f32(fVec, recip);
    #endif
    }

    AI SkNx operator==(const SkNx& o) const {
        const uint32x2_t mask = vceq_f32(fVec, o.fVec);
        return vreinterpret_f32_u32(mask);
    }
    AI SkNx operator <(const SkNx& o) const {
        const uint32x2_t mask = vclt_f32(fVec, o.fVec);
        return vreinterpret_f32_u32(mask);
    }
    AI SkNx operator >(const SkNx& o) const {
        const uint32x2_t mask = vcgt_f32(fVec, o.fVec);
        return vreinterpret_f32_u32(mask);
    }
    AI SkNx operator<=(const SkNx& o) const {
        const uint32x2_t mask = vcle_f32(fVec, o.fVec);
        return vreinterpret_f32_u32(mask);
    }
    AI SkNx operator>=(const SkNx& o) const {
        const uint32x2_t mask = vcge_f32(fVec, o.fVec);
        return vreinterpret_f32_u32(mask);
    }
    // Bitwise NOT of the equality compare result.
    AI SkNx operator!=(const SkNx& o) const {
        const uint32x2_t equal = vceq_f32(fVec, o.fVec);
        return vreinterpret_f32_u32(vmvn_u32(equal));
    }

    AI static SkNx Min(const SkNx& l, const SkNx& r) { return vmin_f32(l.fVec, r.fVec); }
    AI static SkNx Max(const SkNx& l, const SkNx& r) { return vmax_f32(l.fVec, r.fVec); }

    AI SkNx abs() const { return vabs_f32(fVec); }
    // vrndm on ARM64, the integer round-trip emulation otherwise.
    AI SkNx floor() const {
    #ifdef SK_CPU_ARM64
        return vrndm_f32(fVec);
    #else
        return emulate_vrndm_f32(fVec);
    #endif
    }

    // Approximate reciprocal square root: vrsqrte estimate plus one vrsqrts step.
    AI SkNx rsqrt() const {
        const float32x2_t estimate = vrsqrte_f32(fVec);
        const float32x2_t step = vrsqrts_f32(fVec, vmul_f32(estimate, estimate));
        return vmul_f32(step, estimate);
    }

    // vsqrt on ARM64; elsewhere fVec times a reciprocal-sqrt estimate refined twice.
    AI SkNx sqrt() const {
    #ifdef SK_CPU_ARM64
        return vsqrt_f32(fVec);
    #else
        float32x2_t rs = vrsqrte_f32(fVec);
        rs = vmul_f32(vrsqrts_f32(fVec, vmul_f32(rs, rs)), rs);
        rs = vmul_f32(vrsqrts_f32(fVec, vmul_f32(rs, rs)), rs);
        return vmul_f32(fVec, rs);
    #endif
    }

    // Reads lane k (asserted to be 0 or 1; the index is also masked with 1).
    AI float operator[](int k) const {
        SkASSERT(0 <= k && k < 2);
        union { float32x2_t vec; float lane[2]; } view = {fVec};
        return view.lane[k&1];
    }

    // True when neither lane's 32-bit pattern is zero.
    AI bool allTrue() const {
    #ifdef SK_CPU_ARM64
        return 0 != vminv_u32(vreinterpret_u32_f32(fVec));
    #else
        const uint32x2_t bits = vreinterpret_u32_f32(fVec);
        return vget_lane_u32(bits,0) && vget_lane_u32(bits,1);
    #endif
    }
    // True when at least one lane's 32-bit pattern is non-zero.
    AI bool anyTrue() const {
    #ifdef SK_CPU_ARM64
        return 0 != vmaxv_u32(vreinterpret_u32_f32(fVec));
    #else
        const uint32x2_t bits = vreinterpret_u32_f32(fVec);
        return vget_lane_u32(bits,0) || vget_lane_u32(bits,1);
    #endif
    }

    // vbsl select between t and e using this vector's bits as the mask.
    AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
        const uint32x2_t mask = vreinterpret_u32_f32(fVec);
        return vbsl_f32(mask, t.fVec, e.fVec);
    }

    float32x2_t fVec;
};
163
// SkNx specialization for four floats stored in a NEON float32x4_t.
// The comparison operators return the NEON compare result reinterpreted as
// floats; allTrue(), anyTrue() and thenElse() reinterpret it back to uint32.
template <>
class SkNx<4, float> {
public:
    AI SkNx(float32x4_t vec) : fVec(vec) {}

    // Default construction does not initialize fVec.
    AI SkNx() {}
    // Broadcasts val to all four lanes.
    AI SkNx(float val) : fVec(vdupq_n_f32(val)) {}
    // Lanes 0..3 are a, b, c, d.
    AI SkNx(float a, float b, float c, float d) : fVec((float32x4_t) { a, b, c, d }) {}

    // Load / store four consecutive floats at ptr.
    AI static SkNx Load(const void* ptr) {
        return vld1q_f32(static_cast<const float*>(ptr));
    }
    AI void store(void* ptr) const {
        vst1q_f32(static_cast<float*>(ptr), fVec);
    }

    // vld2q load: *x receives val[0] and *y receives val[1] of the result.
    AI static void Load2(const void* ptr, SkNx* x, SkNx* y) {
        const float32x4x2_t loaded = vld2q_f32(static_cast<const float*>(ptr));
        *x = loaded.val[0];
        *y = loaded.val[1];
    }

    // vld4q load: r, g, b, a receive val[0..3] of the result, in that order.
    AI static void Load4(const void* ptr, SkNx* r, SkNx* g, SkNx* b, SkNx* a) {
        const float32x4x4_t loaded = vld4q_f32(static_cast<const float*>(ptr));
        *r = loaded.val[0];
        *g = loaded.val[1];
        *b = loaded.val[2];
        *a = loaded.val[3];
    }
    // vst4q store of r, g, b and a to dst.
    AI static void Store4(void* dst, const SkNx& r, const SkNx& g, const SkNx& b, const SkNx& a) {
        float32x4x4_t packed;
        packed.val[0] = r.fVec;
        packed.val[1] = g.fVec;
        packed.val[2] = b.fVec;
        packed.val[3] = a.fVec;
        vst4q_f32(static_cast<float*>(dst), packed);
    }

    // Approximate reciprocal: vrecpe estimate refined by a single vrecps step.
    AI SkNx invert() const {
        const float32x4_t estimate = vrecpeq_f32(fVec);
        const float32x4_t step = vrecpsq_f32(estimate, fVec);
        return vmulq_f32(step, estimate);
    }

    AI SkNx operator - () const { return vnegq_f32(fVec); }

    AI SkNx operator + (const SkNx& o) const { return vaddq_f32(fVec, o.fVec); }
    AI SkNx operator - (const SkNx& o) const { return vsubq_f32(fVec, o.fVec); }
    AI SkNx operator * (const SkNx& o) const { return vmulq_f32(fVec, o.fVec); }
    // Uses vdivq on ARM64; elsewhere multiplies by a reciprocal estimate of o
    // that has been refined with two vrecps steps.
    AI SkNx operator / (const SkNx& o) const {
    #ifdef SK_CPU_ARM64
        return vdivq_f32(fVec, o.fVec);
    #else
        float32x4_t recip = vrecpeq_f32(o.fVec);
        recip = vmulq_f32(vrecpsq_f32(recip, o.fVec), recip);
        recip = vmulq_f32(vrecpsq_f32(recip, o.fVec), recip);
        return vmulq_f32(fVec, recip);
    #endif
    }

    AI SkNx operator==(const SkNx& o) const {
        const uint32x4_t mask = vceqq_f32(fVec, o.fVec);
        return vreinterpretq_f32_u32(mask);
    }
    AI SkNx operator <(const SkNx& o) const {
        const uint32x4_t mask = vcltq_f32(fVec, o.fVec);
        return vreinterpretq_f32_u32(mask);
    }
    AI SkNx operator >(const SkNx& o) const {
        const uint32x4_t mask = vcgtq_f32(fVec, o.fVec);
        return vreinterpretq_f32_u32(mask);
    }
    AI SkNx operator<=(const SkNx& o) const {
        const uint32x4_t mask = vcleq_f32(fVec, o.fVec);
        return vreinterpretq_f32_u32(mask);
    }
    AI SkNx operator>=(const SkNx& o) const {
        const uint32x4_t mask = vcgeq_f32(fVec, o.fVec);
        return vreinterpretq_f32_u32(mask);
    }
    // Bitwise NOT of the equality compare result.
    AI SkNx operator!=(const SkNx& o) const {
        const uint32x4_t equal = vceqq_f32(fVec, o.fVec);
        return vreinterpretq_f32_u32(vmvnq_u32(equal));
    }

    AI static SkNx Min(const SkNx& l, const SkNx& r) { return vminq_f32(l.fVec, r.fVec); }
    AI static SkNx Max(const SkNx& l, const SkNx& r) { return vmaxq_f32(l.fVec, r.fVec); }

    AI SkNx abs() const { return vabsq_f32(fVec); }
    // vrndmq on ARM64, the integer round-trip emulation otherwise.
    AI SkNx floor() const {
    #ifdef SK_CPU_ARM64
        return vrndmq_f32(fVec);
    #else
        return emulate_vrndmq_f32(fVec);
    #endif
    }

    // Approximate reciprocal square root: vrsqrte estimate plus one vrsqrts step.
    AI SkNx rsqrt() const {
        const float32x4_t estimate = vrsqrteq_f32(fVec);
        const float32x4_t step = vrsqrtsq_f32(fVec, vmulq_f32(estimate, estimate));
        return vmulq_f32(step, estimate);
    }

    // vsqrtq on ARM64; elsewhere fVec times a reciprocal-sqrt estimate refined twice.
    AI SkNx sqrt() const {
    #ifdef SK_CPU_ARM64
        return vsqrtq_f32(fVec);
    #else
        float32x4_t rs = vrsqrteq_f32(fVec);
        rs = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(rs, rs)), rs);
        rs = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(rs, rs)), rs);
        return vmulq_f32(fVec, rs);
    #endif
    }

    // Reads lane k (asserted to be in [0,4); the index is also masked with 3).
    AI float operator[](int k) const {
        SkASSERT(0 <= k && k < 4);
        union { float32x4_t vec; float lane[4]; } view = {fVec};
        return view.lane[k&3];
    }

    // Smallest of the four lanes.
    AI float min() const {
    #ifdef SK_CPU_ARM64
        return vminvq_f32(fVec);
    #else
        // Combine each lane with its vrev64q partner, then reduce lanes 0 and 2.
        const SkNx pairwise = Min(*this, vrev64q_f32(fVec));
        return SkTMin(pairwise[0], pairwise[2]);
    #endif
    }

    // Largest of the four lanes.
    AI float max() const {
    #ifdef SK_CPU_ARM64
        return vmaxvq_f32(fVec);
    #else
        // Combine each lane with its vrev64q partner, then reduce lanes 0 and 2.
        const SkNx pairwise = Max(*this, vrev64q_f32(fVec));
        return SkTMax(pairwise[0], pairwise[2]);
    #endif
    }

    // True when no lane's 32-bit pattern is zero.
    AI bool allTrue() const {
    #ifdef SK_CPU_ARM64
        return 0 != vminvq_u32(vreinterpretq_u32_f32(fVec));
    #else
        const uint32x4_t bits = vreinterpretq_u32_f32(fVec);
        return vgetq_lane_u32(bits,0) && vgetq_lane_u32(bits,1)
            && vgetq_lane_u32(bits,2) && vgetq_lane_u32(bits,3);
    #endif
    }
    // True when at least one lane's 32-bit pattern is non-zero.
    AI bool anyTrue() const {
    #ifdef SK_CPU_ARM64
        return 0 != vmaxvq_u32(vreinterpretq_u32_f32(fVec));
    #else
        const uint32x4_t bits = vreinterpretq_u32_f32(fVec);
        return vgetq_lane_u32(bits,0) || vgetq_lane_u32(bits,1)
            || vgetq_lane_u32(bits,2) || vgetq_lane_u32(bits,3);
    #endif
    }

    // vbslq select between t and e using this vector's bits as the mask.
    AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
        const uint32x4_t mask = vreinterpretq_u32_f32(fVec);
        return vbslq_f32(mask, t.fVec, e.fVec);
    }

    float32x4_t fVec;
};
308
#ifdef SK_CPU_ARM64
// ARM64-only multiply-add: passes a as the accumulator and f, m as the
// multiplicands of vfmaq_f32.
AI static Sk4f SkNx_fma(const Sk4f& f, const Sk4f& m, const Sk4f& a) {
    const float32x4_t accumulator = a.fVec;
    return vfmaq_f32(accumulator, f.fVec, m.fVec);
}
#endif
314
315 // It's possible that for our current use cases, representing this as
316 // half a uint16x8_t might be better than representing it as a uint16x4_t.
317 // It'd make conversion to Sk4b one step simpler.
// SkNx specialization for four uint16_t lanes stored in a NEON uint16x4_t.
template <>
class SkNx<4, uint16_t> {
public:
    AI SkNx(const uint16x4_t& vec) : fVec(vec) {}

    // Default construction does not initialize fVec.
    AI SkNx() {}
    // Broadcasts val to all four lanes.
    AI SkNx(uint16_t val) : fVec(vdup_n_u16(val)) {}
    // Lanes 0..3 are a, b, c, d.
    AI SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d)
        : fVec((uint16x4_t) { a,b,c,d }) {}

    // Load / store four consecutive uint16_t values at ptr.
    AI static SkNx Load(const void* ptr) {
        return vld1_u16(static_cast<const uint16_t*>(ptr));
    }
    AI void store(void* ptr) const {
        vst1_u16(static_cast<uint16_t*>(ptr), fVec);
    }

    // vld4 load: r, g, b, a receive val[0..3] of the result, in that order.
    AI static void Load4(const void* ptr, SkNx* r, SkNx* g, SkNx* b, SkNx* a) {
        const uint16x4x4_t loaded = vld4_u16(static_cast<const uint16_t*>(ptr));
        *r = loaded.val[0];
        *g = loaded.val[1];
        *b = loaded.val[2];
        *a = loaded.val[3];
    }
    // vld3 load: r, g, b receive val[0..2] of the result, in that order.
    AI static void Load3(const void* ptr, SkNx* r, SkNx* g, SkNx* b) {
        const uint16x4x3_t loaded = vld3_u16(static_cast<const uint16_t*>(ptr));
        *r = loaded.val[0];
        *g = loaded.val[1];
        *b = loaded.val[2];
    }
    // vst4 store of r, g, b and a to dst.
    AI static void Store4(void* dst, const SkNx& r, const SkNx& g, const SkNx& b, const SkNx& a) {
        uint16x4x4_t packed;
        packed.val[0] = r.fVec;
        packed.val[1] = g.fVec;
        packed.val[2] = b.fVec;
        packed.val[3] = a.fVec;
        vst4_u16(static_cast<uint16_t*>(dst), packed);
    }

    AI SkNx operator + (const SkNx& o) const { return vadd_u16(fVec, o.fVec); }
    AI SkNx operator - (const SkNx& o) const { return vsub_u16(fVec, o.fVec); }
    AI SkNx operator * (const SkNx& o) const { return vmul_u16(fVec, o.fVec); }
    AI SkNx operator & (const SkNx& o) const { return vand_u16(fVec, o.fVec); }
    AI SkNx operator | (const SkNx& o) const { return vorr_u16(fVec, o.fVec); }

    // Shifts apply the compiler's vector shift operators with the count
    // broadcast to every lane.
    AI SkNx operator << (int bits) const {
        const uint16x4_t counts = SkNx(bits).fVec;
        return fVec << counts;
    }
    AI SkNx operator >> (int bits) const {
        const uint16x4_t counts = SkNx(bits).fVec;
        return fVec >> counts;
    }

    AI static SkNx Min(const SkNx& a, const SkNx& b) { return vmin_u16(a.fVec, b.fVec); }

    // Reads lane k (asserted to be in [0,4); the index is also masked with 3).
    AI uint16_t operator[](int k) const {
        SkASSERT(0 <= k && k < 4);
        union { uint16x4_t vec; uint16_t lane[4]; } view = {fVec};
        return view.lane[k&3];
    }

    // vbsl select between t and e using this vector's bits as the mask.
    AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
        return vbsl_u16(fVec, t.fVec, e.fVec);
    }

    uint16x4_t fVec;
};
378
// SkNx specialization for eight uint16_t lanes stored in a NEON uint16x8_t.
template <>
class SkNx<8, uint16_t> {
public:
    AI SkNx(const uint16x8_t& vec) : fVec(vec) {}

    // Default construction does not initialize fVec.
    AI SkNx() {}
    // Broadcasts val to all eight lanes.
    AI SkNx(uint16_t val) : fVec(vdupq_n_u16(val)) {}
    // Lanes 0..7 are a..h.
    AI SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d,
            uint16_t e, uint16_t f, uint16_t g, uint16_t h)
        : fVec((uint16x8_t) { a,b,c,d, e,f,g,h }) {}

    // Load / store eight consecutive uint16_t values at ptr.
    AI static SkNx Load(const void* ptr) {
        return vld1q_u16(static_cast<const uint16_t*>(ptr));
    }
    AI void store(void* ptr) const {
        vst1q_u16(static_cast<uint16_t*>(ptr), fVec);
    }

    AI SkNx operator + (const SkNx& o) const { return vaddq_u16(fVec, o.fVec); }
    AI SkNx operator - (const SkNx& o) const { return vsubq_u16(fVec, o.fVec); }
    AI SkNx operator * (const SkNx& o) const { return vmulq_u16(fVec, o.fVec); }
    AI SkNx operator & (const SkNx& o) const { return vandq_u16(fVec, o.fVec); }
    AI SkNx operator | (const SkNx& o) const { return vorrq_u16(fVec, o.fVec); }

    // Shifts apply the compiler's vector shift operators with the count
    // broadcast to every lane.
    AI SkNx operator << (int bits) const {
        const uint16x8_t counts = SkNx(bits).fVec;
        return fVec << counts;
    }
    AI SkNx operator >> (int bits) const {
        const uint16x8_t counts = SkNx(bits).fVec;
        return fVec >> counts;
    }

    AI static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u16(a.fVec, b.fVec); }

    // Reads lane k (asserted to be in [0,8); the index is also masked with 7).
    AI uint16_t operator[](int k) const {
        SkASSERT(0 <= k && k < 8);
        union { uint16x8_t vec; uint16_t lane[8]; } view = {fVec};
        return view.lane[k&7];
    }

    // Upper 16 bits of each 16x16 -> 32-bit product: widening multiplies on
    // the low and high halves, each narrowed after a right shift by 16.
    AI SkNx mulHi(const SkNx& m) const {
        const uint32x4_t hiProducts = vmull_u16(vget_high_u16(fVec), vget_high_u16(m.fVec));
        const uint32x4_t loProducts = vmull_u16( vget_low_u16(fVec),  vget_low_u16(m.fVec));

        const uint16x4_t loTop = vshrn_n_u32(loProducts,16);
        const uint16x4_t hiTop = vshrn_n_u32(hiProducts,16);
        return SkNx(vcombine_u16(loTop, hiTop));
    }

    // vbslq select between t and e using this vector's bits as the mask.
    AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
        return vbslq_u16(fVec, t.fVec, e.fVec);
    }

    uint16x8_t fVec;
};
425
// SkNx specialization for four uint8_t lanes. Storage is a full uint8x8_t;
// Load, store and operator[] only touch its first four bytes.
template <>
class SkNx<4, uint8_t> {
public:
    // uint32_t with alignment 1, used to move the four bytes as one 32-bit lane.
    typedef uint32_t __attribute__((aligned(1))) unaligned_uint32_t;

    AI SkNx(const uint8x8_t& vec) : fVec(vec) {}

    // Default construction does not initialize fVec.
    AI SkNx() {}
    // Bytes 0..3 are a, b, c, d; bytes 4..7 are zero.
    AI SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d)
        : fVec((uint8x8_t){a,b,c,d, 0,0,0,0}) {}

    // Reads 32 bits at ptr, duplicated into both 32-bit lanes.
    AI static SkNx Load(const void* ptr) {
        const uint32x2_t word = vld1_dup_u32(static_cast<const unaligned_uint32_t*>(ptr));
        return vreinterpret_u8_u32(word);
    }
    // Writes 32-bit lane 0 (the first four bytes) to ptr.
    AI void store(void* ptr) const {
        vst1_lane_u32(static_cast<unaligned_uint32_t*>(ptr), vreinterpret_u32_u8(fVec), 0);
    }
    // Reads byte k (asserted to be in [0,4); the index is also masked with 3).
    AI uint8_t operator[](int k) const {
        SkASSERT(0 <= k && k < 4);
        union { uint8x8_t vec; uint8_t lane[8]; } view = {fVec};
        return view.lane[k&3];
    }

    // TODO as needed

    uint8x8_t fVec;
};
453
// SkNx specialization for eight uint8_t lanes stored in a NEON uint8x8_t.
template <>
class SkNx<8, uint8_t> {
public:
    AI SkNx(const uint8x8_t& vec) : fVec(vec) {}

    // Default construction does not initialize fVec.
    AI SkNx() {}
    // Broadcasts val to all eight lanes.
    AI SkNx(uint8_t val) : fVec(vdup_n_u8(val)) {}
    // Lanes 0..7 are a..h.
    AI SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d,
            uint8_t e, uint8_t f, uint8_t g, uint8_t h)
        : fVec((uint8x8_t) { a,b,c,d, e,f,g,h }) {}

    // Load / store eight consecutive bytes at ptr.
    AI static SkNx Load(const void* ptr) {
        return vld1_u8(static_cast<const uint8_t*>(ptr));
    }
    AI void store(void* ptr) const {
        vst1_u8(static_cast<uint8_t*>(ptr), fVec);
    }

    // Reads lane k (asserted to be in [0,8); the index is also masked with 7).
    AI uint8_t operator[](int k) const {
        SkASSERT(0 <= k && k < 8);
        union { uint8x8_t vec; uint8_t lane[8]; } view = {fVec};
        return view.lane[k&7];
    }

    uint8x8_t fVec;
};
477
// SkNx specialization for sixteen uint8_t lanes stored in a NEON uint8x16_t.
template <>
class SkNx<16, uint8_t> {
public:
    AI SkNx(const uint8x16_t& vec) : fVec(vec) {}

    // Default construction does not initialize fVec.
    AI SkNx() {}
    // Broadcasts val to all sixteen lanes.
    AI SkNx(uint8_t val) : fVec(vdupq_n_u8(val)) {}
    // Lanes 0..15 are a..p.
    AI SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d,
            uint8_t e, uint8_t f, uint8_t g, uint8_t h,
            uint8_t i, uint8_t j, uint8_t k, uint8_t l,
            uint8_t m, uint8_t n, uint8_t o, uint8_t p)
        : fVec((uint8x16_t) { a,b,c,d, e,f,g,h, i,j,k,l, m,n,o,p }) {}

    // Load / store sixteen consecutive bytes at ptr.
    AI static SkNx Load(const void* ptr) {
        return vld1q_u8(static_cast<const uint8_t*>(ptr));
    }
    AI void store(void* ptr) const {
        vst1q_u8(static_cast<uint8_t*>(ptr), fVec);
    }

    // Addition through vqaddq_u8 (the saturating add intrinsic).
    AI SkNx saturatedAdd(const SkNx& o) const { return vqaddq_u8(fVec, o.fVec); }

    AI SkNx operator + (const SkNx& o) const { return vaddq_u8(fVec, o.fVec); }
    AI SkNx operator - (const SkNx& o) const { return vsubq_u8(fVec, o.fVec); }
    AI SkNx operator & (const SkNx& o) const { return vandq_u8(fVec, o.fVec); }

    AI static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u8(a.fVec, b.fVec); }
    // Returns the vcltq_u8 compare result directly.
    AI SkNx operator < (const SkNx& o) const { return vcltq_u8(fVec, o.fVec); }

    // Reads lane k (asserted to be in [0,16); the index is also masked with 15).
    AI uint8_t operator[](int k) const {
        SkASSERT(0 <= k && k < 16);
        union { uint8x16_t vec; uint8_t lane[16]; } view = {fVec};
        return view.lane[k&15];
    }

    // vbslq select between t and e using this vector's bits as the mask.
    AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
        return vbslq_u8(fVec, t.fVec, e.fVec);
    }

    uint8x16_t fVec;
};
516
// SkNx specialization for four int32_t lanes stored in a NEON int32x4_t.
// Comparison operators return the NEON compare result reinterpreted as int32.
template <>
class SkNx<4, int32_t> {
public:
    AI SkNx(const int32x4_t& vec) : fVec(vec) {}

    // Default construction does not initialize fVec.
    AI SkNx() {}
    // Broadcasts v to all four lanes.
    AI SkNx(int32_t v) : fVec(vdupq_n_s32(v)) {}
    // Lanes 0..3 are a, b, c, d.
    AI SkNx(int32_t a, int32_t b, int32_t c, int32_t d)
        : fVec((int32x4_t){a,b,c,d}) {}

    // Load / store four consecutive int32_t values at ptr.
    AI static SkNx Load(const void* ptr) {
        return vld1q_s32(static_cast<const int32_t*>(ptr));
    }
    AI void store(void* ptr) const {
        vst1q_s32(static_cast<int32_t*>(ptr), fVec);
    }
    // Reads lane k (asserted to be in [0,4); the index is also masked with 3).
    AI int32_t operator[](int k) const {
        SkASSERT(0 <= k && k < 4);
        union { int32x4_t vec; int32_t lane[4]; } view = {fVec};
        return view.lane[k&3];
    }

    AI SkNx operator + (const SkNx& o) const { return vaddq_s32(fVec, o.fVec); }
    AI SkNx operator - (const SkNx& o) const { return vsubq_s32(fVec, o.fVec); }
    AI SkNx operator * (const SkNx& o) const { return vmulq_s32(fVec, o.fVec); }

    AI SkNx operator & (const SkNx& o) const { return vandq_s32(fVec, o.fVec); }
    AI SkNx operator | (const SkNx& o) const { return vorrq_s32(fVec, o.fVec); }
    AI SkNx operator ^ (const SkNx& o) const { return veorq_s32(fVec, o.fVec); }

    // Shifts apply the compiler's vector shift operators with the count
    // broadcast to every lane.
    AI SkNx operator << (int bits) const {
        const int32x4_t counts = SkNx(bits).fVec;
        return fVec << counts;
    }
    AI SkNx operator >> (int bits) const {
        const int32x4_t counts = SkNx(bits).fVec;
        return fVec >> counts;
    }

    AI SkNx operator == (const SkNx& o) const {
        const uint32x4_t mask = vceqq_s32(fVec, o.fVec);
        return vreinterpretq_s32_u32(mask);
    }
    AI SkNx operator <  (const SkNx& o) const {
        const uint32x4_t mask = vcltq_s32(fVec, o.fVec);
        return vreinterpretq_s32_u32(mask);
    }
    AI SkNx operator >  (const SkNx& o) const {
        const uint32x4_t mask = vcgtq_s32(fVec, o.fVec);
        return vreinterpretq_s32_u32(mask);
    }

    AI static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_s32(a.fVec, b.fVec); }
    AI static SkNx Max(const SkNx& a, const SkNx& b) { return vmaxq_s32(a.fVec, b.fVec); }
    // TODO as needed

    // vbslq select between t and e using this vector's bits as the mask.
    AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
        const uint32x4_t mask = vreinterpretq_u32_s32(fVec);
        return vbslq_s32(mask, t.fVec, e.fVec);
    }

    AI SkNx abs() const { return vabsq_s32(fVec); }

    int32x4_t fVec;
};
574
// SkNx specialization for four uint32_t lanes stored in a NEON uint32x4_t.
template <>
class SkNx<4, uint32_t> {
public:
    AI SkNx(const uint32x4_t& vec) : fVec(vec) {}

    // Default construction does not initialize fVec.
    AI SkNx() {}
    // Broadcasts v to all four lanes.
    AI SkNx(uint32_t v) : fVec(vdupq_n_u32(v)) {}
    // Lanes 0..3 are a, b, c, d.
    AI SkNx(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
        : fVec((uint32x4_t){a,b,c,d}) {}

    // Load / store four consecutive uint32_t values at ptr.
    AI static SkNx Load(const void* ptr) {
        return vld1q_u32(static_cast<const uint32_t*>(ptr));
    }
    AI void store(void* ptr) const {
        vst1q_u32(static_cast<uint32_t*>(ptr), fVec);
    }
    // Reads lane k (asserted to be in [0,4); the index is also masked with 3).
    AI uint32_t operator[](int k) const {
        SkASSERT(0 <= k && k < 4);
        union { uint32x4_t vec; uint32_t lane[4]; } view = {fVec};
        return view.lane[k&3];
    }

    AI SkNx operator + (const SkNx& o) const { return vaddq_u32(fVec, o.fVec); }
    AI SkNx operator - (const SkNx& o) const { return vsubq_u32(fVec, o.fVec); }
    AI SkNx operator * (const SkNx& o) const { return vmulq_u32(fVec, o.fVec); }

    AI SkNx operator & (const SkNx& o) const { return vandq_u32(fVec, o.fVec); }
    AI SkNx operator | (const SkNx& o) const { return vorrq_u32(fVec, o.fVec); }
    AI SkNx operator ^ (const SkNx& o) const { return veorq_u32(fVec, o.fVec); }

    // Shifts apply the compiler's vector shift operators with the count
    // broadcast to every lane.
    AI SkNx operator << (int bits) const {
        const uint32x4_t counts = SkNx(bits).fVec;
        return fVec << counts;
    }
    AI SkNx operator >> (int bits) const {
        const uint32x4_t counts = SkNx(bits).fVec;
        return fVec >> counts;
    }

    // Comparisons return the NEON compare result directly.
    AI SkNx operator == (const SkNx& o) const { return vceqq_u32(fVec, o.fVec); }
    AI SkNx operator <  (const SkNx& o) const { return vcltq_u32(fVec, o.fVec); }
    AI SkNx operator >  (const SkNx& o) const { return vcgtq_u32(fVec, o.fVec); }

    AI static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u32(a.fVec, b.fVec); }
    // TODO as needed

    // Upper 32 bits of each 32x32 -> 64-bit product: widening multiplies on
    // the low and high halves, each narrowed after a right shift by 32.
    AI SkNx mulHi(const SkNx& m) const {
        const uint64x2_t hiProducts = vmull_u32(vget_high_u32(fVec), vget_high_u32(m.fVec));
        const uint64x2_t loProducts = vmull_u32( vget_low_u32(fVec),  vget_low_u32(m.fVec));

        const uint32x2_t loTop = vshrn_n_u64(loProducts,32);
        const uint32x2_t hiTop = vshrn_n_u64(hiProducts,32);
        return SkNx(vcombine_u32(loTop, hiTop));
    }

    // vbslq select between t and e using this vector's bits as the mask.
    AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
        return vbslq_u32(fVec, t.fVec, e.fVec);
    }

    uint32x4_t fVec;
};
630
// Sk4f -> Sk4i: per-lane float to int32 conversion through vcvtq_s32_f32.
template<> AI /*static*/ Sk4i SkNx_cast<int32_t, float>(const Sk4f& src) {
    const int32x4_t asInts = vcvtq_s32_f32(src.fVec);
    return asInts;
}
// Sk4i -> Sk4f: per-lane int32 to float conversion through vcvtq_f32_s32.
template<> AI /*static*/ Sk4f SkNx_cast<float, int32_t>(const Sk4i& src) {
    const float32x4_t asFloats = vcvtq_f32_s32(src.fVec);
    return asFloats;
}
// Sk4u -> Sk4f: per-lane unsigned 32-bit to float conversion.
// Converts with vcvtq_f32_u32 so that lanes >= 2^31 stay positive; the earlier
// route reloaded the vector as signed int32 and turned those lanes negative.
template<> AI /*static*/ Sk4f SkNx_cast<float, uint32_t>(const Sk4u& src) {
    return vcvtq_f32_u32(src.fVec);
}
641
// Sk4f -> Sk4h: convert to uint32 lanes, then narrow to 16 bits with vqmovn_u32.
template<> AI /*static*/ Sk4h SkNx_cast<uint16_t, float>(const Sk4f& src) {
    const uint32x4_t wide = vcvtq_u32_f32(src.fVec);
    return vqmovn_u32(wide);
}
645
// Sk4h -> Sk4f: widen to uint32 lanes with vmovl_u16, then convert to float.
template<> AI /*static*/ Sk4f SkNx_cast<float, uint16_t>(const Sk4h& src) {
    const uint32x4_t wide = vmovl_u16(src.fVec);
    return vcvtq_f32_u32(wide);
}
649
// Sk4f -> Sk4b: float -> uint32 -> uint16 -> uint8, narrowing with vqmovn at
// each step. The four 16-bit lanes are duplicated to fill the 8-lane narrow.
template<> AI /*static*/ Sk4b SkNx_cast<uint8_t, float>(const Sk4f& src) {
    const uint16x4_t halves = vqmovn_u32(vcvtq_u32_f32(src.fVec));
    return vqmovn_u16(vcombine_u16(halves, halves));
}
655
// Sk4b -> Sk4u: widen all eight bytes to 16 bits, then widen the low four
// of those to 32 bits.
template<> AI /*static*/ Sk4u SkNx_cast<uint32_t, uint8_t>(const Sk4b& src) {
    const uint16x4_t lowHalves = vget_low_u16(vmovl_u8(src.fVec));
    return vmovl_u16(lowHalves);
}
660
// Sk4b -> Sk4i: reuse the uint32 widening cast and reinterpret its lanes as int32.
template<> AI /*static*/ Sk4i SkNx_cast<int32_t, uint8_t>(const Sk4b& src) {
    const Sk4u widened = SkNx_cast<uint32_t>(src);
    return vreinterpretq_s32_u32(widened.fVec);
}
664
// Sk4b -> Sk4f: widen to int32 lanes via the int32 cast, then convert to float.
template<> AI /*static*/ Sk4f SkNx_cast<float, uint8_t>(const Sk4b& src) {
    const Sk4i widened = SkNx_cast<int32_t>(src);
    return vcvtq_f32_s32(widened.fVec);
}
668
// Sk16f -> Sk16b: split into four Sk4f quarters, convert each to uint32
// lanes, then apply vuzpq_u8(...).val[0] twice to gather the output bytes.
template<> AI /*static*/ Sk16b SkNx_cast<uint8_t, float>(const Sk16f& src) {
    Sk8f lower, upper;
    SkNx_split(src, &lower, &upper);

    Sk4f a,b,c,d;
    SkNx_split(lower, &a, &b);
    SkNx_split(upper, &c, &d);

    // Each quarter as uint32 lanes, viewed as sixteen bytes.
    const uint8x16_t aBytes = vreinterpretq_u8_u32(vcvtq_u32_f32(a.fVec));
    const uint8x16_t bBytes = vreinterpretq_u8_u32(vcvtq_u32_f32(b.fVec));
    const uint8x16_t cBytes = vreinterpretq_u8_u32(vcvtq_u32_f32(c.fVec));
    const uint8x16_t dBytes = vreinterpretq_u8_u32(vcvtq_u32_f32(d.fVec));

    const uint8x16_t abEven = vuzpq_u8(aBytes, bBytes).val[0];
    const uint8x16_t cdEven = vuzpq_u8(cBytes, dBytes).val[0];
    return vuzpq_u8(abEven, cdEven).val[0];
}
681
// Sk8i -> Sk8b: split into two Sk4i halves, narrow each to uint16 with
// vqmovun_s32, then narrow the combined eight lanes to uint8 with vqmovn_u16.
template<> AI /*static*/ Sk8b SkNx_cast<uint8_t, int32_t>(const Sk8i& src) {
    Sk4i lower, upper;
    SkNx_split(src, &lower, &upper);

    const uint16x4_t lower16 = vqmovun_s32(lower.fVec);
    const uint16x4_t upper16 = vqmovun_s32(upper.fVec);
    const uint16x8_t all16 = vcombine_u16(lower16, upper16);
    return vqmovn_u16(all16);
}
690
// Sk4b -> Sk4h: widen all eight bytes to 16 bits and keep the low four lanes.
template<> AI /*static*/ Sk4h SkNx_cast<uint16_t, uint8_t>(const Sk4b& src) {
    const uint16x8_t widened = vmovl_u8(src.fVec);
    return vget_low_u16(widened);
}
694
// Sk8b -> Sk8h: widen each byte to 16 bits with vmovl_u8.
template<> AI /*static*/ Sk8h SkNx_cast<uint16_t, uint8_t>(const Sk8b& src) {
    const uint16x8_t widened = vmovl_u8(src.fVec);
    return widened;
}
698
// Sk4h -> Sk4b: duplicate the four lanes into an 8-lane vector and narrow
// with vmovn_u16.
template<> AI /*static*/ Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src) {
    const uint16x8_t doubled = vcombine_u16(src.fVec, src.fVec);
    return vmovn_u16(doubled);
}
702
// Sk8h -> Sk8b: narrow each 16-bit lane to a byte with vqmovn_u16.
template<> AI /*static*/ Sk8b SkNx_cast<uint8_t, uint16_t>(const Sk8h& src) {
    const uint8x8_t narrowed = vqmovn_u16(src.fVec);
    return narrowed;
}
706
// Sk4i -> Sk4b: narrow to uint16 with vqmovun_s32, duplicate the four lanes
// into an 8-lane vector, then narrow to uint8 with vqmovn_u16.
template<> AI /*static*/ Sk4b SkNx_cast<uint8_t, int32_t>(const Sk4i& src) {
    const uint16x4_t halves = vqmovun_s32(src.fVec);
    const uint16x8_t doubled = vcombine_u16(halves, halves);
    return vqmovn_u16(doubled);
}
711
// Sk4u -> Sk4b: narrow to uint16 with vqmovn_u32, duplicate the four lanes
// into an 8-lane vector, then narrow to uint8 with vqmovn_u16.
template<> AI /*static*/ Sk4b SkNx_cast<uint8_t, uint32_t>(const Sk4u& src) {
    const uint16x4_t halves = vqmovn_u32(src.fVec);
    const uint16x8_t doubled = vcombine_u16(halves, halves);
    return vqmovn_u16(doubled);
}
716
// Sk4h -> Sk4i: widen to uint32 lanes with vmovl_u16 and reinterpret as int32.
template<> AI /*static*/ Sk4i SkNx_cast<int32_t, uint16_t>(const Sk4h& src) {
    const uint32x4_t widened = vmovl_u16(src.fVec);
    return vreinterpretq_s32_u32(widened);
}
720
// Sk4i -> Sk4h: reinterpret the lanes as uint32 and narrow with vmovn_u32.
template<> AI /*static*/ Sk4h SkNx_cast<uint16_t, int32_t>(const Sk4i& src) {
    const uint32x4_t asUnsigned = vreinterpretq_u32_s32(src.fVec);
    return vmovn_u32(asUnsigned);
}
724
// Sk4u -> Sk4i: reinterprets the lanes as int32 without changing any bits.
template<> AI /*static*/ Sk4i SkNx_cast<int32_t, uint32_t>(const Sk4u& src) {
    const int32x4_t asSigned = vreinterpretq_s32_u32(src.fVec);
    return asSigned;
}
728
// Rounds each lane of x to the nearest int32, with ties going away from zero.
//
// vcvtq_s32_f32 truncates toward zero, so the bias added beforehand must
// carry the sign of the lane. A fixed +0.5 bias mis-rounds negative inputs
// (e.g. -1.6 + 0.5 = -1.1, which truncates to -1 instead of -2). Lanes that
// are >= 0 still get +0.5, so their results are the same as with a fixed bias.
AI static Sk4i Sk4f_round(const Sk4f& x) {
    // Take the sign bit from x and every other bit from 0.5f.
    const uint32x4_t signBit = vdupq_n_u32(0x80000000u);
    const float32x4_t bias = vbslq_f32(signBit, x.fVec, vdupq_n_f32(0.5f));
    return vcvtq_s32_f32(vaddq_f32(x.fVec, bias));
}
732
733 } // namespace
734
735 #endif//SkNx_neon_DEFINED
736