1 /*
2  * Copyright 2019 Google LLC
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #ifndef SkVM_DEFINED
9 #define SkVM_DEFINED
10 
11 #include "include/core/SkBlendMode.h"
12 #include "include/core/SkColor.h"
13 #include "include/core/SkSpan.h"
14 #include "include/private/SkMacros.h"
15 #include "include/private/SkTArray.h"
16 #include "include/private/SkTHash.h"
17 #include "src/core/SkVM_fwd.h"
18 #include <vector>      // std::vector
19 
20 class SkWStream;
21 
22 #if defined(SKVM_JIT_WHEN_POSSIBLE) && !defined(SK_BUILD_FOR_IOS)
23     #if defined(__x86_64__) || defined(_M_X64)
24         #if defined(_WIN32) || defined(__linux) || defined(__APPLE__)
25             #define SKVM_JIT
26         #endif
27     #endif
28     #if defined(__aarch64__)
29         #if defined(__ANDROID__) || defined(__APPLE__)
30             #define SKVM_JIT
31         #endif
32     #endif
33 #endif
34 
35 #if 0
36     #define SKVM_LLVM
37 #endif
38 
39 #if 0
40     #undef SKVM_JIT
41 #endif
42 
43 namespace skvm {
44 
45     class Assembler {
46     public:
47         explicit Assembler(void* buf);
48 
49         size_t size() const;
50 
51         // Order matters... GP64, Xmm, Ymm values match 4-bit register encoding for each.
52         enum GP64 {
53             rax, rcx, rdx, rbx, rsp, rbp, rsi, rdi,
54             r8 , r9 , r10, r11, r12, r13, r14, r15,
55         };
56         enum Xmm {
57             xmm0, xmm1, xmm2 , xmm3 , xmm4 , xmm5 , xmm6 , xmm7 ,
58             xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
59         };
60         enum Ymm {
61             ymm0, ymm1, ymm2 , ymm3 , ymm4 , ymm5 , ymm6 , ymm7 ,
62             ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15,
63         };
64 
65         // X and V values match 5-bit encoding for each (nothing tricky).
66         enum X {
67             x0 , x1 , x2 , x3 , x4 , x5 , x6 , x7 ,
68             x8 , x9 , x10, x11, x12, x13, x14, x15,
69             x16, x17, x18, x19, x20, x21, x22, x23,
70             x24, x25, x26, x27, x28, x29, x30, xzr, sp=xzr,
71         };
72         enum V {
73             v0 , v1 , v2 , v3 , v4 , v5 , v6 , v7 ,
74             v8 , v9 , v10, v11, v12, v13, v14, v15,
75             v16, v17, v18, v19, v20, v21, v22, v23,
76             v24, v25, v26, v27, v28, v29, v30, v31,
77         };
78 
79         void bytes(const void*, int);
80         void byte(uint8_t);
81         void word(uint32_t);
82 
83         struct Label {
84             int                                      offset = 0;
85             enum { NotYetSet, ARMDisp19, X86Disp32 } kind = NotYetSet;
86             SkSTArray<2, int>                        references;
87         };
88 
89         // x86-64
90 
91         void align(int mod);
92 
93         void int3();
94         void vzeroupper();
95         void ret();
96 
97         // Mem represents a value at base + disp + scale*index,
98         // or simply at base + disp if index=rsp.
99         enum Scale { ONE, TWO, FOUR, EIGHT };
100         struct Mem {
101             GP64  base;
102             int   disp  = 0;
103             GP64  index = rsp;
104             Scale scale = ONE;
105         };
106 
107         struct Operand {
108             union {
109                 int    reg;
110                 Mem    mem;
111                 Label* label;
112             };
113             enum { REG, MEM, LABEL } kind;
114 
OperandOperand115             Operand(GP64   r) : reg  (r), kind(REG  ) {}
OperandOperand116             Operand(Xmm    r) : reg  (r), kind(REG  ) {}
OperandOperand117             Operand(Ymm    r) : reg  (r), kind(REG  ) {}
OperandOperand118             Operand(Mem    m) : mem  (m), kind(MEM  ) {}
OperandOperand119             Operand(Label* l) : label(l), kind(LABEL) {}
120         };
121 
122         void vpand (Ymm dst, Ymm x, Operand y);
123         void vpandn(Ymm dst, Ymm x, Operand y);
124         void vpor  (Ymm dst, Ymm x, Operand y);
125         void vpxor (Ymm dst, Ymm x, Operand y);
126 
127         void vpaddd (Ymm dst, Ymm x, Operand y);
128         void vpsubd (Ymm dst, Ymm x, Operand y);
129         void vpmulld(Ymm dst, Ymm x, Operand y);
130 
131         void vpaddw   (Ymm dst, Ymm x, Operand y);
132         void vpsubw   (Ymm dst, Ymm x, Operand y);
133         void vpmullw  (Ymm dst, Ymm x, Operand y);
134 
135         void vpabsw   (Ymm dst, Operand x);
136         void vpavgw   (Ymm dst, Ymm x, Operand y);  // dst = (x+y+1)>>1, unsigned.
137         void vpmulhrsw(Ymm dst, Ymm x, Operand y);  // dst = (x*y + (1<<14)) >> 15, signed.
138         void vpminsw  (Ymm dst, Ymm x, Operand y);
139         void vpminuw  (Ymm dst, Ymm x, Operand y);
140         void vpmaxsw  (Ymm dst, Ymm x, Operand y);
141         void vpmaxuw  (Ymm dst, Ymm x, Operand y);
142 
143         void vaddps(Ymm dst, Ymm x, Operand y);
144         void vsubps(Ymm dst, Ymm x, Operand y);
145         void vmulps(Ymm dst, Ymm x, Operand y);
146         void vdivps(Ymm dst, Ymm x, Operand y);
147         void vminps(Ymm dst, Ymm x, Operand y);
148         void vmaxps(Ymm dst, Ymm x, Operand y);
149 
150         void vsqrtps(Ymm dst, Operand x);
151 
152         void vfmadd132ps(Ymm dst, Ymm x, Operand y);
153         void vfmadd213ps(Ymm dst, Ymm x, Operand y);
154         void vfmadd231ps(Ymm dst, Ymm x, Operand y);
155 
156         void vfmsub132ps(Ymm dst, Ymm x, Operand y);
157         void vfmsub213ps(Ymm dst, Ymm x, Operand y);
158         void vfmsub231ps(Ymm dst, Ymm x, Operand y);
159 
160         void vfnmadd132ps(Ymm dst, Ymm x, Operand y);
161         void vfnmadd213ps(Ymm dst, Ymm x, Operand y);
162         void vfnmadd231ps(Ymm dst, Ymm x, Operand y);
163 
164         void vpackusdw(Ymm dst, Ymm x, Operand y);
165         void vpackuswb(Ymm dst, Ymm x, Operand y);
166 
167         void vpunpckldq(Ymm dst, Ymm x, Operand y);
168         void vpunpckhdq(Ymm dst, Ymm x, Operand y);
169 
170         void vpcmpeqd(Ymm dst, Ymm x, Operand y);
171         void vpcmpgtd(Ymm dst, Ymm x, Operand y);
172         void vpcmpeqw(Ymm dst, Ymm x, Operand y);
173         void vpcmpgtw(Ymm dst, Ymm x, Operand y);
174 
175         void vcmpps   (Ymm dst, Ymm x, Operand y, int imm);
vcmpeqps(Ymm dst,Ymm x,Operand y)176         void vcmpeqps (Ymm dst, Ymm x, Operand y) { this->vcmpps(dst,x,y,0); }
vcmpltps(Ymm dst,Ymm x,Operand y)177         void vcmpltps (Ymm dst, Ymm x, Operand y) { this->vcmpps(dst,x,y,1); }
vcmpleps(Ymm dst,Ymm x,Operand y)178         void vcmpleps (Ymm dst, Ymm x, Operand y) { this->vcmpps(dst,x,y,2); }
vcmpneqps(Ymm dst,Ymm x,Operand y)179         void vcmpneqps(Ymm dst, Ymm x, Operand y) { this->vcmpps(dst,x,y,4); }
180 
181         // Sadly, the x parameter cannot be a general Operand for these shifts.
182         void vpslld(Ymm dst, Ymm x, int imm);
183         void vpsrld(Ymm dst, Ymm x, int imm);
184         void vpsrad(Ymm dst, Ymm x, int imm);
185 
186         void vpsllw(Ymm dst, Ymm x, int imm);
187         void vpsrlw(Ymm dst, Ymm x, int imm);
188         void vpsraw(Ymm dst, Ymm x, int imm);
189 
190         void vpermq    (Ymm dst, Operand x, int imm);
191         void vperm2f128(Ymm dst, Ymm x, Operand y, int imm);
192         void vpermps   (Ymm dst, Ymm ix, Operand src);        // dst[i] = src[ix[i]]
193 
194         enum Rounding { NEAREST, FLOOR, CEIL, TRUNC, CURRENT };
195         void vroundps(Ymm dst, Operand x, Rounding);
196 
197         void vmovdqa(Ymm dst, Operand x);
198         void vmovups(Ymm dst, Operand x);
199         void vmovups(Xmm dst, Operand x);
200         void vmovups(Operand dst, Ymm x);
201         void vmovups(Operand dst, Xmm x);
202 
203         void vcvtdq2ps (Ymm dst, Operand x);
204         void vcvttps2dq(Ymm dst, Operand x);
205         void vcvtps2dq (Ymm dst, Operand x);
206 
207         void vcvtps2ph(Operand dst, Ymm x, Rounding);
208         void vcvtph2ps(Ymm dst, Operand x);
209 
210         void vpblendvb(Ymm dst, Ymm x, Operand y, Ymm z);
211 
212         void vpshufb(Ymm dst, Ymm x, Operand y);
213 
214         void vptest(Ymm x, Operand y);
215 
216         void vbroadcastss(Ymm dst, Operand y);
217 
218         void vpmovzxwd(Ymm dst, Operand src);   // dst = src, 128-bit, uint16_t -> int
219         void vpmovzxbd(Ymm dst, Operand src);   // dst = src,  64-bit, uint8_t  -> int
220 
221         void vmovq(Operand dst, Xmm src);  // dst = src,  64-bit
222         void vmovd(Operand dst, Xmm src);  // dst = src,  32-bit
223         void vmovd(Xmm dst, Operand src);  // dst = src,  32-bit
224 
225         void vpinsrd(Xmm dst, Xmm src, Operand y, int imm);  // dst = src; dst[imm] = y, 32-bit
226         void vpinsrw(Xmm dst, Xmm src, Operand y, int imm);  // dst = src; dst[imm] = y, 16-bit
227         void vpinsrb(Xmm dst, Xmm src, Operand y, int imm);  // dst = src; dst[imm] = y,  8-bit
228 
229         void vextracti128(Operand dst, Ymm src, int imm);    // dst = src[imm], 128-bit
230         void vpextrd     (Operand dst, Xmm src, int imm);    // dst = src[imm],  32-bit
231         void vpextrw     (Operand dst, Xmm src, int imm);    // dst = src[imm],  16-bit
232         void vpextrb     (Operand dst, Xmm src, int imm);    // dst = src[imm],   8-bit
233 
234         // if (mask & 0x8000'0000) {
235         //     dst = base[scale*ix];
236         // }
237         // mask = 0;
238         void vgatherdps(Ymm dst, Scale scale, Ymm ix, GP64 base, Ymm mask);
239 
240 
241         void label(Label*);
242 
243         void jmp(Label*);
244         void je (Label*);
245         void jne(Label*);
246         void jl (Label*);
247         void jc (Label*);
248 
249         void add (Operand dst, int imm);
250         void sub (Operand dst, int imm);
251         void cmp (Operand dst, int imm);
252         void mov (Operand dst, int imm);
253         void movb(Operand dst, int imm);
254 
255         void add (Operand dst, GP64 x);
256         void sub (Operand dst, GP64 x);
257         void cmp (Operand dst, GP64 x);
258         void mov (Operand dst, GP64 x);
259         void movb(Operand dst, GP64 x);
260 
261         void add (GP64 dst, Operand x);
262         void sub (GP64 dst, Operand x);
263         void cmp (GP64 dst, Operand x);
264         void mov (GP64 dst, Operand x);
265         void movb(GP64 dst, Operand x);
266 
267         // Disambiguators... choice is arbitrary (but generates different code!).
add(GP64 dst,GP64 x)268         void add (GP64 dst, GP64 x) { this->add (Operand(dst), x); }
sub(GP64 dst,GP64 x)269         void sub (GP64 dst, GP64 x) { this->sub (Operand(dst), x); }
cmp(GP64 dst,GP64 x)270         void cmp (GP64 dst, GP64 x) { this->cmp (Operand(dst), x); }
mov(GP64 dst,GP64 x)271         void mov (GP64 dst, GP64 x) { this->mov (Operand(dst), x); }
movb(GP64 dst,GP64 x)272         void movb(GP64 dst, GP64 x) { this->movb(Operand(dst), x); }
273 
274         void movzbq(GP64 dst, Operand x);  // dst = x, uint8_t  -> int
275         void movzwq(GP64 dst, Operand x);  // dst = x, uint16_t -> int
276 
277         // aarch64
278 
279         // d = op(n,m)
280         using DOpNM = void(V d, V n, V m);
281         DOpNM  and16b, orr16b, eor16b, bic16b, bsl16b,
282                add4s,  sub4s,  mul4s,
283               cmeq4s, cmgt4s,
284                        sub8h,  mul8h,
285               fadd4s, fsub4s, fmul4s, fdiv4s, fmin4s, fmax4s,
286               fcmeq4s, fcmgt4s, fcmge4s,
287               tbl,
288               uzp14s, uzp24s,
289               zip14s, zip24s;
290 
291         // TODO: there are also float ==,<,<=,>,>= instructions with an immediate 0.0f,
292         // and the register comparison > and >= can also compare absolute values.  Interesting.
293 
294         // d += n*m
295         void fmla4s(V d, V n, V m);
296 
297         // d -= n*m
298         void fmls4s(V d, V n, V m);
299 
300         // d = op(n,imm)
301         using DOpNImm = void(V d, V n, int imm);
302         DOpNImm sli4s,
303                 shl4s, sshr4s, ushr4s,
304                                ushr8h;
305 
306         // d = op(n)
307         using DOpN = void(V d, V n);
308         DOpN not16b,    // d = ~n
309              fneg4s,    // d = -n
310              fsqrt4s,   // d = sqrtf(n)
311              scvtf4s,   // int -> float
312              fcvtzs4s,  // truncate float -> int
313              fcvtns4s,  // round float -> int  (nearest even)
314              frintp4s,  // round float -> int as float, toward plus infinity  (ceil)
315              frintm4s,  // round float -> int as float, toward minus infinity (floor)
316              fcvtn,     // f32 -> f16 in low half
317              fcvtl,     // f16 in low half -> f32
318              xtns2h,    // u32 -> u16
319              xtnh2b,    // u16 -> u8
320              uxtlb2h,   // u8 -> u16    (TODO: this is a special case of ushll.8h)
321              uxtlh2s,   // u16 -> u32   (TODO: this is a special case of ushll.4s)
322              uminv4s;   // dst[0] = min(n[0],n[1],n[2],n[3]), n as unsigned
323 
324         void brk (int imm16);
325         void ret (X);
326         void add (X d, X n, int imm12);
327         void sub (X d, X n, int imm12);
328         void subs(X d, X n, int imm12);  // subtract setting condition flags
329 
330         enum Shift { LSL,LSR,ASR,ROR };
331         void add (X d, X n, X m, Shift=LSL, int imm6=0);  // d=n+Shift(m,imm6), for Shift != ROR.
332 
333         // There's another encoding for unconditional branches that can jump further,
334         // but this one encoded as b.al is simple to implement and should be fine.
b(Label * l)335         void b  (Label* l) { this->b(Condition::al, l); }
bne(Label * l)336         void bne(Label* l) { this->b(Condition::ne, l); }
blt(Label * l)337         void blt(Label* l) { this->b(Condition::lt, l); }
338 
339         // "cmp ..." is just an assembler mnemonic for "subs xzr, ..."!
cmp(X n,int imm12)340         void cmp(X n, int imm12) { this->subs(xzr, n, imm12); }
341 
342         // Compare and branch if zero/non-zero, as if
343         //      cmp(t,0)
344         //      beq/bne(l)
345         // but without setting condition flags.
346         void cbz (X t, Label* l);
347         void cbnz(X t, Label* l);
348 
349         // TODO: there are ldur variants with unscaled imm, useful?
350         void ldrd(X dst, X src, int imm12=0);  // 64-bit dst = *(src+imm12*8)
351         void ldrs(X dst, X src, int imm12=0);  // 32-bit dst = *(src+imm12*4)
352         void ldrh(X dst, X src, int imm12=0);  // 16-bit dst = *(src+imm12*2)
353         void ldrb(X dst, X src, int imm12=0);  //  8-bit dst = *(src+imm12)
354 
355         void ldrq(V dst, Label*);  // 128-bit PC-relative load
356 
357         void ldrq(V dst, X src, int imm12=0);  // 128-bit dst = *(src+imm12*16)
358         void ldrd(V dst, X src, int imm12=0);  //  64-bit dst = *(src+imm12*8)
359         void ldrs(V dst, X src, int imm12=0);  //  32-bit dst = *(src+imm12*4)
360         void ldrh(V dst, X src, int imm12=0);  //  16-bit dst = *(src+imm12*2)
361         void ldrb(V dst, X src, int imm12=0);  //   8-bit dst = *(src+imm12)
362 
363         void strs(X src, X dst, int imm12=0);  // 32-bit *(dst+imm12*4) = src
364 
365         void strq(V src, X dst, int imm12=0);  // 128-bit *(dst+imm12*16) = src
366         void strd(V src, X dst, int imm12=0);  //  64-bit *(dst+imm12*8)  = src
367         void strs(V src, X dst, int imm12=0);  //  32-bit *(dst+imm12*4)  = src
368         void strh(V src, X dst, int imm12=0);  //  16-bit *(dst+imm12*2)  = src
369         void strb(V src, X dst, int imm12=0);  //   8-bit *(dst+imm12)    = src
370 
371         void movs(X dst, V src, int lane);  // dst = 32-bit src[lane]
372         void inss(V dst, X src, int lane);  // dst[lane] = 32-bit src
373 
374         void dup4s  (V dst, X src);  // Each 32-bit lane = src
375 
376         void ld1r4s (V dst, X src);  // Each 32-bit lane = *src
377         void ld1r8h (V dst, X src);  // Each 16-bit lane = *src
378         void ld1r16b(V dst, X src);  // Each  8-bit lane = *src
379 
380         void ld24s(V dst, X src);  // deinterleave(dst,dst+1)             = 256-bit *src
381         void ld44s(V dst, X src);  // deinterleave(dst,dst+1,dst+2,dst+3) = 512-bit *src
382         void st24s(V src, X dst);  // 256-bit *dst = interleave_32bit_lanes(src,src+1)
383         void st44s(V src, X dst);  // 512-bit *dst = interleave_32bit_lanes(src,src+1,src+2,src+3)
384 
385         void ld24s(V dst, X src, int lane);  // Load 2 32-bit values into given lane of dst..dst+1
386         void ld44s(V dst, X src, int lane);  // Load 4 32-bit values into given lane of dst..dst+3
387 
388     private:
389         uint8_t* fCode;
390         size_t   fSize;
391 
392         // x86-64
393         enum W { W0, W1 };      // Are the lanes 64-bit (W1) or default (W0)?  Intel Vol 2A 2.3.5.5
394         enum L { L128, L256 };  // Is this a 128- or 256-bit operation?        Intel Vol 2A 2.3.6.2
395 
396         // Helpers for vector instructions.
397         void op(int prefix, int map, int opcode, int dst, int x, Operand y, W,L);
398         void op(int p, int m, int o, Ymm d, Ymm x, Operand y, W w=W0) { op(p,m,o, d,x,y,w,L256); }
399         void op(int p, int m, int o, Ymm d,        Operand y, W w=W0) { op(p,m,o, d,0,y,w,L256); }
400         void op(int p, int m, int o, Xmm d, Xmm x, Operand y, W w=W0) { op(p,m,o, d,x,y,w,L128); }
401         void op(int p, int m, int o, Xmm d,        Operand y, W w=W0) { op(p,m,o, d,0,y,w,L128); }
402 
403         // Helpers for GP64 instructions.
404         void op(int opcode, Operand dst, GP64 x);
405         void op(int opcode, int opcode_ext, Operand dst, int imm);
406 
407         void jump(uint8_t condition, Label*);
408         int disp32(Label*);
409         void imm_byte_after_operand(const Operand&, int byte);
410 
411         // aarch64
412 
413         // Opcode for 3-arguments ops is split between hi and lo:
414         //    [11 bits hi] [5 bits m] [6 bits lo] [5 bits n] [5 bits d]
415         void op(uint32_t hi, V m, uint32_t lo, V n, V d);
416 
417         // 0,1,2-argument ops, with or without an immediate:
418         //    [ 22 bits op ] [5 bits n] [5 bits d]
419         // Any immediate falls in the middle somewhere overlapping with either op, n, or both.
420         void op(uint32_t op22, V n, V d, int imm=0);
421         void op(uint32_t op22, X n, V d, int imm=0) { this->op(op22,(V)n,   d,imm); }
422         void op(uint32_t op22, V n, X d, int imm=0) { this->op(op22,   n,(V)d,imm); }
423         void op(uint32_t op22, X n, X d, int imm=0) { this->op(op22,(V)n,(V)d,imm); }
424         void op(uint32_t op22,           int imm=0) { this->op(op22,(V)0,(V)0,imm); }
425         // (1-argument ops don't seem to have a consistent convention of passing as n or d.)
426 
427 
428         // Order matters... value is 4-bit encoding for condition code.
429         enum class Condition { eq,ne,cs,cc,mi,pl,vs,vc,hi,ls,ge,lt,gt,le,al };
430         void b(Condition, Label*);
431         int disp19(Label*);
432     };
433 
434     // Order matters a little: Ops <=store128 are treated as having side effects.
435     #define SKVM_OPS(M)                                              \
436         M(assert_true)                                               \
437         M(store8)   M(store16)   M(store32) M(store64) M(store128)   \
438         M(load8)    M(load16)    M(load32)  M(load64) M(load128)     \
439         M(index)                                                     \
440         M(gather8)  M(gather16)  M(gather32)                         \
441                                  M(uniform32)                        \
442         M(splat)                                                     \
443         M(add_f32) M(add_i32)                                        \
444         M(sub_f32) M(sub_i32)                                        \
445         M(mul_f32) M(mul_i32)                                        \
446         M(div_f32)                                                   \
447         M(min_f32) M(max_f32)                                        \
448         M(fma_f32) M(fms_f32) M(fnma_f32)                            \
449         M(sqrt_f32)                                                  \
450         M(shl_i32) M(shr_i32) M(sra_i32)                             \
451         M(ceil) M(floor) M(trunc) M(round) M(to_fp16) M(from_fp16)   \
452         M(to_f32)                                                    \
453         M(neq_f32) M(eq_f32) M(eq_i32)                               \
454         M(gte_f32) M(gt_f32) M(gt_i32)                               \
455         M(bit_and)     M(bit_or)     M(bit_xor)     M(bit_clear)     \
456         M(select)
457     // End of SKVM_OPS
458 
459     enum class Op : int {
460     #define M(op) op,
461         SKVM_OPS(M)
462     #undef M
463     };
464 
has_side_effect(Op op)465     static inline bool has_side_effect(Op op) {
466         return op <= Op::store128;
467     }
touches_varying_memory(Op op)468     static inline bool touches_varying_memory(Op op) {
469         return Op::store8 <= op && op <= Op::load128;
470     }
is_always_varying(Op op)471     static inline bool is_always_varying(Op op) {
472         return Op::store8 <= op && op <= Op::index;
473     }
474 
475     using Val = int;
    // We reserve an impossible Val ID as a sentinel
477     // NA meaning none, n/a, null, nil, etc.
478     static const Val NA = -1;
479 
480     struct Ptr { int ix; };
481 
482     struct I32 {
483         Builder* builder = nullptr;
484         Val      id      = NA;
485         explicit operator bool() const { return id != NA; }
486         Builder* operator->()    const { return builder; }
487     };
488 
489     struct F32 {
490         Builder* builder = nullptr;
491         Val      id      = NA;
492         explicit operator bool() const { return id != NA; }
493         Builder* operator->()    const { return builder; }
494     };
495 
496     struct Color {
497         F32 r,g,b,a;
498         explicit operator bool() const { return r && g && b && a; }
499         Builder* operator->()    const { return a.operator->(); }
500     };
501 
502     struct HSLA {
503         F32 h,s,l,a;
504         explicit operator bool() const { return h && s && l && a; }
505         Builder* operator->()    const { return a.operator->(); }
506     };
507 
508     struct Coord {
509         F32 x,y;
510         explicit operator bool() const { return x && y; }
511         Builder* operator->()    const { return x.operator->(); }
512     };
513 
514     struct Uniform {
515         Ptr ptr;
516         int offset;
517     };
518     struct Uniforms {
519         Ptr              base;
520         std::vector<int> buf;
521 
UniformsUniforms522         Uniforms(Ptr ptr, int init) : base(ptr), buf(init) {}
523 
pushUniforms524         Uniform push(int val) {
525             buf.push_back(val);
526             return {base, (int)( sizeof(int)*(buf.size() - 1) )};
527         }
528 
pushFUniforms529         Uniform pushF(float val) {
530             int bits;
531             memcpy(&bits, &val, sizeof(int));
532             return this->push(bits);
533         }
534 
pushPtrUniforms535         Uniform pushPtr(const void* ptr) {
536             // Jam the pointer into 1 or 2 ints.
537             int ints[sizeof(ptr) / sizeof(int)];
538             memcpy(ints, &ptr, sizeof(ptr));
539             for (int bits : ints) {
540                 buf.push_back(bits);
541             }
542             return {base, (int)( sizeof(int)*(buf.size() - SK_ARRAY_COUNT(ints)) )};
543         }
544     };
545 
546     struct PixelFormat {
547         enum { UNORM, FLOAT} encoding;
548         int r_bits,  g_bits,  b_bits,  a_bits,
549             r_shift, g_shift, b_shift, a_shift;
550     };
551     PixelFormat SkColorType_to_PixelFormat(SkColorType);
552 
553     SK_BEGIN_REQUIRE_DENSE
554     struct Instruction {
555         Op  op;         // v* = op(x,y,z,w,immA,immB), where * == index of this Instruction.
556         Val x,y,z,w;    // Enough arguments for Op::store128.
557         int immA,immB;  // Immediate bit pattern, shift count, pointer index, byte offset, etc.
558     };
559     SK_END_REQUIRE_DENSE
560 
561     bool operator==(const Instruction&, const Instruction&);
562     struct InstructionHash {
563         uint32_t operator()(const Instruction&, uint32_t seed=0) const;
564     };
565 
566     struct OptimizedInstruction {
567         Op op;
568         Val x,y,z,w;
569         int immA,immB;
570 
571         Val  death;
572         bool can_hoist;
573     };
574 
575     struct Features {
576         bool fma   = false;
577         bool fp16  = false;
578     };
579 
580     class Builder {
581     public:
582 
583         Builder();
584         explicit Builder(Features);
585 
586         Program done(const char* debug_name = nullptr, bool allow_jit=true) const;
587 
588         // Mostly for debugging, tests, etc.
program()589         std::vector<Instruction> program() const { return fProgram; }
590         std::vector<OptimizedInstruction> optimize() const;
591 
592         // Declare an argument with given stride (use stride=0 for uniforms).
593         // TODO: different types for varying and uniforms?
594         Ptr arg(int stride);
595 
596         // Convenience arg() wrappers for most common strides, sizeof(T) and 0.
597         template <typename T>
varying()598         Ptr varying() { return this->arg(sizeof(T)); }
uniform()599         Ptr uniform() { return this->arg(0); }
600 
601         // TODO: allow uniform (i.e. Ptr) offsets to store* and load*?
602         // TODO: sign extension (signed types) for <32-bit loads?
603         // TODO: unsigned integer operations where relevant (just comparisons?)?
604 
605         // Assert cond is true, printing debug when not.
606         void assert_true(I32 cond, I32 debug);
assert_true(I32 cond,F32 debug)607         void assert_true(I32 cond, F32 debug) { assert_true(cond, pun_to_I32(debug)); }
assert_true(I32 cond)608         void assert_true(I32 cond)            { assert_true(cond, cond); }
609 
610         // Store {8,16,32,64,128}-bit varying.
611         void store8  (Ptr ptr, I32 val);
612         void store16 (Ptr ptr, I32 val);
613         void store32 (Ptr ptr, I32 val);
storeF(Ptr ptr,F32 val)614         void storeF  (Ptr ptr, F32 val) { store32(ptr, pun_to_I32(val)); }
615         void store64 (Ptr ptr, I32 lo, I32 hi);              // *ptr = lo|(hi<<32)
616         void store128(Ptr ptr, I32 x, I32 y, I32 z, I32 w);  // *ptr = x|(y<<32)|(z<<64)|(w<<96)
617 
618         // Returns varying {n, n-1, n-2, ..., 1}, where n is the argument to Program::eval().
619         I32 index();
620 
621         // Load {8,16,32,64,128}-bit varying.
622         I32 load8  (Ptr ptr);
623         I32 load16 (Ptr ptr);
624         I32 load32 (Ptr ptr);
loadF(Ptr ptr)625         F32 loadF  (Ptr ptr) { return pun_to_F32(load32(ptr)); }
626         I32 load64 (Ptr ptr, int lane);  // Load 32-bit lane 0-1 of  64-bit value.
627         I32 load128(Ptr ptr, int lane);  // Load 32-bit lane 0-3 of 128-bit value.
628 
629         // Load i32/f32 uniform with byte-count offset.
630         I32 uniform32(Ptr ptr, int offset);
uniformF(Ptr ptr,int offset)631         F32 uniformF (Ptr ptr, int offset) { return pun_to_F32(uniform32(ptr,offset)); }
632 
633         // Push and load this color as a uniform.
634         Color uniformColor(SkColor4f, Uniforms*);
635 
636         // Gather u8,u16,i32 with varying element-count index from *(ptr + byte-count offset).
637         I32 gather8 (Ptr ptr, int offset, I32 index);
638         I32 gather16(Ptr ptr, int offset, I32 index);
639         I32 gather32(Ptr ptr, int offset, I32 index);
gatherF(Ptr ptr,int offset,I32 index)640         F32 gatherF (Ptr ptr, int offset, I32 index) {
641             return pun_to_F32(gather32(ptr, offset, index));
642         }
643 
644         // Convenience methods for working with skvm::Uniform(s).
uniform32(Uniform u)645         I32 uniform32(Uniform u)            { return this->uniform32(u.ptr, u.offset); }
uniformF(Uniform u)646         F32 uniformF (Uniform u)            { return this->uniformF (u.ptr, u.offset); }
gather8(Uniform u,I32 index)647         I32 gather8  (Uniform u, I32 index) { return this->gather8  (u.ptr, u.offset, index); }
gather16(Uniform u,I32 index)648         I32 gather16 (Uniform u, I32 index) { return this->gather16 (u.ptr, u.offset, index); }
gather32(Uniform u,I32 index)649         I32 gather32 (Uniform u, I32 index) { return this->gather32 (u.ptr, u.offset, index); }
gatherF(Uniform u,I32 index)650         F32 gatherF  (Uniform u, I32 index) { return this->gatherF  (u.ptr, u.offset, index); }
651 
652         // Load an immediate constant.
653         I32 splat(int      n);
splat(unsigned u)654         I32 splat(unsigned u) { return splat((int)u); }
splat(float f)655         F32 splat(float    f) {
656             int bits;
657             memcpy(&bits, &f, 4);
658             return pun_to_F32(splat(bits));
659         }
660 
661         // Some operations make sense with immediate arguments,
662         // so we provide overloads inline to make that seamless.
663         //
664         // We omit overloads that may indicate a bug or performance issue.
665         // In general it does not make sense to pass immediates to unary operations,
666         // and even sometimes not for binary operations, e.g.
667         //
668         //   div(x, y)    -- normal every day divide
669         //   div(3.0f, y) -- yep, makes sense
670         //   div(x, 3.0f) -- omitted as a reminder you probably want mul(x, 1/3.0f).
671         //
672         // You can of course always splat() to override these opinions.
673 
674         // float math, comparisons, etc.
675         F32 add(F32, F32);
add(F32 x,float y)676         F32 add(F32 x, float y) { return add(x, splat(y)); }
add(float x,F32 y)677         F32 add(float x, F32 y) { return add(splat(x), y); }
678 
679         F32 sub(F32, F32);
sub(F32 x,float y)680         F32 sub(F32 x, float y) { return sub(x, splat(y)); }
sub(float x,F32 y)681         F32 sub(float x, F32 y) { return sub(splat(x), y); }
682 
683         F32 mul(F32, F32);
mul(F32 x,float y)684         F32 mul(F32 x, float y) { return mul(x, splat(y)); }
mul(float x,F32 y)685         F32 mul(float x, F32 y) { return mul(splat(x), y); }
686 
687         // mul(), but allowing optimizations not strictly legal under IEEE-754 rules.
688         F32 fast_mul(F32, F32);
fast_mul(F32 x,float y)689         F32 fast_mul(F32 x, float y) { return fast_mul(x, splat(y)); }
fast_mul(float x,F32 y)690         F32 fast_mul(float x, F32 y) { return fast_mul(splat(x), y); }
691 
692         F32 div(F32, F32);
div(float x,F32 y)693         F32 div(float x, F32 y) { return div(splat(x), y); }
694 
695         F32 min(F32, F32);
min(F32 x,float y)696         F32 min(F32 x, float y) { return min(x, splat(y)); }
min(float x,F32 y)697         F32 min(float x, F32 y) { return min(splat(x), y); }
698 
699         F32 max(F32, F32);
max(F32 x,float y)700         F32 max(F32 x, float y) { return max(x, splat(y)); }
max(float x,F32 y)701         F32 max(float x, F32 y) { return max(splat(x), y); }
702 
703         // TODO: remove mad()?  It's just sugar.
mad(F32 x,F32 y,F32 z)704         F32 mad(F32   x, F32   y, F32   z) { return add(mul(x,y), z); }
mad(F32 x,F32 y,float z)705         F32 mad(F32   x, F32   y, float z) { return mad(      x ,       y , splat(z)); }
mad(F32 x,float y,F32 z)706         F32 mad(F32   x, float y, F32   z) { return mad(      x , splat(y),       z ); }
mad(F32 x,float y,float z)707         F32 mad(F32   x, float y, float z) { return mad(      x , splat(y), splat(z)); }
mad(float x,F32 y,F32 z)708         F32 mad(float x, F32   y, F32   z) { return mad(splat(x),       y ,       z ); }
mad(float x,F32 y,float z)709         F32 mad(float x, F32   y, float z) { return mad(splat(x),       y , splat(z)); }
mad(float x,float y,F32 z)710         F32 mad(float x, float y, F32   z) { return mad(splat(x), splat(y),       z ); }
711 
712         F32        sqrt(F32);
713         F32 approx_log2(F32);
714         F32 approx_pow2(F32);
approx_log(F32 x)715         F32 approx_log (F32 x) { return mul(0.69314718f, approx_log2(x)); }
approx_exp(F32 x)716         F32 approx_exp (F32 x) { return approx_pow2(mul(x, 1.4426950408889634074f)); }
717 
718         F32 approx_powf(F32 base, F32 exp);
approx_powf(F32 base,float exp)719         F32 approx_powf(F32 base, float exp) { return approx_powf(base, splat(exp)); }
approx_powf(float base,F32 exp)720         F32 approx_powf(float base, F32 exp) { return approx_powf(splat(base), exp); }
721 
722 
723         F32 approx_sin(F32 radians);
approx_cos(F32 radians)724         F32 approx_cos(F32 radians) { return approx_sin(add(radians, SK_ScalarPI/2)); }
725         F32 approx_tan(F32 radians);
726 
727         F32 approx_asin(F32 x);
approx_acos(F32 x)728         F32 approx_acos(F32 x) { return sub(SK_ScalarPI/2, approx_asin(x)); }
729         F32 approx_atan(F32 x);
730         F32 approx_atan2(F32 y, F32 x);
731 
732         F32 lerp(F32   lo, F32   hi, F32   t);
lerp(F32 lo,F32 hi,float t)733         F32 lerp(F32   lo, F32   hi, float t) { return lerp(      lo ,       hi , splat(t)); }
lerp(F32 lo,float hi,float t)734         F32 lerp(F32   lo, float hi, float t) { return lerp(      lo , splat(hi), splat(t)); }
lerp(F32 lo,float hi,F32 t)735         F32 lerp(F32   lo, float hi, F32   t) { return lerp(      lo , splat(hi),       t ); }
lerp(float lo,F32 hi,F32 t)736         F32 lerp(float lo, F32   hi, F32   t) { return lerp(splat(lo),       hi ,       t ); }
lerp(float lo,F32 hi,float t)737         F32 lerp(float lo, F32   hi, float t) { return lerp(splat(lo),       hi , splat(t)); }
lerp(float lo,float hi,F32 t)738         F32 lerp(float lo, float hi, F32   t) { return lerp(splat(lo), splat(hi),       t ); }
739 
clamp(F32 x,F32 lo,F32 hi)740         F32 clamp(F32   x, F32   lo, F32   hi) { return max(lo, min(x, hi)); }
clamp(F32 x,F32 lo,float hi)741         F32 clamp(F32   x, F32   lo, float hi) { return clamp(      x ,       lo , splat(hi)); }
clamp(F32 x,float lo,float hi)742         F32 clamp(F32   x, float lo, float hi) { return clamp(      x , splat(lo), splat(hi)); }
clamp(F32 x,float lo,F32 hi)743         F32 clamp(F32   x, float lo, F32   hi) { return clamp(      x , splat(lo),       hi ); }
clamp(float x,F32 lo,F32 hi)744         F32 clamp(float x, F32   lo, F32   hi) { return clamp(splat(x),       lo ,       hi ); }
clamp(float x,F32 lo,float hi)745         F32 clamp(float x, F32   lo, float hi) { return clamp(splat(x),       lo , splat(hi)); }
clamp(float x,float lo,F32 hi)746         F32 clamp(float x, float lo, F32   hi) { return clamp(splat(x), splat(lo),       hi ); }
747 
clamp01(F32 x)748         F32 clamp01(F32 x) { return clamp(x, 0.0f, 1.0f); }
749 
abs(F32 x)750         F32    abs(F32 x) { return pun_to_F32(bit_and(pun_to_I32(x), 0x7fff'ffff)); }
751         F32  fract(F32 x) { return sub(x, floor(x)); }
752         F32   ceil(F32);
753         F32  floor(F32);
754         I32 is_NaN   (F32 x) { return neq(x,x); }
755         I32 is_finite(F32 x) { return lt(bit_and(pun_to_I32(x), 0x7f80'0000), 0x7f80'0000); }
756 
757         I32 trunc(F32 x);
758         I32 round(F32 x);  // Round to int using current rounding mode (as if lrintf()).
759         I32 pun_to_I32(F32 x) { return {x.builder, x.id}; }
760 
        // Conversions between an F32 and a 16-bit float representation carried in an I32.
        // NOTE(review): presumably IEEE half precision -- confirm against the implementation.
        I32   to_fp16(F32 x);
        F32 from_fp16(I32 x);
763 
764         I32 eq(F32, F32);
765         I32 eq(F32 x, float y) { return eq(x, splat(y)); }
766         I32 eq(float x, F32 y) { return eq(splat(x), y); }
767 
768         I32 neq(F32, F32);
769         I32 neq(F32 x, float y) { return neq(x, splat(y)); }
770         I32 neq(float x, F32 y) { return neq(splat(x), y); }
771 
772         I32 lt(F32, F32);
773         I32 lt(F32 x, float y) { return lt(x, splat(y)); }
774         I32 lt(float x, F32 y) { return lt(splat(x), y); }
775 
776         I32 lte(F32, F32);
777         I32 lte(F32 x, float y) { return lte(x, splat(y)); }
778         I32 lte(float x, F32 y) { return lte(splat(x), y); }
779 
780         I32 gt(F32, F32);
781         I32 gt(F32 x, float y) { return gt(x, splat(y)); }
782         I32 gt(float x, F32 y) { return gt(splat(x), y); }
783 
784         I32 gte(F32, F32);
785         I32 gte(F32 x, float y) { return gte(x, splat(y)); }
786         I32 gte(float x, F32 y) { return gte(splat(x), y); }
787 
788         // int math, comparisons, etc.
789         I32 add(I32, I32);
790         I32 add(I32 x, int y) { return add(x, splat(y)); }
791         I32 add(int x, I32 y) { return add(splat(x), y); }
792 
793         I32 sub(I32, I32);
794         I32 sub(I32 x, int y) { return sub(x, splat(y)); }
795         I32 sub(int x, I32 y) { return sub(splat(x), y); }
796 
797         I32 mul(I32, I32);
798         I32 mul(I32 x, int y) { return mul(x, splat(y)); }
799         I32 mul(int x, I32 y) { return mul(splat(x), y); }
800 
        // Shifts of x by a fixed count `bits`.
        // NOTE(review): presumably shr is the logical and sra the arithmetic right
        // shift -- confirm against the implementation.
        I32 shl(I32 x, int bits);
        I32 shr(I32 x, int bits);
        I32 sra(I32 x, int bits);
804 
805         I32 eq(I32, I32);
806         I32 eq(I32 x, int y) { return eq(x, splat(y)); }
807         I32 eq(int x, I32 y) { return eq(splat(x), y); }
808 
809         I32 neq(I32, I32);
810         I32 neq(I32 x, int y) { return neq(x, splat(y)); }
811         I32 neq(int x, I32 y) { return neq(splat(x), y); }
812 
813         I32 lt(I32, I32);
814         I32 lt(I32 x, int y) { return lt(x, splat(y)); }
815         I32 lt(int x, I32 y) { return lt(splat(x), y); }
816 
817         I32 lte(I32, I32);
818         I32 lte(I32 x, int y) { return lte(x, splat(y)); }
819         I32 lte(int x, I32 y) { return lte(splat(x), y); }
820 
821         I32 gt(I32, I32);
822         I32 gt(I32 x, int y) { return gt(x, splat(y)); }
823         I32 gt(int x, I32 y) { return gt(splat(x), y); }
824 
825         I32 gte(I32, I32);
826         I32 gte(I32 x, int y) { return gte(x, splat(y)); }
827         I32 gte(int x, I32 y) { return gte(splat(x), y); }
828 
829         F32 to_F32(I32 x);
830         F32 pun_to_F32(I32 x) { return {x.builder, x.id}; }
831 
832         // Bitwise operations.
833         I32 bit_and(I32, I32);
834         I32 bit_and(I32 x, int y) { return bit_and(x, splat(y)); }
835         I32 bit_and(int x, I32 y) { return bit_and(splat(x), y); }
836 
837         I32 bit_or(I32, I32);
838         I32 bit_or(I32 x, int y) { return bit_or(x, splat(y)); }
839         I32 bit_or(int x, I32 y) { return bit_or(splat(x), y); }
840 
841         I32 bit_xor(I32, I32);
842         I32 bit_xor(I32 x, int y) { return bit_xor(x, splat(y)); }
843         I32 bit_xor(int x, I32 y) { return bit_xor(splat(x), y); }
844 
845         I32 bit_clear(I32, I32);
846         I32 bit_clear(I32 x, int y) { return bit_clear(x, splat(y)); }
847         I32 bit_clear(int x, I32 y) { return bit_clear(splat(x), y); }
848 
849         I32 min(I32 x, I32 y) { return select(lte(x,y), x, y); }
850         I32 min(I32 x, int y) { return min(x, splat(y)); }
851         I32 min(int x, I32 y) { return min(splat(x), y); }
852 
853         I32 max(I32 x, I32 y) { return select(gte(x,y), x, y); }
854         I32 max(I32 x, int y) { return max(x, splat(y)); }
855         I32 max(int x, I32 y) { return max(splat(x), y); }
856 
857         I32 select(I32 cond, I32 t, I32 f);  // cond ? t : f
858         I32 select(I32 cond, int t, I32 f) { return select(cond, splat(t),       f ); }
859         I32 select(I32 cond, I32 t, int f) { return select(cond,       t , splat(f)); }
860         I32 select(I32 cond, int t, int f) { return select(cond, splat(t), splat(f)); }
861 
862         F32 select(I32 cond, F32 t, F32 f) {
863             return pun_to_F32(select(cond, pun_to_I32(t)
864                                          , pun_to_I32(f)));
865         }
866         F32 select(I32 cond, float t, F32   f) { return select(cond, splat(t),       f ); }
867         F32 select(I32 cond, F32   t, float f) { return select(cond,       t , splat(f)); }
868         F32 select(I32 cond, float t, float f) { return select(cond, splat(t), splat(f)); }
869 
870         I32 extract(I32 x, int bits, I32 z);   // (x>>bits) & z
871         I32 extract(I32 x, int bits, int z) { return extract(x, bits, splat(z)); }
872         I32 extract(int x, int bits, I32 z) { return extract(splat(x), bits, z); }
873 
874         I32 pack(I32 x, I32 y, int bits);   // x | (y<<bits)
875         I32 pack(I32 x, int y, int bits) { return pack(x, splat(y), bits); }
876         I32 pack(int x, I32 y, int bits) { return pack(splat(x), y, bits); }
877 
878 
879         // Common idioms used in several places, worth centralizing for consistency.
880         F32 from_unorm(int bits, I32);   // E.g. from_unorm(8, x) -> x * (1/255.0f)
881         I32   to_unorm(int bits, F32);   // E.g.   to_unorm(8, x) -> round(x * 255)
882 
883         Color   load(PixelFormat, Ptr ptr);
884         void   store(PixelFormat, Ptr ptr, Color);
885         Color gather(PixelFormat, Ptr ptr, int offset, I32 index);
886         Color gather(PixelFormat f, Uniform u, I32 index) {
887             return gather(f, u.ptr, u.offset, index);
888         }
889 
890         void   premul(F32* r, F32* g, F32* b, F32 a);
891         void unpremul(F32* r, F32* g, F32* b, F32 a);
892 
893         Color   premul(Color c) {   this->premul(&c.r, &c.g, &c.b, c.a); return c; }
894         Color unpremul(Color c) { this->unpremul(&c.r, &c.g, &c.b, c.a); return c; }
895 
896         Color lerp(Color lo, Color hi, F32 t);
897         Color blend(SkBlendMode, Color src, Color dst);
898 
899         Color clamp01(Color c) {
900             return { clamp01(c.r), clamp01(c.g), clamp01(c.b), clamp01(c.a) };
901         }
902 
        // Conversions between the Color and HSLA representations.
        HSLA  to_hsla(Color);
        Color to_rgba(HSLA);

        // Writes a dump to the given stream.
        // NOTE(review): behavior for the default nullptr is not visible here -- confirm.
        void dump(SkWStream* = nullptr) const;

        // Returns a 64-bit hash (presumably of the program built so far -- confirm).
        uint64_t hash() const;

        // Takes an Instruction and returns a Val for it; the private push(Op, ...)
        // overload forwards here.
        Val push(Instruction);
911 
912         bool allImm() const { return true; }
913 
914         template <typename T, typename... Rest>
915         bool allImm(Val id, T* imm, Rest... rest) const {
916             if (fProgram[id].op == Op::splat) {
917                 static_assert(sizeof(T) == 4);
918                 memcpy(imm, &fProgram[id].immA, 4);
919                 return this->allImm(rest...);
920             }
921             return false;
922         }
923 
924     private:
925         Val push(Op op, Val x=NA, Val y=NA, Val z=NA, Val w=NA, int immA=0, int immB=0) {
926             return this->push(Instruction{op, x,y,z,w, immA,immB});
927         }
928 
929         template <typename T>
930         bool isImm(Val id, T want) const {
931             T imm = 0;
932             return this->allImm(id, &imm) && imm == want;
933         }
934 
        // Map from Instruction to Val, hashed with InstructionHash
        // (presumably used to deduplicate identical instructions -- confirm).
        SkTHashMap<Instruction, Val, InstructionHash> fIndex;
        // The instructions themselves; allImm() indexes this vector by Val.
        std::vector<Instruction>                      fProgram;
        // NOTE(review): meaning not visible here; Program's constructor also takes a
        // strides vector -- confirm the relationship.
        std::vector<int>                              fStrides;
        // Const, so it can only be set at construction.
        const Features                                fFeatures;
939     };
940 
    // Optimization passes and data structures normally used by Builder::optimize(),
    // extracted here so they can be unit tested.
    // Both take their input program by value; finalize() returns it as
    // OptimizedInstructions, the form Program's constructor accepts.
    std::vector<Instruction>          eliminate_dead_code(std::vector<Instruction>);
    std::vector<OptimizedInstruction> finalize           (std::vector<Instruction>);
945 
946     using Reg = int;
947 
948     // d = op(x,y,z,w, immA,immB)
949     struct InterpreterInstruction {
950         Op  op;
951         Reg d,x,y,z,w;
952         int immA,immB;
953     };
954 
955     class Program {
956     public:
957         Program(const std::vector<OptimizedInstruction>& instructions,
958                 const std::vector<int>& strides,
959                 const char* debug_name, bool allow_jit);
960 
961         Program();
962         ~Program();
963 
964         Program(Program&&);
965         Program& operator=(Program&&);
966 
967         Program(const Program&) = delete;
968         Program& operator=(const Program&) = delete;
969 
970         void eval(int n, void* args[]) const;
971 
972         template <typename... T>
973         void eval(int n, T*... arg) const {
974             SkASSERT(sizeof...(arg) == this->nargs());
975             // This nullptr isn't important except that it makes args[] non-empty if you pass none.
976             void* args[] = { (void*)arg..., nullptr };
977             this->eval(n, args);
978         }
979 
980         std::vector<InterpreterInstruction> instructions() const;
981         int  nargs() const;
982         int  nregs() const;
983         int  loop () const;
984         bool empty() const;
985 
986         bool hasJIT() const;  // Has this Program been JITted?
987 
988         void dump(SkWStream* = nullptr) const;
989 
990     private:
991         void setupInterpreter(const std::vector<OptimizedInstruction>&);
992         void setupJIT        (const std::vector<OptimizedInstruction>&, const char* debug_name);
993         void setupLLVM       (const std::vector<OptimizedInstruction>&, const char* debug_name);
994 
995         bool jit(const std::vector<OptimizedInstruction>&,
996                  int* stack_hint, uint32_t* registers_used,
997                  Assembler*) const;
998 
999         void waitForLLVM() const;
1000         void dropJIT();
1001 
1002         struct Impl;
1003         std::unique_ptr<Impl> fImpl;
1004     };
1005 
1006     // TODO: control flow
1007     // TODO: 64-bit values?
1008 
// Shorthand for the free-function wrappers that follow in this namespace.
#define SI static inline
1010 
1011     SI I32 operator+(I32 x, I32 y) { return x->add(x,y); }
1012     SI I32 operator+(I32 x, int y) { return x->add(x,y); }
1013     SI I32 operator+(int x, I32 y) { return y->add(x,y); }
1014 
1015     SI I32 operator-(I32 x, I32 y) { return x->sub(x,y); }
1016     SI I32 operator-(I32 x, int y) { return x->sub(x,y); }
1017     SI I32 operator-(int x, I32 y) { return y->sub(x,y); }
1018 
1019     SI I32 operator*(I32 x, I32 y) { return x->mul(x,y); }
1020     SI I32 operator*(I32 x, int y) { return x->mul(x,y); }
1021     SI I32 operator*(int x, I32 y) { return y->mul(x,y); }
1022 
min(I32 x,I32 y)1023     SI I32 min(I32 x, I32 y) { return x->min(x,y); }
min(I32 x,int y)1024     SI I32 min(I32 x, int y) { return x->min(x,y); }
min(int x,I32 y)1025     SI I32 min(int x, I32 y) { return y->min(x,y); }
1026 
max(I32 x,I32 y)1027     SI I32 max(I32 x, I32 y) { return x->max(x,y); }
max(I32 x,int y)1028     SI I32 max(I32 x, int y) { return x->max(x,y); }
max(int x,I32 y)1029     SI I32 max(int x, I32 y) { return y->max(x,y); }
1030 
1031     SI I32 operator==(I32 x, I32 y) { return x->eq(x,y); }
1032     SI I32 operator==(I32 x, int y) { return x->eq(x,y); }
1033     SI I32 operator==(int x, I32 y) { return y->eq(x,y); }
1034 
1035     SI I32 operator!=(I32 x, I32 y) { return x->neq(x,y); }
1036     SI I32 operator!=(I32 x, int y) { return x->neq(x,y); }
1037     SI I32 operator!=(int x, I32 y) { return y->neq(x,y); }
1038 
1039     SI I32 operator< (I32 x, I32 y) { return x->lt(x,y); }
1040     SI I32 operator< (I32 x, int y) { return x->lt(x,y); }
1041     SI I32 operator< (int x, I32 y) { return y->lt(x,y); }
1042 
1043     SI I32 operator<=(I32 x, I32 y) { return x->lte(x,y); }
1044     SI I32 operator<=(I32 x, int y) { return x->lte(x,y); }
1045     SI I32 operator<=(int x, I32 y) { return y->lte(x,y); }
1046 
1047     SI I32 operator> (I32 x, I32 y) { return x->gt(x,y); }
1048     SI I32 operator> (I32 x, int y) { return x->gt(x,y); }
1049     SI I32 operator> (int x, I32 y) { return y->gt(x,y); }
1050 
1051     SI I32 operator>=(I32 x, I32 y) { return x->gte(x,y); }
1052     SI I32 operator>=(I32 x, int y) { return x->gte(x,y); }
1053     SI I32 operator>=(int x, I32 y) { return y->gte(x,y); }
1054 
1055 
1056     SI F32 operator+(F32   x, F32   y) { return x->add(x,y); }
1057     SI F32 operator+(F32   x, float y) { return x->add(x,y); }
1058     SI F32 operator+(float x, F32   y) { return y->add(x,y); }
1059 
1060     SI F32 operator-(F32   x, F32   y) { return x->sub(x,y); }
1061     SI F32 operator-(F32   x, float y) { return x->sub(x,y); }
1062     SI F32 operator-(float x, F32   y) { return y->sub(x,y); }
1063 
1064     SI F32 operator*(F32   x, F32   y) { return x->mul(x,y); }
1065     SI F32 operator*(F32   x, float y) { return x->mul(x,y); }
1066     SI F32 operator*(float x, F32   y) { return y->mul(x,y); }
1067 
fast_mul(F32 x,F32 y)1068     SI F32 fast_mul(F32   x, F32   y) { return x->fast_mul(x,y); }
fast_mul(F32 x,float y)1069     SI F32 fast_mul(F32   x, float y) { return x->fast_mul(x,y); }
fast_mul(float x,F32 y)1070     SI F32 fast_mul(float x, F32   y) { return y->fast_mul(x,y); }
1071 
1072     SI F32 operator/(F32   x, F32  y) { return x->div(x,y); }
1073     SI F32 operator/(float x, F32  y) { return y->div(x,y); }
1074 
min(F32 x,F32 y)1075     SI F32 min(F32   x, F32   y) { return x->min(x,y); }
min(F32 x,float y)1076     SI F32 min(F32   x, float y) { return x->min(x,y); }
min(float x,F32 y)1077     SI F32 min(float x, F32   y) { return y->min(x,y); }
1078 
max(F32 x,F32 y)1079     SI F32 max(F32   x, F32   y) { return x->max(x,y); }
max(F32 x,float y)1080     SI F32 max(F32   x, float y) { return x->max(x,y); }
max(float x,F32 y)1081     SI F32 max(float x, F32   y) { return y->max(x,y); }
1082 
1083     SI I32 operator==(F32   x, F32   y) { return x->eq(x,y); }
1084     SI I32 operator==(F32   x, float y) { return x->eq(x,y); }
1085     SI I32 operator==(float x, F32   y) { return y->eq(x,y); }
1086 
1087     SI I32 operator!=(F32   x, F32   y) { return x->neq(x,y); }
1088     SI I32 operator!=(F32   x, float y) { return x->neq(x,y); }
1089     SI I32 operator!=(float x, F32   y) { return y->neq(x,y); }
1090 
1091     SI I32 operator< (F32   x, F32   y) { return x->lt(x,y); }
1092     SI I32 operator< (F32   x, float y) { return x->lt(x,y); }
1093     SI I32 operator< (float x, F32   y) { return y->lt(x,y); }
1094 
1095     SI I32 operator<=(F32   x, F32   y) { return x->lte(x,y); }
1096     SI I32 operator<=(F32   x, float y) { return x->lte(x,y); }
1097     SI I32 operator<=(float x, F32   y) { return y->lte(x,y); }
1098 
1099     SI I32 operator> (F32   x, F32   y) { return x->gt(x,y); }
1100     SI I32 operator> (F32   x, float y) { return x->gt(x,y); }
1101     SI I32 operator> (float x, F32   y) { return y->gt(x,y); }
1102 
1103     SI I32 operator>=(F32   x, F32   y) { return x->gte(x,y); }
1104     SI I32 operator>=(F32   x, float y) { return x->gte(x,y); }
1105     SI I32 operator>=(float x, F32   y) { return y->gte(x,y); }
1106 
1107     SI I32& operator+=(I32& x, I32 y) { return (x = x + y); }
1108     SI I32& operator+=(I32& x, int y) { return (x = x + y); }
1109 
1110     SI I32& operator-=(I32& x, I32 y) { return (x = x - y); }
1111     SI I32& operator-=(I32& x, int y) { return (x = x - y); }
1112 
1113     SI I32& operator*=(I32& x, I32 y) { return (x = x * y); }
1114     SI I32& operator*=(I32& x, int y) { return (x = x * y); }
1115 
1116     SI F32& operator+=(F32& x, F32   y) { return (x = x + y); }
1117     SI F32& operator+=(F32& x, float y) { return (x = x + y); }
1118 
1119     SI F32& operator-=(F32& x, F32   y) { return (x = x - y); }
1120     SI F32& operator-=(F32& x, float y) { return (x = x - y); }
1121 
1122     SI F32& operator*=(F32& x, F32   y) { return (x = x * y); }
1123     SI F32& operator*=(F32& x, float y) { return (x = x * y); }
1124 
1125     SI F32& operator/=(F32& x, F32   y) { return (x = x / y); }
1126 
assert_true(I32 cond,I32 debug)1127     SI void assert_true(I32 cond, I32 debug) { cond->assert_true(cond,debug); }
assert_true(I32 cond,F32 debug)1128     SI void assert_true(I32 cond, F32 debug) { cond->assert_true(cond,debug); }
assert_true(I32 cond)1129     SI void assert_true(I32 cond)            { cond->assert_true(cond); }
1130 
store8(Ptr ptr,I32 val)1131     SI void store8  (Ptr ptr, I32 val)                    { val->store8  (ptr, val); }
store16(Ptr ptr,I32 val)1132     SI void store16 (Ptr ptr, I32 val)                    { val->store16 (ptr, val); }
store32(Ptr ptr,I32 val)1133     SI void store32 (Ptr ptr, I32 val)                    { val->store32 (ptr, val); }
storeF(Ptr ptr,F32 val)1134     SI void storeF  (Ptr ptr, F32 val)                    { val->storeF  (ptr, val); }
store64(Ptr ptr,I32 lo,I32 hi)1135     SI void store64 (Ptr ptr, I32 lo, I32 hi)             { lo ->store64 (ptr, lo,hi); }
store128(Ptr ptr,I32 x,I32 y,I32 z,I32 w)1136     SI void store128(Ptr ptr, I32 x, I32 y, I32 z, I32 w) { x  ->store128(ptr, x,y,z,w); }
1137 
gather8(Ptr ptr,int off,I32 ix)1138     SI I32 gather8 (Ptr ptr, int off, I32 ix) { return ix->gather8 (ptr, off, ix); }
gather16(Ptr ptr,int off,I32 ix)1139     SI I32 gather16(Ptr ptr, int off, I32 ix) { return ix->gather16(ptr, off, ix); }
gather32(Ptr ptr,int off,I32 ix)1140     SI I32 gather32(Ptr ptr, int off, I32 ix) { return ix->gather32(ptr, off, ix); }
gatherF(Ptr ptr,int off,I32 ix)1141     SI F32 gatherF (Ptr ptr, int off, I32 ix) { return ix->gatherF (ptr, off, ix); }
1142 
gather8(Uniform u,I32 ix)1143     SI I32 gather8 (Uniform u, I32 ix) { return ix->gather8 (u, ix); }
gather16(Uniform u,I32 ix)1144     SI I32 gather16(Uniform u, I32 ix) { return ix->gather16(u, ix); }
gather32(Uniform u,I32 ix)1145     SI I32 gather32(Uniform u, I32 ix) { return ix->gather32(u, ix); }
gatherF(Uniform u,I32 ix)1146     SI F32 gatherF (Uniform u, I32 ix) { return ix->gatherF (u, ix); }
1147 
sqrt(F32 x)1148     SI F32        sqrt(F32 x) { return x->       sqrt(x); }
approx_log2(F32 x)1149     SI F32 approx_log2(F32 x) { return x->approx_log2(x); }
approx_pow2(F32 x)1150     SI F32 approx_pow2(F32 x) { return x->approx_pow2(x); }
approx_log(F32 x)1151     SI F32 approx_log (F32 x) { return x->approx_log (x); }
approx_exp(F32 x)1152     SI F32 approx_exp (F32 x) { return x->approx_exp (x); }
1153 
approx_powf(F32 base,F32 exp)1154     SI F32 approx_powf(F32   base, F32   exp) { return base->approx_powf(base, exp); }
approx_powf(F32 base,float exp)1155     SI F32 approx_powf(F32   base, float exp) { return base->approx_powf(base, exp); }
approx_powf(float base,F32 exp)1156     SI F32 approx_powf(float base, F32   exp) { return  exp->approx_powf(base, exp); }
1157 
approx_sin(F32 radians)1158     SI F32 approx_sin(F32 radians) { return radians->approx_sin(radians); }
approx_cos(F32 radians)1159     SI F32 approx_cos(F32 radians) { return radians->approx_cos(radians); }
approx_tan(F32 radians)1160     SI F32 approx_tan(F32 radians) { return radians->approx_tan(radians); }
1161 
approx_asin(F32 x)1162     SI F32 approx_asin(F32 x) { return x->approx_asin(x); }
approx_acos(F32 x)1163     SI F32 approx_acos(F32 x) { return x->approx_acos(x); }
approx_atan(F32 x)1164     SI F32 approx_atan(F32 x) { return x->approx_atan(x); }
approx_atan2(F32 y,F32 x)1165     SI F32 approx_atan2(F32 y, F32 x) { return x->approx_atan2(y, x); }
1166 
clamp01(F32 x)1167     SI F32   clamp01(F32 x) { return x->  clamp01(x); }
abs(F32 x)1168     SI F32       abs(F32 x) { return x->      abs(x); }
ceil(F32 x)1169     SI F32      ceil(F32 x) { return x->     ceil(x); }
fract(F32 x)1170     SI F32     fract(F32 x) { return x->    fract(x); }
floor(F32 x)1171     SI F32     floor(F32 x) { return x->    floor(x); }
is_NaN(F32 x)1172     SI I32    is_NaN(F32 x) { return x->   is_NaN(x); }
is_finite(F32 x)1173     SI I32 is_finite(F32 x) { return x->is_finite(x); }
1174 
trunc(F32 x)1175     SI I32      trunc(F32 x) { return x->      trunc(x); }
round(F32 x)1176     SI I32      round(F32 x) { return x->      round(x); }
pun_to_I32(F32 x)1177     SI I32 pun_to_I32(F32 x) { return x-> pun_to_I32(x); }
pun_to_F32(I32 x)1178     SI F32 pun_to_F32(I32 x) { return x-> pun_to_F32(x); }
to_F32(I32 x)1179     SI F32     to_F32(I32 x) { return x->     to_F32(x); }
to_fp16(F32 x)1180     SI I32    to_fp16(F32 x) { return x->    to_fp16(x); }
from_fp16(I32 x)1181     SI F32  from_fp16(I32 x) { return x->  from_fp16(x); }
1182 
lerp(F32 lo,F32 hi,F32 t)1183     SI F32 lerp(F32   lo, F32   hi, F32   t) { return lo->lerp(lo,hi,t); }
lerp(F32 lo,F32 hi,float t)1184     SI F32 lerp(F32   lo, F32   hi, float t) { return lo->lerp(lo,hi,t); }
lerp(F32 lo,float hi,F32 t)1185     SI F32 lerp(F32   lo, float hi, F32   t) { return lo->lerp(lo,hi,t); }
lerp(F32 lo,float hi,float t)1186     SI F32 lerp(F32   lo, float hi, float t) { return lo->lerp(lo,hi,t); }
lerp(float lo,F32 hi,F32 t)1187     SI F32 lerp(float lo, F32   hi, F32   t) { return hi->lerp(lo,hi,t); }
lerp(float lo,F32 hi,float t)1188     SI F32 lerp(float lo, F32   hi, float t) { return hi->lerp(lo,hi,t); }
lerp(float lo,float hi,F32 t)1189     SI F32 lerp(float lo, float hi, F32   t) { return  t->lerp(lo,hi,t); }
1190 
clamp(F32 x,F32 lo,F32 hi)1191     SI F32 clamp(F32   x, F32   lo, F32   hi) { return  x->clamp(x,lo,hi); }
clamp(F32 x,F32 lo,float hi)1192     SI F32 clamp(F32   x, F32   lo, float hi) { return  x->clamp(x,lo,hi); }
clamp(F32 x,float lo,F32 hi)1193     SI F32 clamp(F32   x, float lo, F32   hi) { return  x->clamp(x,lo,hi); }
clamp(F32 x,float lo,float hi)1194     SI F32 clamp(F32   x, float lo, float hi) { return  x->clamp(x,lo,hi); }
clamp(float x,F32 lo,F32 hi)1195     SI F32 clamp(float x, F32   lo, F32   hi) { return lo->clamp(x,lo,hi); }
clamp(float x,F32 lo,float hi)1196     SI F32 clamp(float x, F32   lo, float hi) { return lo->clamp(x,lo,hi); }
clamp(float x,float lo,F32 hi)1197     SI F32 clamp(float x, float lo, F32   hi) { return hi->clamp(x,lo,hi); }
1198 
1199     SI I32 operator<<(I32 x, int bits) { return x->shl(x, bits); }
shl(I32 x,int bits)1200     SI I32        shl(I32 x, int bits) { return x->shl(x, bits); }
shr(I32 x,int bits)1201     SI I32        shr(I32 x, int bits) { return x->shr(x, bits); }
sra(I32 x,int bits)1202     SI I32        sra(I32 x, int bits) { return x->sra(x, bits); }
1203 
1204     SI I32 operator&(I32 x, I32 y) { return x->bit_and(x,y); }
1205     SI I32 operator&(I32 x, int y) { return x->bit_and(x,y); }
1206     SI I32 operator&(int x, I32 y) { return y->bit_and(x,y); }
1207 
1208     SI I32 operator|(I32 x, I32 y) { return x->bit_or (x,y); }
1209     SI I32 operator|(I32 x, int y) { return x->bit_or (x,y); }
1210     SI I32 operator|(int x, I32 y) { return y->bit_or (x,y); }
1211 
1212     SI I32 operator^(I32 x, I32 y) { return x->bit_xor(x,y); }
1213     SI I32 operator^(I32 x, int y) { return x->bit_xor(x,y); }
1214     SI I32 operator^(int x, I32 y) { return y->bit_xor(x,y); }
1215 
1216     SI I32& operator&=(I32& x, I32 y) { return (x = x & y); }
1217     SI I32& operator&=(I32& x, int y) { return (x = x & y); }
1218     SI I32& operator|=(I32& x, I32 y) { return (x = x | y); }
1219     SI I32& operator|=(I32& x, int y) { return (x = x | y); }
1220     SI I32& operator^=(I32& x, I32 y) { return (x = x ^ y); }
1221     SI I32& operator^=(I32& x, int y) { return (x = x ^ y); }
1222 
bit_clear(I32 x,I32 y)1223     SI I32 bit_clear(I32 x, I32 y) { return x->bit_clear(x,y); }
bit_clear(I32 x,int y)1224     SI I32 bit_clear(I32 x, int y) { return x->bit_clear(x,y); }
bit_clear(int x,I32 y)1225     SI I32 bit_clear(int x, I32 y) { return y->bit_clear(x,y); }
1226 
select(I32 c,I32 t,I32 f)1227     SI I32 select(I32 c, I32 t, I32 f) { return c->select(c,          t ,          f ); }
select(I32 c,I32 t,int f)1228     SI I32 select(I32 c, I32 t, int f) { return c->select(c,          t , c->splat(f)); }
select(I32 c,int t,I32 f)1229     SI I32 select(I32 c, int t, I32 f) { return c->select(c, c->splat(t),          f ); }
select(I32 c,int t,int f)1230     SI I32 select(I32 c, int t, int f) { return c->select(c, c->splat(t), c->splat(f)); }
1231 
select(I32 c,F32 t,F32 f)1232     SI F32 select(I32 c, F32   t, F32   f) { return c->select(c,          t ,          f ); }
select(I32 c,F32 t,float f)1233     SI F32 select(I32 c, F32   t, float f) { return c->select(c,          t , c->splat(f)); }
select(I32 c,float t,F32 f)1234     SI F32 select(I32 c, float t, F32   f) { return c->select(c, c->splat(t),          f ); }
select(I32 c,float t,float f)1235     SI F32 select(I32 c, float t, float f) { return c->select(c, c->splat(t), c->splat(f)); }
1236 
extract(I32 x,int bits,I32 z)1237     SI I32 extract(I32 x, int bits, I32 z) { return x->extract(x,bits,z); }
extract(I32 x,int bits,int z)1238     SI I32 extract(I32 x, int bits, int z) { return x->extract(x,bits,z); }
extract(int x,int bits,I32 z)1239     SI I32 extract(int x, int bits, I32 z) { return z->extract(x,bits,z); }
1240 
pack(I32 x,I32 y,int bits)1241     SI I32 pack(I32 x, I32 y, int bits) { return x->pack   (x,y,bits); }
pack(I32 x,int y,int bits)1242     SI I32 pack(I32 x, int y, int bits) { return x->pack   (x,y,bits); }
pack(int x,I32 y,int bits)1243     SI I32 pack(int x, I32 y, int bits) { return y->pack   (x,y,bits); }
1244 
1245     SI I32 operator~(I32 x) { return ~0 ^ x; }
1246     SI I32 operator-(I32 x) { return  0 - x; }
1247     SI F32 operator-(F32 x) { return 0.0f - x; }
1248 
from_unorm(int bits,I32 x)1249     SI F32 from_unorm(int bits, I32 x) { return x->from_unorm(bits,x); }
to_unorm(int bits,F32 x)1250     SI I32   to_unorm(int bits, F32 x) { return x->  to_unorm(bits,x); }
1251 
store(PixelFormat f,Ptr p,Color c)1252     SI void store(PixelFormat f, Ptr p, Color c) { return c->store(f,p,c); }
1253 
gather(PixelFormat f,Ptr p,int off,I32 ix)1254     SI Color gather(PixelFormat f, Ptr p, int off, I32 ix) { return ix->gather(f,p,off,ix); }
gather(PixelFormat f,Uniform u,I32 ix)1255     SI Color gather(PixelFormat f, Uniform u     , I32 ix) { return ix->gather(f,u,ix); }
1256 
premul(F32 * r,F32 * g,F32 * b,F32 a)1257     SI void   premul(F32* r, F32* g, F32* b, F32 a) { a->  premul(r,g,b,a); }
unpremul(F32 * r,F32 * g,F32 * b,F32 a)1258     SI void unpremul(F32* r, F32* g, F32* b, F32 a) { a->unpremul(r,g,b,a); }
1259 
premul(Color c)1260     SI Color   premul(Color c) { return c->  premul(c); }
unpremul(Color c)1261     SI Color unpremul(Color c) { return c->unpremul(c); }
1262 
lerp(Color lo,Color hi,F32 t)1263     SI Color lerp(Color lo, Color hi, F32 t) { return t->lerp(lo,hi,t); }
1264 
blend(SkBlendMode m,Color s,Color d)1265     SI Color blend(SkBlendMode m, Color s, Color d) { return s->blend(m,s,d); }
1266 
clamp01(Color c)1267     SI Color clamp01(Color c) { return c->clamp01(c); }
1268 
to_hsla(Color c)1269     SI HSLA  to_hsla(Color c) { return c->to_hsla(c); }
to_rgba(HSLA c)1270     SI Color to_rgba(HSLA  c) { return c->to_rgba(c); }
1271 
1272     // Evaluate polynomials: ax^n + bx^(n-1) + ... for n >= 1
1273     template <typename F32_or_float, typename... Rest>
poly(F32 x,F32_or_float a,float b,Rest...rest)1274     SI F32 poly(F32 x, F32_or_float a, float b, Rest... rest) {
1275         if constexpr (sizeof...(rest) == 0) {
1276             return x*a+b;
1277         } else {
1278             return poly(x, x*a+b, rest...);
1279         }
1280     }
1281 #undef SI
1282 }  // namespace skvm
1283 
1284 #endif//SkVM_DEFINED
1285