1 /*
2  * Copyright 2019 Google LLC
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "include/core/SkStream.h"
9 #include "include/core/SkString.h"
10 #include "include/private/SkChecksum.h"
11 #include "include/private/SkHalf.h"
12 #include "include/private/SkSpinlock.h"
13 #include "include/private/SkTFitsIn.h"
14 #include "include/private/SkThreadID.h"
15 #include "include/private/SkVx.h"
16 #include "src/core/SkColorSpaceXformSteps.h"
17 #include "src/core/SkCpu.h"
18 #include "src/core/SkEnumerate.h"
19 #include "src/core/SkOpts.h"
20 #include "src/core/SkVM.h"
21 #include <algorithm>
22 #include <atomic>
23 #include <queue>
24 
25 #if defined(SKVM_LLVM)
26     #include <future>
27     #include <llvm/Bitcode/BitcodeWriter.h>
28     #include <llvm/ExecutionEngine/ExecutionEngine.h>
29     #include <llvm/IR/IRBuilder.h>
30     #include <llvm/IR/Verifier.h>
31     #include <llvm/Support/TargetSelect.h>
32 
33     // Platform-specific intrinsics got their own files in LLVM 10.
34     #if __has_include(<llvm/IR/IntrinsicsX86.h>)
35         #include <llvm/IR/IntrinsicsX86.h>
36     #endif
37 #endif
38 
// Global JIT switches.  Both start out false.
bool gSkVMAllowJIT    = false;
bool gSkVMJITViaDylib = false;
41 
42 #if defined(SKVM_JIT)
43     #if defined(SK_BUILD_FOR_WIN)
44         #include "src/core/SkLeanWindows.h"
45         #include <memoryapi.h>
46 
alloc_jit_buffer(size_t * len)47         static void* alloc_jit_buffer(size_t* len) {
48             return VirtualAlloc(NULL, *len, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE);
49         }
unmap_jit_buffer(void * ptr,size_t len)50         static void unmap_jit_buffer(void* ptr, size_t len) {
51             VirtualFree(ptr, 0, MEM_RELEASE);
52         }
remap_as_executable(void * ptr,size_t len)53         static void remap_as_executable(void* ptr, size_t len) {
54             DWORD old;
55             VirtualProtect(ptr, len, PAGE_EXECUTE_READ, &old);
56             SkASSERT(old == PAGE_READWRITE);
57         }
close_dylib(void * dylib)58         static void close_dylib(void* dylib) {
59             SkASSERT(false);  // TODO?  For now just assert we never make one.
60         }
61     #else
62         #include <dlfcn.h>
63         #include <sys/mman.h>
64 
alloc_jit_buffer(size_t * len)65         static void* alloc_jit_buffer(size_t* len) {
66             // While mprotect and VirtualAlloc both work at page granularity,
67             // mprotect doesn't round up for you, and instead requires *len is at page granularity.
68             const size_t page = sysconf(_SC_PAGESIZE);
69             *len = ((*len + page - 1) / page) * page;
70             return mmap(nullptr,*len, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1,0);
71         }
unmap_jit_buffer(void * ptr,size_t len)72         static void unmap_jit_buffer(void* ptr, size_t len) {
73             munmap(ptr, len);
74         }
remap_as_executable(void * ptr,size_t len)75         static void remap_as_executable(void* ptr, size_t len) {
76             mprotect(ptr, len, PROT_READ|PROT_EXEC);
77             __builtin___clear_cache((char*)ptr,
78                                     (char*)ptr + len);
79         }
close_dylib(void * dylib)80         static void close_dylib(void* dylib) {
81             dlclose(dylib);
82         }
83     #endif
84 
85     #if defined(SKVM_JIT_VTUNE)
86         #include <jitprofiling.h>
notify_vtune(const char * name,void * addr,size_t len)87         static void notify_vtune(const char* name, void* addr, size_t len) {
88             if (iJIT_IsProfilingActive() == iJIT_SAMPLING_ON) {
89                 iJIT_Method_Load event;
90                 memset(&event, 0, sizeof(event));
91                 event.method_id           = iJIT_GetNewMethodID();
92                 event.method_name         = const_cast<char*>(name);
93                 event.method_load_address = addr;
94                 event.method_size         = len;
95                 iJIT_NotifyEvent(iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED, &event);
96             }
97         }
98     #else
notify_vtune(const char * name,void * addr,size_t len)99         static void notify_vtune(const char* name, void* addr, size_t len) {}
100     #endif
101 #endif
102 
103 // JIT code isn't MSAN-instrumented, so we won't see when it uses
104 // uninitialized memory, and we'll not see the writes it makes as properly
105 // initializing memory.  Instead force the interpreter, which should let
106 // MSAN see everything our programs do properly.
107 //
108 // Similarly, we can't get ASAN's checks unless we let it instrument our interpreter.
109 #if defined(__has_feature)
110     #if __has_feature(memory_sanitizer) || __has_feature(address_sanitizer)
111         #define SKVM_JIT_BUT_IGNORE_IT
112     #endif
113 #endif
114 
115 
116 
117 namespace skvm {
118 
detect_features()119     static Features detect_features() {
120         static const bool fma =
121         #if defined(SK_CPU_X86)
122             SkCpu::Supports(SkCpu::HSW);
123         #elif defined(SK_CPU_ARM64)
124             true;
125         #else
126             false;
127         #endif
128 
129         static const bool fp16 = false;  // TODO
130 
131         return { fma, fp16 };
132     }
133 
Builder()134     Builder::Builder()                  : fFeatures(detect_features()) {}
Builder(Features features)135     Builder::Builder(Features features) : fFeatures(features         ) {}
136 
137 
138     struct Program::Impl {
139         std::vector<InterpreterInstruction> instructions;
140         int regs = 0;
141         int loop = 0;
142         std::vector<int> strides;
143 
144         std::atomic<void*> jit_entry{nullptr};   // TODO: minimal std::memory_orders
145         size_t jit_size = 0;
146         void*  dylib    = nullptr;
147 
148     #if defined(SKVM_LLVM)
149         std::unique_ptr<llvm::LLVMContext>     llvm_ctx;
150         std::unique_ptr<llvm::ExecutionEngine> llvm_ee;
151         std::future<void>                      llvm_compiling;
152     #endif
153     };
154 
155     // Debugging tools, mostly for printing various data structures out to a stream.
156 
157     namespace {
158         class SkDebugfStream final : public SkWStream {
159             size_t fBytesWritten = 0;
160 
write(const void * buffer,size_t size)161             bool write(const void* buffer, size_t size) override {
162                 SkDebugf("%.*s", size, buffer);
163                 fBytesWritten += size;
164                 return true;
165             }
166 
bytesWritten() const167             size_t bytesWritten() const override {
168                 return fBytesWritten;
169             }
170         };
171 
172         struct V { Val id; };
173         struct R { Reg id; };
174         struct Shift { int bits; };
175         struct Splat { int bits; };
176         struct Hex   { int bits; };
177 
write(SkWStream * o,const char * s)178         static void write(SkWStream* o, const char* s) {
179             o->writeText(s);
180         }
181 
name(Op op)182         static const char* name(Op op) {
183             switch (op) {
184             #define M(x) case Op::x: return #x;
185                 SKVM_OPS(M)
186             #undef M
187             }
188             return "unknown op";
189         }
190 
write(SkWStream * o,Op op)191         static void write(SkWStream* o, Op op) {
192             o->writeText(name(op));
193         }
write(SkWStream * o,Ptr p)194         static void write(SkWStream* o, Ptr p) {
195             write(o, "ptr");
196             o->writeDecAsText(p.ix);
197         }
write(SkWStream * o,V v)198         static void write(SkWStream* o, V v) {
199             write(o, "v");
200             o->writeDecAsText(v.id);
201         }
write(SkWStream * o,R r)202         static void write(SkWStream* o, R r) {
203             write(o, "r");
204             o->writeDecAsText(r.id);
205         }
write(SkWStream * o,Shift s)206         static void write(SkWStream* o, Shift s) {
207             o->writeDecAsText(s.bits);
208         }
write(SkWStream * o,Splat s)209         static void write(SkWStream* o, Splat s) {
210             float f;
211             memcpy(&f, &s.bits, 4);
212             o->writeHexAsText(s.bits);
213             write(o, " (");
214             o->writeScalarAsText(f);
215             write(o, ")");
216         }
write(SkWStream * o,Hex h)217         static void write(SkWStream* o, Hex h) {
218             o->writeHexAsText(h.bits);
219         }
220 
221         template <typename T, typename... Ts>
write(SkWStream * o,T first,Ts...rest)222         static void write(SkWStream* o, T first, Ts... rest) {
223             write(o, first);
224             write(o, " ");
225             write(o, rest...);
226         }
227     }  // namespace
228 
write_one_instruction(Val id,const OptimizedInstruction & inst,SkWStream * o)229     static void write_one_instruction(Val id, const OptimizedInstruction& inst, SkWStream* o) {
230         Op  op = inst.op;
231         Val  x = inst.x,
232              y = inst.y,
233              z = inst.z,
234              w = inst.w;
235         int immA = inst.immA,
236             immB = inst.immB;
237         switch (op) {
238             case Op::assert_true: write(o, op, V{x}, V{y}); break;
239 
240             case Op::store8:   write(o, op, Ptr{immA}, V{x}               ); break;
241             case Op::store16:  write(o, op, Ptr{immA}, V{x}               ); break;
242             case Op::store32:  write(o, op, Ptr{immA}, V{x}               ); break;
243             case Op::store64:  write(o, op, Ptr{immA}, V{x},V{y}          ); break;
244             case Op::store128: write(o, op, Ptr{immA}, V{x},V{y},V{z},V{w}); break;
245 
246             case Op::index: write(o, V{id}, "=", op); break;
247 
248             case Op::load8:   write(o, V{id}, "=", op, Ptr{immA}); break;
249             case Op::load16:  write(o, V{id}, "=", op, Ptr{immA}); break;
250             case Op::load32:  write(o, V{id}, "=", op, Ptr{immA}); break;
251             case Op::load64:  write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}); break;
252             case Op::load128: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}); break;
253 
254             case Op::gather8:  write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}, V{x}); break;
255             case Op::gather16: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}, V{x}); break;
256             case Op::gather32: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}, V{x}); break;
257 
258             case Op::uniform32: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}); break;
259 
260             case Op::splat: write(o, V{id}, "=", op, Splat{immA}); break;
261 
262             case Op:: add_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
263             case Op:: sub_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
264             case Op:: mul_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
265             case Op:: div_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
266             case Op:: min_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
267             case Op:: max_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
268             case Op:: fma_f32: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break;
269             case Op:: fms_f32: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break;
270             case Op::fnma_f32: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break;
271 
272 
273             case Op::sqrt_f32: write(o, V{id}, "=", op, V{x}); break;
274 
275             case Op:: eq_f32: write(o, V{id}, "=", op, V{x}, V{y}); break;
276             case Op::neq_f32: write(o, V{id}, "=", op, V{x}, V{y}); break;
277             case Op:: gt_f32: write(o, V{id}, "=", op, V{x}, V{y}); break;
278             case Op::gte_f32: write(o, V{id}, "=", op, V{x}, V{y}); break;
279 
280 
281             case Op::add_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
282             case Op::sub_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
283             case Op::mul_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
284 
285             case Op::shl_i32: write(o, V{id}, "=", op, V{x}, Shift{immA}); break;
286             case Op::shr_i32: write(o, V{id}, "=", op, V{x}, Shift{immA}); break;
287             case Op::sra_i32: write(o, V{id}, "=", op, V{x}, Shift{immA}); break;
288 
289             case Op::eq_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
290             case Op::gt_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
291 
292 
293             case Op::bit_and  : write(o, V{id}, "=", op, V{x}, V{y}); break;
294             case Op::bit_or   : write(o, V{id}, "=", op, V{x}, V{y}); break;
295             case Op::bit_xor  : write(o, V{id}, "=", op, V{x}, V{y}); break;
296             case Op::bit_clear: write(o, V{id}, "=", op, V{x}, V{y}); break;
297 
298             case Op::select: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break;
299 
300             case Op::ceil:      write(o, V{id}, "=", op, V{x}); break;
301             case Op::floor:     write(o, V{id}, "=", op, V{x}); break;
302             case Op::to_f32:    write(o, V{id}, "=", op, V{x}); break;
303             case Op::to_fp16:   write(o, V{id}, "=", op, V{x}); break;
304             case Op::from_fp16: write(o, V{id}, "=", op, V{x}); break;
305             case Op::trunc:     write(o, V{id}, "=", op, V{x}); break;
306             case Op::round:     write(o, V{id}, "=", op, V{x}); break;
307         }
308 
309         write(o, "\n");
310     }
311 
dump(SkWStream * o) const312     void Builder::dump(SkWStream* o) const {
313         SkDebugfStream debug;
314         if (!o) { o = &debug; }
315 
316         std::vector<OptimizedInstruction> optimized = this->optimize();
317         o->writeDecAsText(optimized.size());
318         o->writeText(" values (originally ");
319         o->writeDecAsText(fProgram.size());
320         o->writeText("):\n");
321         for (Val id = 0; id < (Val)optimized.size(); id++) {
322             const OptimizedInstruction& inst = optimized[id];
323             write(o, inst.can_hoist ? "↑ " : "  ");
324             write_one_instruction(id, inst, o);
325         }
326     }
327 
dump(SkWStream * o) const328     void Program::dump(SkWStream* o) const {
329         SkDebugfStream debug;
330         if (!o) { o = &debug; }
331 
332         o->writeDecAsText(fImpl->regs);
333         o->writeText(" registers, ");
334         o->writeDecAsText(fImpl->instructions.size());
335         o->writeText(" instructions:\n");
336         for (Val i = 0; i < (Val)fImpl->instructions.size(); i++) {
337             if (i == fImpl->loop) { write(o, "loop:\n"); }
338             o->writeDecAsText(i);
339             o->writeText("\t");
340             if (i >= fImpl->loop) { write(o, "    "); }
341             const InterpreterInstruction& inst = fImpl->instructions[i];
342             Op   op = inst.op;
343             Reg   d = inst.d,
344                   x = inst.x,
345                   y = inst.y,
346                   z = inst.z,
347                   w = inst.w;
348             int immA = inst.immA,
349                 immB = inst.immB;
350             switch (op) {
351                 case Op::assert_true: write(o, op, R{x}, R{y}); break;
352 
353                 case Op::store8:   write(o, op, Ptr{immA}, R{x}                  ); break;
354                 case Op::store16:  write(o, op, Ptr{immA}, R{x}                  ); break;
355                 case Op::store32:  write(o, op, Ptr{immA}, R{x}                  ); break;
356                 case Op::store64:  write(o, op, Ptr{immA}, R{x}, R{y}            ); break;
357                 case Op::store128: write(o, op, Ptr{immA}, R{x}, R{y}, R{z}, R{w}); break;
358 
359                 case Op::index: write(o, R{d}, "=", op); break;
360 
361                 case Op::load8:   write(o, R{d}, "=", op, Ptr{immA}); break;
362                 case Op::load16:  write(o, R{d}, "=", op, Ptr{immA}); break;
363                 case Op::load32:  write(o, R{d}, "=", op, Ptr{immA}); break;
364                 case Op::load64:  write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}); break;
365                 case Op::load128: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}); break;
366 
367                 case Op::gather8:  write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}, R{x}); break;
368                 case Op::gather16: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}, R{x}); break;
369                 case Op::gather32: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}, R{x}); break;
370 
371                 case Op::uniform32: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}); break;
372 
373                 case Op::splat:     write(o, R{d}, "=", op, Splat{immA}); break;
374 
375                 case Op::add_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
376                 case Op::sub_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
377                 case Op::mul_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
378                 case Op::div_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
379                 case Op::min_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
380                 case Op::max_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
381                 case Op::fma_f32: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;
382                 case Op::fms_f32: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;
383                 case Op::fnma_f32: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;
384 
385                 case Op::sqrt_f32: write(o, R{d}, "=", op, R{x}); break;
386 
387                 case Op:: eq_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
388                 case Op::neq_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
389                 case Op:: gt_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
390                 case Op::gte_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
391 
392 
393                 case Op::add_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
394                 case Op::sub_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
395                 case Op::mul_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
396 
397                 case Op::shl_i32: write(o, R{d}, "=", op, R{x}, Shift{immA}); break;
398                 case Op::shr_i32: write(o, R{d}, "=", op, R{x}, Shift{immA}); break;
399                 case Op::sra_i32: write(o, R{d}, "=", op, R{x}, Shift{immA}); break;
400 
401                 case Op::eq_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
402                 case Op::gt_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
403 
404                 case Op::bit_and  : write(o, R{d}, "=", op, R{x}, R{y}); break;
405                 case Op::bit_or   : write(o, R{d}, "=", op, R{x}, R{y}); break;
406                 case Op::bit_xor  : write(o, R{d}, "=", op, R{x}, R{y}); break;
407                 case Op::bit_clear: write(o, R{d}, "=", op, R{x}, R{y}); break;
408 
409                 case Op::select: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;
410 
411                 case Op::ceil:      write(o, R{d}, "=", op, R{x}); break;
412                 case Op::floor:     write(o, R{d}, "=", op, R{x}); break;
413                 case Op::to_f32:    write(o, R{d}, "=", op, R{x}); break;
414                 case Op::to_fp16:   write(o, R{d}, "=", op, R{x}); break;
415                 case Op::from_fp16: write(o, R{d}, "=", op, R{x}); break;
416                 case Op::trunc:     write(o, R{d}, "=", op, R{x}); break;
417                 case Op::round:     write(o, R{d}, "=", op, R{x}); break;
418             }
419             write(o, "\n");
420         }
421     }
422 
eliminate_dead_code(std::vector<Instruction> program)423     std::vector<Instruction> eliminate_dead_code(std::vector<Instruction> program) {
424         // Determine which Instructions are live by working back from side effects.
425         std::vector<bool> live(program.size(), false);
426         auto mark_live = [&](Val id, auto& recurse) -> void {
427             if (live[id] == false) {
428                 live[id] =  true;
429                 Instruction inst = program[id];
430                 for (Val arg : {inst.x, inst.y, inst.z, inst.w}) {
431                     if (arg != NA) { recurse(arg, recurse); }
432                 }
433             }
434         };
435         for (Val id = 0; id < (Val)program.size(); id++) {
436             if (has_side_effect(program[id].op)) {
437                 mark_live(id, mark_live);
438             }
439         }
440 
441         // Rewrite the program with only live Instructions:
442         //   - remap IDs in live Instructions to what they'll be once dead Instructions are removed;
443         //   - then actually remove the dead Instructions.
444         std::vector<Val> new_id(program.size(), NA);
445         for (Val id = 0, next = 0; id < (Val)program.size(); id++) {
446             if (live[id]) {
447                 Instruction& inst = program[id];
448                 for (Val* arg : {&inst.x, &inst.y, &inst.z, &inst.w}) {
449                     if (*arg != NA) {
450                         *arg = new_id[*arg];
451                         SkASSERT(*arg != NA);
452                     }
453                 }
454                 new_id[id] = next++;
455             }
456         }
457         auto it = std::remove_if(program.begin(), program.end(), [&](const Instruction& inst) {
458             Val id = (Val)(&inst - program.data());
459             return !live[id];
460         });
461         program.erase(it, program.end());
462 
463         return program;
464     }
465 
finalize(const std::vector<Instruction> program)466     std::vector<OptimizedInstruction> finalize(const std::vector<Instruction> program) {
467         std::vector<OptimizedInstruction> optimized(program.size());
468         for (Val id = 0; id < (Val)program.size(); id++) {
469             Instruction inst = program[id];
470             optimized[id] = {inst.op, inst.x,inst.y,inst.z,inst.w, inst.immA,inst.immB,
471                              /*death=*/id, /*can_hoist=*/true};
472         }
473 
474         // Each Instruction's inputs need to live at least until that Instruction issues.
475         for (Val id = 0; id < (Val)optimized.size(); id++) {
476             OptimizedInstruction& inst = optimized[id];
477             for (Val arg : {inst.x, inst.y, inst.z, inst.w}) {
478                 // (We're walking in order, so this is the same as max()ing with the existing Val.)
479                 if (arg != NA) { optimized[arg].death = id; }
480             }
481         }
482 
483         // Mark which values don't depend on the loop and can be hoisted.
484         for (OptimizedInstruction& inst : optimized) {
485             // Varying loads (and gathers) and stores cannot be hoisted out of the loop.
486             if (is_always_varying(inst.op)) {
487                 inst.can_hoist = false;
488             }
489 
490             // If any of an instruction's inputs can't be hoisted, it can't be hoisted itself.
491             if (inst.can_hoist) {
492                 for (Val arg : {inst.x, inst.y, inst.z, inst.w}) {
493                     if (arg != NA) { inst.can_hoist &= optimized[arg].can_hoist; }
494                 }
495             }
496         }
497 
498         // Extend the lifetime of any hoisted value that's used in the loop to infinity.
499         for (OptimizedInstruction& inst : optimized) {
500             if (!inst.can_hoist /*i.e. we're in the loop, so the arguments are used-in-loop*/) {
501                 for (Val arg : {inst.x, inst.y, inst.z, inst.w}) {
502                     if (arg != NA && optimized[arg].can_hoist) {
503                         optimized[arg].death = (Val)program.size();
504                     }
505                 }
506             }
507         }
508 
509         return optimized;
510     }
511 
optimize() const512     std::vector<OptimizedInstruction> Builder::optimize() const {
513         std::vector<Instruction> program = this->program();
514         program = eliminate_dead_code(std::move(program));
515         return    finalize           (std::move(program));
516     }
517 
done(const char * debug_name,bool allow_jit) const518     Program Builder::done(const char* debug_name, bool allow_jit) const {
519         char buf[64] = "skvm-jit-";
520         if (!debug_name) {
521             *SkStrAppendU32(buf+9, this->hash()) = '\0';
522             debug_name = buf;
523         }
524 
525         return {this->optimize(), fStrides, debug_name, allow_jit};
526     }
527 
hash() const528     uint64_t Builder::hash() const {
529         uint32_t lo = SkOpts::hash(fProgram.data(), fProgram.size() * sizeof(Instruction), 0),
530                  hi = SkOpts::hash(fProgram.data(), fProgram.size() * sizeof(Instruction), 1);
531         return (uint64_t)lo | (uint64_t)hi << 32;
532     }
533 
operator ==(const Instruction & a,const Instruction & b)534     bool operator==(const Instruction& a, const Instruction& b) {
535         return a.op   == b.op
536             && a.x    == b.x
537             && a.y    == b.y
538             && a.z    == b.z
539             && a.w    == b.w
540             && a.immA == b.immA
541             && a.immB == b.immB;
542     }
543 
operator ()(const Instruction & inst,uint32_t seed) const544     uint32_t InstructionHash::operator()(const Instruction& inst, uint32_t seed) const {
545         return SkOpts::hash(&inst, sizeof(inst), seed);
546     }
547 
548 
549     // Most instructions produce a value and return it by ID,
550     // the value-producing instruction's own index in the program vector.
push(Instruction inst)551     Val Builder::push(Instruction inst) {
552         // Basic common subexpression elimination:
553         // if we've already seen this exact Instruction, use it instead of creating a new one.
554         //
555         // But we never dedup loads or stores: an intervening store could change that memory.
556         // Uniforms and gathers touch only uniform memory, so they're fine to dedup,
557         // and index is varying but doesn't touch memory, so it's fine to dedup too.
558         if (!touches_varying_memory(inst.op)) {
559             if (Val* id = fIndex.find(inst)) {
560                 return *id;
561             }
562         }
563         Val id = static_cast<Val>(fProgram.size());
564         fProgram.push_back(inst);
565         fIndex.set(inst, id);
566         return id;
567     }
568 
arg(int stride)569     Ptr Builder::arg(int stride) {
570         int ix = (int)fStrides.size();
571         fStrides.push_back(stride);
572         return {ix};
573     }
574 
assert_true(I32 cond,I32 debug)575     void Builder::assert_true(I32 cond, I32 debug) {
576     #ifdef SK_DEBUG
577         int imm;
578         if (this->allImm(cond.id,&imm)) { SkASSERT(imm); return; }
579         (void)push(Op::assert_true, cond.id, debug.id);
580     #endif
581     }
582 
store8(Ptr ptr,I32 val)583     void Builder::store8 (Ptr ptr, I32 val) { (void)push(Op::store8 , val.id,NA,NA,NA, ptr.ix); }
store16(Ptr ptr,I32 val)584     void Builder::store16(Ptr ptr, I32 val) { (void)push(Op::store16, val.id,NA,NA,NA, ptr.ix); }
store32(Ptr ptr,I32 val)585     void Builder::store32(Ptr ptr, I32 val) { (void)push(Op::store32, val.id,NA,NA,NA, ptr.ix); }
store64(Ptr ptr,I32 lo,I32 hi)586     void Builder::store64(Ptr ptr, I32 lo, I32 hi) {
587         (void)push(Op::store64, lo.id,hi.id,NA,NA, ptr.ix);
588     }
store128(Ptr ptr,I32 x,I32 y,I32 z,I32 w)589     void Builder::store128(Ptr ptr, I32 x, I32 y, I32 z, I32 w) {
590         (void)push(Op::store128, x.id,y.id,z.id,w.id, ptr.ix);
591     }
592 
index()593     I32 Builder::index() { return {this, push(Op::index)}; }
594 
load8(Ptr ptr)595     I32 Builder::load8 (Ptr ptr) { return {this, push(Op::load8 , NA,NA,NA,NA, ptr.ix) }; }
load16(Ptr ptr)596     I32 Builder::load16(Ptr ptr) { return {this, push(Op::load16, NA,NA,NA,NA, ptr.ix) }; }
load32(Ptr ptr)597     I32 Builder::load32(Ptr ptr) { return {this, push(Op::load32, NA,NA,NA,NA, ptr.ix) }; }
load64(Ptr ptr,int lane)598     I32 Builder::load64(Ptr ptr, int lane) {
599         return {this, push(Op::load64 , NA,NA,NA,NA, ptr.ix,lane) };
600     }
load128(Ptr ptr,int lane)601     I32 Builder::load128(Ptr ptr, int lane) {
602         return {this, push(Op::load128, NA,NA,NA,NA, ptr.ix,lane) };
603     }
604 
gather8(Ptr ptr,int offset,I32 index)605     I32 Builder::gather8 (Ptr ptr, int offset, I32 index) {
606         return {this, push(Op::gather8 , index.id,NA,NA,NA, ptr.ix,offset)};
607     }
gather16(Ptr ptr,int offset,I32 index)608     I32 Builder::gather16(Ptr ptr, int offset, I32 index) {
609         return {this, push(Op::gather16, index.id,NA,NA,NA, ptr.ix,offset)};
610     }
gather32(Ptr ptr,int offset,I32 index)611     I32 Builder::gather32(Ptr ptr, int offset, I32 index) {
612         return {this, push(Op::gather32, index.id,NA,NA,NA, ptr.ix,offset)};
613     }
614 
uniform32(Ptr ptr,int offset)615     I32 Builder::uniform32(Ptr ptr, int offset) {
616         return {this, push(Op::uniform32, NA,NA,NA,NA, ptr.ix, offset)};
617     }
618 
splat(int n)619     I32 Builder::splat(int n) { return {this, push(Op::splat, NA,NA,NA,NA, n) }; }
620 
621     // Be careful peepholing float math!  Transformations you might expect to
622     // be legal can fail in the face of NaN/Inf, e.g. 0*x is not always 0.
623     // Float peepholes must pass this equivalence test for all ~4B floats:
624     //
625     //     bool equiv(float x, float y) { return (x == y) || (isnanf(x) && isnanf(y)); }
626     //
627     //     unsigned bits = 0;
628     //     do {
629     //        float f;
630     //        memcpy(&f, &bits, 4);
631     //        if (!equiv(f, ...)) {
632     //           abort();
633     //        }
634     //     } while (++bits != 0);
635 
add(F32 x,F32 y)636     F32 Builder::add(F32 x, F32 y) {
637         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X+Y); }
638         if (this->isImm(y.id, 0.0f)) { return x; }   // x+0 == x
639         if (this->isImm(x.id, 0.0f)) { return y; }   // 0+y == y
640 
641         if (fFeatures.fma) {
642             if (fProgram[x.id].op == Op::mul_f32) {
643                 return {this, this->push(Op::fma_f32, fProgram[x.id].x, fProgram[x.id].y, y.id)};
644             }
645             if (fProgram[y.id].op == Op::mul_f32) {
646                 return {this, this->push(Op::fma_f32, fProgram[y.id].x, fProgram[y.id].y, x.id)};
647             }
648         }
649         return {this, this->push(Op::add_f32, x.id, y.id)};
650     }
651 
sub(F32 x,F32 y)652     F32 Builder::sub(F32 x, F32 y) {
653         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X-Y); }
654         if (this->isImm(y.id, 0.0f)) { return x; }   // x-0 == x
655         if (fFeatures.fma) {
656             if (fProgram[x.id].op == Op::mul_f32) {
657                 return {this, this->push(Op::fms_f32, fProgram[x.id].x, fProgram[x.id].y, y.id)};
658             }
659             if (fProgram[y.id].op == Op::mul_f32) {
660                 return {this, this->push(Op::fnma_f32, fProgram[y.id].x, fProgram[y.id].y, x.id)};
661             }
662         }
663         return {this, this->push(Op::sub_f32, x.id, y.id)};
664     }
665 
mul(F32 x,F32 y)666     F32 Builder::mul(F32 x, F32 y) {
667         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X*Y); }
668         if (this->isImm(y.id, 1.0f)) { return x; }  // x*1 == x
669         if (this->isImm(x.id, 1.0f)) { return y; }  // 1*y == y
670         return {this, this->push(Op::mul_f32, x.id, y.id)};
671     }
672 
fast_mul(F32 x,F32 y)673     F32 Builder::fast_mul(F32 x, F32 y) {
674         if (this->isImm(x.id, 0.0f) || this->isImm(y.id, 0.0f)) { return splat(0.0f); }
675         return mul(x,y);
676     }
677 
div(F32 x,F32 y)678     F32 Builder::div(F32 x, F32 y) {
679         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(sk_ieee_float_divide(X,Y)); }
680         if (this->isImm(y.id, 1.0f)) { return x; }  // x/1 == x
681         return {this, this->push(Op::div_f32, x.id, y.id)};
682     }
683 
sqrt(F32 x)684     F32 Builder::sqrt(F32 x) {
685         if (float X; this->allImm(x.id,&X)) { return splat(std::sqrt(X)); }
686         return {this, this->push(Op::sqrt_f32, x.id)};
687     }
688 
    // See http://www.machinedlearnings.com/2011/06/fast-approximate-logarithm-exponential.html.
    //
    // Approximates log2(x) from the bit pattern of x: the bits read as an integer and
    // scaled by 2^-23 give a coarse estimate, which is then refined with a rational
    // function of the mantissa.
    F32 Builder::approx_log2(F32 x) {
        // e - 127 is a fair approximation of log2(x) in its own right...
        F32 e = mul(to_F32(pun_to_I32(x)), splat(1.0f / (1<<23)));

        // ... but using the mantissa to refine its error is _much_ better.
        // Keep the low 23 bits of x and OR in 0x3f000000 (the bit pattern of 0.5f),
        // then reinterpret those bits as a float.
        F32 m = pun_to_F32(bit_or(bit_and(pun_to_I32(x), 0x007fffff),
                                0x3f000000));
        F32 approx = sub(e,        124.225514990f);
            approx = sub(approx, mul(1.498030302f, m));
            approx = sub(approx, div(1.725879990f, add(0.3520887068f, m)));

        return approx;
    }
703 
    // Approximates 2^x by building the result's bit pattern arithmetically: a rational
    // function of fract(x) is added to x, the sum is scaled by 2^23 and rounded to an
    // integer, and that integer's bits are reinterpreted as a float.
    F32 Builder::approx_pow2(F32 x) {
        F32 f = fract(x);
        F32 approx = add(x,         121.274057500f);
            approx = sub(approx, mul( 1.490129070f, f));
            approx = add(approx, div(27.728023300f, sub(4.84252568f, f)));

        return pun_to_F32(round(mul(1.0f * (1<<23), approx)));
    }
712 
    // Approximates x^y as approx_pow2(approx_log2(x) * y), after clamping x to be
    // non-negative.  When x is exactly 0 or 1 the result is x itself rather than the
    // approximation.
    F32 Builder::approx_powf(F32 x, F32 y) {
        // TODO: assert this instead?  Sometimes x is very slightly negative.  See skia:10210.
        x = max(0.0f, x);

        // Mask of lanes where x is 0 or 1; those lanes pass x through unchanged.
        auto is_x = bit_or(eq(x, 0.0f),
                           eq(x, 1.0f));
        return select(is_x, x, approx_pow2(mul(approx_log2(x), y)));
    }
721 
    // Bhaskara I's sine approximation
    //     16x(pi - x) / (5*pi^2 - 4x(pi - x))
    // ... divide numerator and denominator by 4
    //     4x(pi - x) / (5*pi^2/4 - x(pi - x))
    //
    // This is a good approximation only for 0 <= x <= pi, so we use symmetries to get
    // radians into that range first.
    //
    F32 Builder::approx_sin(F32 radians) {
        constexpr float Pi = SK_ScalarPI;
        // x = radians mod 2pi
        F32 x = fract(radians * (0.5f/Pi)) * (2*Pi);
        I32 neg = x > Pi;   // are we pi < x < 2pi --> need to negate result
        x = select(neg, x - Pi, x);

        // pair is the x(pi - x) term shared by numerator and denominator.
        F32 pair = x * (Pi - x);
        x = 4.0f * pair / ((5*Pi*Pi/4) - pair);
        x = select(neg, -x, x);
        return x;
    }
742 
    /*  "GENERATING ACCURATE VALUES FOR THE TANGENT FUNCTION"
         https://mae.ufl.edu/~uhk/ACCURATE-TANGENT.pdf

        approx = x + (1/3)x^3 + (2/15)x^5 + (17/315)x^7 + (62/2835)x^9

        Some simplifications:
        1. tan(x) is periodic, -PI/2 < x < PI/2
        2. tan(x) is odd, so tan(-x) = -tan(x)
        3. Our polynomial approximation is best near zero, so we use the following identity
                        tan(x) + tan(y)
           tan(x + y) = -----------------
                       1 - tan(x)*tan(y)
           tan(PI/4) = 1

           So for x > PI/8, we do the following refactor:
           x' = x - PI/4

                    1 + tan(x')
           tan(x) = ------------
                    1 - tan(x')
     */
    F32 Builder::approx_tan(F32 x) {
        constexpr float Pi = SK_ScalarPI;
        // periodic between -pi/2 ... pi/2
        // shift to 0...Pi, scale 1/Pi to get into 0...1, then fract, scale-up, shift-back
        x = fract((1/Pi)*x + 0.5f) * Pi - (Pi/2);

        // tan is odd: work on |x| and restore the sign at the end.
        I32 neg = (x < 0.0f);
        x = select(neg, -x, x);

        // minimize total error by shifting if x > pi/8
        I32 use_quotient = (x > (Pi/8));
        x = select(use_quotient, x - (Pi/4), x);

        // 9th order poly = 4th order(x^2) * x
        x = poly(x*x, 62/2835.0f, 17/315.0f, 2/15.0f, 1/3.0f, 1.0f) * x;
        // Undo the pi/4 shift with tan(x) = (1 + tan(x')) / (1 - tan(x')).
        x = select(use_quotient, (1+x)/(1-x), x);
        x = select(neg, -x, x);
        return x;
    }
783 
     // http://mathforum.org/library/drmath/view/54137.html
     // referencing Handbook of Mathematical Functions,
     //             by Milton Abramowitz and Irene Stegun
     //
     // Approximates asin(x) as pi/2 - sqrt(1-|x|) * poly(|x|), then restores the sign
     // of the original x.
     F32 Builder::approx_asin(F32 x) {
         I32 neg = (x < 0.0f);
         x = select(neg, -x, x);   // work on |x| ...
         x = SK_ScalarPI/2 - sqrt(1-x) * poly(x, -0.0187293f, 0.0742610f, -0.2121144f, 1.5707288f);
         x = select(neg, -x, x);   // ... and negate the result where x was negative
         return x;
     }
794 
    /*  Use 4th order polynomial approximation from https://arachnoid.com/polysolve/
     *      with 129 values of x,atan(x) for x:[0...1]
     *  This only works for 0 <= x <= 1
     *
     *  The range requirement is checked with a program-level assert_true; NaN inputs
     *  are deliberately allowed to pass that check.
     */
    static F32 approx_atan_unit(F32 x) {
        // for now we might be given NaN... let that through
        // (x != x) is true only for NaN, so the assert accepts NaN or any x in [0,1].
        x->assert_true((x != x) | ((x >= 0) & (x <= 1)));
        return poly(x, 0.14130025741326729f,
                      -0.34312835980675116f,
                      -0.016172900528248768f,
                       1.0037696976200385f,
                      -0.00014758242182738969f);
    }
808 
    /*  Use identity atan(x) = pi/2 - atan(1/x) for x > 1
     *
     *  The input is reduced to [0,1] in two steps (take |x|, then reciprocate if > 1),
     *  approx_atan_unit() is applied, and the two reductions are undone in reverse order.
     */
    F32 Builder::approx_atan(F32 x) {
        I32 neg = (x < 0.0f);
        x = select(neg, -x, x);            // atan is odd: work on |x|
        I32 flip = (x > 1.0f);
        x = select(flip, 1/x, x);          // bring x into [0,1]
        x = approx_atan_unit(x);
        x = select(flip, SK_ScalarPI/2 - x, x);
        x = select(neg, -x, x);
        return x;
    }
821 
    /*  Use identity atan(x) = pi/2 - atan(1/x) for x > 1
     *  By swapping y,x to ensure the ratio is <= 1, we can safely call atan_unit()
     *  which avoids a 2nd divide instruction if we had instead called atan().
     */
    F32 Builder::approx_atan2(F32 y0, F32 x0) {

        // flip is set when |y0| > |x0|; in that case the roles of y and x are swapped
        // so that |y/x| <= 1.
        I32 flip = (abs(y0) > abs(x0));
        F32 y = select(flip, x0, y0);
        F32 x = select(flip, y0, x0);
        F32 arg = y/x;

        // approx_atan_unit() needs a non-negative argument: work on |arg|.
        I32 neg = (arg < 0.0f);
        arg = select(neg, -arg, arg);

        F32 r = approx_atan_unit(arg);
        r = select(flip, SK_ScalarPI/2 - r, r);   // undo the swap
        r = select(neg, -r, r);                   // undo the abs

        // handle quadrant distinctions
        r = select((y0 >= 0) & (x0  < 0), r + SK_ScalarPI, r);
        r = select((y0  < 0) & (x0 <= 0), r - SK_ScalarPI, r);
        // Note: we don't try to handle 0,0 or infinities (yet)
        return r;
    }
846 
min(F32 x,F32 y)847     F32 Builder::min(F32 x, F32 y) {
848         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(std::min(X,Y)); }
849         return {this, this->push(Op::min_f32, x.id, y.id)};
850     }
max(F32 x,F32 y)851     F32 Builder::max(F32 x, F32 y) {
852         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(std::max(X,Y)); }
853         return {this, this->push(Op::max_f32, x.id, y.id)};
854     }
855 
add(I32 x,I32 y)856     I32 Builder::add(I32 x, I32 y) {
857         if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X+Y); }
858         if (this->isImm(x.id, 0)) { return y; }
859         if (this->isImm(y.id, 0)) { return x; }
860         return {this, this->push(Op::add_i32, x.id, y.id)};
861     }
sub(I32 x,I32 y)862     I32 Builder::sub(I32 x, I32 y) {
863         if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X-Y); }
864         if (this->isImm(y.id, 0)) { return x; }
865         return {this, this->push(Op::sub_i32, x.id, y.id)};
866     }
mul(I32 x,I32 y)867     I32 Builder::mul(I32 x, I32 y) {
868         if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X*Y); }
869         if (this->isImm(x.id, 0)) { return splat(0); }
870         if (this->isImm(y.id, 0)) { return splat(0); }
871         if (this->isImm(x.id, 1)) { return y; }
872         if (this->isImm(y.id, 1)) { return x; }
873         return {this, this->push(Op::mul_i32, x.id, y.id)};
874     }
875 
shl(I32 x,int bits)876     I32 Builder::shl(I32 x, int bits) {
877         if (bits == 0) { return x; }
878         if (int X; this->allImm(x.id,&X)) { return splat(X << bits); }
879         return {this, this->push(Op::shl_i32, x.id,NA,NA,NA, bits)};
880     }
shr(I32 x,int bits)881     I32 Builder::shr(I32 x, int bits) {
882         if (bits == 0) { return x; }
883         if (int X; this->allImm(x.id,&X)) { return splat(unsigned(X) >> bits); }
884         return {this, this->push(Op::shr_i32, x.id,NA,NA,NA, bits)};
885     }
sra(I32 x,int bits)886     I32 Builder::sra(I32 x, int bits) {
887         if (bits == 0) { return x; }
888         if (int X; this->allImm(x.id,&X)) { return splat(X >> bits); }
889         return {this, this->push(Op::sra_i32, x.id,NA,NA,NA, bits)};
890     }
891 
eq(F32 x,F32 y)892     I32 Builder:: eq(F32 x, F32 y) {
893         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X==Y ? ~0 : 0); }
894         return {this, this->push(Op::eq_f32, x.id, y.id)};
895     }
neq(F32 x,F32 y)896     I32 Builder::neq(F32 x, F32 y) {
897         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X!=Y ? ~0 : 0); }
898         return {this, this->push(Op::neq_f32, x.id, y.id)};
899     }
lt(F32 x,F32 y)900     I32 Builder::lt(F32 x, F32 y) {
901         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(Y> X ? ~0 : 0); }
902         return {this, this->push(Op::gt_f32, y.id, x.id)};
903     }
lte(F32 x,F32 y)904     I32 Builder::lte(F32 x, F32 y) {
905         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(Y>=X ? ~0 : 0); }
906         return {this, this->push(Op::gte_f32, y.id, x.id)};
907     }
gt(F32 x,F32 y)908     I32 Builder::gt(F32 x, F32 y) {
909         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X> Y ? ~0 : 0); }
910         return {this, this->push(Op::gt_f32, x.id, y.id)};
911     }
gte(F32 x,F32 y)912     I32 Builder::gte(F32 x, F32 y) {
913         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X>=Y ? ~0 : 0); }
914         return {this, this->push(Op::gte_f32, x.id, y.id)};
915     }
916 
eq(I32 x,I32 y)917     I32 Builder:: eq(I32 x, I32 y) {
918         if (x.id == y.id) { return splat(~0); }
919         if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X==Y ? ~0 : 0); }
920         return {this, this->push(Op:: eq_i32, x.id, y.id)};
921     }
neq(I32 x,I32 y)922     I32 Builder::neq(I32 x, I32 y) {
923         if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X!=Y ? ~0 : 0); }
924         return ~(x == y);
925     }
gt(I32 x,I32 y)926     I32 Builder:: gt(I32 x, I32 y) {
927         if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X> Y ? ~0 : 0); }
928         return {this, this->push(Op:: gt_i32, x.id, y.id)};
929     }
gte(I32 x,I32 y)930     I32 Builder::gte(I32 x, I32 y) {
931         if (x.id == y.id) { return splat(~0); }
932         if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X>=Y ? ~0 : 0); }
933         return ~(x < y);
934     }
lt(I32 x,I32 y)935     I32 Builder:: lt(I32 x, I32 y) { return y>x; }
lte(I32 x,I32 y)936     I32 Builder::lte(I32 x, I32 y) { return y>=x; }
937 
bit_and(I32 x,I32 y)938     I32 Builder::bit_and(I32 x, I32 y) {
939         if (x.id == y.id) { return x; }
940         if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X&Y); }
941         if (this->isImm(y.id, 0)) { return splat(0); }   // (x & false) == false
942         if (this->isImm(x.id, 0)) { return splat(0); }   // (false & y) == false
943         if (this->isImm(y.id,~0)) { return x; }          // (x & true) == x
944         if (this->isImm(x.id,~0)) { return y; }          // (true & y) == y
945         return {this, this->push(Op::bit_and, x.id, y.id)};
946     }
bit_or(I32 x,I32 y)947     I32 Builder::bit_or(I32 x, I32 y) {
948         if (x.id == y.id) { return x; }
949         if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X|Y); }
950         if (this->isImm(y.id, 0)) { return x; }           // (x | false) == x
951         if (this->isImm(x.id, 0)) { return y; }           // (false | y) == y
952         if (this->isImm(y.id,~0)) { return splat(~0); }   // (x | true) == true
953         if (this->isImm(x.id,~0)) { return splat(~0); }   // (true | y) == true
954         return {this, this->push(Op::bit_or, x.id, y.id)};
955     }
bit_xor(I32 x,I32 y)956     I32 Builder::bit_xor(I32 x, I32 y) {
957         if (x.id == y.id) { return splat(0); }
958         if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X^Y); }
959         if (this->isImm(y.id, 0)) { return x; }   // (x ^ false) == x
960         if (this->isImm(x.id, 0)) { return y; }   // (false ^ y) == y
961         return {this, this->push(Op::bit_xor, x.id, y.id)};
962     }
963 
bit_clear(I32 x,I32 y)964     I32 Builder::bit_clear(I32 x, I32 y) {
965         if (x.id == y.id) { return splat(0); }
966         if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X&~Y); }
967         if (this->isImm(y.id, 0)) { return x; }          // (x & ~false) == x
968         if (this->isImm(y.id,~0)) { return splat(0); }   // (x & ~true) == false
969         if (this->isImm(x.id, 0)) { return splat(0); }   // (false & ~y) == false
970         return {this, this->push(Op::bit_clear, x.id, y.id)};
971     }
972 
select(I32 x,I32 y,I32 z)973     I32 Builder::select(I32 x, I32 y, I32 z) {
974         if (y.id == z.id) { return y; }
975         if (int X,Y,Z; this->allImm(x.id,&X, y.id,&Y, z.id,&Z)) { return splat(X?Y:Z); }
976         if (this->isImm(x.id,~0)) { return y; }               // true  ? y : z == y
977         if (this->isImm(x.id, 0)) { return z; }               // false ? y : z == z
978         if (this->isImm(y.id, 0)) { return bit_clear(z,x); }  //     x ? 0 : z == ~x&z
979         if (this->isImm(z.id, 0)) { return bit_and  (y,x); }  //     x ? y : 0 ==  x&y
980         return {this, this->push(Op::select, x.id, y.id, z.id)};
981     }
982 
extract(I32 x,int bits,I32 z)983     I32 Builder::extract(I32 x, int bits, I32 z) {
984         if (unsigned Z; this->allImm(z.id,&Z) && (~0u>>bits) == Z) { return shr(x, bits); }
985         return bit_and(z, shr(x, bits));
986     }
987 
pack(I32 x,I32 y,int bits)988     I32 Builder::pack(I32 x, I32 y, int bits) {
989         return bit_or(x, shl(y, bits));
990     }
991 
ceil(F32 x)992     F32 Builder::ceil(F32 x) {
993         if (float X; this->allImm(x.id,&X)) { return splat(ceilf(X)); }
994         return {this, this->push(Op::ceil, x.id)};
995     }
floor(F32 x)996     F32 Builder::floor(F32 x) {
997         if (float X; this->allImm(x.id,&X)) { return splat(floorf(X)); }
998         return {this, this->push(Op::floor, x.id)};
999     }
to_F32(I32 x)1000     F32 Builder::to_F32(I32 x) {
1001         if (int X; this->allImm(x.id,&X)) { return splat((float)X); }
1002         return {this, this->push(Op::to_f32, x.id)};
1003     }
trunc(F32 x)1004     I32 Builder::trunc(F32 x) {
1005         if (float X; this->allImm(x.id,&X)) { return splat((int)X); }
1006         return {this, this->push(Op::trunc, x.id)};
1007     }
round(F32 x)1008     I32 Builder::round(F32 x) {
1009         if (float X; this->allImm(x.id,&X)) { return splat((int)lrintf(X)); }
1010         return {this, this->push(Op::round, x.id)};
1011     }
1012 
to_fp16(F32 x)1013     I32 Builder::to_fp16(F32 x) {
1014         if (float X; this->allImm(x.id,&X)) { return splat((int)SkFloatToHalf(X)); }
1015         return {this, this->push(Op::to_fp16, x.id)};
1016     }
from_fp16(I32 x)1017     F32 Builder::from_fp16(I32 x) {
1018         if (int X; this->allImm(x.id,&X)) { return splat(SkHalfToFloat(X)); }
1019         return {this, this->push(Op::from_fp16, x.id)};
1020     }
1021 
from_unorm(int bits,I32 x)1022     F32 Builder::from_unorm(int bits, I32 x) {
1023         F32 limit = splat(1 / ((1<<bits)-1.0f));
1024         return mul(to_F32(x), limit);
1025     }
to_unorm(int bits,F32 x)1026     I32 Builder::to_unorm(int bits, F32 x) {
1027         F32 limit = splat((1<<bits)-1.0f);
1028         return round(mul(x, limit));
1029     }
1030 
    // Maps an SkColorType to the PixelFormat describing how its channels are laid out.
    //
    // NOTE(review): each initializer appears to be
    //     {encoding, r_bits,g_bits,b_bits,a_bits, r_shift,g_shift,b_shift,a_shift}
    // judging by how byte_size() and store() read those fields -- confirm against the
    // PixelFormat declaration.
    //
    // kUnknown_SkColorType (and any value not listed) falls out of the switch, asserts
    // in debug builds, and returns an all-zero UNORM format.
    PixelFormat SkColorType_to_PixelFormat(SkColorType ct) {
        auto UNORM = PixelFormat::UNORM,
             FLOAT = PixelFormat::FLOAT;
        switch (ct) {
            case kUnknown_SkColorType: break;

            case kRGBA_F32_SkColorType: return {FLOAT,32,32,32,32, 0,32,64,96};

            case kRGBA_F16Norm_SkColorType:       return {FLOAT,16,16,16,16, 0,16,32,48};
            case kRGBA_F16_SkColorType:           return {FLOAT,16,16,16,16, 0,16,32,48};
            case kR16G16B16A16_unorm_SkColorType: return {UNORM,16,16,16,16, 0,16,32,48};

            case kA16_float_SkColorType:    return {FLOAT,  0, 0,0,16, 0, 0,0,0};
            case kR16G16_float_SkColorType: return {FLOAT, 16,16,0, 0, 0,16,0,0};

            case kAlpha_8_SkColorType: return {UNORM, 0,0,0,8, 0,0,0,0};
            // r,g,b share the same bits and shift; Builder::store() detects exactly this
            // pattern as grayscale.
            case kGray_8_SkColorType:  return {UNORM, 8,8,8,0, 0,0,0,0};  // Subtle.

            case kRGB_565_SkColorType:   return {UNORM, 5,6,5,0, 11,5,0,0};  // (BGR)
            case kARGB_4444_SkColorType: return {UNORM, 4,4,4,4, 12,8,4,0};  // (ABGR)

            case kRGBA_8888_SkColorType:  return {UNORM, 8,8,8,8,  0,8,16,24};
            // Zero alpha bits at shift 32 makes byte_size() report 4 bytes.
            case kRGB_888x_SkColorType:   return {UNORM, 8,8,8,0,  0,8,16,32};  // 32-bit
            case kBGRA_8888_SkColorType:  return {UNORM, 8,8,8,8, 16,8, 0,24};

            case kRGBA_1010102_SkColorType: return {UNORM, 10,10,10,2,  0,10,20,30};
            case kBGRA_1010102_SkColorType: return {UNORM, 10,10,10,2, 20,10, 0,30};
            case kRGB_101010x_SkColorType:  return {UNORM, 10,10,10,0,  0,10,20, 0};
            case kBGR_101010x_SkColorType:  return {UNORM, 10,10,10,0, 20,10, 0, 0};

            case kR8G8_unorm_SkColorType:   return {UNORM,  8, 8,0, 0, 0, 8,0,0};
            case kR16G16_unorm_SkColorType: return {UNORM, 16,16,0, 0, 0,16,0,0};
            case kA16_unorm_SkColorType:    return {UNORM,  0, 0,0,16, 0, 0,0,0};
        }
        SkASSERT(false);
        return {UNORM, 0,0,0,0, 0,0,0,0};
    }
1068 
byte_size(PixelFormat f)1069     static int byte_size(PixelFormat f) {
1070         // What's the highest bit we read?
1071         int bits = std::max(f.r_bits + f.r_shift,
1072                    std::max(f.g_bits + f.g_shift,
1073                    std::max(f.b_bits + f.b_shift,
1074                             f.a_bits + f.a_shift)));
1075         // Round up to bytes.
1076         return (bits + 7) / 8;
1077     }
1078 
unpack(PixelFormat f,I32 x)1079     static Color unpack(PixelFormat f, I32 x) {
1080         SkASSERT(byte_size(f) <= 4);
1081         auto unpack_channel = [=](int bits, int shift) {
1082             I32 channel = extract(x, shift, (1<<bits)-1);
1083             switch (f.encoding) {
1084                 case PixelFormat::UNORM: return from_unorm(bits, channel);
1085                 case PixelFormat::FLOAT: return from_fp16 (      channel);
1086             }
1087             SkUNREACHABLE;
1088         };
1089         return {
1090             f.r_bits ? unpack_channel(f.r_bits, f.r_shift) : x->splat(0.0f),
1091             f.g_bits ? unpack_channel(f.g_bits, f.g_shift) : x->splat(0.0f),
1092             f.b_bits ? unpack_channel(f.b_bits, f.b_shift) : x->splat(0.0f),
1093             f.a_bits ? unpack_channel(f.a_bits, f.a_shift) : x->splat(1.0f),
1094         };
1095     }
1096 
split_disjoint_8byte_format(PixelFormat f,PixelFormat * lo,PixelFormat * hi)1097     static void split_disjoint_8byte_format(PixelFormat f, PixelFormat* lo, PixelFormat* hi) {
1098         SkASSERT(byte_size(f) == 8);
1099         // We assume some of the channels are in the low 32 bits, some in the high 32 bits.
1100         // The assert on byte_size(lo) will trigger if this assumption is violated.
1101         *lo = f;
1102         if (f.r_shift >= 32) { lo->r_bits = 0; lo->r_shift = 32; }
1103         if (f.g_shift >= 32) { lo->g_bits = 0; lo->g_shift = 32; }
1104         if (f.b_shift >= 32) { lo->b_bits = 0; lo->b_shift = 32; }
1105         if (f.a_shift >= 32) { lo->a_bits = 0; lo->a_shift = 32; }
1106         SkASSERT(byte_size(*lo) == 4);
1107 
1108         *hi = f;
1109         if (f.r_shift < 32) { hi->r_bits = 0; hi->r_shift = 32; } else { hi->r_shift -= 32; }
1110         if (f.g_shift < 32) { hi->g_bits = 0; hi->g_shift = 32; } else { hi->g_shift -= 32; }
1111         if (f.b_shift < 32) { hi->b_bits = 0; hi->b_shift = 32; } else { hi->b_shift -= 32; }
1112         if (f.a_shift < 32) { hi->a_bits = 0; hi->a_shift = 32; } else { hi->a_shift -= 32; }
1113         SkASSERT(byte_size(*hi) == 4);
1114     }
1115 
    // The only 16-byte format we support today is RGBA F32,
    // though, TODO, we could generalize that to any swizzle, and to allow UNORM too.
    //
    // Debug-only check that f is 16 bytes wide and matches, field by field, the
    // PixelFormat that SkColorType_to_PixelFormat() returns for kRGBA_F32_SkColorType.
    // The body is compiled only when SK_DEBUG is defined.
    static void assert_16byte_is_rgba_f32(PixelFormat f) {
    #if defined(SK_DEBUG)
        SkASSERT(byte_size(f) == 16);
        PixelFormat rgba_f32 = SkColorType_to_PixelFormat(kRGBA_F32_SkColorType);

        SkASSERT(f.encoding == rgba_f32.encoding);

        // Same channel widths ...
        SkASSERT(f.r_bits == rgba_f32.r_bits);
        SkASSERT(f.g_bits == rgba_f32.g_bits);
        SkASSERT(f.b_bits == rgba_f32.b_bits);
        SkASSERT(f.a_bits == rgba_f32.a_bits);

        // ... at the same positions.
        SkASSERT(f.r_shift == rgba_f32.r_shift);
        SkASSERT(f.g_shift == rgba_f32.g_shift);
        SkASSERT(f.b_shift == rgba_f32.b_shift);
        SkASSERT(f.a_shift == rgba_f32.a_shift);
    #endif
    }
1136 
load(PixelFormat f,Ptr ptr)1137     Color Builder::load(PixelFormat f, Ptr ptr) {
1138         switch (byte_size(f)) {
1139             case 1: return unpack(f, load8 (ptr));
1140             case 2: return unpack(f, load16(ptr));
1141             case 4: return unpack(f, load32(ptr));
1142             case 8: {
1143                 PixelFormat lo,hi;
1144                 split_disjoint_8byte_format(f, &lo,&hi);
1145                 Color l = unpack(lo, load64(ptr, 0)),
1146                       h = unpack(hi, load64(ptr, 1));
1147                 return {
1148                     lo.r_bits ? l.r : h.r,
1149                     lo.g_bits ? l.g : h.g,
1150                     lo.b_bits ? l.b : h.b,
1151                     lo.a_bits ? l.a : h.a,
1152                 };
1153             }
1154             case 16: {
1155                 assert_16byte_is_rgba_f32(f);
1156                 return {
1157                     pun_to_F32(load128(ptr, 0)),
1158                     pun_to_F32(load128(ptr, 1)),
1159                     pun_to_F32(load128(ptr, 2)),
1160                     pun_to_F32(load128(ptr, 3)),
1161                 };
1162             }
1163             default: SkUNREACHABLE;
1164         }
1165         return {};
1166     }
1167 
gather(PixelFormat f,Ptr ptr,int offset,I32 index)1168     Color Builder::gather(PixelFormat f, Ptr ptr, int offset, I32 index) {
1169         switch (byte_size(f)) {
1170             case 1: return unpack(f, gather8 (ptr, offset, index));
1171             case 2: return unpack(f, gather16(ptr, offset, index));
1172             case 4: return unpack(f, gather32(ptr, offset, index));
1173             case 8: {
1174                 PixelFormat lo,hi;
1175                 split_disjoint_8byte_format(f, &lo,&hi);
1176                 Color l = unpack(lo, gather32(ptr, offset, (index<<1)+0)),
1177                       h = unpack(hi, gather32(ptr, offset, (index<<1)+1));
1178                 return {
1179                     lo.r_bits ? l.r : h.r,
1180                     lo.g_bits ? l.g : h.g,
1181                     lo.b_bits ? l.b : h.b,
1182                     lo.a_bits ? l.a : h.a,
1183                 };
1184             }
1185             case 16: {
1186                 assert_16byte_is_rgba_f32(f);
1187                 return {
1188                     gatherF(ptr, offset, (index<<2)+0),
1189                     gatherF(ptr, offset, (index<<2)+1),
1190                     gatherF(ptr, offset, (index<<2)+2),
1191                     gatherF(ptr, offset, (index<<2)+3),
1192                 };
1193             }
1194             default: SkUNREACHABLE;
1195         }
1196         return {};
1197     }
1198 
pack32(PixelFormat f,Color c)1199     static I32 pack32(PixelFormat f, Color c) {
1200         SkASSERT(byte_size(f) <= 4);
1201         I32 packed = c->splat(0);
1202         auto pack_channel = [&](F32 channel, int bits, int shift) {
1203             I32 encoded;
1204             switch (f.encoding) {
1205                 case PixelFormat::UNORM: encoded = to_unorm(bits, channel); break;
1206                 case PixelFormat::FLOAT: encoded = to_fp16 (      channel); break;
1207             }
1208             packed = pack(packed, encoded, shift);
1209         };
1210         if (f.r_bits) { pack_channel(c.r, f.r_bits, f.r_shift); }
1211         if (f.g_bits) { pack_channel(c.g, f.g_bits, f.g_shift); }
1212         if (f.b_bits) { pack_channel(c.b, f.b_bits, f.b_shift); }
1213         if (f.a_bits) { pack_channel(c.a, f.a_bits, f.a_shift); }
1214         return packed;
1215     }
1216 
store(PixelFormat f,Ptr ptr,Color c)1217     void Builder::store(PixelFormat f, Ptr ptr, Color c) {
1218         // Detect a grayscale PixelFormat: r,g,b bit counts and shifts all equal.
1219         if (f.r_bits  == f.g_bits  && f.g_bits  == f.b_bits &&
1220             f.r_shift == f.g_shift && f.g_shift == f.b_shift) {
1221 
1222             // TODO: pull these coefficients from an SkColorSpace?  This is sRGB luma/luminance.
1223             c.r = c.r * 0.2126f
1224                 + c.g * 0.7152f
1225                 + c.b * 0.0722f;
1226             f.g_bits = f.b_bits = 0;
1227         }
1228 
1229         switch (byte_size(f)) {
1230             case 1: store8 (ptr, pack32(f,c)); break;
1231             case 2: store16(ptr, pack32(f,c)); break;
1232             case 4: store32(ptr, pack32(f,c)); break;
1233             case 8: {
1234                 PixelFormat lo,hi;
1235                 split_disjoint_8byte_format(f, &lo,&hi);
1236                 store64(ptr, pack32(lo,c)
1237                            , pack32(hi,c));
1238                 break;
1239             }
1240             case 16: {
1241                 assert_16byte_is_rgba_f32(f);
1242                 store128(ptr, pun_to_I32(c.r), pun_to_I32(c.g), pun_to_I32(c.b), pun_to_I32(c.a));
1243                 break;
1244             }
1245             default: SkUNREACHABLE;
1246         }
1247     }
1248 
unpremul(F32 * r,F32 * g,F32 * b,F32 a)1249     void Builder::unpremul(F32* r, F32* g, F32* b, F32 a) {
1250         skvm::F32 invA = 1.0f / a,
1251                   inf  = pun_to_F32(splat(0x7f800000));
1252         // If a is 0, so are *r,*g,*b, so set invA to 0 to avoid 0*inf=NaN (instead 0*0 = 0).
1253         invA = select(invA < inf, invA
1254                                 , 0.0f);
1255         *r *= invA;
1256         *g *= invA;
1257         *b *= invA;
1258     }
1259 
premul(F32 * r,F32 * g,F32 * b,F32 a)1260     void Builder::premul(F32* r, F32* g, F32* b, F32 a) {
1261         *r *= a;
1262         *g *= a;
1263         *b *= a;
1264     }
1265 
uniformColor(SkColor4f color,Uniforms * uniforms)1266     Color Builder::uniformColor(SkColor4f color, Uniforms* uniforms) {
1267         auto [r,g,b,a] = color;
1268         return {
1269             uniformF(uniforms->pushF(r)),
1270             uniformF(uniforms->pushF(g)),
1271             uniformF(uniforms->pushF(b)),
1272             uniformF(uniforms->pushF(a)),
1273         };
1274     }
1275 
lerp(F32 lo,F32 hi,F32 t)1276     F32 Builder::lerp(F32 lo, F32 hi, F32 t) {
1277         if (this->isImm(t.id, 0.0f)) { return lo; }
1278         if (this->isImm(t.id, 1.0f)) { return hi; }
1279         return mad(sub(hi, lo), t, lo);
1280     }
1281 
lerp(Color lo,Color hi,F32 t)1282     Color Builder::lerp(Color lo, Color hi, F32 t) {
1283         return {
1284             lerp(lo.r, hi.r, t),
1285             lerp(lo.g, hi.g, t),
1286             lerp(lo.b, hi.b, t),
1287             lerp(lo.a, hi.a, t),
1288         };
1289     }
1290 
    // Converts an RGBA color to HSLA.  Alpha is passed through unchanged.
    HSLA Builder::to_hsla(Color c) {
        // mx/mn are the largest/smallest of r,g,b; d is their spread.
        // g_lt_b is 6 where g < b and 0 elsewhere; it is added in the mx == r case below.
        F32 mx = max(max(c.r,c.g),c.b),
            mn = min(min(c.r,c.g),c.b),
             d = mx - mn,
          invd = 1.0f / d,
        g_lt_b = select(c.g < c.b, splat(6.0f)
                                 , splat(0.0f));

        // Hue depends on which channel is the maximum; it is 0 when mx == mn.
        // The selected value is scaled by 1/6.
        F32 h = (1/6.0f) * select(mx == mn,  0.0f,
                           select(mx == c.r, invd * (c.g - c.b) + g_lt_b,
                           select(mx == c.g, invd * (c.b - c.r) + 2.0f
                                           , invd * (c.r - c.g) + 4.0f)));

        // Lightness is the midpoint of mx and mn; saturation is 0 when mx == mn,
        // otherwise d divided by (2 - sum) when l > 0.5 or by sum when not.
        F32 sum = mx + mn,
              l = sum * 0.5f,
              s = select(mx == mn, 0.0f
                                 , d / select(l > 0.5f, 2.0f - sum
                                                      , sum));
        return {h, s, l, c.a};
    }
1311 
    // Converts an HSLA color to RGBA.  Alpha is passed through unchanged.
    Color Builder::to_rgba(HSLA c) {
        // See GrRGBToHSLFilterEffect.fp

        auto [h,s,l,a] = c;
        // x scales the per-channel term below: s * (1 - |2l - 1|).
        F32 x = s * (1.0f - abs(l + l - 1.0f));

        // Maps a (possibly offset) hue to one output channel.  Only the fractional
        // part of hue is used.
        auto hue_to_rgb = [&,l=l](auto hue) {
            auto q = abs(6.0f * fract(hue) - 3.0f) - 1.0f;
            return x * (clamp01(q) - 0.5f) + l;
        };

        // r, g, b use hue offsets of 0, 2/3 and 1/3 respectively.
        return {
            hue_to_rgb(h + 0/3.0f),
            hue_to_rgb(h + 2/3.0f),
            hue_to_rgb(h + 1/3.0f),
            c.a,
        };
    }
1330 
1331     // We're basing our implementation of non-separable blend modes on
1332     //   https://www.w3.org/TR/compositing-1/#blendingnonseparable.
1333     // and
1334     //   https://www.khronos.org/registry/OpenGL/specs/es/3.2/es_spec_3.2.pdf
1335     // They're equivalent, but ES' math has been better simplified.
1336     //
1337     // Anything extra we add beyond that is to make the math work with premul inputs.
1338 
    // Saturation as used by the non-separable blend modes: the largest of r,g,b
    // minus the smallest.
    static skvm::F32 saturation(skvm::F32 r, skvm::F32 g, skvm::F32 b) {
        return max(r, max(g, b))
             - min(r, min(g, b));
    }
1343 
    // Luminance as used by the non-separable blend modes: a weighted sum of r,g,b
    // with weights 0.30, 0.59 and 0.11.
    static skvm::F32 luminance(skvm::F32 r, skvm::F32 g, skvm::F32 b) {
        return r*0.30f + g*0.59f + b*0.11f;
    }
1347 
set_sat(skvm::F32 * r,skvm::F32 * g,skvm::F32 * b,skvm::F32 s)1348     static void set_sat(skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 s) {
1349         F32 mn  = min(*r, min(*g, *b)),
1350             mx  = max(*r, max(*g, *b)),
1351             sat = mx - mn;
1352 
1353         // Map min channel to 0, max channel to s, and scale the middle proportionally.
1354         auto scale = [&](skvm::F32 c) {
1355             auto scaled = ((c - mn) * s) / sat;
1356             return select(is_finite(scaled), scaled, 0.0f);
1357         };
1358         *r = scale(*r);
1359         *g = scale(*g);
1360         *b = scale(*b);
1361     }
1362 
set_lum(skvm::F32 * r,skvm::F32 * g,skvm::F32 * b,skvm::F32 lu)1363     static void set_lum(skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 lu) {
1364         auto diff = lu - luminance(*r, *g, *b);
1365         *r += diff;
1366         *g += diff;
1367         *b += diff;
1368     }
1369 
clip_color(skvm::F32 * r,skvm::F32 * g,skvm::F32 * b,skvm::F32 a)1370     static void clip_color(skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 a) {
1371         F32 mn  = min(*r, min(*g, *b)),
1372             mx  = max(*r, max(*g, *b)),
1373             lu = luminance(*r, *g, *b);
1374 
1375         auto clip = [&](auto c) {
1376             c = select(mn >= 0, c
1377                               , lu + ((c-lu)*(  lu)) / (lu-mn));
1378             c = select(mx >  a, lu + ((c-lu)*(a-lu)) / (mx-lu)
1379                               , c);
1380             return clamp01(c);  // May be a little negative, or worse, NaN.
1381         };
1382         *r = clip(*r);
1383         *g = clip(*g);
1384         *b = clip(*b);
1385     }
1386 
blend(SkBlendMode mode,Color src,Color dst)1387     Color Builder::blend(SkBlendMode mode, Color src, Color dst) {
1388         auto mma = [](skvm::F32 x, skvm::F32 y, skvm::F32 z, skvm::F32 w) {
1389             return x*y + z*w;
1390         };
1391 
1392         auto two = [](skvm::F32 x) { return x+x; };
1393 
1394         auto apply_rgba = [&](auto fn) {
1395             return Color {
1396                 fn(src.r, dst.r),
1397                 fn(src.g, dst.g),
1398                 fn(src.b, dst.b),
1399                 fn(src.a, dst.a),
1400             };
1401         };
1402 
1403         auto apply_rgb_srcover_a = [&](auto fn) {
1404             return Color {
1405                 fn(src.r, dst.r),
1406                 fn(src.g, dst.g),
1407                 fn(src.b, dst.b),
1408                 mad(dst.a, 1-src.a, src.a),   // srcover for alpha
1409             };
1410         };
1411 
1412         auto non_sep = [&](auto R, auto G, auto B) {
1413             return Color{
1414                 R + mma(src.r, 1-dst.a,  dst.r, 1-src.a),
1415                 G + mma(src.g, 1-dst.a,  dst.g, 1-src.a),
1416                 B + mma(src.b, 1-dst.a,  dst.b, 1-src.a),
1417                 mad(dst.a, 1-src.a, src.a),   // srcover for alpha
1418             };
1419         };
1420 
1421         switch (mode) {
1422             default:
1423                 SkASSERT(false);
1424                 [[fallthrough]]; /*but also, for safety, fallthrough*/
1425 
1426             case SkBlendMode::kClear: return { splat(0.0f), splat(0.0f), splat(0.0f), splat(0.0f) };
1427 
1428             case SkBlendMode::kSrc: return src;
1429             case SkBlendMode::kDst: return dst;
1430 
1431             case SkBlendMode::kDstOver: std::swap(src, dst); [[fallthrough]];
1432             case SkBlendMode::kSrcOver:
1433                 return apply_rgba([&](auto s, auto d) {
1434                     return mad(d,1-src.a, s);
1435                 });
1436 
1437             case SkBlendMode::kDstIn: std::swap(src, dst); [[fallthrough]];
1438             case SkBlendMode::kSrcIn:
1439                 return apply_rgba([&](auto s, auto d) {
1440                     return s * dst.a;
1441                 });
1442 
1443             case SkBlendMode::kDstOut: std::swap(src, dst); [[fallthrough]];
1444 
1445             case SkBlendMode::kSrcOut:
1446                 return apply_rgba([&](auto s, auto d) {
1447                     return s * (1-dst.a);
1448                 });
1449 
1450             case SkBlendMode::kDstATop: std::swap(src, dst); [[fallthrough]];
1451             case SkBlendMode::kSrcATop:
1452                 return apply_rgba([&](auto s, auto d) {
1453                     return mma(s, dst.a,  d, 1-src.a);
1454                 });
1455 
1456             case SkBlendMode::kXor:
1457                 return apply_rgba([&](auto s, auto d) {
1458                     return mma(s, 1-dst.a,  d, 1-src.a);
1459                 });
1460 
1461             case SkBlendMode::kPlus:
1462                 return apply_rgba([&](auto s, auto d) {
1463                     return min(s+d, 1.0f);
1464                 });
1465 
1466             case SkBlendMode::kModulate:
1467                 return apply_rgba([&](auto s, auto d) {
1468                     return s * d;
1469                 });
1470 
1471             case SkBlendMode::kScreen:
1472                 // (s+d)-(s*d) gave us trouble with our "r,g,b <= after blending" asserts.
1473                 // It's kind of plausible that s + (d - sd) keeps more precision?
1474                 return apply_rgba([&](auto s, auto d) {
1475                     return s + (d - s*d);
1476                 });
1477 
1478             case SkBlendMode::kDarken:
1479                 return apply_rgb_srcover_a([&](auto s, auto d) {
1480                     return s + (d - max(s * dst.a,
1481                                         d * src.a));
1482                 });
1483 
1484             case SkBlendMode::kLighten:
1485                 return apply_rgb_srcover_a([&](auto s, auto d) {
1486                     return s + (d - min(s * dst.a,
1487                                         d * src.a));
1488                 });
1489 
1490             case SkBlendMode::kDifference:
1491                 return apply_rgb_srcover_a([&](auto s, auto d) {
1492                     return s + (d - two(min(s * dst.a,
1493                                             d * src.a)));
1494                 });
1495 
1496             case SkBlendMode::kExclusion:
1497                 return apply_rgb_srcover_a([&](auto s, auto d) {
1498                     return s + (d - two(s * d));
1499                 });
1500 
1501             case SkBlendMode::kColorBurn:
1502                 return apply_rgb_srcover_a([&](auto s, auto d) {
1503                     auto mn   = min(dst.a,
1504                                     src.a * (dst.a - d) / s),
1505                          burn = src.a * (dst.a - mn) + mma(s, 1-dst.a, d, 1-src.a);
1506                     return select(d == dst.a     , s * (1-dst.a) + d,
1507                            select(is_finite(burn), burn
1508                                                  , d * (1-src.a) + s));
1509                 });
1510 
1511             case SkBlendMode::kColorDodge:
1512                 return apply_rgb_srcover_a([&](auto s, auto d) {
1513                     auto dodge = src.a * min(dst.a,
1514                                              d * src.a / (src.a - s))
1515                                        + mma(s, 1-dst.a, d, 1-src.a);
1516                     return select(d == 0.0f       , s * (1-dst.a) + d,
1517                            select(is_finite(dodge), dodge
1518                                                   , d * (1-src.a) + s));
1519                 });
1520 
1521             case SkBlendMode::kHardLight:
1522                 return apply_rgb_srcover_a([&](auto s, auto d) {
1523                     return mma(s, 1-dst.a, d, 1-src.a) +
1524                            select(two(s) <= src.a,
1525                                   two(s * d),
1526                                   src.a * dst.a - two((dst.a - d) * (src.a - s)));
1527                 });
1528 
1529             case SkBlendMode::kOverlay:
1530                 return apply_rgb_srcover_a([&](auto s, auto d) {
1531                     return mma(s, 1-dst.a, d, 1-src.a) +
1532                            select(two(d) <= dst.a,
1533                                   two(s * d),
1534                                   src.a * dst.a - two((dst.a - d) * (src.a - s)));
1535                 });
1536 
1537             case SkBlendMode::kMultiply:
1538                 return apply_rgba([&](auto s, auto d) {
1539                     return mma(s, 1-dst.a, d, 1-src.a) + s * d;
1540                 });
1541 
1542             case SkBlendMode::kSoftLight:
1543                 return apply_rgb_srcover_a([&](auto s, auto d) {
1544                     auto  m = select(dst.a > 0.0f, d / dst.a
1545                                                  , 0.0f),
1546                          s2 = two(s),
1547                          m4 = 4*m;
1548 
1549                          // The logic forks three ways:
1550                          //    1. dark src?
1551                          //    2. light src, dark dst?
1552                          //    3. light src, light dst?
1553 
1554                          // Used in case 1
1555                     auto darkSrc = d * ((s2-src.a) * (1-m) + src.a),
1556                          // Used in case 2
1557                          darkDst = (m4 * m4 + m4) * (m-1) + 7*m,
1558                          // Used in case 3.
1559                          liteDst = sqrt(m) - m,
1560                          // Used in 2 or 3?
1561                          liteSrc = dst.a * (s2 - src.a) * select(4*d <= dst.a, darkDst
1562                                                                              , liteDst)
1563                                    + d * src.a;
1564                     return s * (1-dst.a) + d * (1-src.a) + select(s2 <= src.a, darkSrc
1565                                                                              , liteSrc);
1566                 });
1567 
1568             case SkBlendMode::kHue: {
1569                 skvm::F32 R = src.r * src.a,
1570                           G = src.g * src.a,
1571                           B = src.b * src.a;
1572 
1573                 set_sat   (&R, &G, &B, src.a * saturation(dst.r, dst.g, dst.b));
1574                 set_lum   (&R, &G, &B, src.a * luminance (dst.r, dst.g, dst.b));
1575                 clip_color(&R, &G, &B, src.a * dst.a);
1576 
1577                 return non_sep(R, G, B);
1578             }
1579 
1580             case SkBlendMode::kSaturation: {
1581                 skvm::F32 R = dst.r * src.a,
1582                           G = dst.g * src.a,
1583                           B = dst.b * src.a;
1584 
1585                 set_sat   (&R, &G, &B, dst.a * saturation(src.r, src.g, src.b));
1586                 set_lum   (&R, &G, &B, src.a * luminance (dst.r, dst.g, dst.b));
1587                 clip_color(&R, &G, &B, src.a * dst.a);
1588 
1589                 return non_sep(R, G, B);
1590             }
1591 
1592             case SkBlendMode::kColor: {
1593                 skvm::F32 R = src.r * dst.a,
1594                           G = src.g * dst.a,
1595                           B = src.b * dst.a;
1596 
1597                 set_lum   (&R, &G, &B, src.a * luminance(dst.r, dst.g, dst.b));
1598                 clip_color(&R, &G, &B, src.a * dst.a);
1599 
1600                 return non_sep(R, G, B);
1601             }
1602 
1603             case SkBlendMode::kLuminosity: {
1604                 skvm::F32 R = dst.r * src.a,
1605                           G = dst.g * src.a,
1606                           B = dst.b * src.a;
1607 
1608                 set_lum   (&R, &G, &B, dst.a * luminance(src.r, src.g, src.b));
1609                 clip_color(&R, &G, &B, dst.a * src.a);
1610 
1611                 return non_sep(R, G, B);
1612             }
1613         }
1614     }
1615 
1616     // ~~~~ Program::eval() and co. ~~~~ //
1617 
1618     // Handy references for x86-64 instruction encoding:
1619     // https://wiki.osdev.org/X86-64_Instruction_Encoding
1620     // https://www-user.tu-chemnitz.de/~heha/viewchm.php/hs/x86.chm/x64.htm
1621     // https://www-user.tu-chemnitz.de/~heha/viewchm.php/hs/x86.chm/x86.htm
1622     // http://ref.x86asm.net/coder64.html
1623 
1624     // Used for ModRM / immediate instruction encoding.
// Packs three fields into one byte laid out as 2 bits | 3 bits | 3 bits
// (a in bits 7-6, b in bits 5-3, c in bits 2-0).  Extra high bits of each
// argument are masked off.
static uint8_t _233(int a, int b, int c) {
    const int top    = (a & 3) << 6;
    const int middle = (b & 7) << 3;
    const int bottom = (c & 7);
    return static_cast<uint8_t>(top | middle | bottom);
}
1630 
1631     // ModRM byte encodes the arguments of an opcode.
1632     enum class Mod { Indirect, OneByteImm, FourByteImm, Direct };
mod_rm(Mod mod,int reg,int rm)1633     static uint8_t mod_rm(Mod mod, int reg, int rm) {
1634         return _233((int)mod, reg, rm);
1635     }
1636 
mod(int imm)1637     static Mod mod(int imm) {
1638         if (imm == 0)               { return Mod::Indirect; }
1639         if (SkTFitsIn<int8_t>(imm)) { return Mod::OneByteImm; }
1640         return Mod::FourByteImm;
1641     }
1642 
imm_bytes(Mod mod)1643     static int imm_bytes(Mod mod) {
1644         switch (mod) {
1645             case Mod::Indirect:    return 0;
1646             case Mod::OneByteImm:  return 1;
1647             case Mod::FourByteImm: return 4;
1648             case Mod::Direct: SkUNREACHABLE;
1649         }
1650         SkUNREACHABLE;
1651     }
1652 
1653     // SIB byte encodes a memory address, base + (index * scale).
sib(Assembler::Scale scale,int index,int base)1654     static uint8_t sib(Assembler::Scale scale, int index, int base) {
1655         return _233((int)scale, index, base);
1656     }
1657 
1658     // The REX prefix is used to extend most old 32-bit instructions to 64-bit.
// Builds a REX prefix byte: fixed 0100 in the top nibble, then W, R, X, B.
static uint8_t rex(bool W,   // If set, operation is 64-bit, otherwise default, usually 32-bit.
                   bool R,   // Extra top bit to select ModRM reg, registers 8-15.
                   bool X,   // Extra top bit for SIB index register.
                   bool B) { // Extra top bit for SIB base or ModRM rm register.
    uint8_t prefix = 0b01000000;   // Fixed 0100 for top four bits.
    if (W) { prefix |= 0b1000; }
    if (R) { prefix |= 0b0100; }
    if (X) { prefix |= 0b0010; }
    if (B) { prefix |= 0b0001; }
    return prefix;
}
1669 
1670 
1671     // The VEX prefix extends SSE operations to AVX.  Used generally, even with XMM.
1672     struct VEX {
1673         int     len;
1674         uint8_t bytes[3];
1675     };
1676 
vex(bool WE,bool R,bool X,bool B,int map,int vvvv,bool L,int pp)1677     static VEX vex(bool  WE,   // Like REX W for int operations, or opcode extension for float?
1678                    bool   R,   // Same as REX R.  Pass high bit of dst register, dst>>3.
1679                    bool   X,   // Same as REX X.
1680                    bool   B,   // Same as REX B.  Pass y>>3 for 3-arg ops, x>>3 for 2-arg.
1681                    int  map,   // SSE opcode map selector: 0x0f, 0x380f, 0x3a0f.
1682                    int vvvv,   // 4-bit second operand register.  Pass our x for 3-arg ops.
1683                    bool   L,   // Set for 256-bit ymm operations, off for 128-bit xmm.
1684                    int   pp) { // SSE mandatory prefix: 0x66, 0xf3, 0xf2, else none.
1685 
1686         // Pack x86 opcode map selector to 5-bit VEX encoding.
1687         map = [map]{
1688             switch (map) {
1689                 case   0x0f: return 0b00001;
1690                 case 0x380f: return 0b00010;
1691                 case 0x3a0f: return 0b00011;
1692                 // Several more cases only used by XOP / TBM.
1693             }
1694             SkUNREACHABLE;
1695         }();
1696 
1697         // Pack  mandatory SSE opcode prefix byte to 2-bit VEX encoding.
1698         pp = [pp]{
1699             switch (pp) {
1700                 case 0x66: return 0b01;
1701                 case 0xf3: return 0b10;
1702                 case 0xf2: return 0b11;
1703             }
1704             return 0b00;
1705         }();
1706 
1707         VEX vex = {0, {0,0,0}};
1708         if (X == 0 && B == 0 && WE == 0 && map == 0b00001) {
1709             // With these conditions met, we can optionally compress VEX to 2-byte.
1710             vex.len = 2;
1711             vex.bytes[0] = 0xc5;
1712             vex.bytes[1] = (pp      &  3) << 0
1713                          | (L       &  1) << 2
1714                          | (~vvvv   & 15) << 3
1715                          | (~(int)R &  1) << 7;
1716         } else {
1717             // We could use this 3-byte VEX prefix all the time if we like.
1718             vex.len = 3;
1719             vex.bytes[0] = 0xc4;
1720             vex.bytes[1] = (map     & 31) << 0
1721                          | (~(int)B &  1) << 5
1722                          | (~(int)X &  1) << 6
1723                          | (~(int)R &  1) << 7;
1724             vex.bytes[2] = (pp    &  3) << 0
1725                          | (L     &  1) << 2
1726                          | (~vvvv & 15) << 3
1727                          | (WE    &  1) << 7;
1728         }
1729         return vex;
1730     }
1731 
Assembler(void * buf)1732     Assembler::Assembler(void* buf) : fCode((uint8_t*)buf), fSize(0) {}
1733 
size() const1734     size_t Assembler::size() const { return fSize; }
1735 
bytes(const void * p,int n)1736     void Assembler::bytes(const void* p, int n) {
1737         if (fCode) {
1738             memcpy(fCode+fSize, p, n);
1739         }
1740         fSize += n;
1741     }
1742 
byte(uint8_t b)1743     void Assembler::byte(uint8_t b) { this->bytes(&b, 1); }
word(uint32_t w)1744     void Assembler::word(uint32_t w) { this->bytes(&w, 4); }
1745 
align(int mod)1746     void Assembler::align(int mod) {
1747         while (this->size() % mod) {
1748             this->byte(0x00);
1749         }
1750     }
1751 
int3()1752     void Assembler::int3() {
1753         this->byte(0xcc);
1754     }
1755 
vzeroupper()1756     void Assembler::vzeroupper() {
1757         this->byte(0xc5);
1758         this->byte(0xf8);
1759         this->byte(0x77);
1760     }
ret()1761     void Assembler::ret() { this->byte(0xc3); }
1762 
op(int opcode,Operand dst,GP64 x)1763     void Assembler::op(int opcode, Operand dst, GP64 x) {
1764         if (dst.kind == Operand::REG) {
1765             this->byte(rex(W1,x>>3,0,dst.reg>>3));
1766             this->bytes(&opcode, SkTFitsIn<uint8_t>(opcode) ? 1 : 2);
1767             this->byte(mod_rm(Mod::Direct, x, dst.reg&7));
1768         } else {
1769             SkASSERT(dst.kind == Operand::MEM);
1770             const Mem& m = dst.mem;
1771             const bool need_SIB = (m.base&7) == rsp
1772                                || m.index != rsp;
1773 
1774             this->byte(rex(W1,x>>3,m.index>>3,m.base>>3));
1775             this->bytes(&opcode, SkTFitsIn<uint8_t>(opcode) ? 1 : 2);
1776             this->byte(mod_rm(mod(m.disp), x&7, (need_SIB ? rsp : m.base)&7));
1777             if (need_SIB) {
1778                 this->byte(sib(m.scale, m.index&7, m.base&7));
1779             }
1780             this->bytes(&m.disp, imm_bytes(mod(m.disp)));
1781         }
1782     }
1783 
op(int opcode,int opcode_ext,Operand dst,int imm)1784     void Assembler::op(int opcode, int opcode_ext, Operand dst, int imm) {
1785         opcode |= 0b1000'0000;   // top bit set for instructions with any immediate
1786 
1787         int imm_bytes = 4;
1788         if (SkTFitsIn<int8_t>(imm)) {
1789             imm_bytes = 1;
1790             opcode |= 0b0000'0010;  // second bit set for 8-bit immediate, else 32-bit.
1791         }
1792 
1793         this->op(opcode, dst, (GP64)opcode_ext);
1794         this->bytes(&imm, imm_bytes);
1795     }
1796 
add(Operand dst,int imm)1797     void Assembler::add(Operand dst, int imm) { this->op(0x01,0b000, dst,imm); }
sub(Operand dst,int imm)1798     void Assembler::sub(Operand dst, int imm) { this->op(0x01,0b101, dst,imm); }
cmp(Operand dst,int imm)1799     void Assembler::cmp(Operand dst, int imm) { this->op(0x01,0b111, dst,imm); }
1800 
1801     // These don't work quite like the other instructions with immediates:
1802     // these immediates are always fixed size at 4 bytes or 1 byte.
mov(Operand dst,int imm)1803     void Assembler::mov(Operand dst, int imm) {
1804         this->op(0xC7,dst,(GP64)0b000);
1805         this->word(imm);
1806     }
movb(Operand dst,int imm)1807     void Assembler::movb(Operand dst, int imm) {
1808         this->op(0xC6,dst,(GP64)0b000);
1809         this->byte(imm);
1810     }
1811 
add(Operand dst,GP64 x)1812     void Assembler::add (Operand dst, GP64 x) { this->op(0x01, dst,x); }
sub(Operand dst,GP64 x)1813     void Assembler::sub (Operand dst, GP64 x) { this->op(0x29, dst,x); }
cmp(Operand dst,GP64 x)1814     void Assembler::cmp (Operand dst, GP64 x) { this->op(0x39, dst,x); }
mov(Operand dst,GP64 x)1815     void Assembler::mov (Operand dst, GP64 x) { this->op(0x89, dst,x); }
movb(Operand dst,GP64 x)1816     void Assembler::movb(Operand dst, GP64 x) { this->op(0x88, dst,x); }
1817 
add(GP64 dst,Operand x)1818     void Assembler::add (GP64 dst, Operand x) { this->op(0x03, x,dst); }
sub(GP64 dst,Operand x)1819     void Assembler::sub (GP64 dst, Operand x) { this->op(0x2B, x,dst); }
cmp(GP64 dst,Operand x)1820     void Assembler::cmp (GP64 dst, Operand x) { this->op(0x3B, x,dst); }
mov(GP64 dst,Operand x)1821     void Assembler::mov (GP64 dst, Operand x) { this->op(0x8B, x,dst); }
movb(GP64 dst,Operand x)1822     void Assembler::movb(GP64 dst, Operand x) { this->op(0x8A, x,dst); }
1823 
movzbq(GP64 dst,Operand x)1824     void Assembler::movzbq(GP64 dst, Operand x) { this->op(0xB60F, x,dst); }
movzwq(GP64 dst,Operand x)1825     void Assembler::movzwq(GP64 dst, Operand x) { this->op(0xB70F, x,dst); }
1826 
vpaddd(Ymm dst,Ymm x,Operand y)1827     void Assembler::vpaddd (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xfe, dst,x,y); }
vpsubd(Ymm dst,Ymm x,Operand y)1828     void Assembler::vpsubd (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xfa, dst,x,y); }
vpmulld(Ymm dst,Ymm x,Operand y)1829     void Assembler::vpmulld(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x40, dst,x,y); }
1830 
vpaddw(Ymm dst,Ymm x,Operand y)1831     void Assembler::vpaddw   (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xfd, dst,x,y); }
vpsubw(Ymm dst,Ymm x,Operand y)1832     void Assembler::vpsubw   (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xf9, dst,x,y); }
vpmullw(Ymm dst,Ymm x,Operand y)1833     void Assembler::vpmullw  (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xd5, dst,x,y); }
vpavgw(Ymm dst,Ymm x,Operand y)1834     void Assembler::vpavgw   (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xe3, dst,x,y); }
vpmulhrsw(Ymm dst,Ymm x,Operand y)1835     void Assembler::vpmulhrsw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x0b, dst,x,y); }
vpminsw(Ymm dst,Ymm x,Operand y)1836     void Assembler::vpminsw  (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xea, dst,x,y); }
vpmaxsw(Ymm dst,Ymm x,Operand y)1837     void Assembler::vpmaxsw  (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xee, dst,x,y); }
vpminuw(Ymm dst,Ymm x,Operand y)1838     void Assembler::vpminuw  (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x3a, dst,x,y); }
vpmaxuw(Ymm dst,Ymm x,Operand y)1839     void Assembler::vpmaxuw  (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x3e, dst,x,y); }
1840 
vpabsw(Ymm dst,Operand x)1841     void Assembler::vpabsw(Ymm dst, Operand x) { this->op(0x66,0x380f,0x1d, dst,x); }
1842 
1843 
vpand(Ymm dst,Ymm x,Operand y)1844     void Assembler::vpand (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xdb, dst,x,y); }
vpor(Ymm dst,Ymm x,Operand y)1845     void Assembler::vpor  (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xeb, dst,x,y); }
vpxor(Ymm dst,Ymm x,Operand y)1846     void Assembler::vpxor (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xef, dst,x,y); }
vpandn(Ymm dst,Ymm x,Operand y)1847     void Assembler::vpandn(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xdf, dst,x,y); }
1848 
vaddps(Ymm dst,Ymm x,Operand y)1849     void Assembler::vaddps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x58, dst,x,y); }
vsubps(Ymm dst,Ymm x,Operand y)1850     void Assembler::vsubps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5c, dst,x,y); }
vmulps(Ymm dst,Ymm x,Operand y)1851     void Assembler::vmulps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x59, dst,x,y); }
vdivps(Ymm dst,Ymm x,Operand y)1852     void Assembler::vdivps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5e, dst,x,y); }
vminps(Ymm dst,Ymm x,Operand y)1853     void Assembler::vminps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5d, dst,x,y); }
vmaxps(Ymm dst,Ymm x,Operand y)1854     void Assembler::vmaxps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5f, dst,x,y); }
1855 
vfmadd132ps(Ymm dst,Ymm x,Operand y)1856     void Assembler::vfmadd132ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x98, dst,x,y); }
vfmadd213ps(Ymm dst,Ymm x,Operand y)1857     void Assembler::vfmadd213ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xa8, dst,x,y); }
vfmadd231ps(Ymm dst,Ymm x,Operand y)1858     void Assembler::vfmadd231ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xb8, dst,x,y); }
1859 
vfmsub132ps(Ymm dst,Ymm x,Operand y)1860     void Assembler::vfmsub132ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x9a, dst,x,y); }
vfmsub213ps(Ymm dst,Ymm x,Operand y)1861     void Assembler::vfmsub213ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xaa, dst,x,y); }
vfmsub231ps(Ymm dst,Ymm x,Operand y)1862     void Assembler::vfmsub231ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xba, dst,x,y); }
1863 
vfnmadd132ps(Ymm dst,Ymm x,Operand y)1864     void Assembler::vfnmadd132ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x9c, dst,x,y); }
vfnmadd213ps(Ymm dst,Ymm x,Operand y)1865     void Assembler::vfnmadd213ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xac, dst,x,y); }
vfnmadd231ps(Ymm dst,Ymm x,Operand y)1866     void Assembler::vfnmadd231ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xbc, dst,x,y); }
1867 
vpackusdw(Ymm dst,Ymm x,Operand y)1868     void Assembler::vpackusdw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x2b, dst,x,y); }
vpackuswb(Ymm dst,Ymm x,Operand y)1869     void Assembler::vpackuswb(Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0x67, dst,x,y); }
1870 
vpunpckldq(Ymm dst,Ymm x,Operand y)1871     void Assembler::vpunpckldq(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x62, dst,x,y); }
vpunpckhdq(Ymm dst,Ymm x,Operand y)1872     void Assembler::vpunpckhdq(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x6a, dst,x,y); }
1873 
vpcmpeqd(Ymm dst,Ymm x,Operand y)1874     void Assembler::vpcmpeqd(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x76, dst,x,y); }
vpcmpeqw(Ymm dst,Ymm x,Operand y)1875     void Assembler::vpcmpeqw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x75, dst,x,y); }
vpcmpgtd(Ymm dst,Ymm x,Operand y)1876     void Assembler::vpcmpgtd(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x66, dst,x,y); }
vpcmpgtw(Ymm dst,Ymm x,Operand y)1877     void Assembler::vpcmpgtw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x65, dst,x,y); }
1878 
1879 
imm_byte_after_operand(const Operand & operand,int imm)1880     void Assembler::imm_byte_after_operand(const Operand& operand, int imm) {
1881         // When we've embedded a label displacement in the middle of an instruction,
1882         // we need to tweak it a little so that the resolved displacement starts
1883         // from the end of the instruction and not the end of the displacement.
1884         if (operand.kind == Operand::LABEL && fCode) {
1885             int disp;
1886             memcpy(&disp, fCode+fSize-4, 4);
1887             disp--;
1888             memcpy(fCode+fSize-4, &disp, 4);
1889         }
1890         this->byte(imm);
1891     }
1892 
vcmpps(Ymm dst,Ymm x,Operand y,int imm)1893     void Assembler::vcmpps(Ymm dst, Ymm x, Operand y, int imm) {
1894         this->op(0,0x0f,0xc2, dst,x,y);
1895         this->imm_byte_after_operand(y, imm);
1896     }
1897 
vpblendvb(Ymm dst,Ymm x,Operand y,Ymm z)1898     void Assembler::vpblendvb(Ymm dst, Ymm x, Operand y, Ymm z) {
1899         this->op(0x66,0x3a0f,0x4c, dst,x,y);
1900         this->imm_byte_after_operand(y, z << 4);
1901     }
1902 
1903     // Shift instructions encode their opcode extension as "dst", dst as x, and x as y.
vpslld(Ymm dst,Ymm x,int imm)1904     void Assembler::vpslld(Ymm dst, Ymm x, int imm) {
1905         this->op(0x66,0x0f,0x72,(Ymm)6, dst,x);
1906         this->byte(imm);
1907     }
vpsrld(Ymm dst,Ymm x,int imm)1908     void Assembler::vpsrld(Ymm dst, Ymm x, int imm) {
1909         this->op(0x66,0x0f,0x72,(Ymm)2, dst,x);
1910         this->byte(imm);
1911     }
vpsrad(Ymm dst,Ymm x,int imm)1912     void Assembler::vpsrad(Ymm dst, Ymm x, int imm) {
1913         this->op(0x66,0x0f,0x72,(Ymm)4, dst,x);
1914         this->byte(imm);
1915     }
vpsllw(Ymm dst,Ymm x,int imm)1916     void Assembler::vpsllw(Ymm dst, Ymm x, int imm) {
1917         this->op(0x66,0x0f,0x71,(Ymm)6, dst,x);
1918         this->byte(imm);
1919     }
vpsrlw(Ymm dst,Ymm x,int imm)1920     void Assembler::vpsrlw(Ymm dst, Ymm x, int imm) {
1921         this->op(0x66,0x0f,0x71,(Ymm)2, dst,x);
1922         this->byte(imm);
1923     }
vpsraw(Ymm dst,Ymm x,int imm)1924     void Assembler::vpsraw(Ymm dst, Ymm x, int imm) {
1925         this->op(0x66,0x0f,0x71,(Ymm)4, dst,x);
1926         this->byte(imm);
1927     }
1928 
vpermq(Ymm dst,Operand x,int imm)1929     void Assembler::vpermq(Ymm dst, Operand x, int imm) {
1930         // A bit unusual among the instructions we use, this is 64-bit operation, so we set W.
1931         this->op(0x66,0x3a0f,0x00, dst,x,W1);
1932         this->imm_byte_after_operand(x, imm);
1933     }
1934 
vperm2f128(Ymm dst,Ymm x,Operand y,int imm)1935     void Assembler::vperm2f128(Ymm dst, Ymm x, Operand y, int imm) {
1936         this->op(0x66,0x3a0f,0x06, dst,x,y);
1937         this->imm_byte_after_operand(y, imm);
1938     }
1939 
vpermps(Ymm dst,Ymm ix,Operand src)1940     void Assembler::vpermps(Ymm dst, Ymm ix, Operand src) {
1941         this->op(0x66,0x380f,0x16, dst,ix,src);
1942     }
1943 
vroundps(Ymm dst,Operand x,Rounding imm)1944     void Assembler::vroundps(Ymm dst, Operand x, Rounding imm) {
1945         this->op(0x66,0x3a0f,0x08, dst,x);
1946         this->imm_byte_after_operand(x, imm);
1947     }
1948 
vmovdqa(Ymm dst,Operand src)1949     void Assembler::vmovdqa(Ymm dst, Operand src) { this->op(0x66,0x0f,0x6f, dst,src); }
vmovups(Ymm dst,Operand src)1950     void Assembler::vmovups(Ymm dst, Operand src) { this->op(   0,0x0f,0x10, dst,src); }
vmovups(Xmm dst,Operand src)1951     void Assembler::vmovups(Xmm dst, Operand src) { this->op(   0,0x0f,0x10, dst,src); }
vmovups(Operand dst,Ymm src)1952     void Assembler::vmovups(Operand dst, Ymm src) { this->op(   0,0x0f,0x11, src,dst); }
vmovups(Operand dst,Xmm src)1953     void Assembler::vmovups(Operand dst, Xmm src) { this->op(   0,0x0f,0x11, src,dst); }
1954 
vcvtdq2ps(Ymm dst,Operand x)1955     void Assembler::vcvtdq2ps (Ymm dst, Operand x) { this->op(   0,0x0f,0x5b, dst,x); }
vcvttps2dq(Ymm dst,Operand x)1956     void Assembler::vcvttps2dq(Ymm dst, Operand x) { this->op(0xf3,0x0f,0x5b, dst,x); }
vcvtps2dq(Ymm dst,Operand x)1957     void Assembler::vcvtps2dq (Ymm dst, Operand x) { this->op(0x66,0x0f,0x5b, dst,x); }
vsqrtps(Ymm dst,Operand x)1958     void Assembler::vsqrtps   (Ymm dst, Operand x) { this->op(   0,0x0f,0x51, dst,x); }
1959 
vcvtps2ph(Operand dst,Ymm x,Rounding imm)1960     void Assembler::vcvtps2ph(Operand dst, Ymm x, Rounding imm) {
1961         this->op(0x66,0x3a0f,0x1d, x,dst);
1962         this->imm_byte_after_operand(dst, imm);
1963     }
vcvtph2ps(Ymm dst,Operand x)1964     void Assembler::vcvtph2ps(Ymm dst, Operand x) {
1965         this->op(0x66,0x380f,0x13, dst,x);
1966     }
1967 
disp19(Label * l)1968     int Assembler::disp19(Label* l) {
1969         SkASSERT(l->kind == Label::NotYetSet ||
1970                  l->kind == Label::ARMDisp19);
1971         int here = (int)this->size();
1972         l->kind = Label::ARMDisp19;
1973         l->references.push_back(here);
1974         // ARM 19-bit instruction count, from the beginning of this instruction.
1975         return (l->offset - here) / 4;
1976     }
1977 
disp32(Label * l)1978     int Assembler::disp32(Label* l) {
1979         SkASSERT(l->kind == Label::NotYetSet ||
1980                  l->kind == Label::X86Disp32);
1981         int here = (int)this->size();
1982         l->kind = Label::X86Disp32;
1983         l->references.push_back(here);
1984         // x86 32-bit byte count, from the end of this instruction.
1985         return l->offset - (here + 4);
1986     }
1987 
op(int prefix,int map,int opcode,int dst,int x,Operand y,W w,L l)1988     void Assembler::op(int prefix, int map, int opcode, int dst, int x, Operand y, W w, L l) {
1989         switch (y.kind) {
1990             case Operand::REG: {
1991                 VEX v = vex(w, dst>>3, 0, y.reg>>3,
1992                             map, x, l, prefix);
1993                 this->bytes(v.bytes, v.len);
1994                 this->byte(opcode);
1995                 this->byte(mod_rm(Mod::Direct, dst&7, y.reg&7));
1996             } return;
1997 
1998             case Operand::MEM: {
1999                 // Passing rsp as the rm argument to mod_rm() signals an SIB byte follows;
2000                 // without an SIB byte, that's where the base register would usually go.
2001                 // This means we have to use an SIB byte if we want to use rsp as a base register.
2002                 const Mem& m = y.mem;
2003                 const bool need_SIB = m.base  == rsp
2004                                    || m.index != rsp;
2005 
2006                 VEX v = vex(w, dst>>3, m.index>>3, m.base>>3,
2007                             map, x, l, prefix);
2008                 this->bytes(v.bytes, v.len);
2009                 this->byte(opcode);
2010                 this->byte(mod_rm(mod(m.disp), dst&7, (need_SIB ? rsp : m.base)&7));
2011                 if (need_SIB) {
2012                     this->byte(sib(m.scale, m.index&7, m.base&7));
2013                 }
2014                 this->bytes(&m.disp, imm_bytes(mod(m.disp)));
2015             } return;
2016 
2017             case Operand::LABEL: {
2018                 // IP-relative addressing uses Mod::Indirect with the R/M encoded as-if rbp or r13.
2019                 const int rip = rbp;
2020 
2021                 VEX v = vex(w, dst>>3, 0, rip>>3,
2022                             map, x, l, prefix);
2023                 this->bytes(v.bytes, v.len);
2024                 this->byte(opcode);
2025                 this->byte(mod_rm(Mod::Indirect, dst&7, rip&7));
2026                 this->word(this->disp32(y.label));
2027             } return;
2028         }
2029     }
2030 
vpshufb(Ymm dst,Ymm x,Operand y)2031     void Assembler::vpshufb(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x00, dst,x,y); }
2032 
vptest(Ymm x,Operand y)2033     void Assembler::vptest(Ymm x, Operand y) { this->op(0x66, 0x380f, 0x17, x,y); }
2034 
vbroadcastss(Ymm dst,Operand y)2035     void Assembler::vbroadcastss(Ymm dst, Operand y) { this->op(0x66,0x380f,0x18, dst,y); }
2036 
jump(uint8_t condition,Label * l)2037     void Assembler::jump(uint8_t condition, Label* l) {
2038         // These conditional jumps can be either 2 bytes (short) or 6 bytes (near):
2039         //    7?     one-byte-disp
2040         //    0F 8? four-byte-disp
2041         // We always use the near displacement to make updating labels simpler (no resizing).
2042         this->byte(0x0f);
2043         this->byte(condition);
2044         this->word(this->disp32(l));
2045     }
je(Label * l)2046     void Assembler::je (Label* l) { this->jump(0x84, l); }
jne(Label * l)2047     void Assembler::jne(Label* l) { this->jump(0x85, l); }
jl(Label * l)2048     void Assembler::jl (Label* l) { this->jump(0x8c, l); }
jc(Label * l)2049     void Assembler::jc (Label* l) { this->jump(0x82, l); }
2050 
jmp(Label * l)2051     void Assembler::jmp(Label* l) {
2052         // Like above in jump(), we could use 8-bit displacement here, but always use 32-bit.
2053         this->byte(0xe9);
2054         this->word(this->disp32(l));
2055     }
2056 
vpmovzxwd(Ymm dst,Operand src)2057     void Assembler::vpmovzxwd(Ymm dst, Operand src) { this->op(0x66,0x380f,0x33, dst,src); }
vpmovzxbd(Ymm dst,Operand src)2058     void Assembler::vpmovzxbd(Ymm dst, Operand src) { this->op(0x66,0x380f,0x31, dst,src); }
2059 
vmovq(Operand dst,Xmm src)2060     void Assembler::vmovq(Operand dst, Xmm src) { this->op(0x66,0x0f,0xd6, src,dst); }
2061 
vmovd(Operand dst,Xmm src)2062     void Assembler::vmovd(Operand dst, Xmm src) { this->op(0x66,0x0f,0x7e, src,dst); }
vmovd(Xmm dst,Operand src)2063     void Assembler::vmovd(Xmm dst, Operand src) { this->op(0x66,0x0f,0x6e, dst,src); }
2064 
vpinsrd(Xmm dst,Xmm src,Operand y,int imm)2065     void Assembler::vpinsrd(Xmm dst, Xmm src, Operand y, int imm) {
2066         this->op(0x66,0x3a0f,0x22, dst,src,y);
2067         this->imm_byte_after_operand(y, imm);
2068     }
vpinsrw(Xmm dst,Xmm src,Operand y,int imm)2069     void Assembler::vpinsrw(Xmm dst, Xmm src, Operand y, int imm) {
2070         this->op(0x66,0x0f,0xc4, dst,src,y);
2071         this->imm_byte_after_operand(y, imm);
2072     }
vpinsrb(Xmm dst,Xmm src,Operand y,int imm)2073     void Assembler::vpinsrb(Xmm dst, Xmm src, Operand y, int imm) {
2074         this->op(0x66,0x3a0f,0x20, dst,src,y);
2075         this->imm_byte_after_operand(y, imm);
2076     }
2077 
vextracti128(Operand dst,Ymm src,int imm)2078     void Assembler::vextracti128(Operand dst, Ymm src, int imm) {
2079         this->op(0x66,0x3a0f,0x39, src,dst);
2080         SkASSERT(dst.kind != Operand::LABEL);
2081         this->byte(imm);
2082     }
vpextrd(Operand dst,Xmm src,int imm)2083     void Assembler::vpextrd(Operand dst, Xmm src, int imm) {
2084         this->op(0x66,0x3a0f,0x16, src,dst);
2085         SkASSERT(dst.kind != Operand::LABEL);
2086         this->byte(imm);
2087     }
vpextrw(Operand dst,Xmm src,int imm)2088     void Assembler::vpextrw(Operand dst, Xmm src, int imm) {
2089         this->op(0x66,0x3a0f,0x15, src,dst);
2090         SkASSERT(dst.kind != Operand::LABEL);
2091         this->byte(imm);
2092     }
vpextrb(Operand dst,Xmm src,int imm)2093     void Assembler::vpextrb(Operand dst, Xmm src, int imm) {
2094         this->op(0x66,0x3a0f,0x14, src,dst);
2095         SkASSERT(dst.kind != Operand::LABEL);
2096         this->byte(imm);
2097     }
2098 
vgatherdps(Ymm dst,Scale scale,Ymm ix,GP64 base,Ymm mask)2099     void Assembler::vgatherdps(Ymm dst, Scale scale, Ymm ix, GP64 base, Ymm mask) {
2100         // Unlike most instructions, no aliasing is permitted here.
2101         SkASSERT(dst != ix);
2102         SkASSERT(dst != mask);
2103         SkASSERT(mask != ix);
2104 
2105         int prefix = 0x66,
2106             map    = 0x380f,
2107             opcode = 0x92;
2108         VEX v = vex(0, dst>>3, ix>>3, base>>3,
2109                     map, mask, /*ymm?*/1, prefix);
2110         this->bytes(v.bytes, v.len);
2111         this->byte(opcode);
2112         this->byte(mod_rm(Mod::Indirect, dst&7, rsp/*use SIB*/));
2113         this->byte(sib(scale, ix&7, base&7));
2114     }
2115 
2116     // https://static.docs.arm.com/ddi0596/a/DDI_0596_ARM_a64_instruction_set_architecture.pdf
2117 
operator ""_mask(unsigned long long bits)2118     static int operator"" _mask(unsigned long long bits) { return (1<<(int)bits)-1; }
2119 
op(uint32_t hi,V m,uint32_t lo,V n,V d)2120     void Assembler::op(uint32_t hi, V m, uint32_t lo, V n, V d) {
2121         this->word( (hi & 11_mask) << 21
2122                   | (m  &  5_mask) << 16
2123                   | (lo &  6_mask) << 10
2124                   | (n  &  5_mask) <<  5
2125                   | (d  &  5_mask) <<  0);
2126     }
op(uint32_t op22,V n,V d,int imm)2127     void Assembler::op(uint32_t op22, V n, V d, int imm) {
2128         this->word( (op22 & 22_mask) << 10
2129                   | imm  // size and location depends on the instruction
2130                   | (n    &  5_mask) <<  5
2131                   | (d    &  5_mask) <<  0);
2132     }
2133 
and16b(V d,V n,V m)2134     void Assembler::and16b(V d, V n, V m) { this->op(0b0'1'0'01110'00'1, m, 0b00011'1, n, d); }
orr16b(V d,V n,V m)2135     void Assembler::orr16b(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b00011'1, n, d); }
eor16b(V d,V n,V m)2136     void Assembler::eor16b(V d, V n, V m) { this->op(0b0'1'1'01110'00'1, m, 0b00011'1, n, d); }
bic16b(V d,V n,V m)2137     void Assembler::bic16b(V d, V n, V m) { this->op(0b0'1'0'01110'01'1, m, 0b00011'1, n, d); }
bsl16b(V d,V n,V m)2138     void Assembler::bsl16b(V d, V n, V m) { this->op(0b0'1'1'01110'01'1, m, 0b00011'1, n, d); }
not16b(V d,V n)2139     void Assembler::not16b(V d, V n)      { this->op(0b0'1'1'01110'00'10000'00101'10,  n, d); }
2140 
add4s(V d,V n,V m)2141     void Assembler::add4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b10000'1, n, d); }
sub4s(V d,V n,V m)2142     void Assembler::sub4s(V d, V n, V m) { this->op(0b0'1'1'01110'10'1, m, 0b10000'1, n, d); }
mul4s(V d,V n,V m)2143     void Assembler::mul4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b10011'1, n, d); }
2144 
cmeq4s(V d,V n,V m)2145     void Assembler::cmeq4s(V d, V n, V m) { this->op(0b0'1'1'01110'10'1, m, 0b10001'1, n, d); }
cmgt4s(V d,V n,V m)2146     void Assembler::cmgt4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b0011'0'1, n, d); }
2147 
sub8h(V d,V n,V m)2148     void Assembler::sub8h(V d, V n, V m) { this->op(0b0'1'1'01110'01'1, m, 0b10000'1, n, d); }
mul8h(V d,V n,V m)2149     void Assembler::mul8h(V d, V n, V m) { this->op(0b0'1'0'01110'01'1, m, 0b10011'1, n, d); }
2150 
fadd4s(V d,V n,V m)2151     void Assembler::fadd4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11010'1, n, d); }
fsub4s(V d,V n,V m)2152     void Assembler::fsub4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11010'1, n, d); }
fmul4s(V d,V n,V m)2153     void Assembler::fmul4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b11011'1, n, d); }
fdiv4s(V d,V n,V m)2154     void Assembler::fdiv4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b11111'1, n, d); }
fmin4s(V d,V n,V m)2155     void Assembler::fmin4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11110'1, n, d); }
fmax4s(V d,V n,V m)2156     void Assembler::fmax4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11110'1, n, d); }
2157 
fneg4s(V d,V n)2158     void Assembler::fneg4s (V d, V n) { this->op(0b0'1'1'01110'1'0'10000'01111'10, n,d); }
fsqrt4s(V d,V n)2159     void Assembler::fsqrt4s(V d, V n) { this->op(0b0'1'1'01110'1'0'10000'11111'10, n,d); }
2160 
fcmeq4s(V d,V n,V m)2161     void Assembler::fcmeq4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b1110'0'1, n, d); }
fcmgt4s(V d,V n,V m)2162     void Assembler::fcmgt4s(V d, V n, V m) { this->op(0b0'1'1'01110'1'0'1, m, 0b1110'0'1, n, d); }
fcmge4s(V d,V n,V m)2163     void Assembler::fcmge4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b1110'0'1, n, d); }
2164 
fmla4s(V d,V n,V m)2165     void Assembler::fmla4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11001'1, n, d); }
fmls4s(V d,V n,V m)2166     void Assembler::fmls4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11001'1, n, d); }
2167 
tbl(V d,V n,V m)2168     void Assembler::tbl(V d, V n, V m) { this->op(0b0'1'001110'00'0, m, 0b0'00'0'00, n, d); }
2169 
uzp14s(V d,V n,V m)2170     void Assembler::uzp14s(V d, V n, V m) { this->op(0b0'1'001110'10'0, m, 0b0'0'01'10, n, d); }
uzp24s(V d,V n,V m)2171     void Assembler::uzp24s(V d, V n, V m) { this->op(0b0'1'001110'10'0, m, 0b0'1'01'10, n, d); }
zip14s(V d,V n,V m)2172     void Assembler::zip14s(V d, V n, V m) { this->op(0b0'1'001110'10'0, m, 0b0'0'11'10, n, d); }
zip24s(V d,V n,V m)2173     void Assembler::zip24s(V d, V n, V m) { this->op(0b0'1'001110'10'0, m, 0b0'1'11'10, n, d); }
2174 
sli4s(V d,V n,int imm5)2175     void Assembler::sli4s(V d, V n, int imm5) {
2176         this->op(0b0'1'1'011110'0100'000'01010'1,    n, d, ( imm5 & 5_mask)<<16);
2177     }
shl4s(V d,V n,int imm5)2178     void Assembler::shl4s(V d, V n, int imm5) {
2179         this->op(0b0'1'0'011110'0100'000'01010'1,    n, d, ( imm5 & 5_mask)<<16);
2180     }
sshr4s(V d,V n,int imm5)2181     void Assembler::sshr4s(V d, V n, int imm5) {
2182         this->op(0b0'1'0'011110'0100'000'00'0'0'0'1, n, d, (-imm5 & 5_mask)<<16);
2183     }
ushr4s(V d,V n,int imm5)2184     void Assembler::ushr4s(V d, V n, int imm5) {
2185         this->op(0b0'1'1'011110'0100'000'00'0'0'0'1, n, d, (-imm5 & 5_mask)<<16);
2186     }
ushr8h(V d,V n,int imm4)2187     void Assembler::ushr8h(V d, V n, int imm4) {
2188         this->op(0b0'1'1'011110'0010'000'00'0'0'0'1, n, d, (-imm4 & 4_mask)<<16);
2189     }
2190 
scvtf4s(V d,V n)2191     void Assembler::scvtf4s (V d, V n) { this->op(0b0'1'0'01110'0'0'10000'11101'10, n,d); }
fcvtzs4s(V d,V n)2192     void Assembler::fcvtzs4s(V d, V n) { this->op(0b0'1'0'01110'1'0'10000'1101'1'10, n,d); }
fcvtns4s(V d,V n)2193     void Assembler::fcvtns4s(V d, V n) { this->op(0b0'1'0'01110'0'0'10000'1101'0'10, n,d); }
frintp4s(V d,V n)2194     void Assembler::frintp4s(V d, V n) { this->op(0b0'1'0'01110'1'0'10000'1100'0'10, n,d); }
frintm4s(V d,V n)2195     void Assembler::frintm4s(V d, V n) { this->op(0b0'1'0'01110'0'0'10000'1100'1'10, n,d); }
2196 
fcvtn(V d,V n)2197     void Assembler::fcvtn(V d, V n) { this->op(0b0'0'0'01110'0'0'10000'10110'10, n,d); }
fcvtl(V d,V n)2198     void Assembler::fcvtl(V d, V n) { this->op(0b0'0'0'01110'0'0'10000'10111'10, n,d); }
2199 
xtns2h(V d,V n)2200     void Assembler::xtns2h(V d, V n) { this->op(0b0'0'0'01110'01'10000'10010'10, n,d); }
xtnh2b(V d,V n)2201     void Assembler::xtnh2b(V d, V n) { this->op(0b0'0'0'01110'00'10000'10010'10, n,d); }
2202 
uxtlb2h(V d,V n)2203     void Assembler::uxtlb2h(V d, V n) { this->op(0b0'0'1'011110'0001'000'10100'1, n,d); }
uxtlh2s(V d,V n)2204     void Assembler::uxtlh2s(V d, V n) { this->op(0b0'0'1'011110'0010'000'10100'1, n,d); }
2205 
uminv4s(V d,V n)2206     void Assembler::uminv4s(V d, V n) { this->op(0b0'1'1'01110'10'11000'1'1010'10, n,d); }
2207 
brk(int imm16)2208     void Assembler::brk(int imm16) {
2209         this->op(0b11010100'001'00000000000, (imm16 & 16_mask) << 5);
2210     }
2211 
ret(X n)2212     void Assembler::ret(X n) { this->op(0b1101011'0'0'10'11111'0000'0'0, n, (X)0); }
2213 
add(X d,X n,int imm12)2214     void Assembler::add(X d, X n, int imm12) {
2215         this->op(0b1'0'0'10001'00'000000000000, n,d, (imm12 & 12_mask) << 10);
2216     }
sub(X d,X n,int imm12)2217     void Assembler::sub(X d, X n, int imm12) {
2218         this->op(0b1'1'0'10001'00'000000000000, n,d, (imm12 & 12_mask) << 10);
2219     }
subs(X d,X n,int imm12)2220     void Assembler::subs(X d, X n, int imm12) {
2221         this->op(0b1'1'1'10001'00'000000000000, n,d, (imm12 & 12_mask) << 10);
2222     }
2223 
add(X d,X n,X m,Shift shift,int imm6)2224     void Assembler::add(X d, X n, X m, Shift shift, int imm6) {
2225         SkASSERT(shift != ROR);
2226 
2227         int imm = (imm6  & 6_mask) << 0
2228                 | (m     & 5_mask) << 6
2229                 | (0     & 1_mask) << 11
2230                 | (shift & 2_mask) << 12;
2231         this->op(0b1'0'0'01011'00'0'00000'000000, n,d, imm << 10);
2232     }
2233 
b(Condition cond,Label * l)2234     void Assembler::b(Condition cond, Label* l) {
2235         const int imm19 = this->disp19(l);
2236         this->op(0b0101010'0'00000000000000, (X)0, (V)cond, (imm19 & 19_mask) << 5);
2237     }
cbz(X t,Label * l)2238     void Assembler::cbz(X t, Label* l) {
2239         const int imm19 = this->disp19(l);
2240         this->op(0b1'011010'0'00000000000000, (X)0, t, (imm19 & 19_mask) << 5);
2241     }
cbnz(X t,Label * l)2242     void Assembler::cbnz(X t, Label* l) {
2243         const int imm19 = this->disp19(l);
2244         this->op(0b1'011010'1'00000000000000, (X)0, t, (imm19 & 19_mask) << 5);
2245     }
2246 
ldrd(X dst,X src,int imm12)2247     void Assembler::ldrd(X dst, X src, int imm12) {
2248         this->op(0b11'111'0'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10);
2249     }
ldrs(X dst,X src,int imm12)2250     void Assembler::ldrs(X dst, X src, int imm12) {
2251         this->op(0b10'111'0'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10);
2252     }
ldrh(X dst,X src,int imm12)2253     void Assembler::ldrh(X dst, X src, int imm12) {
2254         this->op(0b01'111'0'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10);
2255     }
ldrb(X dst,X src,int imm12)2256     void Assembler::ldrb(X dst, X src, int imm12) {
2257         this->op(0b00'111'0'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10);
2258     }
2259 
ldrq(V dst,X src,int imm12)2260     void Assembler::ldrq(V dst, X src, int imm12) {
2261         this->op(0b00'111'1'01'11'000000000000, src, dst, (imm12 & 12_mask) << 10);
2262     }
ldrd(V dst,X src,int imm12)2263     void Assembler::ldrd(V dst, X src, int imm12) {
2264         this->op(0b11'111'1'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10);
2265     }
ldrs(V dst,X src,int imm12)2266     void Assembler::ldrs(V dst, X src, int imm12) {
2267         this->op(0b10'111'1'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10);
2268     }
ldrh(V dst,X src,int imm12)2269     void Assembler::ldrh(V dst, X src, int imm12) {
2270         this->op(0b01'111'1'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10);
2271     }
ldrb(V dst,X src,int imm12)2272     void Assembler::ldrb(V dst, X src, int imm12) {
2273         this->op(0b00'111'1'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10);
2274     }
2275 
strs(X src,X dst,int imm12)2276     void Assembler::strs(X src, X dst, int imm12) {
2277         this->op(0b10'111'0'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10);
2278     }
2279 
strq(V src,X dst,int imm12)2280     void Assembler::strq(V src, X dst, int imm12) {
2281         this->op(0b00'111'1'01'10'000000000000, dst, src, (imm12 & 12_mask) << 10);
2282     }
strd(V src,X dst,int imm12)2283     void Assembler::strd(V src, X dst, int imm12) {
2284         this->op(0b11'111'1'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10);
2285     }
strs(V src,X dst,int imm12)2286     void Assembler::strs(V src, X dst, int imm12) {
2287         this->op(0b10'111'1'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10);
2288     }
strh(V src,X dst,int imm12)2289     void Assembler::strh(V src, X dst, int imm12) {
2290         this->op(0b01'111'1'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10);
2291     }
strb(V src,X dst,int imm12)2292     void Assembler::strb(V src, X dst, int imm12) {
2293         this->op(0b00'111'1'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10);
2294     }
2295 
movs(X dst,V src,int lane)2296     void Assembler::movs(X dst, V src, int lane) {
2297         int imm5 = (lane << 3) | 0b100;
2298         this->op(0b0'0'0'01110000'00000'0'01'1'1'1, src, dst, (imm5 & 5_mask) << 16);
2299     }
inss(V dst,X src,int lane)2300     void Assembler::inss(V dst, X src, int lane) {
2301         int imm5 = (lane << 3) | 0b100;
2302         this->op(0b0'1'0'01110000'00000'0'0011'1, src, dst, (imm5 & 5_mask) << 16);
2303     }
2304 
2305 
ldrq(V dst,Label * l)2306     void Assembler::ldrq(V dst, Label* l) {
2307         const int imm19 = this->disp19(l);
2308         this->op(0b10'011'1'00'00000000000000, (V)0, dst, (imm19 & 19_mask) << 5);
2309     }
2310 
dup4s(V dst,X src)2311     void Assembler::dup4s(V dst, X src) {
2312         this->op(0b0'1'0'01110000'00100'0'0001'1, src, dst);
2313     }
2314 
ld1r4s(V dst,X src)2315     void Assembler::ld1r4s(V dst, X src) {
2316         this->op(0b0'1'0011010'1'0'00000'110'0'10, src, dst);
2317     }
ld1r8h(V dst,X src)2318     void Assembler::ld1r8h(V dst, X src) {
2319         this->op(0b0'1'0011010'1'0'00000'110'0'01, src, dst);
2320     }
ld1r16b(V dst,X src)2321     void Assembler::ld1r16b(V dst, X src) {
2322         this->op(0b0'1'0011010'1'0'00000'110'0'00, src, dst);
2323     }
2324 
ld24s(V dst,X src)2325     void Assembler::ld24s(V dst, X src) { this->op(0b0'1'0011000'1'000000'1000'10, src, dst); }
ld44s(V dst,X src)2326     void Assembler::ld44s(V dst, X src) { this->op(0b0'1'0011000'1'000000'0000'10, src, dst); }
st24s(V src,X dst)2327     void Assembler::st24s(V src, X dst) { this->op(0b0'1'0011000'0'000000'1000'10, dst, src); }
st44s(V src,X dst)2328     void Assembler::st44s(V src, X dst) { this->op(0b0'1'0011000'0'000000'0000'10, dst, src); }
2329 
ld24s(V dst,X src,int lane)2330     void Assembler::ld24s(V dst, X src, int lane) {
2331         int Q = (lane & 2)>>1,
2332             S = (lane & 1);
2333                  /*  Q                       S */
2334         this->op(0b0'0'0011010'1'1'00000'100'0'00, src, dst, (Q<<30)|(S<<12));
2335     }
ld44s(V dst,X src,int lane)2336     void Assembler::ld44s(V dst, X src, int lane) {
2337         int Q = (lane & 2)>>1,
2338             S = (lane & 1);
2339         this->op(0b0'0'0011010'1'1'00000'101'0'00, src, dst, (Q<<30)|(S<<12));
2340     }
2341 
label(Label * l)2342     void Assembler::label(Label* l) {
2343         if (fCode) {
2344             // The instructions all currently point to l->offset.
2345             // We'll want to add a delta to point them to here.
2346             int here = (int)this->size();
2347             int delta = here - l->offset;
2348             l->offset = here;
2349 
2350             if (l->kind == Label::ARMDisp19) {
2351                 for (int ref : l->references) {
2352                     // ref points to a 32-bit instruction with 19-bit displacement in instructions.
2353                     uint32_t inst;
2354                     memcpy(&inst, fCode + ref, 4);
2355 
2356                     // [ 8 bits to preserve] [ 19 bit signed displacement ] [ 5 bits to preserve ]
2357                     int disp = (int)(inst << 8) >> 13;
2358 
2359                     disp += delta/4;  // delta is in bytes, we want instructions.
2360 
2361                     // Put it all back together, preserving the high 8 bits and low 5.
2362                     inst = ((disp << 5) &  (19_mask << 5))
2363                          | ((inst     ) & ~(19_mask << 5));
2364 
2365                     memcpy(fCode + ref, &inst, 4);
2366                 }
2367             }
2368 
2369             if (l->kind == Label::X86Disp32) {
2370                 for (int ref : l->references) {
2371                     // ref points to a 32-bit displacement in bytes.
2372                     int disp;
2373                     memcpy(&disp, fCode + ref, 4);
2374 
2375                     disp += delta;
2376 
2377                     memcpy(fCode + ref, &disp, 4);
2378                 }
2379             }
2380         }
2381     }
2382 
eval(int n,void * args[]) const2383     void Program::eval(int n, void* args[]) const {
2384     #define SKVM_JIT_STATS 0
2385     #if SKVM_JIT_STATS
2386         static std::atomic<int64_t>  calls{0}, jits{0},
2387                                     pixels{0}, fast{0};
2388         pixels += n;
2389         if (0 == calls++) {
2390             atexit([]{
2391                 int64_t num = jits .load(),
2392                         den = calls.load();
2393                 SkDebugf("%.3g%% of %lld eval() calls went through JIT.\n", (100.0 * num)/den, den);
2394                 num = fast  .load();
2395                 den = pixels.load();
2396                 SkDebugf("%.3g%% of %lld pixels went through JIT.\n", (100.0 * num)/den, den);
2397             });
2398         }
2399     #endif
2400 
2401     #if !defined(SKVM_JIT_BUT_IGNORE_IT)
2402         const void* jit_entry = fImpl->jit_entry.load();
2403         // jit_entry may be null either simply because we can't JIT, or when using LLVM
2404         // if the work represented by fImpl->llvm_compiling hasn't finished yet.
2405         //
2406         // Ordinarily we'd never find ourselves with non-null jit_entry and !gSkVMAllowJIT, but it
2407         // can happen during interactive programs like Viewer that toggle gSkVMAllowJIT on and off,
2408         // due to timing or program caching.
2409         if (jit_entry != nullptr && gSkVMAllowJIT) {
2410         #if SKVM_JIT_STATS
2411             jits++;
2412             fast += n;
2413         #endif
2414             void** a = args;
2415             switch (fImpl->strides.size()) {
2416                 case 0: return ((void(*)(int                        ))jit_entry)(n               );
2417                 case 1: return ((void(*)(int,void*                  ))jit_entry)(n,a[0]          );
2418                 case 2: return ((void(*)(int,void*,void*            ))jit_entry)(n,a[0],a[1]     );
2419                 case 3: return ((void(*)(int,void*,void*,void*      ))jit_entry)(n,a[0],a[1],a[2]);
2420                 case 4: return ((void(*)(int,void*,void*,void*,void*))jit_entry)
2421                                 (n,a[0],a[1],a[2],a[3]);
2422                 case 5: return ((void(*)(int,void*,void*,void*,void*,void*))jit_entry)
2423                                 (n,a[0],a[1],a[2],a[3],a[4]);
2424                 case 6: return ((void(*)(int,void*,void*,void*,void*,void*,void*))jit_entry)
2425                                 (n,a[0],a[1],a[2],a[3],a[4],a[5]);
2426                 case 7: return ((void(*)(int,void*,void*,void*,void*,void*,void*,void*))jit_entry)
2427                                 (n,a[0],a[1],a[2],a[3],a[4],a[5],a[6]);
2428                 default: SkASSERT(fImpl->strides.size() <= 7);
2429             }
2430         }
2431     #endif
2432 
2433         // So we'll sometimes use the interpreter here even if later calls will use the JIT.
2434         SkOpts::interpret_skvm(fImpl->instructions.data(), (int)fImpl->instructions.size(),
2435                                this->nregs(), this->loop(), fImpl->strides.data(), this->nargs(),
2436                                n, args);
2437     }
2438 
2439 #if defined(SKVM_LLVM)
setupLLVM(const std::vector<OptimizedInstruction> & instructions,const char * debug_name)2440     void Program::setupLLVM(const std::vector<OptimizedInstruction>& instructions,
2441                             const char* debug_name) {
2442         auto ctx = std::make_unique<llvm::LLVMContext>();
2443 
2444         auto mod = std::make_unique<llvm::Module>("", *ctx);
2445         // All the scary bare pointers from here on are owned by ctx or mod, I think.
2446 
2447         // Everything I've tested runs faster at K=8 (using ymm) than K=16 (zmm) on SKX machines.
2448         const int K = (true && SkCpu::Supports(SkCpu::HSW)) ? 8 : 4;
2449 
2450         llvm::Type *ptr = llvm::Type::getInt8Ty(*ctx)->getPointerTo(),
2451                    *i32 = llvm::Type::getInt32Ty(*ctx);
2452 
2453         std::vector<llvm::Type*> arg_types = { i32 };
2454         for (size_t i = 0; i < fImpl->strides.size(); i++) {
2455             arg_types.push_back(ptr);
2456         }
2457 
2458         llvm::FunctionType* fn_type = llvm::FunctionType::get(llvm::Type::getVoidTy(*ctx),
2459                                                               arg_types, /*vararg?=*/false);
2460         llvm::Function* fn
2461             = llvm::Function::Create(fn_type, llvm::GlobalValue::ExternalLinkage, debug_name, *mod);
2462         for (size_t i = 0; i < fImpl->strides.size(); i++) {
2463             fn->addParamAttr(i+1, llvm::Attribute::NoAlias);
2464         }
2465 
2466         llvm::BasicBlock *enter  = llvm::BasicBlock::Create(*ctx, "enter" , fn),
2467                          *hoistK = llvm::BasicBlock::Create(*ctx, "hoistK", fn),
2468                          *testK  = llvm::BasicBlock::Create(*ctx, "testK" , fn),
2469                          *loopK  = llvm::BasicBlock::Create(*ctx, "loopK" , fn),
2470                          *hoist1 = llvm::BasicBlock::Create(*ctx, "hoist1", fn),
2471                          *test1  = llvm::BasicBlock::Create(*ctx, "test1" , fn),
2472                          *loop1  = llvm::BasicBlock::Create(*ctx, "loop1" , fn),
2473                          *leave  = llvm::BasicBlock::Create(*ctx, "leave" , fn);
2474 
2475         using IRBuilder = llvm::IRBuilder<>;
2476 
2477         llvm::PHINode*                 n;
2478         std::vector<llvm::PHINode*> args;
2479         std::vector<llvm::Value*> vals(instructions.size());
2480 
2481         auto emit = [&](size_t i, bool scalar, IRBuilder* b) {
2482             auto [op, x,y,z,w, immA,immB, death,can_hoist] = instructions[i];
2483 
2484             llvm::Type *i1    = llvm::Type::getInt1Ty (*ctx),
2485                        *i8    = llvm::Type::getInt8Ty (*ctx),
2486                        *i16   = llvm::Type::getInt16Ty(*ctx),
2487                        *f32   = llvm::Type::getFloatTy(*ctx),
2488                        *I1    = scalar ? i1    : llvm::VectorType::get(i1 , K  ),
2489                        *I8    = scalar ? i8    : llvm::VectorType::get(i8 , K  ),
2490                        *I16   = scalar ? i16   : llvm::VectorType::get(i16, K  ),
2491                        *I32   = scalar ? i32   : llvm::VectorType::get(i32, K  ),
2492                        *F32   = scalar ? f32   : llvm::VectorType::get(f32, K  );
2493 
2494             auto I  = [&](llvm::Value* v) { return b->CreateBitCast(v, I32  ); };
2495             auto F  = [&](llvm::Value* v) { return b->CreateBitCast(v, F32  ); };
2496 
2497             auto S = [&](llvm::Type* dst, llvm::Value* v) { return b->CreateSExt(v, dst); };
2498 
2499             switch (llvm::Type* t = nullptr; op) {
2500                 default:
2501                     SkDebugf("can't llvm %s (%d)\n", name(op), op);
2502                     return false;
2503 
2504                 case Op::assert_true: /*TODO*/ break;
2505 
2506                 case Op::index:
2507                     if (I32->isVectorTy()) {
2508                         std::vector<llvm::Constant*> iota(K);
2509                         for (int j = 0; j < K; j++) {
2510                             iota[j] = b->getInt32(j);
2511                         }
2512                         vals[i] = b->CreateSub(b->CreateVectorSplat(K, n),
2513                                                llvm::ConstantVector::get(iota));
2514                     } else {
2515                         vals[i] = n;
2516                     } break;
2517 
2518                 case Op::load8:  t = I8 ; goto load;
2519                 case Op::load16: t = I16; goto load;
2520                 case Op::load32: t = I32; goto load;
2521                 load: {
2522                     llvm::Value* ptr = b->CreateBitCast(args[immA], t->getPointerTo());
2523                     vals[i] = b->CreateZExt(b->CreateAlignedLoad(ptr, 1), I32);
2524                 } break;
2525 
2526 
2527                 case Op::splat: vals[i] = llvm::ConstantInt::get(I32, immA); break;
2528 
2529                 case Op::uniform32: {
2530                     llvm::Value* ptr = b->CreateBitCast(b->CreateConstInBoundsGEP1_32(nullptr,
2531                                                                                       args[immA],
2532                                                                                       immB),
2533                                                         i32->getPointerTo());
2534                     llvm::Value* val = b->CreateZExt(b->CreateAlignedLoad(ptr, 1), i32);
2535                     vals[i] = I32->isVectorTy() ? b->CreateVectorSplat(K, val)
2536                                                 : val;
2537                 } break;
2538 
2539                 case Op::gather8:  t = i8 ; goto gather;
2540                 case Op::gather16: t = i16; goto gather;
2541                 case Op::gather32: t = i32; goto gather;
2542                 gather: {
2543                     // Our gather base pointer is immB bytes off of uniform immA.
2544                     llvm::Value* base =
2545                         b->CreateLoad(b->CreateBitCast(b->CreateConstInBoundsGEP1_32(nullptr,
2546                                                                                      args[immA],
2547                                                                                      immB),
2548                                                        t->getPointerTo()->getPointerTo()));
2549 
2550                     llvm::Value* ptr = b->CreateInBoundsGEP(nullptr, base, vals[x]);
2551                     llvm::Value* gathered;
2552                     if (ptr->getType()->isVectorTy()) {
2553                         gathered = b->CreateMaskedGather(ptr, 1);
2554                     } else {
2555                         gathered = b->CreateAlignedLoad(ptr, 1);
2556                     }
2557                     vals[i] = b->CreateZExt(gathered, I32);
2558                 } break;
2559 
2560                 case Op::store8:  t = I8 ; goto store;
2561                 case Op::store16: t = I16; goto store;
2562                 case Op::store32: t = I32; goto store;
2563                 store: {
2564                     llvm::Value* val = b->CreateTrunc(vals[x], t);
2565                     llvm::Value* ptr = b->CreateBitCast(args[immA],
2566                                                         val->getType()->getPointerTo());
2567                     vals[i] = b->CreateAlignedStore(val, ptr, 1);
2568                 } break;
2569 
2570                 case Op::bit_and:   vals[i] = b->CreateAnd(vals[x], vals[y]); break;
2571                 case Op::bit_or :   vals[i] = b->CreateOr (vals[x], vals[y]); break;
2572                 case Op::bit_xor:   vals[i] = b->CreateXor(vals[x], vals[y]); break;
2573                 case Op::bit_clear: vals[i] = b->CreateAnd(vals[x], b->CreateNot(vals[y])); break;
2574 
2575                 case Op::select:
2576                     vals[i] = b->CreateSelect(b->CreateTrunc(vals[x], I1), vals[y], vals[z]);
2577                     break;
2578 
2579                 case Op::add_i32: vals[i] = b->CreateAdd(vals[x], vals[y]); break;
2580                 case Op::sub_i32: vals[i] = b->CreateSub(vals[x], vals[y]); break;
2581                 case Op::mul_i32: vals[i] = b->CreateMul(vals[x], vals[y]); break;
2582 
2583                 case Op::shl_i32: vals[i] = b->CreateShl (vals[x], immA); break;
2584                 case Op::sra_i32: vals[i] = b->CreateAShr(vals[x], immA); break;
2585                 case Op::shr_i32: vals[i] = b->CreateLShr(vals[x], immA); break;
2586 
2587                 case Op:: eq_i32: vals[i] = S(I32, b->CreateICmpEQ (vals[x], vals[y])); break;
2588                 case Op:: gt_i32: vals[i] = S(I32, b->CreateICmpSGT(vals[x], vals[y])); break;
2589 
2590                 case Op::add_f32: vals[i] = I(b->CreateFAdd(F(vals[x]), F(vals[y]))); break;
2591                 case Op::sub_f32: vals[i] = I(b->CreateFSub(F(vals[x]), F(vals[y]))); break;
2592                 case Op::mul_f32: vals[i] = I(b->CreateFMul(F(vals[x]), F(vals[y]))); break;
2593                 case Op::div_f32: vals[i] = I(b->CreateFDiv(F(vals[x]), F(vals[y]))); break;
2594 
2595                 case Op:: eq_f32: vals[i] = S(I32, b->CreateFCmpOEQ(F(vals[x]), F(vals[y]))); break;
2596                 case Op::neq_f32: vals[i] = S(I32, b->CreateFCmpUNE(F(vals[x]), F(vals[y]))); break;
2597                 case Op:: gt_f32: vals[i] = S(I32, b->CreateFCmpOGT(F(vals[x]), F(vals[y]))); break;
2598                 case Op::gte_f32: vals[i] = S(I32, b->CreateFCmpOGE(F(vals[x]), F(vals[y]))); break;
2599 
2600                 case Op::fma_f32:
2601                     vals[i] = I(b->CreateIntrinsic(llvm::Intrinsic::fma, {F32},
2602                                                    {F(vals[x]), F(vals[y]), F(vals[z])}));
2603                     break;
2604 
2605                 case Op::fms_f32:
2606                     vals[i] = I(b->CreateIntrinsic(llvm::Intrinsic::fma, {F32},
2607                                                    {F(vals[x]), F(vals[y]),
2608                                                     b->CreateFNeg(F(vals[z]))}));
2609                     break;
2610 
2611                 case Op::fnma_f32:
2612                     vals[i] = I(b->CreateIntrinsic(llvm::Intrinsic::fma, {F32},
2613                                                    {b->CreateFNeg(F(vals[x])), F(vals[y]),
2614                                                     F(vals[z])}));
2615                     break;
2616 
2617                 case Op::ceil:
2618                     vals[i] = I(b->CreateUnaryIntrinsic(llvm::Intrinsic::ceil, F(vals[x])));
2619                     break;
2620                 case Op::floor:
2621                     vals[i] = I(b->CreateUnaryIntrinsic(llvm::Intrinsic::floor, F(vals[x])));
2622                     break;
2623 
2624                 case Op::max_f32:
2625                     vals[i] = I(b->CreateSelect(b->CreateFCmpOLT(F(vals[x]), F(vals[y])),
2626                                                 F(vals[y]), F(vals[x])));
2627                     break;
2628                 case Op::min_f32:
2629                     vals[i] = I(b->CreateSelect(b->CreateFCmpOLT(F(vals[y]), F(vals[x])),
2630                                                 F(vals[y]), F(vals[x])));
2631                     break;
2632 
2633                 case Op::sqrt_f32:
2634                     vals[i] = I(b->CreateUnaryIntrinsic(llvm::Intrinsic::sqrt, F(vals[x])));
2635                     break;
2636 
2637                 case Op::to_f32: vals[i] = I(b->CreateSIToFP(  vals[x] , F32)); break;
2638                 case Op::trunc : vals[i] =   b->CreateFPToSI(F(vals[x]), I32) ; break;
2639                 case Op::round : {
2640                     // Basic impl when we can't use cvtps2dq and co.
2641                     auto round = b->CreateUnaryIntrinsic(llvm::Intrinsic::rint, F(vals[x]));
2642                     vals[i] = b->CreateFPToSI(round, I32);
2643 
2644                 #if 1 && defined(SK_CPU_X86)
2645                     // Using b->CreateIntrinsic(..., {}, {...}) to avoid name mangling.
2646                     if (scalar) {
2647                         // cvtss2si is float x4 -> int, ignoring input lanes 1,2,3.  ¯\_(ツ)_/¯
2648                         llvm::Value* v = llvm::UndefValue::get(llvm::VectorType::get(f32, 4));
2649                         v = b->CreateInsertElement(v, F(vals[x]), (uint64_t)0);
2650                         vals[i] = b->CreateIntrinsic(llvm::Intrinsic::x86_sse_cvtss2si, {}, {v});
2651                     } else {
2652                         SkASSERT(K == 4  || K == 8);
2653                         auto intr = K == 4 ?   llvm::Intrinsic::x86_sse2_cvtps2dq :
2654                                  /* K == 8 ?*/ llvm::Intrinsic::x86_avx_cvt_ps2dq_256;
2655                         vals[i] = b->CreateIntrinsic(intr, {}, {F(vals[x])});
2656                     }
2657                 #endif
2658                 } break;
2659 
2660             }
2661             return true;
2662         };
2663 
2664         {
2665             IRBuilder b(enter);
2666             b.CreateBr(hoistK);
2667         }
2668 
2669         // hoistK: emit each hoistable vector instruction; goto testK;
2670         // LLVM can do this sort of thing itself, but we've got the information cheap,
2671         // and pointer aliasing makes it easier to manually hoist than teach LLVM it's safe.
2672         {
2673             IRBuilder b(hoistK);
2674 
2675             // Hoisted instructions will need args (think, uniforms), so set that up now.
2676             // These phi nodes are degenerate... they'll always be the passed-in args from enter.
2677             // Later on when we start looping the phi nodes will start looking useful.
2678             llvm::Argument* arg = fn->arg_begin();
2679             (void)arg++;  // Leave n as nullptr... it'd be a bug to use n in a hoisted instruction.
2680             for (size_t i = 0; i < fImpl->strides.size(); i++) {
2681                 args.push_back(b.CreatePHI(arg->getType(), 1));
2682                 args.back()->addIncoming(arg++, enter);
2683             }
2684 
2685             for (size_t i = 0; i < instructions.size(); i++) {
2686                 if (instructions[i].can_hoist && !emit(i, false, &b)) {
2687                     return;
2688                 }
2689             }
2690 
2691             b.CreateBr(testK);
2692         }
2693 
2694         // testK:  if (N >= K) goto loopK; else goto hoist1;
2695         {
2696             IRBuilder b(testK);
2697 
2698             // New phi nodes for `n` and each pointer argument from hoistK; later we'll add loopK.
2699             // These also start as the initial function arguments; hoistK can't have changed them.
2700             llvm::Argument* arg = fn->arg_begin();
2701 
2702             n = b.CreatePHI(arg->getType(), 2);
2703             n->addIncoming(arg++, hoistK);
2704 
2705             for (size_t i = 0; i < fImpl->strides.size(); i++) {
2706                 args[i] = b.CreatePHI(arg->getType(), 2);
2707                 args[i]->addIncoming(arg++, hoistK);
2708             }
2709 
2710             b.CreateCondBr(b.CreateICmpSGE(n, b.getInt32(K)), loopK, hoist1);
2711         }
2712 
2713         // loopK:  ... insts on K x T vectors; N -= K, args += K*stride; goto testK;
2714         {
2715             IRBuilder b(loopK);
2716             for (size_t i = 0; i < instructions.size(); i++) {
2717                 if (!instructions[i].can_hoist && !emit(i, false, &b)) {
2718                     return;
2719                 }
2720             }
2721 
2722             // n -= K
2723             llvm::Value* n_next = b.CreateSub(n, b.getInt32(K));
2724             n->addIncoming(n_next, loopK);
2725 
2726             // Each arg ptr += K
2727             for (size_t i = 0; i < fImpl->strides.size(); i++) {
2728                 llvm::Value* arg_next
2729                     = b.CreateConstInBoundsGEP1_32(nullptr, args[i], K*fImpl->strides[i]);
2730                 args[i]->addIncoming(arg_next, loopK);
2731             }
2732             b.CreateBr(testK);
2733         }
2734 
2735         // hoist1: emit each hoistable scalar instruction; goto test1;
2736         {
2737             IRBuilder b(hoist1);
2738             for (size_t i = 0; i < instructions.size(); i++) {
2739                 if (instructions[i].can_hoist && !emit(i, true, &b)) {
2740                     return;
2741                 }
2742             }
2743             b.CreateBr(test1);
2744         }
2745 
2746         // test1:  if (N >= 1) goto loop1; else goto leave;
2747         {
2748             IRBuilder b(test1);
2749 
2750             // Set up new phi nodes for `n` and each pointer argument, now from hoist1 and loop1.
2751             llvm::PHINode* n_new = b.CreatePHI(n->getType(), 2);
2752             n_new->addIncoming(n, hoist1);
2753             n = n_new;
2754 
2755             for (size_t i = 0; i < fImpl->strides.size(); i++) {
2756                 llvm::PHINode* arg_new = b.CreatePHI(args[i]->getType(), 2);
2757                 arg_new->addIncoming(args[i], hoist1);
2758                 args[i] = arg_new;
2759             }
2760 
2761             b.CreateCondBr(b.CreateICmpSGE(n, b.getInt32(1)), loop1, leave);
2762         }
2763 
2764         // loop1:  ... insts on scalars; N -= 1, args += stride; goto test1;
2765         {
2766             IRBuilder b(loop1);
2767             for (size_t i = 0; i < instructions.size(); i++) {
2768                 if (!instructions[i].can_hoist && !emit(i, true, &b)) {
2769                     return;
2770                 }
2771             }
2772 
2773             // n -= 1
2774             llvm::Value* n_next = b.CreateSub(n, b.getInt32(1));
2775             n->addIncoming(n_next, loop1);
2776 
            // Each arg ptr += 1 (scaled by that arg's stride)
2778             for (size_t i = 0; i < fImpl->strides.size(); i++) {
2779                 llvm::Value* arg_next
2780                     = b.CreateConstInBoundsGEP1_32(nullptr, args[i], fImpl->strides[i]);
2781                 args[i]->addIncoming(arg_next, loop1);
2782             }
2783             b.CreateBr(test1);
2784         }
2785 
2786         // leave:  ret
2787         {
2788             IRBuilder b(leave);
2789             b.CreateRetVoid();
2790         }
2791 
2792         SkASSERT(false == llvm::verifyModule(*mod, &llvm::outs()));
2793 
2794         if (true) {
2795             SkString path = SkStringPrintf("/tmp/%s.bc", debug_name);
2796             std::error_code err;
2797             llvm::raw_fd_ostream os(path.c_str(), err);
2798             if (err) {
2799                 return;
2800             }
2801             llvm::WriteBitcodeToFile(*mod, os);
2802         }
2803 
2804         static SkOnce once;
2805         once([]{
2806             SkAssertResult(false == llvm::InitializeNativeTarget());
2807             SkAssertResult(false == llvm::InitializeNativeTargetAsmPrinter());
2808         });
2809 
2810         if (llvm::ExecutionEngine* ee = llvm::EngineBuilder(std::move(mod))
2811                                             .setEngineKind(llvm::EngineKind::JIT)
2812                                             .setMCPU(llvm::sys::getHostCPUName())
2813                                             .create()) {
2814             fImpl->llvm_ctx = std::move(ctx);
2815             fImpl->llvm_ee.reset(ee);
2816 
2817             // We have to be careful here about what we close over and how, in case fImpl moves.
2818             // fImpl itself may change, but its pointee fields won't, so close over them by value.
2819             // Also, debug_name will almost certainly leave scope, so copy it.
2820             fImpl->llvm_compiling = std::async(std::launch::async, [dst  = &fImpl->jit_entry,
2821                                                                     ee   =  fImpl->llvm_ee.get(),
2822                                                                     name = std::string(debug_name)]{
2823                 // std::atomic<void*>*    dst;
2824                 // llvm::ExecutionEngine* ee;
2825                 // std::string            name;
2826                 dst->store( (void*)ee->getFunctionAddress(name.c_str()) );
2827             });
2828         }
2829     }
2830 #endif
2831 
waitForLLVM() const2832     void Program::waitForLLVM() const {
2833     #if defined(SKVM_LLVM)
2834         if (fImpl->llvm_compiling.valid()) {
2835             fImpl->llvm_compiling.wait();
2836         }
2837     #endif
2838     }
2839 
hasJIT() const2840     bool Program::hasJIT() const {
2841         // Program::hasJIT() is really just a debugging / test aid,
2842         // so we don't mind adding a sync point here to wait for compilation.
2843         this->waitForLLVM();
2844 
2845         return fImpl->jit_entry.load() != nullptr;
2846     }
2847 
dropJIT()2848     void Program::dropJIT() {
2849     #if defined(SKVM_LLVM)
2850         this->waitForLLVM();
2851         fImpl->llvm_ee .reset(nullptr);
2852         fImpl->llvm_ctx.reset(nullptr);
2853     #elif defined(SKVM_JIT)
2854         if (fImpl->dylib) {
2855             close_dylib(fImpl->dylib);
2856         } else if (auto jit_entry = fImpl->jit_entry.load()) {
2857             unmap_jit_buffer(jit_entry, fImpl->jit_size);
2858         }
2859     #else
2860         SkASSERT(!this->hasJIT());
2861     #endif
2862 
2863         fImpl->jit_entry.store(nullptr);
2864         fImpl->jit_size  = 0;
2865         fImpl->dylib     = nullptr;
2866     }
2867 
Program()2868     Program::Program() : fImpl(std::make_unique<Impl>()) {}
2869 
~Program()2870     Program::~Program() {
2871         // Moved-from Programs may have fImpl == nullptr.
2872         if (fImpl) {
2873             this->dropJIT();
2874         }
2875     }
2876 
Program(Program && other)2877     Program::Program(Program&& other) : fImpl(std::move(other.fImpl)) {}
2878 
operator =(Program && other)2879     Program& Program::operator=(Program&& other) {
2880         fImpl = std::move(other.fImpl);
2881         return *this;
2882     }
2883 
Program(const std::vector<OptimizedInstruction> & instructions,const std::vector<int> & strides,const char * debug_name,bool allow_jit)2884     Program::Program(const std::vector<OptimizedInstruction>& instructions,
2885                      const std::vector<int>& strides,
2886                      const char* debug_name, bool allow_jit) : Program() {
2887         fImpl->strides = strides;
2888         if (gSkVMAllowJIT && allow_jit) {
2889         #if 1 && defined(SKVM_LLVM)
2890             this->setupLLVM(instructions, debug_name);
2891         #elif 1 && defined(SKVM_JIT)
2892             this->setupJIT(instructions, debug_name);
2893         #endif
2894         }
2895 
2896         // Might as well do this after setupLLVM() to get a little more time to compile.
2897         this->setupInterpreter(instructions);
2898     }
2899 
instructions() const2900     std::vector<InterpreterInstruction> Program::instructions() const { return fImpl->instructions; }
nargs() const2901     int  Program::nargs() const { return (int)fImpl->strides.size(); }
nregs() const2902     int  Program::nregs() const { return fImpl->regs; }
loop() const2903     int  Program::loop () const { return fImpl->loop; }
empty() const2904     bool Program::empty() const { return fImpl->instructions.empty(); }
2905 
2906     // Translate OptimizedInstructions to InterpreterInstructions.
setupInterpreter(const std::vector<OptimizedInstruction> & instructions)2907     void Program::setupInterpreter(const std::vector<OptimizedInstruction>& instructions) {
2908         // Register each instruction is assigned to.
2909         std::vector<Reg> reg(instructions.size());
2910 
2911         // This next bit is a bit more complicated than strictly necessary;
2912         // we could just assign every instruction to its own register.
2913         //
2914         // But recycling registers is fairly cheap, and good practice for the
2915         // JITs where minimizing register pressure really is important.
2916         //
2917         // We have effectively infinite registers, so we hoist any value we can.
2918         // (The JIT may choose a more complex policy to reduce register pressure.)
2919 
2920         fImpl->regs = 0;
2921         std::vector<Reg> avail;
2922 
2923         // Assign this value to a register, recycling them where we can.
2924         auto assign_register = [&](Val id) {
2925             const OptimizedInstruction& inst = instructions[id];
2926 
2927             // If this is a real input and it's lifetime ends at this instruction,
2928             // we can recycle the register it's occupying.
2929             auto maybe_recycle_register = [&](Val input) {
2930                 if (input != NA && instructions[input].death == id) {
2931                     avail.push_back(reg[input]);
2932                 }
2933             };
2934 
2935             // Take care to not recycle the same register twice.
2936             const Val x = inst.x, y = inst.y, z = inst.z, w = inst.w;
2937             if (true                      ) { maybe_recycle_register(x); }
2938             if (y != x                    ) { maybe_recycle_register(y); }
2939             if (z != x && z != y          ) { maybe_recycle_register(z); }
2940             if (w != x && w != y && w != z) { maybe_recycle_register(w); }
2941 
2942             // Instructions that die at themselves (stores) don't need a register.
2943             if (inst.death != id) {
2944                 // Allocate a register if we have to, preferring to reuse anything available.
2945                 if (avail.empty()) {
2946                     reg[id] = fImpl->regs++;
2947                 } else {
2948                     reg[id] = avail.back();
2949                     avail.pop_back();
2950                 }
2951             }
2952         };
2953 
2954         // Assign a register to each hoisted instruction, then each non-hoisted loop instruction.
2955         for (Val id = 0; id < (Val)instructions.size(); id++) {
2956             if ( instructions[id].can_hoist) { assign_register(id); }
2957         }
2958         for (Val id = 0; id < (Val)instructions.size(); id++) {
2959             if (!instructions[id].can_hoist) { assign_register(id); }
2960         }
2961 
2962         // Translate OptimizedInstructions to InterpreterIstructions by mapping values to
2963         // registers.  This will be two passes, first hoisted instructions, then inside the loop.
2964 
2965         // The loop begins at the fImpl->loop'th Instruction.
2966         fImpl->loop = 0;
2967         fImpl->instructions.reserve(instructions.size());
2968 
2969         // Add a dummy mapping for the N/A sentinel Val to any arbitrary register
2970         // so lookups don't have to know which arguments are used by which Ops.
2971         auto lookup_register = [&](Val id) {
2972             return id == NA ? (Reg)0
2973                             : reg[id];
2974         };
2975 
2976         auto push_instruction = [&](Val id, const OptimizedInstruction& inst) {
2977             InterpreterInstruction pinst{
2978                 inst.op,
2979                 lookup_register(id),
2980                 lookup_register(inst.x),
2981                 lookup_register(inst.y),
2982                 lookup_register(inst.z),
2983                 lookup_register(inst.w),
2984                 inst.immA,
2985                 inst.immB,
2986             };
2987             fImpl->instructions.push_back(pinst);
2988         };
2989 
2990         for (Val id = 0; id < (Val)instructions.size(); id++) {
2991             const OptimizedInstruction& inst = instructions[id];
2992             if (inst.can_hoist) {
2993                 push_instruction(id, inst);
2994                 fImpl->loop++;
2995             }
2996         }
2997         for (Val id = 0; id < (Val)instructions.size(); id++) {
2998             const OptimizedInstruction& inst = instructions[id];
2999             if (!inst.can_hoist) {
3000                 push_instruction(id, inst);
3001             }
3002         }
3003     }
3004 
3005 #if defined(SKVM_JIT)
3006 
jit(const std::vector<OptimizedInstruction> & instructions,int * stack_hint,uint32_t * registers_used,Assembler * a) const3007     bool Program::jit(const std::vector<OptimizedInstruction>& instructions,
3008                       int* stack_hint,
3009                       uint32_t* registers_used,
3010                       Assembler* a) const {
3011         using A = Assembler;
3012 
3013         SkTHashMap<int, A::Label> constants;    // Constants (mostly splats) share the same pool.
3014         A::Label                  iota;         // Varies per lane, for Op::index.
3015         A::Label                  load64_index; // Used to load low or high half of 64-bit lanes.
3016 
3017         // The `regs` array tracks everything we know about each register's state:
3018         //   - NA:   empty
3019         //   - RES:  reserved by ABI
3020         //   - TMP:  holding a temporary
3021         //   - id:   holding Val id
3022         constexpr Val RES = NA-1,
3023                       TMP = RES-1;
3024 
3025         // Map val -> stack slot.
3026         std::vector<int> stack_slot(instructions.size(), NA);
3027         int next_stack_slot = 0;
3028 
3029         const int nstack_slots = *stack_hint >= 0 ? *stack_hint
3030                                                   : stack_slot.size();
3031 
3032     #if defined(__x86_64__) || defined(_M_X64)
3033         if (!SkCpu::Supports(SkCpu::HSW)) {
3034             return false;
3035         }
3036         const int K = 8;
3037         using Reg = A::Ymm;
3038         #if defined(_M_X64)  // Important to check this first; clang-cl defines both.
3039             const A::GP64 N = A::rcx,
3040                         GP0 = A::rax,
3041                         GP1 = A::r11,
3042                         arg[]    = { A::rdx, A::r8, A::r9, A::r10, A::rdi, A::rsi };
3043 
3044             // xmm6-15 need are callee-saved.
3045             std::array<Val,16> regs = {
3046                  NA, NA, NA, NA,  NA, NA,RES,RES,
3047                 RES,RES,RES,RES, RES,RES,RES,RES,
3048             };
3049             const uint32_t incoming_registers_used = *registers_used;
3050 
3051             auto enter = [&]{
3052                 // rcx,rdx,r8,r9 are all already holding their correct values.
3053                 // Load caller-saved r10 from rsp+40 if there's a fourth arg.
3054                 if (fImpl->strides.size() >= 4) {
3055                     a->mov(A::r10, A::Mem{A::rsp, 40});
3056                 }
3057                 // Load callee-saved rdi from rsp+48 if there's a fifth arg,
3058                 // first saving it to ABI reserved shadow area rsp+8.
3059                 if (fImpl->strides.size() >= 5) {
3060                     a->mov(A::Mem{A::rsp, 8}, A::rdi);
3061                     a->mov(A::rdi, A::Mem{A::rsp, 48});
3062                 }
3063                 // Load callee-saved rsi from rsp+56 if there's a sixth arg,
3064                 // first saving it to ABI reserved shadow area rsp+16.
3065                 if (fImpl->strides.size() >= 6) {
3066                     a->mov(A::Mem{A::rsp, 16}, A::rsi);
3067                     a->mov(A::rsi, A::Mem{A::rsp, 56});
3068                 }
3069 
3070                 // Allocate stack for our values and callee-saved xmm6-15.
3071                 int stack_needed = nstack_slots*K*4;
3072                 for (int r = 6; r < 16; r++) {
3073                     if (incoming_registers_used & (1<<r)) {
3074                         stack_needed += 16;
3075                     }
3076                 }
3077                 if (stack_needed) { a->sub(A::rsp, stack_needed); }
3078 
3079                 int next_saved_xmm = nstack_slots*K*4;
3080                 for (int r = 6; r < 16; r++) {
3081                     if (incoming_registers_used & (1<<r)) {
3082                         a->vmovups(A::Mem{A::rsp, next_saved_xmm}, (A::Xmm)r);
3083                         next_saved_xmm += 16;
3084                         regs[r] = NA;
3085                     }
3086                 }
3087             };
3088             auto exit  = [&]{
3089                 // The second pass of jit() shouldn't use any register it didn't in the first pass.
3090                 SkASSERT((*registers_used & incoming_registers_used) == *registers_used);
3091 
3092                 // Restore callee-saved xmm6-15 and the stack pointer.
3093                 int stack_used = nstack_slots*K*4;
3094                 for (int r = 6; r < 16; r++) {
3095                     if (incoming_registers_used & (1<<r)) {
3096                         a->vmovups((A::Xmm)r, A::Mem{A::rsp, stack_used});
3097                         stack_used += 16;
3098                     }
3099                 }
3100                 if (stack_used) { a->add(A::rsp, stack_used); }
3101 
3102                 // Restore callee-saved rdi/rsi if we used them.
3103                 if (fImpl->strides.size() >= 5) {
3104                     a->mov(A::rdi, A::Mem{A::rsp, 8});
3105                 }
3106                 if (fImpl->strides.size() >= 6) {
3107                     a->mov(A::rsi, A::Mem{A::rsp, 16});
3108                 }
3109 
3110                 a->vzeroupper();
3111                 a->ret();
3112             };
3113         #elif defined(__x86_64__)
3114             const A::GP64 N = A::rdi,
3115                         GP0 = A::rax,
3116                         GP1 = A::r11,
3117                         arg[]    = { A::rsi, A::rdx, A::rcx, A::r8, A::r9, A::r10 };
3118 
3119             // All 16 ymm registers are available to use.
3120             std::array<Val,16> regs = {
3121                 NA,NA,NA,NA, NA,NA,NA,NA,
3122                 NA,NA,NA,NA, NA,NA,NA,NA,
3123             };
3124 
3125             auto enter = [&]{
3126                 // Load caller-saved r10 from rsp+8 if there's a sixth arg.
3127                 if (fImpl->strides.size() >= 6) {
3128                     a->mov(A::r10, A::Mem{A::rsp, 8});
3129                 }
3130                 if (nstack_slots) { a->sub(A::rsp, nstack_slots*K*4); }
3131             };
3132             auto exit  = [&]{
3133                 if (nstack_slots) { a->add(A::rsp, nstack_slots*K*4); }
3134                 a->vzeroupper();
3135                 a->ret();
3136             };
3137         #endif
3138 
3139         auto load_from_memory = [&](Reg r, Val v) {
3140             if (instructions[v].op == Op::splat) {
3141                 if (instructions[v].immA == 0) {
3142                     a->vpxor(r,r,r);
3143                 } else {
3144                     a->vmovups(r, constants.find(instructions[v].immA));
3145                 }
3146             } else {
3147                 SkASSERT(stack_slot[v] != NA);
3148                 a->vmovups(r, A::Mem{A::rsp, stack_slot[v]*K*4});
3149             }
3150         };
3151         auto store_to_stack = [&](Reg r, Val v) {
3152             SkASSERT(next_stack_slot < nstack_slots);
3153             stack_slot[v] = next_stack_slot++;
3154             a->vmovups(A::Mem{A::rsp, stack_slot[v]*K*4}, r);
3155         };
3156     #elif defined(__aarch64__)
3157         const int K = 4;
3158         using Reg = A::V;
3159         const A::X N     = A::x0,
3160                    GP0   = A::x8,
3161                    GP1   = A::x9,
3162                    arg[] = { A::x1, A::x2, A::x3, A::x4, A::x5, A::x6, A::x7 };
3163 
3164         // We can use v0-v7 and v16-v31 freely; we'd need to preserve v8-v15 in enter/exit.
3165         std::array<Val,32> regs = {
3166              NA, NA, NA, NA,  NA, NA, NA, NA,
3167             RES,RES,RES,RES, RES,RES,RES,RES,
3168              NA, NA, NA, NA,  NA, NA, NA, NA,
3169              NA, NA, NA, NA,  NA, NA, NA, NA,
3170         };
3171 
3172         auto enter = [&]{ if (nstack_slots) { a->sub(A::sp, A::sp, nstack_slots*K*4); } };
3173         auto exit  = [&]{ if (nstack_slots) { a->add(A::sp, A::sp, nstack_slots*K*4); }
3174                           a->ret(A::x30); };
3175 
3176         auto load_from_memory = [&](Reg r, Val v) {
3177             if (instructions[v].op == Op::splat) {
3178                 if (instructions[v].immA == 0) {
3179                     a->eor16b(r,r,r);
3180                 } else {
3181                     a->ldrq(r, constants.find(instructions[v].immA));
3182                 }
3183             } else {
3184                 SkASSERT(stack_slot[v] != NA);
3185                 a->ldrq(r, A::sp, stack_slot[v]);
3186             }
3187         };
3188         auto store_to_stack  = [&](Reg r, Val v) {
3189             SkASSERT(next_stack_slot < nstack_slots);
3190             stack_slot[v] = next_stack_slot++;
3191             a->strq(r, A::sp, stack_slot[v]);
3192         };
3193     #endif
3194 
3195         *registers_used = 0;  // We'll update this as we go.
3196 
3197         if (SK_ARRAY_COUNT(arg) < fImpl->strides.size()) {
3198             return false;
3199         }
3200 
3201         auto emit = [&](Val id, bool scalar) {
3202             const int active_lanes = scalar ? 1 : K;
3203             const OptimizedInstruction& inst = instructions[id];
3204             const Op op = inst.op;
3205             const Val x = inst.x,
3206                       y = inst.y,
3207                       z = inst.z,
3208                       w = inst.w;
3209             const int immA = inst.immA,
3210                       immB = inst.immB;
3211 
3212             // alloc_tmp() returns the first of N adjacent temporary registers,
3213             // each freed manually with free_tmp() or noted as our result with mark_tmp_as_dst().
3214             auto alloc_tmp = [&](int N=1) -> Reg {
3215                 auto needs_spill = [&](Val v) -> bool {
3216                     SkASSERT(v >= 0);   // {NA,TMP,RES} need to be handled before calling this.
3217                     return stack_slot[v] == NA               // We haven't spilled it already?
3218                         && instructions[v].op != Op::splat;  // No need to spill constants.
3219                 };
3220 
3221                 // We want to find a block of N adjacent registers requiring the fewest spills.
3222                 int best_block = -1,
3223                     min_spills = 0x7fff'ffff;
3224                 for (int block = 0; block+N <= (int)regs.size(); block++) {
3225                     int spills = 0;
3226                     for (int r = block; r < block+N; r++) {
3227                         Val v = regs[r];
3228                         // Registers holding NA (nothing) are ideal, nothing to spill.
3229                         if (v == NA) {
3230                             continue;
3231                         }
3232                         // We can't spill anything REServed or that we'll need this instruction.
3233                         if (v == RES ||
3234                             v == TMP || v == id || v == x || v == y || v == z || v == w) {
3235                             spills = 0x7fff'ffff;
3236                             block  = r;   // (optimization) continue outer loop at next register.
3237                             break;
3238                         }
3239                         // Usually here we've got a value v that we'd have to spill to the stack
3240                         // before reusing its register, but sometimes even now we get a freebie.
3241                         spills += needs_spill(v) ? 1 : 0;
3242                     }
3243 
3244                     // TODO: non-arbitrary tie-breaking?
3245                     if (min_spills > spills) {
3246                         min_spills = spills;
3247                         best_block = block;
3248                     }
3249                     if (min_spills == 0) {
3250                         break;  // (optimization) stop early if we find an unbeatable block.
3251                     }
3252                 }
3253 
3254                 // TODO: our search's success isn't obviously guaranteed... it depends on N
3255                 // and the number and relative position in regs of any unspillable values.
3256                 // I think we should be able to get away with N≤2 on x86-64 and N≤4 on arm64;
3257                 // we'll need to revisit this logic should this assert fire.
3258                 SkASSERT(min_spills <= N);
3259 
3260                 // Spill what needs spilling, and mark the block all as TMP.
3261                 for (int r = best_block; r < best_block+N; r++) {
3262                     Val& v = regs[r];
3263                     *registers_used |= (1<<r);
3264 
3265                     SkASSERT(v == NA || v >= 0);
3266                     if (v >= 0 && needs_spill(v)) {
3267                         store_to_stack((Reg)r, v);
3268                         SkASSERT(!needs_spill(v));
3269                         min_spills--;
3270                     }
3271 
3272                     v = TMP;
3273                 }
3274                 SkASSERT(min_spills == 0);
3275                 return (Reg)best_block;
3276             };
3277 
3278             auto free_tmp = [&](Reg r) {
3279                 SkASSERT(regs[r] == TMP);
3280                 regs[r] = NA;
3281             };
3282 
3283             // Which register holds dst,x,y,z,w for this instruction?  NA if none does yet.
3284             int rd = NA,
3285                 rx = NA,
3286                 ry = NA,
3287                 rz = NA,
3288                 rw = NA;
3289 
3290             auto update_regs = [&](Reg r, Val v) {
3291                 if (v == id) { rd = r; }
3292                 if (v ==  x) { rx = r; }
3293                 if (v ==  y) { ry = r; }
3294                 if (v ==  z) { rz = r; }
3295                 if (v ==  w) { rw = r; }
3296                 return r;
3297             };
3298 
3299             auto find_existing_reg = [&](Val v) -> int {
3300                 // Quick-check our working registers.
3301                 if (v == id && rd != NA) { return rd; }
3302                 if (v ==  x && rx != NA) { return rx; }
3303                 if (v ==  y && ry != NA) { return ry; }
3304                 if (v ==  z && rz != NA) { return rz; }
3305                 if (v ==  w && rw != NA) { return rw; }
3306 
3307                 // Search inter-instruction register map.
3308                 for (auto [r,val] : SkMakeEnumerate(regs)) {
3309                     if (val == v) {
3310                         return update_regs((Reg)r, v);
3311                     }
3312                 }
3313                 return NA;
3314             };
3315 
3316             // Return a register for Val, holding that value if it already exists.
3317             // During this instruction all calls to r(v) will return the same register.
3318             auto r = [&](Val v) -> Reg {
3319                 SkASSERT(v >= 0);
3320 
3321                 if (int found = find_existing_reg(v); found != NA) {
3322                     return (Reg)found;
3323                 }
3324 
3325                 Reg r = alloc_tmp();
3326                 SkASSERT(regs[r] == TMP);
3327 
3328                 SkASSERT(v <= id);
3329                 if (v < id) {
3330                     // If v < id, we're loading one of this instruction's inputs.
3331                     // If v == id we're just allocating its destination register.
3332                     load_from_memory(r, v);
3333                 }
3334                 regs[r] = v;
3335                 return update_regs(r, v);
3336             };
3337 
3338             auto dies_here = [&](Val v) -> bool {
3339                 SkASSERT(v >= 0);
3340                 return instructions[v].death == id;
3341             };
3342 
3343             // Alias dst() to r(v) if dies_here(v).
3344             auto try_alias = [&](Val v) -> bool {
3345                 SkASSERT(v == x || v == y || v == z || v == w);
3346                 if (dies_here(v)) {
3347                     rd = r(v);      // Vals v and id share a register for this instruction.
3348                     regs[rd] = id;  // Next instruction, Val id will be in the register, not Val v.
3349                     return true;
3350                 }
3351                 return false;
3352             };
3353 
3354             // Generally r(id),
3355             // but with a hint, try to alias dst() to r(v) if dies_here(v).
3356             auto dst = [&](Val hint1 = NA, Val hint2 = NA) -> Reg {
3357                 if (hint1 != NA && try_alias(hint1)) { return r(id); }
3358                 if (hint2 != NA && try_alias(hint2)) { return r(id); }
3359                 return r(id);
3360             };
3361 
3362         #if defined(__aarch64__)  // Nothing sneaky, just unused on x86-64.
3363             auto mark_tmp_as_dst = [&](Reg tmp) {
3364                 SkASSERT(regs[tmp] == TMP);
3365                 rd = tmp;
3366                 regs[rd] = id;
3367                 SkASSERT(dst() == tmp);
3368             };
3369         #endif
3370 
3371         #if defined(__x86_64__) || defined(_M_X64)
3372             // On x86 we can work with many values directly from the stack or program constant pool.
3373             auto any = [&](Val v) -> A::Operand {
3374                 SkASSERT(v >= 0);
3375                 SkASSERT(v < id);
3376 
3377                 if (int found = find_existing_reg(v); found != NA) {
3378                     return (Reg)found;
3379                 }
3380                 if (instructions[v].op == Op::splat) {
3381                     return constants.find(instructions[v].immA);
3382                 }
3383                 return A::Mem{A::rsp, stack_slot[v]*K*4};
3384             };
3385 
3386             // This is never really worth asking except when any() might be used;
3387             // if we need this value in ARM, might as well just call r(v) to get it into a register.
3388             auto in_reg = [&](Val v) -> bool {
3389                 return find_existing_reg(v) != NA;
3390             };
3391         #endif
3392 
3393             switch (op) {
3394                 // Make sure splat constants can be found by load_from_memory() or any().
3395                 case Op::splat:
3396                     (void)constants[immA];
3397                     break;
3398 
3399             #if defined(__x86_64__) || defined(_M_X64)
3400                 case Op::assert_true: {
3401                     a->vptest (r(x), &constants[0xffffffff]);
3402                     A::Label all_true;
3403                     a->jc(&all_true);
3404                     a->int3();
3405                     a->label(&all_true);
3406                 } break;
3407 
3408                 case Op::store8:
3409                     if (scalar) {
3410                         a->vpextrb(A::Mem{arg[immA]}, (A::Xmm)r(x), 0);
3411                     } else {
3412                         a->vpackusdw(dst(x), r(x), r(x));
3413                         a->vpermq   (dst(), dst(), 0xd8);
3414                         a->vpackuswb(dst(), dst(), dst());
3415                         a->vmovq    (A::Mem{arg[immA]}, (A::Xmm)dst());
3416                     } break;
3417 
3418                 case Op::store16:
3419                     if (scalar) {
3420                         a->vpextrw(A::Mem{arg[immA]}, (A::Xmm)r(x), 0);
3421                     } else {
3422                         a->vpackusdw(dst(x), r(x), r(x));
3423                         a->vpermq   (dst(), dst(), 0xd8);
3424                         a->vmovups  (A::Mem{arg[immA]}, (A::Xmm)dst());
3425                     } break;
3426 
3427                 case Op::store32: if (scalar) { a->vmovd  (A::Mem{arg[immA]}, (A::Xmm)r(x)); }
3428                                   else        { a->vmovups(A::Mem{arg[immA]},         r(x)); }
3429                                   break;
3430 
3431                 case Op::store64: if (scalar) {
3432                                       a->vmovd(A::Mem{arg[immA],0}, (A::Xmm)r(x));
3433                                       a->vmovd(A::Mem{arg[immA],4}, (A::Xmm)r(y));
3434                                   } else {
3435                                       // r(x) = {a,b,c,d|e,f,g,h}
3436                                       // r(y) = {i,j,k,l|m,n,o,p}
3437                                       // We want to write a,i,b,j,c,k,d,l,e,m...
3438                                       A::Ymm L = alloc_tmp(),
3439                                              H = alloc_tmp();
3440                                       a->vpunpckldq(L, r(x), any(y));  // L = {a,i,b,j|e,m,f,n}
3441                                       a->vpunpckhdq(H, r(x), any(y));  // H = {c,k,d,l|g,o,h,p}
3442                                       a->vperm2f128(dst(), L,H, 0x20); //   = {a,i,b,j|c,k,d,l}
3443                                       a->vmovups(A::Mem{arg[immA], 0}, dst());
3444                                       a->vperm2f128(dst(), L,H, 0x31); //   = {e,m,f,n|g,o,h,p}
3445                                       a->vmovups(A::Mem{arg[immA],32}, dst());
3446                                       free_tmp(L);
3447                                       free_tmp(H);
3448                                   } break;
3449 
3450                 case Op::store128: {
3451                     // TODO: >32-bit stores
3452                     a->vmovd  (A::Mem{arg[immA], 0*16 +  0}, (A::Xmm)r(x)   );
3453                     a->vmovd  (A::Mem{arg[immA], 0*16 +  4}, (A::Xmm)r(y)   );
3454                     a->vmovd  (A::Mem{arg[immA], 0*16 +  8}, (A::Xmm)r(z)   );
3455                     a->vmovd  (A::Mem{arg[immA], 0*16 + 12}, (A::Xmm)r(w)   );
3456                     if (scalar) { break; }
3457 
3458                     a->vpextrd(A::Mem{arg[immA], 1*16 +  0}, (A::Xmm)r(x), 1);
3459                     a->vpextrd(A::Mem{arg[immA], 1*16 +  4}, (A::Xmm)r(y), 1);
3460                     a->vpextrd(A::Mem{arg[immA], 1*16 +  8}, (A::Xmm)r(z), 1);
3461                     a->vpextrd(A::Mem{arg[immA], 1*16 + 12}, (A::Xmm)r(w), 1);
3462 
3463                     a->vpextrd(A::Mem{arg[immA], 2*16 +  0}, (A::Xmm)r(x), 2);
3464                     a->vpextrd(A::Mem{arg[immA], 2*16 +  4}, (A::Xmm)r(y), 2);
3465                     a->vpextrd(A::Mem{arg[immA], 2*16 +  8}, (A::Xmm)r(z), 2);
3466                     a->vpextrd(A::Mem{arg[immA], 2*16 + 12}, (A::Xmm)r(w), 2);
3467 
3468                     a->vpextrd(A::Mem{arg[immA], 3*16 +  0}, (A::Xmm)r(x), 3);
3469                     a->vpextrd(A::Mem{arg[immA], 3*16 +  4}, (A::Xmm)r(y), 3);
3470                     a->vpextrd(A::Mem{arg[immA], 3*16 +  8}, (A::Xmm)r(z), 3);
3471                     a->vpextrd(A::Mem{arg[immA], 3*16 + 12}, (A::Xmm)r(w), 3);
3472                     // Now we need to store the upper 128 bits of x,y,z,w.
3473                     // Storing in this order rather than interlacing minimizes temporaries.
3474                     a->vextracti128(dst(), r(x), 1);
3475                     a->vmovd  (A::Mem{arg[immA], 4*16 +  0}, (A::Xmm)dst()   );
3476                     a->vpextrd(A::Mem{arg[immA], 5*16 +  0}, (A::Xmm)dst(), 1);
3477                     a->vpextrd(A::Mem{arg[immA], 6*16 +  0}, (A::Xmm)dst(), 2);
3478                     a->vpextrd(A::Mem{arg[immA], 7*16 +  0}, (A::Xmm)dst(), 3);
3479 
3480                     a->vextracti128(dst(), r(y), 1);
3481                     a->vmovd  (A::Mem{arg[immA], 4*16 +  4}, (A::Xmm)dst()   );
3482                     a->vpextrd(A::Mem{arg[immA], 5*16 +  4}, (A::Xmm)dst(), 1);
3483                     a->vpextrd(A::Mem{arg[immA], 6*16 +  4}, (A::Xmm)dst(), 2);
3484                     a->vpextrd(A::Mem{arg[immA], 7*16 +  4}, (A::Xmm)dst(), 3);
3485 
3486                     a->vextracti128(dst(), r(z), 1);
3487                     a->vmovd  (A::Mem{arg[immA], 4*16 +  8}, (A::Xmm)dst()   );
3488                     a->vpextrd(A::Mem{arg[immA], 5*16 +  8}, (A::Xmm)dst(), 1);
3489                     a->vpextrd(A::Mem{arg[immA], 6*16 +  8}, (A::Xmm)dst(), 2);
3490                     a->vpextrd(A::Mem{arg[immA], 7*16 +  8}, (A::Xmm)dst(), 3);
3491 
3492                     a->vextracti128(dst(), r(w), 1);
3493                     a->vmovd  (A::Mem{arg[immA], 4*16 + 12}, (A::Xmm)dst()   );
3494                     a->vpextrd(A::Mem{arg[immA], 5*16 + 12}, (A::Xmm)dst(), 1);
3495                     a->vpextrd(A::Mem{arg[immA], 6*16 + 12}, (A::Xmm)dst(), 2);
3496                     a->vpextrd(A::Mem{arg[immA], 7*16 + 12}, (A::Xmm)dst(), 3);
3497                 } break;
3498 
3499                 case Op::load8:  if (scalar) {
3500                                      a->vpxor  (dst(), dst(), dst());
3501                                      a->vpinsrb((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{arg[immA]}, 0);
3502                                  } else {
3503                                      a->vpmovzxbd(dst(), A::Mem{arg[immA]});
3504                                  } break;
3505 
3506                 case Op::load16: if (scalar) {
3507                                      a->vpxor  (dst(), dst(), dst());
3508                                      a->vpinsrw((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{arg[immA]}, 0);
3509                                  } else {
3510                                      a->vpmovzxwd(dst(), A::Mem{arg[immA]});
3511                                  } break;
3512 
3513                 case Op::load32: if (scalar) { a->vmovd  ((A::Xmm)dst(), A::Mem{arg[immA]}); }
3514                                  else        { a->vmovups(        dst(), A::Mem{arg[immA]}); }
3515                                  break;
3516 
3517                 case Op::load64: if (scalar) {
3518                                     a->vmovd((A::Xmm)dst(), A::Mem{arg[immA], 4*immB});
3519                                  } else {
3520                                     A::Ymm tmp = alloc_tmp();
3521                                     a->vmovups(tmp, &load64_index);
3522                                     a->vpermps(dst(), tmp, A::Mem{arg[immA],  0});
3523                                     a->vpermps(  tmp, tmp, A::Mem{arg[immA], 32});
3524                                     // Low 128 bits holds immB=0 lanes, high 128 bits holds immB=1.
3525                                     a->vperm2f128(dst(), dst(),tmp, immB ? 0x31 : 0x20);
3526                                     free_tmp(tmp);
3527                                  } break;
3528 
3529                 case Op::load128: if (scalar) {
3530                                       a->vmovd((A::Xmm)dst(), A::Mem{arg[immA], 4*immB});
3531                                   } else {
3532                                       // Load 4 low values into xmm tmp,
3533                                       A::Ymm tmp = alloc_tmp();
3534                                       A::Xmm t = (A::Xmm)tmp;
3535                                       a->vmovd  (t,   A::Mem{arg[immA], 0*16 + 4*immB}   );
3536                                       a->vpinsrd(t,t, A::Mem{arg[immA], 1*16 + 4*immB}, 1);
3537                                       a->vpinsrd(t,t, A::Mem{arg[immA], 2*16 + 4*immB}, 2);
3538                                       a->vpinsrd(t,t, A::Mem{arg[immA], 3*16 + 4*immB}, 3);
3539 
3540                                       // Load 4 high values into xmm dst(),
3541                                       A::Xmm d = (A::Xmm)dst();
3542                                       a->vmovd  (d,   A::Mem{arg[immA], 4*16 + 4*immB}   );
3543                                       a->vpinsrd(d,d, A::Mem{arg[immA], 5*16 + 4*immB}, 1);
3544                                       a->vpinsrd(d,d, A::Mem{arg[immA], 6*16 + 4*immB}, 2);
3545                                       a->vpinsrd(d,d, A::Mem{arg[immA], 7*16 + 4*immB}, 3);
3546 
3547                                       // Merge the two, ymm dst() = {xmm tmp|xmm dst()}
3548                                       a->vperm2f128(dst(), tmp,dst(), 0x20);
3549                                       free_tmp(tmp);
3550                                   } break;
3551 
3552                 case Op::gather8: {
3553                     // As usual, the gather base pointer is immB bytes off of uniform immA.
3554                     a->mov(GP0, A::Mem{arg[immA], immB});
3555 
3556                     A::Ymm tmp = alloc_tmp();
3557                     a->vmovups(tmp, any(x));
3558 
3559                     for (int i = 0; i < active_lanes; i++) {
3560                         if (i == 4) {
3561                             // vpextrd can only pluck indices out from an Xmm register,
3562                             // so we manually swap over to the top when we're halfway through.
3563                             a->vextracti128((A::Xmm)tmp, tmp, 1);
3564                         }
3565                         a->vpextrd(GP1, (A::Xmm)tmp, i%4);
3566                         a->vpinsrb((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{GP0,0,GP1,A::ONE}, i);
3567                     }
3568                     a->vpmovzxbd(dst(), dst());
3569                     free_tmp(tmp);
3570                 } break;
3571 
3572                 case Op::gather16: {
3573                     // Just as gather8 except vpinsrb->vpinsrw, ONE->TWO, and vpmovzxbd->vpmovzxwd.
3574                     a->mov(GP0, A::Mem{arg[immA], immB});
3575 
3576                     A::Ymm tmp = alloc_tmp();
3577                     a->vmovups(tmp, any(x));
3578 
3579                     for (int i = 0; i < active_lanes; i++) {
3580                         if (i == 4) {
3581                             a->vextracti128((A::Xmm)tmp, tmp, 1);
3582                         }
3583                         a->vpextrd(GP1, (A::Xmm)tmp, i%4);
3584                         a->vpinsrw((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{GP0,0,GP1,A::TWO}, i);
3585                     }
3586                     a->vpmovzxwd(dst(), dst());
3587                     free_tmp(tmp);
3588                 } break;
3589 
3590                 case Op::gather32:
3591                 if (scalar) {
3592                     // Our gather base pointer is immB bytes off of uniform immA.
3593                     a->mov(GP0, A::Mem{arg[immA], immB});
3594 
3595                     // Grab our index from lane 0 of the index argument.
3596                     a->vmovd(GP1, (A::Xmm)r(x));
3597 
3598                     // dst = *(base + 4*index)
3599                     a->vmovd((A::Xmm)dst(x), A::Mem{GP0, 0, GP1, A::FOUR});
3600                 } else {
3601                     a->mov(GP0, A::Mem{arg[immA], immB});
3602 
3603                     A::Ymm mask = alloc_tmp();
3604                     a->vpcmpeqd(mask, mask, mask);   // (All lanes enabled.)
3605 
3606                     a->vgatherdps(dst(), A::FOUR, r(x), GP0, mask);
3607                     free_tmp(mask);
3608                 }
3609                 break;
3610 
3611                 case Op::uniform32: a->vbroadcastss(dst(), A::Mem{arg[immA], immB});
3612                                     break;
3613 
3614                 case Op::index: a->vmovd((A::Xmm)dst(), N);
3615                                 a->vbroadcastss(dst(), dst());
3616                                 a->vpsubd(dst(), dst(), &iota);
3617                                 break;
3618 
3619                 // We can swap the arguments of symmetric instructions to make better use of any().
3620                 case Op::add_f32:
3621                     if (in_reg(x)) { a->vaddps(dst(x), r(x), any(y)); }
3622                     else           { a->vaddps(dst(y), r(y), any(x)); }
3623                                      break;
3624 
3625                 case Op::mul_f32:
3626                     if (in_reg(x)) { a->vmulps(dst(x), r(x), any(y)); }
3627                     else           { a->vmulps(dst(y), r(y), any(x)); }
3628                                      break;
3629 
3630                 case Op::sub_f32: a->vsubps(dst(x), r(x), any(y)); break;
3631                 case Op::div_f32: a->vdivps(dst(x), r(x), any(y)); break;
3632                 case Op::min_f32: a->vminps(dst(y), r(y), any(x)); break;  // Order matters,
3633                 case Op::max_f32: a->vmaxps(dst(y), r(y), any(x)); break;  // see test SkVM_min_max.
3634 
3635                 case Op::fma_f32:
3636                     if (try_alias(x)) { a->vfmadd132ps(dst(x), r(z), any(y)); } else
3637                     if (try_alias(y)) { a->vfmadd213ps(dst(y), r(x), any(z)); } else
3638                     if (try_alias(z)) { a->vfmadd231ps(dst(z), r(x), any(y)); } else
3639                                       { a->vmovups    (dst(), any(x));
3640                                         a->vfmadd132ps(dst(), r(z), any(y)); }
3641                                         break;
3642 
3643                 case Op::fms_f32:
3644                     if (try_alias(x)) { a->vfmsub132ps(dst(x), r(z), any(y)); } else
3645                     if (try_alias(y)) { a->vfmsub213ps(dst(y), r(x), any(z)); } else
3646                     if (try_alias(z)) { a->vfmsub231ps(dst(z), r(x), any(y)); } else
3647                                       { a->vmovups    (dst(), any(x));
3648                                         a->vfmsub132ps(dst(), r(z), any(y)); }
3649                                         break;
3650 
3651                 case Op::fnma_f32:
3652                     if (try_alias(x)) { a->vfnmadd132ps(dst(x), r(z), any(y)); } else
3653                     if (try_alias(y)) { a->vfnmadd213ps(dst(y), r(x), any(z)); } else
3654                     if (try_alias(z)) { a->vfnmadd231ps(dst(z), r(x), any(y)); } else
3655                                       { a->vmovups     (dst(), any(x));
3656                                         a->vfnmadd132ps(dst(), r(z), any(y)); }
3657                                         break;
3658 
3659                 // In situations like this we want to try aliasing dst(x) when x is
3660                 // already in a register, but not if we'd have to load it from the stack
3661                 // just to alias it.  That's done better directly into the new register.
3662                 case Op::sqrt_f32:
3663                     if (in_reg(x)) { a->vsqrtps(dst(x),  r(x)); }
3664                     else           { a->vsqrtps(dst(), any(x)); }
3665                                      break;
3666 
3667                 case Op::add_i32:
3668                     if (in_reg(x)) { a->vpaddd(dst(x), r(x), any(y)); }
3669                     else           { a->vpaddd(dst(y), r(y), any(x)); }
3670                                      break;
3671 
3672                 case Op::mul_i32:
3673                     if (in_reg(x)) { a->vpmulld(dst(x), r(x), any(y)); }
3674                     else           { a->vpmulld(dst(y), r(y), any(x)); }
3675                                      break;
3676 
3677                 case Op::sub_i32: a->vpsubd(dst(x), r(x), any(y)); break;
3678 
3679                 case Op::bit_and:
3680                     if (in_reg(x)) { a->vpand(dst(x), r(x), any(y)); }
3681                     else           { a->vpand(dst(y), r(y), any(x)); }
3682                                      break;
3683                 case Op::bit_or:
3684                     if (in_reg(x)) { a->vpor(dst(x), r(x), any(y)); }
3685                     else           { a->vpor(dst(y), r(y), any(x)); }
3686                                      break;
3687                 case Op::bit_xor:
3688                     if (in_reg(x)) { a->vpxor(dst(x), r(x), any(y)); }
3689                     else           { a->vpxor(dst(y), r(y), any(x)); }
3690                                      break;
3691 
3692                 case Op::bit_clear: a->vpandn(dst(y), r(y), any(x)); break; // Notice, y then x.
3693 
3694                 case Op::select:
3695                     if (try_alias(z)) { a->vpblendvb(dst(z), r(z), any(y), r(x)); }
3696                     else              { a->vpblendvb(dst(x), r(z), any(y), r(x)); }
3697                                         break;
3698 
3699                 case Op::shl_i32: a->vpslld(dst(x), r(x), immA); break;
3700                 case Op::shr_i32: a->vpsrld(dst(x), r(x), immA); break;
3701                 case Op::sra_i32: a->vpsrad(dst(x), r(x), immA); break;
3702 
3703                 case Op::eq_i32:
3704                     if (in_reg(x)) { a->vpcmpeqd(dst(x), r(x), any(y)); }
3705                     else           { a->vpcmpeqd(dst(y), r(y), any(x)); }
3706                                      break;
3707 
3708                 case Op::gt_i32: a->vpcmpgtd(dst(), r(x), any(y)); break;
3709 
3710                 case Op::eq_f32:
3711                     if (in_reg(x)) { a->vcmpeqps(dst(x), r(x), any(y)); }
3712                     else           { a->vcmpeqps(dst(y), r(y), any(x)); }
3713                                      break;
3714                 case Op::neq_f32:
3715                     if (in_reg(x)) { a->vcmpneqps(dst(x), r(x), any(y)); }
3716                     else           { a->vcmpneqps(dst(y), r(y), any(x)); }
3717                                      break;
3718 
3719                 case Op:: gt_f32: a->vcmpltps (dst(y), r(y), any(x)); break;
3720                 case Op::gte_f32: a->vcmpleps (dst(y), r(y), any(x)); break;
3721 
3722                 case Op::ceil:
3723                     if (in_reg(x)) { a->vroundps(dst(x),  r(x), Assembler::CEIL); }
3724                     else           { a->vroundps(dst(), any(x), Assembler::CEIL); }
3725                                      break;
3726 
3727                 case Op::floor:
3728                     if (in_reg(x)) { a->vroundps(dst(x),  r(x), Assembler::FLOOR); }
3729                     else           { a->vroundps(dst(), any(x), Assembler::FLOOR); }
3730                                      break;
3731 
3732                 case Op::to_f32:
3733                     if (in_reg(x)) { a->vcvtdq2ps(dst(x),  r(x)); }
3734                     else           { a->vcvtdq2ps(dst(), any(x)); }
3735                                      break;
3736 
3737                 case Op::trunc:
3738                     if (in_reg(x)) { a->vcvttps2dq(dst(x),  r(x)); }
3739                     else           { a->vcvttps2dq(dst(), any(x)); }
3740                                      break;
3741 
3742                 case Op::round:
3743                     if (in_reg(x)) { a->vcvtps2dq(dst(x),  r(x)); }
3744                     else           { a->vcvtps2dq(dst(), any(x)); }
3745                                      break;
3746 
3747                 case Op::to_fp16:
3748                     a->vcvtps2ph(dst(x), r(x), A::CURRENT);  // f32 ymm -> f16 xmm
3749                     a->vpmovzxwd(dst(), dst());              // f16 xmm -> f16 ymm
3750                     break;
3751 
3752                 case Op::from_fp16:
3753                     a->vpackusdw(dst(x), r(x), r(x));  // f16 ymm -> f16 xmm
3754                     a->vpermq   (dst(), dst(), 0xd8);  // swap middle two 64-bit lanes
3755                     a->vcvtph2ps(dst(), dst());        // f16 xmm -> f32 ymm
3756                     break;
3757 
3758             #elif defined(__aarch64__)
3759                 case Op::assert_true: {
3760                     a->uminv4s(dst(), r(x));   // uminv acts like an all() across the vector.
3761                     a->movs(GP0, dst(), 0);
3762                     A::Label all_true;
3763                     a->cbnz(GP0, &all_true);
3764                     a->brk(0);
3765                     a->label(&all_true);
3766                 } break;
3767 
3768                 case Op::index: {
3769                     A::V tmp = alloc_tmp();
3770                     a->ldrq (tmp, &iota);
3771                     a->dup4s(dst(), N);
3772                     a->sub4s(dst(), dst(), tmp);
3773                     free_tmp(tmp);
3774                 } break;
3775 
3776                 case Op::store8: a->xtns2h(dst(x), r(x));
3777                                  a->xtnh2b(dst(), dst());
3778                    if (scalar) { a->strb  (dst(), arg[immA]); }
3779                    else        { a->strs  (dst(), arg[immA]); }
3780                                  break;
3781 
3782                 case Op::store16: a->xtns2h(dst(x), r(x));
3783                     if (scalar) { a->strh  (dst(), arg[immA]); }
3784                     else        { a->strd  (dst(), arg[immA]); }
3785                                   break;
3786 
3787                 case Op::store32: if (scalar) { a->strs(r(x), arg[immA]); }
3788                                   else        { a->strq(r(x), arg[immA]); }
3789                                                 break;
3790 
3791                 case Op::store64: if (scalar) {
3792                                       a->strs(r(x), arg[immA], 0);
3793                                       a->strs(r(y), arg[immA], 1);
3794                                   } else if (r(y) == r(x)+1) {
3795                                       a->st24s(r(x), arg[immA]);
3796                                   } else {
3797                                       Reg tmp0 = alloc_tmp(2),
3798                                           tmp1 = (Reg)(tmp0+1);
3799                                       a->orr16b(tmp0, r(x), r(x));
3800                                       a->orr16b(tmp1, r(y), r(y));
3801                                       a-> st24s(tmp0, arg[immA]);
3802                                       free_tmp(tmp0);
3803                                       free_tmp(tmp1);
3804                                   } break;
3805 
3806                 case Op::store128:
3807                     if (scalar) {
3808                         a->strs(r(x), arg[immA], 0);
3809                         a->strs(r(y), arg[immA], 1);
3810                         a->strs(r(z), arg[immA], 2);
3811                         a->strs(r(w), arg[immA], 3);
3812                     } else if (r(y) == r(x)+1 &&
3813                                r(z) == r(x)+2 &&
3814                                r(w) == r(x)+3) {
3815                         a->st44s(r(x), arg[immA]);
3816                     } else {
3817                         Reg tmp0 = alloc_tmp(4),
3818                             tmp1 = (Reg)(tmp0+1),
3819                             tmp2 = (Reg)(tmp0+2),
3820                             tmp3 = (Reg)(tmp0+3);
3821                         a->orr16b(tmp0, r(x), r(x));
3822                         a->orr16b(tmp1, r(y), r(y));
3823                         a->orr16b(tmp2, r(z), r(z));
3824                         a->orr16b(tmp3, r(w), r(w));
3825                         a-> st44s(tmp0, arg[immA]);
3826                         free_tmp(tmp0);
3827                         free_tmp(tmp1);
3828                         free_tmp(tmp2);
3829                         free_tmp(tmp3);
3830                     } break;
3831 
3832 
3833                 case Op::load8: if (scalar) { a->ldrb(dst(), arg[immA]); }
3834                                 else        { a->ldrs(dst(), arg[immA]); }
3835                                               a->uxtlb2h(dst(), dst());
3836                                               a->uxtlh2s(dst(), dst());
3837                                               break;
3838 
3839                 case Op::load16: if (scalar) { a->ldrh(dst(), arg[immA]); }
3840                                  else        { a->ldrd(dst(), arg[immA]); }
3841                                                a->uxtlh2s(dst(), dst());
3842                                                break;
3843 
3844                 case Op::load32: if (scalar) { a->ldrs(dst(), arg[immA]); }
3845                                  else        { a->ldrq(dst(), arg[immA]); }
3846                                                break;
3847 
3848                 case Op::load64: if (scalar) {
3849                                     a->ldrs(dst(), arg[immA], immB);
3850                                  } else {
3851                                     Reg tmp0 = alloc_tmp(2),
3852                                         tmp1 = (Reg)(tmp0+1);
3853                                     a->ld24s(tmp0, arg[immA]);
3854                                     // TODO: return both
3855                                     switch (immB) {
3856                                         case 0: mark_tmp_as_dst(tmp0); free_tmp(tmp1); break;
3857                                         case 1: mark_tmp_as_dst(tmp1); free_tmp(tmp0); break;
3858                                     }
3859                                  } break;
3860 
3861                 case Op::load128: if (scalar) {
3862                                       a->ldrs(dst(), arg[immA], immB);
3863                                   } else {
3864                                       Reg tmp0 = alloc_tmp(4),
3865                                           tmp1 = (Reg)(tmp0+1),
3866                                           tmp2 = (Reg)(tmp0+2),
3867                                           tmp3 = (Reg)(tmp0+3);
3868                                       a->ld44s(tmp0, arg[immA]);
3869                                       // TODO: return all four
3870                                       switch (immB) {
3871                                           case 0: mark_tmp_as_dst(tmp0); break;
3872                                           case 1: mark_tmp_as_dst(tmp1); break;
3873                                           case 2: mark_tmp_as_dst(tmp2); break;
3874                                           case 3: mark_tmp_as_dst(tmp3); break;
3875                                       }
3876                                       if (immB != 0) { free_tmp(tmp0); }
3877                                       if (immB != 1) { free_tmp(tmp1); }
3878                                       if (immB != 2) { free_tmp(tmp2); }
3879                                       if (immB != 3) { free_tmp(tmp3); }
3880                                   } break;
3881 
3882                 case Op::uniform32: a->add(GP0, arg[immA], immB);
3883                                     a->ld1r4s(dst(), GP0);
3884                                     break;
3885 
3886                 case Op::gather8: {
3887                     // As usual, the gather base pointer is immB bytes off of uniform immA.
3888                     a->add (GP0, arg[immA], immB);  // GP0 = &(gather base pointer)
3889                     a->ldrd(GP0, GP0);              // GP0 =   gather base pointer
3890 
3891                     for (int i = 0; i < active_lanes; i++) {
3892                         a->movs(GP1, r(x), i);    // Extract index lane i into GP1.
3893                         a->add (GP1, GP0, GP1);   // Add the gather base pointer.
3894                         a->ldrb(GP1, GP1);        // Load that byte.
3895                         a->inss(dst(x), GP1, i);  // Insert it into dst() lane i.
3896                     }
3897                 } break;
3898 
3899                 // See gather8 for general idea; comments here only where gather16 differs.
3900                 case Op::gather16: {
3901                     a->add (GP0, arg[immA], immB);
3902                     a->ldrd(GP0, GP0);
3903                     for (int i = 0; i < active_lanes; i++) {
3904                         a->movs(GP1, r(x), i);
3905                         a->add (GP1, GP0, GP1, A::LSL, 1);  // Scale index 2x into a byte offset.
3906                         a->ldrh(GP1, GP1);                  // 2-byte load.
3907                         a->inss(dst(x), GP1, i);
3908                     }
3909                 } break;
3910 
3911                 // See gather8 for general idea; comments here only where gather32 differs.
3912                 case Op::gather32: {
3913                     a->add (GP0, arg[immA], immB);
3914                     a->ldrd(GP0, GP0);
3915                     for (int i = 0; i < active_lanes; i++) {
3916                         a->movs(GP1, r(x), i);
3917                         a->add (GP1, GP0, GP1, A::LSL, 2);  // Scale index 4x into a byte offset.
3918                         a->ldrs(GP1, GP1);                  // 4-byte load.
3919                         a->inss(dst(x), GP1, i);
3920                     }
3921                 } break;
3922 
3923                 case Op::add_f32: a->fadd4s(dst(x,y), r(x), r(y)); break;
3924                 case Op::sub_f32: a->fsub4s(dst(x,y), r(x), r(y)); break;
3925                 case Op::mul_f32: a->fmul4s(dst(x,y), r(x), r(y)); break;
3926                 case Op::div_f32: a->fdiv4s(dst(x,y), r(x), r(y)); break;
3927 
3928                 case Op::sqrt_f32: a->fsqrt4s(dst(x), r(x)); break;
3929 
3930                 case Op::fma_f32: // fmla.4s is z += x*y
3931                     if (try_alias(z)) { a->fmla4s( r(z), r(x), r(y)); }
3932                     else              { a->orr16b(dst(), r(z), r(z));
3933                                         a->fmla4s(dst(), r(x), r(y)); }
3934                                         break;
3935 
3936                 case Op::fnma_f32:  // fmls.4s is z -= x*y
3937                     if (try_alias(z)) { a->fmls4s( r(z), r(x), r(y)); }
3938                     else              { a->orr16b(dst(), r(z), r(z));
3939                                         a->fmls4s(dst(), r(x), r(y)); }
3940                                         break;
3941 
3942                 case Op::fms_f32:   // calculate z - xy, then negate to xy - z
3943                     if (try_alias(z)) { a->fmls4s( r(z), r(x), r(y)); }
3944                     else              { a->orr16b(dst(), r(z), r(z));
3945                                         a->fmls4s(dst(), r(x), r(y)); }
3946                                         a->fneg4s(dst(), dst());
3947                                         break;
3948 
3949                 case Op:: gt_f32: a->fcmgt4s (dst(x,y), r(x), r(y)); break;
3950                 case Op::gte_f32: a->fcmge4s (dst(x,y), r(x), r(y)); break;
3951                 case Op:: eq_f32: a->fcmeq4s (dst(x,y), r(x), r(y)); break;
3952                 case Op::neq_f32: a->fcmeq4s (dst(x,y), r(x), r(y));
3953                                   a->not16b  (dst(), dst());         break;
3954 
3955 
3956                 case Op::add_i32: a->add4s(dst(x,y), r(x), r(y)); break;
3957                 case Op::sub_i32: a->sub4s(dst(x,y), r(x), r(y)); break;
3958                 case Op::mul_i32: a->mul4s(dst(x,y), r(x), r(y)); break;
3959 
3960                 case Op::bit_and  : a->and16b(dst(x,y), r(x), r(y)); break;
3961                 case Op::bit_or   : a->orr16b(dst(x,y), r(x), r(y)); break;
3962                 case Op::bit_xor  : a->eor16b(dst(x,y), r(x), r(y)); break;
3963                 case Op::bit_clear: a->bic16b(dst(x,y), r(x), r(y)); break;
3964 
3965                 case Op::select: // bsl16b is x = x ? y : z
3966                     if (try_alias(x)) { a->bsl16b( r(x), r(y), r(z)); }
3967                     else              { a->orr16b(dst(), r(x), r(x));
3968                                         a->bsl16b(dst(), r(y), r(z)); }
3969                                         break;
3970 
3971                 // fmin4s and fmax4s don't work the way we want with NaN,
3972                 // so we write them the long way:
3973                 case Op::min_f32: // min(x,y) = y<x ? y : x
3974                                   a->fcmgt4s(dst(), r(x), r(y));
3975                                   a->bsl16b (dst(), r(y), r(x));
3976                                   break;
3977 
3978                 case Op::max_f32: // max(x,y) = x<y ? y : x
3979                                   a->fcmgt4s(dst(), r(y), r(x));
3980                                   a->bsl16b (dst(), r(y), r(x));
3981                                   break;
3982 
3983                 case Op::shl_i32: a-> shl4s(dst(x), r(x), immA); break;
3984                 case Op::shr_i32: a->ushr4s(dst(x), r(x), immA); break;
3985                 case Op::sra_i32: a->sshr4s(dst(x), r(x), immA); break;
3986 
3987                 case Op::eq_i32: a->cmeq4s(dst(x,y), r(x), r(y)); break;
3988                 case Op::gt_i32: a->cmgt4s(dst(x,y), r(x), r(y)); break;
3989 
3990                 case Op::to_f32: a->scvtf4s (dst(x), r(x)); break;
3991                 case Op::trunc:  a->fcvtzs4s(dst(x), r(x)); break;
3992                 case Op::round:  a->fcvtns4s(dst(x), r(x)); break;
3993                 case Op::ceil:   a->frintp4s(dst(x), r(x)); break;
3994                 case Op::floor:  a->frintm4s(dst(x), r(x)); break;
3995 
3996                 case Op::to_fp16:
3997                     a->fcvtn  (dst(x), r(x));    // 4x f32 -> 4x f16 in bottom four lanes
3998                     a->uxtlh2s(dst(), dst());    // expand to 4x f16 in even 16-bit lanes
3999                     break;
4000 
4001                 case Op::from_fp16:
4002                     a->xtns2h(dst(x), r(x));     // pack even 16-bit lanes into bottom four lanes
4003                     a->fcvtl (dst(), dst());     // 4x f16 -> 4x f32
4004                     break;
4005             #endif
4006             }
4007 
4008             // Proactively free the registers holding any value that dies here.
4009             if (rd != NA &&                   dies_here(regs[rd])) { regs[rd] = NA; }
4010             if (rx != NA && regs[rx] != NA && dies_here(regs[rx])) { regs[rx] = NA; }
4011             if (ry != NA && regs[ry] != NA && dies_here(regs[ry])) { regs[ry] = NA; }
4012             if (rz != NA && regs[rz] != NA && dies_here(regs[rz])) { regs[rz] = NA; }
4013             if (rw != NA && regs[rw] != NA && dies_here(regs[rw])) { regs[rw] = NA; }
4014             return true;
4015         };
4016 
4017         #if defined(__x86_64__) || defined(_M_X64)
4018             auto jump_if_less = [&](A::Label* l) { a->jl (l); };
4019             auto jump         = [&](A::Label* l) { a->jmp(l); };
4020 
4021             auto add = [&](A::GP64 gp, int imm) { a->add(gp, imm); };
4022             auto sub = [&](A::GP64 gp, int imm) { a->sub(gp, imm); };
4023         #elif defined(__aarch64__)
4024             auto jump_if_less = [&](A::Label* l) { a->blt(l); };
4025             auto jump         = [&](A::Label* l) { a->b  (l); };
4026 
4027             auto add = [&](A::X gp, int imm) { a->add(gp, gp, imm); };
4028             auto sub = [&](A::X gp, int imm) { a->sub(gp, gp, imm); };
4029         #endif
4030 
4031         A::Label body,
4032                  tail,
4033                  done;
4034 
4035         enter();
4036         for (Val id = 0; id < (Val)instructions.size(); id++) {
4037             if (instructions[id].can_hoist && !emit(id, /*scalar=*/false)) {
4038                 return false;
4039             }
4040         }
4041 
4042         // This point marks a kind of canonical fixed point for register contents: if loop
4043         // code is generated as if these registers are holding these values, the next time
4044         // the loop comes around we'd better find those same registers holding those same values.
4045         auto restore_incoming_regs = [&,incoming=regs,saved_stack_slot=stack_slot,
4046                                       saved_next_stack_slot=next_stack_slot]{
4047             for (int r = 0; r < (int)regs.size(); r++) {
4048                 if (regs[r] != incoming[r]) {
4049                     regs[r]  = incoming[r];
4050                     if (regs[r] >= 0) {
4051                         load_from_memory((Reg)r, regs[r]);
4052                     }
4053                 }
4054             }
4055             *stack_hint = std::max(*stack_hint, next_stack_slot);
4056             stack_slot = saved_stack_slot;
4057             next_stack_slot = saved_next_stack_slot;
4058         };
4059 
4060         a->label(&body);
4061         {
4062             a->cmp(N, K);
4063             jump_if_less(&tail);
4064             for (Val id = 0; id < (Val)instructions.size(); id++) {
4065                 if (!instructions[id].can_hoist && !emit(id, /*scalar=*/false)) {
4066                     return false;
4067                 }
4068             }
4069             restore_incoming_regs();
4070             for (int i = 0; i < (int)fImpl->strides.size(); i++) {
4071                 if (fImpl->strides[i]) {
4072                     add(arg[i], K*fImpl->strides[i]);
4073                 }
4074             }
4075             sub(N, K);
4076             jump(&body);
4077         }
4078 
4079         a->label(&tail);
4080         {
4081             a->cmp(N, 1);
4082             jump_if_less(&done);
4083             for (Val id = 0; id < (Val)instructions.size(); id++) {
4084                 if (!instructions[id].can_hoist && !emit(id, /*scalar=*/true)) {
4085                     return false;
4086                 }
4087             }
4088             restore_incoming_regs();
4089             for (int i = 0; i < (int)fImpl->strides.size(); i++) {
4090                 if (fImpl->strides[i]) {
4091                     add(arg[i], 1*fImpl->strides[i]);
4092                 }
4093             }
4094             sub(N, 1);
4095             jump(&tail);
4096         }
4097 
4098         a->label(&done);
4099         {
4100             exit();
4101         }
4102 
4103         // Except for explicit aligned load and store instructions, AVX allows
4104         // memory operands to be unaligned.  So even though we're creating 16
4105         // byte patterns on ARM or 32-byte patterns on x86, we only need to
4106         // align to 4 bytes, the element size and alignment requirement.
4107 
4108         constants.foreach([&](int imm, A::Label* label) {
4109             a->align(4);
4110             a->label(label);
4111             for (int i = 0; i < K; i++) {
4112                 a->word(imm);
4113             }
4114         });
4115 
4116         if (!iota.references.empty()) {
4117             a->align(4);
4118             a->label(&iota);        // 0,1,2,3,4,...
4119             for (int i = 0; i < K; i++) {
4120                 a->word(i);
4121             }
4122         }
4123 
4124         if (!load64_index.references.empty()) {
4125             a->align(4);
4126             a->label(&load64_index);  // {0,2,4,6|1,3,5,7}
4127             a->word(0); a->word(2); a->word(4); a->word(6);
4128             a->word(1); a->word(3); a->word(5); a->word(7);
4129         }
4130 
4131         return true;
4132     }
4133 
setupJIT(const std::vector<OptimizedInstruction> & instructions,const char * debug_name)4134     void Program::setupJIT(const std::vector<OptimizedInstruction>& instructions,
4135                            const char* debug_name) {
4136         // Assemble with no buffer to determine a.size() (the number of bytes we'll assemble)
4137         // and stack_hint/registers_used to feed forward into the next jit() call.
4138         Assembler a{nullptr};
4139         int stack_hint = -1;
4140         uint32_t registers_used = 0xffff'ffff;  // Start conservatively with all.
4141         if (!this->jit(instructions, &stack_hint, &registers_used, &a)) {
4142             return;
4143         }
4144 
4145         fImpl->jit_size = a.size();
4146         void* jit_entry = alloc_jit_buffer(&fImpl->jit_size);
4147         fImpl->jit_entry.store(jit_entry);
4148 
4149         // Assemble the program for real with stack_hint/registers_used as feedback from first call.
4150         a = Assembler{jit_entry};
4151         SkAssertResult(this->jit(instructions, &stack_hint, &registers_used, &a));
4152         SkASSERT(a.size() <= fImpl->jit_size);
4153 
4154         // Remap as executable, and flush caches on platforms that need that.
4155         remap_as_executable(jit_entry, fImpl->jit_size);
4156 
4157         notify_vtune(debug_name, jit_entry, fImpl->jit_size);
4158 
4159     #if !defined(SK_BUILD_FOR_WIN)
4160         // For profiling and debugging, it's helpful to have this code loaded
4161         // dynamically rather than just jumping info fImpl->jit_entry.
4162         if (gSkVMJITViaDylib) {
4163             // Dump the raw program binary.
4164             SkString path = SkStringPrintf("/tmp/%s.XXXXXX", debug_name);
4165             int fd = mkstemp(path.writable_str());
4166             ::write(fd, jit_entry, a.size());
4167             close(fd);
4168 
4169             this->dropJIT();  // (unmap and null out fImpl->jit_entry.)
4170 
4171             // Convert it in-place to a dynamic library with a single symbol "skvm_jit":
4172             SkString cmd = SkStringPrintf(
4173                     "echo '.global _skvm_jit\n_skvm_jit: .incbin \"%s\"'"
4174                     " | clang -x assembler -shared - -o %s",
4175                     path.c_str(), path.c_str());
4176             system(cmd.c_str());
4177 
4178             // Load that dynamic library and look up skvm_jit().
4179             fImpl->dylib = dlopen(path.c_str(), RTLD_NOW|RTLD_LOCAL);
4180             void* sym = nullptr;
4181             for (const char* name : {"skvm_jit", "_skvm_jit"} ) {
4182                 if (!sym) { sym = dlsym(fImpl->dylib, name); }
4183             }
4184             fImpl->jit_entry.store(sym);
4185         }
4186     #endif
4187     }
4188 #endif
4189 
4190 }  // namespace skvm
4191