1 /*
2  * Copyright 2019 Google LLC
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "include/core/SkColorPriv.h"
9 #include "include/private/SkColorData.h"
10 #include "src/core/SkCpu.h"
11 #include "src/core/SkMSAN.h"
12 #include "src/core/SkVM.h"
13 #include "tests/Test.h"
14 
15 template <typename Fn>
test_jit_and_interpreter(const skvm::Builder & b,Fn && test)16 static void test_jit_and_interpreter(const skvm::Builder& b, Fn&& test) {
17     skvm::Program p = b.done();
18     test(p);
19     if (p.hasJIT()) {
20         test(b.done(/*debug_name=*/nullptr, /*allow_jit=*/false));
21     }
22 }
23 
DEF_TEST(SkVM_eliminate_dead_code, r) {
    // Build a few values that are never stored anywhere.
    skvm::Builder builder;
    {
        skvm::Ptr ptr     = builder.varying<int>();
        skvm::I32 loaded  = builder.load32(ptr);
        skvm::I32 doubled = builder.add(loaded, loaded);
        builder.add(doubled, builder.splat(7));
    }

    // Before dead code elimination all four instructions are present...
    std::vector<skvm::Instruction> insts = builder.program();
    REPORTER_ASSERT(r, insts.size() == 4);

    // ...and afterwards none of them survive.
    insts = skvm::eliminate_dead_code(insts);
    REPORTER_ASSERT(r, insts.size() == 0);
}
39 
DEF_TEST(SkVM_Pointless, r) {
    // A program with no memory arguments at all.  Everything in it should be
    // pegged as dead code, yet we should still be able to "run" it.
    skvm::Builder builder;
    builder.add(builder.splat(5.0f),
                builder.splat(4.0f));

    test_jit_and_interpreter(builder, [&](const skvm::Program& p) {
        for (int n = 0; n < 64; n++) {
            p.eval(n);
        }
    });

    const std::vector<skvm::OptimizedInstruction> optimized = builder.optimize();
    for (const skvm::OptimizedInstruction& op : optimized) {
        REPORTER_ASSERT(r, op.death == 0 && op.can_hoist == true);
    }
}
59 
DEF_TEST(SkVM_memset, r) {
    // buf[i] = 42
    skvm::Builder builder;
    builder.store32(builder.varying<int>(), builder.splat(42));

    test_jit_and_interpreter(builder, [&](const skvm::Program& program) {
        constexpr int kCount    = 17;
        constexpr int kSentinel = 47;

        // One extra slot past the end holds a sentinel to catch overwrites.
        int vals[kCount + 1];
        vals[kCount] = kSentinel;

        program.eval(kCount, vals);
        for (int i = 0; i < kCount; i++) {
            REPORTER_ASSERT(r, vals[i] == 42);
        }
        REPORTER_ASSERT(r, vals[kCount] == kSentinel);
    });
}
75 
DEF_TEST(SkVM_memcpy, r) {
    // dst[i] = src[i]
    skvm::Builder builder;
    {
        auto from = builder.varying<int>(),
             to   = builder.varying<int>();
        builder.store32(to, builder.load32(from));
    }

    test_jit_and_interpreter(builder, [&](const skvm::Program& program) {
        int from[] = {1,2,3,4,5,6,7,8,9},
            to  [] = {0,0,0,0,0,0,0,0,0};

        // Copy everything except the final element, which must stay zero.
        const int copied = (int)SK_ARRAY_COUNT(from) - 1;
        program.eval(copied, from, to);

        for (int i = 0; i < copied; i++) {
            REPORTER_ASSERT(r, to[i] == from[i]);
        }
        REPORTER_ASSERT(r, to[copied] == 0);
    });
}
96 
DEF_TEST(SkVM_allow_jit, r) {
    skvm::Builder builder;
    {
        auto from = builder.varying<int>(),
             to   = builder.varying<int>();
        builder.store32(to, builder.load32(from));
    }

    // Only meaningful when this program can JIT in the first place: in that
    // case passing allow_jit=false must produce a program without JIT code.
    const bool can_jit = builder.done("", /*allow_jit=*/true).hasJIT();
    if (can_jit) {
        REPORTER_ASSERT(r, !builder.done("", /*allow_jit=*/false).hasJIT());
    }
}
109 
DEF_TEST(SkVM_LoopCounts, r) {
    // Run the same program with every loop count from 0 through 64 and check
    // that exactly the first N elements are touched each time.

    // buf[i] += 1
    skvm::Builder builder;
    skvm::Ptr ptr = builder.varying<int>();
    builder.store32(ptr,
                    builder.add(builder.splat(1),
                                builder.load32(ptr)));

    test_jit_and_interpreter(builder, [&](const skvm::Program& p) {
        constexpr int kLen = 64;
        int vals[kLen];

        for (int n = 0; n <= kLen; n++) {
            // Reset to vals[i] == i before each run.
            for (int i = 0; i < kLen; i++) {
                vals[i] = i;
            }
            p.eval(n, vals);

            // The first n entries were bumped by one, the rest left alone.
            for (int i = 0; i < kLen; i++) {
                const int want = (i < n) ? i+1 : i;
                REPORTER_ASSERT(r, vals[i] == want);
            }
        }
    });
}
137 
DEF_TEST(SkVM_gather32, r) {
    // buf[i] = img[buf[i] & 7], with img reached through offset 0 of the uniforms.
    skvm::Builder builder;
    {
        skvm::Ptr uniforms = builder.uniform(),
                  buf      = builder.varying<int>();
        skvm::I32 ix     = builder.load32(buf);
        skvm::I32 masked = builder.bit_and(ix, builder.splat(7));
        builder.store32(buf, builder.gather32(uniforms,0, masked));
    }

    test_jit_and_interpreter(builder, [&](const skvm::Program& p) {
        const int img[] = {12,34,56,78, 90,98,76,54};

        constexpr int kCount = 20;
        int vals[kCount];
        for (int i = 0; i < kCount; i++) {
            vals[i] = i;
        }

        struct Uniforms {
            const int* img;
        } uniforms{img};

        p.eval(kCount, &uniforms, vals);

        // The index is masked to 0..7, so the eight image values repeat
        // through the twenty outputs.
        for (int i = 0; i < kCount; i++) {
            REPORTER_ASSERT(r, vals[i] == img[i & 7]);
        }
    });
}
185 
DEF_TEST(SkVM_gathers,r)186 DEF_TEST(SkVM_gathers, r) {
187     skvm::Builder b;
188     {
189         skvm::Ptr uniforms = b.uniform(),
190                   buf32    = b.varying<int>(),
191                   buf16    = b.varying<uint16_t>(),
192                   buf8     = b.varying<uint8_t>();
193 
194         skvm::I32 x = b.load32(buf32);
195 
196         b.store32(buf32, b.gather32(uniforms,0, b.bit_and(x, b.splat( 7))));
197         b.store16(buf16, b.gather16(uniforms,0, b.bit_and(x, b.splat(15))));
198         b.store8 (buf8 , b.gather8 (uniforms,0, b.bit_and(x, b.splat(31))));
199     }
200 
201     test_jit_and_interpreter(b, [&](const skvm::Program& program) {
202         const int img[] = {12,34,56,78, 90,98,76,54};
203 
204         constexpr int N = 20;
205         int      buf32[N];
206         uint16_t buf16[N];
207         uint8_t  buf8 [N];
208 
209         for (int i = 0; i < 20; i++) {
210             buf32[i] = i;
211         }
212 
213         struct Uniforms {
214             const int* img;
215         } uniforms{img};
216 
217         program.eval(N, &uniforms, buf32, buf16, buf8);
218         int i = 0;
219         REPORTER_ASSERT(r, buf32[i] == 12 && buf16[i] == 12 && buf8[i] == 12); i++;
220         REPORTER_ASSERT(r, buf32[i] == 34 && buf16[i] ==  0 && buf8[i] ==  0); i++;
221         REPORTER_ASSERT(r, buf32[i] == 56 && buf16[i] == 34 && buf8[i] ==  0); i++;
222         REPORTER_ASSERT(r, buf32[i] == 78 && buf16[i] ==  0 && buf8[i] ==  0); i++;
223         REPORTER_ASSERT(r, buf32[i] == 90 && buf16[i] == 56 && buf8[i] == 34); i++;
224         REPORTER_ASSERT(r, buf32[i] == 98 && buf16[i] ==  0 && buf8[i] ==  0); i++;
225         REPORTER_ASSERT(r, buf32[i] == 76 && buf16[i] == 78 && buf8[i] ==  0); i++;
226         REPORTER_ASSERT(r, buf32[i] == 54 && buf16[i] ==  0 && buf8[i] ==  0); i++;
227 
228         REPORTER_ASSERT(r, buf32[i] == 12 && buf16[i] == 90 && buf8[i] == 56); i++;
229         REPORTER_ASSERT(r, buf32[i] == 34 && buf16[i] ==  0 && buf8[i] ==  0); i++;
230         REPORTER_ASSERT(r, buf32[i] == 56 && buf16[i] == 98 && buf8[i] ==  0); i++;
231         REPORTER_ASSERT(r, buf32[i] == 78 && buf16[i] ==  0 && buf8[i] ==  0); i++;
232         REPORTER_ASSERT(r, buf32[i] == 90 && buf16[i] == 76 && buf8[i] == 78); i++;
233         REPORTER_ASSERT(r, buf32[i] == 98 && buf16[i] ==  0 && buf8[i] ==  0); i++;
234         REPORTER_ASSERT(r, buf32[i] == 76 && buf16[i] == 54 && buf8[i] ==  0); i++;
235         REPORTER_ASSERT(r, buf32[i] == 54 && buf16[i] ==  0 && buf8[i] ==  0); i++;
236 
237         REPORTER_ASSERT(r, buf32[i] == 12 && buf16[i] == 12 && buf8[i] == 90); i++;
238         REPORTER_ASSERT(r, buf32[i] == 34 && buf16[i] ==  0 && buf8[i] ==  0); i++;
239         REPORTER_ASSERT(r, buf32[i] == 56 && buf16[i] == 34 && buf8[i] ==  0); i++;
240         REPORTER_ASSERT(r, buf32[i] == 78 && buf16[i] ==  0 && buf8[i] ==  0); i++;
241     });
242 }
243 
DEF_TEST(SkVM_gathers2, r) {
    // Like SkVM_gathers, but the indices are used unmasked and the image is a
    // byte ramp, so each gathered value spells out where it was read from.
    skvm::Builder builder;
    {
        skvm::Ptr uniforms = builder.uniform(),
                  buf32    = builder.varying<int>(),
                  buf16    = builder.varying<uint16_t>(),
                  buf8     = builder.varying<uint8_t>();

        skvm::I32 ix = builder.load32(buf32);

        builder.store32(buf32, builder.gather32(uniforms,0, ix));
        builder.store16(buf16, builder.gather16(uniforms,0, ix));
        builder.store8 (buf8 , builder.gather8 (uniforms,0, ix));
    }

    test_jit_and_interpreter(builder, [&](const skvm::Program& p) {
        // img[i] == i for every byte.
        uint8_t img[256];
        for (int i = 0; i < 256; i++) {
            img[i] = i;
        }

        constexpr int kCount = 64;
        int      out32[kCount];
        uint16_t out16[kCount];
        uint8_t  out8 [kCount];

        // Scatter the indices around 0..63 rather than walking them in order.
        for (int i = 0; i < kCount; i++) {
            out32[i] = (i*47)&63;
            out16[i] = 0;
            out8 [i] = 0;
        }

        struct Uniforms {
            const uint8_t* img;
        } uniforms{img};

        p.eval(kCount, &uniforms, out32, out16, out8);

        // Byte gathers return the index itself.
        for (int i = 0; i < kCount; i++) {
            REPORTER_ASSERT(r, out8[i] == ((i*47)&63));  // 0,47,30,13,60,...
        }

        // Spot check the wider gathers at the first and last outputs.
        REPORTER_ASSERT(r, out16[ 0] == 0x0100);
        REPORTER_ASSERT(r, out16[63] == 0x2322);

        REPORTER_ASSERT(r, out32[ 0] == 0x03020100);
        REPORTER_ASSERT(r, out32[63] == 0x47464544);
    });
}
292 
DEF_TEST(SkVM_bitops, r) {
    // Chain the bitwise and shift ops over one value.  The trailing comments
    // track the running value for the 0x42 input used below.
    skvm::Builder builder;
    {
        skvm::Ptr ptr = builder.varying<int>();

        skvm::I32 v = builder.load32(ptr);

        v = builder.bit_and  (v, builder.splat(0xf1));  // 0x40
        v = builder.bit_or   (v, builder.splat(0x80));  // 0xc0
        v = builder.bit_xor  (v, builder.splat(0xfe));  // 0x3e
        v = builder.bit_clear(v, builder.splat(0x30));  // 0x0e

        v = builder.shl(v, 28);  // 0xe000'0000
        v = builder.sra(v, 28);  // 0xffff'fffe
        v = builder.shr(v,  1);  // 0x7fff'ffff

        builder.store32(ptr, v);
    }

    test_jit_and_interpreter(builder, [&](const skvm::Program& p) {
        int value = 0x42;
        p.eval(1, &value);
        REPORTER_ASSERT(r, value == 0x7fff'ffff);
    });
}
318 
DEF_TEST(SkVM_select_is_NaN, r) {
    // dst = is_NaN(src) ? 0 : src
    skvm::Builder builder;
    {
        skvm::Ptr src = builder.varying<float>(),
                  dst = builder.varying<float>();

        skvm::F32 v = builder.loadF(src);
        v = select(is_NaN(v), builder.splat(0.0f)
                            , v);
        builder.storeF(dst, v);
    }

    // The whole select should optimize down to a compare and a bit_clear.
    std::vector<skvm::OptimizedInstruction> optimized = builder.optimize();
    REPORTER_ASSERT(r, optimized.size() == 4);
    REPORTER_ASSERT(r, optimized[0].op == skvm::Op::load32);
    REPORTER_ASSERT(r, optimized[1].op == skvm::Op::neq_f32);
    REPORTER_ASSERT(r, optimized[2].op == skvm::Op::bit_clear);
    REPORTER_ASSERT(r, optimized[3].op == skvm::Op::store32);

    test_jit_and_interpreter(builder, [&](const skvm::Program& p) {
        // ±NaN, ±0, ±1, ±inf
        uint32_t in[] = {0x7f80'0001, 0xff80'0001, 0x0000'0000, 0x8000'0000,
                         0x3f80'0000, 0xbf80'0000, 0x7f80'0000, 0xff80'0000};
        uint32_t out[SK_ARRAY_COUNT(in)];
        p.eval(SK_ARRAY_COUNT(in), in, out);

        // Only the two leading NaNs are replaced; everything else passes
        // through bit-for-bit.
        for (int i = 0; i < (int)SK_ARRAY_COUNT(in); i++) {
            REPORTER_ASSERT(r, out[i] == (i < 2 ? 0 : in[i]));
        }
    });
}
350 
DEF_TEST(SkVM_f32, r) {
    // A round trip through add, sub and div that always lands on 1.
    skvm::Builder builder;
    {
        skvm::Ptr ptr = builder.varying<float>();

        skvm::F32 x     = builder.loadF(ptr),
                  twice = builder.add(x,x),       // 2x
                  same  = builder.sub(twice,x),   // 2x-x = x
                  one   = builder.div(same,x);    // x/x = 1
        builder.storeF(ptr, one);
    }

    test_jit_and_interpreter(builder, [&](const skvm::Program& p) {
        float vals[] = { 1,2,3,4,5,6,7,8,9 };
        p.eval(SK_ARRAY_COUNT(vals), vals);
        for (int i = 0; i < (int)SK_ARRAY_COUNT(vals); i++) {
            REPORTER_ASSERT(r, vals[i] == 1.0f);
        }
    });
}
371 
DEF_TEST(SkVM_cmp_i32, r) {
    // Pack the results of six integer comparisons into bits 0..5 of the output.
    skvm::Builder builder;
    {
        skvm::I32 x = builder.load32(builder.varying<int>());

        // Reduce a comparison mask to its low bit, moved up to bit `shift`.
        auto mask_to_bit = [&](int shift, skvm::I32 mask) {
            return builder.shl(builder.bit_and(mask, builder.splat(0x1)), shift);
        };

        skvm::I32 bits = builder.splat(0);
        bits = builder.bit_or(bits, mask_to_bit(0, builder. eq(x, builder.splat(0))));
        bits = builder.bit_or(bits, mask_to_bit(1, builder.neq(x, builder.splat(1))));
        bits = builder.bit_or(bits, mask_to_bit(2, builder. lt(x, builder.splat(2))));
        bits = builder.bit_or(bits, mask_to_bit(3, builder.lte(x, builder.splat(3))));
        bits = builder.bit_or(bits, mask_to_bit(4, builder. gt(x, builder.splat(4))));
        bits = builder.bit_or(bits, mask_to_bit(5, builder.gte(x, builder.splat(5))));

        builder.store32(builder.varying<int>(), bits);
    }
    test_jit_and_interpreter(builder, [&](const skvm::Program& p) {
        int in[] = { 0,1,2,3,4,5,6,7,8,9 };
        int out[SK_ARRAY_COUNT(in)];

        p.eval(SK_ARRAY_COUNT(in), in, out);

        // Every input from 5 up yields the same bit pattern.
        const int want[SK_ARRAY_COUNT(in)] = {
            0b001111, 0b001100, 0b001010, 0b001010, 0b000010,
            0b110010, 0b110010, 0b110010, 0b110010, 0b110010,
        };
        for (int i = 0; i < (int)SK_ARRAY_COUNT(out); i++) {
            REPORTER_ASSERT(r, out[i] == want[i]);
        }
    });
}
407 
DEF_TEST(SkVM_cmp_f32, r) {
    // Pack the results of six float comparisons into bits 0..5 of the output.
    skvm::Builder builder;
    {
        skvm::F32 x = builder.loadF(builder.varying<float>());

        // Reduce a comparison mask to its low bit, moved up to bit `shift`.
        auto mask_to_bit = [&](int shift, skvm::I32 mask) {
            return builder.shl(builder.bit_and(mask, builder.splat(0x1)), shift);
        };

        skvm::I32 bits = builder.splat(0);
        bits = builder.bit_or(bits, mask_to_bit(0, builder. eq(x, builder.splat(0.0f))));
        bits = builder.bit_or(bits, mask_to_bit(1, builder.neq(x, builder.splat(1.0f))));
        bits = builder.bit_or(bits, mask_to_bit(2, builder. lt(x, builder.splat(2.0f))));
        bits = builder.bit_or(bits, mask_to_bit(3, builder.lte(x, builder.splat(3.0f))));
        bits = builder.bit_or(bits, mask_to_bit(4, builder. gt(x, builder.splat(4.0f))));
        bits = builder.bit_or(bits, mask_to_bit(5, builder.gte(x, builder.splat(5.0f))));

        builder.store32(builder.varying<int>(), bits);
    }

    test_jit_and_interpreter(builder, [&](const skvm::Program& p) {
        float in[] = { 0,1,2,3,4,5,6,7,8,9 };
        int out[SK_ARRAY_COUNT(in)];

        p.eval(SK_ARRAY_COUNT(in), in, out);

        // Every input from 5 up yields the same bit pattern.
        const int want[SK_ARRAY_COUNT(in)] = {
            0b001111, 0b001100, 0b001010, 0b001010, 0b000010,
            0b110010, 0b110010, 0b110010, 0b110010, 0b110010,
        };
        for (int i = 0; i < (int)SK_ARRAY_COUNT(out); i++) {
            REPORTER_ASSERT(r, out[i] == want[i]);
        }
    });
}
444 
DEF_TEST(SkVM_index, r) {
    // buf[i] = index()
    skvm::Builder builder;
    builder.store32(builder.varying<int>(), builder.index());

    test_jit_and_interpreter(builder, [&](const skvm::Program& p) {
        constexpr int kCount = 23;
        int got[kCount];
        p.eval(kCount, got);

        // The stored index is expected to count down: kCount, kCount-1, ..., 1.
        for (int i = 0; i < kCount; i++) {
            REPORTER_ASSERT(r, got[i] == kCount - i);
        }
    });
}
457 
DEF_TEST(SkVM_mad, r) {
    // This program is designed to exercise the tricky corners of instruction
    // and register selection for Op::mad_f32.

    skvm::Builder builder;
    {
        skvm::Ptr ptr = builder.varying<int>();

        skvm::F32 x = builder.to_F32(builder.load32(ptr)),
                  y = builder.mad(x,x,x),   // x is needed in the future, so r[x] != r[y].
                  z = builder.mad(y,y,x),   // y is needed in the future, but r[z] = r[x] is ok.
                  w = builder.mad(z,z,y),   // w can alias z but not y.
                  v = builder.mad(w,y,w);   // Got to stop somewhere.
        builder.store32(ptr, builder.trunc(v));
    }

    test_jit_and_interpreter(builder, [&](const skvm::Program& p) {
        // Starting from x = 2:
        //   y = 2*2 + 2       = 6
        //   z = 6*6 + 2       = 38
        //   w = 38*38 + 6     = 1450
        //   v = 1450*6 + 1450 = 10150
        int value = 2;
        p.eval(1, &value);
        REPORTER_ASSERT(r, value == 10150);
    });
}
485 
DEF_TEST(SkVM_fms, r) {
    // Create a pattern that can be peepholed into an Op::fms_f32:
    //     buf[i] = trunc(buf[i]*2 - 1)
    skvm::Builder b;
    {
        skvm::Ptr arg = b.varying<int>();

        skvm::F32 x = b.to_F32(b.load32(arg)),
                  v = b.sub(b.mul(x, b.splat(2.0f)),
                            b.splat(1.0f));
        b.store32(arg, b.trunc(v));
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        int buf[] = {0,1,2,3,4,5,6,7,8,9,10};
        program.eval((int)SK_ARRAY_COUNT(buf), buf);

        for (int i = 0; i < (int)SK_ARRAY_COUNT(buf); i++) {
            // Compare, don't assign: `buf[i] = 2*i-1` is always non-zero, so
            // it would pass no matter what the program computed.
            REPORTER_ASSERT(r, buf[i] == 2*i-1);
        }
    });
}
507 
DEF_TEST(SkVM_fnma, r) {
    // Create a pattern that can be peepholed into an Op::fnma_f32:
    //     buf[i] = trunc(1 - buf[i]*2)
    skvm::Builder b;
    {
        skvm::Ptr arg = b.varying<int>();

        skvm::F32 x = b.to_F32(b.load32(arg)),
                  v = b.sub(b.splat(1.0f),
                            b.mul(x, b.splat(2.0f)));
        b.store32(arg, b.trunc(v));
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        int buf[] = {0,1,2,3,4,5,6,7,8,9,10};
        program.eval((int)SK_ARRAY_COUNT(buf), buf);

        for (int i = 0; i < (int)SK_ARRAY_COUNT(buf); i++) {
            // Compare, don't assign: `buf[i] = 1-2*i` is always non-zero, so
            // it would pass no matter what the program computed.
            REPORTER_ASSERT(r, buf[i] == 1-2*i);
        }
    });
}
529 
DEF_TEST(SkVM_madder, r) {
    // Another chain of mads, this time all in floats, with different register
    // reuse constraints than SkVM_mad.
    skvm::Builder builder;
    {
        skvm::Ptr ptr = builder.varying<float>();

        skvm::F32 x = builder.loadF(ptr),
                  y = builder.mad(x,x,x),   // x is needed in the future, so r[x] != r[y].
                  z = builder.mad(y,x,y),   // r[x] can be reused after this instruction, but not r[y].
                  w = builder.mad(y,y,z);
        builder.storeF(ptr, w);
    }

    test_jit_and_interpreter(builder, [&](const skvm::Program& p) {
        // Starting from x = 2:
        //   y = 2*2 + 2  = 6
        //   z = 6*2 + 6  = 18
        //   w = 6*6 + 18 = 54
        float value = 2.0f;
        p.eval(1, &value);
        REPORTER_ASSERT(r, value == 54.0f);
    });
}
551 
DEF_TEST(SkVM_floor, r) {
    // buf[i] = floor(buf[i])
    skvm::Builder builder;
    {
        skvm::Ptr ptr = builder.varying<float>();
        builder.storeF(ptr, builder.floor(builder.loadF(ptr)));
    }

    test_jit_and_interpreter(builder, [&](const skvm::Program& p) {
        float vals[]     = { -2.0f, -1.5f, -1.0f, 0.0f, 1.0f, 1.5f, 2.0f };
        float expected[] = { -2.0f, -2.0f, -1.0f, 0.0f, 1.0f, 1.0f, 2.0f };

        p.eval(SK_ARRAY_COUNT(vals), vals);

        const int count = (int)SK_ARRAY_COUNT(vals);
        for (int i = 0; i < count; i++) {
            REPORTER_ASSERT(r, vals[i] == expected[i]);
        }
    });
}
568 
DEF_TEST(SkVM_round, r) {
    // dst[i] = round(src[i])
    skvm::Builder builder;
    {
        skvm::Ptr in  = builder.varying<float>();
        skvm::Ptr out = builder.varying<int>();
        builder.store32(out, builder.round(builder.loadF(in)));
    }

    // The test cases on exact 0.5f boundaries assume the current rounding mode is nearest even.
    // We haven't explicitly guaranteed that here... it just probably is.
    test_jit_and_interpreter(builder, [&](const skvm::Program& p) {
        float in[]       = { -1.5f, -0.5f, 0.0f, 0.5f, 0.2f, 0.6f, 1.0f, 1.4f, 1.5f, 2.0f };
        int   expected[] = { -2   ,  0   , 0   , 0   , 0   , 1   , 1   , 1   , 2   , 2    };
        int   out[SK_ARRAY_COUNT(in)];

        p.eval(SK_ARRAY_COUNT(in), in, out);

        const int count = (int)SK_ARRAY_COUNT(out);
        for (int i = 0; i < count; i++) {
            REPORTER_ASSERT(r, out[i] == expected[i]);
        }
    });
}
590 
DEF_TEST(SkVM_min, r) {
    // dst[i] = min(src1[i], src2[i])
    skvm::Builder builder;
    {
        skvm::Ptr lhs = builder.varying<float>();
        skvm::Ptr rhs = builder.varying<float>();
        skvm::Ptr out = builder.varying<float>();

        builder.storeF(out, builder.min(builder.loadF(lhs), builder.loadF(rhs)));
    }

    test_jit_and_interpreter(builder, [&](const skvm::Program& p) {
        float lhs[]      = { 0.0f, 1.0f, 4.0f, -1.0f, -1.0f};
        float rhs[]      = { 0.0f, 2.0f, 3.0f,  1.0f, -2.0f};
        float expected[] = { 0.0f, 1.0f, 3.0f, -1.0f, -2.0f};
        float out[SK_ARRAY_COUNT(lhs)];

        p.eval(SK_ARRAY_COUNT(out), lhs, rhs, out);

        const int count = (int)SK_ARRAY_COUNT(out);
        for (int i = 0; i < count; i++) {
            REPORTER_ASSERT(r, out[i] == expected[i]);
        }
    });
}
612 
DEF_TEST(SkVM_max, r) {
    // dst[i] = max(src1[i], src2[i])
    skvm::Builder builder;
    {
        skvm::Ptr lhs = builder.varying<float>();
        skvm::Ptr rhs = builder.varying<float>();
        skvm::Ptr out = builder.varying<float>();

        builder.storeF(out, builder.max(builder.loadF(lhs), builder.loadF(rhs)));
    }

    test_jit_and_interpreter(builder, [&](const skvm::Program& p) {
        float lhs[]      = { 0.0f, 1.0f, 4.0f, -1.0f, -1.0f};
        float rhs[]      = { 0.0f, 2.0f, 3.0f,  1.0f, -2.0f};
        float expected[] = { 0.0f, 2.0f, 4.0f,  1.0f, -1.0f};
        float out[SK_ARRAY_COUNT(lhs)];

        p.eval(SK_ARRAY_COUNT(out), lhs, rhs, out);

        const int count = (int)SK_ARRAY_COUNT(out);
        for (int i = 0; i < count; i++) {
            REPORTER_ASSERT(r, out[i] == expected[i]);
        }
    });
}
634 
DEF_TEST(SkVM_hoist, r) {
    // This program uses enough constants that it will fail to JIT if we hoist them.
    // The JIT will try again without hoisting, and that'll just need 2 registers.
    skvm::Builder builder;
    {
        skvm::Ptr ptr = builder.varying<int>();
        skvm::I32 sum = builder.load32(ptr);
        for (int k = 0; k < 32; k++) {
            sum = builder.add(sum, builder.splat(k));
        }
        builder.store32(ptr, sum);
    }

    test_jit_and_interpreter(builder, [&](const skvm::Program& p) {
        // 0 + 1 + 2 + ... + 31 == 496, added on top of the starting 4.
        int value = 4;
        p.eval(1, &value);
        REPORTER_ASSERT(r, value == 500);
    });
}
656 
DEF_TEST(SkVM_select, r) {
    // buf[i] = buf[i] > 4 ? buf[i] : 42
    skvm::Builder builder;
    {
        skvm::Ptr ptr = builder.varying<int>();

        skvm::I32 v = builder.load32(ptr);

        v = builder.select( builder.gt(v, builder.splat(4)), v, builder.splat(42) );

        builder.store32(ptr, v);
    }

    test_jit_and_interpreter(builder, [&](const skvm::Program& p) {
        int vals[] = { 0,1,2,3,4,5,6,7,8 };
        p.eval(SK_ARRAY_COUNT(vals), vals);

        const int count = (int)SK_ARRAY_COUNT(vals);
        for (int i = 0; i < count; i++) {
            REPORTER_ASSERT(r, vals[i] == (i > 4 ? i : 42));
        }
    });
}
677 
DEF_TEST(SkVM_swap, r) {
    skvm::Builder builder;
    {
        // This program is the equivalent of
        //     x = *X
        //     y = *Y
        //     *X = y
        //     *Y = x
        // One rescheduling of the program based only on data flow of Op arguments is
        //     x = *X
        //     *Y = x
        //     y = *Y
        //     *X = y
        // but this reordering does not produce the same results and is invalid.
        skvm::Ptr X = builder.varying<int>(),
                  Y = builder.varying<int>();

        skvm::I32 x = builder.load32(X),
                  y = builder.load32(Y);

        builder.store32(X, y);
        builder.store32(Y, x);
    }

    test_jit_and_interpreter(builder, [&](const skvm::Program& p) {
        int first [] = { 0,1,2,3 };
        int second[] = { 4,5,6,7 };
        p.eval(SK_ARRAY_COUNT(first), first, second);

        // After the swap each buffer holds the other's original contents.
        const int count = (int)SK_ARRAY_COUNT(first);
        for (int i = 0; i < count; i++) {
            REPORTER_ASSERT(r, first [i] == 4 + i);
            REPORTER_ASSERT(r, second[i] == i);
        }
    });
}
712 
DEF_TEST(SkVM_NewOps, r) {
    // Exercise a somewhat arbitrary set of new ops.
    skvm::Builder builder;
    {
        skvm::Ptr buf      = builder.varying<int16_t>(),
                  uniforms = builder.uniform();

        skvm::I32 v = builder.load16(buf);

        // The uniforms begin with a pointer (the gather base at offset 0),
        // and the int fields are read at offsets just past it.
        const size_t kAfterPtr = sizeof(const int*);

        v = builder.add(v, builder.uniform32(uniforms, kAfterPtr+0));
        v = builder.mul(v, builder.uniform32(uniforms, kAfterPtr+4));
        v = builder.sub(v, builder.uniform32(uniforms, kAfterPtr+8));

        // Pin v to [0, limit].
        skvm::I32 limit = builder.uniform32(uniforms, kAfterPtr+12);
        v = builder.select(builder.lt(v, builder.splat(0)), builder.splat(0), v);
        v = builder.select(builder.gt(v, limit            ), limit            , v);

        v = builder.gather8(uniforms,0, v);

        builder.store16(buf, v);
    }

    test_jit_and_interpreter(builder, [&](const skvm::Program& p) {
        const int kCount = 31;
        int16_t vals[kCount];
        for (int i = 0; i < kCount; i++) {
            vals[i] = i;
        }

        // img[i] == i*i
        const int kImgLen = 16;
        uint8_t img[kImgLen];
        for (int i = 0; i < kImgLen; i++) {
            img[i] = i*i;
        }

        struct {
            const uint8_t* img;
            int      add   = 5;
            int      mul   = 3;
            int      sub   = 18;
            int      limit = kImgLen-1;
        } uniforms{img};

        p.eval(kCount, vals, &uniforms);

        for (int i = 0; i < kCount; i++) {
            // Our first math calculates (i+5)*3 - 18 a.k.a 3*(i-1).
            int ix = 3*(i-1);

            // Then that's pinned to the limits of img.
            if (i < 2) { ix =  0; }  // Notice i == 1 hits ix == 0 exactly...
            if (i > 5) { ix = 15; }  // ...and i == 6 hits ix == 15 exactly
            REPORTER_ASSERT(r, vals[i] == img[ix]);
        }
    });
}
771 
DEF_TEST(SkVM_sqrt, r) {
    // buf[i] = sqrt(buf[i])
    skvm::Builder builder;
    // NOTE(review): declared as varying<int> although the buffer below holds
    // floats -- presumably only the element size matters here; confirm.
    auto ptr = builder.varying<int>();
    builder.storeF(ptr, builder.sqrt(builder.loadF(ptr)));

    test_jit_and_interpreter(builder, [&](const skvm::Program& p) {
        constexpr int kCount = 17;

        // Start with perfect squares, so the roots compare exactly.
        float vals[kCount];
        for (int i = 0; i < kCount; i++) {
            vals[i] = (float)(i*i);
        }

        // x^2 -> x
        p.eval(kCount, vals);

        for (int i = 0; i < kCount; i++) {
            REPORTER_ASSERT(r, vals[i] == (float)i);
        }
    });
}
792 
DEF_TEST(SkVM_MSAN, r) {
    // This little memset32() program should be able to JIT, but if we run that
    // JIT code in an MSAN build, it won't see the writes initialize buf.  So
    // this tests that we're using the interpreter instead.
    skvm::Builder builder;
    builder.store32(builder.varying<int>(), builder.splat(42));

    test_jit_and_interpreter(builder, [&](const skvm::Program& p) {
        constexpr int kCount = 17;
        int vals[kCount];                 // Intentionally uninitialized.

        p.eval(kCount, vals);
        sk_msan_assert_initialized(vals, vals+kCount);

        for (int i = 0; i < kCount; i++) {
            REPORTER_ASSERT(r, vals[i] == 42);
        }
    });
}
810 
DEF_TEST(SkVM_assert, r) {
    // assert_true(buf[i] < 42), run over inputs that all satisfy it.
    skvm::Builder builder;
    builder.assert_true(builder.lt(builder.load32(builder.varying<int>()),
                                   builder.splat(42)));

    test_jit_and_interpreter(builder, [&](const skvm::Program& p) {
        int vals[] = { 0,1,2,3,4,5,6,7,8,9 };
        p.eval(SK_ARRAY_COUNT(vals), vals);
    });
}
821 
DEF_TEST(SkVM_premul,reporter)822 DEF_TEST(SkVM_premul, reporter) {
823     // Test that premul is short-circuited when alpha is known opaque.
824     {
825         skvm::Builder p;
826         auto rptr = p.varying<int>(),
827              aptr = p.varying<int>();
828 
829         skvm::F32 r = p.loadF(rptr),
830                   g = p.splat(0.0f),
831                   b = p.splat(0.0f),
832                   a = p.loadF(aptr);
833 
834         p.premul(&r, &g, &b, a);
835         p.storeF(rptr, r);
836 
837         // load red, load alpha, red *= alpha, store red
838         REPORTER_ASSERT(reporter, p.done().instructions().size() == 4);
839     }
840 
841     {
842         skvm::Builder p;
843         auto rptr = p.varying<int>();
844 
845         skvm::F32 r = p.loadF(rptr),
846                   g = p.splat(0.0f),
847                   b = p.splat(0.0f),
848                   a = p.splat(1.0f);
849 
850         p.premul(&r, &g, &b, a);
851         p.storeF(rptr, r);
852 
853         // load red, store red
854         REPORTER_ASSERT(reporter, p.done().instructions().size() == 2);
855     }
856 
857     // Same deal for unpremul.
858     {
859         skvm::Builder p;
860         auto rptr = p.varying<int>(),
861              aptr = p.varying<int>();
862 
863         skvm::F32 r = p.loadF(rptr),
864                   g = p.splat(0.0f),
865                   b = p.splat(0.0f),
866                   a = p.loadF(aptr);
867 
868         p.unpremul(&r, &g, &b, a);
869         p.storeF(rptr, r);
870 
871         // load red, load alpha, a bunch of unpremul instructions, store red
872         REPORTER_ASSERT(reporter, p.done().instructions().size() >= 4);
873     }
874 
875     {
876         skvm::Builder p;
877         auto rptr = p.varying<int>();
878 
879         skvm::F32 r = p.loadF(rptr),
880                   g = p.splat(0.0f),
881                   b = p.splat(0.0f),
882                   a = p.splat(1.0f);
883 
884         p.unpremul(&r, &g, &b, a);
885         p.storeF(rptr, r);
886 
887         // load red, store red
888         REPORTER_ASSERT(reporter, p.done().instructions().size() == 2);
889     }
890 }
891 
892 template <typename Fn>
test_asm(skiatest::Reporter * r,Fn && fn,std::initializer_list<uint8_t> expected)893 static void test_asm(skiatest::Reporter* r, Fn&& fn, std::initializer_list<uint8_t> expected) {
894     uint8_t buf[4096];
895     skvm::Assembler a{buf};
896     fn(a);
897 
898     REPORTER_ASSERT(r, a.size() == expected.size());
899 
900     auto got = (const uint8_t*)buf,
901          want = expected.begin();
902     for (int i = 0; i < (int)std::min(a.size(), expected.size()); i++) {
903         REPORTER_ASSERT(r, got[i] == want[i],
904                         "byte %d was %02x, want %02x", i, got[i], want[i]);
905     }
906 }
907 
DEF_TEST(SkVM_Assembler,r)908 DEF_TEST(SkVM_Assembler, r) {
909     // Easiest way to generate test cases is
910     //
911     //   echo '...some asm...' | llvm-mc -show-encoding -x86-asm-syntax=intel
912     //
913     // The -x86-asm-syntax=intel bit is optional, controlling the
914     // input syntax only; the output will always be AT&T  op x,y,dst style.
915     // Our APIs read more like Intel op dst,x,y as op(dst,x,y), so I find
916     // that a bit easier to use here, despite maybe favoring AT&T overall.
917 
918     using A = skvm::Assembler;
919     // Our exit strategy from AVX code.
920     test_asm(r, [&](A& a) {
921         a.int3();
922         a.vzeroupper();
923         a.ret();
924     },{
925         0xcc,
926         0xc5, 0xf8, 0x77,
927         0xc3,
928     });
929 
930     // Align should pad with zero
931     test_asm(r, [&](A& a) {
932         a.ret();
933         a.align(4);
934     },{
935         0xc3,
936         0x00, 0x00, 0x00,
937     });
938 
939     test_asm(r, [&](A& a) {
940         a.add(A::rax, 8);       // Always good to test rax.
941         a.sub(A::rax, 32);
942 
943         a.add(A::rdi, 12);      // Last 0x48 REX
944         a.sub(A::rdi, 8);
945 
946         a.add(A::r8 , 7);       // First 0x49 REX
947         a.sub(A::r8 , 4);
948 
949         a.add(A::rsi, 128);     // Requires 4 byte immediate.
950         a.sub(A::r8 , 1000000);
951 
952         a.add(A::Mem{A::rsi}, 7);                       // addq $7, (%rsi)
953         a.add(A::Mem{A::rsi, 12}, 7);                   // addq $7, 12(%rsi)
954         a.add(A::Mem{A::rsp, 12}, 7);                   // addq $7, 12(%rsp)
955         a.add(A::Mem{A::r12, 12}, 7);                   // addq $7, 12(%r12)
956         a.add(A::Mem{A::rsp, 12, A::rax, A::FOUR}, 7);  // addq $7, 12(%rsp,%rax,4)
957         a.add(A::Mem{A::r12, 12, A::rax, A::FOUR}, 7);  // addq $7, 12(%r12,%rax,4)
958         a.add(A::Mem{A::rax, 12, A::r12, A::FOUR}, 7);  // addq $7, 12(%rax,%r12,4)
959         a.add(A::Mem{A::r11, 12, A::r8 , A::TWO }, 7);  // addq $7, 12(%r11,%r8,2)
960         a.add(A::Mem{A::r11, 12, A::rax}         , 7);  // addq $7, 12(%r11,%rax)
961         a.add(A::Mem{A::rax, 12, A::r11}         , 7);  // addq $7, 12(%rax,%r11)
962 
963         a.sub(A::Mem{A::rax, 12, A::r11}         , 7);  // subq $7, 12(%rax,%r11)
964 
965         a.add(       A::rax     , A::rcx);              // addq %rcx, %rax
966         a.add(A::Mem{A::rax}    , A::rcx);              // addq %rcx, (%rax)
967         a.add(A::Mem{A::rax, 12}, A::rcx);              // addq %rcx, 12(%rax)
968         a.add(A::rcx, A::Mem{A::rax, 12});              // addq 12(%rax), %rcx
969 
970         a.sub(A::rcx, A::Mem{A::rax, 12});              // subq 12(%rax), %rcx
971     },{
972         0x48, 0x83, 0b11'000'000, 0x08,
973         0x48, 0x83, 0b11'101'000, 0x20,
974 
975         0x48, 0x83, 0b11'000'111, 0x0c,
976         0x48, 0x83, 0b11'101'111, 0x08,
977 
978         0x49, 0x83, 0b11'000'000, 0x07,
979         0x49, 0x83, 0b11'101'000, 0x04,
980 
981         0x48, 0x81, 0b11'000'110, 0x80, 0x00, 0x00, 0x00,
982         0x49, 0x81, 0b11'101'000, 0x40, 0x42, 0x0f, 0x00,
983 
984         0x48,0x83,0x06,0x07,
985         0x48,0x83,0x46,0x0c,0x07,
986         0x48,0x83,0x44,0x24,0x0c,0x07,
987         0x49,0x83,0x44,0x24,0x0c,0x07,
988         0x48,0x83,0x44,0x84,0x0c,0x07,
989         0x49,0x83,0x44,0x84,0x0c,0x07,
990         0x4a,0x83,0x44,0xa0,0x0c,0x07,
991         0x4b,0x83,0x44,0x43,0x0c,0x07,
992         0x49,0x83,0x44,0x03,0x0c,0x07,
993         0x4a,0x83,0x44,0x18,0x0c,0x07,
994 
995         0x4a,0x83,0x6c,0x18,0x0c,0x07,
996 
997         0x48,0x01,0xc8,
998         0x48,0x01,0x08,
999         0x48,0x01,0x48,0x0c,
1000         0x48,0x03,0x48,0x0c,
1001         0x48,0x2b,0x48,0x0c,
1002     });
1003 
1004 
1005     test_asm(r, [&](A& a) {
1006         a.vpaddd (A::ymm0, A::ymm1, A::ymm2);  // Low registers and 0x0f map     -> 2-byte VEX.
1007         a.vpaddd (A::ymm8, A::ymm1, A::ymm2);  // A high dst register is ok      -> 2-byte VEX.
1008         a.vpaddd (A::ymm0, A::ymm8, A::ymm2);  // A high first argument register -> 2-byte VEX.
1009         a.vpaddd (A::ymm0, A::ymm1, A::ymm8);  // A high second argument         -> 3-byte VEX.
1010         a.vpmulld(A::ymm0, A::ymm1, A::ymm2);  // Using non-0x0f map instruction -> 3-byte VEX.
1011         a.vpsubd (A::ymm0, A::ymm1, A::ymm2);  // Test vpsubd to ensure argument order is right.
1012     },{
1013         /*    VEX     */ /*op*/ /*modRM*/
1014         0xc5,       0xf5, 0xfe, 0xc2,
1015         0xc5,       0x75, 0xfe, 0xc2,
1016         0xc5,       0xbd, 0xfe, 0xc2,
1017         0xc4, 0xc1, 0x75, 0xfe, 0xc0,
1018         0xc4, 0xe2, 0x75, 0x40, 0xc2,
1019         0xc5,       0xf5, 0xfa, 0xc2,
1020     });
1021 
1022     test_asm(r, [&](A& a) {
1023         a.vpaddw   (A::ymm4, A::ymm3, A::ymm2);
1024         a.vpavgw   (A::ymm4, A::ymm3, A::ymm2);
1025         a.vpcmpeqw (A::ymm4, A::ymm3, A::ymm2);
1026         a.vpcmpgtw (A::ymm4, A::ymm3, A::ymm2);
1027 
1028         a.vpminsw  (A::ymm4, A::ymm3, A::ymm2);
1029         a.vpmaxsw  (A::ymm4, A::ymm3, A::ymm2);
1030         a.vpminuw  (A::ymm4, A::ymm3, A::ymm2);
1031         a.vpmaxuw  (A::ymm4, A::ymm3, A::ymm2);
1032 
1033         a.vpmulhrsw(A::ymm4, A::ymm3, A::ymm2);
1034         a.vpabsw   (A::ymm4, A::ymm3);
1035         a.vpsllw   (A::ymm4, A::ymm3, 12);
1036         a.vpsraw   (A::ymm4, A::ymm3, 12);
1037     },{
1038         0xc5,     0xe5, 0xfd, 0xe2,
1039         0xc5,     0xe5, 0xe3, 0xe2,
1040         0xc5,     0xe5, 0x75, 0xe2,
1041         0xc5,     0xe5, 0x65, 0xe2,
1042 
1043         0xc5,     0xe5, 0xea, 0xe2,
1044         0xc5,     0xe5, 0xee, 0xe2,
1045         0xc4,0xe2,0x65, 0x3a, 0xe2,
1046         0xc4,0xe2,0x65, 0x3e, 0xe2,
1047 
1048         0xc4,0xe2,0x65, 0x0b, 0xe2,
1049         0xc4,0xe2,0x7d, 0x1d, 0xe3,
1050         0xc5,0xdd,0x71, 0xf3, 0x0c,
1051         0xc5,0xdd,0x71, 0xe3, 0x0c,
1052     });
1053 
1054     test_asm(r, [&](A& a) {
1055         A::Label l;
1056         a.vcmpeqps (A::ymm0, A::ymm1, &l);      // vcmpeqps 0x1c(%rip), %ymm1, %ymm0
1057         a.vpcmpeqd (A::ymm0, A::ymm1, A::ymm2);
1058         a.vpcmpgtd (A::ymm0, A::ymm1, A::ymm2);
1059         a.vcmpeqps (A::ymm0, A::ymm1, A::ymm2);
1060         a.vcmpltps (A::ymm0, A::ymm1, A::ymm2);
1061         a.vcmpleps (A::ymm0, A::ymm1, A::ymm2);
1062         a.vcmpneqps(A::ymm0, A::ymm1, A::ymm2);
1063         a.label(&l);   // 28 bytes after the vcmpeqps that uses it.
1064     },{
1065         0xc5,0xf4,0xc2,0x05,0x1c,0x00,0x00,0x00,0x00,
1066         0xc5,0xf5,0x76,0xc2,
1067         0xc5,0xf5,0x66,0xc2,
1068         0xc5,0xf4,0xc2,0xc2,0x00,
1069         0xc5,0xf4,0xc2,0xc2,0x01,
1070         0xc5,0xf4,0xc2,0xc2,0x02,
1071         0xc5,0xf4,0xc2,0xc2,0x04,
1072     });
1073 
1074     test_asm(r, [&](A& a) {
1075         a.vminps(A::ymm0, A::ymm1, A::ymm2);
1076         a.vmaxps(A::ymm0, A::ymm1, A::ymm2);
1077     },{
1078         0xc5,0xf4,0x5d,0xc2,
1079         0xc5,0xf4,0x5f,0xc2,
1080     });
1081 
1082     test_asm(r, [&](A& a) {
1083         a.vpblendvb(A::ymm0, A::ymm1, A::ymm2, A::ymm3);
1084     },{
1085         0xc4,0xe3,0x75, 0x4c, 0xc2, 0x30,
1086     });
1087 
1088     test_asm(r, [&](A& a) {
1089         a.vpsrld(A::ymm15, A::ymm2, 8);
1090         a.vpsrld(A::ymm0 , A::ymm8, 5);
1091     },{
1092         0xc5,     0x85, 0x72,0xd2, 0x08,
1093         0xc4,0xc1,0x7d, 0x72,0xd0, 0x05,
1094     });
1095 
1096     test_asm(r, [&](A& a) {
1097         A::Label l;
1098         a.vpermps(A::ymm1, A::ymm2, A::Mem{A::rdi, 32});
1099         a.vperm2f128(A::ymm1, A::ymm2, &l, 0x20);
1100         a.vpermq(A::ymm1, A::ymm2, 5);
1101         a.label(&l);  // 6 bytes after vperm2f128
1102     },{
1103         0xc4,0xe2,0x6d,0x16,0x4f,0x20,
1104         0xc4,0xe3,0x6d,0x06,0x0d,0x06,0x00,0x00,0x00,0x20,
1105         0xc4,0xe3,0xfd, 0x00,0xca, 0x05,
1106     });
1107 
1108     test_asm(r, [&](A& a) {
1109         a.vpunpckldq(A::ymm1, A::ymm2, A::Mem{A::rdi});
1110         a.vpunpckhdq(A::ymm1, A::ymm2, A::ymm3);
1111     },{
1112         0xc5,0xed,0x62,0x0f,
1113         0xc5,0xed,0x6a,0xcb,
1114     });
1115 
1116     test_asm(r, [&](A& a) {
1117         a.vroundps(A::ymm1, A::ymm2, A::NEAREST);
1118         a.vroundps(A::ymm1, A::ymm2, A::FLOOR);
1119         a.vroundps(A::ymm1, A::ymm2, A::CEIL);
1120         a.vroundps(A::ymm1, A::ymm2, A::TRUNC);
1121     },{
1122         0xc4,0xe3,0x7d,0x08,0xca,0x00,
1123         0xc4,0xe3,0x7d,0x08,0xca,0x01,
1124         0xc4,0xe3,0x7d,0x08,0xca,0x02,
1125         0xc4,0xe3,0x7d,0x08,0xca,0x03,
1126     });
1127 
1128     test_asm(r, [&](A& a) {
1129         A::Label l;
1130         a.label(&l);
1131         a.byte(1);
1132         a.byte(2);
1133         a.byte(3);
1134         a.byte(4);
1135 
1136         a.vbroadcastss(A::ymm0 , &l);
1137         a.vbroadcastss(A::ymm1 , &l);
1138         a.vbroadcastss(A::ymm8 , &l);
1139         a.vbroadcastss(A::ymm15, &l);
1140 
1141         a.vpshufb(A::ymm4, A::ymm3, &l);
1142         a.vpaddd (A::ymm4, A::ymm3, &l);
1143         a.vpsubd (A::ymm4, A::ymm3, &l);
1144 
1145         a.vptest(A::ymm4, &l);
1146 
1147         a.vmulps (A::ymm4, A::ymm3, &l);
1148     },{
1149         0x01, 0x02, 0x03, 0x4,
1150 
1151         /*     VEX    */  /*op*/ /*   ModRM    */  /*     offset     */
1152         0xc4, 0xe2, 0x7d,  0x18,   0b00'000'101,   0xf3,0xff,0xff,0xff,   // 0xfffffff3 == -13
1153         0xc4, 0xe2, 0x7d,  0x18,   0b00'001'101,   0xea,0xff,0xff,0xff,   // 0xffffffea == -22
1154         0xc4, 0x62, 0x7d,  0x18,   0b00'000'101,   0xe1,0xff,0xff,0xff,   // 0xffffffe1 == -31
1155         0xc4, 0x62, 0x7d,  0x18,   0b00'111'101,   0xd8,0xff,0xff,0xff,   // 0xffffffd8 == -40
1156 
1157         0xc4, 0xe2, 0x65,  0x00,   0b00'100'101,   0xcf,0xff,0xff,0xff,   // 0xffffffcf == -49
1158 
1159         0xc5, 0xe5,        0xfe,   0b00'100'101,   0xc7,0xff,0xff,0xff,   // 0xffffffc7 == -57
1160         0xc5, 0xe5,        0xfa,   0b00'100'101,   0xbf,0xff,0xff,0xff,   // 0xffffffbf == -65
1161 
1162         0xc4, 0xe2, 0x7d,  0x17,   0b00'100'101,   0xb6,0xff,0xff,0xff,   // 0xffffffb6 == -74
1163 
1164         0xc5, 0xe4,        0x59,   0b00'100'101,   0xae,0xff,0xff,0xff,   // 0xffffffaf == -82
1165     });
1166 
1167     test_asm(r, [&](A& a) {
1168         a.vbroadcastss(A::ymm0,  A::Mem{A::rdi,   0});
1169         a.vbroadcastss(A::ymm13, A::Mem{A::r14,   7});
1170         a.vbroadcastss(A::ymm8,  A::Mem{A::rdx, -12});
1171         a.vbroadcastss(A::ymm8,  A::Mem{A::rdx, 400});
1172 
1173         a.vbroadcastss(A::ymm8,  A::xmm0);
1174         a.vbroadcastss(A::ymm0,  A::xmm13);
1175     },{
1176         /*   VEX    */ /*op*/     /*ModRM*/   /*offset*/
1177         0xc4,0xe2,0x7d, 0x18,   0b00'000'111,
1178         0xc4,0x42,0x7d, 0x18,   0b01'101'110,  0x07,
1179         0xc4,0x62,0x7d, 0x18,   0b01'000'010,  0xf4,
1180         0xc4,0x62,0x7d, 0x18,   0b10'000'010,  0x90,0x01,0x00,0x00,
1181 
1182         0xc4,0x62,0x7d, 0x18,   0b11'000'000,
1183         0xc4,0xc2,0x7d, 0x18,   0b11'000'101,
1184     });
1185 
1186     test_asm(r, [&](A& a) {
1187         A::Label l;
1188         a.label(&l);
1189         a.jne(&l);
1190         a.jne(&l);
1191         a.je (&l);
1192         a.jmp(&l);
1193         a.jl (&l);
1194         a.jc (&l);
1195 
1196         a.cmp(A::rdx, 1);
1197         a.cmp(A::rax, 12);
1198         a.cmp(A::r14, 2000000000);
1199     },{
1200         0x0f,0x85, 0xfa,0xff,0xff,0xff,   // near jne -6 bytes
1201         0x0f,0x85, 0xf4,0xff,0xff,0xff,   // near jne -12 bytes
1202         0x0f,0x84, 0xee,0xff,0xff,0xff,   // near je  -18 bytes
1203         0xe9,      0xe9,0xff,0xff,0xff,   // near jmp -23 bytes
1204         0x0f,0x8c, 0xe3,0xff,0xff,0xff,   // near jl  -29 bytes
1205         0x0f,0x82, 0xdd,0xff,0xff,0xff,   // near jc  -35 bytes
1206 
1207         0x48,0x83,0xfa,0x01,
1208         0x48,0x83,0xf8,0x0c,
1209         0x49,0x81,0xfe,0x00,0x94,0x35,0x77,
1210     });
1211 
1212     test_asm(r, [&](A& a) {
1213         a.vmovups(A::ymm5, A::Mem{A::rsi});
1214         a.vmovups(A::Mem{A::rsi}, A::ymm5);
1215 
1216         a.vmovups(A::xmm5, A::Mem{A::rsi});
1217         a.vmovups(A::Mem{A::rsi}, A::xmm5);
1218 
1219         a.vpmovzxwd(A::ymm4, A::Mem{A::rsi});
1220         a.vpmovzxbd(A::ymm4, A::Mem{A::rsi});
1221 
1222         a.vmovq(A::Mem{A::rdx}, A::xmm15);
1223     },{
1224         /*    VEX    */  /*Op*/  /*  ModRM  */
1225         0xc5,     0xfc,   0x10,  0b00'101'110,
1226         0xc5,     0xfc,   0x11,  0b00'101'110,
1227 
1228         0xc5,     0xf8,   0x10,  0b00'101'110,
1229         0xc5,     0xf8,   0x11,  0b00'101'110,
1230 
1231         0xc4,0xe2,0x7d,   0x33,  0b00'100'110,
1232         0xc4,0xe2,0x7d,   0x31,  0b00'100'110,
1233 
1234         0xc5,     0x79,   0xd6,  0b00'111'010,
1235     });
1236 
1237     test_asm(r, [&](A& a) {
1238         a.vmovups(A::ymm5, A::Mem{A::rsp,  0});
1239         a.vmovups(A::ymm5, A::Mem{A::rsp, 64});
1240         a.vmovups(A::ymm5, A::Mem{A::rsp,128});
1241 
1242         a.vmovups(A::Mem{A::rsp,  0}, A::ymm5);
1243         a.vmovups(A::Mem{A::rsp, 64}, A::ymm5);
1244         a.vmovups(A::Mem{A::rsp,128}, A::ymm5);
1245     },{
1246         0xc5,0xfc,0x10,0x2c,0x24,
1247         0xc5,0xfc,0x10,0x6c,0x24,0x40,
1248         0xc5,0xfc,0x10,0xac,0x24,0x80,0x00,0x00,0x00,
1249 
1250         0xc5,0xfc,0x11,0x2c,0x24,
1251         0xc5,0xfc,0x11,0x6c,0x24,0x40,
1252         0xc5,0xfc,0x11,0xac,0x24,0x80,0x00,0x00,0x00,
1253     });
1254 
1255     test_asm(r, [&](A& a) {
1256         a.movzbq(A::rax, A::Mem{A::rsi});   // Low registers for src and dst.
1257         a.movzbq(A::rax, A::Mem{A::r8,});   // High src register.
1258         a.movzbq(A::r8 , A::Mem{A::rsi});   // High dst register.
1259         a.movzbq(A::r8,  A::Mem{A::rsi, 12});
1260         a.movzbq(A::r8,  A::Mem{A::rsi, 400});
1261 
1262         a.movzwq(A::rax, A::Mem{A::rsi});   // Low registers for src and dst.
1263         a.movzwq(A::rax, A::Mem{A::r8,});   // High src register.
1264         a.movzwq(A::r8 , A::Mem{A::rsi});   // High dst register.
1265         a.movzwq(A::r8,  A::Mem{A::rsi, 12});
1266         a.movzwq(A::r8,  A::Mem{A::rsi, 400});
1267 
1268         a.vmovd(A::Mem{A::rax}, A::xmm0);
1269         a.vmovd(A::Mem{A::rax}, A::xmm8);
1270         a.vmovd(A::Mem{A::r8 }, A::xmm0);
1271 
1272         a.vmovd(A::xmm0, A::Mem{A::rax});
1273         a.vmovd(A::xmm8, A::Mem{A::rax});
1274         a.vmovd(A::xmm0, A::Mem{A::r8 });
1275 
1276         a.vmovd(A::xmm0 , A::Mem{A::rax, 0, A::rcx, A::FOUR});
1277         a.vmovd(A::xmm15, A::Mem{A::rax, 0, A::r8,  A::TWO });
1278         a.vmovd(A::xmm0 , A::Mem{A::r8 , 0, A::rcx});
1279 
1280         a.vmovd(A::rax, A::xmm0);
1281         a.vmovd(A::rax, A::xmm8);
1282         a.vmovd(A::r8 ,  A::xmm0);
1283 
1284         a.vmovd(A::xmm0, A::rax);
1285         a.vmovd(A::xmm8, A::rax);
1286         a.vmovd(A::xmm0, A::r8 );
1287 
1288         a.movb(A::Mem{A::rdx}, A::rax);
1289         a.movb(A::Mem{A::rdx}, A::r8 );
1290         a.movb(A::Mem{A::r8 }, A::rax);
1291 
1292         a.movb(A::rdx, A::Mem{A::rax});
1293         a.movb(A::rdx, A::Mem{A::r8 });
1294         a.movb(A::r8 , A::Mem{A::rax});
1295 
1296         a.movb(A::rdx, 12);
1297         a.movb(A::rax,  4);
1298         a.movb(A::r8 , -1);
1299 
1300         a.movb(A::Mem{A::rdx}, 12);
1301         a.movb(A::Mem{A::rax},  4);
1302         a.movb(A::Mem{A::r8 }, -1);
1303     },{
1304         0x48,0x0f,0xb6,0x06,     // movzbq (%rsi), %rax
1305         0x49,0x0f,0xb6,0x00,
1306         0x4c,0x0f,0xb6,0x06,
1307         0x4c,0x0f,0xb6,0x46, 12,
1308         0x4c,0x0f,0xb6,0x86, 0x90,0x01,0x00,0x00,
1309 
1310         0x48,0x0f,0xb7,0x06,    // movzwq (%rsi), %rax
1311         0x49,0x0f,0xb7,0x00,
1312         0x4c,0x0f,0xb7,0x06,
1313         0x4c,0x0f,0xb7,0x46, 12,
1314         0x4c,0x0f,0xb7,0x86, 0x90,0x01,0x00,0x00,
1315 
1316         0xc5,0xf9,0x7e,0x00,
1317         0xc5,0x79,0x7e,0x00,
1318         0xc4,0xc1,0x79,0x7e,0x00,
1319 
1320         0xc5,0xf9,0x6e,0x00,
1321         0xc5,0x79,0x6e,0x00,
1322         0xc4,0xc1,0x79,0x6e,0x00,
1323 
1324         0xc5,0xf9,0x6e,0x04,0x88,
1325         0xc4,0x21,0x79,0x6e,0x3c,0x40,
1326         0xc4,0xc1,0x79,0x6e,0x04,0x08,
1327 
1328         0xc5,0xf9,0x7e,0xc0,
1329         0xc5,0x79,0x7e,0xc0,
1330         0xc4,0xc1,0x79,0x7e,0xc0,
1331 
1332         0xc5,0xf9,0x6e,0xc0,
1333         0xc5,0x79,0x6e,0xc0,
1334         0xc4,0xc1,0x79,0x6e,0xc0,
1335 
1336         0x48 ,0x88, 0x02,
1337         0x4c, 0x88, 0x02,
1338         0x49, 0x88, 0x00,
1339 
1340         0x48 ,0x8a, 0x10,
1341         0x49, 0x8a, 0x10,
1342         0x4c, 0x8a, 0x00,
1343 
1344         0x48, 0xc6, 0xc2, 0x0c,
1345         0x48, 0xc6, 0xc0, 0x04,
1346         0x49, 0xc6, 0xc0, 0xff,
1347 
1348         0x48, 0xc6, 0x02, 0x0c,
1349         0x48, 0xc6, 0x00, 0x04,
1350         0x49, 0xc6, 0x00, 0xff,
1351     });
1352 
1353     test_asm(r, [&](A& a) {
1354         a.vpinsrd(A::xmm1, A::xmm8, A::Mem{A::rsi}, 1);   // vpinsrd $1, (%rsi), %xmm8, %xmm1
1355         a.vpinsrd(A::xmm8, A::xmm1, A::Mem{A::r8 }, 3);   // vpinsrd $3, (%r8), %xmm1, %xmm8;
1356 
1357         a.vpinsrw(A::xmm1, A::xmm8, A::Mem{A::rsi}, 4);   // vpinsrw $4, (%rsi), %xmm8, %xmm1
1358         a.vpinsrw(A::xmm8, A::xmm1, A::Mem{A::r8 }, 12);  // vpinrsw $12, (%r8), %xmm1, %xmm8
1359 
1360         a.vpinsrb(A::xmm1, A::xmm8, A::Mem{A::rsi}, 4);   // vpinsrb $4, (%rsi), %xmm8, %xmm1
1361         a.vpinsrb(A::xmm8, A::xmm1, A::Mem{A::r8 }, 12);  // vpinsrb $12, (%r8), %xmm1, %xmm8
1362 
1363         a.vextracti128(A::xmm1, A::ymm8, 1);  // vextracti128 $1, %ymm8, %xmm1
1364         a.vextracti128(A::xmm8, A::ymm1, 0);  // vextracti128 $0, %ymm1, %xmm8
1365 
1366         a.vpextrd(A::Mem{A::rsi}, A::xmm8, 3);  // vpextrd  $3, %xmm8, (%rsi)
1367         a.vpextrd(A::Mem{A::r8 }, A::xmm1, 2);  // vpextrd  $2, %xmm1, (%r8)
1368 
1369         a.vpextrw(A::Mem{A::rsi}, A::xmm8, 7);
1370         a.vpextrw(A::Mem{A::r8 }, A::xmm1, 15);
1371 
1372         a.vpextrb(A::Mem{A::rsi}, A::xmm8, 7);
1373         a.vpextrb(A::Mem{A::r8 }, A::xmm1, 15);
1374     },{
1375         0xc4,0xe3,0x39, 0x22, 0x0e, 1,
1376         0xc4,0x43,0x71, 0x22, 0x00, 3,
1377 
1378         0xc5,0xb9,      0xc4, 0x0e,  4,
1379         0xc4,0x41,0x71, 0xc4, 0x00, 12,
1380 
1381         0xc4,0xe3,0x39, 0x20, 0x0e,  4,
1382         0xc4,0x43,0x71, 0x20, 0x00, 12,
1383 
1384         0xc4,0x63,0x7d,0x39,0xc1, 1,
1385         0xc4,0xc3,0x7d,0x39,0xc8, 0,
1386 
1387         0xc4,0x63,0x79,0x16,0x06, 3,
1388         0xc4,0xc3,0x79,0x16,0x08, 2,
1389 
1390         0xc4,0x63,0x79, 0x15, 0x06,  7,
1391         0xc4,0xc3,0x79, 0x15, 0x08, 15,
1392 
1393         0xc4,0x63,0x79, 0x14, 0x06,  7,
1394         0xc4,0xc3,0x79, 0x14, 0x08, 15,
1395     });
1396 
1397     test_asm(r, [&](A& a) {
1398         a.vpandn(A::ymm3, A::ymm12, A::ymm2);
1399     },{
1400         0xc5, 0x9d, 0xdf, 0xda,
1401     });
1402 
1403     test_asm(r, [&](A& a) {
1404         A::Label l;
1405         a.vmovdqa(A::ymm3, A::ymm2);                                // vmovdqa %ymm2         , %ymm3
1406 
1407         a.vmovdqa(A::ymm3, A::Mem{A::rsi});                         // vmovdqa  (%rsi)       , %ymm3
1408         a.vmovdqa(A::ymm3, A::Mem{A::rsp});                         // vmovdqa  (%rsp)       , %ymm3
1409         a.vmovdqa(A::ymm3, A::Mem{A::r11});                         // vmovdqa  (%r11)       , %ymm3
1410 
1411         a.vmovdqa(A::ymm3, A::Mem{A::rsi,  4});                     // vmovdqa 4(%rsi)       , %ymm3
1412         a.vmovdqa(A::ymm3, A::Mem{A::rsp,  4});                     // vmovdqa 4(%rsp)       , %ymm3
1413 
1414         a.vmovdqa(A::ymm3, A::Mem{A::rsi,  4, A::rax, A::EIGHT});   // vmovdqa 4(%rsi,%rax,8), %ymm3
1415         a.vmovdqa(A::ymm3, A::Mem{A::r11,  4, A::rax, A::TWO  });   // vmovdqa 4(%r11,%rax,2), %ymm3
1416         a.vmovdqa(A::ymm3, A::Mem{A::rsi,  4, A::r11, A::FOUR });   // vmovdqa 4(%rsi,%r11,4), %ymm3
1417         a.vmovdqa(A::ymm3, A::Mem{A::rsi,  4, A::r11, A::ONE  });   // vmovdqa 4(%rsi,%r11,1), %ymm3
1418         a.vmovdqa(A::ymm3, A::Mem{A::rsi,  4, A::r11});             // vmovdqa 4(%rsi,%r11)  , %ymm3
1419 
1420         a.vmovdqa(A::ymm3, A::Mem{A::rsi,  64, A::r11});            // vmovdqa  64(%rsi,%r11), %ymm3
1421         a.vmovdqa(A::ymm3, A::Mem{A::rsi, 128, A::r11});            // vmovdqa 128(%rsi,%r11), %ymm3
1422         a.vmovdqa(A::ymm3, &l);                                     // vmovdqa  16(%rip)     , %ymm3
1423 
1424         a.vcvttps2dq(A::ymm3, A::ymm2);
1425         a.vcvtdq2ps (A::ymm3, A::ymm2);
1426         a.vcvtps2dq (A::ymm3, A::ymm2);
1427         a.vsqrtps   (A::ymm3, A::ymm2);
1428         a.label(&l);
1429     },{
1430         0xc5,0xfd,0x6f,0xda,
1431 
1432         0xc5,0xfd,0x6f,0x1e,
1433         0xc5,0xfd,0x6f,0x1c,0x24,
1434         0xc4,0xc1,0x7d,0x6f,0x1b,
1435 
1436         0xc5,0xfd,0x6f,0x5e,0x04,
1437         0xc5,0xfd,0x6f,0x5c,0x24,0x04,
1438 
1439         0xc5,0xfd,0x6f,0x5c,0xc6,0x04,
1440         0xc4,0xc1,0x7d,0x6f,0x5c,0x43,0x04,
1441         0xc4,0xa1,0x7d,0x6f,0x5c,0x9e,0x04,
1442         0xc4,0xa1,0x7d,0x6f,0x5c,0x1e,0x04,
1443         0xc4,0xa1,0x7d,0x6f,0x5c,0x1e,0x04,
1444 
1445         0xc4,0xa1,0x7d,0x6f,0x5c,0x1e,0x40,
1446         0xc4,0xa1,0x7d,0x6f,0x9c,0x1e,0x80,0x00,0x00,0x00,
1447 
1448         0xc5,0xfd,0x6f,0x1d,0x10,0x00,0x00,0x00,
1449 
1450         0xc5,0xfe,0x5b,0xda,
1451         0xc5,0xfc,0x5b,0xda,
1452         0xc5,0xfd,0x5b,0xda,
1453         0xc5,0xfc,0x51,0xda,
1454     });
1455 
1456     test_asm(r, [&](A& a) {
1457         a.vcvtps2ph(A::xmm3, A::ymm2, A::CURRENT);
1458         a.vcvtps2ph(A::Mem{A::rsi, 32, A::rax, A::EIGHT}, A::ymm5, A::CEIL);
1459 
1460         a.vcvtph2ps(A::ymm15, A::Mem{A::rdi, 12, A::r9, A::ONE});
1461         a.vcvtph2ps(A::ymm2, A::xmm3);
1462     },{
1463         0xc4,0xe3,0x7d,0x1d,0xd3,0x04,
1464         0xc4,0xe3,0x7d,0x1d,0x6c,0xc6,0x20,0x02,
1465 
1466         0xc4,0x22,0x7d,0x13,0x7c,0x0f,0x0c,
1467         0xc4,0xe2,0x7d,0x13,0xd3,
1468     });
1469 
1470     test_asm(r, [&](A& a) {
1471         a.vgatherdps(A::ymm1 , A::FOUR , A::ymm0 , A::rdi, A::ymm2 );
1472         a.vgatherdps(A::ymm0 , A::ONE  , A::ymm2 , A::rax, A::ymm1 );
1473         a.vgatherdps(A::ymm10, A::ONE  , A::ymm2 , A::rax, A::ymm1 );
1474         a.vgatherdps(A::ymm0 , A::ONE  , A::ymm12, A::rax, A::ymm1 );
1475         a.vgatherdps(A::ymm0 , A::ONE  , A::ymm2 , A::r9 , A::ymm1 );
1476         a.vgatherdps(A::ymm0 , A::ONE  , A::ymm2 , A::rax, A::ymm12);
1477         a.vgatherdps(A::ymm0 , A::EIGHT, A::ymm2 , A::rax, A::ymm12);
1478     },{
1479         0xc4,0xe2,0x6d,0x92,0x0c,0x87,
1480         0xc4,0xe2,0x75,0x92,0x04,0x10,
1481         0xc4,0x62,0x75,0x92,0x14,0x10,
1482         0xc4,0xa2,0x75,0x92,0x04,0x20,
1483         0xc4,0xc2,0x75,0x92,0x04,0x11,
1484         0xc4,0xe2,0x1d,0x92,0x04,0x10,
1485         0xc4,0xe2,0x1d,0x92,0x04,0xd0,
1486     });
1487 
1488     test_asm(r, [&](A& a) {
1489         a.mov(A::rax, A::Mem{A::rdi,   0});
1490         a.mov(A::rax, A::Mem{A::rdi,   1});
1491         a.mov(A::rax, A::Mem{A::rdi, 512});
1492         a.mov(A::r15, A::Mem{A::r13,  42});
1493         a.mov(A::rax, A::Mem{A::r13,  42});
1494         a.mov(A::r15, A::Mem{A::rax,  42});
1495         a.mov(A::rax, 1);
1496         a.mov(A::rax, A::rcx);
1497     },{
1498         0x48, 0x8b, 0x07,
1499         0x48, 0x8b, 0x47, 0x01,
1500         0x48, 0x8b, 0x87, 0x00,0x02,0x00,0x00,
1501         0x4d, 0x8b, 0x7d, 0x2a,
1502         0x49, 0x8b, 0x45, 0x2a,
1503         0x4c, 0x8b, 0x78, 0x2a,
1504         0x48, 0xc7, 0xc0, 0x01,0x00,0x00,0x00,
1505         0x48, 0x89, 0xc8,
1506     });
1507 
1508     // echo "fmul v4.4s, v3.4s, v1.4s" | llvm-mc -show-encoding -arch arm64
1509 
1510     test_asm(r, [&](A& a) {
1511         a.and16b(A::v4, A::v3, A::v1);
1512         a.orr16b(A::v4, A::v3, A::v1);
1513         a.eor16b(A::v4, A::v3, A::v1);
1514         a.bic16b(A::v4, A::v3, A::v1);
1515         a.bsl16b(A::v4, A::v3, A::v1);
1516         a.not16b(A::v4, A::v3);
1517 
1518         a.add4s(A::v4, A::v3, A::v1);
1519         a.sub4s(A::v4, A::v3, A::v1);
1520         a.mul4s(A::v4, A::v3, A::v1);
1521 
1522         a.cmeq4s(A::v4, A::v3, A::v1);
1523         a.cmgt4s(A::v4, A::v3, A::v1);
1524 
1525         a.sub8h(A::v4, A::v3, A::v1);
1526         a.mul8h(A::v4, A::v3, A::v1);
1527 
1528         a.fadd4s(A::v4, A::v3, A::v1);
1529         a.fsub4s(A::v4, A::v3, A::v1);
1530         a.fmul4s(A::v4, A::v3, A::v1);
1531         a.fdiv4s(A::v4, A::v3, A::v1);
1532         a.fmin4s(A::v4, A::v3, A::v1);
1533         a.fmax4s(A::v4, A::v3, A::v1);
1534 
1535         a.fneg4s (A::v4, A::v3);
1536         a.fsqrt4s(A::v4, A::v3);
1537 
1538         a.fmla4s(A::v4, A::v3, A::v1);
1539         a.fmls4s(A::v4, A::v3, A::v1);
1540 
1541         a.fcmeq4s(A::v4, A::v3, A::v1);
1542         a.fcmgt4s(A::v4, A::v3, A::v1);
1543         a.fcmge4s(A::v4, A::v3, A::v1);
1544     },{
1545         0x64,0x1c,0x21,0x4e,
1546         0x64,0x1c,0xa1,0x4e,
1547         0x64,0x1c,0x21,0x6e,
1548         0x64,0x1c,0x61,0x4e,
1549         0x64,0x1c,0x61,0x6e,
1550         0x64,0x58,0x20,0x6e,
1551 
1552         0x64,0x84,0xa1,0x4e,
1553         0x64,0x84,0xa1,0x6e,
1554         0x64,0x9c,0xa1,0x4e,
1555 
1556         0x64,0x8c,0xa1,0x6e,
1557         0x64,0x34,0xa1,0x4e,
1558 
1559         0x64,0x84,0x61,0x6e,
1560         0x64,0x9c,0x61,0x4e,
1561 
1562         0x64,0xd4,0x21,0x4e,
1563         0x64,0xd4,0xa1,0x4e,
1564         0x64,0xdc,0x21,0x6e,
1565         0x64,0xfc,0x21,0x6e,
1566         0x64,0xf4,0xa1,0x4e,
1567         0x64,0xf4,0x21,0x4e,
1568 
1569         0x64,0xf8,0xa0,0x6e,
1570         0x64,0xf8,0xa1,0x6e,
1571 
1572         0x64,0xcc,0x21,0x4e,
1573         0x64,0xcc,0xa1,0x4e,
1574 
1575         0x64,0xe4,0x21,0x4e,
1576         0x64,0xe4,0xa1,0x6e,
1577         0x64,0xe4,0x21,0x6e,
1578     });
1579 
1580     test_asm(r, [&](A& a) {
1581         a.shl4s(A::v4, A::v3,  0);
1582         a.shl4s(A::v4, A::v3,  1);
1583         a.shl4s(A::v4, A::v3,  8);
1584         a.shl4s(A::v4, A::v3, 16);
1585         a.shl4s(A::v4, A::v3, 31);
1586 
1587         a.sshr4s(A::v4, A::v3,  1);
1588         a.sshr4s(A::v4, A::v3,  8);
1589         a.sshr4s(A::v4, A::v3, 31);
1590 
1591         a.ushr4s(A::v4, A::v3,  1);
1592         a.ushr4s(A::v4, A::v3,  8);
1593         a.ushr4s(A::v4, A::v3, 31);
1594 
1595         a.ushr8h(A::v4, A::v3,  1);
1596         a.ushr8h(A::v4, A::v3,  8);
1597         a.ushr8h(A::v4, A::v3, 15);
1598     },{
1599         0x64,0x54,0x20,0x4f,
1600         0x64,0x54,0x21,0x4f,
1601         0x64,0x54,0x28,0x4f,
1602         0x64,0x54,0x30,0x4f,
1603         0x64,0x54,0x3f,0x4f,
1604 
1605         0x64,0x04,0x3f,0x4f,
1606         0x64,0x04,0x38,0x4f,
1607         0x64,0x04,0x21,0x4f,
1608 
1609         0x64,0x04,0x3f,0x6f,
1610         0x64,0x04,0x38,0x6f,
1611         0x64,0x04,0x21,0x6f,
1612 
1613         0x64,0x04,0x1f,0x6f,
1614         0x64,0x04,0x18,0x6f,
1615         0x64,0x04,0x11,0x6f,
1616     });
1617 
1618     test_asm(r, [&](A& a) {
1619         a.sli4s(A::v4, A::v3,  0);
1620         a.sli4s(A::v4, A::v3,  1);
1621         a.sli4s(A::v4, A::v3,  8);
1622         a.sli4s(A::v4, A::v3, 16);
1623         a.sli4s(A::v4, A::v3, 31);
1624     },{
1625         0x64,0x54,0x20,0x6f,
1626         0x64,0x54,0x21,0x6f,
1627         0x64,0x54,0x28,0x6f,
1628         0x64,0x54,0x30,0x6f,
1629         0x64,0x54,0x3f,0x6f,
1630     });
1631 
1632     test_asm(r, [&](A& a) {
1633         a.scvtf4s (A::v4, A::v3);
1634         a.fcvtzs4s(A::v4, A::v3);
1635         a.fcvtns4s(A::v4, A::v3);
1636         a.frintp4s(A::v4, A::v3);
1637         a.frintm4s(A::v4, A::v3);
1638         a.fcvtn   (A::v4, A::v3);
1639         a.fcvtl   (A::v4, A::v3);
1640     },{
1641         0x64,0xd8,0x21,0x4e,
1642         0x64,0xb8,0xa1,0x4e,
1643         0x64,0xa8,0x21,0x4e,
1644         0x64,0x88,0xa1,0x4e,
1645         0x64,0x98,0x21,0x4e,
1646         0x64,0x68,0x21,0x0e,
1647         0x64,0x78,0x21,0x0e,
1648     });
1649 
1650     test_asm(r, [&](A& a) {
1651         a.sub (A::sp, A::sp, 32);  // sub   sp, sp, #32
1652         a.strq(A::v0, A::sp, 1);   // str   q0, [sp, #16]
1653         a.strq(A::v1, A::sp);      // str   q1, [sp]
1654         a.strd(A::v0, A::sp, 6);   // str   s0, [sp, #48]
1655         a.strs(A::v0, A::sp, 6);   // str   s0, [sp, #24]
1656         a.strh(A::v0, A::sp, 10);  // str   h0, [sp, #20]
1657         a.strb(A::v0, A::sp, 47);  // str   b0, [sp, #47]
1658         a.ldrb(A::v9, A::sp, 42);  // ldr   b9, [sp, #42]
1659         a.ldrh(A::v9, A::sp, 47);  // ldr   h9, [sp, #94]
1660         a.ldrs(A::v7, A::sp, 10);  // ldr   s7, [sp, #40]
1661         a.ldrd(A::v7, A::sp,  1);  // ldr   d7, [sp, #8]
1662         a.ldrq(A::v5, A::sp, 128); // ldr   q5, [sp, #2048]
1663         a.add (A::sp, A::sp, 32);  // add   sp, sp, #32
1664     },{
1665          0xff,0x83,0x00,0xd1,
1666          0xe0,0x07,0x80,0x3d,
1667          0xe1,0x03,0x80,0x3d,
1668          0xe0,0x1b,0x00,0xfd,
1669          0xe0,0x1b,0x00,0xbd,
1670          0xe0,0x2b,0x00,0x7d,
1671          0xe0,0xbf,0x00,0x3d,
1672          0xe9,0xab,0x40,0x3d,
1673          0xe9,0xbf,0x40,0x7d,
1674          0xe7,0x2b,0x40,0xbd,
1675          0xe7,0x07,0x40,0xfd,
1676          0xe5,0x03,0xc2,0x3d,
1677          0xff,0x83,0x00,0x91,
1678     });
1679 
1680     test_asm(r, [&](A& a) {
1681         a.brk(0);
1682         a.brk(65535);
1683 
1684         a.ret(A::x30);   // Conventional ret using link register.
1685         a.ret(A::x13);   // Can really return using any register if we like.
1686 
1687         a.add(A::x2, A::x2,  4);
1688         a.add(A::x3, A::x2, 32);
1689 
1690         a.sub(A::x2, A::x2, 4);
1691         a.sub(A::x3, A::x2, 32);
1692 
1693         a.subs(A::x2, A::x2,  4);
1694         a.subs(A::x3, A::x2, 32);
1695 
1696         a.subs(A::xzr, A::x2, 4);  // These are actually the same instruction!
1697         a.cmp(A::x2, 4);
1698 
1699         A::Label l;
1700         a.label(&l);
1701         a.bne(&l);
1702         a.bne(&l);
1703         a.blt(&l);
1704         a.b(&l);
1705         a.cbnz(A::x2, &l);
1706         a.cbz(A::x2, &l);
1707 
1708         a.add(A::x3, A::x2, A::x1);             // add x3,x2,x1
1709         a.add(A::x3, A::x2, A::x1, A::ASR, 3);  // add x3,x2,x1, asr #3
1710     },{
1711         0x00,0x00,0x20,0xd4,
1712         0xe0,0xff,0x3f,0xd4,
1713 
1714         0xc0,0x03,0x5f,0xd6,
1715         0xa0,0x01,0x5f,0xd6,
1716 
1717         0x42,0x10,0x00,0x91,
1718         0x43,0x80,0x00,0x91,
1719 
1720         0x42,0x10,0x00,0xd1,
1721         0x43,0x80,0x00,0xd1,
1722 
1723         0x42,0x10,0x00,0xf1,
1724         0x43,0x80,0x00,0xf1,
1725 
1726         0x5f,0x10,0x00,0xf1,
1727         0x5f,0x10,0x00,0xf1,
1728 
1729         0x01,0x00,0x00,0x54,   // b.ne #0
1730         0xe1,0xff,0xff,0x54,   // b.ne #-4
1731         0xcb,0xff,0xff,0x54,   // b.lt #-8
1732         0xae,0xff,0xff,0x54,   // b.al #-12
1733         0x82,0xff,0xff,0xb5,   // cbnz x2, #-16
1734         0x62,0xff,0xff,0xb4,   // cbz x2, #-20
1735 
1736         0x43,0x00,0x01,0x8b,
1737         0x43,0x0c,0x81,0x8b,
1738     });
1739 
1740     // Can we cbz() to a not-yet-defined label?
1741     test_asm(r, [&](A& a) {
1742         A::Label l;
1743         a.cbz(A::x2, &l);
1744         a.add(A::x3, A::x2, 32);
1745         a.label(&l);
1746         a.ret(A::x30);
1747     },{
1748         0x42,0x00,0x00,0xb4,  // cbz x2, #8
1749         0x43,0x80,0x00,0x91,  // add x3, x2, #32
1750         0xc0,0x03,0x5f,0xd6,  // ret
1751     });
1752 
1753     // If we start a label as a backward label,
1754     // can we redefine it to be a future label?
1755     // (Not sure this is useful... just want to test it works.)
1756     test_asm(r, [&](A& a) {
1757         A::Label l1;
1758         a.label(&l1);
1759         a.add(A::x3, A::x2, 32);
1760         a.cbz(A::x2, &l1);          // This will jump backward... nothing sneaky.
1761 
1762         A::Label l2;                // Start off the same...
1763         a.label(&l2);
1764         a.add(A::x3, A::x2, 32);
1765         a.cbz(A::x2, &l2);          // Looks like this will go backward...
1766         a.add(A::x2, A::x2, 4);
1767         a.add(A::x3, A::x2, 32);
1768         a.label(&l2);               // But no... actually forward!  What a switcheroo!
1769     },{
1770         0x43,0x80,0x00,0x91,  // add x3, x2, #32
1771         0xe2,0xff,0xff,0xb4,  // cbz x2, #-4
1772 
1773         0x43,0x80,0x00,0x91,  // add x3, x2, #32
1774         0x62,0x00,0x00,0xb4,  // cbz x2, #12
1775         0x42,0x10,0x00,0x91,  // add x2, x2, #4
1776         0x43,0x80,0x00,0x91,  // add x3, x2, #32
1777     });
1778 
1779     // Loading from a label on ARM.
1780     test_asm(r, [&](A& a) {
1781         A::Label fore,aft;
1782         a.label(&fore);
1783         a.word(0x01234567);
1784         a.ldrq(A::v1, &fore);
1785         a.ldrq(A::v2, &aft);
1786         a.label(&aft);
1787         a.word(0x76543210);
1788     },{
1789         0x67,0x45,0x23,0x01,
1790         0xe1,0xff,0xff,0x9c,  // ldr q1, #-4
1791         0x22,0x00,0x00,0x9c,  // ldr q2, #4
1792         0x10,0x32,0x54,0x76,
1793     });
1794 
1795     test_asm(r, [&](A& a) {
1796         a.ldrq(A::v0, A::x8);
1797         a.strq(A::v0, A::x8);
1798     },{
1799         0x00,0x01,0xc0,0x3d,
1800         0x00,0x01,0x80,0x3d,
1801     });
1802 
1803     test_asm(r, [&](A& a) {
1804         a.dup4s  (A::v0, A::x8);
1805         a.ld1r4s (A::v0, A::x8);  // echo 'ld1r.4s {v0}, [x8]' | llvm-mc --show-encoding
1806         a.ld1r8h (A::v0, A::x8);
1807         a.ld1r16b(A::v0, A::x8);
1808     },{
1809         0x00,0x0d,0x04,0x4e,
1810         0x00,0xc9,0x40,0x4d,
1811         0x00,0xc5,0x40,0x4d,
1812         0x00,0xc1,0x40,0x4d,
1813     });
1814 
1815     test_asm(r, [&](A& a) {
1816         a.ld24s(A::v0, A::x8);  // echo 'ld2.4s {v0,v1}, [x8]' | llvm-mc --show-encoding
1817         a.ld44s(A::v0, A::x8);
1818         a.st24s(A::v0, A::x8);
1819         a.st44s(A::v0, A::x8);  // echo 'st4.4s {v0,v1,v2,v3}, [x8]' | llvm-mc --show-encoding
1820 
1821         a.ld24s(A::v0, A::x8, 0);  //echo 'ld2 {v0.s,v1.s}[0], [x8]' | llvm-mc --show-encoding
1822         a.ld24s(A::v0, A::x8, 1);
1823         a.ld24s(A::v0, A::x8, 2);
1824         a.ld24s(A::v0, A::x8, 3);
1825 
1826         a.ld44s(A::v0, A::x8, 0);  // ld4 {v0.s,v1.s,v2.s,v3.s}[0], [x8]
1827         a.ld44s(A::v0, A::x8, 1);
1828         a.ld44s(A::v0, A::x8, 2);
1829         a.ld44s(A::v0, A::x8, 3);
1830     },{
1831         0x00,0x89,0x40,0x4c,
1832         0x00,0x09,0x40,0x4c,
1833         0x00,0x89,0x00,0x4c,
1834         0x00,0x09,0x00,0x4c,
1835 
1836         0x00,0x81,0x60,0x0d,
1837         0x00,0x91,0x60,0x0d,
1838         0x00,0x81,0x60,0x4d,
1839         0x00,0x91,0x60,0x4d,
1840 
1841         0x00,0xa1,0x60,0x0d,
1842         0x00,0xb1,0x60,0x0d,
1843         0x00,0xa1,0x60,0x4d,
1844         0x00,0xb1,0x60,0x4d,
1845     });
1846 
1847     test_asm(r, [&](A& a) {
1848         a.xtns2h(A::v0, A::v0);
1849         a.xtnh2b(A::v0, A::v0);
1850         a.strs  (A::v0, A::x0);
1851 
1852         a.ldrs   (A::v0, A::x0);
1853         a.uxtlb2h(A::v0, A::v0);
1854         a.uxtlh2s(A::v0, A::v0);
1855 
1856         a.uminv4s(A::v3, A::v4);
1857         a.movs   (A::x3, A::v4,0);  // mov.s w3,v4[0]
1858         a.movs   (A::x3, A::v4,1);  // mov.s w3,v4[1]
1859         a.inss   (A::v4, A::x3,3);  // ins.s v4[3],w3
1860     },{
1861         0x00,0x28,0x61,0x0e,
1862         0x00,0x28,0x21,0x0e,
1863         0x00,0x00,0x00,0xbd,
1864 
1865         0x00,0x00,0x40,0xbd,
1866         0x00,0xa4,0x08,0x2f,
1867         0x00,0xa4,0x10,0x2f,
1868 
1869         0x83,0xa8,0xb1,0x6e,
1870         0x83,0x3c,0x04,0x0e,
1871         0x83,0x3c,0x0c,0x0e,
1872         0x64,0x1c,0x1c,0x4e,
1873     });
1874 
1875     test_asm(r, [&](A& a) {
1876         a.ldrb(A::v0, A::x8);
1877         a.strb(A::v0, A::x8);
1878     },{
1879         0x00,0x01,0x40,0x3d,
1880         0x00,0x01,0x00,0x3d,
1881     });
1882 
1883     test_asm(r, [&](A& a) {
1884         a.ldrd(A::x0, A::x1, 3);   // ldr  x0, [x1, #24]
1885         a.ldrs(A::x0, A::x1, 3);   // ldr  w0, [x1, #12]
1886         a.ldrh(A::x0, A::x1, 3);   // ldrh w0, [x1, #6]
1887         a.ldrb(A::x0, A::x1, 3);   // ldrb w0, [x1, #3]
1888 
1889         a.strs(A::x0, A::x1, 3);   // str  w0, [x1, #12]
1890     },{
1891         0x20,0x0c,0x40,0xf9,
1892         0x20,0x0c,0x40,0xb9,
1893         0x20,0x0c,0x40,0x79,
1894         0x20,0x0c,0x40,0x39,
1895 
1896         0x20,0x0c,0x00,0xb9,
1897     });
1898 
1899     test_asm(r, [&](A& a) {
1900         a.tbl   (A::v0, A::v1, A::v2);
1901         a.uzp14s(A::v0, A::v1, A::v2);
1902         a.uzp24s(A::v0, A::v1, A::v2);
1903         a.zip14s(A::v0, A::v1, A::v2);
1904         a.zip24s(A::v0, A::v1, A::v2);
1905     },{
1906         0x20,0x00,0x02,0x4e,
1907         0x20,0x18,0x82,0x4e,
1908         0x20,0x58,0x82,0x4e,
1909         0x20,0x38,0x82,0x4e,
1910         0x20,0x78,0x82,0x4e,
1911     });
1912 }
1913 
DEF_TEST(SkVM_approx_math,r)1914 DEF_TEST(SkVM_approx_math, r) {
1915     auto eval = [](int N, float values[], auto fn) {
1916         skvm::Builder b;
1917         skvm::Ptr inout  = b.varying<float>();
1918 
1919         b.storeF(inout, fn(&b, b.loadF(inout)));
1920 
1921         b.done().eval(N, values);
1922     };
1923 
1924     auto compare = [r](int N, const float values[], const float expected[]) {
1925         for (int i = 0; i < N; ++i) {
1926             REPORTER_ASSERT(r, SkScalarNearlyEqual(values[i], expected[i], 0.001f));
1927         }
1928     };
1929 
1930     // log2
1931     {
1932         float values[] = {0.25f, 0.5f, 1, 2, 4, 8};
1933         constexpr int N = SK_ARRAY_COUNT(values);
1934         eval(N, values, [](skvm::Builder* b, skvm::F32 v) {
1935             return b->approx_log2(v);
1936         });
1937         const float expected[] = {-2, -1, 0, 1, 2, 3};
1938         compare(N, values, expected);
1939     }
1940 
1941     // pow2
1942     {
1943         float values[] = {-2, -1, 0, 1, 2, 3};
1944         constexpr int N = SK_ARRAY_COUNT(values);
1945         eval(N, values, [](skvm::Builder* b, skvm::F32 v) {
1946             return b->approx_pow2(v);
1947         });
1948         const float expected[] = {0.25f, 0.5f, 1, 2, 4, 8};
1949         compare(N, values, expected);
1950     }
1951 
1952     // powf -- x^0.5
1953     {
1954         float bases[] = {0, 1, 4, 9, 16};
1955         constexpr int N = SK_ARRAY_COUNT(bases);
1956         eval(N, bases, [](skvm::Builder* b, skvm::F32 base) {
1957             return b->approx_powf(base, b->splat(0.5f));
1958         });
1959         const float expected[] = {0, 1, 2, 3, 4};
1960         compare(N, bases, expected);
1961     }
1962     // powf -- 3^x
1963     {
1964         float exps[] = {-2, -1, 0, 1, 2};
1965         constexpr int N = SK_ARRAY_COUNT(exps);
1966         eval(N, exps, [](skvm::Builder* b, skvm::F32 exp) {
1967             return b->approx_powf(b->splat(3.0f), exp);
1968         });
1969         const float expected[] = {1/9.0f, 1/3.0f, 1, 3, 9};
1970         compare(N, exps, expected);
1971     }
1972 
1973     auto test = [r](float arg, float expected, float tolerance, auto prog) {
1974         skvm::Builder b;
1975         skvm::Ptr inout  = b.varying<float>();
1976         b.storeF(inout, prog(b.loadF(inout)));
1977         float actual = arg;
1978         b.done().eval(1, &actual);
1979 
1980         float err = std::abs(actual - expected);
1981 
1982         if (err > tolerance) {
1983     //        SkDebugf("arg %g, expected %g, actual %g\n", arg, expected, actual);
1984             REPORTER_ASSERT(r, true);
1985         }
1986         return err;
1987     };
1988 
1989     auto test2 = [r](float arg0, float arg1, float expected, float tolerance, auto prog) {
1990         skvm::Builder b;
1991         skvm::Ptr in0  = b.varying<float>();
1992         skvm::Ptr in1  = b.varying<float>();
1993         skvm::Ptr out  = b.varying<float>();
1994         b.storeF(out, prog(b.loadF(in0), b.loadF(in1)));
1995         float actual;
1996         b.done().eval(1, &arg0, &arg1, &actual);
1997 
1998         float err = std::abs(actual - expected);
1999 
2000         if (err > tolerance) {
2001     //        SkDebugf("[%g, %g]: expected %g, actual %g\n", arg0, arg1, expected, actual);
2002             REPORTER_ASSERT(r, true);
2003         }
2004         return err;
2005     };
2006 
2007     // sine, cosine, tangent
2008     {
2009         constexpr float P = SK_ScalarPI;
2010         constexpr float tol = 0.00175f;
2011         for (float rad = -5*P; rad <= 5*P; rad += 0.1f) {
2012             test(rad, sk_float_sin(rad), tol, [](skvm::F32 x) {
2013                 return approx_sin(x);
2014             });
2015             test(rad, sk_float_cos(rad), tol, [](skvm::F32 x) {
2016                 return approx_cos(x);
2017             });
2018         }
2019 
2020         // Our tangent diverge more as we get near infinities (x near +- Pi/2),
2021         // so bring in the domain a little.
2022         constexpr float eps = 0.16f;
2023         float err = 0;
2024         for (float rad = -P/2 + eps; rad <= P/2 - eps; rad += 0.01f) {
2025             err += test(rad, sk_float_tan(rad), tol, [](skvm::F32 x) {
2026                 return approx_tan(x);
2027             });
2028             // try again with some multiples of P, to check our periodicity
2029             test(rad, sk_float_tan(rad), tol, [=](skvm::F32 x) {
2030                 return approx_tan(x + 3*P);
2031             });
2032             test(rad, sk_float_tan(rad), tol, [=](skvm::F32 x) {
2033                 return approx_tan(x - 3*P);
2034             });
2035         }
2036         if (0) { SkDebugf("tan error %g\n", err); }
2037     }
2038 
2039     // asin, acos, atan
2040     {
2041         constexpr float tol = 0.00175f;
2042         float err = 0;
2043         for (float x = -1; x <= 1; x += 1.0f/64) {
2044             err += test(x, asin(x), tol, [](skvm::F32 x) {
2045                 return approx_asin(x);
2046             });
2047             test(x, acos(x), tol, [](skvm::F32 x) {
2048                 return approx_acos(x);
2049             });
2050         }
2051         if (0) { SkDebugf("asin error %g\n", err); }
2052 
2053         err = 0;
2054         for (float x = -10; x <= 10; x += 1.0f/16) {
2055             err += test(x, atan(x), tol, [](skvm::F32 x) {
2056                 return approx_atan(x);
2057             });
2058         }
2059         if (0) { SkDebugf("atan error %g\n", err); }
2060 
2061         for (float y = -3; y <= 3; y += 1) {
2062             for (float x = -3; x <= 3; x += 1) {
2063                 err += test2(y, x, atan2(y,x), tol, [](skvm::F32 y, skvm::F32 x) {
2064                     return approx_atan2(y,x);
2065                 });
2066             }
2067         }
2068         if (0) { SkDebugf("atan2 error %g\n", err); }
2069     }
2070 }
2071 
DEF_TEST(SkVM_min_max,r)2072 DEF_TEST(SkVM_min_max, r) {
2073     // min() and max() have subtle behavior when one argument is NaN and
2074     // the other isn't.  It's not sound to blindly swap their arguments.
2075     //
2076     // All backends must behave like std::min() and std::max(), which are
2077     //
2078     //    min(x,y) = y<x ? y : x
2079     //    max(x,y) = x<y ? y : x
2080 
2081     // ±NaN, ±0, ±1, ±inf
2082     const uint32_t bits[] = {0x7f80'0001, 0xff80'0001, 0x0000'0000, 0x8000'0000,
2083                              0x3f80'0000, 0xbf80'0000, 0x7f80'0000, 0xff80'0000};
2084 
2085     float f[8];
2086     memcpy(f, bits, sizeof(bits));
2087 
2088     auto identical = [&](float x, float y) {
2089         uint32_t X,Y;
2090         memcpy(&X, &x, 4);
2091         memcpy(&Y, &y, 4);
2092         return X == Y;
2093     };
2094 
2095     // Test min/max with non-constant x, non-constant y.
2096     // (Whether x and y are varying or uniform shouldn't make any difference.)
2097     {
2098         skvm::Builder b;
2099         {
2100             skvm::Ptr src = b.varying<float>(),
2101                        mn = b.varying<float>(),
2102                        mx = b.varying<float>();
2103 
2104             skvm::F32 x = b.loadF(src),
2105                       y = b.uniformF(b.uniform(), 0);
2106 
2107             b.storeF(mn, b.min(x,y));
2108             b.storeF(mx, b.max(x,y));
2109         }
2110 
2111         test_jit_and_interpreter(b, [&](const skvm::Program& program){
2112             float mn[8], mx[8];
2113             for (int i = 0; i < 8; i++) {
2114                 // min() and max() everything with f[i].
2115                 program.eval(8, f,mn,mx, &f[i]);
2116 
2117                 for (int j = 0; j < 8; j++) {
2118                     REPORTER_ASSERT(r, identical(mn[j], std::min(f[j], f[i])));
2119                     REPORTER_ASSERT(r, identical(mx[j], std::max(f[j], f[i])));
2120                 }
2121             }
2122         });
2123     }
2124 
2125     // Test each with constant on the right.
2126     for (int i = 0; i < 8; i++) {
2127         skvm::Builder b;
2128         {
2129             skvm::Ptr src = b.varying<float>(),
2130                        mn = b.varying<float>(),
2131                        mx = b.varying<float>();
2132 
2133             skvm::F32 x = b.loadF(src),
2134                       y = b.splat(f[i]);
2135 
2136             b.storeF(mn, b.min(x,y));
2137             b.storeF(mx, b.max(x,y));
2138         }
2139 
2140         test_jit_and_interpreter(b, [&](const skvm::Program& program){
2141             float mn[8], mx[8];
2142             program.eval(8, f,mn,mx);
2143             for (int j = 0; j < 8; j++) {
2144                 REPORTER_ASSERT(r, identical(mn[j], std::min(f[j], f[i])));
2145                 REPORTER_ASSERT(r, identical(mx[j], std::max(f[j], f[i])));
2146             }
2147         });
2148     }
2149 
2150     // Test each with constant on the left.
2151     for (int i = 0; i < 8; i++) {
2152         skvm::Builder b;
2153         {
2154             skvm::Ptr src = b.varying<float>(),
2155                        mn = b.varying<float>(),
2156                        mx = b.varying<float>();
2157 
2158             skvm::F32 x = b.splat(f[i]),
2159                       y = b.loadF(src);
2160 
2161             b.storeF(mn, b.min(x,y));
2162             b.storeF(mx, b.max(x,y));
2163         }
2164 
2165         test_jit_and_interpreter(b, [&](const skvm::Program& program){
2166             float mn[8], mx[8];
2167             program.eval(8, f,mn,mx);
2168             for (int j = 0; j < 8; j++) {
2169                 REPORTER_ASSERT(r, identical(mn[j], std::min(f[i], f[j])));
2170                 REPORTER_ASSERT(r, identical(mx[j], std::max(f[i], f[j])));
2171             }
2172         });
2173     }
2174 }
2175 
DEF_TEST(SkVM_halfs, r) {
    constexpr int kCount = 8;

    // Paired tables: hs[i] is the 16-bit half-float encoding checked against fs[i].
    const uint16_t hs[] = {0x0000,0x3800,0x3c00,0x4000,
                           0xc400,0xb800,0xbc00,0xc000};
    const float fs[] = {+0.0f,+0.5f,+1.0f,+2.0f,
                        -4.0f,-0.5f,-1.0f,-2.0f};

    // half -> float
    {
        skvm::Builder b;
        skvm::Ptr src = b.varying<uint16_t>();
        skvm::Ptr dst = b.varying<float>();
        b.storeF(dst, b.from_fp16(b.load16(src)));

        test_jit_and_interpreter(b, [&](const skvm::Program& program){
            float widened[kCount];
            program.eval(kCount, hs, widened);
            for (int i = 0; i < kCount; i++) {
                REPORTER_ASSERT(r, widened[i] == fs[i]);
            }
        });
    }

    // float -> half
    {
        skvm::Builder b;
        skvm::Ptr src = b.varying<float>();
        skvm::Ptr dst = b.varying<uint16_t>();
        b.store16(dst, b.to_fp16(b.loadF(src)));

        test_jit_and_interpreter(b, [&](const skvm::Program& program){
            uint16_t narrowed[kCount];
            program.eval(kCount, fs, narrowed);
            for (int i = 0; i < kCount; i++) {
                REPORTER_ASSERT(r, narrowed[i] == hs[i]);
            }
        });
    }
}
2210 
DEF_TEST(SkVM_64bit, r) {
    constexpr int kCount = 65;

    // lo[] holds even numbers, hi[] the following odd ones, and wide[i] packs
    // hi[i] into the upper 32 bits above lo[i].
    uint32_t lo[kCount];
    uint32_t hi[kCount];
    uint64_t wide[kCount];
    for (int i = 0; i < kCount; i++) {
        lo[i]   = 2*i+0;
        hi[i]   = 2*i+1;
        wide[i] = (static_cast<uint64_t>(hi[i]) << 32)
                | (static_cast<uint64_t>(lo[i]) <<  0);
    }

    // Split each 64-bit value into its two 32-bit halves with load64().
    {
        skvm::Builder b;
        {
            skvm::Ptr wide_ptr = b.varying<uint64_t>();
            skvm::Ptr lo_ptr   = b.varying<int>();
            skvm::Ptr hi_ptr   = b.varying<int>();
            b.store32(lo_ptr, b.load64(wide_ptr, 0));
            b.store32(hi_ptr, b.load64(wide_ptr, 1));
        }
        test_jit_and_interpreter(b, [&](const skvm::Program& program){
            uint32_t got_lo[kCount];
            uint32_t got_hi[kCount];
            program.eval(kCount, wide,got_lo,got_hi);
            for (int i = 0; i < kCount; i++) {
                REPORTER_ASSERT(r, got_lo[i] == lo[i]);
                REPORTER_ASSERT(r, got_hi[i] == hi[i]);
            }
        });
    }

    // Join two 32-bit halves back into one 64-bit value with store64().
    {
        skvm::Builder b;
        {
            skvm::Ptr wide_ptr = b.varying<uint64_t>();
            skvm::Ptr lo_ptr   = b.varying<int>();
            skvm::Ptr hi_ptr   = b.varying<int>();
            b.store64(wide_ptr, b.load32(lo_ptr), b.load32(hi_ptr));
        }
        test_jit_and_interpreter(b, [&](const skvm::Program& program){
            uint64_t got_wide[kCount];
            program.eval(kCount, got_wide,lo,hi);
            for (int i = 0; i < kCount; i++) {
                REPORTER_ASSERT(r, got_wide[i] == wide[i]);
            }
        });
    }
}
2258 
DEF_TEST(SkVM_128bit, r) {
    constexpr int kPixels   = 63;
    constexpr int kChannels = 4*kPixels;

    float   floats[kChannels];
    uint8_t packed[kChannels];

    // floats[i] is i/255, so channel i is expected to convert to the byte value i.
    for (int i = 0; i < kChannels; i++) {
        floats[i] = i * (1/255.0f);
    }

    skvm::PixelFormat rgba_ffff = skvm::SkColorType_to_PixelFormat(kRGBA_F32_SkColorType);
    skvm::PixelFormat rgba_8888 = skvm::SkColorType_to_PixelFormat(kRGBA_8888_SkColorType);

    // RGBA F32 -> RGBA 8888, which exercises 128-bit loads.
    {
        skvm::Builder b;
        {
            skvm::Ptr dst = b.arg( 4);
            skvm::Ptr src = b.arg(16);

            skvm::Color c = b.load(rgba_ffff, src);
            b.store(rgba_8888, dst, c);
        }
        test_jit_and_interpreter(b, [&](const skvm::Program& program){
            memset(packed, 0, sizeof(packed));
            program.eval(kPixels, packed, floats);
            for (int i = 0; i < kChannels; i++) {
                REPORTER_ASSERT(r, packed[i] == i);
            }
        });
    }

    // RGBA 8888 -> RGBA F32, which exercises 128-bit stores.
    // This reads the bytes left in packed[] by the conversion above.
    {
        skvm::Builder b;
        {
            skvm::Ptr dst = b.arg(16);
            skvm::Ptr src = b.arg( 4);

            skvm::Color c = b.load(rgba_8888, src);
            b.store(rgba_ffff, dst, c);
        }
        test_jit_and_interpreter(b, [&](const skvm::Program& program){
            memset(floats, 0, sizeof(floats));
            program.eval(kPixels, floats, packed);
            for (int i = 0; i < kChannels; i++) {
                REPORTER_ASSERT(r, floats[i] == i * (1/255.0f));
            }
        });
    }
}
2308 
DEF_TEST(SkVM_is_NaN_is_finite, r) {
    // One float input; two int outputs holding the is_NaN() and is_finite() masks.
    skvm::Builder b;
    {
        skvm::Ptr src = b.varying<float>();
        skvm::Ptr nan = b.varying<int>();
        skvm::Ptr fin = b.varying<int>();
        b.store32(nan, is_NaN   (b.loadF(src)));
        b.store32(fin, is_finite(b.loadF(src)));
    }
    test_jit_and_interpreter(b, [&](const skvm::Program& program){
        constexpr int kCount = 8;

        // Indices 0-1 are ±NaN, 2-3 are ±0, 4-5 are ±1, 6-7 are ±inf.
        const uint32_t bits[] = {0x7f80'0001, 0xff80'0001, 0x0000'0000, 0x8000'0000,
                                 0x3f80'0000, 0xbf80'0000, 0x7f80'0000, 0xff80'0000};
        uint32_t nan_mask[kCount];
        uint32_t fin_mask[kCount];
        program.eval(kCount, bits, nan_mask,fin_mask);

        for (int i = 0; i < kCount; i++) {
            // Only the first two inputs are NaN; only the zeros and ones are finite.
            const bool expect_nan    = (i == 0 || i == 1);
            const bool expect_finite = (i == 2 || i == 3 || i == 4 || i == 5);

            REPORTER_ASSERT(r, nan_mask[i] == (expect_nan    ? 0xffffffff : 0));
            REPORTER_ASSERT(r, fin_mask[i] == (expect_finite ? 0xffffffff : 0));
        }
    });
}
2332 
DEF_TEST(SkVM_args, r) {
    // Make sure a program with six pointer arguments works: one output, five inputs.
    skvm::Builder b;
    {
        skvm::Ptr dst = b.varying<float>();
        skvm::Ptr A   = b.varying<float>();
        skvm::Ptr B   = b.varying<float>();
        skvm::Ptr C   = b.varying<float>();
        skvm::Ptr D   = b.varying<float>();
        skvm::Ptr E   = b.varying<float>();
        storeF(dst, b.loadF(A)
                  + b.loadF(B)
                  + b.loadF(C)
                  + b.loadF(D)
                  + b.loadF(E));
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program){
        constexpr int kCount = 17;

        // Every input holds i at index i, so the sum of the five should be 5*i.
        float dst[kCount],A[kCount],B[kCount],C[kCount],D[kCount],E[kCount];
        for (int i = 0; i < kCount; i++) {
            const float v = (float)i;
            A[i] = B[i] = C[i] = D[i] = E[i] = v;
        }
        program.eval(kCount, dst,A,B,C,D,E);
        for (int i = 0; i < kCount; i++) {
            REPORTER_ASSERT(r, dst[i] == 5.0f*i);
        }
    });
}
2361 
DEF_TEST(SkVM_badpack, r) {
    // Reduced from a real draw that failed,
    // back when the arm64 implementation of pack() was wrong.
    skvm::Builder p;
    {
        skvm::Ptr uniforms = p.uniform();
        skvm::Ptr dst      = p.varying<uint16_t>();

        // Red comes from the float at byte offset 8 of the uniforms, scaled to 4 bits;
        // alpha is the constant 0xf.
        skvm::I32 r = round(p.uniformF(uniforms, 8) * 15);
        skvm::I32 a = p.splat(0xf);

        skvm::I32 _4444 = p.splat(0);
        _4444 = pack(_4444, r, 12);
        _4444 = pack(_4444, a,  0);
        store16(dst, _4444);
    }

    test_jit_and_interpreter(p, [&](const skvm::Program& program){
        constexpr int kCount = 17;

        const float uniforms[] = { 0.0f, 0.0f,
                                   1.0f, 0.0f, 0.0f, 1.0f };

        uint16_t dst[kCount] = {0};
        program.eval(kCount, uniforms,dst);
        for (int i = 0; i < kCount; i++) {
            REPORTER_ASSERT(r, dst[i] == 0xf00f, "got %04x, want %04x\n", dst[i], 0xf00f);
        }
    });
}
2390 
DEF_TEST(SkVM_features, r) {
    // Emits a load, x*x+x, and a store into the given builder.
    auto build_program = [](skvm::Builder* b) {
        skvm::Ptr src = b->varying<float>();
        skvm::F32 x   = b->loadF(src);

        skvm::Ptr dst = b->varying<float>();
        b->storeF(dst, x*x+x);
    };

    // With FMA enabled we expect load-fma-store.
    {
        skvm::Features features;
        features.fma = true;

        skvm::Builder b(features);
        build_program(&b);

        const auto count = b.optimize().size();
        REPORTER_ASSERT(r, count == 3);
    }

    // With FMA disabled we expect load-mul-add-store.
    {
        skvm::Features features;
        features.fma = false;

        skvm::Builder b(features);
        build_program(&b);

        const auto count = b.optimize().size();
        REPORTER_ASSERT(r, count == 4);
    }

    // With auto-detected features either outcome is acceptable.
    {
        skvm::Builder b;
        build_program(&b);

        const auto count = b.optimize().size();
        REPORTER_ASSERT(r, count == 3 || count == 4);
    }
}
2420 
DEF_TEST(SkVM_gather_can_hoist, r) {
    // Whether a gather is varying or uniform is decided by its index, not by the
    // fact that it is a gather.

    // Usual case: the index is loaded from a varying buffer.
    {
        skvm::Builder b;
        skvm::Ptr uniforms = b.uniform();
        skvm::Ptr buf      = b.varying<int>();

        skvm::I32 ix = b.load32(buf);
        b.store32(buf, b.gather32(uniforms,0, ix));

        skvm::Program p = b.done();

        // A varying ix makes the gather varying, so everything lives in the loop:
        //
        // loop:
        //     v0 = load32 buf
        //     v1 = gather32 uniforms+0 v0
        //     store32 buf v1
        REPORTER_ASSERT(r, p.instructions().size() == 3);
        REPORTER_ASSERT(r, p.loop() == 0);
    }

    // Same program, except the index is read from the uniforms.
    {
        skvm::Builder b;
        skvm::Ptr uniforms = b.uniform();
        skvm::Ptr buf      = b.varying<int>();

        skvm::I32 ix = b.uniform32(uniforms,8);
        b.store32(buf, b.gather32(uniforms,0, ix));

        skvm::Program p = b.done();

        // A uniform ix makes the gather uniform, so only the store stays in the loop:
        //
        // v0 = uniform32 uniforms+8
        // v1 = gather32 uniforms+0 v0
        // loop:
        //     store32 buf v1
        REPORTER_ASSERT(r, p.instructions().size() == 3);
        REPORTER_ASSERT(r, p.loop() == 2);
    }
}
2463 
DEF_TEST(SkVM_dont_dedup_loads, r) {
    // Ops with identical arguments are normally assumed to yield identical values,
    // so they get deduplicated -- effectively a simple common subexpression eliminator.
    //
    // That is unsound for two identical loads separated by a store.
    // Were the loads below deduplicated, the buffer would go up by 1 instead of K.
    constexpr int K = 2;

    skvm::Builder b;
    {
        skvm::Ptr buf = b.varying<int>();
        for (int pass = 0; pass < K; pass++) {
            skvm::I32 bumped = b.load32(buf) + 1;
            b.store32(buf, bumped);
        }
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program){
        int buf[] = { 0,1,2,3,4 };
        const int count = (int)SK_ARRAY_COUNT(buf);

        program.eval(SK_ARRAY_COUNT(buf), buf);
        for (int i = 0; i < count; i++) {
            REPORTER_ASSERT(r, buf[i] == i+K);
        }
    });
}
2487 
DEF_TEST(SkVM_dont_dedup_stores, r) {
    // The reasoning of SkVM_dont_dedup_loads applies to stores as well.
    // If a different store lands between two identical stores, it overwrites the
    // first one, so the second identical store really must be issued again.
    skvm::Builder b;
    {
        skvm::Ptr buf = b.varying<int>();

        skvm::I32 four = b.splat(4);
        b.store32(buf, four);

        skvm::I32 five = b.splat(5);
        b.store32(buf, five);

        // This is the store that deduplication would wrongly drop.
        skvm::I32 four_again = b.splat(4);
        b.store32(buf, four_again);
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program){
        int buf[42];
        program.eval(SK_ARRAY_COUNT(buf), buf);
        for (const int value : buf) {
            REPORTER_ASSERT(r, value == 4);
        }
    });
}
2508 
DEF_TEST(SkVM_fast_mul, r) {
    // Computes 0*x two ways, via fast_mul() and via the ordinary multiply,
    // writing each result to its own output buffer.
    skvm::Builder b;
    {
        skvm::Ptr src  = b.varying<float>();
        skvm::Ptr fast = b.varying<float>();
        skvm::Ptr slow = b.varying<float>();

        skvm::F32 x = b.loadF(src);
        b.storeF(fast, fast_mul(0.0f, x));
        b.storeF(slow, 0.0f * x);
    }
    test_jit_and_interpreter(b, [&](const skvm::Program& program){
        constexpr int kCount = 8;

        // The first four inputs are ordinary values, the last four are inf or NaN.
        const uint32_t bits[] = {
            0x0000'0000, 0x8000'0000, //±0
            0x3f80'0000, 0xbf80'0000, //±1
            0x7f80'0000, 0xff80'0000, //±inf
            0x7f80'0001, 0xff80'0001, //±NaN
        };
        float fast[kCount];
        float slow[kCount];
        program.eval(kCount,bits,fast,slow);

        for (int i = 0; i < kCount; i++) {
            // fast_mul() is expected to give zero for every input...
            REPORTER_ASSERT(r, fast[i] == 0.0f);

            // ...while the ordinary multiply gives NaN for the inf and NaN inputs.
            if (i >= 4) {
                REPORTER_ASSERT(r, isnan(slow[i]));
            } else {
                REPORTER_ASSERT(r, slow[i] == 0.0f);
            }
        }
    });
}
2541