1 /*
2  * Copyright © 2018 Valve Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  */
24 
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27 
28 #include <vector>
29 #include <set>
30 #include <unordered_set>
31 #include <bitset>
32 #include <memory>
33 
34 #include "nir.h"
35 #include "ac_binary.h"
36 #include "amd_family.h"
37 #include "aco_opcodes.h"
38 #include "aco_util.h"
39 
40 #include "vulkan/radv_shader.h"
41 
42 struct radv_shader_args;
43 struct radv_shader_info;
44 
45 namespace aco {
46 
47 extern uint64_t debug_flags;
48 
49 enum {
50    DEBUG_VALIDATE_IR = 0x1,
51    DEBUG_VALIDATE_RA = 0x2,
52    DEBUG_PERFWARN = 0x4,
53    DEBUG_FORCE_WAITCNT = 0x8,
54    DEBUG_NO_VN = 0x10,
55    DEBUG_NO_OPT = 0x20,
56    DEBUG_NO_SCHED = 0x40,
57 };
58 
59 /**
60  * Representation of the instruction's microcode encoding format
61  * Note: Some Vector ALU Formats can be combined, such that:
62  * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
63  * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
64  * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
65  *
66  * (*) The same is applicable for VOP1 and VOPC instructions.
67  */
68 enum class Format : std::uint16_t {
69    /* Pseudo Instruction Format */
70    PSEUDO = 0,
71    /* Scalar ALU & Control Formats */
72    SOP1 = 1,
73    SOP2 = 2,
74    SOPK = 3,
75    SOPP = 4,
76    SOPC = 5,
77    /* Scalar Memory Format */
78    SMEM = 6,
79    /* LDS/GDS Format */
80    DS = 8,
81    /* Vector Memory Buffer Formats */
82    MTBUF = 9,
83    MUBUF = 10,
84    /* Vector Memory Image Format */
85    MIMG = 11,
86    /* Export Format */
87    EXP = 12,
88    /* Flat Formats */
89    FLAT = 13,
90    GLOBAL = 14,
91    SCRATCH = 15,
92 
93    PSEUDO_BRANCH = 16,
94    PSEUDO_BARRIER = 17,
95    PSEUDO_REDUCTION = 18,
96 
97    /* Vector ALU Formats */
98    VOP3P = 19,
99    VOP1 = 1 << 8,
100    VOP2 = 1 << 9,
101    VOPC = 1 << 10,
102    VOP3 = 1 << 11,
103    VOP3A = 1 << 11,
104    VOP3B = 1 << 11,
105    /* Vector Parameter Interpolation Format */
106    VINTRP = 1 << 12,
107    DPP = 1 << 13,
108    SDWA = 1 << 14,
109 };
110 
111 enum storage_class : uint8_t {
112    storage_none = 0x0, /* no synchronization and can be reordered around aliasing stores */
113    storage_buffer = 0x1, /* SSBOs and global memory */
114    storage_atomic_counter = 0x2, /* not used for Vulkan */
115    storage_image = 0x4,
116    storage_shared = 0x8, /* or TCS output */
117    storage_vmem_output = 0x10, /* GS or TCS output stores using VMEM */
118    storage_scratch = 0x20,
119    storage_vgpr_spill = 0x40,
120    storage_count = 8,
121 };
122 
123 enum memory_semantics : uint8_t {
124    semantic_none = 0x0,
125    /* for loads: don't move any access after this load to before this load (even other loads)
126     * for barriers: don't move any access after the barrier to before any
127     * atomics/control_barriers/sendmsg_gs_done before the barrier */
128    semantic_acquire = 0x1,
129    /* for stores: don't move any access before this store to after this store
130     * for barriers: don't move any access before the barrier to after any
131     * atomics/control_barriers/sendmsg_gs_done after the barrier */
132    semantic_release = 0x2,
133 
134    /* the rest are for load/stores/atomics only */
135    /* cannot be DCE'd or CSE'd */
136    semantic_volatile = 0x4,
137    /* does not interact with barriers and assumes this lane is the only lane
138     * accessing this memory */
139    semantic_private = 0x8,
140    /* this operation can be reordered around operations of the same storage. says nothing about barriers */
141    semantic_can_reorder = 0x10,
142    /* this is a atomic instruction (may only read or write memory) */
143    semantic_atomic = 0x20,
144    /* this is instruction both reads and writes memory */
145    semantic_rmw = 0x40,
146 
147    semantic_acqrel = semantic_acquire | semantic_release,
148    semantic_atomicrmw = semantic_volatile | semantic_atomic | semantic_rmw,
149 };
150 
151 enum sync_scope : uint8_t {
152    scope_invocation = 0,
153    scope_subgroup = 1,
154    scope_workgroup = 2,
155    scope_queuefamily = 3,
156    scope_device = 4,
157 };
158 
159 struct memory_sync_info {
memory_sync_infomemory_sync_info160    memory_sync_info() : storage(storage_none), semantics(semantic_none), scope(scope_invocation) {}
161    memory_sync_info(int storage, int semantics=0, sync_scope scope=scope_invocation)
162       : storage((storage_class)storage), semantics((memory_semantics)semantics), scope(scope) {}
163 
164    storage_class storage:8;
165    memory_semantics semantics:8;
166    sync_scope scope:8;
167 
168    bool operator == (const memory_sync_info& rhs) const {
169       return storage == rhs.storage &&
170              semantics == rhs.semantics &&
171              scope == rhs.scope;
172    }
173 
can_reordermemory_sync_info174    bool can_reorder() const {
175       if (semantics & semantic_acqrel)
176          return false;
177       /* Also check storage so that zero-initialized memory_sync_info can be
178        * reordered. */
179       return (!storage || (semantics & semantic_can_reorder)) && !(semantics & semantic_volatile);
180    }
181 };
182 static_assert(sizeof(memory_sync_info) == 3, "Unexpected padding");
183 
184 enum fp_round {
185    fp_round_ne = 0,
186    fp_round_pi = 1,
187    fp_round_ni = 2,
188    fp_round_tz = 3,
189 };
190 
191 enum fp_denorm {
192    /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
193     * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
194    fp_denorm_flush = 0x0,
195    fp_denorm_keep_in = 0x1,
196    fp_denorm_keep_out = 0x2,
197    fp_denorm_keep = 0x3,
198 };
199 
200 struct float_mode {
201    /* matches encoding of the MODE register */
202    union {
203       struct {
204           fp_round round32:2;
205           fp_round round16_64:2;
206           unsigned denorm32:2;
207           unsigned denorm16_64:2;
208       };
209       struct {
210          uint8_t round:4;
211          uint8_t denorm:4;
212       };
213       uint8_t val = 0;
214    };
215    /* if false, optimizations which may remove infs/nan/-0.0 can be done */
216    bool preserve_signed_zero_inf_nan32:1;
217    bool preserve_signed_zero_inf_nan16_64:1;
218    /* if false, optimizations which may remove denormal flushing can be done */
219    bool must_flush_denorms32:1;
220    bool must_flush_denorms16_64:1;
221    bool care_about_round32:1;
222    bool care_about_round16_64:1;
223 
224    /* Returns true if instructions using the mode "other" can safely use the
225     * current one instead. */
canReplacefloat_mode226    bool canReplace(float_mode other) const noexcept {
227       return val == other.val &&
228              (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
229              (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
230              (must_flush_denorms32  || !other.must_flush_denorms32) &&
231              (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
232              (care_about_round32 || !other.care_about_round32) &&
233              (care_about_round16_64 || !other.care_about_round16_64);
234    }
235 };
236 
asVOP3(Format format)237 constexpr Format asVOP3(Format format) {
238    return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
239 };
240 
asSDWA(Format format)241 constexpr Format asSDWA(Format format) {
242    assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
243    return (Format) ((uint32_t) Format::SDWA | (uint32_t) format);
244 }
245 
246 enum class RegType {
247    none = 0,
248    sgpr,
249    vgpr,
250    linear_vgpr,
251 };
252 
253 struct RegClass {
254 
255    enum RC : uint8_t {
256       s1 = 1,
257       s2 = 2,
258       s3 = 3,
259       s4 = 4,
260       s6 = 6,
261       s8 = 8,
262       s16 = 16,
263       v1 = s1 | (1 << 5),
264       v2 = s2 | (1 << 5),
265       v3 = s3 | (1 << 5),
266       v4 = s4 | (1 << 5),
267       v5 = 5  | (1 << 5),
268       v6 = 6  | (1 << 5),
269       v7 = 7  | (1 << 5),
270       v8 = 8  | (1 << 5),
271       /* byte-sized register class */
272       v1b = v1 | (1 << 7),
273       v2b = v2 | (1 << 7),
274       v3b = v3 | (1 << 7),
275       v4b = v4 | (1 << 7),
276       v6b = v6 | (1 << 7),
277       v8b = v8 | (1 << 7),
278       /* these are used for WWM and spills to vgpr */
279       v1_linear = v1 | (1 << 6),
280       v2_linear = v2 | (1 << 6),
281    };
282 
283    RegClass() = default;
RegClassRegClass284    constexpr RegClass(RC rc)
285       : rc(rc) {}
RegClassRegClass286    constexpr RegClass(RegType type, unsigned size)
287       : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}
288 
RCRegClass289    constexpr operator RC() const { return rc; }
290    explicit operator bool() = delete;
291 
typeRegClass292    constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
is_subdwordRegClass293    constexpr bool is_subdword() const { return rc & (1 << 7); }
bytesRegClass294    constexpr unsigned bytes() const { return ((unsigned) rc & 0x1F) * (is_subdword() ? 1 : 4); }
295    //TODO: use size() less in favor of bytes()
sizeRegClass296    constexpr unsigned size() const { return (bytes() + 3) >> 2; }
is_linearRegClass297    constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
as_linearRegClass298    constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }
as_subdwordRegClass299    constexpr RegClass as_subdword() const { return RegClass((RC) (rc | 1 << 7)); }
300 
getRegClass301    static constexpr RegClass get(RegType type, unsigned bytes) {
302       if (type == RegType::sgpr) {
303          return RegClass(type, DIV_ROUND_UP(bytes, 4u));
304       } else {
305          return bytes % 4u ? RegClass(type, bytes).as_subdword() :
306                              RegClass(type, bytes / 4u);
307       }
308    }
309 
310 private:
311    RC rc;
312 };
313 
314 /* transitional helper expressions */
315 static constexpr RegClass s1{RegClass::s1};
316 static constexpr RegClass s2{RegClass::s2};
317 static constexpr RegClass s3{RegClass::s3};
318 static constexpr RegClass s4{RegClass::s4};
319 static constexpr RegClass s8{RegClass::s8};
320 static constexpr RegClass s16{RegClass::s16};
321 static constexpr RegClass v1{RegClass::v1};
322 static constexpr RegClass v2{RegClass::v2};
323 static constexpr RegClass v3{RegClass::v3};
324 static constexpr RegClass v4{RegClass::v4};
325 static constexpr RegClass v5{RegClass::v5};
326 static constexpr RegClass v6{RegClass::v6};
327 static constexpr RegClass v7{RegClass::v7};
328 static constexpr RegClass v8{RegClass::v8};
329 static constexpr RegClass v1b{RegClass::v1b};
330 static constexpr RegClass v2b{RegClass::v2b};
331 static constexpr RegClass v3b{RegClass::v3b};
332 static constexpr RegClass v4b{RegClass::v4b};
333 static constexpr RegClass v6b{RegClass::v6b};
334 static constexpr RegClass v8b{RegClass::v8b};
335 
336 /**
337  * Temp Class
338  * Each temporary virtual register has a
339  * register class (i.e. size and type)
340  * and SSA id.
341  */
342 struct Temp {
TempTemp343    Temp() noexcept : id_(0), reg_class(0) {}
TempTemp344    constexpr Temp(uint32_t id, RegClass cls) noexcept
345       : id_(id), reg_class(uint8_t(cls)) {}
346 
idTemp347    constexpr uint32_t id() const noexcept { return id_; }
regClassTemp348    constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; }
349 
bytesTemp350    constexpr unsigned bytes() const noexcept { return regClass().bytes(); }
sizeTemp351    constexpr unsigned size() const noexcept { return regClass().size(); }
typeTemp352    constexpr RegType type() const noexcept { return regClass().type(); }
is_linearTemp353    constexpr bool is_linear() const noexcept { return regClass().is_linear(); }
354 
355    constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
356    constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
357    constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }
358 
359 private:
360    uint32_t id_: 24;
361    uint32_t reg_class : 8;
362 };
363 
364 /**
365  * PhysReg
366  * Represents the physical register for each
367  * Operand and Definition.
368  */
369 struct PhysReg {
370    constexpr PhysReg() = default;
PhysRegPhysReg371    explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {}
regPhysReg372    constexpr unsigned reg() const { return reg_b >> 2; }
bytePhysReg373    constexpr unsigned byte() const { return reg_b & 0x3; }
374    constexpr operator unsigned() const { return reg(); }
375    constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
376    constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
377    constexpr bool operator <(PhysReg other) const { return reg_b < other.reg_b; }
advancePhysReg378    constexpr PhysReg advance(int bytes) const { PhysReg res = *this; res.reg_b += bytes; return res; }
379 
380    uint16_t reg_b = 0;
381 };
382 
383 /* helper expressions for special registers */
384 static constexpr PhysReg m0{124};
385 static constexpr PhysReg vcc{106};
386 static constexpr PhysReg vcc_hi{107};
387 static constexpr PhysReg tba{108}; /* GFX6-GFX8 */
388 static constexpr PhysReg tma{110}; /* GFX6-GFX8 */
389 static constexpr PhysReg ttmp0{112};
390 static constexpr PhysReg ttmp1{113};
391 static constexpr PhysReg ttmp2{114};
392 static constexpr PhysReg ttmp3{115};
393 static constexpr PhysReg ttmp4{116};
394 static constexpr PhysReg ttmp5{117};
395 static constexpr PhysReg ttmp6{118};
396 static constexpr PhysReg ttmp7{119};
397 static constexpr PhysReg ttmp8{120};
398 static constexpr PhysReg ttmp9{121};
399 static constexpr PhysReg ttmp10{122};
400 static constexpr PhysReg ttmp11{123};
401 static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
402 static constexpr PhysReg exec{126};
403 static constexpr PhysReg exec_lo{126};
404 static constexpr PhysReg exec_hi{127};
405 static constexpr PhysReg vccz{251};
406 static constexpr PhysReg execz{252};
407 static constexpr PhysReg scc{253};
408 
409 /**
410  * Operand Class
411  * Initially, each Operand refers to either
412  * a temporary virtual register
413  * or to a constant value
414  * Temporary registers get mapped to physical register during RA
415  * Constant values are inlined into the instruction sequence.
416  */
417 class Operand final
418 {
419 public:
Operand()420    constexpr Operand()
421       : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
422         isKill_(false), isUndef_(true), isFirstKill_(false), constSize(0),
423         isLateKill_(false) {}
424 
Operand(Temp r)425    explicit Operand(Temp r) noexcept
426    {
427       data_.temp = r;
428       if (r.id()) {
429          isTemp_ = true;
430       } else {
431          isUndef_ = true;
432          setFixed(PhysReg{128});
433       }
434    };
Operand(uint8_t v)435    explicit Operand(uint8_t v) noexcept
436    {
437       /* 8-bit constants are only used for copies and copies from any 8-bit
438        * constant can be implemented with a SDWA v_mul_u32_u24. So consider all
439        * to be inline constants. */
440       data_.i = v;
441       isConstant_ = true;
442       constSize = 0;
443       setFixed(PhysReg{0u});
444    };
Operand(uint16_t v)445    explicit Operand(uint16_t v) noexcept
446    {
447       data_.i = v;
448       isConstant_ = true;
449       constSize = 1;
450       if (v <= 64)
451          setFixed(PhysReg{128u + v});
452       else if (v >= 0xFFF0) /* [-16 .. -1] */
453          setFixed(PhysReg{(unsigned)(192 - (int16_t)v)});
454       else if (v == 0x3800) /* 0.5 */
455          setFixed(PhysReg{240});
456       else if (v == 0xB800) /* -0.5 */
457          setFixed(PhysReg{241});
458       else if (v == 0x3C00) /* 1.0 */
459          setFixed(PhysReg{242});
460       else if (v == 0xBC00) /* -1.0 */
461          setFixed(PhysReg{243});
462       else if (v == 0x4000) /* 2.0 */
463          setFixed(PhysReg{244});
464       else if (v == 0xC000) /* -2.0 */
465          setFixed(PhysReg{245});
466       else if (v == 0x4400) /* 4.0 */
467          setFixed(PhysReg{246});
468       else if (v == 0xC400) /* -4.0 */
469          setFixed(PhysReg{247});
470       else if (v == 0x3118) /* 1/2 PI */
471          setFixed(PhysReg{248});
472       else /* Literal Constant */
473          setFixed(PhysReg{255});
474    };
475    explicit Operand(uint32_t v, bool is64bit = false) noexcept
476    {
477       data_.i = v;
478       isConstant_ = true;
479       constSize = is64bit ? 3 : 2;
480       if (v <= 64)
481          setFixed(PhysReg{128 + v});
482       else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
483          setFixed(PhysReg{192 - v});
484       else if (v == 0x3f000000) /* 0.5 */
485          setFixed(PhysReg{240});
486       else if (v == 0xbf000000) /* -0.5 */
487          setFixed(PhysReg{241});
488       else if (v == 0x3f800000) /* 1.0 */
489          setFixed(PhysReg{242});
490       else if (v == 0xbf800000) /* -1.0 */
491          setFixed(PhysReg{243});
492       else if (v == 0x40000000) /* 2.0 */
493          setFixed(PhysReg{244});
494       else if (v == 0xc0000000) /* -2.0 */
495          setFixed(PhysReg{245});
496       else if (v == 0x40800000) /* 4.0 */
497          setFixed(PhysReg{246});
498       else if (v == 0xc0800000) /* -4.0 */
499          setFixed(PhysReg{247});
500       else { /* Literal Constant */
501          assert(!is64bit && "attempt to create a 64-bit literal constant");
502          setFixed(PhysReg{255});
503       }
504    };
Operand(uint64_t v)505    explicit Operand(uint64_t v) noexcept
506    {
507       isConstant_ = true;
508       constSize = 3;
509       if (v <= 64) {
510          data_.i = (uint32_t) v;
511          setFixed(PhysReg{128 + (uint32_t) v});
512       } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
513          data_.i = (uint32_t) v;
514          setFixed(PhysReg{192 - (uint32_t) v});
515       } else if (v == 0x3FE0000000000000) { /* 0.5 */
516          data_.i = 0x3f000000;
517          setFixed(PhysReg{240});
518       } else if (v == 0xBFE0000000000000) { /* -0.5 */
519          data_.i = 0xbf000000;
520          setFixed(PhysReg{241});
521       } else if (v == 0x3FF0000000000000) { /* 1.0 */
522          data_.i = 0x3f800000;
523          setFixed(PhysReg{242});
524       } else if (v == 0xBFF0000000000000) { /* -1.0 */
525          data_.i = 0xbf800000;
526          setFixed(PhysReg{243});
527       } else if (v == 0x4000000000000000) { /* 2.0 */
528          data_.i = 0x40000000;
529          setFixed(PhysReg{244});
530       } else if (v == 0xC000000000000000) { /* -2.0 */
531          data_.i = 0xc0000000;
532          setFixed(PhysReg{245});
533       } else if (v == 0x4010000000000000) { /* 4.0 */
534          data_.i = 0x40800000;
535          setFixed(PhysReg{246});
536       } else if (v == 0xC010000000000000) { /* -4.0 */
537          data_.i = 0xc0800000;
538          setFixed(PhysReg{247});
539       } else { /* Literal Constant: we don't know if it is a long or double.*/
540          isConstant_ = 0;
541          assert(false && "attempt to create a 64-bit literal constant");
542       }
543    };
Operand(RegClass type)544    explicit Operand(RegClass type) noexcept
545    {
546       isUndef_ = true;
547       data_.temp = Temp(0, type);
548       setFixed(PhysReg{128});
549    };
Operand(PhysReg reg,RegClass type)550    explicit Operand(PhysReg reg, RegClass type) noexcept
551    {
552       data_.temp = Temp(0, type);
553       setFixed(reg);
554    }
555 
isTemp()556    constexpr bool isTemp() const noexcept
557    {
558       return isTemp_;
559    }
560 
setTemp(Temp t)561    constexpr void setTemp(Temp t) noexcept {
562       assert(!isConstant_);
563       isTemp_ = true;
564       data_.temp = t;
565    }
566 
getTemp()567    constexpr Temp getTemp() const noexcept
568    {
569       return data_.temp;
570    }
571 
tempId()572    constexpr uint32_t tempId() const noexcept
573    {
574       return data_.temp.id();
575    }
576 
hasRegClass()577    constexpr bool hasRegClass() const noexcept
578    {
579       return isTemp() || isUndefined();
580    }
581 
regClass()582    constexpr RegClass regClass() const noexcept
583    {
584       return data_.temp.regClass();
585    }
586 
bytes()587    constexpr unsigned bytes() const noexcept
588    {
589       if (isConstant())
590          return 1 << constSize;
591       else
592          return data_.temp.bytes();
593    }
594 
size()595    constexpr unsigned size() const noexcept
596    {
597       if (isConstant())
598          return constSize > 2 ? 2 : 1;
599       else
600          return data_.temp.size();
601    }
602 
isFixed()603    constexpr bool isFixed() const noexcept
604    {
605       return isFixed_;
606    }
607 
physReg()608    constexpr PhysReg physReg() const noexcept
609    {
610       return reg_;
611    }
612 
setFixed(PhysReg reg)613    constexpr void setFixed(PhysReg reg) noexcept
614    {
615       isFixed_ = reg != unsigned(-1);
616       reg_ = reg;
617    }
618 
isConstant()619    constexpr bool isConstant() const noexcept
620    {
621       return isConstant_;
622    }
623 
isLiteral()624    constexpr bool isLiteral() const noexcept
625    {
626       return isConstant() && reg_ == 255;
627    }
628 
isUndefined()629    constexpr bool isUndefined() const noexcept
630    {
631       return isUndef_;
632    }
633 
constantValue()634    constexpr uint32_t constantValue() const noexcept
635    {
636       return data_.i;
637    }
638 
constantEquals(uint32_t cmp)639    constexpr bool constantEquals(uint32_t cmp) const noexcept
640    {
641       return isConstant() && constantValue() == cmp;
642    }
643 
644    constexpr uint64_t constantValue64(bool signext=false) const noexcept
645    {
646       if (constSize == 3) {
647          if (reg_ <= 192)
648             return reg_ - 128;
649          else if (reg_ <= 208)
650             return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);
651 
652          switch (reg_) {
653          case 240:
654             return 0x3FE0000000000000;
655          case 241:
656             return 0xBFE0000000000000;
657          case 242:
658             return 0x3FF0000000000000;
659          case 243:
660             return 0xBFF0000000000000;
661          case 244:
662             return 0x4000000000000000;
663          case 245:
664             return 0xC000000000000000;
665          case 246:
666             return 0x4010000000000000;
667          case 247:
668             return 0xC010000000000000;
669          }
670       } else if (constSize == 1) {
671          return (signext && (data_.i & 0x8000u) ? 0xffffffffffff0000ull : 0ull) | data_.i;
672       } else if (constSize == 0) {
673          return (signext && (data_.i & 0x80u) ? 0xffffffffffffff00ull : 0ull) | data_.i;
674       }
675       return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
676    }
677 
isOfType(RegType type)678    constexpr bool isOfType(RegType type) const noexcept
679    {
680       return hasRegClass() && regClass().type() == type;
681    }
682 
683    /* Indicates that the killed operand's live range intersects with the
684     * instruction's definitions. Unlike isKill() and isFirstKill(), this is
685     * not set by liveness analysis. */
setLateKill(bool flag)686    constexpr void setLateKill(bool flag) noexcept
687    {
688       isLateKill_ = flag;
689    }
690 
isLateKill()691    constexpr bool isLateKill() const noexcept
692    {
693       return isLateKill_;
694    }
695 
setKill(bool flag)696    constexpr void setKill(bool flag) noexcept
697    {
698       isKill_ = flag;
699       if (!flag)
700          setFirstKill(false);
701    }
702 
isKill()703    constexpr bool isKill() const noexcept
704    {
705       return isKill_ || isFirstKill();
706    }
707 
setFirstKill(bool flag)708    constexpr void setFirstKill(bool flag) noexcept
709    {
710       isFirstKill_ = flag;
711       if (flag)
712          setKill(flag);
713    }
714 
715    /* When there are multiple operands killing the same temporary,
716     * isFirstKill() is only returns true for the first one. */
isFirstKill()717    constexpr bool isFirstKill() const noexcept
718    {
719       return isFirstKill_;
720    }
721 
isKillBeforeDef()722    constexpr bool isKillBeforeDef() const noexcept
723    {
724       return isKill() && !isLateKill();
725    }
726 
isFirstKillBeforeDef()727    constexpr bool isFirstKillBeforeDef() const noexcept
728    {
729       return isFirstKill() && !isLateKill();
730    }
731 
732    constexpr bool operator == (Operand other) const noexcept
733    {
734       if (other.size() != size())
735          return false;
736       if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
737          return false;
738       if (isFixed() && other.isFixed() && physReg() != other.physReg())
739          return false;
740       if (isLiteral())
741          return other.isLiteral() && other.constantValue() == constantValue();
742       else if (isConstant())
743          return other.isConstant() && other.physReg() == physReg();
744       else if (isUndefined())
745          return other.isUndefined() && other.regClass() == regClass();
746       else
747          return other.isTemp() && other.getTemp() == getTemp();
748    }
749 private:
750    union {
751       uint32_t i;
752       float f;
753       Temp temp = Temp(0, s1);
754    } data_;
755    PhysReg reg_;
756    union {
757       struct {
758          uint8_t isTemp_:1;
759          uint8_t isFixed_:1;
760          uint8_t isConstant_:1;
761          uint8_t isKill_:1;
762          uint8_t isUndef_:1;
763          uint8_t isFirstKill_:1;
764          uint8_t constSize:2;
765          uint8_t isLateKill_:1;
766       };
767       /* can't initialize bit-fields in c++11, so work around using a union */
768       uint16_t control_ = 0;
769    };
770 };
771 
772 /**
773  * Definition Class
774  * Definitions are the results of Instructions
775  * and refer to temporary virtual registers
776  * which are later mapped to physical registers
777  */
778 class Definition final
779 {
780 public:
Definition()781    constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0),
782                             isKill_(0), isPrecise_(0), isNUW_(0) {}
Definition(uint32_t index,RegClass type)783    Definition(uint32_t index, RegClass type) noexcept
784       : temp(index, type) {}
Definition(Temp tmp)785    explicit Definition(Temp tmp) noexcept
786       : temp(tmp) {}
Definition(PhysReg reg,RegClass type)787    Definition(PhysReg reg, RegClass type) noexcept
788       : temp(Temp(0, type))
789    {
790       setFixed(reg);
791    }
Definition(uint32_t tmpId,PhysReg reg,RegClass type)792    Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
793       : temp(Temp(tmpId, type))
794    {
795       setFixed(reg);
796    }
797 
isTemp()798    constexpr bool isTemp() const noexcept
799    {
800       return tempId() > 0;
801    }
802 
getTemp()803    constexpr Temp getTemp() const noexcept
804    {
805       return temp;
806    }
807 
tempId()808    constexpr uint32_t tempId() const noexcept
809    {
810       return temp.id();
811    }
812 
setTemp(Temp t)813    constexpr void setTemp(Temp t) noexcept {
814       temp = t;
815    }
816 
regClass()817    constexpr RegClass regClass() const noexcept
818    {
819       return temp.regClass();
820    }
821 
bytes()822    constexpr unsigned bytes() const noexcept
823    {
824       return temp.bytes();
825    }
826 
size()827    constexpr unsigned size() const noexcept
828    {
829       return temp.size();
830    }
831 
isFixed()832    constexpr bool isFixed() const noexcept
833    {
834       return isFixed_;
835    }
836 
physReg()837    constexpr PhysReg physReg() const noexcept
838    {
839       return reg_;
840    }
841 
setFixed(PhysReg reg)842    constexpr void setFixed(PhysReg reg) noexcept
843    {
844       isFixed_ = 1;
845       reg_ = reg;
846    }
847 
setHint(PhysReg reg)848    constexpr void setHint(PhysReg reg) noexcept
849    {
850       hasHint_ = 1;
851       reg_ = reg;
852    }
853 
hasHint()854    constexpr bool hasHint() const noexcept
855    {
856       return hasHint_;
857    }
858 
setKill(bool flag)859    constexpr void setKill(bool flag) noexcept
860    {
861       isKill_ = flag;
862    }
863 
isKill()864    constexpr bool isKill() const noexcept
865    {
866       return isKill_;
867    }
868 
setPrecise(bool precise)869    constexpr void setPrecise(bool precise) noexcept
870    {
871       isPrecise_ = precise;
872    }
873 
isPrecise()874    constexpr bool isPrecise() const noexcept
875    {
876       return isPrecise_;
877    }
878 
879    /* No Unsigned Wrap */
setNUW(bool nuw)880    constexpr void setNUW(bool nuw) noexcept
881    {
882       isNUW_ = nuw;
883    }
884 
isNUW()885    constexpr bool isNUW() const noexcept
886    {
887       return isNUW_;
888    }
889 
890 private:
891    Temp temp = Temp(0, s1);
892    PhysReg reg_;
893    union {
894       struct {
895          uint8_t isFixed_:1;
896          uint8_t hasHint_:1;
897          uint8_t isKill_:1;
898          uint8_t isPrecise_:1;
899          uint8_t isNUW_:1;
900       };
901       /* can't initialize bit-fields in c++11, so work around using a union */
902       uint8_t control_ = 0;
903    };
904 };
905 
906 struct Block;
907 
908 struct Instruction {
909    aco_opcode opcode;
910    Format format;
911    uint32_t pass_flags;
912 
913    aco::span<Operand> operands;
914    aco::span<Definition> definitions;
915 
isVALUInstruction916    constexpr bool isVALU() const noexcept
917    {
918       return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
919           || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
920           || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
921           || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
922           || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
923           || format == Format::VOP3P;
924    }
925 
isSALUInstruction926    constexpr bool isSALU() const noexcept
927    {
928       return format == Format::SOP1 ||
929              format == Format::SOP2 ||
930              format == Format::SOPC ||
931              format == Format::SOPK ||
932              format == Format::SOPP;
933    }
934 
isVMEMInstruction935    constexpr bool isVMEM() const noexcept
936    {
937       return format == Format::MTBUF ||
938              format == Format::MUBUF ||
939              format == Format::MIMG;
940    }
941 
isDPPInstruction942    constexpr bool isDPP() const noexcept
943    {
944       return (uint16_t) format & (uint16_t) Format::DPP;
945    }
946 
isVOP3Instruction947    constexpr bool isVOP3() const noexcept
948    {
949       return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
950              ((uint16_t) format & (uint16_t) Format::VOP3B);
951    }
952 
isSDWAInstruction953    constexpr bool isSDWA() const noexcept
954    {
955       return (uint16_t) format & (uint16_t) Format::SDWA;
956    }
957 
isFlatOrGlobalInstruction958    constexpr bool isFlatOrGlobal() const noexcept
959    {
960       return format == Format::FLAT || format == Format::GLOBAL;
961    }
962 
963    constexpr bool usesModifiers() const noexcept;
964 
reads_execInstruction965    constexpr bool reads_exec() const noexcept
966    {
967       for (const Operand& op : operands) {
968          if (op.isFixed() && op.physReg() == exec)
969             return true;
970       }
971       return false;
972    }
973 };
974 static_assert(sizeof(Instruction) == 16, "Unexpected padding");
975 
976 struct SOPK_instruction : public Instruction {
977    uint16_t imm;
978    uint16_t padding;
979 };
980 static_assert(sizeof(SOPK_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
981 
982 struct SOPP_instruction : public Instruction {
983    uint32_t imm;
984    int block;
985 };
986 static_assert(sizeof(SOPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
987 
988 struct SOPC_instruction : public Instruction {
989 };
990 static_assert(sizeof(SOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
991 
992 struct SOP1_instruction : public Instruction {
993 };
994 static_assert(sizeof(SOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
995 
996 struct SOP2_instruction : public Instruction {
997 };
998 static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
999 
1000 /**
1001  * Scalar Memory Format:
1002  * For s_(buffer_)load_dword*:
1003  * Operand(0): SBASE - SGPR-pair which provides base address
1004  * Operand(1): Offset - immediate (un)signed offset or SGPR
1005  * Operand(2) / Definition(0): SDATA - SGPR for read / write result
1006  * Operand(n-1): SOffset - SGPR offset (Vega only)
1007  *
1008  * Having no operands is also valid for instructions such as s_dcache_inv.
1009  *
1010  */
1011 struct SMEM_instruction : public Instruction {
1012    memory_sync_info sync;
1013    bool glc : 1; /* VI+: globally coherent */
1014    bool dlc : 1; /* NAVI: device level coherent */
1015    bool nv : 1; /* VEGA only: Non-volatile */
1016    bool disable_wqm : 1;
1017    bool prevent_overflow : 1; /* avoid overflow when combining additions */
1018    uint32_t padding: 3;
1019 };
1020 static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1021 
1022 struct VOP1_instruction : public Instruction {
1023 };
1024 static_assert(sizeof(VOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1025 
1026 struct VOP2_instruction : public Instruction {
1027 };
1028 static_assert(sizeof(VOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1029 
1030 struct VOPC_instruction : public Instruction {
1031 };
1032 static_assert(sizeof(VOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1033 
1034 struct VOP3A_instruction : public Instruction {
1035    bool abs[3];
1036    bool neg[3];
1037    uint8_t opsel : 4;
1038    uint8_t omod : 2;
1039    bool clamp : 1;
1040    uint32_t padding : 9;
1041 };
1042 static_assert(sizeof(VOP3A_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1043 
1044 struct VOP3P_instruction : public Instruction {
1045    bool neg_lo[3];
1046    bool neg_hi[3];
1047    uint8_t opsel_lo : 3;
1048    uint8_t opsel_hi : 3;
1049    bool clamp : 1;
1050    uint32_t padding : 9;
1051 };
1052 static_assert(sizeof(VOP3P_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1053 
1054 /**
1055  * Data Parallel Primitives Format:
1056  * This format can be used for VOP1, VOP2 or VOPC instructions.
1057  * The swizzle applies to the src0 operand.
1058  *
1059  */
1060 struct DPP_instruction : public Instruction {
1061    bool abs[2];
1062    bool neg[2];
1063    uint16_t dpp_ctrl;
1064    uint8_t row_mask : 4;
1065    uint8_t bank_mask : 4;
1066    bool bound_ctrl : 1;
1067    uint32_t padding : 7;
1068 };
1069 static_assert(sizeof(DPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1070 
1071 enum sdwa_sel : uint8_t {
1072     /* masks */
1073     sdwa_wordnum = 0x1,
1074     sdwa_bytenum = 0x3,
1075     sdwa_asuint = 0x7 | 0x10,
1076     sdwa_rasize = 0x3,
1077 
1078     /* flags */
1079     sdwa_isword = 0x4,
1080     sdwa_sext = 0x8,
1081     sdwa_isra = 0x10,
1082 
1083     /* specific values */
1084     sdwa_ubyte0 = 0,
1085     sdwa_ubyte1 = 1,
1086     sdwa_ubyte2 = 2,
1087     sdwa_ubyte3 = 3,
1088     sdwa_uword0 = sdwa_isword | 0,
1089     sdwa_uword1 = sdwa_isword | 1,
1090     sdwa_udword = 6,
1091 
1092     sdwa_sbyte0 = sdwa_ubyte0 | sdwa_sext,
1093     sdwa_sbyte1 = sdwa_ubyte1 | sdwa_sext,
1094     sdwa_sbyte2 = sdwa_ubyte2 | sdwa_sext,
1095     sdwa_sbyte3 = sdwa_ubyte3 | sdwa_sext,
1096     sdwa_sword0 = sdwa_uword0 | sdwa_sext,
1097     sdwa_sword1 = sdwa_uword1 | sdwa_sext,
1098     sdwa_sdword = sdwa_udword | sdwa_sext,
1099 
1100     /* register-allocated */
1101     sdwa_ubyte = 1 | sdwa_isra,
1102     sdwa_uword = 2 | sdwa_isra,
1103     sdwa_sbyte = sdwa_ubyte | sdwa_sext,
1104     sdwa_sword = sdwa_uword | sdwa_sext,
1105 };
1106 
1107 /**
1108  * Sub-Dword Addressing Format:
1109  * This format can be used for VOP1, VOP2 or VOPC instructions.
1110  *
1111  * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
1112  * the definition doesn't have to be VCC on GFX9+.
1113  *
1114  */
1115 struct SDWA_instruction : public Instruction {
1116    /* these destination modifiers aren't available with VOPC except for
1117     * clamp on GFX8 */
1118    uint8_t sel[2];
1119    uint8_t dst_sel;
1120    bool neg[2];
1121    bool abs[2];
1122    bool dst_preserve : 1;
1123    bool clamp : 1;
1124    uint8_t omod : 2; /* GFX9+ */
1125    uint32_t padding : 4;
1126 };
1127 static_assert(sizeof(SDWA_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1128 
1129 struct Interp_instruction : public Instruction {
1130    uint8_t attribute;
1131    uint8_t component;
1132    uint16_t padding;
1133 };
1134 static_assert(sizeof(Interp_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1135 
1136 /**
1137  * Local and Global Data Sharing instructions
1138  * Operand(0): ADDR - VGPR which supplies the address.
1139  * Operand(1): DATA0 - First data VGPR.
1140  * Operand(2): DATA1 - Second data VGPR.
1141  * Operand(n-1): M0 - LDS size.
1142  * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
1143  *
1144  */
1145 struct DS_instruction : public Instruction {
1146    memory_sync_info sync;
1147    bool gds;
1148    int16_t offset0;
1149    int8_t offset1;
1150    uint8_t padding;
1151 };
1152 static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1153 
1154 /**
1155  * Vector Memory Untyped-buffer Instructions
1156  * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1157  * Operand(1): VADDR - Address source. Can carry an index and/or offset
1158  * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1159  * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1160  *
1161  */
1162 struct MUBUF_instruction : public Instruction {
1163    memory_sync_info sync;
1164    bool offen : 1; /* Supply an offset from VGPR (VADDR) */
1165    bool idxen : 1; /* Supply an index from VGPR (VADDR) */
1166    bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
1167    bool glc : 1; /* globally coherent */
1168    bool dlc : 1; /* NAVI: device level coherent */
1169    bool slc : 1; /* system level coherent */
1170    bool tfe : 1; /* texture fail enable */
1171    bool lds : 1; /* Return read-data to LDS instead of VGPRs */
1172    bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1173    uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
1174    bool swizzled : 1;
1175    uint32_t padding1 : 18;
1176 };
1177 static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1178 
1179 /**
1180  * Vector Memory Typed-buffer Instructions
1181  * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1182  * Operand(1): VADDR - Address source. Can carry an index and/or offset
1183  * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1184  * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1185  *
1186  */
1187 struct MTBUF_instruction : public Instruction {
1188    memory_sync_info sync;
1189    uint8_t dfmt : 4; /* Data Format of data in memory buffer */
1190    uint8_t nfmt : 3; /* Numeric format of data in memory */
1191    bool offen : 1; /* Supply an offset from VGPR (VADDR) */
1192    bool idxen : 1; /* Supply an index from VGPR (VADDR) */
1193    bool glc : 1; /* globally coherent */
1194    bool dlc : 1; /* NAVI: device level coherent */
1195    bool slc : 1; /* system level coherent */
1196    bool tfe : 1; /* texture fail enable */
1197    bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1198    uint32_t padding : 10;
1199    uint16_t offset; /* Unsigned byte offset - 12 bit */
1200 };
1201 static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1202 
1203 /**
1204  * Vector Memory Image Instructions
1205  * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
1206  * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
1207  *             or VDATA - Vector GPR for write data.
1208  * Operand(2): VADDR - Address source. Can carry an offset or an index.
1209  * Definition(0): VDATA - Vector GPR for read result.
1210  *
1211  */
1212 struct MIMG_instruction : public Instruction {
1213    memory_sync_info sync;
1214    uint8_t dmask; /* Data VGPR enable mask */
1215    uint8_t dim : 3; /* NAVI: dimensionality */
1216    bool unrm : 1; /* Force address to be un-normalized */
1217    bool dlc : 1; /* NAVI: device level coherent */
1218    bool glc : 1; /* globally coherent */
1219    bool slc : 1; /* system level coherent */
1220    bool tfe : 1; /* texture fail enable */
1221    bool da : 1; /* declare an array */
1222    bool lwe : 1; /* Force data to be un-normalized */
1223    bool r128 : 1; /* NAVI: Texture resource size */
1224    bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
1225    bool d16 : 1; /* Convert 32-bit data to 16-bit data */
1226    bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1227    uint32_t padding : 18;
1228 };
1229 static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1230 
1231 /**
1232  * Flat/Scratch/Global Instructions
1233  * Operand(0): ADDR
1234  * Operand(1): SADDR
1235  * Operand(2) / Definition(0): DATA/VDST
1236  *
1237  */
1238 struct FLAT_instruction : public Instruction {
1239    memory_sync_info sync;
1240    bool slc : 1; /* system level coherent */
1241    bool glc : 1; /* globally coherent */
1242    bool dlc : 1; /* NAVI: device level coherent */
1243    bool lds : 1;
1244    bool nv : 1;
1245    bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1246    uint32_t padding0 : 2;
1247    uint16_t offset; /* Vega/Navi only */
1248    uint16_t padding1;
1249 };
1250 static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1251 
1252 struct Export_instruction : public Instruction {
1253    uint8_t enabled_mask;
1254    uint8_t dest;
1255    bool compressed : 1;
1256    bool done : 1;
1257    bool valid_mask : 1;
1258    uint32_t padding : 13;
1259 };
1260 static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1261 
1262 struct Pseudo_instruction : public Instruction {
1263    PhysReg scratch_sgpr; /* might not be valid if it's not needed */
1264    bool tmp_in_scc;
1265    uint8_t padding;
1266 };
1267 static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1268 
1269 struct Pseudo_branch_instruction : public Instruction {
1270    /* target[0] is the block index of the branch target.
1271     * For conditional branches, target[1] contains the fall-through alternative.
1272     * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
1273     */
1274    uint32_t target[2];
1275 };
1276 static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1277 
1278 struct Pseudo_barrier_instruction : public Instruction {
1279    memory_sync_info sync;
1280    sync_scope exec_scope;
1281 };
1282 static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1283 
1284 enum ReduceOp : uint16_t {
1285    iadd8, iadd16, iadd32, iadd64,
1286    imul8, imul16, imul32, imul64,
1287           fadd16, fadd32, fadd64,
1288           fmul16, fmul32, fmul64,
1289    imin8, imin16, imin32, imin64,
1290    imax8, imax16, imax32, imax64,
1291    umin8, umin16, umin32, umin64,
1292    umax8, umax16, umax32, umax64,
1293           fmin16, fmin32, fmin64,
1294           fmax16, fmax32, fmax64,
1295    iand8, iand16, iand32, iand64,
1296    ior8, ior16, ior32, ior64,
1297    ixor8, ixor16, ixor32, ixor64,
1298 };
1299 
1300 /**
1301  * Subgroup Reduction Instructions, everything except for the data to be
1302  * reduced and the result as inserted by setup_reduce_temp().
1303  * Operand(0): data to be reduced
1304  * Operand(1): reduce temporary
1305  * Operand(2): vector temporary
1306  * Definition(0): result
1307  * Definition(1): scalar temporary
1308  * Definition(2): scalar identity temporary (not used to store identity on GFX10)
1309  * Definition(3): scc clobber
1310  * Definition(4): vcc clobber
1311  *
1312  */
1313 struct Pseudo_reduction_instruction : public Instruction {
1314    ReduceOp reduce_op;
1315    uint16_t cluster_size; // must be 0 for scans
1316 };
1317 static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1318 
1319 struct instr_deleter_functor {
operatorinstr_deleter_functor1320    void operator()(void* p) {
1321       free(p);
1322    }
1323 };
1324 
1325 template<typename T>
1326 using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
1327 
1328 template<typename T>
create_instruction(aco_opcode opcode,Format format,uint32_t num_operands,uint32_t num_definitions)1329 T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
1330 {
1331    std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
1332    char *data = (char*) calloc(1, size);
1333    T* inst = (T*) data;
1334 
1335    inst->opcode = opcode;
1336    inst->format = format;
1337 
1338    uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
1339    inst->operands = aco::span<Operand>(operands_offset, num_operands);
1340    uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
1341    inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);
1342 
1343    return inst;
1344 }
1345 
usesModifiers()1346 constexpr bool Instruction::usesModifiers() const noexcept
1347 {
1348    if (isDPP() || isSDWA())
1349       return true;
1350 
1351    if (format == Format::VOP3P) {
1352       const VOP3P_instruction *vop3p = static_cast<const VOP3P_instruction*>(this);
1353       for (unsigned i = 0; i < operands.size(); i++) {
1354          if (vop3p->neg_lo[i] || vop3p->neg_hi[i])
1355             return true;
1356       }
1357       return vop3p->opsel_lo || vop3p->opsel_hi || vop3p->clamp;
1358    } else if (isVOP3()) {
1359       const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
1360       for (unsigned i = 0; i < operands.size(); i++) {
1361          if (vop3->abs[i] || vop3->neg[i])
1362             return true;
1363       }
1364       return vop3->opsel || vop3->clamp || vop3->omod;
1365    }
1366    return false;
1367 }
1368 
is_phi(Instruction * instr)1369 constexpr bool is_phi(Instruction* instr)
1370 {
1371    return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
1372 }
1373 
is_phi(aco_ptr<Instruction> & instr)1374 static inline bool is_phi(aco_ptr<Instruction>& instr)
1375 {
1376    return is_phi(instr.get());
1377 }
1378 
1379 memory_sync_info get_sync_info(const Instruction* instr);
1380 
1381 bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);
1382 
1383 bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high);
1384 bool can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr);
1385 /* updates "instr" and returns the old instruction (or NULL if no update was needed) */
1386 aco_ptr<Instruction> convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& instr);
1387 
1388 uint32_t get_reduction_identity(ReduceOp op, unsigned idx);
1389 
1390 enum block_kind {
1391    /* uniform indicates that leaving this block,
1392     * all actives lanes stay active */
1393    block_kind_uniform = 1 << 0,
1394    block_kind_top_level = 1 << 1,
1395    block_kind_loop_preheader = 1 << 2,
1396    block_kind_loop_header = 1 << 3,
1397    block_kind_loop_exit = 1 << 4,
1398    block_kind_continue = 1 << 5,
1399    block_kind_break = 1 << 6,
1400    block_kind_continue_or_break = 1 << 7,
1401    block_kind_discard = 1 << 8,
1402    block_kind_branch = 1 << 9,
1403    block_kind_merge = 1 << 10,
1404    block_kind_invert = 1 << 11,
1405    block_kind_uses_discard_if = 1 << 12,
1406    block_kind_needs_lowering = 1 << 13,
1407    block_kind_uses_demote = 1 << 14,
1408    block_kind_export_end = 1 << 15,
1409 };
1410 
1411 
1412 struct RegisterDemand {
1413    constexpr RegisterDemand() = default;
RegisterDemandRegisterDemand1414    constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1415       : vgpr{v}, sgpr{s} {}
1416    int16_t vgpr = 0;
1417    int16_t sgpr = 0;
1418 
1419    constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1420       return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1421    }
1422 
exceedsRegisterDemand1423    constexpr bool exceeds(const RegisterDemand other) const noexcept {
1424       return vgpr > other.vgpr || sgpr > other.sgpr;
1425    }
1426 
1427    constexpr RegisterDemand operator+(const Temp t) const noexcept {
1428       if (t.type() == RegType::sgpr)
1429          return RegisterDemand( vgpr, sgpr + t.size() );
1430       else
1431          return RegisterDemand( vgpr + t.size(), sgpr );
1432    }
1433 
1434    constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1435       return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1436    }
1437 
1438    constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1439       return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1440    }
1441 
1442    constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1443       vgpr += other.vgpr;
1444       sgpr += other.sgpr;
1445       return *this;
1446    }
1447 
1448    constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1449       vgpr -= other.vgpr;
1450       sgpr -= other.sgpr;
1451       return *this;
1452    }
1453 
1454    constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1455       if (t.type() == RegType::sgpr)
1456          sgpr += t.size();
1457       else
1458          vgpr += t.size();
1459       return *this;
1460    }
1461 
1462    constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1463       if (t.type() == RegType::sgpr)
1464          sgpr -= t.size();
1465       else
1466          vgpr -= t.size();
1467       return *this;
1468    }
1469 
updateRegisterDemand1470    constexpr void update(const RegisterDemand other) noexcept {
1471       vgpr = std::max(vgpr, other.vgpr);
1472       sgpr = std::max(sgpr, other.sgpr);
1473    }
1474 
1475 };
1476 
1477 /* CFG */
1478 struct Block {
1479    float_mode fp_mode;
1480    unsigned index;
1481    unsigned offset = 0;
1482    std::vector<aco_ptr<Instruction>> instructions;
1483    std::vector<unsigned> logical_preds;
1484    std::vector<unsigned> linear_preds;
1485    std::vector<unsigned> logical_succs;
1486    std::vector<unsigned> linear_succs;
1487    RegisterDemand register_demand = RegisterDemand();
1488    uint16_t loop_nest_depth = 0;
1489    uint16_t kind = 0;
1490    int logical_idom = -1;
1491    int linear_idom = -1;
1492    Temp live_out_exec = Temp();
1493 
1494    /* this information is needed for predecessors to blocks with phis when
1495     * moving out of ssa */
1496    bool scc_live_out = false;
1497    PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */
1498 
BlockBlock1499    Block(unsigned idx) : index(idx) {}
BlockBlock1500    Block() : index(0) {}
1501 };
1502 
1503 /*
1504  * Shader stages as provided in Vulkan by the application. Contrast this to HWStage.
1505  */
1506 enum class SWStage : uint8_t {
1507     None = 0,
1508     VS = 1 << 0,     /* Vertex Shader */
1509     GS = 1 << 1,     /* Geometry Shader */
1510     TCS = 1 << 2,    /* Tessellation Control aka Hull Shader */
1511     TES = 1 << 3,    /* Tessellation Evaluation aka Domain Shader */
1512     FS = 1 << 4,     /* Fragment aka Pixel Shader */
1513     CS = 1 << 5,     /* Compute Shader */
1514     GSCopy = 1 << 6, /* GS Copy Shader (internal) */
1515 
1516     /* Stage combinations merged to run on a single HWStage */
1517     VS_GS = VS | GS,
1518     VS_TCS = VS | TCS,
1519     TES_GS = TES | GS,
1520 };
1521 
1522 constexpr SWStage operator|(SWStage a, SWStage b) {
1523     return static_cast<SWStage>(static_cast<uint8_t>(a) | static_cast<uint8_t>(b));
1524 }
1525 
1526 /*
1527  * Shader stages as running on the AMD GPU.
1528  *
1529  * The relation between HWStages and SWStages is not a one-to-one mapping:
1530  * Some SWStages are merged by ACO to run on a single HWStage.
1531  * See README.md for details.
1532  */
1533 enum class HWStage : uint8_t {
1534     VS,
1535     ES,  /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
1536     GS,  /* Geometry shader on GFX10/legacy and GFX6-9. */
1537     NGG, /* Primitive shader, used to implement VS, TES, GS. */
1538     LS,  /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
1539     HS,  /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
1540     FS,
1541     CS,
1542 };
1543 
1544 /*
1545  * Set of SWStages to be merged into a single shader paired with the
1546  * HWStage it will run on.
1547  */
1548 struct Stage {
1549     constexpr Stage() = default;
1550 
StageStage1551     explicit constexpr Stage(HWStage hw_, SWStage sw_) : sw(sw_), hw(hw_) { }
1552 
1553     /* Check if the given SWStage is included */
hasStage1554     constexpr bool has(SWStage stage) const {
1555         return (static_cast<uint8_t>(sw) & static_cast<uint8_t>(stage));
1556     }
1557 
num_sw_stagesStage1558     unsigned num_sw_stages() const {
1559         return util_bitcount(static_cast<uint8_t>(sw));
1560     }
1561 
1562     constexpr bool operator==(const Stage& other) const {
1563         return sw == other.sw && hw == other.hw;
1564     }
1565 
1566     constexpr bool operator!=(const Stage& other) const {
1567         return sw != other.sw || hw != other.hw;
1568     }
1569 
1570     /* Mask of merged software stages */
1571     SWStage sw = SWStage::None;
1572 
1573     /* Active hardware stage */
1574     HWStage hw {};
1575 };
1576 
1577 /* possible settings of Program::stage */
1578 static constexpr Stage vertex_vs(HWStage::VS, SWStage::VS);
1579 static constexpr Stage fragment_fs(HWStage::FS, SWStage::FS);
1580 static constexpr Stage compute_cs(HWStage::CS, SWStage::CS);
1581 static constexpr Stage tess_eval_vs(HWStage::VS, SWStage::TES);
1582 static constexpr Stage gs_copy_vs(HWStage::VS, SWStage::GSCopy);
1583 /* GFX10/NGG */
1584 static constexpr Stage vertex_ngg(HWStage::NGG, SWStage::VS);
1585 static constexpr Stage vertex_geometry_ngg(HWStage::NGG, SWStage::VS_GS);
1586 static constexpr Stage tess_eval_ngg(HWStage::NGG, SWStage::TES);
1587 static constexpr Stage tess_eval_geometry_ngg(HWStage::NGG, SWStage::TES_GS);
1588 /* GFX9 (and GFX10 if NGG isn't used) */
1589 static constexpr Stage vertex_geometry_gs(HWStage::GS, SWStage::VS_GS);
1590 static constexpr Stage vertex_tess_control_hs(HWStage::HS, SWStage::VS_TCS);
1591 static constexpr Stage tess_eval_geometry_gs(HWStage::GS, SWStage::TES_GS);
1592 /* pre-GFX9 */
1593 static constexpr Stage vertex_ls(HWStage::LS, SWStage::VS); /* vertex before tesselation control */
1594 static constexpr Stage vertex_es(HWStage::ES, SWStage::VS); /* vertex before geometry */
1595 static constexpr Stage tess_control_hs(HWStage::HS, SWStage::TCS);
1596 static constexpr Stage tess_eval_es(HWStage::ES, SWStage::TES); /* tesselation evaluation before geometry */
1597 static constexpr Stage geometry_gs(HWStage::GS, SWStage::GS);
1598 
1599 enum statistic {
1600    statistic_hash,
1601    statistic_instructions,
1602    statistic_copies,
1603    statistic_branches,
1604    statistic_cycles,
1605    statistic_vmem_clauses,
1606    statistic_smem_clauses,
1607    statistic_vmem_score,
1608    statistic_smem_score,
1609    statistic_sgpr_presched,
1610    statistic_vgpr_presched,
1611    num_statistics
1612 };
1613 
1614 class Program final {
1615 public:
1616    float_mode next_fp_mode;
1617    std::vector<Block> blocks;
1618    std::vector<RegClass> temp_rc = {s1};
1619    RegisterDemand max_reg_demand = RegisterDemand();
1620    uint16_t num_waves = 0;
1621    uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
1622    ac_shader_config* config;
1623    struct radv_shader_info *info;
1624    enum chip_class chip_class;
1625    enum radeon_family family;
1626    unsigned wave_size;
1627    RegClass lane_mask;
1628    Stage stage;
1629    bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
1630    bool needs_wqm = false; /* there exists a p_wqm instruction */
1631    bool wb_smem_l1_on_end = false;
1632 
1633    std::vector<uint8_t> constant_data;
1634    Temp private_segment_buffer;
1635    Temp scratch_offset;
1636 
1637    uint16_t min_waves = 0;
1638    uint16_t lds_alloc_granule;
1639    uint32_t lds_limit; /* in bytes */
1640    bool has_16bank_lds;
1641    uint16_t vgpr_limit;
1642    uint16_t sgpr_limit;
1643    uint16_t physical_sgprs;
1644    uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
1645    uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
1646    unsigned workgroup_size; /* if known; otherwise UINT_MAX */
1647 
1648    bool xnack_enabled = false;
1649    bool sram_ecc_enabled = false;
1650    bool has_fast_fma32 = false;
1651 
1652    bool needs_vcc = false;
1653    bool needs_flat_scr = false;
1654 
1655    bool collect_statistics = false;
1656    uint32_t statistics[num_statistics];
1657 
1658    struct {
1659       void (*func)(void *private_data,
1660                    enum radv_compiler_debug_level level,
1661                    const char *message);
1662       void *private_data;
1663    } debug;
1664 
allocateId(RegClass rc)1665    uint32_t allocateId(RegClass rc)
1666    {
1667       assert(allocationID <= 16777215);
1668       temp_rc.push_back(rc);
1669       return allocationID++;
1670    }
1671 
allocateRange(unsigned amount)1672    void allocateRange(unsigned amount)
1673    {
1674       assert(allocationID + amount <= 16777216);
1675       temp_rc.resize(temp_rc.size() + amount);
1676       allocationID += amount;
1677    }
1678 
allocateTmp(RegClass rc)1679    Temp allocateTmp(RegClass rc)
1680    {
1681       return Temp(allocateId(rc), rc);
1682    }
1683 
peekAllocationId()1684    uint32_t peekAllocationId()
1685    {
1686       return allocationID;
1687    }
1688 
create_and_insert_block()1689    Block* create_and_insert_block() {
1690       blocks.emplace_back(blocks.size());
1691       blocks.back().fp_mode = next_fp_mode;
1692       return &blocks.back();
1693    }
1694 
insert_block(Block && block)1695    Block* insert_block(Block&& block) {
1696       block.index = blocks.size();
1697       block.fp_mode = next_fp_mode;
1698       blocks.emplace_back(std::move(block));
1699       return &blocks.back();
1700    }
1701 
1702 private:
1703    uint32_t allocationID = 1;
1704 };
1705 
1706 struct live {
1707    /* live temps out per block */
1708    std::vector<IDSet> live_out;
1709    /* register demand (sgpr/vgpr) per instruction per block */
1710    std::vector<std::vector<RegisterDemand>> register_demand;
1711 };
1712 
1713 void init();
1714 
1715 void init_program(Program *program, Stage stage, struct radv_shader_info *info,
1716                   enum chip_class chip_class, enum radeon_family family,
1717                   ac_shader_config *config);
1718 
1719 void select_program(Program *program,
1720                     unsigned shader_count,
1721                     struct nir_shader *const *shaders,
1722                     ac_shader_config* config,
1723                     struct radv_shader_args *args);
1724 void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
1725                            ac_shader_config* config,
1726                            struct radv_shader_args *args);
1727 void select_trap_handler_shader(Program *program, struct nir_shader *shader,
1728                                 ac_shader_config* config,
1729                                 struct radv_shader_args *args);
1730 
1731 void lower_phis(Program* program);
1732 void calc_min_waves(Program* program);
1733 void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
1734 live live_var_analysis(Program* program);
1735 std::vector<uint16_t> dead_code_analysis(Program *program);
1736 void dominator_tree(Program* program);
1737 void insert_exec_mask(Program *program);
1738 void value_numbering(Program* program);
1739 void optimize(Program* program);
1740 void setup_reduce_temp(Program* program);
1741 void lower_to_cssa(Program* program, live& live_vars);
1742 void register_allocation(Program *program, std::vector<IDSet>& live_out_per_block);
1743 void ssa_elimination(Program* program);
1744 void lower_to_hw_instr(Program* program);
1745 void schedule_program(Program* program, live& live_vars);
1746 void spill(Program* program, live& live_vars);
1747 void insert_wait_states(Program* program);
1748 void insert_NOPs(Program* program);
1749 void form_hard_clauses(Program *program);
1750 unsigned emit_program(Program* program, std::vector<uint32_t>& code);
1751 bool print_asm(Program *program, std::vector<uint32_t>& binary,
1752                unsigned exec_size, FILE *output);
1753 bool validate_ir(Program* program);
1754 bool validate_ra(Program* program);
1755 #ifndef NDEBUG
1756 void perfwarn(Program *program, bool cond, const char *msg, Instruction *instr=NULL);
1757 #else
1758 #define perfwarn(program, cond, msg, ...) do {} while(0)
1759 #endif
1760 
1761 void collect_presched_stats(Program *program);
1762 void collect_preasm_stats(Program *program);
1763 void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code);
1764 
1765 void aco_print_instr(const Instruction *instr, FILE *output);
1766 void aco_print_program(const Program *program, FILE *output);
1767 
1768 void _aco_perfwarn(Program *program, const char *file, unsigned line,
1769                    const char *fmt, ...);
1770 void _aco_err(Program *program, const char *file, unsigned line,
1771               const char *fmt, ...);
1772 
1773 #define aco_perfwarn(program, ...) _aco_perfwarn(program, __FILE__, __LINE__, __VA_ARGS__)
1774 #define aco_err(program, ...) _aco_err(program, __FILE__, __LINE__, __VA_ARGS__)
1775 
1776 /* utilities for dealing with register demand */
1777 RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
1778 RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
1779 RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, aco_ptr<Instruction>& instr_before);
1780 
1781 /* number of sgprs that need to be allocated but might notbe addressable as s0-s105 */
1782 uint16_t get_extra_sgprs(Program *program);
1783 
1784 /* get number of sgprs/vgprs allocated required to address a number of sgprs/vgprs */
1785 uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
1786 uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs);
1787 
1788 /* return number of addressable sgprs/vgprs for max_waves */
1789 uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
1790 uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves);
1791 
1792 typedef struct {
1793    const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
1794    const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
1795    const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
1796    const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
1797    const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
1798    const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
1799    const char *name[static_cast<int>(aco_opcode::num_opcodes)];
1800    const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
1801    /* sizes used for input/output modifiers and constants */
1802    const unsigned operand_size[static_cast<int>(aco_opcode::num_opcodes)];
1803    const unsigned definition_size[static_cast<int>(aco_opcode::num_opcodes)];
1804 } Info;
1805 
1806 extern const Info instr_info;
1807 
1808 }
1809 
1810 #endif /* ACO_IR_H */
1811 
1812