1 /*
2  * Copyright 2014 Advanced Micro Devices, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the
6  * "Software"), to deal in the Software without restriction, including
7  * without limitation the rights to use, copy, modify, merge, publish,
8  * distribute, sub license, and/or sell copies of the Software, and to
9  * permit persons to whom the Software is furnished to do so, subject to
10  * the following conditions:
11  *
12  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
13  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
14  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
15  * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
16  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
17  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
18  * USE OR OTHER DEALINGS IN THE SOFTWARE.
19  *
20  * The above copyright notice and this permission notice (including the
21  * next paragraph) shall be included in all copies or substantial portions
22  * of the Software.
23  *
24  */
25 /* based on pieces from si_pipe.c and radeon_llvm_emit.c */
26 #include "ac_llvm_build.h"
27 
28 #include "ac_exp_param.h"
29 #include "ac_llvm_util.h"
30 #include "ac_shader_util.h"
31 #include "c11/threads.h"
32 #include "shader_enums.h"
33 #include "sid.h"
34 #include "util/bitscan.h"
35 #include "util/macros.h"
36 #include "util/u_atomic.h"
37 #include "util/u_math.h"
38 #include <llvm-c/Core.h>
39 #include <llvm/Config/llvm-config.h>
40 
41 #include <assert.h>
42 #include <stdio.h>
43 
44 #define AC_LLVM_INITIAL_CF_DEPTH 4
45 
46 /* Data for if/else/endif and bgnloop/endloop control flow structures.
47  */
48 struct ac_llvm_flow {
49    /* Loop exit or next part of if/else/endif. */
50    LLVMBasicBlockRef next_block;
51    LLVMBasicBlockRef loop_entry_block;
52 };
53 
54 /* Initialize the context.
55  *
56  * The LLVM context, module and builder are created here from the given
57  * compiler, wave size and float mode. */
58 void ac_llvm_context_init(struct ac_llvm_context *ctx, struct ac_llvm_compiler *compiler,
59                           enum chip_class chip_class, enum radeon_family family,
60                           enum ac_float_mode float_mode, unsigned wave_size,
61                           unsigned ballot_mask_bits)
62 {
63    ctx->context = LLVMContextCreate();
64 
65    ctx->chip_class = chip_class;
66    ctx->family = family;
67    ctx->wave_size = wave_size;
68    ctx->ballot_mask_bits = ballot_mask_bits;
69    ctx->float_mode = float_mode;
70    ctx->module =
71       ac_create_module(wave_size == 32 ? compiler->tm_wave32 : compiler->tm, ctx->context);
72    ctx->builder = ac_create_builder(ctx->context, float_mode);
73 
74    ctx->voidt = LLVMVoidTypeInContext(ctx->context);
75    ctx->i1 = LLVMInt1TypeInContext(ctx->context);
76    ctx->i8 = LLVMInt8TypeInContext(ctx->context);
77    ctx->i16 = LLVMIntTypeInContext(ctx->context, 16);
78    ctx->i32 = LLVMIntTypeInContext(ctx->context, 32);
79    ctx->i64 = LLVMIntTypeInContext(ctx->context, 64);
80    ctx->i128 = LLVMIntTypeInContext(ctx->context, 128);
81    ctx->intptr = ctx->i32;
82    ctx->f16 = LLVMHalfTypeInContext(ctx->context);
83    ctx->f32 = LLVMFloatTypeInContext(ctx->context);
84    ctx->f64 = LLVMDoubleTypeInContext(ctx->context);
85    ctx->v2i16 = LLVMVectorType(ctx->i16, 2);
86    ctx->v4i16 = LLVMVectorType(ctx->i16, 4);
87    ctx->v2f16 = LLVMVectorType(ctx->f16, 2);
88    ctx->v4f16 = LLVMVectorType(ctx->f16, 4);
89    ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
90    ctx->v3i32 = LLVMVectorType(ctx->i32, 3);
91    ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
92    ctx->v2f32 = LLVMVectorType(ctx->f32, 2);
93    ctx->v3f32 = LLVMVectorType(ctx->f32, 3);
94    ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
95    ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
96    ctx->iN_wavemask = LLVMIntTypeInContext(ctx->context, ctx->wave_size);
97    ctx->iN_ballotmask = LLVMIntTypeInContext(ctx->context, ballot_mask_bits);
98 
99    ctx->i8_0 = LLVMConstInt(ctx->i8, 0, false);
100    ctx->i8_1 = LLVMConstInt(ctx->i8, 1, false);
101    ctx->i16_0 = LLVMConstInt(ctx->i16, 0, false);
102    ctx->i16_1 = LLVMConstInt(ctx->i16, 1, false);
103    ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false);
104    ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false);
105    ctx->i64_0 = LLVMConstInt(ctx->i64, 0, false);
106    ctx->i64_1 = LLVMConstInt(ctx->i64, 1, false);
107    ctx->i128_0 = LLVMConstInt(ctx->i128, 0, false);
108    ctx->i128_1 = LLVMConstInt(ctx->i128, 1, false);
109    ctx->f16_0 = LLVMConstReal(ctx->f16, 0.0);
110    ctx->f16_1 = LLVMConstReal(ctx->f16, 1.0);
111    ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0);
112    ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0);
113    ctx->f64_0 = LLVMConstReal(ctx->f64, 0.0);
114    ctx->f64_1 = LLVMConstReal(ctx->f64, 1.0);
115 
116    ctx->i1false = LLVMConstInt(ctx->i1, 0, false);
117    ctx->i1true = LLVMConstInt(ctx->i1, 1, false);
118 
119    ctx->range_md_kind = LLVMGetMDKindIDInContext(ctx->context, "range", 5);
120 
121    ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(ctx->context, "invariant.load", 14);
122 
123    ctx->uniform_md_kind = LLVMGetMDKindIDInContext(ctx->context, "amdgpu.uniform", 14);
124 
125    ctx->empty_md = LLVMMDNodeInContext(ctx->context, NULL, 0);
126    ctx->flow = calloc(1, sizeof(*ctx->flow));
127 }
128 
129 void ac_llvm_context_dispose(struct ac_llvm_context *ctx)
130 {
131    free(ctx->flow->stack);
132    free(ctx->flow);
133    ctx->flow = NULL;
134 }
135 
136 int ac_get_llvm_num_components(LLVMValueRef value)
137 {
138    LLVMTypeRef type = LLVMTypeOf(value);
139    unsigned num_components =
140       LLVMGetTypeKind(type) == LLVMVectorTypeKind ? LLVMGetVectorSize(type) : 1;
141    return num_components;
142 }
143 
144 LLVMValueRef ac_llvm_extract_elem(struct ac_llvm_context *ac, LLVMValueRef value, int index)
145 {
146    if (LLVMGetTypeKind(LLVMTypeOf(value)) != LLVMVectorTypeKind) {
147       assert(index == 0);
148       return value;
149    }
150 
151    return LLVMBuildExtractElement(ac->builder, value, LLVMConstInt(ac->i32, index, false), "");
152 }
153 
154 int ac_get_elem_bits(struct ac_llvm_context *ctx, LLVMTypeRef type)
155 {
156    if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
157       type = LLVMGetElementType(type);
158 
159    if (LLVMGetTypeKind(type) == LLVMIntegerTypeKind)
160       return LLVMGetIntTypeWidth(type);
161 
162    if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) {
163       if (LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_LDS)
164          return 32;
165    }
166 
167    if (type == ctx->f16)
168       return 16;
169    if (type == ctx->f32)
170       return 32;
171    if (type == ctx->f64)
172       return 64;
173 
174    unreachable("Unhandled type kind in get_elem_bits");
175 }
176 
177 unsigned ac_get_type_size(LLVMTypeRef type)
178 {
179    LLVMTypeKind kind = LLVMGetTypeKind(type);
180 
181    switch (kind) {
182    case LLVMIntegerTypeKind:
183       return LLVMGetIntTypeWidth(type) / 8;
184    case LLVMHalfTypeKind:
185       return 2;
186    case LLVMFloatTypeKind:
187       return 4;
188    case LLVMDoubleTypeKind:
189       return 8;
190    case LLVMPointerTypeKind:
191       if (LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_CONST_32BIT)
192          return 4;
193       return 8;
194    case LLVMVectorTypeKind:
195       return LLVMGetVectorSize(type) * ac_get_type_size(LLVMGetElementType(type));
196    case LLVMArrayTypeKind:
197       return LLVMGetArrayLength(type) * ac_get_type_size(LLVMGetElementType(type));
198    default:
199       assert(0);
200       return 0;
201    }
202 }
203 
204 static LLVMTypeRef to_integer_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
205 {
206    if (t == ctx->i1)
207       return ctx->i1;
208    else if (t == ctx->i8)
209       return ctx->i8;
210    else if (t == ctx->f16 || t == ctx->i16)
211       return ctx->i16;
212    else if (t == ctx->f32 || t == ctx->i32)
213       return ctx->i32;
214    else if (t == ctx->f64 || t == ctx->i64)
215       return ctx->i64;
216    else
217       unreachable("Unhandled integer size");
218 }
219 
220 LLVMTypeRef ac_to_integer_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
221 {
222    if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {
223       LLVMTypeRef elem_type = LLVMGetElementType(t);
224       return LLVMVectorType(to_integer_type_scalar(ctx, elem_type), LLVMGetVectorSize(t));
225    }
226    if (LLVMGetTypeKind(t) == LLVMPointerTypeKind) {
227       switch (LLVMGetPointerAddressSpace(t)) {
228       case AC_ADDR_SPACE_GLOBAL:
229          return ctx->i64;
230       case AC_ADDR_SPACE_CONST_32BIT:
231       case AC_ADDR_SPACE_LDS:
232          return ctx->i32;
233       default:
234          unreachable("unhandled address space");
235       }
236    }
237    return to_integer_type_scalar(ctx, t);
238 }
239 
240 LLVMValueRef ac_to_integer(struct ac_llvm_context *ctx, LLVMValueRef v)
241 {
242    LLVMTypeRef type = LLVMTypeOf(v);
243    if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) {
244       return LLVMBuildPtrToInt(ctx->builder, v, ac_to_integer_type(ctx, type), "");
245    }
246    return LLVMBuildBitCast(ctx->builder, v, ac_to_integer_type(ctx, type), "");
247 }
248 
249 LLVMValueRef ac_to_integer_or_pointer(struct ac_llvm_context *ctx, LLVMValueRef v)
250 {
251    LLVMTypeRef type = LLVMTypeOf(v);
252    if (LLVMGetTypeKind(type) == LLVMPointerTypeKind)
253       return v;
254    return ac_to_integer(ctx, v);
255 }
256 
257 static LLVMTypeRef to_float_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
258 {
259    if (t == ctx->i8)
260       return ctx->i8;
261    else if (t == ctx->i16 || t == ctx->f16)
262       return ctx->f16;
263    else if (t == ctx->i32 || t == ctx->f32)
264       return ctx->f32;
265    else if (t == ctx->i64 || t == ctx->f64)
266       return ctx->f64;
267    else
268       unreachable("Unhandled float size");
269 }
270 
271 LLVMTypeRef ac_to_float_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
272 {
273    if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {
274       LLVMTypeRef elem_type = LLVMGetElementType(t);
275       return LLVMVectorType(to_float_type_scalar(ctx, elem_type), LLVMGetVectorSize(t));
276    }
277    return to_float_type_scalar(ctx, t);
278 }
279 
280 LLVMValueRef ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef v)
281 {
282    LLVMTypeRef type = LLVMTypeOf(v);
283    return LLVMBuildBitCast(ctx->builder, v, ac_to_float_type(ctx, type), "");
284 }
285 
286 LLVMValueRef ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name,
287                                 LLVMTypeRef return_type, LLVMValueRef *params, unsigned param_count,
288                                 unsigned attrib_mask)
289 {
290    LLVMValueRef function, call;
291    bool set_callsite_attrs = !(attrib_mask & AC_FUNC_ATTR_LEGACY);
292 
293    function = LLVMGetNamedFunction(ctx->module, name);
294    if (!function) {
295       LLVMTypeRef param_types[32], function_type;
296       unsigned i;
297 
298       assert(param_count <= 32);
299 
300       for (i = 0; i < param_count; ++i) {
301          assert(params[i]);
302          param_types[i] = LLVMTypeOf(params[i]);
303       }
304       function_type = LLVMFunctionType(return_type, param_types, param_count, 0);
305       function = LLVMAddFunction(ctx->module, name, function_type);
306 
307       LLVMSetFunctionCallConv(function, LLVMCCallConv);
308       LLVMSetLinkage(function, LLVMExternalLinkage);
309 
310       if (!set_callsite_attrs)
311          ac_add_func_attributes(ctx->context, function, attrib_mask);
312    }
313 
314    call = LLVMBuildCall(ctx->builder, function, params, param_count, "");
315    if (set_callsite_attrs)
316       ac_add_func_attributes(ctx->context, call, attrib_mask);
317    return call;
318 }
319 
320 /**
321  * Given a scalar or vector \p type (e.g. i32, vNi32, f32), generate the
322  * textual name (e.g. for use with intrinsic names).
323  */
324 void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize)
325 {
326    LLVMTypeRef elem_type = type;
327 
328    assert(bufsize >= 8);
329 
330    if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
331       int ret = snprintf(buf, bufsize, "v%u", LLVMGetVectorSize(type));
332       if (ret < 0) {
333          char *type_name = LLVMPrintTypeToString(type);
334          fprintf(stderr, "Error building type name for: %s\n", type_name);
335          LLVMDisposeMessage(type_name);
336          return;
337       }
338       elem_type = LLVMGetElementType(type);
339       buf += ret;
340       bufsize -= ret;
341    }
342    switch (LLVMGetTypeKind(elem_type)) {
343    default:
344       break;
345    case LLVMIntegerTypeKind:
346       snprintf(buf, bufsize, "i%d", LLVMGetIntTypeWidth(elem_type));
347       break;
348    case LLVMHalfTypeKind:
349       snprintf(buf, bufsize, "f16");
350       break;
351    case LLVMFloatTypeKind:
352       snprintf(buf, bufsize, "f32");
353       break;
354    case LLVMDoubleTypeKind:
355       snprintf(buf, bufsize, "f64");
356       break;
357    }
358 }
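
/* Illustrative examples of the mapping above (derived from the code, not an
 * exhaustive list): i32 -> "i32", f16 -> "f16", v2f16 -> "v2f16",
 * v4f32 -> "v4f32", v4i32 -> "v4i32". The resulting suffix is appended to
 * intrinsic base names such as "llvm.amdgcn.struct.buffer.store.".
 */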
359 
360 /**
361  * Helper function that builds an LLVM IR PHI node and immediately adds
362  * incoming edges.
363  */
364 LLVMValueRef ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type, unsigned count_incoming,
365                           LLVMValueRef *values, LLVMBasicBlockRef *blocks)
366 {
367    LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, "");
368    LLVMAddIncoming(phi, values, blocks, count_incoming);
369    return phi;
370 }
371 
372 void ac_build_s_barrier(struct ac_llvm_context *ctx)
373 {
374    ac_build_intrinsic(ctx, "llvm.amdgcn.s.barrier", ctx->voidt, NULL, 0, AC_FUNC_ATTR_CONVERGENT);
375 }
376 
377 /* Prevent optimizations (at least of memory accesses) across the current
378  * point in the program by emitting empty inline assembly that is marked as
379  * having side effects.
380  *
381  * Optionally, a value can be passed through the inline assembly to prevent
382  * LLVM from hoisting calls to ReadNone functions.
383  */
384 void ac_build_optimization_barrier(struct ac_llvm_context *ctx, LLVMValueRef *pvgpr)
385 {
386    static int counter = 0;
387 
388    LLVMBuilderRef builder = ctx->builder;
389    char code[16];
390 
391    snprintf(code, sizeof(code), "; %d", p_atomic_inc_return(&counter));
392 
393    if (!pvgpr) {
394       LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
395       LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
396       LLVMBuildCall(builder, inlineasm, NULL, 0, "");
397    } else {
398       LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
399       LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "=v,0", true, false);
400       LLVMTypeRef type = LLVMTypeOf(*pvgpr);
401       unsigned bitsize = ac_get_elem_bits(ctx, type);
402       LLVMValueRef vgpr = *pvgpr;
403       LLVMTypeRef vgpr_type;
404       unsigned vgpr_size;
405       LLVMValueRef vgpr0;
406 
407       if (bitsize < 32)
408          vgpr = LLVMBuildZExt(ctx->builder, vgpr, ctx->i32, "");
409 
410       vgpr_type = LLVMTypeOf(vgpr);
411       vgpr_size = ac_get_type_size(vgpr_type);
412 
413       assert(vgpr_size % 4 == 0);
414 
415       vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
416       vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
417       vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, "");
418       vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
419       vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");
420 
421       if (bitsize < 32)
422          vgpr = LLVMBuildTrunc(builder, vgpr, type, "");
423 
424       *pvgpr = vgpr;
425    }
426 }
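
/* Usage sketch (see ac_build_ballot below for a real in-tree use): to keep
 * LLVM from hoisting or merging a value computation across this point, pass
 * the value through the barrier:
 *
 *    LLVMValueRef v = ...;
 *    ac_build_optimization_barrier(ctx, &v);  // v is now tied to this point
 *    ... use v ...
 *
 * Passing NULL only emits the side-effecting empty inline asm without
 * threading a value through it.
 */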
427 
428 LLVMValueRef ac_build_shader_clock(struct ac_llvm_context *ctx, nir_scope scope)
429 {
430    const char *subgroup = LLVM_VERSION_MAJOR >= 9 ? "llvm.readcyclecounter" : "llvm.amdgcn.s.memtime";
431    const char *name = scope == NIR_SCOPE_DEVICE ? "llvm.amdgcn.s.memrealtime" : subgroup;
432 
433    LLVMValueRef tmp = ac_build_intrinsic(ctx, name, ctx->i64, NULL, 0, 0);
434    return LLVMBuildBitCast(ctx->builder, tmp, ctx->v2i32, "");
435 }
436 
437 LLVMValueRef ac_build_ballot(struct ac_llvm_context *ctx, LLVMValueRef value)
438 {
439    const char *name;
440 
441    if (LLVMTypeOf(value) == ctx->i1)
442       value = LLVMBuildZExt(ctx->builder, value, ctx->i32, "");
443 
444    if (LLVM_VERSION_MAJOR >= 9) {
445       if (ctx->wave_size == 64)
446          name = "llvm.amdgcn.icmp.i64.i32";
447       else
448          name = "llvm.amdgcn.icmp.i32.i32";
449    } else {
450       name = "llvm.amdgcn.icmp.i32";
451    }
452    LLVMValueRef args[3] = {value, ctx->i32_0, LLVMConstInt(ctx->i32, LLVMIntNE, 0)};
453 
454    /* We currently have no other way to prevent LLVM from lifting the icmp
455     * calls to a dominating basic block.
456     */
457    ac_build_optimization_barrier(ctx, &args[0]);
458 
459    args[0] = ac_to_integer(ctx, args[0]);
460 
461    return ac_build_intrinsic(
462       ctx, name, ctx->iN_wavemask, args, 3,
463       AC_FUNC_ATTR_NOUNWIND | AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
464 }
465 
466 LLVMValueRef ac_get_i1_sgpr_mask(struct ac_llvm_context *ctx, LLVMValueRef value)
467 {
468    const char *name;
469 
470    if (LLVM_VERSION_MAJOR >= 9) {
471       if (ctx->wave_size == 64)
472          name = "llvm.amdgcn.icmp.i64.i1";
473       else
474          name = "llvm.amdgcn.icmp.i32.i1";
475    } else {
476       name = "llvm.amdgcn.icmp.i1";
477    }
478    LLVMValueRef args[3] = {
479       value,
480       ctx->i1false,
481       LLVMConstInt(ctx->i32, LLVMIntNE, 0),
482    };
483 
484    return ac_build_intrinsic(
485       ctx, name, ctx->iN_wavemask, args, 3,
486       AC_FUNC_ATTR_NOUNWIND | AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
487 }
488 
489 LLVMValueRef ac_build_vote_all(struct ac_llvm_context *ctx, LLVMValueRef value)
490 {
491    LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1);
492    LLVMValueRef vote_set = ac_build_ballot(ctx, value);
493    return LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_set, active_set, "");
494 }
495 
496 LLVMValueRef ac_build_vote_any(struct ac_llvm_context *ctx, LLVMValueRef value)
497 {
498    LLVMValueRef vote_set = ac_build_ballot(ctx, value);
499    return LLVMBuildICmp(ctx->builder, LLVMIntNE, vote_set, LLVMConstInt(ctx->iN_wavemask, 0, 0),
500                         "");
501 }
502 
503 LLVMValueRef ac_build_vote_eq(struct ac_llvm_context *ctx, LLVMValueRef value)
504 {
505    LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1);
506    LLVMValueRef vote_set = ac_build_ballot(ctx, value);
507 
508    LLVMValueRef all = LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_set, active_set, "");
509    LLVMValueRef none =
510       LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_set, LLVMConstInt(ctx->iN_wavemask, 0, 0), "");
511    return LLVMBuildOr(ctx->builder, all, none, "");
512 }
513 
514 LLVMValueRef ac_build_varying_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
515                                             unsigned value_count, unsigned component)
516 {
517    LLVMValueRef vec = NULL;
518 
519    if (value_count == 1) {
520       return values[component];
521    } else if (!value_count)
522       unreachable("value_count is 0");
523 
524    for (unsigned i = component; i < value_count + component; i++) {
525       LLVMValueRef value = values[i];
526 
527       if (i == component)
528          vec = LLVMGetUndef(LLVMVectorType(LLVMTypeOf(value), value_count));
529       LLVMValueRef index = LLVMConstInt(ctx->i32, i - component, false);
530       vec = LLVMBuildInsertElement(ctx->builder, vec, value, index, "");
531    }
532    return vec;
533 }
534 
535 LLVMValueRef ac_build_gather_values_extended(struct ac_llvm_context *ctx, LLVMValueRef *values,
536                                              unsigned value_count, unsigned value_stride, bool load,
537                                              bool always_vector)
538 {
539    LLVMBuilderRef builder = ctx->builder;
540    LLVMValueRef vec = NULL;
541    unsigned i;
542 
543    if (value_count == 1 && !always_vector) {
544       if (load)
545          return LLVMBuildLoad(builder, values[0], "");
546       return values[0];
547    } else if (!value_count)
548       unreachable("value_count is 0");
549 
550    for (i = 0; i < value_count; i++) {
551       LLVMValueRef value = values[i * value_stride];
552       if (load)
553          value = LLVMBuildLoad(builder, value, "");
554 
555       if (!i)
556          vec = LLVMGetUndef(LLVMVectorType(LLVMTypeOf(value), value_count));
557       LLVMValueRef index = LLVMConstInt(ctx->i32, i, false);
558       vec = LLVMBuildInsertElement(builder, vec, value, index, "");
559    }
560    return vec;
561 }
562 
563 LLVMValueRef ac_build_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
564                                     unsigned value_count)
565 {
566    return ac_build_gather_values_extended(ctx, values, value_count, 1, false, false);
567 }
568 
569 /* Expand a scalar or vector to <dst_channels x type> by filling the remaining
570  * channels with undef. Extract at most src_channels components from the input.
571  */
572 static LLVMValueRef ac_build_expand(struct ac_llvm_context *ctx, LLVMValueRef value,
573                                     unsigned src_channels, unsigned dst_channels)
574 {
575    LLVMTypeRef elemtype;
576    LLVMValueRef *const chan = alloca(dst_channels * sizeof(LLVMValueRef));
577 
578    if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
579       unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));
580 
581       if (src_channels == dst_channels && vec_size == dst_channels)
582          return value;
583 
584       src_channels = MIN2(src_channels, vec_size);
585 
586       for (unsigned i = 0; i < src_channels; i++)
587          chan[i] = ac_llvm_extract_elem(ctx, value, i);
588 
589       elemtype = LLVMGetElementType(LLVMTypeOf(value));
590    } else {
591       if (src_channels) {
592          assert(src_channels == 1);
593          chan[0] = value;
594       }
595       elemtype = LLVMTypeOf(value);
596    }
597 
598    for (unsigned i = src_channels; i < dst_channels; i++)
599       chan[i] = LLVMGetUndef(elemtype);
600 
601    return ac_build_gather_values(ctx, chan, dst_channels);
602 }
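
/* Example of the expansion above: with value = <2 x float> {x, y},
 * src_channels = 2 and dst_channels = 4, the result is
 * <4 x float> {x, y, undef, undef}. A scalar input with src_channels = 1
 * becomes {x, undef, ..., undef}.
 */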
603 
604 /* Extract components [start, start + channels) from a vector.
605  */
606 LLVMValueRef ac_extract_components(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned start,
607                                    unsigned channels)
608 {
609    LLVMValueRef *const chan = alloca(channels * sizeof(LLVMValueRef));
610 
611    for (unsigned i = 0; i < channels; i++)
612       chan[i] = ac_llvm_extract_elem(ctx, value, i + start);
613 
614    return ac_build_gather_values(ctx, chan, channels);
615 }
616 
617 /* Expand a scalar or vector to <4 x type> by filling the remaining channels
618  * with undef. Extract at most num_channels components from the input.
619  */
620 LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx, LLVMValueRef value,
621                                      unsigned num_channels)
622 {
623    return ac_build_expand(ctx, value, num_channels, 4);
624 }
625 
626 LLVMValueRef ac_build_round(struct ac_llvm_context *ctx, LLVMValueRef value)
627 {
628    unsigned type_size = ac_get_type_size(LLVMTypeOf(value));
629    const char *name;
630 
631    if (type_size == 2)
632       name = "llvm.rint.f16";
633    else if (type_size == 4)
634       name = "llvm.rint.f32";
635    else
636       name = "llvm.rint.f64";
637 
638    return ac_build_intrinsic(ctx, name, LLVMTypeOf(value), &value, 1, AC_FUNC_ATTR_READNONE);
639 }
640 
641 LLVMValueRef ac_build_fdiv(struct ac_llvm_context *ctx, LLVMValueRef num, LLVMValueRef den)
642 {
643    unsigned type_size = ac_get_type_size(LLVMTypeOf(den));
644    const char *name;
645 
646    /* For doubles, we need precise division to pass GLCTS. */
647    if (ctx->float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL && type_size == 8)
648       return LLVMBuildFDiv(ctx->builder, num, den, "");
649 
650    if (type_size == 2)
651       name = "llvm.amdgcn.rcp.f16";
652    else if (type_size == 4)
653       name = "llvm.amdgcn.rcp.f32";
654    else
655       name = "llvm.amdgcn.rcp.f64";
656 
657    LLVMValueRef rcp =
658       ac_build_intrinsic(ctx, name, LLVMTypeOf(den), &den, 1, AC_FUNC_ATTR_READNONE);
659 
660    return LLVMBuildFMul(ctx->builder, num, rcp, "");
661 }
662 
663 /* See fast_idiv_by_const.h. */
664 /* Set: increment = util_fast_udiv_info::increment ? multiplier : 0; */
665 LLVMValueRef ac_build_fast_udiv(struct ac_llvm_context *ctx, LLVMValueRef num,
666                                 LLVMValueRef multiplier, LLVMValueRef pre_shift,
667                                 LLVMValueRef post_shift, LLVMValueRef increment)
668 {
669    LLVMBuilderRef builder = ctx->builder;
670 
671    num = LLVMBuildLShr(builder, num, pre_shift, "");
672    num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""),
673                       LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
674    num = LLVMBuildAdd(builder, num, LLVMBuildZExt(builder, increment, ctx->i64, ""), "");
675    num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
676    num = LLVMBuildTrunc(builder, num, ctx->i32, "");
677    return LLVMBuildLShr(builder, num, post_shift, "");
678 }
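
/* Rough usage sketch. The function and field names below are assumed to come
 * from util/fast_idiv_by_const.h (util_compute_fast_udiv_info and its
 * multiplier/pre_shift/post_shift/increment fields); adjust if they differ.
 * The last argument follows the rule in the comment above:
 * increment ? multiplier : 0.
 *
 *    struct util_fast_udiv_info info =
 *       util_compute_fast_udiv_info(divisor, 32, 32);
 *    LLVMValueRef q =
 *       ac_build_fast_udiv(ctx, num,
 *                          LLVMConstInt(ctx->i32, info.multiplier, 0),
 *                          LLVMConstInt(ctx->i32, info.pre_shift, 0),
 *                          LLVMConstInt(ctx->i32, info.post_shift, 0),
 *                          LLVMConstInt(ctx->i32,
 *                                       info.increment ? info.multiplier : 0, 0));
 */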
679 
680 /* See fast_idiv_by_const.h. */
681 /* If num != UINT_MAX, this more efficient version can be used. */
682 /* Set: increment = util_fast_udiv_info::increment; */
683 LLVMValueRef ac_build_fast_udiv_nuw(struct ac_llvm_context *ctx, LLVMValueRef num,
684                                     LLVMValueRef multiplier, LLVMValueRef pre_shift,
685                                     LLVMValueRef post_shift, LLVMValueRef increment)
686 {
687    LLVMBuilderRef builder = ctx->builder;
688 
689    num = LLVMBuildLShr(builder, num, pre_shift, "");
690    num = LLVMBuildNUWAdd(builder, num, increment, "");
691    num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""),
692                       LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
693    num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
694    num = LLVMBuildTrunc(builder, num, ctx->i32, "");
695    return LLVMBuildLShr(builder, num, post_shift, "");
696 }
697 
698 /* See fast_idiv_by_const.h. */
699 /* Both operands must fit in 31 bits and the divisor must not be 1. */
700 LLVMValueRef ac_build_fast_udiv_u31_d_not_one(struct ac_llvm_context *ctx, LLVMValueRef num,
701                                               LLVMValueRef multiplier, LLVMValueRef post_shift)
702 {
703    LLVMBuilderRef builder = ctx->builder;
704 
705    num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""),
706                       LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
707    num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
708    num = LLVMBuildTrunc(builder, num, ctx->i32, "");
709    return LLVMBuildLShr(builder, num, post_shift, "");
710 }
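
/* Worked example (illustrative): for division by 3, a valid magic constant is
 * multiplier = 0xAAAAAAAB with post_shift = 1, so the sequence above computes
 *
 *    ((uint64_t)num * 0xAAAAAAAB) >> 32 >> 1  ==  num / 3
 *
 * for any num that fits in 31 bits.
 */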
711 
712 /* Coordinates for cube map selection. sc, tc, and ma are as in Table 8.27
713  * of the OpenGL 4.5 (Compatibility Profile) specification, except ma is
714  * already multiplied by two. id is the cube face number.
715  */
716 struct cube_selection_coords {
717    LLVMValueRef stc[2];
718    LLVMValueRef ma;
719    LLVMValueRef id;
720 };
721 
722 static void build_cube_intrinsic(struct ac_llvm_context *ctx, LLVMValueRef in[3],
723                                  struct cube_selection_coords *out)
724 {
725    LLVMTypeRef f32 = ctx->f32;
726 
727    out->stc[1] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubetc", f32, in, 3, AC_FUNC_ATTR_READNONE);
728    out->stc[0] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubesc", f32, in, 3, AC_FUNC_ATTR_READNONE);
729    out->ma = ac_build_intrinsic(ctx, "llvm.amdgcn.cubema", f32, in, 3, AC_FUNC_ATTR_READNONE);
730    out->id = ac_build_intrinsic(ctx, "llvm.amdgcn.cubeid", f32, in, 3, AC_FUNC_ATTR_READNONE);
731 }
732 
733 /**
734  * Build a manual selection sequence for cube face sc/tc coordinates and
735  * major axis vector (multiplied by 2 for consistency) for the given
736  * vec3 \p coords, for the face implied by \p selcoords.
737  *
738  * For the major axis, we always adjust the sign to be in the direction of
739  * selcoords.ma; i.e., a positive out_ma means that coords is pointed towards
740  * the selcoords major axis.
741  */
742 static void build_cube_select(struct ac_llvm_context *ctx,
743                               const struct cube_selection_coords *selcoords,
744                               const LLVMValueRef *coords, LLVMValueRef *out_st,
745                               LLVMValueRef *out_ma)
746 {
747    LLVMBuilderRef builder = ctx->builder;
748    LLVMTypeRef f32 = LLVMTypeOf(coords[0]);
749    LLVMValueRef is_ma_positive;
750    LLVMValueRef sgn_ma;
751    LLVMValueRef is_ma_z, is_not_ma_z;
752    LLVMValueRef is_ma_y;
753    LLVMValueRef is_ma_x;
754    LLVMValueRef sgn;
755    LLVMValueRef tmp;
756 
757    is_ma_positive = LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->ma, LLVMConstReal(f32, 0.0), "");
758    sgn_ma = LLVMBuildSelect(builder, is_ma_positive, LLVMConstReal(f32, 1.0),
759                             LLVMConstReal(f32, -1.0), "");
760 
761    is_ma_z = LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 4.0), "");
762    is_not_ma_z = LLVMBuildNot(builder, is_ma_z, "");
763    is_ma_y = LLVMBuildAnd(
764       builder, is_not_ma_z,
765       LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 2.0), ""), "");
766    is_ma_x = LLVMBuildAnd(builder, is_not_ma_z, LLVMBuildNot(builder, is_ma_y, ""), "");
767 
768    /* Select sc */
769    tmp = LLVMBuildSelect(builder, is_ma_x, coords[2], coords[0], "");
770    sgn = LLVMBuildSelect(
771       builder, is_ma_y, LLVMConstReal(f32, 1.0),
772       LLVMBuildSelect(builder, is_ma_z, sgn_ma, LLVMBuildFNeg(builder, sgn_ma, ""), ""), "");
773    out_st[0] = LLVMBuildFMul(builder, tmp, sgn, "");
774 
775    /* Select tc */
776    tmp = LLVMBuildSelect(builder, is_ma_y, coords[2], coords[1], "");
777    sgn = LLVMBuildSelect(builder, is_ma_y, sgn_ma, LLVMConstReal(f32, -1.0), "");
778    out_st[1] = LLVMBuildFMul(builder, tmp, sgn, "");
779 
780    /* Select ma */
781    tmp = LLVMBuildSelect(builder, is_ma_z, coords[2],
782                          LLVMBuildSelect(builder, is_ma_y, coords[1], coords[0], ""), "");
783    tmp = ac_build_intrinsic(ctx, "llvm.fabs.f32", ctx->f32, &tmp, 1, AC_FUNC_ATTR_READNONE);
784    *out_ma = LLVMBuildFMul(builder, tmp, LLVMConstReal(f32, 2.0), "");
785 }
786 
787 void ac_prepare_cube_coords(struct ac_llvm_context *ctx, bool is_deriv, bool is_array, bool is_lod,
788                             LLVMValueRef *coords_arg, LLVMValueRef *derivs_arg)
789 {
790 
791    LLVMBuilderRef builder = ctx->builder;
792    struct cube_selection_coords selcoords;
793    LLVMValueRef coords[3];
794    LLVMValueRef invma;
795 
796    if (is_array && !is_lod) {
797       LLVMValueRef tmp = ac_build_round(ctx, coords_arg[3]);
798 
799       /* Section 8.9 (Texture Functions) of the GLSL 4.50 spec says:
800        *
801        *    "For Array forms, the array layer used will be
802        *
803        *       max(0, min(d−1, floor(layer+0.5)))
804        *
805        *     where d is the depth of the texture array and layer
806        *     comes from the component indicated in the tables below."
807        *
808        * Rounding the layer also works around an issue where the layer is taken
809        * from a helper invocation which happens to fall on a different layer due to extrapolation.
810        *
811        * GFX8 and earlier attempt to implement this in hardware by
812        * clamping the value of coords[2] = (8 * layer) + face.
813        * Unfortunately, this means that we end up with the wrong
814        * face when clamping occurs.
815        *
816        * Clamp the layer earlier to work around the issue.
817        */
818       if (ctx->chip_class <= GFX8) {
819          LLVMValueRef ge0;
820          ge0 = LLVMBuildFCmp(builder, LLVMRealOGE, tmp, ctx->f32_0, "");
821          tmp = LLVMBuildSelect(builder, ge0, tmp, ctx->f32_0, "");
822       }
823 
824       coords_arg[3] = tmp;
825    }
826 
827    build_cube_intrinsic(ctx, coords_arg, &selcoords);
828 
829    invma =
830       ac_build_intrinsic(ctx, "llvm.fabs.f32", ctx->f32, &selcoords.ma, 1, AC_FUNC_ATTR_READNONE);
831    invma = ac_build_fdiv(ctx, LLVMConstReal(ctx->f32, 1.0), invma);
832 
833    for (int i = 0; i < 2; ++i)
834       coords[i] = LLVMBuildFMul(builder, selcoords.stc[i], invma, "");
835 
836    coords[2] = selcoords.id;
837 
838    if (is_deriv && derivs_arg) {
839       LLVMValueRef derivs[4];
840       int axis;
841 
842       /* Convert cube derivatives to 2D derivatives. */
843       for (axis = 0; axis < 2; axis++) {
844          LLVMValueRef deriv_st[2];
845          LLVMValueRef deriv_ma;
846 
847          /* Transform the derivative alongside the texture
848           * coordinate. Mathematically, the correct formula is
849           * as follows. Assume we're projecting onto the +Z face
850           * and denote by dx/dh the derivative of the (original)
851           * X texture coordinate with respect to horizontal
852           * window coordinates. The projection onto the +Z face
853           * plane is:
854           *
855           *   f(x,z) = x/z
856           *
857           * Then df/dh = df/dx * dx/dh + df/dz * dz/dh
858           *            = 1/z * dx/dh - x/z * 1/z * dz/dh.
859           *
860           * This motivates the implementation below.
861           *
862           * Whether this actually gives the expected results for
863           * apps that might feed in derivatives obtained via
864           * finite differences is anyone's guess. The OpenGL spec
865           * seems awfully quiet about how textureGrad for cube
866           * maps should be handled.
867           */
868          build_cube_select(ctx, &selcoords, &derivs_arg[axis * 3], deriv_st, &deriv_ma);
869 
870          deriv_ma = LLVMBuildFMul(builder, deriv_ma, invma, "");
871 
872          for (int i = 0; i < 2; ++i)
873             derivs[axis * 2 + i] =
874                LLVMBuildFSub(builder, LLVMBuildFMul(builder, deriv_st[i], invma, ""),
875                              LLVMBuildFMul(builder, deriv_ma, coords[i], ""), "");
876       }
877 
878       memcpy(derivs_arg, derivs, sizeof(derivs));
879    }
880 
881    /* Shift the texture coordinate. This must be applied after the
882     * derivative calculation.
883     */
884    for (int i = 0; i < 2; ++i)
885       coords[i] = LLVMBuildFAdd(builder, coords[i], LLVMConstReal(ctx->f32, 1.5), "");
886 
887    if (is_array) {
888       /* for cube arrays coord.z = coord.w(array_index) * 8 + face */
889       /* coords_arg.w component - array_index for cube arrays */
890       coords[2] = ac_build_fmad(ctx, coords_arg[3], LLVMConstReal(ctx->f32, 8.0), coords[2]);
891    }
892 
893    memcpy(coords_arg, coords, sizeof(coords));
894 }
895 
896 LLVMValueRef ac_build_fs_interp(struct ac_llvm_context *ctx, LLVMValueRef llvm_chan,
897                                 LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i,
898                                 LLVMValueRef j)
899 {
900    LLVMValueRef args[5];
901    LLVMValueRef p1;
902 
903    args[0] = i;
904    args[1] = llvm_chan;
905    args[2] = attr_number;
906    args[3] = params;
907 
908    p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1", ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);
909 
910    args[0] = p1;
911    args[1] = j;
912    args[2] = llvm_chan;
913    args[3] = attr_number;
914    args[4] = params;
915 
916    return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2", ctx->f32, args, 5,
917                              AC_FUNC_ATTR_READNONE);
918 }
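
/* Note (informal sketch): the two intrinsics above correspond to the
 * hardware two-step interpolation (v_interp_p1_f32 / v_interp_p2_f32),
 * which roughly computes
 *
 *    p1     = P0 + i * P10
 *    result = p1 + j * P20
 *
 * where P0 is the attribute value at vertex 0, P10/P20 are the per-primitive
 * deltas selected by the attribute number and channel, and i/j are the
 * barycentric coordinates.
 */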
919 
920 LLVMValueRef ac_build_fs_interp_f16(struct ac_llvm_context *ctx, LLVMValueRef llvm_chan,
921                                     LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i,
922                                     LLVMValueRef j)
923 {
924    LLVMValueRef args[6];
925    LLVMValueRef p1;
926 
927    args[0] = i;
928    args[1] = llvm_chan;
929    args[2] = attr_number;
930    args[3] = ctx->i1false;
931    args[4] = params;
932 
933    p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16", ctx->f32, args, 5,
934                            AC_FUNC_ATTR_READNONE);
935 
936    args[0] = p1;
937    args[1] = j;
938    args[2] = llvm_chan;
939    args[3] = attr_number;
940    args[4] = ctx->i1false;
941    args[5] = params;
942 
943    return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16", ctx->f16, args, 6,
944                              AC_FUNC_ATTR_READNONE);
945 }
946 
947 LLVMValueRef ac_build_fs_interp_mov(struct ac_llvm_context *ctx, LLVMValueRef parameter,
948                                     LLVMValueRef llvm_chan, LLVMValueRef attr_number,
949                                     LLVMValueRef params)
950 {
951    LLVMValueRef args[4];
952 
953    args[0] = parameter;
954    args[1] = llvm_chan;
955    args[2] = attr_number;
956    args[3] = params;
957 
958    return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.mov", ctx->f32, args, 4,
959                              AC_FUNC_ATTR_READNONE);
960 }
961 
962 LLVMValueRef ac_build_gep_ptr(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
963                               LLVMValueRef index)
964 {
965    return LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, "");
966 }
967 
968 LLVMValueRef ac_build_gep0(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index)
969 {
970    LLVMValueRef indices[2] = {
971       ctx->i32_0,
972       index,
973    };
974    return LLVMBuildGEP(ctx->builder, base_ptr, indices, 2, "");
975 }
976 
977 LLVMValueRef ac_build_pointer_add(struct ac_llvm_context *ctx, LLVMValueRef ptr, LLVMValueRef index)
978 {
979    return LLVMBuildPointerCast(ctx->builder, LLVMBuildGEP(ctx->builder, ptr, &index, 1, ""),
980                                LLVMTypeOf(ptr), "");
981 }
982 
983 void ac_build_indexed_store(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index,
984                             LLVMValueRef value)
985 {
986    LLVMBuildStore(ctx->builder, value, ac_build_gep0(ctx, base_ptr, index));
987 }
988 
989 /**
990  * Build an LLVM IR indexed load using LLVMBuildGEP + LLVMBuildLoad.
991  * It's equivalent to doing a load from &base_ptr[index].
992  *
993  * \param base_ptr  Where the array starts.
994  * \param index     The element index into the array.
995  * \param uniform   Whether the base_ptr and index can be assumed to be
996  *                  dynamically uniform (i.e. load to an SGPR)
997  * \param invariant Whether the load is invariant (no other opcodes affect it)
998  * \param no_unsigned_wraparound
999  *    For all possible re-associations and re-distributions of an expression
1000  *    "base_ptr + index * elemsize" into "addr + offset" (excluding GEPs
1001  *    without inbounds in base_ptr), this parameter is true if "addr + offset"
1002  *    does not result in an unsigned integer wraparound. This is used for
1003  *    optimal code generation of 32-bit pointer arithmetic.
1004  *
1005  *    For example, a 32-bit immediate offset that causes a 32-bit unsigned
1006  *    integer wraparound can't be an imm offset in s_load_dword, because
1007  *    the instruction performs "addr + offset" in 64 bits.
1008  *
1009  *    Expected usage for bindless textures by chaining GEPs:
1010  *      // possible unsigned wraparound, don't use InBounds:
1011  *      ptr1 = LLVMBuildGEP(base_ptr, index);
1012  *      image = load(ptr1); // becomes "s_load ptr1, 0"
1013  *
1014  *      ptr2 = LLVMBuildInBoundsGEP(ptr1, 32 / elemsize);
1015  *      sampler = load(ptr2); // becomes "s_load ptr1, 32" thanks to InBounds
1016  */
1017 static LLVMValueRef ac_build_load_custom(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
1018                                          LLVMValueRef index, bool uniform, bool invariant,
1019                                          bool no_unsigned_wraparound)
1020 {
1021    LLVMValueRef pointer, result;
1022 
1023    if (no_unsigned_wraparound &&
1024        LLVMGetPointerAddressSpace(LLVMTypeOf(base_ptr)) == AC_ADDR_SPACE_CONST_32BIT)
1025       pointer = LLVMBuildInBoundsGEP(ctx->builder, base_ptr, &index, 1, "");
1026    else
1027       pointer = LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, "");
1028 
1029    if (uniform)
1030       LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);
1031    result = LLVMBuildLoad(ctx->builder, pointer, "");
1032    if (invariant)
1033       LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md);
1034    return result;
1035 }
1036 
1037 LLVMValueRef ac_build_load(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index)
1038 {
1039    return ac_build_load_custom(ctx, base_ptr, index, false, false, false);
1040 }
1041 
1042 LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
1043                                      LLVMValueRef index)
1044 {
1045    return ac_build_load_custom(ctx, base_ptr, index, false, true, false);
1046 }
1047 
1048 /* This assumes that there is no unsigned integer wraparound during the address
1049  * computation, excluding all GEPs within base_ptr. */
1050 LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
1051                                    LLVMValueRef index)
1052 {
1053    return ac_build_load_custom(ctx, base_ptr, index, true, true, true);
1054 }
1055 
1056 /* See ac_build_load_custom() documentation. */
1057 LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx,
1058                                                    LLVMValueRef base_ptr, LLVMValueRef index)
1059 {
1060    return ac_build_load_custom(ctx, base_ptr, index, true, true, false);
1061 }
1062 
1063 static unsigned get_load_cache_policy(struct ac_llvm_context *ctx, unsigned cache_policy)
1064 {
1065    return cache_policy | (ctx->chip_class >= GFX10 && cache_policy & ac_glc ? ac_dlc : 0);
1066 }
1067 
1068 static void ac_build_buffer_store_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
1069                                          LLVMValueRef data, LLVMValueRef vindex,
1070                                          LLVMValueRef voffset, LLVMValueRef soffset,
1071                                          unsigned cache_policy, bool use_format, bool structurized)
1072 {
1073    LLVMValueRef args[6];
1074    int idx = 0;
1075    args[idx++] = data;
1076    args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
1077    if (structurized)
1078       args[idx++] = vindex ? vindex : ctx->i32_0;
1079    args[idx++] = voffset ? voffset : ctx->i32_0;
1080    args[idx++] = soffset ? soffset : ctx->i32_0;
1081    args[idx++] = LLVMConstInt(ctx->i32, cache_policy, 0);
1082    const char *indexing_kind = structurized ? "struct" : "raw";
1083    char name[256], type_name[8];
1084 
1085    ac_build_type_name_for_intr(LLVMTypeOf(data), type_name, sizeof(type_name));
1086 
1087    if (use_format) {
1088       snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.format.%s", indexing_kind,
1089                type_name);
1090    } else {
1091       snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.%s", indexing_kind, type_name);
1092    }
1093 
1094    ac_build_intrinsic(ctx, name, ctx->voidt, args, idx, AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
1095 }
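
/* Example of the intrinsic name built above: storing a v4f32 with
 * structurized = true and use_format = false selects
 * "llvm.amdgcn.struct.buffer.store.v4f32"; with use_format = true it becomes
 * "llvm.amdgcn.struct.buffer.store.format.v4f32".
 */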
1096 
1097 void ac_build_buffer_store_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef data,
1098                                   LLVMValueRef vindex, LLVMValueRef voffset, unsigned cache_policy)
1099 {
1100    ac_build_buffer_store_common(ctx, rsrc, data, vindex, voffset, NULL, cache_policy, true, true);
1101 }
1102 
1103 /* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
1104  * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),
1105  * or v4i32 (num_channels=3,4).
1106  */
1107 void ac_build_buffer_store_dword(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
1108                                  unsigned num_channels, LLVMValueRef voffset, LLVMValueRef soffset,
1109                                  unsigned inst_offset, unsigned cache_policy)
1110 {
1111    /* Split 3-channel stores, because only LLVM 9+ supports 3-channel
1112     * intrinsics. */
1113    if (num_channels == 3 && !ac_has_vec3_support(ctx->chip_class, false)) {
1114       LLVMValueRef v[3], v01;
1115 
1116       for (int i = 0; i < 3; i++) {
1117          v[i] = LLVMBuildExtractElement(ctx->builder, vdata, LLVMConstInt(ctx->i32, i, 0), "");
1118       }
1119       v01 = ac_build_gather_values(ctx, v, 2);
1120 
1121       ac_build_buffer_store_dword(ctx, rsrc, v01, 2, voffset, soffset, inst_offset, cache_policy);
1122       ac_build_buffer_store_dword(ctx, rsrc, v[2], 1, voffset, soffset, inst_offset + 8,
1123                                   cache_policy);
1124       return;
1125    }
1126 
1127    /* SWIZZLE_ENABLE requires that soffset isn't folded into voffset
1128     * (voffset is swizzled, but soffset isn't swizzled).
1129     * llvm.amdgcn.buffer.store doesn't have a separate soffset parameter.
1130     */
1131    if (!(cache_policy & ac_swizzled)) {
1132       LLVMValueRef offset = soffset;
1133 
1134       if (inst_offset)
1135          offset = LLVMBuildAdd(ctx->builder, offset, LLVMConstInt(ctx->i32, inst_offset, 0), "");
1136 
1137       ac_build_buffer_store_common(ctx, rsrc, ac_to_float(ctx, vdata), ctx->i32_0, voffset, offset,
1138                                    cache_policy, false, false);
1139       return;
1140    }
1141 
1142    static const unsigned dfmts[] = {V_008F0C_BUF_DATA_FORMAT_32, V_008F0C_BUF_DATA_FORMAT_32_32,
1143                                     V_008F0C_BUF_DATA_FORMAT_32_32_32,
1144                                     V_008F0C_BUF_DATA_FORMAT_32_32_32_32};
1145    unsigned dfmt = dfmts[num_channels - 1];
1146    unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
1147    LLVMValueRef immoffset = LLVMConstInt(ctx->i32, inst_offset, 0);
1148 
1149    ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset, immoffset, num_channels, dfmt,
1150                               nfmt, cache_policy);
1151 }
1152 
1153 static LLVMValueRef ac_build_buffer_load_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
1154                                                 LLVMValueRef vindex, LLVMValueRef voffset,
1155                                                 LLVMValueRef soffset, unsigned num_channels,
1156                                                 LLVMTypeRef channel_type, unsigned cache_policy,
1157                                                 bool can_speculate, bool use_format,
1158                                                 bool structurized)
1159 {
1160    LLVMValueRef args[5];
1161    int idx = 0;
1162    args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
1163    if (structurized)
1164       args[idx++] = vindex ? vindex : ctx->i32_0;
1165    args[idx++] = voffset ? voffset : ctx->i32_0;
1166    args[idx++] = soffset ? soffset : ctx->i32_0;
1167    args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0);
1168    unsigned func =
1169       !ac_has_vec3_support(ctx->chip_class, use_format) && num_channels == 3 ? 4 : num_channels;
1170    const char *indexing_kind = structurized ? "struct" : "raw";
1171    char name[256], type_name[8];
1172 
1173    /* D16 is only supported on gfx8+ */
1174    assert(!use_format || (channel_type != ctx->f16 && channel_type != ctx->i16) ||
1175           ctx->chip_class >= GFX8);
1176 
1177    LLVMTypeRef type = func > 1 ? LLVMVectorType(channel_type, func) : channel_type;
1178    ac_build_type_name_for_intr(type, type_name, sizeof(type_name));
1179 
1180    if (use_format) {
1181       snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.format.%s", indexing_kind,
1182                type_name);
1183    } else {
1184       snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.%s", indexing_kind, type_name);
1185    }
1186 
1187    return ac_build_intrinsic(ctx, name, type, args, idx, ac_get_load_intr_attribs(can_speculate));
1188 }
1189 
1190 LLVMValueRef ac_build_buffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc, int num_channels,
1191                                   LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset,
1192                                   unsigned inst_offset, unsigned cache_policy, bool can_speculate,
1193                                   bool allow_smem)
1194 {
1195    LLVMValueRef offset = LLVMConstInt(ctx->i32, inst_offset, 0);
1196    if (voffset)
1197       offset = LLVMBuildAdd(ctx->builder, offset, voffset, "");
1198    if (soffset)
1199       offset = LLVMBuildAdd(ctx->builder, offset, soffset, "");
1200 
1201    if (allow_smem && !(cache_policy & ac_slc) &&
1202        (!(cache_policy & ac_glc) || ctx->chip_class >= GFX8)) {
1203       assert(vindex == NULL);
1204 
1205       LLVMValueRef result[8];
1206 
1207       for (int i = 0; i < num_channels; i++) {
1208          if (i) {
1209             offset = LLVMBuildAdd(ctx->builder, offset, LLVMConstInt(ctx->i32, 4, 0), "");
1210          }
1211          LLVMValueRef args[3] = {
1212             rsrc,
1213             offset,
1214             LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0),
1215          };
1216          result[i] = ac_build_intrinsic(ctx, "llvm.amdgcn.s.buffer.load.f32", ctx->f32, args, 3,
1217                                         AC_FUNC_ATTR_READNONE);
1218       }
1219       if (num_channels == 1)
1220          return result[0];
1221 
1222       if (num_channels == 3 && !ac_has_vec3_support(ctx->chip_class, false))
1223          result[num_channels++] = LLVMGetUndef(ctx->f32);
1224       return ac_build_gather_values(ctx, result, num_channels);
1225    }
1226 
1227    return ac_build_buffer_load_common(ctx, rsrc, vindex, offset, ctx->i32_0, num_channels, ctx->f32,
1228                                       cache_policy, can_speculate, false, false);
1229 }
1230 
1231 LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
1232                                          LLVMValueRef vindex, LLVMValueRef voffset,
1233                                          unsigned num_channels, unsigned cache_policy,
1234                                          bool can_speculate, bool d16)
1235 {
1236    return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0, num_channels,
1237                                       d16 ? ctx->f16 : ctx->f32, cache_policy, can_speculate, true,
1238                                       true);
1239 }
1240 
1241 static LLVMValueRef ac_build_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
1242                                           LLVMValueRef vindex, LLVMValueRef voffset,
1243                                           LLVMValueRef soffset, LLVMValueRef immoffset,
1244                                           unsigned num_channels, unsigned dfmt, unsigned nfmt,
1245                                           unsigned cache_policy, bool can_speculate,
1246                                           bool structurized)
1247 {
1248    voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");
1249 
1250    LLVMValueRef args[6];
1251    int idx = 0;
1252    args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
1253    if (structurized)
1254       args[idx++] = vindex ? vindex : ctx->i32_0;
1255    args[idx++] = voffset ? voffset : ctx->i32_0;
1256    args[idx++] = soffset ? soffset : ctx->i32_0;
1257    args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx->chip_class, dfmt, nfmt), 0);
1258    args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0);
1259    unsigned func =
1260       !ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels;
1261    const char *indexing_kind = structurized ? "struct" : "raw";
1262    char name[256], type_name[8];
1263 
1264    LLVMTypeRef type = func > 1 ? LLVMVectorType(ctx->i32, func) : ctx->i32;
1265    ac_build_type_name_for_intr(type, type_name, sizeof(type_name));
1266 
1267    snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.load.%s", indexing_kind, type_name);
1268 
1269    return ac_build_intrinsic(ctx, name, type, args, idx, ac_get_load_intr_attribs(can_speculate));
1270 }
1271 
1272 LLVMValueRef ac_build_struct_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
1273                                           LLVMValueRef vindex, LLVMValueRef voffset,
1274                                           LLVMValueRef soffset, LLVMValueRef immoffset,
1275                                           unsigned num_channels, unsigned dfmt, unsigned nfmt,
1276                                           unsigned cache_policy, bool can_speculate)
1277 {
1278    return ac_build_tbuffer_load(ctx, rsrc, vindex, voffset, soffset, immoffset, num_channels, dfmt,
1279                                 nfmt, cache_policy, can_speculate, true);
1280 }
1281 
1282 LLVMValueRef ac_build_raw_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
1283                                        LLVMValueRef voffset, LLVMValueRef soffset,
1284                                        LLVMValueRef immoffset, unsigned num_channels, unsigned dfmt,
1285                                        unsigned nfmt, unsigned cache_policy, bool can_speculate)
1286 {
1287    return ac_build_tbuffer_load(ctx, rsrc, NULL, voffset, soffset, immoffset, num_channels, dfmt,
1288                                 nfmt, cache_policy, can_speculate, false);
1289 }
1290 
1291 LLVMValueRef ac_build_tbuffer_load_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
1292                                          LLVMValueRef voffset, LLVMValueRef soffset,
1293                                          LLVMValueRef immoffset, unsigned cache_policy)
1294 {
1295    LLVMValueRef res;
1296 
1297    if (LLVM_VERSION_MAJOR >= 9) {
1298       voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");
1299 
1300       /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
1301       res = ac_build_buffer_load_common(ctx, rsrc, NULL, voffset, soffset, 1, ctx->i16,
1302                                         cache_policy, false, false, false);
1303    } else {
1304       unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_16;
1305       unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
1306 
1307       res = ac_build_raw_tbuffer_load(ctx, rsrc, voffset, soffset, immoffset, 1, dfmt, nfmt,
1308                                       cache_policy, false);
1309 
1310       res = LLVMBuildTrunc(ctx->builder, res, ctx->i16, "");
1311    }
1312 
1313    return res;
1314 }
1315 
1316 LLVMValueRef ac_build_tbuffer_load_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
1317                                         LLVMValueRef voffset, LLVMValueRef soffset,
1318                                         LLVMValueRef immoffset, unsigned cache_policy)
1319 {
1320    LLVMValueRef res;
1321 
1322    if (LLVM_VERSION_MAJOR >= 9) {
1323       voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");
1324 
1325       /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
1326       res = ac_build_buffer_load_common(ctx, rsrc, NULL, voffset, soffset, 1, ctx->i8, cache_policy,
1327                                         false, false, false);
1328    } else {
1329       unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_8;
1330       unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
1331 
1332       res = ac_build_raw_tbuffer_load(ctx, rsrc, voffset, soffset, immoffset, 1, dfmt, nfmt,
1333                                       cache_policy, false);
1334 
1335       res = LLVMBuildTrunc(ctx->builder, res, ctx->i8, "");
1336    }
1337 
1338    return res;
1339 }
1340 
1341 /**
1342  * Convert an 11- or 10-bit unsigned floating point number to an f32.
1343  *
1344  * The input exponent is expected to be biased analogous to IEEE-754, i.e. by
1345  * 2^(exp_bits-1) - 1 (as defined in OpenGL and other graphics APIs).
1346  */
1347 static LLVMValueRef ac_ufN_to_float(struct ac_llvm_context *ctx, LLVMValueRef src,
1348                                     unsigned exp_bits, unsigned mant_bits)
1349 {
1350    assert(LLVMTypeOf(src) == ctx->i32);
1351 
1352    LLVMValueRef tmp;
1353    LLVMValueRef mantissa;
1354    mantissa =
1355       LLVMBuildAnd(ctx->builder, src, LLVMConstInt(ctx->i32, (1 << mant_bits) - 1, false), "");
1356 
1357    /* Converting normal numbers is just a shift + correcting the exponent bias */
1358    unsigned normal_shift = 23 - mant_bits;
1359    unsigned bias_shift = 127 - ((1 << (exp_bits - 1)) - 1);
1360    LLVMValueRef shifted, normal;
1361 
1362    shifted = LLVMBuildShl(ctx->builder, src, LLVMConstInt(ctx->i32, normal_shift, false), "");
1363    normal =
1364       LLVMBuildAdd(ctx->builder, shifted, LLVMConstInt(ctx->i32, bias_shift << 23, false), "");
1365 
1366    /* Converting nan/inf numbers is the same, but with a different exponent update */
1367    LLVMValueRef naninf;
1368    naninf = LLVMBuildOr(ctx->builder, normal, LLVMConstInt(ctx->i32, 0xff << 23, false), "");
1369 
1370    /* Converting denormals is the complex case: determine the leading zeros of the
1371     * mantissa to obtain the correct shift for the mantissa and exponent correction.
1372     */
1373    LLVMValueRef denormal;
1374    LLVMValueRef params[2] = {
1375       mantissa, ctx->i1true, /* result can be undef when arg is 0 */
1376    };
1377    LLVMValueRef ctlz =
1378       ac_build_intrinsic(ctx, "llvm.ctlz.i32", ctx->i32, params, 2, AC_FUNC_ATTR_READNONE);
1379 
1380    /* Shift such that the leading 1 ends up as the LSB of the exponent field. */
1381    tmp = LLVMBuildSub(ctx->builder, ctlz, LLVMConstInt(ctx->i32, 8, false), "");
1382    denormal = LLVMBuildShl(ctx->builder, mantissa, tmp, "");
1383 
1384    unsigned denormal_exp = bias_shift + (32 - mant_bits) - 1;
1385    tmp = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, denormal_exp, false), ctlz, "");
1386    tmp = LLVMBuildShl(ctx->builder, tmp, LLVMConstInt(ctx->i32, 23, false), "");
1387    denormal = LLVMBuildAdd(ctx->builder, denormal, tmp, "");
1388 
1389    /* Select the final result. */
1390    LLVMValueRef result;
1391 
1392    tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src,
1393                        LLVMConstInt(ctx->i32, ((1 << exp_bits) - 1) << mant_bits, false), "");
1394    result = LLVMBuildSelect(ctx->builder, tmp, naninf, normal, "");
1395 
1396    tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src, LLVMConstInt(ctx->i32, 1 << mant_bits, false),
1397                        "");
1398    result = LLVMBuildSelect(ctx->builder, tmp, result, denormal, "");
1399 
1400    tmp = LLVMBuildICmp(ctx->builder, LLVMIntNE, src, ctx->i32_0, "");
1401    result = LLVMBuildSelect(ctx->builder, tmp, result, ctx->i32_0, "");
1402 
1403    return ac_to_float(ctx, result);
1404 }
1405 
1406 /**
1407  * Generate a fully general open coded buffer format fetch with all required
1408  * fixups suitable for vertex fetch, using non-format buffer loads.
1409  *
1410  * Some combinations of argument values have special interpretations:
1411  * - size = 8 bytes, format = fixed indicates PIPE_FORMAT_R11G11B10_FLOAT
1412  * - size = 8 bytes, format != {float,fixed} indicates a 2_10_10_10 data format
1413  *
1414  * \param log_size log2 of the size of a channel in bytes
1415  * \param num_channels number of channels (1 to 4)
1416  * \param format AC_FETCH_FORMAT_xxx value
1417  * \param reverse whether XYZ channels are reversed
1418  * \param known_aligned whether the source is known to be aligned to hardware's
1419  *                      effective element size for loading the given format
1420  *                      (note: this means dword alignment for 8_8_8_8, 16_16, etc.)
1421  * \param rsrc buffer resource descriptor
1422  * \return the resulting vector of floats or integers bitcast to <4 x i32>
1423  */
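/* For example (illustrative; the real parameters come from the driver's
 * vertex-format decoding), a packed 4 x 16-bit SNORM attribute would be
 * fetched with log_size = 1, num_channels = 4, format = AC_FETCH_FORMAT_SNORM.
 */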
1424 LLVMValueRef ac_build_opencoded_load_format(struct ac_llvm_context *ctx, unsigned log_size,
1425                                             unsigned num_channels, unsigned format, bool reverse,
1426                                             bool known_aligned, LLVMValueRef rsrc,
1427                                             LLVMValueRef vindex, LLVMValueRef voffset,
1428                                             LLVMValueRef soffset, unsigned cache_policy,
1429                                             bool can_speculate)
1430 {
1431    LLVMValueRef tmp;
1432    unsigned load_log_size = log_size;
1433    unsigned load_num_channels = num_channels;
1434    if (log_size == 3) {
1435       load_log_size = 2;
1436       if (format == AC_FETCH_FORMAT_FLOAT) {
1437          load_num_channels = 2 * num_channels;
1438       } else {
1439          load_num_channels = 1; /* 10_11_11 or 2_10_10_10 */
1440       }
1441    }
1442 
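   /* log_recombine > 0 means several narrow loads are merged into one wider
    * value below; log_recombine < 0 means a wide (vector) load is split into
    * narrower per-channel values. */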
1443    int log_recombine = 0;
1444    if ((ctx->chip_class == GFX6 || ctx->chip_class >= GFX10) && !known_aligned) {
1445       /* Avoid alignment restrictions by loading one byte at a time. */
1446       load_num_channels <<= load_log_size;
1447       log_recombine = load_log_size;
1448       load_log_size = 0;
1449    } else if (load_num_channels == 2 || load_num_channels == 4) {
1450       log_recombine = -util_logbase2(load_num_channels);
1451       load_num_channels = 1;
1452       load_log_size += -log_recombine;
1453    }
1454 
1455    assert(load_log_size >= 2 || LLVM_VERSION_MAJOR >= 9);
1456 
1457    LLVMValueRef loads[32]; /* up to 32 bytes */
1458    for (unsigned i = 0; i < load_num_channels; ++i) {
1459       tmp =
1460          LLVMBuildAdd(ctx->builder, soffset, LLVMConstInt(ctx->i32, i << load_log_size, false), "");
1461       LLVMTypeRef channel_type =
1462          load_log_size == 0 ? ctx->i8 : load_log_size == 1 ? ctx->i16 : ctx->i32;
1463       unsigned num_channels = 1 << (MAX2(load_log_size, 2) - 2);
1464       loads[i] =
1465          ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, tmp, num_channels, channel_type,
1466                                      cache_policy, can_speculate, false, true);
1467       if (load_log_size >= 2)
1468          loads[i] = ac_to_integer(ctx, loads[i]);
1469    }
1470 
1471    if (log_recombine > 0) {
1472       /* Recombine bytes if necessary (GFX6, or unaligned loads on GFX10+) */
1473       LLVMTypeRef dst_type = log_recombine == 2 ? ctx->i32 : ctx->i16;
1474 
1475       for (unsigned src = 0, dst = 0; src < load_num_channels; ++dst) {
1476          LLVMValueRef accum = NULL;
1477          for (unsigned i = 0; i < (1 << log_recombine); ++i, ++src) {
1478             tmp = LLVMBuildZExt(ctx->builder, loads[src], dst_type, "");
1479             if (i == 0) {
1480                accum = tmp;
1481             } else {
1482                tmp = LLVMBuildShl(ctx->builder, tmp, LLVMConstInt(dst_type, 8 * i, false), "");
1483                accum = LLVMBuildOr(ctx->builder, accum, tmp, "");
1484             }
1485          }
1486          loads[dst] = accum;
1487       }
1488    } else if (log_recombine < 0) {
1489       /* Split vectors of dwords */
1490       if (load_log_size > 2) {
1491          assert(load_num_channels == 1);
1492          LLVMValueRef loaded = loads[0];
1493          unsigned log_split = load_log_size - 2;
1494          log_recombine += log_split;
1495          load_num_channels = 1 << log_split;
1496          load_log_size = 2;
1497          for (unsigned i = 0; i < load_num_channels; ++i) {
1498             tmp = LLVMConstInt(ctx->i32, i, false);
1499             loads[i] = LLVMBuildExtractElement(ctx->builder, loaded, tmp, "");
1500          }
1501       }
1502 
1503       /* Further split dwords and shorts if required */
1504       if (log_recombine < 0) {
1505          for (unsigned src = load_num_channels, dst = load_num_channels << -log_recombine; src > 0;
1506               --src) {
1507             unsigned dst_bits = 1 << (3 + load_log_size + log_recombine);
1508             LLVMTypeRef dst_type = LLVMIntTypeInContext(ctx->context, dst_bits);
1509             LLVMValueRef loaded = loads[src - 1];
1510             LLVMTypeRef loaded_type = LLVMTypeOf(loaded);
1511             for (unsigned i = 1 << -log_recombine; i > 0; --i, --dst) {
1512                tmp = LLVMConstInt(loaded_type, dst_bits * (i - 1), false);
1513                tmp = LLVMBuildLShr(ctx->builder, loaded, tmp, "");
1514                loads[dst - 1] = LLVMBuildTrunc(ctx->builder, tmp, dst_type, "");
1515             }
1516          }
1517       }
1518    }
1519 
1520    if (log_size == 3) {
1521       if (format == AC_FETCH_FORMAT_FLOAT) {
1522          for (unsigned i = 0; i < num_channels; ++i) {
1523             tmp = ac_build_gather_values(ctx, &loads[2 * i], 2);
1524             loads[i] = LLVMBuildBitCast(ctx->builder, tmp, ctx->f64, "");
1525          }
1526       } else if (format == AC_FETCH_FORMAT_FIXED) {
1527          /* 10_11_11_FLOAT */
1528          LLVMValueRef data = loads[0];
1529          LLVMValueRef i32_2047 = LLVMConstInt(ctx->i32, 2047, false);
1530          LLVMValueRef r = LLVMBuildAnd(ctx->builder, data, i32_2047, "");
1531          tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 11, false), "");
1532          LLVMValueRef g = LLVMBuildAnd(ctx->builder, tmp, i32_2047, "");
1533          LLVMValueRef b = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 22, false), "");
1534 
1535          loads[0] = ac_to_integer(ctx, ac_ufN_to_float(ctx, r, 5, 6));
1536          loads[1] = ac_to_integer(ctx, ac_ufN_to_float(ctx, g, 5, 6));
1537          loads[2] = ac_to_integer(ctx, ac_ufN_to_float(ctx, b, 5, 5));
1538 
1539          num_channels = 3;
1540          log_size = 2;
1541          format = AC_FETCH_FORMAT_FLOAT;
1542       } else {
1543          /* 2_10_10_10 data formats */
1544          LLVMValueRef data = loads[0];
1545          LLVMTypeRef i10 = LLVMIntTypeInContext(ctx->context, 10);
1546          LLVMTypeRef i2 = LLVMIntTypeInContext(ctx->context, 2);
1547          loads[0] = LLVMBuildTrunc(ctx->builder, data, i10, "");
1548          tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 10, false), "");
1549          loads[1] = LLVMBuildTrunc(ctx->builder, tmp, i10, "");
1550          tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 20, false), "");
1551          loads[2] = LLVMBuildTrunc(ctx->builder, tmp, i10, "");
1552          tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 30, false), "");
1553          loads[3] = LLVMBuildTrunc(ctx->builder, tmp, i2, "");
1554 
1555          num_channels = 4;
1556       }
1557    }
1558 
1559    if (format == AC_FETCH_FORMAT_FLOAT) {
1560       if (log_size != 2) {
1561          for (unsigned chan = 0; chan < num_channels; ++chan) {
1562             tmp = ac_to_float(ctx, loads[chan]);
1563             if (log_size == 3)
1564                tmp = LLVMBuildFPTrunc(ctx->builder, tmp, ctx->f32, "");
1565             else if (log_size == 1)
1566                tmp = LLVMBuildFPExt(ctx->builder, tmp, ctx->f32, "");
1567             loads[chan] = ac_to_integer(ctx, tmp);
1568          }
1569       }
1570    } else if (format == AC_FETCH_FORMAT_UINT) {
1571       if (log_size != 2) {
1572          for (unsigned chan = 0; chan < num_channels; ++chan)
1573             loads[chan] = LLVMBuildZExt(ctx->builder, loads[chan], ctx->i32, "");
1574       }
1575    } else if (format == AC_FETCH_FORMAT_SINT) {
1576       if (log_size != 2) {
1577          for (unsigned chan = 0; chan < num_channels; ++chan)
1578             loads[chan] = LLVMBuildSExt(ctx->builder, loads[chan], ctx->i32, "");
1579       }
1580    } else {
1581       bool unsign = format == AC_FETCH_FORMAT_UNORM || format == AC_FETCH_FORMAT_USCALED ||
1582                     format == AC_FETCH_FORMAT_UINT;
1583 
1584       for (unsigned chan = 0; chan < num_channels; ++chan) {
1585          if (unsign) {
1586             tmp = LLVMBuildUIToFP(ctx->builder, loads[chan], ctx->f32, "");
1587          } else {
1588             tmp = LLVMBuildSIToFP(ctx->builder, loads[chan], ctx->f32, "");
1589          }
1590 
1591          LLVMValueRef scale = NULL;
1592          if (format == AC_FETCH_FORMAT_FIXED) {
1593             assert(log_size == 2);
1594             scale = LLVMConstReal(ctx->f32, 1.0 / 0x10000);
1595          } else if (format == AC_FETCH_FORMAT_UNORM) {
1596             unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan]));
1597             scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << bits) - 1));
1598          } else if (format == AC_FETCH_FORMAT_SNORM) {
1599             unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan]));
1600             scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << (bits - 1)) - 1));
1601          }
1602          if (scale)
1603             tmp = LLVMBuildFMul(ctx->builder, tmp, scale, "");
1604 
1605          if (format == AC_FETCH_FORMAT_SNORM) {
1606             /* Clamp to [-1, 1] */
1607             LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
1608             LLVMValueRef clamp = LLVMBuildFCmp(ctx->builder, LLVMRealULT, tmp, neg_one, "");
1609             tmp = LLVMBuildSelect(ctx->builder, clamp, neg_one, tmp, "");
1610          }
1611 
1612          loads[chan] = ac_to_integer(ctx, tmp);
1613       }
1614    }
1615 
1616    while (num_channels < 4) {
1617       if (format == AC_FETCH_FORMAT_UINT || format == AC_FETCH_FORMAT_SINT) {
1618          loads[num_channels] = num_channels == 3 ? ctx->i32_1 : ctx->i32_0;
1619       } else {
1620          loads[num_channels] = ac_to_integer(ctx, num_channels == 3 ? ctx->f32_1 : ctx->f32_0);
1621       }
1622       num_channels++;
1623    }
1624 
1625    if (reverse) {
1626       tmp = loads[0];
1627       loads[0] = loads[2];
1628       loads[2] = tmp;
1629    }
1630 
1631    return ac_build_gather_values(ctx, loads, 4);
1632 }
1633 
1634 static void ac_build_tbuffer_store(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
1635                                    LLVMValueRef vdata, LLVMValueRef vindex, LLVMValueRef voffset,
1636                                    LLVMValueRef soffset, LLVMValueRef immoffset,
1637                                    unsigned num_channels, unsigned dfmt, unsigned nfmt,
1638                                    unsigned cache_policy, bool structurized)
1639 {
1640    voffset = LLVMBuildAdd(ctx->builder, voffset ? voffset : ctx->i32_0, immoffset, "");
1641 
1642    LLVMValueRef args[7];
1643    int idx = 0;
1644    args[idx++] = vdata;
1645    args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
1646    if (structurized)
1647       args[idx++] = vindex ? vindex : ctx->i32_0;
1648    args[idx++] = voffset ? voffset : ctx->i32_0;
1649    args[idx++] = soffset ? soffset : ctx->i32_0;
1650    args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx->chip_class, dfmt, nfmt), 0);
1651    args[idx++] = LLVMConstInt(ctx->i32, cache_policy, 0);
1652    unsigned func =
1653       !ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels;
1654    const char *indexing_kind = structurized ? "struct" : "raw";
1655    char name[256], type_name[8];
1656 
1657    LLVMTypeRef type = func > 1 ? LLVMVectorType(ctx->i32, func) : ctx->i32;
1658    ac_build_type_name_for_intr(type, type_name, sizeof(type_name));
1659 
1660    snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.store.%s", indexing_kind, type_name);
1661 
1662    ac_build_intrinsic(ctx, name, ctx->voidt, args, idx, AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
1663 }
1664 
1665 void ac_build_struct_tbuffer_store(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
1666                                    LLVMValueRef vdata, LLVMValueRef vindex, LLVMValueRef voffset,
1667                                    LLVMValueRef soffset, LLVMValueRef immoffset,
1668                                    unsigned num_channels, unsigned dfmt, unsigned nfmt,
1669                                    unsigned cache_policy)
1670 {
1671    ac_build_tbuffer_store(ctx, rsrc, vdata, vindex, voffset, soffset, immoffset, num_channels, dfmt,
1672                           nfmt, cache_policy, true);
1673 }
1674 
1675 void ac_build_raw_tbuffer_store(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
1676                                 LLVMValueRef voffset, LLVMValueRef soffset, LLVMValueRef immoffset,
1677                                 unsigned num_channels, unsigned dfmt, unsigned nfmt,
1678                                 unsigned cache_policy)
1679 {
1680    ac_build_tbuffer_store(ctx, rsrc, vdata, NULL, voffset, soffset, immoffset, num_channels, dfmt,
1681                           nfmt, cache_policy, false);
1682 }
1683 
1684 void ac_build_tbuffer_store_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
1685                                   LLVMValueRef vdata, LLVMValueRef voffset, LLVMValueRef soffset,
1686                                   unsigned cache_policy)
1687 {
1688    vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i16, "");
1689 
1690    if (LLVM_VERSION_MAJOR >= 9) {
1691       /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
1692       ac_build_buffer_store_common(ctx, rsrc, vdata, NULL, voffset, soffset, cache_policy, false,
1693                                    false);
1694    } else {
1695       unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_16;
1696       unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
1697 
1698       vdata = LLVMBuildZExt(ctx->builder, vdata, ctx->i32, "");
1699 
1700       ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset, ctx->i32_0, 1, dfmt, nfmt,
1701                                  cache_policy);
1702    }
1703 }
1704 
1705 void ac_build_tbuffer_store_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
1706                                  LLVMValueRef voffset, LLVMValueRef soffset, unsigned cache_policy)
1707 {
1708    vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i8, "");
1709 
1710    if (LLVM_VERSION_MAJOR >= 9) {
1711       /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
1712       ac_build_buffer_store_common(ctx, rsrc, vdata, NULL, voffset, soffset, cache_policy, false,
1713                                    false);
1714    } else {
1715       unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_8;
1716       unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
1717 
1718       vdata = LLVMBuildZExt(ctx->builder, vdata, ctx->i32, "");
1719 
1720       ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset, ctx->i32_0, 1, dfmt, nfmt,
1721                                  cache_policy);
1722    }
1723    }

1724 /**
1725  * Set range metadata on an instruction.  This can only be used on load and
1726  * call instructions.  If you know an instruction can only produce the values
1727  * 0, 1, 2, you would do set_range_metadata(value, 0, 3);
1728  * \p lo is the minimum value inclusive.
1729  * \p hi is the maximum value exclusive.
1730  */
1731 static void set_range_metadata(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned lo,
1732                                unsigned hi)
1733 {
1734    LLVMValueRef range_md, md_args[2];
1735    LLVMTypeRef type = LLVMTypeOf(value);
1736    LLVMContextRef context = LLVMGetTypeContext(type);
1737 
1738    md_args[0] = LLVMConstInt(type, lo, false);
1739    md_args[1] = LLVMConstInt(type, hi, false);
1740    range_md = LLVMMDNodeInContext(context, md_args, 2);
1741    LLVMSetMetadata(value, ctx->range_md_kind, range_md);
1742 }
1743 
1744 LLVMValueRef ac_get_thread_id(struct ac_llvm_context *ctx)
1745 {
1746    LLVMValueRef tid;
1747 
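   /* mbcnt.lo(~0, 0) counts the lanes below the current one within the low
    * 32 lanes; mbcnt.hi then adds the count from the high 32 lanes, so the
    * result is the lane index within the wave. */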
1748    LLVMValueRef tid_args[2];
1749    tid_args[0] = LLVMConstInt(ctx->i32, 0xffffffff, false);
1750    tid_args[1] = ctx->i32_0;
1751    tid_args[1] =
1752       ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32, tid_args, 2, AC_FUNC_ATTR_READNONE);
1753 
1754    if (ctx->wave_size == 32) {
1755       tid = tid_args[1];
1756    } else {
1757       tid = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", ctx->i32, tid_args, 2,
1758                                AC_FUNC_ATTR_READNONE);
1759    }
1760    set_range_metadata(ctx, tid, 0, ctx->wave_size);
1761    return tid;
1762 }
1763 
1764 /*
1765  * AMD GCN implements derivatives using the local data store (LDS).
1766  * All writes to the LDS happen in all executing threads at
1767  * the same time. TID is the Thread ID for the current
1768  * thread and is a value between 0 and wave_size - 1 (0..63 on wave64),
1769  * representing the thread's position in the wavefront.
1770  *
1771  * For the pixel shader, threads are grouped into quads of four pixels.
1772  * The TIDs of the pixels of a quad are:
1773  *
1774  *  +------+------+
1775  *  |4n + 0|4n + 1|
1776  *  +------+------+
1777  *  |4n + 2|4n + 3|
1778  *  +------+------+
1779  *
1780  * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
1781  * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
1782  * the current pixel's column, and masking with 0xfffffffe yields the TID
1783  * of the left pixel of the current pixel's row.
1784  *
1785  * Adding 1 yields the TID of the pixel to the right of the left pixel, and
1786  * adding 2 yields the TID of the pixel below the top pixel.
1787  */
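/* For example, following the masking rules above, a fine ddx would pass
 * mask = 0xfffffffe with idx = 1 (right pixel minus left pixel of the row)
 * and a fine ddy would pass mask = 0xfffffffd with idx = 2 (bottom pixel
 * minus top pixel of the column); the coarse variants use mask = 0xfffffffc.
 */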
1788 LLVMValueRef ac_build_ddxy(struct ac_llvm_context *ctx, uint32_t mask, int idx, LLVMValueRef val)
1789 {
1790    unsigned tl_lanes[4], trbl_lanes[4];
1791    char name[32], type[8];
1792    LLVMValueRef tl, trbl;
1793    LLVMTypeRef result_type;
1794    LLVMValueRef result;
1795 
1796    result_type = ac_to_float_type(ctx, LLVMTypeOf(val));
1797 
1798    if (result_type == ctx->f16)
1799       val = LLVMBuildZExt(ctx->builder, val, ctx->i32, "");
1800    else if (result_type == ctx->v2f16)
1801       val = LLVMBuildBitCast(ctx->builder, val, ctx->i32, "");
1802 
1803    for (unsigned i = 0; i < 4; ++i) {
1804       tl_lanes[i] = i & mask;
1805       trbl_lanes[i] = (i & mask) + idx;
1806    }
1807 
1808    tl = ac_build_quad_swizzle(ctx, val, tl_lanes[0], tl_lanes[1], tl_lanes[2], tl_lanes[3]);
1809    trbl =
1810       ac_build_quad_swizzle(ctx, val, trbl_lanes[0], trbl_lanes[1], trbl_lanes[2], trbl_lanes[3]);
1811 
1812    if (result_type == ctx->f16) {
1813       tl = LLVMBuildTrunc(ctx->builder, tl, ctx->i16, "");
1814       trbl = LLVMBuildTrunc(ctx->builder, trbl, ctx->i16, "");
1815    }
1816 
1817    tl = LLVMBuildBitCast(ctx->builder, tl, result_type, "");
1818    trbl = LLVMBuildBitCast(ctx->builder, trbl, result_type, "");
1819    result = LLVMBuildFSub(ctx->builder, trbl, tl, "");
1820 
1821    ac_build_type_name_for_intr(result_type, type, sizeof(type));
1822    snprintf(name, sizeof(name), "llvm.amdgcn.wqm.%s", type);
1823 
1824    return ac_build_intrinsic(ctx, name, result_type, &result, 1, 0);
1825 }
1826 
1827 void ac_build_sendmsg(struct ac_llvm_context *ctx, uint32_t msg, LLVMValueRef wave_id)
1828 {
1829    LLVMValueRef args[2];
1830    args[0] = LLVMConstInt(ctx->i32, msg, false);
1831    args[1] = wave_id;
1832    ac_build_intrinsic(ctx, "llvm.amdgcn.s.sendmsg", ctx->voidt, args, 2, 0);
1833 }
1834 
1835 LLVMValueRef ac_build_imsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type)
1836 {
1837    LLVMValueRef msb =
1838       ac_build_intrinsic(ctx, "llvm.amdgcn.sffbh.i32", dst_type, &arg, 1, AC_FUNC_ATTR_READNONE);
1839 
1840    /* The HW returns the bit position counted from the MSB, but NIR/TGSI
1841     * wants it counted from the LSB. Convert it by computing "31 - msb". */
1842    msb = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, 31, false), msb, "");
1843 
1844    LLVMValueRef all_ones = LLVMConstInt(ctx->i32, -1, true);
1845    LLVMValueRef cond =
1846       LLVMBuildOr(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, ctx->i32_0, ""),
1847                   LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, all_ones, ""), "");
1848 
1849    return LLVMBuildSelect(ctx->builder, cond, all_ones, msb, "");
1850 }
1851 
1852 LLVMValueRef ac_build_umsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type)
1853 {
1854    const char *intrin_name;
1855    LLVMTypeRef type;
1856    LLVMValueRef highest_bit;
1857    LLVMValueRef zero;
1858    unsigned bitsize;
1859 
1860    bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(arg));
1861    switch (bitsize) {
1862    case 64:
1863       intrin_name = "llvm.ctlz.i64";
1864       type = ctx->i64;
1865       highest_bit = LLVMConstInt(ctx->i64, 63, false);
1866       zero = ctx->i64_0;
1867       break;
1868    case 32:
1869       intrin_name = "llvm.ctlz.i32";
1870       type = ctx->i32;
1871       highest_bit = LLVMConstInt(ctx->i32, 31, false);
1872       zero = ctx->i32_0;
1873       break;
1874    case 16:
1875       intrin_name = "llvm.ctlz.i16";
1876       type = ctx->i16;
1877       highest_bit = LLVMConstInt(ctx->i16, 15, false);
1878       zero = ctx->i16_0;
1879       break;
1880    case 8:
1881       intrin_name = "llvm.ctlz.i8";
1882       type = ctx->i8;
1883       highest_bit = LLVMConstInt(ctx->i8, 7, false);
1884       zero = ctx->i8_0;
1885       break;
1886    default:
1887       unreachable("invalid bitsize");
1888       break;
1889    }
1890 
1891    LLVMValueRef params[2] = {
1892       arg,
1893       ctx->i1true,
1894    };
1895 
1896    LLVMValueRef msb = ac_build_intrinsic(ctx, intrin_name, type, params, 2, AC_FUNC_ATTR_READNONE);
1897 
1898    /* The HW returns the bit position counted from the MSB, but TGSI/NIR wants
1899     * it counted from the LSB. Convert it by computing "(bitsize - 1) - msb". */
1900    msb = LLVMBuildSub(ctx->builder, highest_bit, msb, "");
1901 
1902    if (bitsize == 64) {
1903       msb = LLVMBuildTrunc(ctx->builder, msb, ctx->i32, "");
1904    } else if (bitsize < 32) {
1905       msb = LLVMBuildSExt(ctx->builder, msb, ctx->i32, "");
1906    }
1907 
1908    /* check for zero */
1909    return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, zero, ""),
1910                           LLVMConstInt(ctx->i32, -1, true), msb, "");
1911 }
1912 
1913 LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
1914 {
1915    char name[64], type[64];
1916 
1917    ac_build_type_name_for_intr(LLVMTypeOf(a), type, sizeof(type));
1918    snprintf(name, sizeof(name), "llvm.minnum.%s", type);
1919    LLVMValueRef args[2] = {a, b};
1920    return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2, AC_FUNC_ATTR_READNONE);
1921 }
1922 
1923 LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
1924 {
1925    char name[64], type[64];
1926 
1927    ac_build_type_name_for_intr(LLVMTypeOf(a), type, sizeof(type));
1928    snprintf(name, sizeof(name), "llvm.maxnum.%s", type);
1929    LLVMValueRef args[2] = {a, b};
1930    return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2, AC_FUNC_ATTR_READNONE);
1931 }
1932 
1933 LLVMValueRef ac_build_imin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
1934 {
1935    LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSLE, a, b, "");
1936    return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
1937 }
1938 
1939 LLVMValueRef ac_build_imax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
1940 {
1941    LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, a, b, "");
1942    return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
1943 }
1944 
1945 LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
1946 {
1947    LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntULE, a, b, "");
1948    return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
1949 }
1950 
1951 LLVMValueRef ac_build_umax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
1952 {
1953    LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, a, b, "");
1954    return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
1955 }
1956 
1957 LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value)
1958 {
1959    LLVMTypeRef t = LLVMTypeOf(value);
1960    return ac_build_fmin(ctx, ac_build_fmax(ctx, value, LLVMConstReal(t, 0.0)),
1961                         LLVMConstReal(t, 1.0));
1962 }
1963 
1964 void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a)
1965 {
1966    LLVMValueRef args[9];
1967 
1968    args[0] = LLVMConstInt(ctx->i32, a->target, 0);
1969    args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);
1970 
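   /* Compressed exports pack two 16-bit values per 32-bit channel, so only
    * out[0] and out[1] carry data. */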
1971    if (a->compr) {
1972       args[2] = LLVMBuildBitCast(ctx->builder, a->out[0], ctx->v2i16, "");
1973       args[3] = LLVMBuildBitCast(ctx->builder, a->out[1], ctx->v2i16, "");
1974       args[4] = LLVMConstInt(ctx->i1, a->done, 0);
1975       args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
1976 
1977       ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16", ctx->voidt, args, 6, 0);
1978    } else {
1979       args[2] = a->out[0];
1980       args[3] = a->out[1];
1981       args[4] = a->out[2];
1982       args[5] = a->out[3];
1983       args[6] = LLVMConstInt(ctx->i1, a->done, 0);
1984       args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
1985 
1986       ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32", ctx->voidt, args, 8, 0);
1987    }
1988 }
1989 
1990 void ac_build_export_null(struct ac_llvm_context *ctx)
1991 {
1992    struct ac_export_args args;
1993 
1994    args.enabled_channels = 0x0; /* enabled channels */
1995    args.valid_mask = 1;         /* whether the EXEC mask is valid */
1996    args.done = 1;               /* DONE bit */
1997    args.target = V_008DFC_SQ_EXP_NULL;
1998    args.compr = 0;                       /* COMPR flag (0 = 32-bit export) */
1999    args.out[0] = LLVMGetUndef(ctx->f32); /* R */
2000    args.out[1] = LLVMGetUndef(ctx->f32); /* G */
2001    args.out[2] = LLVMGetUndef(ctx->f32); /* B */
2002    args.out[3] = LLVMGetUndef(ctx->f32); /* A */
2003 
2004    ac_build_export(ctx, &args);
2005 }
2006 
2007 static unsigned ac_num_coords(enum ac_image_dim dim)
2008 {
2009    switch (dim) {
2010    case ac_image_1d:
2011       return 1;
2012    case ac_image_2d:
2013    case ac_image_1darray:
2014       return 2;
2015    case ac_image_3d:
2016    case ac_image_cube:
2017    case ac_image_2darray:
2018    case ac_image_2dmsaa:
2019       return 3;
2020    case ac_image_2darraymsaa:
2021       return 4;
2022    default:
2023       unreachable("ac_num_coords: bad dim");
2024    }
2025 }
2026 
2027 static unsigned ac_num_derivs(enum ac_image_dim dim)
2028 {
2029    switch (dim) {
2030    case ac_image_1d:
2031    case ac_image_1darray:
2032       return 2;
2033    case ac_image_2d:
2034    case ac_image_2darray:
2035    case ac_image_cube:
2036       return 4;
2037    case ac_image_3d:
2038       return 6;
2039    case ac_image_2dmsaa:
2040    case ac_image_2darraymsaa:
2041    default:
2042       unreachable("derivatives not supported");
2043    }
2044 }
2045 
2046 static const char *get_atomic_name(enum ac_atomic_op op)
2047 {
2048    switch (op) {
2049    case ac_atomic_swap:
2050       return "swap";
2051    case ac_atomic_add:
2052       return "add";
2053    case ac_atomic_sub:
2054       return "sub";
2055    case ac_atomic_smin:
2056       return "smin";
2057    case ac_atomic_umin:
2058       return "umin";
2059    case ac_atomic_smax:
2060       return "smax";
2061    case ac_atomic_umax:
2062       return "umax";
2063    case ac_atomic_and:
2064       return "and";
2065    case ac_atomic_or:
2066       return "or";
2067    case ac_atomic_xor:
2068       return "xor";
2069    case ac_atomic_inc_wrap:
2070       return "inc";
2071    case ac_atomic_dec_wrap:
2072       return "dec";
2073    }
2074    unreachable("bad atomic op");
2075 }
2076 
2077 LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, struct ac_image_args *a)
2078 {
2079    const char *overload[3] = {"", "", ""};
2080    unsigned num_overloads = 0;
2081    LLVMValueRef args[18];
2082    unsigned num_args = 0;
2083    enum ac_image_dim dim = a->dim;
2084 
2085    assert(!a->lod || a->lod == ctx->i32_0 || a->lod == ctx->f32_0 || !a->level_zero);
2086    assert((a->opcode != ac_image_get_resinfo && a->opcode != ac_image_load_mip &&
2087            a->opcode != ac_image_store_mip) ||
2088           a->lod);
2089    assert(a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
2090           (!a->compare && !a->offset));
2091    assert((a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
2092            a->opcode == ac_image_get_lod) ||
2093           !a->bias);
2094    assert((a->bias ? 1 : 0) + (a->lod ? 1 : 0) + (a->level_zero ? 1 : 0) + (a->derivs[0] ? 1 : 0) <=
2095           1);
2096    assert((a->min_lod ? 1 : 0) + (a->lod ? 1 : 0) + (a->level_zero ? 1 : 0) <= 1);
2097    assert(!a->d16 || (ctx->chip_class >= GFX8 && a->opcode != ac_image_atomic &&
2098                       a->opcode != ac_image_atomic_cmpswap && a->opcode != ac_image_get_lod &&
2099                       a->opcode != ac_image_get_resinfo));
2100 
2101    if (a->opcode == ac_image_get_lod) {
2102       switch (dim) {
2103       case ac_image_1darray:
2104          dim = ac_image_1d;
2105          break;
2106       case ac_image_2darray:
2107       case ac_image_cube:
2108          dim = ac_image_2d;
2109          break;
2110       default:
2111          break;
2112       }
2113    }
2114 
2115    bool sample = a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
2116                  a->opcode == ac_image_get_lod;
2117    bool atomic = a->opcode == ac_image_atomic || a->opcode == ac_image_atomic_cmpswap;
2118    bool load = a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
2119                a->opcode == ac_image_load || a->opcode == ac_image_load_mip;
2120    LLVMTypeRef coord_type = sample ? ctx->f32 : ctx->i32;
2121    uint8_t dmask = a->dmask;
2122    LLVMTypeRef data_type;
2123    char data_type_str[8];
2124 
2125    if (atomic) {
2126       data_type = LLVMTypeOf(a->data[0]);
2127    } else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
2128       /* Image stores might have been shrunk using the format. */
2129       data_type = LLVMTypeOf(a->data[0]);
2130       dmask = (1 << ac_get_llvm_num_components(a->data[0])) - 1;
2131    } else {
2132       data_type = a->d16 ? ctx->v4f16 : ctx->v4f32;
2133    }
2134 
2135    if (atomic || a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
2136       args[num_args++] = a->data[0];
2137       if (a->opcode == ac_image_atomic_cmpswap)
2138          args[num_args++] = a->data[1];
2139    }
2140 
2141    if (!atomic)
2142       args[num_args++] = LLVMConstInt(ctx->i32, dmask, false);
2143 
2144    if (a->offset)
2145       args[num_args++] = ac_to_integer(ctx, a->offset);
2146    if (a->bias) {
2147       args[num_args++] = ac_to_float(ctx, a->bias);
2148       overload[num_overloads++] = ".f32";
2149    }
2150    if (a->compare)
2151       args[num_args++] = ac_to_float(ctx, a->compare);
2152    if (a->derivs[0]) {
2153       unsigned count = ac_num_derivs(dim);
2154       for (unsigned i = 0; i < count; ++i)
2155          args[num_args++] = ac_to_float(ctx, a->derivs[i]);
2156       overload[num_overloads++] = ".f32";
2157    }
2158    unsigned num_coords = a->opcode != ac_image_get_resinfo ? ac_num_coords(dim) : 0;
2159    for (unsigned i = 0; i < num_coords; ++i)
2160       args[num_args++] = LLVMBuildBitCast(ctx->builder, a->coords[i], coord_type, "");
2161    if (a->lod)
2162       args[num_args++] = LLVMBuildBitCast(ctx->builder, a->lod, coord_type, "");
2163    if (a->min_lod)
2164       args[num_args++] = LLVMBuildBitCast(ctx->builder, a->min_lod, coord_type, "");
2165 
2166    overload[num_overloads++] = sample ? ".f32" : ".i32";
2167 
2168    args[num_args++] = a->resource;
2169    if (sample) {
2170       args[num_args++] = a->sampler;
2171       args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, false);
2172    }
2173 
2174    args[num_args++] = ctx->i32_0; /* texfailctrl */
2175    args[num_args++] = LLVMConstInt(
2176       ctx->i32, load ? get_load_cache_policy(ctx, a->cache_policy) : a->cache_policy, false);
2177 
2178    const char *name;
2179    const char *atomic_subop = "";
2180    switch (a->opcode) {
2181    case ac_image_sample:
2182       name = "sample";
2183       break;
2184    case ac_image_gather4:
2185       name = "gather4";
2186       break;
2187    case ac_image_load:
2188       name = "load";
2189       break;
2190    case ac_image_load_mip:
2191       name = "load.mip";
2192       break;
2193    case ac_image_store:
2194       name = "store";
2195       break;
2196    case ac_image_store_mip:
2197       name = "store.mip";
2198       break;
2199    case ac_image_atomic:
2200       name = "atomic.";
2201       atomic_subop = get_atomic_name(a->atomic);
2202       break;
2203    case ac_image_atomic_cmpswap:
2204       name = "atomic.";
2205       atomic_subop = "cmpswap";
2206       break;
2207    case ac_image_get_lod:
2208       name = "getlod";
2209       break;
2210    case ac_image_get_resinfo:
2211       name = "getresinfo";
2212       break;
2213    default:
2214       unreachable("invalid image opcode");
2215    }
2216 
2217    const char *dimname;
2218    switch (dim) {
2219    case ac_image_1d:
2220       dimname = "1d";
2221       break;
2222    case ac_image_2d:
2223       dimname = "2d";
2224       break;
2225    case ac_image_3d:
2226       dimname = "3d";
2227       break;
2228    case ac_image_cube:
2229       dimname = "cube";
2230       break;
2231    case ac_image_1darray:
2232       dimname = "1darray";
2233       break;
2234    case ac_image_2darray:
2235       dimname = "2darray";
2236       break;
2237    case ac_image_2dmsaa:
2238       dimname = "2dmsaa";
2239       break;
2240    case ac_image_2darraymsaa:
2241       dimname = "2darraymsaa";
2242       break;
2243    default:
2244       unreachable("invalid dim");
2245    }
2246 
2247    ac_build_type_name_for_intr(data_type, data_type_str, sizeof(data_type_str));
2248 
2249    bool lod_suffix = a->lod && (a->opcode == ac_image_sample || a->opcode == ac_image_gather4);
2250    char intr_name[96];
2251    snprintf(intr_name, sizeof(intr_name),
2252             "llvm.amdgcn.image.%s%s" /* base name */
2253             "%s%s%s%s"               /* sample/gather modifiers */
2254             ".%s.%s%s%s%s",          /* dimension and type overloads */
2255             name, atomic_subop, a->compare ? ".c" : "",
2256             a->bias ? ".b" : lod_suffix ? ".l" : a->derivs[0] ? ".d" : a->level_zero ? ".lz" : "",
2257             a->min_lod ? ".cl" : "", a->offset ? ".o" : "", dimname,
2258             data_type_str, overload[0], overload[1], overload[2]);
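   /* e.g. a 2D depth-compare sample with a forced LOD of zero becomes
    * "llvm.amdgcn.image.sample.c.lz.2d.v4f32.f32". */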
2259 
2260    LLVMTypeRef retty;
2261    if (atomic)
2262       retty = data_type;
2263    else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip)
2264       retty = ctx->voidt;
2265    else
2266       retty = a->d16 ? ctx->v4f16 : ctx->v4f32;
2267 
2268    LLVMValueRef result = ac_build_intrinsic(ctx, intr_name, retty, args, num_args, a->attributes);
2269    if (!sample && !atomic && retty != ctx->voidt)
2270       result = ac_to_integer(ctx, result);
2271 
2272    return result;
2273 }
2274 
2275 LLVMValueRef ac_build_image_get_sample_count(struct ac_llvm_context *ctx, LLVMValueRef rsrc)
2276 {
2277    LLVMValueRef samples;
2278 
2279    /* Read the samples from the descriptor directly.
2280     * Hardware doesn't have any instruction for this.
2281     */
2282    samples = LLVMBuildExtractElement(ctx->builder, rsrc, LLVMConstInt(ctx->i32, 3, 0), "");
2283    samples = LLVMBuildLShr(ctx->builder, samples, LLVMConstInt(ctx->i32, 16, 0), "");
2284    samples = LLVMBuildAnd(ctx->builder, samples, LLVMConstInt(ctx->i32, 0xf, 0), "");
2285    samples = LLVMBuildShl(ctx->builder, ctx->i32_1, samples, "");
2286    return samples;
2287 }
2288 
2289 LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
2290 {
2291    return ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz", ctx->v2f16, args, 2,
2292                              AC_FUNC_ATTR_READNONE);
2293 }
2294 
2295 LLVMValueRef ac_build_cvt_pknorm_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
2296 {
2297    LLVMValueRef res = ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.i16", ctx->v2i16, args, 2,
2298                                          AC_FUNC_ATTR_READNONE);
2299    return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2300 }
2301 
2302 LLVMValueRef ac_build_cvt_pknorm_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
2303 {
2304    LLVMValueRef res = ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.u16", ctx->v2i16, args, 2,
2305                                          AC_FUNC_ATTR_READNONE);
2306    return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2307 }
2308 
2309 LLVMValueRef ac_build_cvt_pknorm_i16_f16(struct ac_llvm_context *ctx,
2310                                          LLVMValueRef args[2])
2311 {
2312    LLVMTypeRef param_types[] = {ctx->f16, ctx->f16};
2313    LLVMTypeRef calltype = LLVMFunctionType(ctx->i32, param_types, 2, false);
2314    LLVMValueRef code = LLVMConstInlineAsm(calltype,
2315                                           "v_cvt_pknorm_i16_f16 $0, $1, $2", "=v,v,v",
2316                                           false, false);
2317    return LLVMBuildCall(ctx->builder, code, args, 2, "");
2318 }
2319 
2320 LLVMValueRef ac_build_cvt_pknorm_u16_f16(struct ac_llvm_context *ctx,
2321                                          LLVMValueRef args[2])
2322 {
2323    LLVMTypeRef param_types[] = {ctx->f16, ctx->f16};
2324    LLVMTypeRef calltype = LLVMFunctionType(ctx->i32, param_types, 2, false);
2325    LLVMValueRef code = LLVMConstInlineAsm(calltype,
2326                                           "v_cvt_pknorm_u16_f16 $0, $1, $2", "=v,v,v",
2327                                           false, false);
2328    return LLVMBuildCall(ctx->builder, code, args, 2, "");
2329 }
2330 
2331 /* The 8-bit and 10-bit clamping is for HW workarounds. */
2332 LLVMValueRef ac_build_cvt_pk_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits,
2333                                  bool hi)
2334 {
2335    assert(bits == 8 || bits == 10 || bits == 16);
2336 
2337    LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, bits == 8 ? 127 : bits == 10 ? 511 : 32767, 0);
2338    LLVMValueRef min_rgb = LLVMConstInt(ctx->i32, bits == 8 ? -128 : bits == 10 ? -512 : -32768, 0);
2339    LLVMValueRef max_alpha = bits != 10 ? max_rgb : ctx->i32_1;
2340    LLVMValueRef min_alpha = bits != 10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);
2341 
2342    /* Clamp. */
2343    if (bits != 16) {
2344       for (int i = 0; i < 2; i++) {
2345          bool alpha = hi && i == 1;
2346          args[i] = ac_build_imin(ctx, args[i], alpha ? max_alpha : max_rgb);
2347          args[i] = ac_build_imax(ctx, args[i], alpha ? min_alpha : min_rgb);
2348       }
2349    }
2350 
2351    LLVMValueRef res =
2352       ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.i16", ctx->v2i16, args, 2, AC_FUNC_ATTR_READNONE);
2353    return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2354 }
2355 
2356 /* The 8-bit and 10-bit clamping is for HW workarounds. */
2357 LLVMValueRef ac_build_cvt_pk_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits,
2358                                  bool hi)
2359 {
2360    assert(bits == 8 || bits == 10 || bits == 16);
2361 
2362    LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, bits == 8 ? 255 : bits == 10 ? 1023 : 65535, 0);
2363    LLVMValueRef max_alpha = bits != 10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);
2364 
2365    /* Clamp. */
2366    if (bits != 16) {
2367       for (int i = 0; i < 2; i++) {
2368          bool alpha = hi && i == 1;
2369          args[i] = ac_build_umin(ctx, args[i], alpha ? max_alpha : max_rgb);
2370       }
2371    }
2372 
2373    LLVMValueRef res =
2374       ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.u16", ctx->v2i16, args, 2, AC_FUNC_ATTR_READNONE);
2375    return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2376 }
2377 
2378 LLVMValueRef ac_build_wqm_vote(struct ac_llvm_context *ctx, LLVMValueRef i1)
2379 {
2380    return ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.vote", ctx->i1, &i1, 1, AC_FUNC_ATTR_READNONE);
2381 }
2382 
2383 void ac_build_kill_if_false(struct ac_llvm_context *ctx, LLVMValueRef i1)
2384 {
2385    ac_build_intrinsic(ctx, "llvm.amdgcn.kill", ctx->voidt, &i1, 1, 0);
2386 }
2387 
2388 LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input, LLVMValueRef offset,
2389                           LLVMValueRef width, bool is_signed)
2390 {
2391    LLVMValueRef args[] = {
2392       input,
2393       offset,
2394       width,
2395    };
2396 
2397    return ac_build_intrinsic(ctx, is_signed ? "llvm.amdgcn.sbfe.i32" : "llvm.amdgcn.ubfe.i32",
2398                              ctx->i32, args, 3, AC_FUNC_ATTR_READNONE);
2399 }
2400 
2401 LLVMValueRef ac_build_imad(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1,
2402                            LLVMValueRef s2)
2403 {
2404    return LLVMBuildAdd(ctx->builder, LLVMBuildMul(ctx->builder, s0, s1, ""), s2, "");
2405 }
2406 
2407 LLVMValueRef ac_build_fmad(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1,
2408                            LLVMValueRef s2)
2409 {
2410    /* FMA is better on GFX10, because it has FMA units instead of MUL-ADD units. */
2411    if (ctx->chip_class >= GFX10) {
2412       return ac_build_intrinsic(ctx, "llvm.fma.f32", ctx->f32, (LLVMValueRef[]){s0, s1, s2}, 3,
2413                                 AC_FUNC_ATTR_READNONE);
2414    }
2415 
2416    return LLVMBuildFAdd(ctx->builder, LLVMBuildFMul(ctx->builder, s0, s1, ""), s2, "");
2417 }
2418 
2419 void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned wait_flags)
2420 {
2421    if (!wait_flags)
2422       return;
2423 
2424    unsigned lgkmcnt = 63;
2425    unsigned vmcnt = ctx->chip_class >= GFX9 ? 63 : 15;
2426    unsigned vscnt = 63;
2427 
2428    if (wait_flags & AC_WAIT_LGKM)
2429       lgkmcnt = 0;
2430    if (wait_flags & AC_WAIT_VLOAD)
2431       vmcnt = 0;
2432 
2433    if (wait_flags & AC_WAIT_VSTORE) {
2434       if (ctx->chip_class >= GFX10)
2435          vscnt = 0;
2436       else
2437          vmcnt = 0;
2438    }
2439 
2440    /* There is no intrinsic for vscnt(0), so use a fence. */
2441    if ((wait_flags & AC_WAIT_LGKM && wait_flags & AC_WAIT_VLOAD && wait_flags & AC_WAIT_VSTORE) ||
2442        vscnt == 0) {
2443       LLVMBuildFence(ctx->builder, LLVMAtomicOrderingRelease, false, "");
2444       return;
2445    }
2446 
2447    unsigned simm16 = (lgkmcnt << 8) | (7 << 4) | /* expcnt */
2448                      (vmcnt & 0xf) | ((vmcnt >> 4) << 14);
2449 
2450    LLVMValueRef args[1] = {
2451       LLVMConstInt(ctx->i32, simm16, false),
2452    };
2453    ac_build_intrinsic(ctx, "llvm.amdgcn.s.waitcnt", ctx->voidt, args, 1, 0);
2454 }
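/* Worked example of the simm16 encoding above, assuming GFX9+ and
 * wait_flags == AC_WAIT_LGKM: lgkmcnt = 0, vmcnt stays 63, expcnt stays 7, so
 *
 *    simm16 = (0 << 8) | (7 << 4) | (63 & 0xf) | ((63 >> 4) << 14) = 0xc07f
 *
 * i.e. "s_waitcnt lgkmcnt(0)" with the vm/exp counters left at their maximum
 * (no-wait) values.
 */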
2455 
2456 LLVMValueRef ac_build_fsat(struct ac_llvm_context *ctx, LLVMValueRef src,
2457                            LLVMTypeRef type)
2458 {
2459    unsigned bitsize = ac_get_elem_bits(ctx, type);
2460    LLVMValueRef zero = LLVMConstReal(type, 0.0);
2461    LLVMValueRef one = LLVMConstReal(type, 1.0);
2462    LLVMValueRef result;
2463 
2464    if (bitsize == 64 || (bitsize == 16 && ctx->chip_class <= GFX8)) {
2465       /* Use fmin/fmax for 64-bit fsat or 16-bit on GFX6-GFX8 because LLVM
2466        * doesn't expose an intrinsic.
2467        */
2468       result = ac_build_fmin(ctx, ac_build_fmax(ctx, src, zero), one);
2469    } else {
2470       LLVMTypeRef type;
2471       char *intr;
2472 
2473       if (bitsize == 16) {
2474          intr = "llvm.amdgcn.fmed3.f16";
2475          type = ctx->f16;
2476       } else {
2477          assert(bitsize == 32);
2478          intr = "llvm.amdgcn.fmed3.f32";
2479          type = ctx->f32;
2480       }
2481 
2482       LLVMValueRef params[] = {
2483          zero,
2484          one,
2485          src,
2486       };
2487 
2488       result = ac_build_intrinsic(ctx, intr, type, params, 3,
2489                                   AC_FUNC_ATTR_READNONE);
2490    }
2491 
2492    if (ctx->chip_class < GFX9 && bitsize == 32) {
2493       /* Only pre-GFX9 chips do not flush denorms. */
2494       result = ac_build_canonicalize(ctx, result, bitsize);
2495    }
2496 
2497    return result;
2498 }
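/* The med3 trick above works because the median of {0, 1, x} is exactly
 * clamp(x, 0, 1) for non-NaN x, e.g. med3(0, 1, -0.3) = 0,
 * med3(0, 1, 0.4) = 0.4, med3(0, 1, 1.7) = 1.
 */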
2499 
2500 LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
2501 {
2502    LLVMTypeRef type;
2503    char *intr;
2504 
2505    if (bitsize == 16) {
2506       intr = "llvm.amdgcn.fract.f16";
2507       type = ctx->f16;
2508    } else if (bitsize == 32) {
2509       intr = "llvm.amdgcn.fract.f32";
2510       type = ctx->f32;
2511    } else {
2512       intr = "llvm.amdgcn.fract.f64";
2513       type = ctx->f64;
2514    }
2515 
2516    LLVMValueRef params[] = {
2517       src0,
2518    };
2519    return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE);
2520 }
2521 
2522 LLVMValueRef ac_const_uint_vec(struct ac_llvm_context *ctx, LLVMTypeRef type, uint64_t value)
2523 {
2524 
2525    if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
2526       LLVMValueRef scalar = LLVMConstInt(LLVMGetElementType(type), value, 0);
2527       unsigned vec_size = LLVMGetVectorSize(type);
2528       LLVMValueRef *scalars = alloca(vec_size * sizeof(LLVMValueRef));
2529 
2530       for (unsigned i = 0; i < vec_size; i++)
2531          scalars[i] = scalar;
2532       return LLVMConstVector(scalars, vec_size);
2533    }
2534    return LLVMConstInt(type, value, 0);
2535 }
2536 
2537 LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0)
2538 {
2539    LLVMTypeRef type = LLVMTypeOf(src0);
2540    LLVMValueRef val;
2541 
2542    /* v_med3 is selected only when max is first. (LLVM bug?) */
2543    val = ac_build_imax(ctx, src0, ac_const_uint_vec(ctx, type, -1));
2544    return ac_build_imin(ctx, val, ac_const_uint_vec(ctx, type, 1));
2545 }
2546 
2547 static LLVMValueRef ac_eliminate_negative_zero(struct ac_llvm_context *ctx, LLVMValueRef val)
2548 {
2549    ac_enable_signed_zeros(ctx);
2550    /* (val + 0) converts negative zero to positive zero. */
2551    val = LLVMBuildFAdd(ctx->builder, val, LLVMConstNull(LLVMTypeOf(val)), "");
2552    ac_disable_signed_zeros(ctx);
2553    return val;
2554 }
2555 
2556 LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src)
2557 {
2558    LLVMTypeRef type = LLVMTypeOf(src);
2559    LLVMValueRef pos, neg, dw[2], val;
2560    unsigned bitsize = ac_get_elem_bits(ctx, type);
2561 
2562    /* The standard version leads to this:
2563     *   v_cmp_ngt_f32_e64 s[0:1], s4, 0                       ; D40B0000 00010004
2564     *   v_cndmask_b32_e64 v4, 1.0, s4, s[0:1]                 ; D5010004 000008F2
2565     *   v_cmp_le_f32_e32 vcc, 0, v4                           ; 7C060880
2566     *   v_cndmask_b32_e32 v4, -1.0, v4, vcc                   ; 020808F3
2567     *
2568     * The isign version:
2569     *   v_add_f32_e64 v4, s4, 0                               ; D5030004 00010004
2570     *   v_med3_i32 v4, v4, -1, 1                              ; D5580004 02058304
2571     *   v_cvt_f32_i32_e32 v4, v4                              ; 7E080B04
2572     *
2573     * (src0 + 0) converts negative zero to positive zero.
2574     * After that, int(fsign(x)) == isign(floatBitsToInt(x)).
2575     *
2576     * For FP64, use the standard version, which doesn't suffer from the huge DP rate
2577     * reduction. (FP64 comparisons are as fast as int64 comparisons)
2578     */
2579    if (bitsize == 16 || bitsize == 32) {
2580       val = ac_to_integer(ctx, ac_eliminate_negative_zero(ctx, src));
2581       val = ac_build_isign(ctx, val);
2582       return LLVMBuildSIToFP(ctx->builder, val, type, "");
2583    }
2584 
2585    assert(bitsize == 64);
2586    pos = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src, ctx->f64_0, "");
2587    neg = LLVMBuildFCmp(ctx->builder, LLVMRealOLT, src, ctx->f64_0, "");
2588    dw[0] = ctx->i32_0;
2589    dw[1] = LLVMBuildSelect(
2590       ctx->builder, pos, LLVMConstInt(ctx->i32, 0x3FF00000, 0),
2591       LLVMBuildSelect(ctx->builder, neg, LLVMConstInt(ctx->i32, 0xBFF00000, 0), ctx->i32_0, ""),
2592       "");
2593    return LLVMBuildBitCast(ctx->builder, ac_build_gather_values(ctx, dw, 2), ctx->f64, "");
2594 }
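/* The 64-bit path above builds the result directly from dwords: +1.0 is
 * 0x3FF00000_00000000 and -1.0 is 0xBFF00000_00000000, so the low dword is
 * always 0 and only the high dword (sign + exponent) needs to be selected;
 * element 0 of the gathered pair becomes the low dword of the bitcast double.
 */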
2595 
2596 LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0)
2597 {
2598    LLVMValueRef result;
2599    unsigned bitsize;
2600 
2601    bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
2602 
2603    switch (bitsize) {
2604    case 128:
2605       result = ac_build_intrinsic(ctx, "llvm.ctpop.i128", ctx->i128, (LLVMValueRef[]){src0}, 1,
2606                                   AC_FUNC_ATTR_READNONE);
2607       result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
2608       break;
2609    case 64:
2610       result = ac_build_intrinsic(ctx, "llvm.ctpop.i64", ctx->i64, (LLVMValueRef[]){src0}, 1,
2611                                   AC_FUNC_ATTR_READNONE);
2612 
2613       result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
2614       break;
2615    case 32:
2616       result = ac_build_intrinsic(ctx, "llvm.ctpop.i32", ctx->i32, (LLVMValueRef[]){src0}, 1,
2617                                   AC_FUNC_ATTR_READNONE);
2618       break;
2619    case 16:
2620       result = ac_build_intrinsic(ctx, "llvm.ctpop.i16", ctx->i16, (LLVMValueRef[]){src0}, 1,
2621                                   AC_FUNC_ATTR_READNONE);
2622 
2623       result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
2624       break;
2625    case 8:
2626       result = ac_build_intrinsic(ctx, "llvm.ctpop.i8", ctx->i8, (LLVMValueRef[]){src0}, 1,
2627                                   AC_FUNC_ATTR_READNONE);
2628 
2629       result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
2630       break;
2631    default:
2632       unreachable("invalid bitsize");
2633       break;
2634    }
2635 
2636    return result;
2637 }
2638 
2639 LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx, LLVMValueRef src0)
2640 {
2641    LLVMValueRef result;
2642    unsigned bitsize;
2643 
2644    bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
2645 
2646    switch (bitsize) {
2647    case 64:
2648       result = ac_build_intrinsic(ctx, "llvm.bitreverse.i64", ctx->i64, (LLVMValueRef[]){src0}, 1,
2649                                   AC_FUNC_ATTR_READNONE);
2650 
2651       result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
2652       break;
2653    case 32:
2654       result = ac_build_intrinsic(ctx, "llvm.bitreverse.i32", ctx->i32, (LLVMValueRef[]){src0}, 1,
2655                                   AC_FUNC_ATTR_READNONE);
2656       break;
2657    case 16:
2658       result = ac_build_intrinsic(ctx, "llvm.bitreverse.i16", ctx->i16, (LLVMValueRef[]){src0}, 1,
2659                                   AC_FUNC_ATTR_READNONE);
2660 
2661       result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
2662       break;
2663    case 8:
2664       result = ac_build_intrinsic(ctx, "llvm.bitreverse.i8", ctx->i8, (LLVMValueRef[]){src0}, 1,
2665                                   AC_FUNC_ATTR_READNONE);
2666 
2667       result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
2668       break;
2669    default:
2670       unreachable("invalid bitsize");
2671       break;
2672    }
2673 
2674    return result;
2675 }
2676 
2677 #define AC_EXP_TARGET           0
2678 #define AC_EXP_ENABLED_CHANNELS 1
2679 #define AC_EXP_OUT0             2
2680 
2681 enum ac_ir_type
2682 {
2683    AC_IR_UNDEF,
2684    AC_IR_CONST,
2685    AC_IR_VALUE,
2686 };
2687 
2688 struct ac_vs_exp_chan {
2689    LLVMValueRef value;
2690    float const_float;
2691    enum ac_ir_type type;
2692 };
2693 
2694 struct ac_vs_exp_inst {
2695    unsigned offset;
2696    LLVMValueRef inst;
2697    struct ac_vs_exp_chan chan[4];
2698 };
2699 
2700 struct ac_vs_exports {
2701    unsigned num;
2702    struct ac_vs_exp_inst exp[VARYING_SLOT_MAX];
2703 };
2704 
2705 /* Return true if the PARAM export has been eliminated. */
2706 static bool ac_eliminate_const_output(uint8_t *vs_output_param_offset, uint32_t num_outputs,
2707                                       struct ac_vs_exp_inst *exp)
2708 {
2709    unsigned i, default_val; /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL */
2710    bool is_zero[4] = {0}, is_one[4] = {0};
2711 
2712    for (i = 0; i < 4; i++) {
2713       /* It's a constant expression. Undef outputs are eliminated too. */
2714       if (exp->chan[i].type == AC_IR_UNDEF) {
2715          is_zero[i] = true;
2716          is_one[i] = true;
2717       } else if (exp->chan[i].type == AC_IR_CONST) {
2718          if (exp->chan[i].const_float == 0)
2719             is_zero[i] = true;
2720          else if (exp->chan[i].const_float == 1)
2721             is_one[i] = true;
2722          else
2723             return false; /* other constant */
2724       } else
2725          return false;
2726    }
2727 
2728    /* Only certain combinations of 0 and 1 can be eliminated. */
2729    if (is_zero[0] && is_zero[1] && is_zero[2])
2730       default_val = is_zero[3] ? 0 : 1;
2731    else if (is_one[0] && is_one[1] && is_one[2])
2732       default_val = is_zero[3] ? 2 : 3;
2733    else
2734       return false;
2735 
2736    /* The PARAM export can be represented as DEFAULT_VAL. Kill it. */
2737    LLVMInstructionEraseFromParent(exp->inst);
2738 
2739    /* Change OFFSET to DEFAULT_VAL. */
2740    for (i = 0; i < num_outputs; i++) {
2741       if (vs_output_param_offset[i] == exp->offset) {
2742          vs_output_param_offset[i] = AC_EXP_PARAM_DEFAULT_VAL_0000 + default_val;
2743          break;
2744       }
2745    }
2746    return true;
2747 }
2748 
2749 static bool ac_eliminate_duplicated_output(struct ac_llvm_context *ctx,
2750                                            uint8_t *vs_output_param_offset, uint32_t num_outputs,
2751                                            struct ac_vs_exports *processed,
2752                                            struct ac_vs_exp_inst *exp)
2753 {
2754    unsigned p, copy_back_channels = 0;
2755 
2756    /* See if the output is already in the list of processed outputs.
2757     * The LLVMValueRef comparison relies on SSA.
2758     */
2759    for (p = 0; p < processed->num; p++) {
2760       bool different = false;
2761 
2762       for (unsigned j = 0; j < 4; j++) {
2763          struct ac_vs_exp_chan *c1 = &processed->exp[p].chan[j];
2764          struct ac_vs_exp_chan *c2 = &exp->chan[j];
2765 
2766          /* Treat undef as a match. */
2767          if (c2->type == AC_IR_UNDEF)
2768             continue;
2769 
2770          /* If c1 is undef but c2 isn't, we can copy c2 to c1
2771           * and consider the instruction duplicated.
2772           */
2773          if (c1->type == AC_IR_UNDEF) {
2774             copy_back_channels |= 1 << j;
2775             continue;
2776          }
2777 
2778          /* Test whether the channels are not equal. */
2779          if (c1->type != c2->type ||
2780              (c1->type == AC_IR_CONST && c1->const_float != c2->const_float) ||
2781              (c1->type == AC_IR_VALUE && c1->value != c2->value)) {
2782             different = true;
2783             break;
2784          }
2785       }
2786       if (!different)
2787          break;
2788 
2789       copy_back_channels = 0;
2790    }
2791    if (p == processed->num)
2792       return false;
2793 
2794    /* If a match was found, but the matching export has undef where the new
2795     * one has a normal value, copy the normal value to the undef channel.
2796     */
2797    struct ac_vs_exp_inst *match = &processed->exp[p];
2798 
2799    /* Get current enabled channels mask. */
2800    LLVMValueRef arg = LLVMGetOperand(match->inst, AC_EXP_ENABLED_CHANNELS);
2801    unsigned enabled_channels = LLVMConstIntGetZExtValue(arg);
2802 
2803    while (copy_back_channels) {
2804       unsigned chan = u_bit_scan(&copy_back_channels);
2805 
2806       assert(match->chan[chan].type == AC_IR_UNDEF);
2807       LLVMSetOperand(match->inst, AC_EXP_OUT0 + chan, exp->chan[chan].value);
2808       match->chan[chan] = exp->chan[chan];
2809 
2810       /* Update number of enabled channels because the original mask
2811        * is not always 0xf.
2812        */
2813       enabled_channels |= (1 << chan);
2814       LLVMSetOperand(match->inst, AC_EXP_ENABLED_CHANNELS,
2815                      LLVMConstInt(ctx->i32, enabled_channels, 0));
2816    }
2817 
2818    /* The PARAM export is duplicated. Kill it. */
2819    LLVMInstructionEraseFromParent(exp->inst);
2820 
2821    /* Change OFFSET to the matching export. */
2822    for (unsigned i = 0; i < num_outputs; i++) {
2823       if (vs_output_param_offset[i] == exp->offset) {
2824          vs_output_param_offset[i] = match->offset;
2825          break;
2826       }
2827    }
2828    return true;
2829 }
2830 
2831 void ac_optimize_vs_outputs(struct ac_llvm_context *ctx, LLVMValueRef main_fn,
2832                             uint8_t *vs_output_param_offset, uint32_t num_outputs,
2833                             uint32_t skip_output_mask, uint8_t *num_param_exports)
2834 {
2835    LLVMBasicBlockRef bb;
2836    bool removed_any = false;
2837    struct ac_vs_exports exports;
2838 
2839    exports.num = 0;
2840 
2841    /* Process all LLVM instructions. */
2842    bb = LLVMGetFirstBasicBlock(main_fn);
2843    while (bb) {
2844       LLVMValueRef inst = LLVMGetFirstInstruction(bb);
2845 
2846       while (inst) {
2847          LLVMValueRef cur = inst;
2848          inst = LLVMGetNextInstruction(inst);
2849          struct ac_vs_exp_inst exp;
2850 
2851          if (LLVMGetInstructionOpcode(cur) != LLVMCall)
2852             continue;
2853 
2854          LLVMValueRef callee = ac_llvm_get_called_value(cur);
2855 
2856          if (!ac_llvm_is_function(callee))
2857             continue;
2858 
2859          const char *name = LLVMGetValueName(callee);
2860          unsigned num_args = LLVMCountParams(callee);
2861 
2862          /* Check if this is an export instruction. */
2863          if ((num_args != 9 && num_args != 8) ||
2864              (strcmp(name, "llvm.SI.export") && strcmp(name, "llvm.amdgcn.exp.f32")))
2865             continue;
2866 
2867          LLVMValueRef arg = LLVMGetOperand(cur, AC_EXP_TARGET);
2868          unsigned target = LLVMConstIntGetZExtValue(arg);
2869 
2870          if (target < V_008DFC_SQ_EXP_PARAM)
2871             continue;
2872 
2873          target -= V_008DFC_SQ_EXP_PARAM;
2874 
2875          /* Parse the instruction. */
2876          memset(&exp, 0, sizeof(exp));
2877          exp.offset = target;
2878          exp.inst = cur;
2879 
2880          for (unsigned i = 0; i < 4; i++) {
2881             LLVMValueRef v = LLVMGetOperand(cur, AC_EXP_OUT0 + i);
2882 
2883             exp.chan[i].value = v;
2884 
2885             if (LLVMIsUndef(v)) {
2886                exp.chan[i].type = AC_IR_UNDEF;
2887             } else if (LLVMIsAConstantFP(v)) {
2888                LLVMBool loses_info;
2889                exp.chan[i].type = AC_IR_CONST;
2890                exp.chan[i].const_float = LLVMConstRealGetDouble(v, &loses_info);
2891             } else {
2892                exp.chan[i].type = AC_IR_VALUE;
2893             }
2894          }
2895 
2896          /* Eliminate constant and duplicated PARAM exports. */
2897          if (!((1u << target) & skip_output_mask) &&
2898              (ac_eliminate_const_output(vs_output_param_offset, num_outputs, &exp) ||
2899               ac_eliminate_duplicated_output(ctx, vs_output_param_offset, num_outputs, &exports,
2900                                              &exp))) {
2901             removed_any = true;
2902          } else {
2903             exports.exp[exports.num++] = exp;
2904          }
2905       }
2906       bb = LLVMGetNextBasicBlock(bb);
2907    }
2908 
2909    /* Remove holes in export memory due to removed PARAM exports.
2910     * This is done by renumbering all PARAM exports.
2911     */
2912    if (removed_any) {
2913       uint8_t old_offset[VARYING_SLOT_MAX];
2914       unsigned out, i;
2915 
2916       /* Make a copy of the offsets. We need the old version while
2917        * we are modifying some of them. */
2918       memcpy(old_offset, vs_output_param_offset, sizeof(old_offset));
2919 
2920       for (i = 0; i < exports.num; i++) {
2921          unsigned offset = exports.exp[i].offset;
2922 
2923          /* Update vs_output_param_offset. Multiple outputs can
2924           * have the same offset.
2925           */
2926          for (out = 0; out < num_outputs; out++) {
2927             if (old_offset[out] == offset)
2928                vs_output_param_offset[out] = i;
2929          }
2930 
2931          /* Change the PARAM offset in the instruction. */
2932          LLVMSetOperand(exports.exp[i].inst, AC_EXP_TARGET,
2933                         LLVMConstInt(ctx->i32, V_008DFC_SQ_EXP_PARAM + i, 0));
2934       }
2935       *num_param_exports = exports.num;
2936    }
2937 }
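/* Renumbering sketch: if PARAM exports 0..3 existed and exports 1 and 2 were
 * eliminated above, the two survivors (old offsets 0 and 3) are rewritten to
 * PARAM 0 and 1, every vs_output_param_offset entry that pointed at old
 * offset 3 now points at 1, and *num_param_exports becomes 2.
 */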
2938 
2939 void ac_init_exec_full_mask(struct ac_llvm_context *ctx)
2940 {
2941    LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
2942    ac_build_intrinsic(ctx, "llvm.amdgcn.init.exec", ctx->voidt, &full_mask, 1,
2943                       AC_FUNC_ATTR_CONVERGENT);
2944 }
2945 
2946 void ac_declare_lds_as_pointer(struct ac_llvm_context *ctx)
2947 {
2948    unsigned lds_size = ctx->chip_class >= GFX7 ? 65536 : 32768;
2949    ctx->lds = LLVMBuildIntToPtr(
2950       ctx->builder, ctx->i32_0,
2951       LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), AC_ADDR_SPACE_LDS), "lds");
2952 }
2953 
2954 LLVMValueRef ac_lds_load(struct ac_llvm_context *ctx, LLVMValueRef dw_addr)
2955 {
2956    return LLVMBuildLoad(ctx->builder, ac_build_gep0(ctx, ctx->lds, dw_addr), "");
2957 }
2958 
2959 void ac_lds_store(struct ac_llvm_context *ctx, LLVMValueRef dw_addr, LLVMValueRef value)
2960 {
2961    value = ac_to_integer(ctx, value);
2962    ac_build_indexed_store(ctx, ctx->lds, dw_addr, value);
2963 }
2964 
2965 LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx, LLVMTypeRef dst_type, LLVMValueRef src0)
2966 {
2967    unsigned src0_bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
2968    const char *intrin_name;
2969    LLVMTypeRef type;
2970    LLVMValueRef zero;
2971 
2972    switch (src0_bitsize) {
2973    case 64:
2974       intrin_name = "llvm.cttz.i64";
2975       type = ctx->i64;
2976       zero = ctx->i64_0;
2977       break;
2978    case 32:
2979       intrin_name = "llvm.cttz.i32";
2980       type = ctx->i32;
2981       zero = ctx->i32_0;
2982       break;
2983    case 16:
2984       intrin_name = "llvm.cttz.i16";
2985       type = ctx->i16;
2986       zero = ctx->i16_0;
2987       break;
2988    case 8:
2989       intrin_name = "llvm.cttz.i8";
2990       type = ctx->i8;
2991       zero = ctx->i8_0;
2992       break;
2993    default:
2994       unreachable("invalid bitsize");
2995    }
2996 
2997    LLVMValueRef params[2] = {
2998       src0,
2999 
3000       /* The value of 1 means that ffs(x=0) = undef, so LLVM won't
3001        * add special code to check for x=0. The reason is that
3002        * the LLVM behavior for x=0 is different from what we
3003        * need here. However, LLVM also assumes that ffs(x) is
3004        * in [0, 31], but GLSL expects that ffs(0) = -1, so
3005        * a conditional assignment to handle 0 is still required.
3006        *
3007        * The hardware already implements the correct behavior.
3008        */
3009       ctx->i1true,
3010    };
3011 
3012    LLVMValueRef lsb = ac_build_intrinsic(ctx, intrin_name, type, params, 2, AC_FUNC_ATTR_READNONE);
3013 
3014    if (src0_bitsize == 64) {
3015       lsb = LLVMBuildTrunc(ctx->builder, lsb, ctx->i32, "");
3016    } else if (src0_bitsize < 32) {
3017       lsb = LLVMBuildSExt(ctx->builder, lsb, ctx->i32, "");
3018    }
3019 
3020    /* TODO: We need an intrinsic to skip this conditional. */
3021    /* Check for zero: */
3022    return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntEQ, src0, zero, ""),
3023                           LLVMConstInt(ctx->i32, -1, 0), lsb, "");
3024 }
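/* This implements GLSL findLSB() semantics: e.g. src0 = 0x58 (0b1011000)
 * returns 3, and src0 = 0 returns -1 via the final select rather than the
 * intrinsic's undefined result.
 */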
3025 
3026 LLVMTypeRef ac_array_in_const_addr_space(LLVMTypeRef elem_type)
3027 {
3028    return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST);
3029 }
3030 
3031 LLVMTypeRef ac_array_in_const32_addr_space(LLVMTypeRef elem_type)
3032 {
3033    return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST_32BIT);
3034 }
3035 
3036 static struct ac_llvm_flow *get_current_flow(struct ac_llvm_context *ctx)
3037 {
3038    if (ctx->flow->depth > 0)
3039       return &ctx->flow->stack[ctx->flow->depth - 1];
3040    return NULL;
3041 }
3042 
3043 static struct ac_llvm_flow *get_innermost_loop(struct ac_llvm_context *ctx)
3044 {
3045    for (unsigned i = ctx->flow->depth; i > 0; --i) {
3046       if (ctx->flow->stack[i - 1].loop_entry_block)
3047          return &ctx->flow->stack[i - 1];
3048    }
3049    return NULL;
3050 }
3051 
3052 static struct ac_llvm_flow *push_flow(struct ac_llvm_context *ctx)
3053 {
3054    struct ac_llvm_flow *flow;
3055 
3056    if (ctx->flow->depth >= ctx->flow->depth_max) {
3057       unsigned new_max = MAX2(ctx->flow->depth << 1, AC_LLVM_INITIAL_CF_DEPTH);
3058 
3059       ctx->flow->stack = realloc(ctx->flow->stack, new_max * sizeof(*ctx->flow->stack));
3060       ctx->flow->depth_max = new_max;
3061    }
3062 
3063    flow = &ctx->flow->stack[ctx->flow->depth];
3064    ctx->flow->depth++;
3065 
3066    flow->next_block = NULL;
3067    flow->loop_entry_block = NULL;
3068    return flow;
3069 }
3070 
3071 static void set_basicblock_name(LLVMBasicBlockRef bb, const char *base, int label_id)
3072 {
3073    char buf[32];
3074    snprintf(buf, sizeof(buf), "%s%d", base, label_id);
3075    LLVMSetValueName(LLVMBasicBlockAsValue(bb), buf);
3076 }
3077 
3078 /* Append a basic block at the level of the parent flow.
3079  */
3080 static LLVMBasicBlockRef append_basic_block(struct ac_llvm_context *ctx, const char *name)
3081 {
3082    assert(ctx->flow->depth >= 1);
3083 
3084    if (ctx->flow->depth >= 2) {
3085       struct ac_llvm_flow *flow = &ctx->flow->stack[ctx->flow->depth - 2];
3086 
3087       return LLVMInsertBasicBlockInContext(ctx->context, flow->next_block, name);
3088    }
3089 
3090    LLVMValueRef main_fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->builder));
3091    return LLVMAppendBasicBlockInContext(ctx->context, main_fn, name);
3092 }
3093 
3094 /* Emit a branch to the given default target for the current block if
3095  * applicable -- that is, if the current block does not already contain a
3096  * branch from a break or continue.
3097  */
3098 static void emit_default_branch(LLVMBuilderRef builder, LLVMBasicBlockRef target)
3099 {
3100    if (!LLVMGetBasicBlockTerminator(LLVMGetInsertBlock(builder)))
3101       LLVMBuildBr(builder, target);
3102 }
3103 
3104 void ac_build_bgnloop(struct ac_llvm_context *ctx, int label_id)
3105 {
3106    struct ac_llvm_flow *flow = push_flow(ctx);
3107    flow->loop_entry_block = append_basic_block(ctx, "LOOP");
3108    flow->next_block = append_basic_block(ctx, "ENDLOOP");
3109    set_basicblock_name(flow->loop_entry_block, "loop", label_id);
3110    LLVMBuildBr(ctx->builder, flow->loop_entry_block);
3111    LLVMPositionBuilderAtEnd(ctx->builder, flow->loop_entry_block);
3112 }
3113 
3114 void ac_build_break(struct ac_llvm_context *ctx)
3115 {
3116    struct ac_llvm_flow *flow = get_innermost_loop(ctx);
3117    LLVMBuildBr(ctx->builder, flow->next_block);
3118 }
3119 
3120 void ac_build_continue(struct ac_llvm_context *ctx)
3121 {
3122    struct ac_llvm_flow *flow = get_innermost_loop(ctx);
3123    LLVMBuildBr(ctx->builder, flow->loop_entry_block);
3124 }
3125 
3126 void ac_build_else(struct ac_llvm_context *ctx, int label_id)
3127 {
3128    struct ac_llvm_flow *current_branch = get_current_flow(ctx);
3129    LLVMBasicBlockRef endif_block;
3130 
3131    assert(!current_branch->loop_entry_block);
3132 
3133    endif_block = append_basic_block(ctx, "ENDIF");
3134    emit_default_branch(ctx->builder, endif_block);
3135 
3136    LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
3137    set_basicblock_name(current_branch->next_block, "else", label_id);
3138 
3139    current_branch->next_block = endif_block;
3140 }
3141 
3142 void ac_build_endif(struct ac_llvm_context *ctx, int label_id)
3143 {
3144    struct ac_llvm_flow *current_branch = get_current_flow(ctx);
3145 
3146    assert(!current_branch->loop_entry_block);
3147 
3148    emit_default_branch(ctx->builder, current_branch->next_block);
3149    LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
3150    set_basicblock_name(current_branch->next_block, "endif", label_id);
3151 
3152    ctx->flow->depth--;
3153 }
3154 
3155 void ac_build_endloop(struct ac_llvm_context *ctx, int label_id)
3156 {
3157    struct ac_llvm_flow *current_loop = get_current_flow(ctx);
3158 
3159    assert(current_loop->loop_entry_block);
3160 
3161    emit_default_branch(ctx->builder, current_loop->loop_entry_block);
3162 
3163    LLVMPositionBuilderAtEnd(ctx->builder, current_loop->next_block);
3164    set_basicblock_name(current_loop->next_block, "endloop", label_id);
3165    ctx->flow->depth--;
3166 }
3167 
3168 void ac_build_ifcc(struct ac_llvm_context *ctx, LLVMValueRef cond, int label_id)
3169 {
3170    struct ac_llvm_flow *flow = push_flow(ctx);
3171    LLVMBasicBlockRef if_block;
3172 
3173    if_block = append_basic_block(ctx, "IF");
3174    flow->next_block = append_basic_block(ctx, "ELSE");
3175    set_basicblock_name(if_block, "if", label_id);
3176    LLVMBuildCondBr(ctx->builder, cond, if_block, flow->next_block);
3177    LLVMPositionBuilderAtEnd(ctx->builder, if_block);
3178 }
3179 
3180 LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac, LLVMTypeRef type, const char *name)
3181 {
3182    LLVMBuilderRef builder = ac->builder;
3183    LLVMBasicBlockRef current_block = LLVMGetInsertBlock(builder);
3184    LLVMValueRef function = LLVMGetBasicBlockParent(current_block);
3185    LLVMBasicBlockRef first_block = LLVMGetEntryBasicBlock(function);
3186    LLVMValueRef first_instr = LLVMGetFirstInstruction(first_block);
3187    LLVMBuilderRef first_builder = LLVMCreateBuilderInContext(ac->context);
3188    LLVMValueRef res;
3189 
3190    if (first_instr) {
3191       LLVMPositionBuilderBefore(first_builder, first_instr);
3192    } else {
3193       LLVMPositionBuilderAtEnd(first_builder, first_block);
3194    }
3195 
3196    res = LLVMBuildAlloca(first_builder, type, name);
3197    LLVMDisposeBuilder(first_builder);
3198    return res;
3199 }
3200 
3201 LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac, LLVMTypeRef type, const char *name)
3202 {
3203    LLVMValueRef ptr = ac_build_alloca_undef(ac, type, name);
3204    LLVMBuildStore(ac->builder, LLVMConstNull(type), ptr);
3205    return ptr;
3206 }
3207 
3208 LLVMValueRef ac_cast_ptr(struct ac_llvm_context *ctx, LLVMValueRef ptr, LLVMTypeRef type)
3209 {
3210    int addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
3211    return LLVMBuildBitCast(ctx->builder, ptr, LLVMPointerType(type, addr_space), "");
3212 }
3213 
3214 LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned count)
3215 {
3216    unsigned num_components = ac_get_llvm_num_components(value);
3217    if (count == num_components)
3218       return value;
3219 
3220    LLVMValueRef *const masks = alloca(MAX2(count, 2) * sizeof(LLVMValueRef));
3221    masks[0] = ctx->i32_0;
3222    masks[1] = ctx->i32_1;
3223    for (unsigned i = 2; i < count; i++)
3224       masks[i] = LLVMConstInt(ctx->i32, i, false);
3225 
3226    if (count == 1)
3227       return LLVMBuildExtractElement(ctx->builder, value, masks[0], "");
3228 
3229    LLVMValueRef swizzle = LLVMConstVector(masks, count);
3230    return LLVMBuildShuffleVector(ctx->builder, value, value, swizzle, "");
3231 }
3232 
3233 LLVMValueRef ac_unpack_param(struct ac_llvm_context *ctx, LLVMValueRef param, unsigned rshift,
3234                              unsigned bitwidth)
3235 {
3236    LLVMValueRef value = param;
3237    if (rshift)
3238       value = LLVMBuildLShr(ctx->builder, value, LLVMConstInt(ctx->i32, rshift, false), "");
3239 
3240    if (rshift + bitwidth < 32) {
3241       unsigned mask = (1 << bitwidth) - 1;
3242       value = LLVMBuildAnd(ctx->builder, value, LLVMConstInt(ctx->i32, mask, false), "");
3243    }
3244    return value;
3245 }
3246 
3247 /* Adjust the sample index according to FMASK.
3248  *
3249  * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
3250  * which is the identity mapping. Each nibble says which physical sample
3251  * should be fetched to get that sample.
3252  *
3253  * For example, 0x11111100 means there are only 2 samples stored and
3254  * the second sample covers 3/4 of the pixel. When reading samples 0
3255  * and 1, return physical sample 0 (determined by the first two 0s
3256  * in FMASK), otherwise return physical sample 1.
3257  *
3258  * The sample index should be adjusted as follows:
3259  *   addr[sample_index] = (fmask >> (addr[sample_index] * 4)) & 0xF;
3260  */
3261 void ac_apply_fmask_to_sample(struct ac_llvm_context *ac, LLVMValueRef fmask, LLVMValueRef *addr,
3262                               bool is_array_tex)
3263 {
3264    struct ac_image_args fmask_load = {0};
3265    fmask_load.opcode = ac_image_load;
3266    fmask_load.resource = fmask;
3267    fmask_load.dmask = 0xf;
3268    fmask_load.dim = is_array_tex ? ac_image_2darray : ac_image_2d;
3269    fmask_load.attributes = AC_FUNC_ATTR_READNONE;
3270 
3271    fmask_load.coords[0] = addr[0];
3272    fmask_load.coords[1] = addr[1];
3273    if (is_array_tex)
3274       fmask_load.coords[2] = addr[2];
3275 
3276    LLVMValueRef fmask_value = ac_build_image_opcode(ac, &fmask_load);
3277    fmask_value = LLVMBuildExtractElement(ac->builder, fmask_value, ac->i32_0, "");
3278 
3279    /* Apply the formula. */
3280    unsigned sample_chan = is_array_tex ? 3 : 2;
3281    LLVMValueRef final_sample;
3282    final_sample = LLVMBuildMul(ac->builder, addr[sample_chan], LLVMConstInt(ac->i32, 4, 0), "");
3283    final_sample = LLVMBuildLShr(ac->builder, fmask_value, final_sample, "");
3284    /* Mask the sample index by 0x7, because 0x8 means an unknown value
3285     * with EQAA, so those will map to 0. */
3286    final_sample = LLVMBuildAnd(ac->builder, final_sample, LLVMConstInt(ac->i32, 0x7, 0), "");
3287 
3288    /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
3289     * resource descriptor is 0 (invalid).
3290     */
3291    LLVMValueRef tmp;
3292    tmp = LLVMBuildBitCast(ac->builder, fmask, ac->v8i32, "");
3293    tmp = LLVMBuildExtractElement(ac->builder, tmp, ac->i32_1, "");
3294    tmp = LLVMBuildICmp(ac->builder, LLVMIntNE, tmp, ac->i32_0, "");
3295 
3296    /* Replace the MSAA sample index. */
3297    addr[sample_chan] = LLVMBuildSelect(ac->builder, tmp, final_sample, addr[sample_chan], "");
3298 }
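/* Worked example for the FMASK comment above: with fmask_value = 0x11111100,
 * requesting sample 1 yields (0x11111100 >> 4) & 0x7 = 0 and requesting
 * sample 2 yields (0x11111100 >> 8) & 0x7 = 1, matching "samples 0-1 map to
 * physical sample 0, everything else to physical sample 1".
 */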
3299 
3300 static LLVMValueRef _ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src,
3301                                        LLVMValueRef lane, bool with_opt_barrier)
3302 {
3303    LLVMTypeRef type = LLVMTypeOf(src);
3304    LLVMValueRef result;
3305 
3306    if (with_opt_barrier)
3307       ac_build_optimization_barrier(ctx, &src);
3308 
3309    src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
3310    if (lane)
3311       lane = LLVMBuildZExt(ctx->builder, lane, ctx->i32, "");
3312 
3313    result =
3314       ac_build_intrinsic(ctx, lane == NULL ? "llvm.amdgcn.readfirstlane" : "llvm.amdgcn.readlane",
3315                          ctx->i32, (LLVMValueRef[]){src, lane}, lane == NULL ? 1 : 2,
3316                          AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3317 
3318    return LLVMBuildTrunc(ctx->builder, result, type, "");
3319 }
3320 
3321 static LLVMValueRef ac_build_readlane_common(struct ac_llvm_context *ctx, LLVMValueRef src,
3322                                              LLVMValueRef lane, bool with_opt_barrier)
3323 {
3324    LLVMTypeRef src_type = LLVMTypeOf(src);
3325    src = ac_to_integer(ctx, src);
3326    unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
3327    LLVMValueRef ret;
3328 
3329    if (bits > 32) {
3330       assert(bits % 32 == 0);
3331       LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
3332       LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, "");
3333       ret = LLVMGetUndef(vec_type);
3334       for (unsigned i = 0; i < bits / 32; i++) {
3335          LLVMValueRef ret_comp;
3336 
3337          src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), "");
3338 
3339          ret_comp = _ac_build_readlane(ctx, src, lane, with_opt_barrier);
3340 
3341          ret =
3342             LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), "");
3343       }
3344    } else {
3345       ret = _ac_build_readlane(ctx, src, lane, with_opt_barrier);
3346    }
3347 
3348    if (LLVMGetTypeKind(src_type) == LLVMPointerTypeKind)
3349       return LLVMBuildIntToPtr(ctx->builder, ret, src_type, "");
3350    return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
3351 }
3352 
3353 /**
3354  * Builds the "llvm.amdgcn.readlane" or "llvm.amdgcn.readfirstlane" intrinsic.
3355  *
3356  * The optimization barrier is not needed if the value is the same in all lanes
3357  * or if this is called in the outermost block.
3358  *
3359  * @param ctx
3360  * @param src
3361  * @param lane - id of the lane or NULL for the first active lane
3362  * @return value of the lane
3363  */
3364 LLVMValueRef ac_build_readlane_no_opt_barrier(struct ac_llvm_context *ctx, LLVMValueRef src,
3365                                               LLVMValueRef lane)
3366 {
3367    return ac_build_readlane_common(ctx, src, lane, false);
3368 }
3369 
3370 LLVMValueRef ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane)
3371 {
3372    return ac_build_readlane_common(ctx, src, lane, true);
3373 }
3374 
3375 LLVMValueRef ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef value,
3376                                 LLVMValueRef lane)
3377 {
3378    return ac_build_intrinsic(ctx, "llvm.amdgcn.writelane", ctx->i32,
3379                              (LLVMValueRef[]){value, lane, src}, 3,
3380                              AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3381 }
3382 
3383 LLVMValueRef ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask)
3384 {
3385    if (ctx->wave_size == 32) {
3386       return ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32,
3387                                 (LLVMValueRef[]){mask, ctx->i32_0}, 2, AC_FUNC_ATTR_READNONE);
3388    }
3389    LLVMValueRef mask_vec = LLVMBuildBitCast(ctx->builder, mask, ctx->v2i32, "");
3390    LLVMValueRef mask_lo = LLVMBuildExtractElement(ctx->builder, mask_vec, ctx->i32_0, "");
3391    LLVMValueRef mask_hi = LLVMBuildExtractElement(ctx->builder, mask_vec, ctx->i32_1, "");
3392    LLVMValueRef val =
3393       ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32,
3394                          (LLVMValueRef[]){mask_lo, ctx->i32_0}, 2, AC_FUNC_ATTR_READNONE);
3395    val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", ctx->i32, (LLVMValueRef[]){mask_hi, val},
3396                             2, AC_FUNC_ATTR_READNONE);
3397    return val;
3398 }
3399 
3400 enum dpp_ctrl
3401 {
3402    _dpp_quad_perm = 0x000,
3403    _dpp_row_sl = 0x100,
3404    _dpp_row_sr = 0x110,
3405    _dpp_row_rr = 0x120,
3406    dpp_wf_sl1 = 0x130,
3407    dpp_wf_rl1 = 0x134,
3408    dpp_wf_sr1 = 0x138,
3409    dpp_wf_rr1 = 0x13C,
3410    dpp_row_mirror = 0x140,
3411    dpp_row_half_mirror = 0x141,
3412    dpp_row_bcast15 = 0x142,
3413    dpp_row_bcast31 = 0x143
3414 };
3415 
3416 static inline enum dpp_ctrl dpp_quad_perm(unsigned lane0, unsigned lane1, unsigned lane2,
3417                                           unsigned lane3)
3418 {
3419    assert(lane0 < 4 && lane1 < 4 && lane2 < 4 && lane3 < 4);
3420    return _dpp_quad_perm | lane0 | (lane1 << 2) | (lane2 << 4) | (lane3 << 6);
3421 }
3422 
3423 static inline enum dpp_ctrl dpp_row_sl(unsigned amount)
3424 {
3425    assert(amount > 0 && amount < 16);
3426    return _dpp_row_sl | amount;
3427 }
3428 
3429 static inline enum dpp_ctrl dpp_row_sr(unsigned amount)
3430 {
3431    assert(amount > 0 && amount < 16);
3432    return _dpp_row_sr | amount;
3433 }
3434 
3435 static LLVMValueRef _ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
3436                                   enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
3437                                   bool bound_ctrl)
3438 {
3439    LLVMTypeRef type = LLVMTypeOf(src);
3440    LLVMValueRef res;
3441 
3442    old = LLVMBuildZExt(ctx->builder, old, ctx->i32, "");
3443    src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
3444 
3445    res = ac_build_intrinsic(
3446       ctx, "llvm.amdgcn.update.dpp.i32", ctx->i32,
3447       (LLVMValueRef[]){old, src, LLVMConstInt(ctx->i32, dpp_ctrl, 0),
3448                        LLVMConstInt(ctx->i32, row_mask, 0), LLVMConstInt(ctx->i32, bank_mask, 0),
3449                        LLVMConstInt(ctx->i1, bound_ctrl, 0)},
3450       6, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3451 
3452    return LLVMBuildTrunc(ctx->builder, res, type, "");
3453 }
3454 
3455 static LLVMValueRef ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
3456                                  enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
3457                                  bool bound_ctrl)
3458 {
3459    LLVMTypeRef src_type = LLVMTypeOf(src);
3460    src = ac_to_integer(ctx, src);
3461    old = ac_to_integer(ctx, old);
3462    unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
3463    LLVMValueRef ret;
3464    if (bits > 32) {
3465       assert(bits % 32 == 0);
3466       LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
3467       LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, "");
3468       LLVMValueRef old_vector = LLVMBuildBitCast(ctx->builder, old, vec_type, "");
3469       ret = LLVMGetUndef(vec_type);
3470       for (unsigned i = 0; i < bits / 32; i++) {
3471          src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), "");
3472          old = LLVMBuildExtractElement(ctx->builder, old_vector, LLVMConstInt(ctx->i32, i, 0), "");
3473          LLVMValueRef ret_comp =
3474             _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
3475          ret =
3476             LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), "");
3477       }
3478    } else {
3479       ret = _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
3480    }
3481    return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
3482 }
3483 
3484 static LLVMValueRef _ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src,
3485                                          uint64_t sel, bool exchange_rows, bool bound_ctrl)
3486 {
3487    LLVMTypeRef type = LLVMTypeOf(src);
3488    LLVMValueRef result;
3489 
3490    src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
3491 
3492    LLVMValueRef args[6] = {
3493       src,
3494       src,
3495       LLVMConstInt(ctx->i32, sel, false),
3496       LLVMConstInt(ctx->i32, sel >> 32, false),
3497       ctx->i1true, /* fi */
3498       bound_ctrl ? ctx->i1true : ctx->i1false,
3499    };
3500 
3501    result =
3502       ac_build_intrinsic(ctx, exchange_rows ? "llvm.amdgcn.permlanex16" : "llvm.amdgcn.permlane16",
3503                          ctx->i32, args, 6, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3504 
3505    return LLVMBuildTrunc(ctx->builder, result, type, "");
3506 }
3507 
3508 static LLVMValueRef ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, uint64_t sel,
3509                                         bool exchange_rows, bool bound_ctrl)
3510 {
3511    LLVMTypeRef src_type = LLVMTypeOf(src);
3512    src = ac_to_integer(ctx, src);
3513    unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
3514    LLVMValueRef ret;
3515    if (bits > 32) {
3516       assert(bits % 32 == 0);
3517       LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
3518       LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, "");
3519       ret = LLVMGetUndef(vec_type);
3520       for (unsigned i = 0; i < bits / 32; i++) {
3521          src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), "");
3522          LLVMValueRef ret_comp = _ac_build_permlane16(ctx, src, sel, exchange_rows, bound_ctrl);
3523          ret =
3524             LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), "");
3525       }
3526    } else {
3527       ret = _ac_build_permlane16(ctx, src, sel, exchange_rows, bound_ctrl);
3528    }
3529    return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
3530 }
3531 
3532 static inline unsigned ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask)
3533 {
3534    assert(and_mask < 32 && or_mask < 32 && xor_mask < 32);
3535    return and_mask | (or_mask << 5) | (xor_mask << 10);
3536 }
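/* In ds_swizzle's bit mode each lane reads from lane
 * ((lane & and_mask) | or_mask) ^ xor_mask within its group of 32, so e.g.
 * ds_pattern_bitmode(0x1f, 0, 1) == 0x41f swaps adjacent lanes
 * (lane i reads lane i ^ 1).
 */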
3537 
3538 static LLVMValueRef _ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src,
3539                                          unsigned mask)
3540 {
3541    LLVMTypeRef src_type = LLVMTypeOf(src);
3542    LLVMValueRef ret;
3543 
3544    src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
3545 
3546    ret = ac_build_intrinsic(ctx, "llvm.amdgcn.ds.swizzle", ctx->i32,
3547                             (LLVMValueRef[]){src, LLVMConstInt(ctx->i32, mask, 0)}, 2,
3548                             AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3549 
3550    return LLVMBuildTrunc(ctx->builder, ret, src_type, "");
3551 }
3552 
3553 LLVMValueRef ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask)
3554 {
3555    LLVMTypeRef src_type = LLVMTypeOf(src);
3556    src = ac_to_integer(ctx, src);
3557    unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
3558    LLVMValueRef ret;
3559    if (bits > 32) {
3560       assert(bits % 32 == 0);
3561       LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
3562       LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, "");
3563       ret = LLVMGetUndef(vec_type);
3564       for (unsigned i = 0; i < bits / 32; i++) {
3565          src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), "");
3566          LLVMValueRef ret_comp = _ac_build_ds_swizzle(ctx, src, mask);
3567          ret =
3568             LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), "");
3569       }
3570    } else {
3571       ret = _ac_build_ds_swizzle(ctx, src, mask);
3572    }
3573    return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
3574 }
3575 
3576 static LLVMValueRef ac_build_wwm(struct ac_llvm_context *ctx, LLVMValueRef src)
3577 {
3578    LLVMTypeRef src_type = LLVMTypeOf(src);
3579    unsigned bitsize = ac_get_elem_bits(ctx, src_type);
3580    char name[32], type[8];
3581    LLVMValueRef ret;
3582 
3583    src = ac_to_integer(ctx, src);
3584 
3585    if (bitsize < 32)
3586       src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
3587 
3588    ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
3589    snprintf(name, sizeof(name), "llvm.amdgcn.wwm.%s", type);
3590    ret = ac_build_intrinsic(ctx, name, LLVMTypeOf(src), (LLVMValueRef[]){src}, 1,
3591                             AC_FUNC_ATTR_READNONE);
3592 
3593    if (bitsize < 32)
3594       ret = LLVMBuildTrunc(ctx->builder, ret, ac_to_integer_type(ctx, src_type), "");
3595 
3596    return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
3597 }
3598 
3599 static LLVMValueRef ac_build_set_inactive(struct ac_llvm_context *ctx, LLVMValueRef src,
3600                                           LLVMValueRef inactive)
3601 {
3602    char name[33], type[8];
3603    LLVMTypeRef src_type = LLVMTypeOf(src);
3604    unsigned bitsize = ac_get_elem_bits(ctx, src_type);
3605    src = ac_to_integer(ctx, src);
3606    inactive = ac_to_integer(ctx, inactive);
3607 
3608    if (bitsize < 32) {
3609       src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
3610       inactive = LLVMBuildZExt(ctx->builder, inactive, ctx->i32, "");
3611    }
3612 
3613    ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
3614    snprintf(name, sizeof(name), "llvm.amdgcn.set.inactive.%s", type);
3615    LLVMValueRef ret =
3616       ac_build_intrinsic(ctx, name, LLVMTypeOf(src), (LLVMValueRef[]){src, inactive}, 2,
3617                          AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3618    if (bitsize < 32)
3619       ret = LLVMBuildTrunc(ctx->builder, ret, src_type, "");
3620 
3621    return ret;
3622 }
3623 
3624 static LLVMValueRef get_reduction_identity(struct ac_llvm_context *ctx, nir_op op,
3625                                            unsigned type_size)
3626 {
3627 
3628    if (type_size == 0) {
3629       switch (op) {
3630       case nir_op_ior:
3631       case nir_op_ixor:
3632          return LLVMConstInt(ctx->i1, 0, 0);
3633       case nir_op_iand:
3634          return LLVMConstInt(ctx->i1, 1, 0);
3635       default:
3636          unreachable("bad reduction intrinsic");
3637       }
3638    } else if (type_size == 1) {
3639       switch (op) {
3640       case nir_op_iadd:
3641          return ctx->i8_0;
3642       case nir_op_imul:
3643          return ctx->i8_1;
3644       case nir_op_imin:
3645          return LLVMConstInt(ctx->i8, INT8_MAX, 0);
3646       case nir_op_umin:
3647          return LLVMConstInt(ctx->i8, UINT8_MAX, 0);
3648       case nir_op_imax:
3649          return LLVMConstInt(ctx->i8, INT8_MIN, 0);
3650       case nir_op_umax:
3651          return ctx->i8_0;
3652       case nir_op_iand:
3653          return LLVMConstInt(ctx->i8, -1, 0);
3654       case nir_op_ior:
3655          return ctx->i8_0;
3656       case nir_op_ixor:
3657          return ctx->i8_0;
3658       default:
3659          unreachable("bad reduction intrinsic");
3660       }
3661    } else if (type_size == 2) {
3662       switch (op) {
3663       case nir_op_iadd:
3664          return ctx->i16_0;
3665       case nir_op_fadd:
3666          return ctx->f16_0;
3667       case nir_op_imul:
3668          return ctx->i16_1;
3669       case nir_op_fmul:
3670          return ctx->f16_1;
3671       case nir_op_imin:
3672          return LLVMConstInt(ctx->i16, INT16_MAX, 0);
3673       case nir_op_umin:
3674          return LLVMConstInt(ctx->i16, UINT16_MAX, 0);
3675       case nir_op_fmin:
3676          return LLVMConstReal(ctx->f16, INFINITY);
3677       case nir_op_imax:
3678          return LLVMConstInt(ctx->i16, INT16_MIN, 0);
3679       case nir_op_umax:
3680          return ctx->i16_0;
3681       case nir_op_fmax:
3682          return LLVMConstReal(ctx->f16, -INFINITY);
3683       case nir_op_iand:
3684          return LLVMConstInt(ctx->i16, -1, 0);
3685       case nir_op_ior:
3686          return ctx->i16_0;
3687       case nir_op_ixor:
3688          return ctx->i16_0;
3689       default:
3690          unreachable("bad reduction intrinsic");
3691       }
3692    } else if (type_size == 4) {
3693       switch (op) {
3694       case nir_op_iadd:
3695          return ctx->i32_0;
3696       case nir_op_fadd:
3697          return ctx->f32_0;
3698       case nir_op_imul:
3699          return ctx->i32_1;
3700       case nir_op_fmul:
3701          return ctx->f32_1;
3702       case nir_op_imin:
3703          return LLVMConstInt(ctx->i32, INT32_MAX, 0);
3704       case nir_op_umin:
3705          return LLVMConstInt(ctx->i32, UINT32_MAX, 0);
3706       case nir_op_fmin:
3707          return LLVMConstReal(ctx->f32, INFINITY);
3708       case nir_op_imax:
3709          return LLVMConstInt(ctx->i32, INT32_MIN, 0);
3710       case nir_op_umax:
3711          return ctx->i32_0;
3712       case nir_op_fmax:
3713          return LLVMConstReal(ctx->f32, -INFINITY);
3714       case nir_op_iand:
3715          return LLVMConstInt(ctx->i32, -1, 0);
3716       case nir_op_ior:
3717          return ctx->i32_0;
3718       case nir_op_ixor:
3719          return ctx->i32_0;
3720       default:
3721          unreachable("bad reduction intrinsic");
3722       }
3723    } else { /* type_size == 64bit */
3724       switch (op) {
3725       case nir_op_iadd:
3726          return ctx->i64_0;
3727       case nir_op_fadd:
3728          return ctx->f64_0;
3729       case nir_op_imul:
3730          return ctx->i64_1;
3731       case nir_op_fmul:
3732          return ctx->f64_1;
3733       case nir_op_imin:
3734          return LLVMConstInt(ctx->i64, INT64_MAX, 0);
3735       case nir_op_umin:
3736          return LLVMConstInt(ctx->i64, UINT64_MAX, 0);
3737       case nir_op_fmin:
3738          return LLVMConstReal(ctx->f64, INFINITY);
3739       case nir_op_imax:
3740          return LLVMConstInt(ctx->i64, INT64_MIN, 0);
3741       case nir_op_umax:
3742          return ctx->i64_0;
3743       case nir_op_fmax:
3744          return LLVMConstReal(ctx->f64, -INFINITY);
3745       case nir_op_iand:
3746          return LLVMConstInt(ctx->i64, -1, 0);
3747       case nir_op_ior:
3748          return ctx->i64_0;
3749       case nir_op_ixor:
3750          return ctx->i64_0;
3751       default:
3752          unreachable("bad reduction intrinsic");
3753       }
3754    }
3755 }
3756 
3757 static LLVMValueRef ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs,
3758                                     nir_op op)
3759 {
3760    bool _64bit = ac_get_type_size(LLVMTypeOf(lhs)) == 8;
3761    bool _32bit = ac_get_type_size(LLVMTypeOf(lhs)) == 4;
3762    switch (op) {
3763    case nir_op_iadd:
3764       return LLVMBuildAdd(ctx->builder, lhs, rhs, "");
3765    case nir_op_fadd:
3766       return LLVMBuildFAdd(ctx->builder, lhs, rhs, "");
3767    case nir_op_imul:
3768       return LLVMBuildMul(ctx->builder, lhs, rhs, "");
3769    case nir_op_fmul:
3770       return LLVMBuildFMul(ctx->builder, lhs, rhs, "");
3771    case nir_op_imin:
3772       return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntSLT, lhs, rhs, ""),
3773                              lhs, rhs, "");
3774    case nir_op_umin:
3775       return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntULT, lhs, rhs, ""),
3776                              lhs, rhs, "");
3777    case nir_op_fmin:
3778       return ac_build_intrinsic(
3779          ctx, _64bit ? "llvm.minnum.f64" : _32bit ? "llvm.minnum.f32" : "llvm.minnum.f16",
3780          _64bit ? ctx->f64 : _32bit ? ctx->f32 : ctx->f16, (LLVMValueRef[]){lhs, rhs}, 2,
3781          AC_FUNC_ATTR_READNONE);
3782    case nir_op_imax:
3783       return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntSGT, lhs, rhs, ""),
3784                              lhs, rhs, "");
3785    case nir_op_umax:
3786       return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntUGT, lhs, rhs, ""),
3787                              lhs, rhs, "");
3788    case nir_op_fmax:
3789       return ac_build_intrinsic(
3790          ctx, _64bit ? "llvm.maxnum.f64" : _32bit ? "llvm.maxnum.f32" : "llvm.maxnum.f16",
3791          _64bit ? ctx->f64 : _32bit ? ctx->f32 : ctx->f16, (LLVMValueRef[]){lhs, rhs}, 2,
3792          AC_FUNC_ATTR_READNONE);
3793    case nir_op_iand:
3794       return LLVMBuildAnd(ctx->builder, lhs, rhs, "");
3795    case nir_op_ior:
3796       return LLVMBuildOr(ctx->builder, lhs, rhs, "");
3797    case nir_op_ixor:
3798       return LLVMBuildXor(ctx->builder, lhs, rhs, "");
3799    default:
3800       unreachable("bad reduction intrinsic");
3801    }
3802 }
3803 
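/* Minimal sketch of how the helpers above compose into one reduction step
 * (illustrative only; the DPP control and masks are just an example):
 *
 *    LLVMValueRef identity =
 *       get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
 *    LLVMValueRef shifted =
 *       ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
 *    LLVMValueRef step = ac_build_alu_op(ctx, src, shifted, op);
 *
 * Lanes that read past the valid range pick up the identity, so they do not
 * change the result. ac_build_scan() and ac_build_reduce() below repeat this
 * pattern with different lane shuffles.
 */
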
3804 /**
3805  * \param src The value to shift.
3806  * \param identity The value to use for the first lane.
3807  * \param maxprefix specifies that the result only needs to be correct for a
3808  *     prefix of this many threads
3809  * \return src, shifted 1 lane up, and identity shifted into lane 0.
3810  */
3811 static LLVMValueRef ac_wavefront_shift_right_1(struct ac_llvm_context *ctx, LLVMValueRef src,
3812                                                LLVMValueRef identity, unsigned maxprefix)
3813 {
3814    if (ctx->chip_class >= GFX10) {
3815       /* wavefront shift_right by 1 on GFX10 (emulate dpp_wf_sr1) */
3816       LLVMValueRef active, tmp1, tmp2;
3817       LLVMValueRef tid = ac_get_thread_id(ctx);
3818 
3819       tmp1 = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
3820 
3821       tmp2 = ac_build_permlane16(ctx, src, (uint64_t)~0, true, false);
3822 
3823       if (maxprefix > 32) {
3824          active =
3825             LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 32, false), "");
3826 
3827          tmp2 = LLVMBuildSelect(ctx->builder, active,
3828                                 ac_build_readlane(ctx, src, LLVMConstInt(ctx->i32, 31, false)),
3829                                 tmp2, "");
3830 
3831          active = LLVMBuildOr(
3832             ctx->builder, active,
3833             LLVMBuildICmp(ctx->builder, LLVMIntEQ,
3834                           LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x1f, false), ""),
3835                           LLVMConstInt(ctx->i32, 0x10, false), ""),
3836             "");
3837          return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3838       } else if (maxprefix > 16) {
3839          active =
3840             LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 16, false), "");
3841 
3842          return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3843       }
3844    } else if (ctx->chip_class >= GFX8) {
3845       return ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false);
3846    }
3847 
3848    /* wavefront shift_right by 1 on SI/CI */
3849    LLVMValueRef active, tmp1, tmp2;
3850    LLVMValueRef tid = ac_get_thread_id(ctx);
3851    tmp1 = ac_build_ds_swizzle(ctx, src, (1 << 15) | dpp_quad_perm(0, 0, 1, 2));
3852    tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x18, 0x03, 0x00));
3853    active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
3854                           LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x7, 0), ""),
3855                           LLVMConstInt(ctx->i32, 0x4, 0), "");
3856    tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3857    tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x10, 0x07, 0x00));
3858    active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
3859                           LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0xf, 0), ""),
3860                           LLVMConstInt(ctx->i32, 0x8, 0), "");
3861    tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3862    tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x00, 0x0f, 0x00));
3863    active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
3864                           LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x1f, 0), ""),
3865                           LLVMConstInt(ctx->i32, 0x10, 0), "");
3866    tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3867    tmp2 = ac_build_readlane(ctx, src, LLVMConstInt(ctx->i32, 31, 0));
3868    active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 32, 0), "");
3869    tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3870    active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 0, 0), "");
3871    return LLVMBuildSelect(ctx->builder, active, identity, tmp1, "");
3872 }
3873 
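/* Worked example (illustrative): if the low lanes hold src = [a0, a1, a2, a3, ...]
 * and the identity is id, the helper above yields [id, a0, a1, a2, ...]: every
 * lane reads its left neighbour and lane 0 reads the identity. With
 * maxprefix == 16, only lanes 0..15 are guaranteed to hold these values.
 */
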
3874 /**
3875  * \param maxprefix specifies that the result only needs to be correct for a
3876  *     prefix of this many threads
3877  */
3878 static LLVMValueRef ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src,
3879                                   LLVMValueRef identity, unsigned maxprefix, bool inclusive)
3880 {
3881    LLVMValueRef result, tmp;
3882 
3883    if (!inclusive)
3884       src = ac_wavefront_shift_right_1(ctx, src, identity, maxprefix);
3885 
3886    result = src;
3887 
3888    if (ctx->chip_class <= GFX7) {
3889       assert(maxprefix == 64);
3890       LLVMValueRef tid = ac_get_thread_id(ctx);
3891       LLVMValueRef active;
3892       tmp = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x1e, 0x00, 0x00));
3893       active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3894                              LLVMBuildAnd(ctx->builder, tid, ctx->i32_1, ""), ctx->i32_0, "");
3895       tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3896       result = ac_build_alu_op(ctx, result, tmp, op);
3897       tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1c, 0x01, 0x00));
3898       active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3899                              LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 2, 0), ""),
3900                              ctx->i32_0, "");
3901       tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3902       result = ac_build_alu_op(ctx, result, tmp, op);
3903       tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x18, 0x03, 0x00));
3904       active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3905                              LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 4, 0), ""),
3906                              ctx->i32_0, "");
3907       tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3908       result = ac_build_alu_op(ctx, result, tmp, op);
3909       tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x10, 0x07, 0x00));
3910       active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3911                              LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 8, 0), ""),
3912                              ctx->i32_0, "");
3913       tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3914       result = ac_build_alu_op(ctx, result, tmp, op);
3915       tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x00, 0x0f, 0x00));
3916       active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3917                              LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 16, 0), ""),
3918                              ctx->i32_0, "");
3919       tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3920       result = ac_build_alu_op(ctx, result, tmp, op);
3921       tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, 0));
3922       active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3923                              LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 32, 0), ""),
3924                              ctx->i32_0, "");
3925       tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3926       result = ac_build_alu_op(ctx, result, tmp, op);
3927       return result;
3928    }
3929 
3930    if (maxprefix <= 1)
3931       return result;
3932    tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
3933    result = ac_build_alu_op(ctx, result, tmp, op);
3934    if (maxprefix <= 2)
3935       return result;
3936    tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false);
3937    result = ac_build_alu_op(ctx, result, tmp, op);
3938    if (maxprefix <= 3)
3939       return result;
3940    tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false);
3941    result = ac_build_alu_op(ctx, result, tmp, op);
3942    if (maxprefix <= 4)
3943       return result;
3944    tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false);
3945    result = ac_build_alu_op(ctx, result, tmp, op);
3946    if (maxprefix <= 8)
3947       return result;
3948    tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false);
3949    result = ac_build_alu_op(ctx, result, tmp, op);
3950    if (maxprefix <= 16)
3951       return result;
3952 
3953    if (ctx->chip_class >= GFX10) {
3954       LLVMValueRef tid = ac_get_thread_id(ctx);
3955       LLVMValueRef active;
3956 
3957       tmp = ac_build_permlane16(ctx, result, ~(uint64_t)0, true, false);
3958 
3959       active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3960                              LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 16, false), ""),
3961                              ctx->i32_0, "");
3962 
3963       tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3964 
3965       result = ac_build_alu_op(ctx, result, tmp, op);
3966 
3967       if (maxprefix <= 32)
3968          return result;
3969 
3970       tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));
3971 
3972       active = LLVMBuildICmp(ctx->builder, LLVMIntUGE, tid, LLVMConstInt(ctx->i32, 32, false), "");
3973 
3974       tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3975 
3976       result = ac_build_alu_op(ctx, result, tmp, op);
3977       return result;
3978    }
3979 
3980    tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
3981    result = ac_build_alu_op(ctx, result, tmp, op);
3982    if (maxprefix <= 32)
3983       return result;
3984    tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
3985    result = ac_build_alu_op(ctx, result, tmp, op);
3986    return result;
3987 }
3988 
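/* Conceptually (an equivalent, simplified formulation, not the actual
 * DPP/permlane/swizzle sequence used above), ac_build_scan() computes a
 * parallel inclusive prefix:
 *
 *    for (shift = 1; shift < maxprefix; shift *= 2)
 *       for each lane t (in parallel, all reads before any writes):
 *          result[t] = op(result[t], t >= shift ? result[t - shift] : identity);
 *
 * After about log2(maxprefix) steps every lane holds its inclusive prefix.
 */
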
3989 LLVMValueRef ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
3990 {
3991    LLVMValueRef result;
3992 
3993    if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
3994       LLVMBuilderRef builder = ctx->builder;
3995       src = LLVMBuildZExt(builder, src, ctx->i32, "");
3996       result = ac_build_ballot(ctx, src);
3997       result = ac_build_mbcnt(ctx, result);
3998       result = LLVMBuildAdd(builder, result, src, "");
3999       return result;
4000    }
4001 
4002    ac_build_optimization_barrier(ctx, &src);
4003 
4004    LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
4005    result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
4006                              LLVMTypeOf(identity), "");
4007    result = ac_build_scan(ctx, op, result, identity, ctx->wave_size, true);
4008 
4009    return ac_build_wwm(ctx, result);
4010 }
4011 
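/* Usage sketch (illustrative; "value" stands for any f32 SSA value built in
 * the current function):
 *
 *    LLVMValueRef prefix_sum = ac_build_inclusive_scan(ctx, value, nir_op_fadd);
 *
 * Each lane then holds the sum of "value" over itself and all lower lanes of
 * the wave.
 */
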
4012 LLVMValueRef ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
4013 {
4014    LLVMValueRef result;
4015 
4016    if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
4017       LLVMBuilderRef builder = ctx->builder;
4018       src = LLVMBuildZExt(builder, src, ctx->i32, "");
4019       result = ac_build_ballot(ctx, src);
4020       result = ac_build_mbcnt(ctx, result);
4021       return result;
4022    }
4023 
4024    ac_build_optimization_barrier(ctx, &src);
4025 
4026    LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
4027    result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
4028                              LLVMTypeOf(identity), "");
4029    result = ac_build_scan(ctx, op, result, identity, ctx->wave_size, false);
4030 
4031    return ac_build_wwm(ctx, result);
4032 }
4033 
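/* Worked example of the i1/iadd fast path above (illustrative): if lanes
 * 0..3 hold the booleans [1, 0, 1, 1], ac_build_ballot() produces the mask
 * 0b1101 and ac_build_mbcnt() counts the set bits below the current lane,
 * so lane 3 receives 2 -- exactly the exclusive prefix sum of the booleans.
 */
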
4034 LLVMValueRef ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op,
4035                              unsigned cluster_size)
4036 {
4037    if (cluster_size == 1)
4038       return src;
4039    ac_build_optimization_barrier(ctx, &src);
4040    LLVMValueRef result, swap;
4041    LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
4042    result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
4043                              LLVMTypeOf(identity), "");
4044    swap = ac_build_quad_swizzle(ctx, result, 1, 0, 3, 2);
4045    result = ac_build_alu_op(ctx, result, swap, op);
4046    if (cluster_size == 2)
4047       return ac_build_wwm(ctx, result);
4048 
4049    swap = ac_build_quad_swizzle(ctx, result, 2, 3, 0, 1);
4050    result = ac_build_alu_op(ctx, result, swap, op);
4051    if (cluster_size == 4)
4052       return ac_build_wwm(ctx, result);
4053 
4054    if (ctx->chip_class >= GFX8)
4055       swap = ac_build_dpp(ctx, identity, result, dpp_row_half_mirror, 0xf, 0xf, false);
4056    else
4057       swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x04));
4058    result = ac_build_alu_op(ctx, result, swap, op);
4059    if (cluster_size == 8)
4060       return ac_build_wwm(ctx, result);
4061 
4062    if (ctx->chip_class >= GFX8)
4063       swap = ac_build_dpp(ctx, identity, result, dpp_row_mirror, 0xf, 0xf, false);
4064    else
4065       swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x08));
4066    result = ac_build_alu_op(ctx, result, swap, op);
4067    if (cluster_size == 16)
4068       return ac_build_wwm(ctx, result);
4069 
4070    if (ctx->chip_class >= GFX10)
4071       swap = ac_build_permlane16(ctx, result, 0, true, false);
4072    else if (ctx->chip_class >= GFX8 && cluster_size != 32)
4073       swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
4074    else
4075       swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x10));
4076    result = ac_build_alu_op(ctx, result, swap, op);
4077    if (cluster_size == 32)
4078       return ac_build_wwm(ctx, result);
4079 
4080    if (ctx->chip_class >= GFX8) {
4081       if (ctx->wave_size == 64) {
4082          if (ctx->chip_class >= GFX10)
4083             swap = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));
4084          else
4085             swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
4086          result = ac_build_alu_op(ctx, result, swap, op);
4087          result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 63, 0));
4088       }
4089 
4090       return ac_build_wwm(ctx, result);
4091    } else {
4092       swap = ac_build_readlane(ctx, result, ctx->i32_0);
4093       result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 32, 0));
4094       result = ac_build_alu_op(ctx, result, swap, op);
4095       return ac_build_wwm(ctx, result);
4096    }
4097 }
4098 
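/* Usage sketch (illustrative): reduce an i32 value across each group of 4
 * neighbouring lanes (a quad):
 *
 *    LLVMValueRef quad_max = ac_build_reduce(ctx, value, nir_op_imax, 4);
 *
 * All lanes of a quad receive the same maximum. As the ladder of swizzles
 * above implies, cluster_size is expected to be a power of two no larger
 * than the wave size.
 */
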
4099 /**
4100  * "Top half" of a scan that reduces per-wave values across an entire
4101  * workgroup.
4102  *
4103  * The source value must be present in the highest lane of the wave, and the
4104  * highest lane must be live.
4105  */
4106 void ac_build_wg_wavescan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4107 {
4108    if (ws->maxwaves <= 1)
4109       return;
4110 
4111    const LLVMValueRef last_lane = LLVMConstInt(ctx->i32, ctx->wave_size - 1, false);
4112    LLVMBuilderRef builder = ctx->builder;
4113    LLVMValueRef tid = ac_get_thread_id(ctx);
4114    LLVMValueRef tmp;
4115 
4116    tmp = LLVMBuildICmp(builder, LLVMIntEQ, tid, last_lane, "");
4117    ac_build_ifcc(ctx, tmp, 1000);
4118    LLVMBuildStore(builder, ws->src, LLVMBuildGEP(builder, ws->scratch, &ws->waveidx, 1, ""));
4119    ac_build_endif(ctx, 1000);
4120 }
4121 
4122 /**
4123  * "Bottom half" of a scan that reduces per-wave values across an entire
4124  * workgroup.
4125  *
4126  * The caller must place a barrier between the top and bottom halves.
4127  */
4128 void ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4129 {
4130    const LLVMTypeRef type = LLVMTypeOf(ws->src);
4131    const LLVMValueRef identity = get_reduction_identity(ctx, ws->op, ac_get_type_size(type));
4132 
4133    if (ws->maxwaves <= 1) {
4134       ws->result_reduce = ws->src;
4135       ws->result_inclusive = ws->src;
4136       ws->result_exclusive = identity;
4137       return;
4138    }
4139    assert(ws->maxwaves <= 32);
4140 
4141    LLVMBuilderRef builder = ctx->builder;
4142    LLVMValueRef tid = ac_get_thread_id(ctx);
4143    LLVMBasicBlockRef bbs[2];
4144    LLVMValueRef phivalues_scan[2];
4145    LLVMValueRef tmp, tmp2;
4146 
4147    bbs[0] = LLVMGetInsertBlock(builder);
4148    phivalues_scan[0] = LLVMGetUndef(type);
4149 
4150    if (ws->enable_reduce)
4151       tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->numwaves, "");
4152    else if (ws->enable_inclusive)
4153       tmp = LLVMBuildICmp(builder, LLVMIntULE, tid, ws->waveidx, "");
4154    else
4155       tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->waveidx, "");
4156    ac_build_ifcc(ctx, tmp, 1001);
4157    {
4158       tmp = LLVMBuildLoad(builder, LLVMBuildGEP(builder, ws->scratch, &tid, 1, ""), "");
4159 
4160       ac_build_optimization_barrier(ctx, &tmp);
4161 
4162       bbs[1] = LLVMGetInsertBlock(builder);
4163       phivalues_scan[1] = ac_build_scan(ctx, ws->op, tmp, identity, ws->maxwaves, true);
4164    }
4165    ac_build_endif(ctx, 1001);
4166 
4167    const LLVMValueRef scan = ac_build_phi(ctx, type, 2, phivalues_scan, bbs);
4168 
4169    if (ws->enable_reduce) {
4170       tmp = LLVMBuildSub(builder, ws->numwaves, ctx->i32_1, "");
4171       ws->result_reduce = ac_build_readlane(ctx, scan, tmp);
4172    }
4173    if (ws->enable_inclusive)
4174       ws->result_inclusive = ac_build_readlane(ctx, scan, ws->waveidx);
4175    if (ws->enable_exclusive) {
4176       tmp = LLVMBuildSub(builder, ws->waveidx, ctx->i32_1, "");
4177       tmp = ac_build_readlane(ctx, scan, tmp);
4178       tmp2 = LLVMBuildICmp(builder, LLVMIntEQ, ws->waveidx, ctx->i32_0, "");
4179       ws->result_exclusive = LLVMBuildSelect(builder, tmp2, identity, tmp, "");
4180    }
4181 }
4182 
4183 /**
4184  * Inclusive scan of a per-wave value across an entire workgroup.
4185  *
4186  * This implies an s_barrier instruction.
4187  *
4188  * Unlike ac_build_inclusive_scan, the caller \em must ensure that all threads
4189  * of the workgroup are live. (This requirement cannot easily be relaxed in a
4190  * useful manner because of the barrier in the algorithm.)
4191  */
4192 void ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4193 {
4194    ac_build_wg_wavescan_top(ctx, ws);
4195    ac_build_s_barrier(ctx);
4196    ac_build_wg_wavescan_bottom(ctx, ws);
4197 }
4198 
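/* Usage sketch (illustrative; "lds_scratch", "per_wave_value", "wave_id" and
 * "num_waves" are caller-provided values, not names from this file):
 *
 *    struct ac_wg_scan ws = {0};
 *    ws.op = nir_op_iadd;
 *    ws.src = per_wave_value;   // must be valid in the highest lane
 *    ws.scratch = lds_scratch;  // one element per wave
 *    ws.waveidx = wave_id;
 *    ws.numwaves = num_waves;
 *    ws.maxwaves = 8;
 *    ws.enable_reduce = true;
 *    ac_build_wg_wavescan(ctx, &ws);
 *    // ws.result_reduce holds the workgroup-wide sum
 */
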
4199 /**
4200  * "Top half" of a scan that reduces per-thread values across an entire
4201  * workgroup.
4202  *
4203  * All lanes must be active when this code runs.
4204  */
4205 void ac_build_wg_scan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4206 {
4207    if (ws->enable_exclusive) {
4208       ws->extra = ac_build_exclusive_scan(ctx, ws->src, ws->op);
4209       if (LLVMTypeOf(ws->src) == ctx->i1 && ws->op == nir_op_iadd)
4210          ws->src = LLVMBuildZExt(ctx->builder, ws->src, ctx->i32, "");
4211       ws->src = ac_build_alu_op(ctx, ws->extra, ws->src, ws->op);
4212    } else {
4213       ws->src = ac_build_inclusive_scan(ctx, ws->src, ws->op);
4214    }
4215 
4216    bool enable_inclusive = ws->enable_inclusive;
4217    bool enable_exclusive = ws->enable_exclusive;
4218    ws->enable_inclusive = false;
4219    ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
4220    ac_build_wg_wavescan_top(ctx, ws);
4221    ws->enable_inclusive = enable_inclusive;
4222    ws->enable_exclusive = enable_exclusive;
4223 }
4224 
4225 /**
4226  * "Bottom half" of a scan that reduces per-thread values across an entire
4227  * workgroup.
4228  *
4229  * The caller must place a barrier between the top and bottom halves.
4230  */
4231 void ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4232 {
4233    bool enable_inclusive = ws->enable_inclusive;
4234    bool enable_exclusive = ws->enable_exclusive;
4235    ws->enable_inclusive = false;
4236    ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
4237    ac_build_wg_wavescan_bottom(ctx, ws);
4238    ws->enable_inclusive = enable_inclusive;
4239    ws->enable_exclusive = enable_exclusive;
4240 
4241    /* ws->result_reduce is already the correct value */
4242    if (ws->enable_inclusive)
4243       ws->result_inclusive = ac_build_alu_op(ctx, ws->result_inclusive, ws->src, ws->op);
4244    if (ws->enable_exclusive)
4245       ws->result_exclusive = ac_build_alu_op(ctx, ws->result_exclusive, ws->extra, ws->op);
4246 }
4247 
4248 /**
4249  * A scan that reduces per-thread values across an entire workgroup.
4250  *
4251  * The caller must ensure that all lanes are active when this code runs
4252  * (WWM is insufficient!), because there is an implied barrier.
4253  */
4254 void ac_build_wg_scan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4255 {
4256    ac_build_wg_scan_top(ctx, ws);
4257    ac_build_s_barrier(ctx);
4258    ac_build_wg_scan_bottom(ctx, ws);
4259 }
4260 
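/* Usage sketch (illustrative): the setup mirrors ac_build_wg_wavescan(), but
 * ws.src is a per-thread value; the helper first scans within the wave and
 * then combines per-wave totals through the LDS scratch:
 *
 *    ws.src = per_thread_value;   // caller-provided
 *    ws.enable_inclusive = true;
 *    ac_build_wg_scan(ctx, &ws);
 *    // ws.result_inclusive is the inclusive prefix over the whole workgroup
 */
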
4261 LLVMValueRef ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned lane0,
4262                                    unsigned lane1, unsigned lane2, unsigned lane3)
4263 {
4264    unsigned mask = dpp_quad_perm(lane0, lane1, lane2, lane3);
4265    if (ctx->chip_class >= GFX8) {
4266       return ac_build_dpp(ctx, src, src, mask, 0xf, 0xf, false);
4267    } else {
4268       return ac_build_ds_swizzle(ctx, src, (1 << 15) | mask);
4269    }
4270 }
4271 
4272 LLVMValueRef ac_build_shuffle(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef index)
4273 {
4274    LLVMTypeRef type = LLVMTypeOf(src);
4275    LLVMValueRef result;
4276 
4277    index = LLVMBuildMul(ctx->builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
4278    src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
4279 
4280    result =
4281       ac_build_intrinsic(ctx, "llvm.amdgcn.ds.bpermute", ctx->i32, (LLVMValueRef[]){index, src}, 2,
4282                          AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
4283    return LLVMBuildTrunc(ctx->builder, result, type, "");
4284 }
4285 
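/* Note on the index scaling above: ds_bpermute addresses lanes in bytes, so
 * the lane index is multiplied by 4 (one dword per lane). Usage sketch
 * (illustrative): broadcast lane 5 to all lanes:
 *
 *    LLVMValueRef v =
 *       ac_build_shuffle(ctx, value, LLVMConstInt(ctx->i32, 5, 0));
 */
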
4286 LLVMValueRef ac_build_frexp_exp(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
4287 {
4288    LLVMTypeRef type;
4289    char *intr;
4290 
4291    if (bitsize == 16) {
4292       intr = "llvm.amdgcn.frexp.exp.i16.f16";
4293       type = ctx->i16;
4294    } else if (bitsize == 32) {
4295       intr = "llvm.amdgcn.frexp.exp.i32.f32";
4296       type = ctx->i32;
4297    } else {
4298       intr = "llvm.amdgcn.frexp.exp.i32.f64";
4299       type = ctx->i32;
4300    }
4301 
4302    LLVMValueRef params[] = {
4303       src0,
4304    };
4305    return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE);
4306 }
4307 LLVMValueRef ac_build_frexp_mant(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
4308 {
4309    LLVMTypeRef type;
4310    char *intr;
4311 
4312    if (bitsize == 16) {
4313       intr = "llvm.amdgcn.frexp.mant.f16";
4314       type = ctx->f16;
4315    } else if (bitsize == 32) {
4316       intr = "llvm.amdgcn.frexp.mant.f32";
4317       type = ctx->f32;
4318    } else {
4319       intr = "llvm.amdgcn.frexp.mant.f64";
4320       type = ctx->f64;
4321    }
4322 
4323    LLVMValueRef params[] = {
4324       src0,
4325    };
4326    return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE);
4327 }
4328 
4329 LLVMValueRef ac_build_canonicalize(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
4330 {
4331    LLVMTypeRef type;
4332    char *intr;
4333 
4334    if (bitsize == 16) {
4335       intr = "llvm.canonicalize.f16";
4336       type = ctx->f16;
4337    } else if (bitsize == 32) {
4338       intr = "llvm.canonicalize.f32";
4339       type = ctx->f32;
4340    } else {
4341       intr = "llvm.canonicalize.f64";
4342       type = ctx->f64;
4343    }
4344 
4345    LLVMValueRef params[] = {
4346       src0,
4347    };
4348    return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE);
4349 }
4350 
4351 /*
4352  * This takes an I,J coordinate pair
4353  * and works out the X and Y derivatives.
4354  * It returns DDX(I), DDX(J), DDY(I), DDY(J).
4355  */
4356 LLVMValueRef ac_build_ddxy_interp(struct ac_llvm_context *ctx, LLVMValueRef interp_ij)
4357 {
4358    LLVMValueRef result[4], a;
4359    unsigned i;
4360 
4361    for (i = 0; i < 2; i++) {
4362       a = LLVMBuildExtractElement(ctx->builder, interp_ij, LLVMConstInt(ctx->i32, i, false), "");
4363       result[i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 1, a);
4364       result[2 + i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 2, a);
4365    }
4366    return ac_build_gather_values(ctx, result, 4);
4367 }
4368 
4369 LLVMValueRef ac_build_load_helper_invocation(struct ac_llvm_context *ctx)
4370 {
4371    LLVMValueRef result =
4372       ac_build_intrinsic(ctx, "llvm.amdgcn.ps.live", ctx->i1, NULL, 0, AC_FUNC_ATTR_READNONE);
4373    return LLVMBuildNot(ctx->builder, result, "");
4374 }
4375 
4376 LLVMValueRef ac_build_is_helper_invocation(struct ac_llvm_context *ctx)
4377 {
4378    if (!ctx->postponed_kill)
4379       return ac_build_load_helper_invocation(ctx);
4380 
4381    /* !(exact && postponed) */
4382    LLVMValueRef exact =
4383       ac_build_intrinsic(ctx, "llvm.amdgcn.ps.live", ctx->i1, NULL, 0, AC_FUNC_ATTR_READNONE);
4384 
4385    LLVMValueRef postponed = LLVMBuildLoad(ctx->builder, ctx->postponed_kill, "");
4386    return LLVMBuildNot(ctx->builder, LLVMBuildAnd(ctx->builder, exact, postponed, ""), "");
4387 }
4388 
4389 LLVMValueRef ac_build_call(struct ac_llvm_context *ctx, LLVMValueRef func, LLVMValueRef *args,
4390                            unsigned num_args)
4391 {
4392    LLVMValueRef ret = LLVMBuildCall(ctx->builder, func, args, num_args, "");
4393    LLVMSetInstructionCallConv(ret, LLVMGetFunctionCallConv(func));
4394    return ret;
4395 }
4396 
4397 void ac_export_mrt_z(struct ac_llvm_context *ctx, LLVMValueRef depth, LLVMValueRef stencil,
4398                      LLVMValueRef samplemask, struct ac_export_args *args)
4399 {
4400    unsigned mask = 0;
4401    unsigned format = ac_get_spi_shader_z_format(depth != NULL, stencil != NULL, samplemask != NULL);
4402 
4403    assert(depth || stencil || samplemask);
4404 
4405    memset(args, 0, sizeof(*args));
4406 
4407    args->valid_mask = 1; /* whether the EXEC mask is valid */
4408    args->done = 1;       /* DONE bit */
4409 
4410    /* Specify the target we are exporting */
4411    args->target = V_008DFC_SQ_EXP_MRTZ;
4412 
4413    args->compr = 0;                       /* COMPR flag */
4414    args->out[0] = LLVMGetUndef(ctx->f32); /* R, depth */
4415    args->out[1] = LLVMGetUndef(ctx->f32); /* G, stencil test val[0:7], stencil op val[8:15] */
4416    args->out[2] = LLVMGetUndef(ctx->f32); /* B, sample mask */
4417    args->out[3] = LLVMGetUndef(ctx->f32); /* A, alpha to mask */
4418 
4419    if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
4420       assert(!depth);
4421       args->compr = 1; /* COMPR flag */
4422 
4423       if (stencil) {
4424          /* Stencil should be in X[23:16]. */
4425          stencil = ac_to_integer(ctx, stencil);
4426          stencil = LLVMBuildShl(ctx->builder, stencil, LLVMConstInt(ctx->i32, 16, 0), "");
4427          args->out[0] = ac_to_float(ctx, stencil);
4428          mask |= 0x3;
4429       }
4430       if (samplemask) {
4431          /* SampleMask should be in Y[15:0]. */
4432          args->out[1] = samplemask;
4433          mask |= 0xc;
4434       }
4435    } else {
4436       if (depth) {
4437          args->out[0] = depth;
4438          mask |= 0x1;
4439       }
4440       if (stencil) {
4441          args->out[1] = stencil;
4442          mask |= 0x2;
4443       }
4444       if (samplemask) {
4445          args->out[2] = samplemask;
4446          mask |= 0x4;
4447       }
4448    }
4449 
4450    /* GFX6 (except OLAND and HAINAN) has a bug where it only looks
4451     * at the X writemask component. */
4452    if (ctx->chip_class == GFX6 && ctx->family != CHIP_OLAND && ctx->family != CHIP_HAINAN)
4453       mask |= 0x1;
4454 
4455    /* Specify which components to enable */
4456    args->enabled_channels = mask;
4457 }
4458 
4459 /* Send GS Alloc Req message from the first wave of the group to SPI.
4460  * Message payload is:
4461  * - bits 0..10: vertices in group
4462  * - bits 12..22: primitives in group
4463  */
4464 void ac_build_sendmsg_gs_alloc_req(struct ac_llvm_context *ctx, LLVMValueRef wave_id,
4465                                    LLVMValueRef vtx_cnt, LLVMValueRef prim_cnt)
4466 {
4467    LLVMBuilderRef builder = ctx->builder;
4468    LLVMValueRef tmp;
4469    bool export_dummy_prim = false;
4470 
4471    /* HW workaround for a GPU hang with 100% culling.
4472     * We always have to export at least 1 primitive.
4473     * Export a degenerate triangle using vertex 0 for all 3 vertices.
4474     */
4475    if (prim_cnt == ctx->i32_0 && ctx->chip_class == GFX10) {
4476       assert(vtx_cnt == ctx->i32_0);
4477       prim_cnt = ctx->i32_1;
4478       vtx_cnt = ctx->i32_1;
4479       export_dummy_prim = true;
4480    }
4481 
4482    ac_build_ifcc(ctx, LLVMBuildICmp(builder, LLVMIntEQ, wave_id, ctx->i32_0, ""), 5020);
4483 
4484    tmp = LLVMBuildShl(builder, prim_cnt, LLVMConstInt(ctx->i32, 12, false), "");
4485    tmp = LLVMBuildOr(builder, tmp, vtx_cnt, "");
4486    ac_build_sendmsg(ctx, AC_SENDMSG_GS_ALLOC_REQ, tmp);
4487 
4488    if (export_dummy_prim) {
4489       struct ac_ngg_prim prim = {0};
4490       /* The vertex indices are 0,0,0. */
4491       prim.passthrough = ctx->i32_0;
4492 
4493       struct ac_export_args pos = {0};
4494       pos.out[0] = pos.out[1] = pos.out[2] = pos.out[3] = ctx->f32_0;
4495       pos.target = V_008DFC_SQ_EXP_POS;
4496       pos.enabled_channels = 0xf;
4497       pos.done = true;
4498 
4499       ac_build_ifcc(ctx, LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(ctx), ctx->i32_0, ""),
4500                     5021);
4501       ac_build_export_prim(ctx, &prim);
4502       ac_build_export(ctx, &pos);
4503       ac_build_endif(ctx, 5021);
4504    }
4505 
4506    ac_build_endif(ctx, 5020);
4507 }
4508 
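/* Worked example of the payload packing above (illustrative): for 3 vertices
 * and 1 primitive the message value is (1 << 12) | 3 = 0x1003, matching the
 * layout "bits 0..10 = vertex count, bits 12..22 = primitive count".
 */
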
4509 LLVMValueRef ac_pack_prim_export(struct ac_llvm_context *ctx, const struct ac_ngg_prim *prim)
4510 {
4511    /* The prim export format is:
4512     *  - bits 0..8: index 0
4513     *  - bit 9: edge flag 0
4514     *  - bits 10..18: index 1
4515     *  - bit 19: edge flag 1
4516     *  - bits 20..28: index 2
4517     *  - bit 29: edge flag 2
4518     *  - bit 31: null primitive (skip)
4519     */
4520    LLVMBuilderRef builder = ctx->builder;
4521    LLVMValueRef tmp = LLVMBuildZExt(builder, prim->isnull, ctx->i32, "");
4522    LLVMValueRef result = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->i32, 31, false), "");
4523 
4524    for (unsigned i = 0; i < prim->num_vertices; ++i) {
4525       tmp = LLVMBuildShl(builder, prim->index[i], LLVMConstInt(ctx->i32, 10 * i, false), "");
4526       result = LLVMBuildOr(builder, result, tmp, "");
4527       tmp = LLVMBuildZExt(builder, prim->edgeflag[i], ctx->i32, "");
4528       tmp = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->i32, 10 * i + 9, false), "");
4529       result = LLVMBuildOr(builder, result, tmp, "");
4530    }
4531    return result;
4532 }
4533 
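/* Worked example (illustrative): a visible triangle with vertex indices
 * {5, 6, 7} and all edge flags false packs as
 *
 *    (5 << 0) | (6 << 10) | (7 << 20) = 0x701805
 *
 * with bit 31 clear because isnull is false.
 */
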
4534 void ac_build_export_prim(struct ac_llvm_context *ctx, const struct ac_ngg_prim *prim)
4535 {
4536    struct ac_export_args args;
4537 
4538    if (prim->passthrough) {
4539       args.out[0] = prim->passthrough;
4540    } else {
4541       args.out[0] = ac_pack_prim_export(ctx, prim);
4542    }
4543 
4544    args.out[0] = LLVMBuildBitCast(ctx->builder, args.out[0], ctx->f32, "");
4545    args.out[1] = LLVMGetUndef(ctx->f32);
4546    args.out[2] = LLVMGetUndef(ctx->f32);
4547    args.out[3] = LLVMGetUndef(ctx->f32);
4548 
4549    args.target = V_008DFC_SQ_EXP_PRIM;
4550    args.enabled_channels = 1;
4551    args.done = true;
4552    args.valid_mask = false;
4553    args.compr = false;
4554 
4555    ac_build_export(ctx, &args);
4556 }
4557 
4558 static LLVMTypeRef arg_llvm_type(enum ac_arg_type type, unsigned size, struct ac_llvm_context *ctx)
4559 {
4560    if (type == AC_ARG_FLOAT) {
4561       return size == 1 ? ctx->f32 : LLVMVectorType(ctx->f32, size);
4562    } else if (type == AC_ARG_INT) {
4563       return size == 1 ? ctx->i32 : LLVMVectorType(ctx->i32, size);
4564    } else {
4565       LLVMTypeRef ptr_type;
4566       switch (type) {
4567       case AC_ARG_CONST_PTR:
4568          ptr_type = ctx->i8;
4569          break;
4570       case AC_ARG_CONST_FLOAT_PTR:
4571          ptr_type = ctx->f32;
4572          break;
4573       case AC_ARG_CONST_PTR_PTR:
4574          ptr_type = ac_array_in_const32_addr_space(ctx->i8);
4575          break;
4576       case AC_ARG_CONST_DESC_PTR:
4577          ptr_type = ctx->v4i32;
4578          break;
4579       case AC_ARG_CONST_IMAGE_PTR:
4580          ptr_type = ctx->v8i32;
4581          break;
4582       default:
4583          unreachable("unknown arg type");
4584       }
4585       if (size == 1) {
4586          return ac_array_in_const32_addr_space(ptr_type);
4587       } else {
4588          assert(size == 2);
4589          return ac_array_in_const_addr_space(ptr_type);
4590       }
4591    }
4592 }
4593 
4594 LLVMValueRef ac_build_main(const struct ac_shader_args *args, struct ac_llvm_context *ctx,
4595                            enum ac_llvm_calling_convention convention, const char *name,
4596                            LLVMTypeRef ret_type, LLVMModuleRef module)
4597 {
4598    LLVMTypeRef arg_types[AC_MAX_ARGS];
4599 
4600    for (unsigned i = 0; i < args->arg_count; i++) {
4601       arg_types[i] = arg_llvm_type(args->args[i].type, args->args[i].size, ctx);
4602    }
4603 
4604    LLVMTypeRef main_function_type = LLVMFunctionType(ret_type, arg_types, args->arg_count, 0);
4605 
4606    LLVMValueRef main_function = LLVMAddFunction(module, name, main_function_type);
4607    LLVMBasicBlockRef main_function_body =
4608       LLVMAppendBasicBlockInContext(ctx->context, main_function, "main_body");
4609    LLVMPositionBuilderAtEnd(ctx->builder, main_function_body);
4610 
4611    LLVMSetFunctionCallConv(main_function, convention);
4612    for (unsigned i = 0; i < args->arg_count; ++i) {
4613       LLVMValueRef P = LLVMGetParam(main_function, i);
4614 
4615       if (args->args[i].file != AC_ARG_SGPR)
4616          continue;
4617 
4618       ac_add_function_attr(ctx->context, main_function, i + 1, AC_FUNC_ATTR_INREG);
4619 
4620       if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
4621          ac_add_function_attr(ctx->context, main_function, i + 1, AC_FUNC_ATTR_NOALIAS);
4622          ac_add_attr_dereferenceable(P, UINT64_MAX);
4623          ac_add_attr_alignment(P, 32);
4624       }
4625    }
4626 
4627    ctx->main_function = main_function;
4628 
4629    if (LLVM_VERSION_MAJOR >= 11) {
4630       /* Enable denormals for FP16 and FP64: */
4631       LLVMAddTargetDependentFunctionAttr(main_function, "denormal-fp-math", "ieee,ieee");
4632       /* Disable denormals for FP32: */
4633       LLVMAddTargetDependentFunctionAttr(main_function, "denormal-fp-math-f32",
4634                                          "preserve-sign,preserve-sign");
4635    }
4636    return main_function;
4637 }
4638 
4639 void ac_build_s_endpgm(struct ac_llvm_context *ctx)
4640 {
4641    LLVMTypeRef calltype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
4642    LLVMValueRef code = LLVMConstInlineAsm(calltype, "s_endpgm", "", true, false);
4643    LLVMBuildCall(ctx->builder, code, NULL, 0, "");
4644 }
4645 
4646 LLVMValueRef ac_prefix_bitcount(struct ac_llvm_context *ctx, LLVMValueRef mask, LLVMValueRef index)
4647 {
4648    LLVMBuilderRef builder = ctx->builder;
4649    LLVMTypeRef type = LLVMTypeOf(mask);
4650 
4651    LLVMValueRef bit =
4652       LLVMBuildShl(builder, LLVMConstInt(type, 1, 0), LLVMBuildZExt(builder, index, type, ""), "");
4653    LLVMValueRef prefix_bits = LLVMBuildSub(builder, bit, LLVMConstInt(type, 1, 0), "");
4654    LLVMValueRef prefix_mask = LLVMBuildAnd(builder, mask, prefix_bits, "");
4655    return ac_build_bit_count(ctx, prefix_mask);
4656 }
4657 
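/* Worked example (illustrative): mask = 0b1011, index = 3 gives
 * bit = 0b1000, prefix_bits = 0b0111, prefix_mask = 0b0011, so the result is
 * 2 -- the number of set bits strictly below bit "index".
 */
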
4658 /* Compute the prefix sum of the "mask" bit array with 128 elements (bits). */
4659 LLVMValueRef ac_prefix_bitcount_2x64(struct ac_llvm_context *ctx, LLVMValueRef mask[2],
4660                                      LLVMValueRef index)
4661 {
4662    LLVMBuilderRef builder = ctx->builder;
4663 #if 0
4664 	/* Reference version using i128. */
4665 	LLVMValueRef input_mask =
4666 		LLVMBuildBitCast(builder, ac_build_gather_values(ctx, mask, 2), ctx->i128, "");
4667 
4668 	return ac_prefix_bitcount(ctx, input_mask, index);
4669 #else
4670    /* Optimized version using 2 64-bit masks. */
4671    LLVMValueRef is_hi, is_0, c64, c128, all_bits;
4672    LLVMValueRef prefix_mask[2], shift[2], mask_bcnt0, prefix_bcnt[2];
4673 
4674    /* Compute the 128-bit prefix mask. */
4675    c64 = LLVMConstInt(ctx->i32, 64, 0);
4676    c128 = LLVMConstInt(ctx->i32, 128, 0);
4677    all_bits = LLVMConstInt(ctx->i64, UINT64_MAX, 0);
4678    /* The first index that can have non-zero high bits in the prefix mask is 65. */
4679    is_hi = LLVMBuildICmp(builder, LLVMIntUGT, index, c64, "");
4680    is_0 = LLVMBuildICmp(builder, LLVMIntEQ, index, ctx->i32_0, "");
4681    mask_bcnt0 = ac_build_bit_count(ctx, mask[0]);
4682 
4683    for (unsigned i = 0; i < 2; i++) {
4684       shift[i] = LLVMBuildSub(builder, i ? c128 : c64, index, "");
4685       /* For i==0, index==0, the right shift by 64 doesn't give the desired result,
4686        * so we handle it by the is_0 select.
4687        * For i==1, index==64, same story, so we handle it by the last is_hi select.
4688        * For i==0, index==64, we shift by 0, which is what we want.
4689        */
4690       prefix_mask[i] =
4691          LLVMBuildLShr(builder, all_bits, LLVMBuildZExt(builder, shift[i], ctx->i64, ""), "");
4692       prefix_mask[i] = LLVMBuildAnd(builder, mask[i], prefix_mask[i], "");
4693       prefix_bcnt[i] = ac_build_bit_count(ctx, prefix_mask[i]);
4694    }
4695 
4696    prefix_bcnt[0] = LLVMBuildSelect(builder, is_0, ctx->i32_0, prefix_bcnt[0], "");
4697    prefix_bcnt[0] = LLVMBuildSelect(builder, is_hi, mask_bcnt0, prefix_bcnt[0], "");
4698    prefix_bcnt[1] = LLVMBuildSelect(builder, is_hi, prefix_bcnt[1], ctx->i32_0, "");
4699 
4700    return LLVMBuildAdd(builder, prefix_bcnt[0], prefix_bcnt[1], "");
4701 #endif
4702 }
4703 
4704 /**
4705  * Convert triangle strip indices to triangle indices. This is used to decompose
4706  * triangle strips into triangles.
4707  */
4708 void ac_build_triangle_strip_indices_to_triangle(struct ac_llvm_context *ctx, LLVMValueRef is_odd,
4709                                                  LLVMValueRef flatshade_first,
4710                                                  LLVMValueRef index[3])
4711 {
4712    LLVMBuilderRef builder = ctx->builder;
4713    LLVMValueRef out[3];
4714 
4715    /* We need to change the vertex order for odd triangles to get correct
4716     * front/back facing by swapping 2 vertex indices, but we also have to
4717     * keep the provoking vertex in the same place.
4718     *
4719     * If the first vertex is provoking, swap index 1 and 2.
4720     * If the last vertex is provoking, swap index 0 and 1.
4721     */
4722    out[0] = LLVMBuildSelect(builder, flatshade_first, index[0],
4723                             LLVMBuildSelect(builder, is_odd, index[1], index[0], ""), "");
4724    out[1] = LLVMBuildSelect(builder, flatshade_first,
4725                             LLVMBuildSelect(builder, is_odd, index[2], index[1], ""),
4726                             LLVMBuildSelect(builder, is_odd, index[0], index[1], ""), "");
4727    out[2] = LLVMBuildSelect(builder, flatshade_first,
4728                             LLVMBuildSelect(builder, is_odd, index[1], index[2], ""), index[2], "");
4729    memcpy(index, out, sizeof(out));
4730 }
4731
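/* Worked example (illustrative): the second (odd) triangle of a strip comes
 * in with indices (1, 2, 3). With flatshade_first = true the function returns
 * (1, 3, 2): vertex 1 remains the provoking vertex while vertices 2 and 3
 * swap to restore the winding order.
 */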