1; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-nsa-encoding -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,NONSA %s
2; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,NSA %s
3
4; GCN-LABEL: {{^}}sample_2d:
5;
6; TODO: use NSA here
7; GCN: v_mov_b32_e32 v2, v0
8;
9; GCN: image_sample v[0:3], v[1:2],
; sample_2d: amdgpu_ps function returning the <4 x float> result of a 2D
; image sample (first intrinsic operand is 15).
; The float arguments are declared as (%t, %s) but are handed to the intrinsic
; as (%s, %t), i.e. in the opposite order. The expectations above still look
; for a register copy followed by a contiguous v[1:2] address in both run
; configurations.
10define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %t, float %s) {
11main_body:
12  %v = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
13  ret <4 x float> %v
14}
15
16; GCN-LABEL: {{^}}sample_3d:
17; NONSA: v_mov_b32_e32 v3, v0
18; NONSA: image_sample v[0:3], v[1:3],
19; NSA: image_sample v[0:3], [v1, v2, v0],
; sample_3d: amdgpu_ps function returning the <4 x float> result of a 3D
; image sample.
; The float arguments are declared as (%r, %s, %t) but the intrinsic receives
; them as (%s, %t, %r). This rotation matches the [v1, v2, v0] address list
; expected above when NSA encoding is enabled, and the extra v_mov plus
; v[1:3] range expected when it is disabled.
20define amdgpu_ps <4 x float> @sample_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %r, float %s, float %t) {
21main_body:
22  %v = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32 15, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
23  ret <4 x float> %v
24}
25
26; GCN-LABEL: {{^}}sample_d_3d:
27; NSA: image_sample_d v[0:3], [v3, v8, v7, v5, v4, v6, v0, v2, v1],
; sample_d_3d: amdgpu_ps function returning the <4 x float> result of a 3D
; image sample with explicit derivatives (sample.d), which takes nine float
; address operands.
; Declared argument order:
;   %s, %r, %t, %dsdh, %dtdv, %dsdv, %drdv, %drdh, %dtdh
; Order passed to the intrinsic:
;   %dsdh, %dtdh, %drdh, %dsdv, %dtdv, %drdv, %s, %t, %r
; With arguments numbered v0..v8 in declaration order, that permutation is the
; [v3, v8, v7, v5, v4, v6, v0, v2, v1] list expected above for the NSA run.
28define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %r, float %t, float %dsdh, float %dtdv, float %dsdv, float %drdv, float %drdh, float %dtdh) {
29main_body:
30  %v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f32(i32 15, float %dsdh, float %dtdh, float %drdh, float %dsdv, float %dtdv, float %drdv, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
31  ret <4 x float> %v
32}
33
34; GCN-LABEL: {{^}}sample_contig_nsa:
35; GCN: image_sample_c_l v0, v[0:7],
36; NSA: image_sample v1, [v6, v7, v5],
; sample_contig_nsa: issues two single-channel samples and returns both
; results packed into a <2 x float>.
;   * First call (sample.c.l, 3D): operands (%zcompare, %s1, %t1, %r1, %lod)
;     are in the same order as the function arguments, so a contiguous
;     register range is expected.
;   * Second call (sample, 3D): operands (%s2, %t2, %r2) are rotated relative
;     to the declared order (%r2, %s2, %t2), so the NSA run is expected to use
;     the non-contiguous list [v6, v7, v5].
37define amdgpu_ps <2 x float> @sample_contig_nsa(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s1, float %t1, float %r1, float %lod, float %r2, float %s2, float %t2) {
38main_body:
39  %v1 = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
40  %v2 = call float @llvm.amdgcn.image.sample.3d.f32.f32(i32 1, float %s2, float %t2, float %r2, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
  ; Pack the two scalar results: element 0 = first sample, element 1 = second.
41  %r.0 = insertelement <2 x float> undef, float %v1, i32 0
42  %r = insertelement <2 x float> %r.0, float %v2, i32 1
43  ret <2 x float> %r
44}
45
46; GCN-LABEL: {{^}}sample_nsa_nsa:
47; NSA: image_sample_c_l v0, [v1, v2, v3, v4, v0],
48; NSA: image_sample v1, [v6, v7, v5],
; sample_nsa_nsa: issues two single-channel samples and returns both results
; packed into a <2 x float>. Both calls take their address operands in an
; order that differs from the declared argument order:
;   * First call (sample.c.l, 3D): %lod is the first float argument but the
;     last address operand, giving the expected list [v1, v2, v3, v4, v0].
;   * Second call (sample, 3D): operands (%s2, %t2, %r2) versus declared
;     (%r2, %s2, %t2), giving the expected list [v6, v7, v5].
49define amdgpu_ps <2 x float> @sample_nsa_nsa(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %lod, float %zcompare, float %s1, float %t1, float %r1, float %r2, float %s2, float %t2) {
50main_body:
51  %v1 = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
52  %v2 = call float @llvm.amdgcn.image.sample.3d.f32.f32(i32 1, float %s2, float %t2, float %r2, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
  ; Pack the two scalar results: element 0 = first sample, element 1 = second.
53  %r.0 = insertelement <2 x float> undef, float %v1, i32 0
54  %r = insertelement <2 x float> %r.0, float %v2, i32 1
55  ret <2 x float> %r
56}
57
58; GCN-LABEL: {{^}}sample_nsa_contig:
59; NSA: image_sample_c_l v0, [v1, v2, v3, v4, v0],
60; NSA: image_sample v1, v[5:7],
; sample_nsa_contig: issues two single-channel samples and returns both
; results packed into a <2 x float>.
;   * First call (sample.c.l, 3D): %lod is the first float argument but the
;     last address operand, giving the expected list [v1, v2, v3, v4, v0].
;   * Second call (sample, 3D): operands (%s2, %t2, %r2) are in declared
;     order, so the contiguous range v[5:7] is expected.
61define amdgpu_ps <2 x float> @sample_nsa_contig(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %lod, float %zcompare, float %s1, float %t1, float %r1, float %s2, float %t2, float %r2) {
62main_body:
63  %v1 = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
64  %v2 = call float @llvm.amdgcn.image.sample.3d.f32.f32(i32 1, float %s2, float %t2, float %r2, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
  ; Pack the two scalar results: element 0 = first sample, element 1 = second.
65  %r.0 = insertelement <2 x float> undef, float %v1, i32 0
66  %r = insertelement <2 x float> %r.0, float %v2, i32 1
67  ret <2 x float> %r
68}
69
70; GCN-LABEL: {{^}}sample_contig_contig:
71; GCN: image_sample_c_l v0, v[0:7],
72; NSA: image_sample v1, v[5:7],
73; NONSA: image_sample v1, v[5:7],
; sample_contig_contig: issues two single-channel samples and returns both
; results packed into a <2 x float>. Both calls take their address operands in
; exactly the declared argument order, so contiguous register ranges are
; expected for both instructions in both run configurations.
74define amdgpu_ps <2 x float> @sample_contig_contig(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s1, float %t1, float %r1, float %lod, float %s2, float %t2, float %r2) {
75main_body:
76  %v1 = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
77  %v2 = call float @llvm.amdgcn.image.sample.3d.f32.f32(i32 1, float %s2, float %t2, float %r2, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
  ; Pack the two scalar results: element 0 = first sample, element 1 = second.
78  %r.0 = insertelement <2 x float> undef, float %v1, i32 0
79  %r = insertelement <2 x float> %r.0, float %v2, i32 1
80  ret <2 x float> %r
81}
82
83; Test that undef inputs with NSA are handled safely; these tests used to crash.
84
85; GCN-LABEL: {{^}}sample_undef_undef_undef_undef:
86; GCN: image_sample_c_b v0, v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY
; sample_undef_undef_undef_undef: calls the sample.c.b 1D-array intrinsic with
; all four float address operands undef; only the resource and sampler
; descriptors are real inputs. A contiguous v[0:3] address is expected in both
; run configurations.
87define amdgpu_ps float @sample_undef_undef_undef_undef(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp) {
88  %r = call float @llvm.amdgcn.image.sample.c.b.1darray.f32.f32.f32(i32 1, float undef, float undef, float undef, float undef, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
89  ret float %r
90}
91
92; GCN-LABEL: {{^}}sample_undef_undef_undef_def:
93; NONSA: v_mov_b32_e32 v3, v0
94; NONSA: image_sample_c_b v0, v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY
95; NSA: image_sample_c_b v0, [v0, v0, v0, v0], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY
; sample_undef_undef_undef_def: the first three float address operands are
; undef and only the last one (%layer, taken directly from a function argument)
; is defined. With NSA encoding the expectations above accept v0 in every
; address slot; without it, %layer is copied into v3 of a contiguous v[0:3].
96define amdgpu_ps float @sample_undef_undef_undef_def(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %layer) {
97  %r = call float @llvm.amdgcn.image.sample.c.b.1darray.f32.f32.f32(i32 1, float undef, float undef, float undef, float %layer, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
98  ret float %r
99}
100
101; GCN-LABEL: {{^}}sample_undef_undef_undef_def_rnd:
102; GCN: v_rndne_f32_e32 v3, v0
103; GCN: image_sample_c_b v0, v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY
; sample_undef_undef_undef_def_rnd: same shape as the previous test, except
; the single defined address operand is the result of llvm.rint on %layer
; rather than the raw argument. The expectations above require the rounding
; instruction to write v3 directly, followed by a contiguous v[0:3] address in
; both run configurations.
104define amdgpu_ps float @sample_undef_undef_undef_def_rnd(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %layer) {
105  %layer_rnd = call float @llvm.rint.f32(float %layer)
106  %r = call float @llvm.amdgcn.image.sample.c.b.1darray.f32.f32.f32(i32 1, float undef, float undef, float undef, float %layer_rnd, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
107  ret float %r
108}
109
110; GCN-LABEL: {{^}}sample_def_undef_undef_undef:
111; GCN: v_add_f32_e32 v0, 1.0, v0
112; GCN: image_sample_c_b v0, v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY
; sample_def_undef_undef_undef: only the first float address operand is
; defined (%z0 + 1.0); the remaining three are undef. The expectations above
; require the add to produce v0 and a contiguous v[0:3] address in both run
; configurations.
113define amdgpu_ps float @sample_def_undef_undef_undef(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %z0) {
114  ; The NSA reassign pass is conservative (quite reasonably!) when one of the operands
115  ; comes directly from a function argument (via COPY). To test that NSA can be
116  ; eliminated in the presence of undef, just add an arbitrary intermediate
117  ; computation.
118  %c0 = fadd float %z0, 1.0
119  %r = call float @llvm.amdgcn.image.sample.c.b.1darray.f32.f32.f32(i32 1, float %c0, float undef, float undef, float undef, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
120  ret float %r
121}
122
; Image-sample intrinsics returning <4 x float>, used by sample_2d, sample_3d
; and sample_d_3d.
123declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
124declare <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
125declare <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f32(i32, float, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
126
; Image-sample intrinsics returning a single float, used by the two-sample
; contig/nsa tests.
127declare float @llvm.amdgcn.image.sample.3d.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
128declare float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
129
; Intrinsics used by the undef-operand tests.
130declare float @llvm.rint.f32(float) #2
131declare float @llvm.amdgcn.image.sample.c.b.1darray.f32.f32.f32(i32 immarg, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #1
132
; Attribute groups: #1 is attached to every image-sample declaration above,
; #2 to llvm.rint.
133attributes #1 = { nounwind readonly }
134attributes #2 = { nounwind readnone speculatable willreturn }
135