; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL,PREGFX9 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL,PREGFX9 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE,PREGFX9 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE,GFX9 %s

; Tests for indirect addressing on SI, which is implemented using dynamic
; indexing of vectors.

; Extract with a constant offset added to the index; the add is folded into
; the scalar index computation before m0 / gpr_idx is set up.
; GCN-LABEL: {{^}}extract_w_offset:
; GCN-DAG: s_load_dword [[IN0:s[0-9]+]]
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 4.0
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0
; GCN-DAG: v_mov_b32_e32 [[BASEREG:v[0-9]+]], 1.0
; GCN-DAG: s_add_i32 [[IN:s[0-9]+]], [[IN0]], 1

; MOVREL-DAG: s_mov_b32 m0, [[IN]]
; MOVREL: v_movrels_b32_e32 v{{[0-9]+}}, [[BASEREG]]

; IDXMODE: s_set_gpr_idx_on [[IN]], gpr_idx(SRC0){{$}}
; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, [[BASEREG]]
; IDXMODE-NEXT: s_set_gpr_idx_off
define amdgpu_kernel void @extract_w_offset(float addrspace(1)* %out, i32 %in) {
entry:
  %idx = add i32 %in, 1
  %elt = extractelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, i32 %idx
  store float %elt, float addrspace(1)* %out
  ret void
}

; XXX: Could do v_or_b32 directly
; GCN-LABEL: {{^}}extract_w_offset_salu_use_vector:
; GCN-DAG: s_or_b32
; GCN-DAG: s_or_b32
; GCN-DAG: s_or_b32
; GCN-DAG: s_or_b32
; MOVREL: s_mov_b32 m0
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}


; MOVREL: v_movrels_b32_e32

; IDXMODE: s_set_gpr_idx_on s{{[0-9]+}}, gpr_idx(SRC0){{$}}
; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; IDXMODE-NEXT: s_set_gpr_idx_off
define amdgpu_kernel void @extract_w_offset_salu_use_vector(i32 addrspace(1)* %out, i32 %in, <16 x i32> %or.val) {
entry:
  %idx = add i32 %in, 1
  %vec = or <16 x i32> %or.val, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
  %elt = extractelement <16 x i32> %vec, i32 %idx
  store i32 %elt, i32 addrspace(1)* %out
  ret void
}

; Extract with the raw index (no offset); the loaded scalar is used directly
; for m0 / gpr_idx.
; GCN-LABEL: {{^}}extract_wo_offset:
; GCN-DAG: s_load_dword [[IN:s[0-9]+]]
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 4.0
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0
; GCN-DAG: v_mov_b32_e32 [[BASEREG:v[0-9]+]], 1.0

; MOVREL-DAG: s_mov_b32 m0, [[IN]]
; MOVREL: v_movrels_b32_e32 v{{[0-9]+}}, [[BASEREG]]

; IDXMODE: s_set_gpr_idx_on [[IN]], gpr_idx(SRC0){{$}}
; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, [[BASEREG]]
; IDXMODE-NEXT: s_set_gpr_idx_off
define amdgpu_kernel void @extract_wo_offset(float addrspace(1)* %out, i32 %in) {
entry:
  %elt = extractelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, i32 %in
  store float %elt, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extract_neg_offset_sgpr:
; The offset depends on the register that holds the first element of the vector.
; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
; MOVREL: v_movrels_b32_e32 v{{[0-9]}}, v0

; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
; IDXMODE: v_mov_b32_e32 v14, 15
; IDXMODE: v_mov_b32_e32 v15, 16
; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(SRC0){{$}}
; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; IDXMODE-NEXT: s_set_gpr_idx_off
define amdgpu_kernel void @extract_neg_offset_sgpr(i32 addrspace(1)* %out, i32 %offset) {
entry:
  %index = add i32 %offset, -512
  %value = extractelement <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 %index
  store i32 %value, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extract_neg_offset_sgpr_loaded:
; The offset depends on the register that holds the first element of the vector.
; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
; MOVREL: v_movrels_b32_e32 v{{[0-9]}}, v0

; IDXMODE-DAG: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
; IDXMODE-DAG: v_mov_b32_e32 v0,
; IDXMODE: v_mov_b32_e32 v1,
; IDXMODE: v_mov_b32_e32 v2,
; IDXMODE: v_mov_b32_e32 v3,
; IDXMODE: v_mov_b32_e32 v4,
; IDXMODE: v_mov_b32_e32 v5,
; IDXMODE: v_mov_b32_e32 v6,
; IDXMODE: v_mov_b32_e32 v7,
; IDXMODE: v_mov_b32_e32 v8,
; IDXMODE: v_mov_b32_e32 v9,
; IDXMODE: v_mov_b32_e32 v10,
; IDXMODE: v_mov_b32_e32 v11,
; IDXMODE: v_mov_b32_e32 v12,
; IDXMODE: v_mov_b32_e32 v13,
; IDXMODE: v_mov_b32_e32 v14,
; IDXMODE: v_mov_b32_e32 v15,
; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(SRC0){{$}}
; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; IDXMODE-NEXT: s_set_gpr_idx_off
define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(i32 addrspace(1)* %out, <16 x i32> %vec0, <16 x i32> %vec1, i32 %offset) {
entry:
  %index = add i32 %offset, -512
  %or = or <16 x i32> %vec0, %vec1
  %value = extractelement <16 x i32> %or, i32 %index
  store i32 %value, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extract_neg_offset_vgpr:
; The offset depends on the register that holds the first element of the vector.

; GCN: v_cmp_eq_u32_e32
; GCN-COUNT-14: v_cndmask_b32
; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], 16
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @extract_neg_offset_vgpr(i32 addrspace(1)* %out) {
entry:
  %id = call i32 @llvm.amdgcn.workitem.id.x() #1
  %index = add i32 %id, -512
  %value = extractelement <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 %index
  store i32 %value, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extract_undef_offset_sgpr:
; undefined behavior, but shouldn't crash compiler
define amdgpu_kernel void @extract_undef_offset_sgpr(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
entry:
  %ld = load volatile <4 x i32>, <4  x i32> addrspace(1)* %in
  %value = extractelement <4 x i32> %ld, i32 undef
  store i32 %value, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}insert_undef_offset_sgpr_vector_src:
; undefined behavior, but shouldn't crash compiler
define amdgpu_kernel void @insert_undef_offset_sgpr_vector_src(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
entry:
  %ld = load <4 x i32>, <4  x i32> addrspace(1)* %in
  %value = insertelement <4 x i32> %ld, i32 5, i32 undef
  store <4 x i32> %value, <4 x i32> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}insert_w_offset:
; GCN-DAG: s_load_dword [[IN0:s[0-9]+]]
; MOVREL-DAG: s_add_i32 [[IN:s[0-9]+]], [[IN0]], 1
; MOVREL-DAG: s_mov_b32 m0, [[IN]]
; GCN-DAG: v_mov_b32_e32 v[[ELT0:[0-9]+]], 1.0
; GCN-DAG: v_mov_b32_e32 v[[ELT1:[0-9]+]], 2.0
; GCN-DAG: v_mov_b32_e32 v[[ELT2:[0-9]+]], 0x40400000
; GCN-DAG: v_mov_b32_e32 v[[ELT3:[0-9]+]], 4.0
; GCN-DAG: v_mov_b32_e32 v[[ELT15:[0-9]+]], 0x41800000
; GCN-DAG: v_mov_b32_e32 v[[INS:[0-9]+]], 0x41880000

; MOVREL: v_movreld_b32_e32 v[[ELT0]], v[[INS]]
; MOVREL: buffer_store_dwordx4 v{{\[}}[[ELT0]]:[[ELT3]]{{\]}}
define amdgpu_kernel void @insert_w_offset(<16 x float> addrspace(1)* %out, i32 %in) {
entry:
  %add = add i32 %in, 1
  %ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %add
  store <16 x float> %ins, <16 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}insert_unsigned_base_plus_offset:
; GCN-DAG: s_load_dword [[IN:s[0-9]+]]
; GCN-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 1.0
; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 2.0
; GCN-DAG: s_and_b32 [[BASE:s[0-9]+]], [[IN]], 0xffff

; MOVREL: s_mov_b32 m0, [[BASE]]
; MOVREL: v_movreld_b32_e32 [[ELT1]], v{{[0-9]+}}

; IDXMODE: s_set_gpr_idx_on [[BASE]], gpr_idx(DST)
; IDXMODE-NEXT: v_mov_b32_e32 [[ELT1]], v{{[0-9]+}}
; IDXMODE-NEXT: s_set_gpr_idx_off
define amdgpu_kernel void @insert_unsigned_base_plus_offset(<16 x float> addrspace(1)* %out, i16 %in) {
entry:
  %base = zext i16 %in to i32
  %add = add i32 %base, 1
  %ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %add
  store <16 x float> %ins, <16 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}insert_signed_base_plus_offset:
; GCN-DAG: s_load_dword [[IN:s[0-9]+]]
; GCN-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 1.0
; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 2.0

; GCN-DAG: s_sext_i32_i16 [[BASE:s[0-9]+]], [[IN]]
; GCN-DAG: s_add_i32 [[BASE_PLUS_OFFSET:s[0-9]+]], [[BASE]], 1

; MOVREL: s_mov_b32 m0, [[BASE_PLUS_OFFSET]]
; MOVREL: v_movreld_b32_e32 [[ELT0]], v{{[0-9]+}}

; IDXMODE: s_set_gpr_idx_on [[BASE_PLUS_OFFSET]], gpr_idx(DST)
; IDXMODE-NEXT: v_mov_b32_e32 [[ELT0]], v{{[0-9]+}}
; IDXMODE-NEXT: s_set_gpr_idx_off
define amdgpu_kernel void @insert_signed_base_plus_offset(<16 x float> addrspace(1)* %out, i16 %in) {
entry:
  %base = sext i16 %in to i32
  %add = add i32 %base, 1
  %ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %add
  store <16 x float> %ins, <16 x float> addrspace(1)* %out
  ret void
}


; GCN-LABEL: {{^}}insert_wo_offset:
; GCN: s_load_dword [[IN:s[0-9]+]]

; MOVREL: s_mov_b32 m0, [[IN]]
; MOVREL: v_movreld_b32_e32 v[[ELT0:[0-9]+]]

; IDXMODE: s_set_gpr_idx_on [[IN]], gpr_idx(DST)
; IDXMODE-NEXT: v_mov_b32_e32 v[[ELT0:[0-9]+]], v{{[0-9]+}}
; IDXMODE-NEXT: s_set_gpr_idx_off

; GCN: buffer_store_dwordx4 v{{\[}}[[ELT0]]:
define amdgpu_kernel void @insert_wo_offset(<16 x float> addrspace(1)* %out, i32 %in) {
entry:
  %ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %in
  store <16 x float> %ins, <16 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}insert_neg_offset_sgpr:
; The offset depends on the register that holds the first element of the vector.
; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
; MOVREL: v_movreld_b32_e32 v0, 16

; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(DST)
; IDXMODE-NEXT: v_mov_b32_e32 v0, 16
; IDXMODE-NEXT: s_set_gpr_idx_off
define amdgpu_kernel void @insert_neg_offset_sgpr(i32 addrspace(1)* %in, <16 x i32> addrspace(1)* %out, i32 %offset) {
entry:
  %index = add i32 %offset, -512
  %value = insertelement <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, i32 16, i32 %index
  store <16 x i32> %value, <16 x i32> addrspace(1)* %out
  ret void
}

; The vector indexed into is originally loaded into an SGPR rather
; than built with a reg_sequence

; GCN-LABEL: {{^}}insert_neg_offset_sgpr_loadreg:
; The offset depends on the register that holds the first element of the vector.
; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
; MOVREL: v_movreld_b32_e32 v0, 5

; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(DST)
; IDXMODE-NEXT: v_mov_b32_e32 v0, 5
; IDXMODE-NEXT: s_set_gpr_idx_off
define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(i32 addrspace(1)* %in, <16 x i32> addrspace(1)* %out, <16 x i32> %vec, i32 %offset) {
entry:
  %index = add i32 %offset, -512
  %value = insertelement <16 x i32> %vec, i32 5, i32 %index
  store <16 x i32> %value, <16 x i32> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}insert_neg_offset_vgpr:
; The offset depends on the register that holds the first element of the vector.

; GCN: v_cmp_eq_u32_e32
; GCN-COUNT-16: v_cndmask_b32
; GCN-COUNT-4:  buffer_store_dwordx4
define amdgpu_kernel void @insert_neg_offset_vgpr(i32 addrspace(1)* %in, <16 x i32> addrspace(1)* %out) {
entry:
  %id = call i32 @llvm.amdgcn.workitem.id.x() #1
  %index = add i32 %id, -512
  %value = insertelement <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 33, i32 %index
  store <16 x i32> %value, <16 x i32> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}insert_neg_inline_offset_vgpr:

; GCN: v_cmp_eq_u32_e32
; GCN-COUNT-16: v_cndmask_b32
; GCN-COUNT-4:  buffer_store_dwordx4
define amdgpu_kernel void @insert_neg_inline_offset_vgpr(i32 addrspace(1)* %in, <16 x i32> addrspace(1)* %out) {
entry:
  %id = call i32 @llvm.amdgcn.workitem.id.x() #1
  %index = add i32 %id, -16
  %value = insertelement <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 500, i32 %index
  store <16 x i32> %value, <16 x i32> addrspace(1)* %out
  ret void
}

; When the block is split to insert the loop, make sure any other
; places that need to be expanded in the same block are also handled.

; GCN-LABEL: {{^}}extract_vgpr_offset_multiple_in_block:

; GCN-DAG: {{buffer|flat|global}}_load_dword [[IDX0:v[0-9]+]]
; GCN: v_cmp_eq_u32
; GCN: v_cndmask_b32_e64 [[RESULT0:v[0-9]+]], 16,
; GCN: v_cndmask_b32_e64 [[RESULT1:v[0-9]+]], 16,

; GCN: buffer_store_dword [[RESULT0]]
; GCN: buffer_store_dword [[RESULT1]]
define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 {
entry:
  %id = call i32 @llvm.amdgcn.workitem.id.x() #1
  %id.ext = zext i32 %id to i64
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id.ext
  %idx0 = load volatile i32, i32 addrspace(1)* %gep
  %idx1 = add i32 %idx0, 1
  %val0 = extractelement <16 x i32> <i32 7, i32 9, i32 11, i32 13, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 %idx0
  %live.out.reg = call i32 asm sideeffect "s_mov_b32 $0, 17", "={s4}" ()
  %val1 = extractelement <16 x i32> <i32 7, i32 9, i32 11, i32 13, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 %idx1
  store volatile i32 %val0, i32 addrspace(1)* %out0
  store volatile i32 %val1, i32 addrspace(1)* %out0
  %cmp = icmp eq i32 %id, 0
  br i1 %cmp, label %bb1, label %bb2

bb1:
  store volatile i32 %live.out.reg, i32 addrspace(1)* undef
  br label %bb2

bb2:
  ret void
}

; Moved subtest for insert_vgpr_offset_multiple_in_block to a separate file to
; avoid very different schedule-induced issues with gfx9.
; test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll


; Dynamic inserts in two adjacent predecessor blocks feeding a phi; the
; expansion must not break either block.
; GCN-LABEL: {{^}}insert_adjacent_blocks:
define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) #0 {
bb:
  %tmp = icmp eq i32 %arg, 0
  br i1 %tmp, label %bb1, label %bb4

bb1:                                              ; preds = %bb
  %tmp2 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
  %tmp3 = insertelement <4 x float> %tmp2, float %val0, i32 undef
  call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp3) #0 ; Prevent block optimize out
  br label %bb7

bb4:                                              ; preds = %bb
  %tmp5 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
  %tmp6 = insertelement <4 x float> %tmp5, float %val0, i32 undef
  call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp6) #0 ; Prevent block optimize out
  br label %bb7

bb7:                                              ; preds = %bb4, %bb1
  %tmp8 = phi <4 x float> [ %tmp3, %bb1 ], [ %tmp6, %bb4 ]
  store volatile <4 x float> %tmp8, <4 x float> addrspace(1)* undef
  ret void
}

; FIXME: Should be able to fold zero input to movreld to inline imm?

; GCN-LABEL: {{^}}multi_same_block:

; GCN: s_load_dword [[ARG:s[0-9]+]]

; MOVREL: v_mov_b32_e32 v{{[0-9]+}}, 0x41900000
; MOVREL: s_waitcnt
; MOVREL: s_add_i32 m0, [[ARG]], -16
; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, 4.0
; MOVREL: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd
; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, -4.0
; MOVREL: s_mov_b32 m0, -1


; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 0x41900000
; IDXMODE: s_waitcnt
; IDXMODE: s_add_i32 [[ARG]], [[ARG]], -16
; IDXMODE: s_set_gpr_idx_on [[ARG]], gpr_idx(DST)
; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 4.0
; IDXMODE: s_set_gpr_idx_off
; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd
; IDXMODE: s_set_gpr_idx_on [[ARG]], gpr_idx(DST)
; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, -4.0
; IDXMODE: s_set_gpr_idx_off

; GCN: ds_write_b32
; GCN: ds_write_b32
; GCN: s_endpgm
define amdgpu_kernel void @multi_same_block(i32 %arg) #0 {
bb:
  %tmp1 = add i32 %arg, -16
  %tmp2 = insertelement <9 x float> <float 1.700000e+01, float 1.800000e+01, float 1.900000e+01, float 2.000000e+01, float 2.100000e+01, float 2.200000e+01, float 2.300000e+01, float 2.400000e+01, float 2.500000e+01>, float 4.000000e+00, i32 %tmp1
  %tmp3 = add i32 %arg, -16
  %tmp4 = insertelement <9 x float> <float 0x40311999A0000000, float 0x40321999A0000000, float 0x40331999A0000000, float 0x40341999A0000000, float 0x40351999A0000000, float 0x40361999A0000000, float 0x40371999A0000000, float 0x40381999A0000000, float 0x40391999A0000000>, float -4.0, i32 %tmp3
  %tmp5 = bitcast <9 x float> %tmp2 to <9 x i32>
  %tmp6 = extractelement <9 x i32> %tmp5, i32 1
  %tmp7 = bitcast <9 x float> %tmp4 to <9 x i32>
  %tmp8 = extractelement <9 x i32> %tmp7, i32 5
  store volatile i32 %tmp6, i32 addrspace(3)* undef, align 4
  store volatile i32 %tmp8, i32 addrspace(3)* undef, align 4
  ret void
}

; The offset puts the access outside of the superregister boundaries, so clamp to 1st element.
; GCN-LABEL: {{^}}extract_largest_inbounds_offset:
; GCN-DAG: buffer_load_dwordx4 v{{\[}}[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]
; GCN-DAG: s_load_dword [[IDX0:s[0-9]+]]
; GCN-DAG: s_add_i32 [[IDX:s[0-9]+]], [[IDX0]], 15

; MOVREL: s_mov_b32 m0, [[IDX]]
; MOVREL: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]]

; IDXMODE: s_set_gpr_idx_on [[IDX]], gpr_idx(SRC0)
; IDXMODE: v_mov_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]]
; IDXMODE: s_set_gpr_idx_off

; GCN: buffer_store_dword [[EXTRACT]]
define amdgpu_kernel void @extract_largest_inbounds_offset(i32 addrspace(1)* %out, <16 x i32> addrspace(1)* %in, i32 %idx) {
entry:
  %ld = load volatile <16 x i32>, <16  x i32> addrspace(1)* %in
  %offset = add i32 %idx, 15
  %value = extractelement <16 x i32> %ld, i32 %offset
  store i32 %value, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extract_out_of_bounds_offset:
; GCN-DAG: buffer_load_dwordx4 v{{\[}}[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]{{\]}}
; GCN-DAG: s_load_dword [[IDX:s[0-9]+]]
; GCN: s_add_i32 [[ADD_IDX:s[0-9]+]], [[IDX]], 16

; MOVREL: s_mov_b32 m0, [[ADD_IDX]]
; MOVREL: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]]

; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(SRC0)
; IDXMODE: v_mov_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]]
; IDXMODE: s_set_gpr_idx_off

; GCN: buffer_store_dword [[EXTRACT]]
define amdgpu_kernel void @extract_out_of_bounds_offset(i32 addrspace(1)* %out, <16 x i32> addrspace(1)* %in, i32 %idx) {
entry:
  %ld = load volatile <16 x i32>, <16  x i32> addrspace(1)* %in
  %offset = add i32 %idx, 16
  %value = extractelement <16 x i32> %ld, i32 %offset
  store i32 %value, i32 addrspace(1)* %out
  ret void
}

; The index is built as (idx.in << 2) | 1; the dynamic-index path must still
; be used for the extract.
; GCN-LABEL: {{^}}extractelement_v16i32_or_index:
; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]

; MOVREL: s_mov_b32 m0, [[IDX_SHL]]
; MOVREL: v_movrels_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}

; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], gpr_idx(SRC0)
; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; IDXMODE: s_set_gpr_idx_off
define amdgpu_kernel void @extractelement_v16i32_or_index(i32 addrspace(1)* %out, <16 x i32> addrspace(1)* %in, i32 %idx.in) {
entry:
  %ld = load volatile <16 x i32>, <16  x i32> addrspace(1)* %in
  %idx.shl = shl i32 %idx.in, 2
  %idx = or i32 %idx.shl, 1
  %value = extractelement <16 x i32> %ld, i32 %idx
  store i32 %value, i32 addrspace(1)* %out
  ret void
}

; Same shl+or index pattern as above, but for a dynamic insert.
; GCN-LABEL: {{^}}insertelement_v16f32_or_index:
; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]

; MOVREL: s_mov_b32 m0, [[IDX_SHL]]
; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}

; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], gpr_idx(DST)
; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; IDXMODE: s_set_gpr_idx_off
define amdgpu_kernel void @insertelement_v16f32_or_index(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %idx.in) nounwind {
  %idx.shl = shl i32 %idx.in, 2
  %idx = or i32 %idx.shl, 1
  %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %idx
  store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64
  ret void
}

; Dynamic insert with a VGPR index inside a loop with a phi; the waterfall
; loop expansion must preserve the phi/exec-mask structure.
; GCN-LABEL: {{^}}broken_phi_bb:
; GCN: v_mov_b32_e32 [[PHIREG:v[0-9]+]], 8

; GCN: {{BB[0-9]+_[0-9]+}}:
; GCN: [[BB2:BB[0-9]+_[0-9]+]]:
; GCN: v_cmp_le_i32_e32 vcc, s{{[0-9]+}}, [[PHIREG]]
; GCN: buffer_load_dword

; GCN: [[REGLOOP:BB[0-9]+_[0-9]+]]:
; MOVREL: v_movreld_b32_e32

; IDXMODE: s_set_gpr_idx_on
; IDXMODE: v_mov_b32_e32
; IDXMODE: s_set_gpr_idx_off

; GCN: s_cbranch_execnz [[REGLOOP]]

; GCN: {{^; %bb.[0-9]}}:
; GCN: s_mov_b64 exec,
; GCN: s_cbranch_execnz [[BB2]]

define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) #0 {
bb:
  br label %bb2

bb2:                                              ; preds = %bb4, %bb
  %tmp = phi i32 [ 8, %bb ], [ %tmp7, %bb4 ]
  %tmp3 = icmp slt i32 %tmp, %arg
  br i1 %tmp3, label %bb4, label %bb8

bb4:                                              ; preds = %bb2
  %vgpr = load volatile i32, i32 addrspace(1)* undef
  %tmp5 = insertelement <16 x i32> undef, i32 undef, i32 %vgpr
  %tmp6 = insertelement <16 x i32> %tmp5, i32 %arg1, i32 %vgpr
  %tmp7 = extractelement <16 x i32> %tmp6, i32 0
  br label %bb2

bb8:                                              ; preds = %bb2
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1
declare void @llvm.amdgcn.s.barrier() #2

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind convergent }
