; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI %s
; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s

; FIXME: Broken on evergreen
; FIXME: For some reason the 8- and 16-element vectors are being stored as
; individual elements instead of 128-bit stores.


; FIXME: Why is the constant moved into an intermediate register and
; not just directly into the vector component?

; Static insert at lane 0: overwrites element 0 of the <4 x float> argument %a
; with the constant 5.0 and stores the whole vector to %out.
;
; Only the label line below carries the check prefix. The instruction patterns
; that follow it have no prefix, so FileCheck treats them as ordinary comments
; and does not verify them.
; SI-LABEL: {{^}}insertelement_v4f32_0:
; s_load_dwordx4 s{{[}}[[LOW_REG:[0-9]+]]:
; v_mov_b32_e32
; v_mov_b32_e32 [[CONSTREG:v[0-9]+]], 5.000000e+00
; v_mov_b32_e32 v[[LOW_REG]], [[CONSTREG]]
; buffer_store_dwordx4 v{{[}}[[LOW_REG]]:
define void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0
  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
  ret void
}

; Static insert at lane 1 of a <4 x float>: element 1 of %a is replaced by 5.0
; and the vector is stored to %out. Only the function label is checked, so this
; case verifies that compilation succeeds rather than a specific sequence.
; SI-LABEL: {{^}}insertelement_v4f32_1:
define void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1
  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
  ret void
}

; Static insert at lane 2 of a <4 x float>: element 2 of %a is replaced by 5.0
; and the vector is stored to %out. Only the function label is checked.
; SI-LABEL: {{^}}insertelement_v4f32_2:
define void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2
  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
  ret void
}

; Static insert at lane 3 (the last lane) of a <4 x float>: element 3 of %a is
; replaced by 5.0 and the vector is stored to %out. Only the function label is
; checked.
; SI-LABEL: {{^}}insertelement_v4f32_3:
define void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3
  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
  ret void
}

; Integer counterpart of the static lane-0 insert: element 0 of the <4 x i32>
; argument %a is replaced by the constant 999 and the vector is stored to %out.
; Only the function label is checked.
; SI-LABEL: {{^}}insertelement_v4i32_0:
define void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind {
  %vecins = insertelement <4 x i32> %a, i32 999, i32 0
  store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
  ret void
}

; Dynamic insert into <2 x float>: the lane index %b is a runtime value.
; The checks expect 5.0 (0x40a00000 as an IEEE-754 single) to be materialized in
; a VGPR, written with v_movreld_b32, and the result stored with a single 64-bit
; store whose first register is the v_movreld_b32 destination register.
; SI-LABEL: {{^}}dynamic_insertelement_v2f32:
; SI: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
; SI: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
; SI: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
define void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind {
  %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b
  store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 8
  ret void
}

; Dynamic insert into <4 x float>: the lane index %b is a runtime value.
; The checks expect 5.0 (0x40a00000 as an IEEE-754 single) to be materialized in
; a VGPR, written with v_movreld_b32, and the result stored with a single
; 128-bit store whose first register is the v_movreld_b32 destination register.
; SI-LABEL: {{^}}dynamic_insertelement_v4f32:
; SI: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
; SI: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
; SI: buffer_store_dwordx4 {{v\[}}[[LOW_RESULT_REG]]:
define void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind {
  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b
  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
  ret void
}

; Dynamic insert into <8 x float> with a runtime lane index %b.
; The checks expect one v_movreld_b32 followed by two 128-bit stores for the
; 32-byte result.
; SI-LABEL: {{^}}dynamic_insertelement_v8f32:
; SI: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; SI: buffer_store_dwordx4
; SI: buffer_store_dwordx4
define void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind {
  %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b
  store <8 x float> %vecins, <8 x float> addrspace(1)* %out, align 32
  ret void
}

; Dynamic insert into <16 x float> with a runtime lane index %b.
; The checks expect one v_movreld_b32 followed by four 128-bit stores for the
; 64-byte result.
; SI-LABEL: {{^}}dynamic_insertelement_v16f32:
; SI: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; SI: buffer_store_dwordx4
; SI: buffer_store_dwordx4
; SI: buffer_store_dwordx4
; SI: buffer_store_dwordx4
define void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind {
  %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %b
  store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64
  ret void
}

; Dynamic insert of the constant 5 into <2 x i32> at runtime lane index %b.
; Only the final 64-bit store of the result is checked.
; SI-LABEL: {{^}}dynamic_insertelement_v2i32:
; SI: buffer_store_dwordx2
define void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind {
  %vecins = insertelement <2 x i32> %a, i32 5, i32 %b
  store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 8
  ret void
}

; Dynamic insert of the constant 5 into <4 x i32> at runtime lane index %b.
; Only the final 128-bit store of the result is checked.
; SI-LABEL: {{^}}dynamic_insertelement_v4i32:
; SI: buffer_store_dwordx4
define void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b) nounwind {
  %vecins = insertelement <4 x i32> %a, i32 5, i32 %b
  store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
  ret void
}

; Dynamic insert of the constant 5 into <8 x i32> at runtime lane index %b.
; The store patterns below use the FIXMESI prefix, which neither llc/FileCheck
; invocation at the top of this file enables, so they are presumably disabled
; until the expected 128-bit stores are actually produced. Only the label is
; verified.
; SI-LABEL: {{^}}dynamic_insertelement_v8i32:
; FIXMESI: buffer_store_dwordx4
; FIXMESI: buffer_store_dwordx4
define void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind {
  %vecins = insertelement <8 x i32> %a, i32 5, i32 %b
  store <8 x i32> %vecins, <8 x i32> addrspace(1)* %out, align 32
  ret void
}

; Dynamic insert of the constant 5 into <16 x i32> at runtime lane index %b.
; The four store patterns below use the FIXMESI prefix, which neither
; llc/FileCheck invocation at the top of this file enables, so they are
; presumably disabled. Only the label is verified.
; SI-LABEL: {{^}}dynamic_insertelement_v16i32:
; FIXMESI: buffer_store_dwordx4
; FIXMESI: buffer_store_dwordx4
; FIXMESI: buffer_store_dwordx4
; FIXMESI: buffer_store_dwordx4
define void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind {
  %vecins = insertelement <16 x i32> %a, i32 5, i32 %b
  store <16 x i32> %vecins, <16 x i32> addrspace(1)* %out, align 64
  ret void
}


; Dynamic insert of the constant 5 into <2 x i16> at runtime lane index %b.
; The store pattern uses the disabled FIXMESI prefix, so only the label is
; verified.
; SI-LABEL: {{^}}dynamic_insertelement_v2i16:
; FIXMESI: buffer_store_dwordx2
define void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind {
  %vecins = insertelement <2 x i16> %a, i16 5, i32 %b
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out, align 8
  ret void
}

; Dynamic insert of the constant 5 into <4 x i16> at runtime lane index %b.
; The store pattern uses the disabled FIXMESI prefix, so only the label is
; verified.
; SI-LABEL: {{^}}dynamic_insertelement_v4i16:
; FIXMESI: buffer_store_dwordx4
define void @dynamic_insertelement_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, i32 %b) nounwind {
  %vecins = insertelement <4 x i16> %a, i16 5, i32 %b
  store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out, align 16
  ret void
}


; Dynamic insert of the constant 5 into <2 x i8> at runtime lane index %b.
; The store pattern uses the disabled FIXMESI prefix, so only the label is
; verified.
; NOTE(review): the disabled pattern spells the mnemonic in upper case, unlike
; every other pattern in this file; it probably needs updating before it can be
; enabled - confirm against current llc output.
; SI-LABEL: {{^}}dynamic_insertelement_v2i8:
; FIXMESI: BUFFER_STORE_USHORT
define void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a, i32 %b) nounwind {
  %vecins = insertelement <2 x i8> %a, i8 5, i32 %b
  store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8
  ret void
}

; Dynamic insert of the constant 5 into <4 x i8> at runtime lane index %b.
; The store pattern uses the disabled FIXMESI prefix, so only the label is
; verified.
; SI-LABEL: {{^}}dynamic_insertelement_v4i8:
; FIXMESI: buffer_store_dword
define void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, i32 %b) nounwind {
  %vecins = insertelement <4 x i8> %a, i8 5, i32 %b
  store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 16
  ret void
}

; Dynamic insert of the constant 5 into <8 x i8> at runtime lane index %b.
; The store pattern uses the disabled FIXMESI prefix, so only the label is
; verified.
; SI-LABEL: {{^}}dynamic_insertelement_v8i8:
; FIXMESI: buffer_store_dwordx2
define void @dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> %a, i32 %b) nounwind {
  %vecins = insertelement <8 x i8> %a, i8 5, i32 %b
  store <8 x i8> %vecins, <8 x i8> addrspace(1)* %out, align 16
  ret void
}

; Dynamic insert of the constant 5 into <16 x i8> at runtime lane index %b.
; The store pattern uses the disabled FIXMESI prefix, so only the label is
; verified.
; SI-LABEL: {{^}}dynamic_insertelement_v16i8:
; FIXMESI: buffer_store_dwordx4
define void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind {
  %vecins = insertelement <16 x i8> %a, i8 5, i32 %b
  store <16 x i8> %vecins, <16 x i8> addrspace(1)* %out, align 16
  ret void
}

; This test requires handling INSERT_SUBREG in SIFixSGPRCopies.  Check that
; the compiler doesn't crash.
;
; Shape of the function: the entry block places %a in lane 0 of a <2 x i32>
; and branches on %a == 0. Each successor fills lane 1 with a value loaded from
; %in (the first i32 on the "if" path, the second i32 on the "else" path). The
; two partially built vectors meet in a phi and the result is stored to %out.
; Only the function label is checked.
; SI-LABEL: {{^}}insert_split_bb:
define void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) {
entry:
  %0 = insertelement <2 x i32> undef, i32 %a, i32 0
  %1 = icmp eq i32 %a, 0
  br i1 %1, label %if, label %else

if:
  %2 = load i32, i32 addrspace(1)* %in
  %3 = insertelement <2 x i32> %0, i32 %2, i32 1
  br label %endif

else:
  %4 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %5 = load i32, i32 addrspace(1)* %4
  %6 = insertelement <2 x i32> %0, i32 %5, i32 1
  br label %endif

endif:
  %7 = phi <2 x i32> [%3, %if], [%6, %else]
  store <2 x i32> %7, <2 x i32> addrspace(1)* %out
  ret void
}

; Dynamic insert of the double 8.0 into <2 x double> at runtime index %b.
; The checks expect the 64-bit element to be written as two 32-bit halves:
;   - the index is shifted left by 1 to turn an element index into a dword
;     index, and that value is copied to m0 for the first v_movreld_b32, which
;     writes 0 (the low 32 bits of 8.0);
;   - the scaled index is OR-ed with 1 to address the following dword, and the
;     second v_movreld_b32 writes 0x40200000 (the high 32 bits of 8.0).
; The result is then stored with one 128-bit store.
; NOTE(review): the two accepted s_load_dword offsets differ by a factor of 4,
; presumably dword vs. byte addressing on the two -mcpu targets - confirm.
; SI-LABEL: {{^}}dynamic_insertelement_v2f64:
; SI: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x11|0x44}}{{$}}
; SI-DAG: s_lshl_b32 [[SCALEDIDX:s[0-9]+]], [[IDX]], 1{{$}}
; SI-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 0{{$}}

; SI: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
; SI: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
; SI: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
; SI: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}

; SI: s_mov_b32 m0, [[SCALEDIDX]]
; SI: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT0]]

; Increment to next element.
; FIXME: Should be able to manipulate m0 directly instead of add and
; copy.

; SI: s_or_b32 [[IDX1:s[0-9]+]], [[SCALEDIDX]], 1
; SI-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 0x40200000
; SI-DAG: s_mov_b32 m0, [[IDX1]]
; SI: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT1]]

; SI: buffer_store_dwordx4
; SI: s_endpgm
define void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, i32 %b) nounwind {
  %vecins = insertelement <2 x double> %a, double 8.0, i32 %b
  store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16
  ret void
}

; Dynamic insert of the i64 constant 5 into <2 x i64> at runtime index %b.
; The checks expect the two 32-bit halves of the constant (5 and 0) to be
; materialized in VGPRs and written by two v_movreld_b32 instructions, in
; either order, followed by one 128-bit store of the result.
; FIXME: Inline immediate should be folded into v_movreld_b32.
; SI-LABEL: {{^}}dynamic_insertelement_v2i64:

; SI-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 5{{$}}
; SI-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 0{{$}}

; SI-DAG: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT0]]
; SI-DAG: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT1]]

; SI: buffer_store_dwordx4
; SI: s_endpgm
define void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind {
  %vecins = insertelement <2 x i64> %a, i64 5, i32 %b
  store <2 x i64> %vecins, <2 x i64> addrspace(1)* %out, align 8
  ret void
}

; Dynamic insert of the double 8.0 into <4 x double> at runtime index %b.
; The checks expect the vector to round-trip through scratch memory: the input
; is spilled with two 128-bit stores, the new element is written with one
; 64-bit store, the vector is reloaded with two 128-bit loads, and the result
; is written out with two 128-bit stores.
;
; FIXME: Should be able to do without stack access. The used stack
; space is also 2x what should be required.

; SI-LABEL: {{^}}dynamic_insertelement_v4f64:
; SI: SCRATCH_RSRC_DWORD

; Stack store
; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}

; Write element
; SI: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}

; Stack reload
; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}

; Store result
; SI: buffer_store_dwordx4
; SI: buffer_store_dwordx4
; SI: s_endpgm
; SI: ScratchSize: 64

define void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind {
  %vecins = insertelement <4 x double> %a, double 8.0, i32 %b
  store <4 x double> %vecins, <4 x double> addrspace(1)* %out, align 16
  ret void
}

; Dynamic insert of the double 8.0 into <8 x double> at runtime index %b.
; As in the <4 x double> case, the checks expect the vector to round-trip
; through scratch memory: four 128-bit spill stores, one 64-bit store of the
; new element, four 128-bit reloads, then four 128-bit stores of the result.
; NOTE(review): the spill stores are checked at offsets 0/16/32/48, but the
; four reload patterns only name offsets 0 and 16 (each twice). This looks like
; a copy-paste slip for offset:32 and offset:48 - confirm against llc output
; before tightening the patterns.
; SI-LABEL: {{^}}dynamic_insertelement_v8f64:
; SI: SCRATCH_RSRC_DWORD

; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:32{{$}}
; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:48{{$}}

; SI: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}

; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}

; SI: buffer_store_dwordx4
; SI: buffer_store_dwordx4
; SI: buffer_store_dwordx4
; SI: buffer_store_dwordx4
; SI: s_endpgm
; SI: ScratchSize: 128
define void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) nounwind {
  %vecins = insertelement <8 x double> %a, double 8.0, i32 %b
  store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16
  ret void
}
