; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
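; Test expansion of insertelement with a dynamic (non-constant) index for a
; variety of vector and element types.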

; GCN-LABEL: {{^}}float4_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 3
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC1]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 2
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC2]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX]], 1
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC3]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC4:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC4]]
; GCN:     flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST]]:[[ELT_LAST]]]
define amdgpu_kernel void @float4_inselt(<4 x float> addrspace(1)* %out, <4 x float> %vec, i32 %sel) {
entry:
  %v = insertelement <4 x float> %vec, float 1.000000e+00, i32 %sel
  store <4 x float> %v, <4 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}float4_inselt_undef:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-NOT: v_cmp_
; GCN-NOT: v_cndmask_
; GCN:     v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
; GCN:     v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
; GCN:     v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
; GCN:     v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
define amdgpu_kernel void @float4_inselt_undef(<4 x float> addrspace(1)* %out, i32 %sel) {
entry:
  %v = insertelement <4 x float> undef, float 1.000000e+00, i32 %sel
  store <4 x float> %v, <4 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}int4_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-DAG: s_cmp_lg_u32 [[IDX:s[0-9]+]], 3
; GCN-DAG: s_cselect_b32 s[[ELT_3:[0-9]+]], s{{[0-9]+}}, 1
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 2
; GCN-DAG: s_cselect_b32 s[[ELT_2:[0-9]+]], s{{[0-9]+}}, 1
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 1
; GCN-DAG: s_cselect_b32 s[[ELT_1:[0-9]+]], s{{[0-9]+}}, 1
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 0
; GCN-DAG: s_cselect_b32 s[[ELT_0:[0-9]+]], s{{[0-9]+}}, 1
; GCN-DAG: v_mov_b32_e32 v[[VELT_0:[0-9]+]], s[[ELT_0]]
; GCN-DAG: v_mov_b32_e32 v[[VELT_1:[0-9]+]], s[[ELT_1]]
; GCN-DAG: v_mov_b32_e32 v[[VELT_2:[0-9]+]], s[[ELT_2]]
; GCN-DAG: v_mov_b32_e32 v[[VELT_3:[0-9]+]], s[[ELT_3]]
; GCN:     flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[VELT_0]]:[[VELT_3]]]
define amdgpu_kernel void @int4_inselt(<4 x i32> addrspace(1)* %out, <4 x i32> %vec, i32 %sel) {
entry:
  %v = insertelement <4 x i32> %vec, i32 1, i32 %sel
  store <4 x i32> %v, <4 x i32> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}float2_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 1
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC1]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC2]]
; GCN:     flat_store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST]]:[[ELT_LAST]]]
define amdgpu_kernel void @float2_inselt(<2 x float> addrspace(1)* %out, <2 x float> %vec, i32 %sel) {
entry:
  %v = insertelement <2 x float> %vec, float 1.000000e+00, i32 %sel
  store <2 x float> %v, <2 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}float8_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 3
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST0:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC1]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 2
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC2]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX]], 1
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC3]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC4:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST0:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC4]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC5:[^,]+]], [[IDX]], 7
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST1:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC5]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC6:[^,]+]], [[IDX]], 6
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC6]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC7:[^,]+]], [[IDX]], 5
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC7]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC8:[^,]+]], [[IDX]], 4
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST1:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC8]]
; GCN-DAG: flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST0]]:[[ELT_LAST0]]]
; GCN-DAG: flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST1]]:[[ELT_LAST1]]]
define amdgpu_kernel void @float8_inselt(<8 x float> addrspace(1)* %out, <8 x float> %vec, i32 %sel) {
entry:
  %v = insertelement <8 x float> %vec, float 1.000000e+00, i32 %sel
  store <8 x float> %v, <8 x float> addrspace(1)* %out
  ret void
}

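; For 16 and 32 element vectors of 32-bit elements the per-element
; compare/select expansion is not used; the insert is done with an indirect
; VGPR write (v_movreld_b32) instead.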
; GCN-LABEL: {{^}}float16_inselt:
; GCN: v_movreld_b32
define amdgpu_kernel void @float16_inselt(<16 x float> addrspace(1)* %out, <16 x float> %vec, i32 %sel) {
entry:
  %v = insertelement <16 x float> %vec, float 1.000000e+00, i32 %sel
  store <16 x float> %v, <16 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}float32_inselt:
; GCN: v_movreld_b32
define amdgpu_kernel void @float32_inselt(<32 x float> addrspace(1)* %out, <32 x float> %vec, i32 %sel) {
entry:
  %v = insertelement <32 x float> %vec, float 1.000000e+00, i32 %sel
  store <32 x float> %v, <32 x float> addrspace(1)* %out
  ret void
}

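; Vectors of 16-bit and 8-bit elements that fit in one or two SGPRs are
; updated with scalar shift/mask/or sequences rather than per-element selects.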
; GCN-LABEL: {{^}}half4_inselt:
; GCN-NOT: v_cndmask_b32
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN:     s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
; GCN:     s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]]
; GCN:     s_andn2_b64
; GCN:     s_mov_b32 s[[KLO:[0-9]+]], 0x3c003c00
; GCN:     s_mov_b32 s[[KHI:[0-9]+]], s[[KLO]]
; GCN:     s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s{{\[}}[[KLO]]:[[KHI]]]
; GCN:     s_or_b64
define amdgpu_kernel void @half4_inselt(<4 x half> addrspace(1)* %out, <4 x half> %vec, i32 %sel) {
entry:
  %v = insertelement <4 x half> %vec, half 1.000000e+00, i32 %sel
  store <4 x half> %v, <4 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}half2_inselt:
; GCN-NOT: v_cndmask_b32
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN:     s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
; GCN:     s_lshl_b32 [[V:s[0-9]+]], 0xffff, [[SEL]]
; GCN:     s_andn2_b32
; GCN:     s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x3c003c00
; GCN:     s_or_b32
define amdgpu_kernel void @half2_inselt(<2 x half> addrspace(1)* %out, <2 x half> %vec, i32 %sel) {
entry:
  %v = insertelement <2 x half> %vec, half 1.000000e+00, i32 %sel
  store <2 x half> %v, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}half8_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 0
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 1
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 2
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 3
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 4
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 5
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 6
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 7
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
define amdgpu_kernel void @half8_inselt(<8 x half> addrspace(1)* %out, <8 x half> %vec, i32 %sel) {
entry:
  %v = insertelement <8 x half> %vec, half 1.000000e+00, i32 %sel
  store <8 x half> %v, <8 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}short2_inselt:
; GCN-NOT: v_cndmask_b32
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN:     s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
; GCN:     s_lshl_b32 [[V:s[0-9]+]], 0xffff, [[SEL]]
; GCN:     s_andn2_b32
; GCN:     s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x10001
; GCN:     s_or_b32
define amdgpu_kernel void @short2_inselt(<2 x i16> addrspace(1)* %out, <2 x i16> %vec, i32 %sel) {
entry:
  %v = insertelement <2 x i16> %vec, i16 1, i32 %sel
  store <2 x i16> %v, <2 x i16> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}short4_inselt:
; GCN-NOT: v_cndmask_b32
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN:     s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
; GCN:     s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]]
; GCN:     s_andn2_b64
; GCN:     s_mov_b32 s[[KLO:[0-9]+]], 0x10001
; GCN:     s_mov_b32 s[[KHI:[0-9]+]], s[[KLO]]
; GCN:     s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s{{\[}}[[KLO]]:[[KHI]]]
; GCN:     s_or_b64
define amdgpu_kernel void @short4_inselt(<4 x i16> addrspace(1)* %out, <4 x i16> %vec, i32 %sel) {
entry:
  %v = insertelement <4 x i16> %vec, i16 1, i32 %sel
  store <4 x i16> %v, <4 x i16> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}byte8_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN:     s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 3
; GCN:     s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]]
; GCN:     s_mov_b32 [[K:s[0-9]+]], 0x1010101
; GCN:     s_and_b32 s3, s1, [[K]]
; GCN:     s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[K]]
; GCN:     s_andn2_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]
; GCN:     s_or_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]
define amdgpu_kernel void @byte8_inselt(<8 x i8> addrspace(1)* %out, <8 x i8> %vec, i32 %sel) {
entry:
  %v = insertelement <8 x i8> %vec, i8 1, i32 %sel
  store <8 x i8> %v, <8 x i8> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}byte16_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 0
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 15
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
define amdgpu_kernel void @byte16_inselt(<16 x i8> addrspace(1)* %out, <16 x i8> %vec, i32 %sel) {
entry:
  %v = insertelement <16 x i8> %vec, i8 1, i32 %sel
  store <16 x i8> %v, <16 x i8> addrspace(1)* %out
  ret void
}

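; 64-bit elements are inserted as two 32-bit halves, so each element of the
; vector takes a pair of v_cndmask instructions.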
; GCN-LABEL: {{^}}double2_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 1
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CC1]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC1]]
; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CC2]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC2]]
define amdgpu_kernel void @double2_inselt(<2 x double> addrspace(1)* %out, <2 x double> %vec, i32 %sel) {
entry:
  %v = insertelement <2 x double> %vec, double 1.000000e+00, i32 %sel
  store <2 x double> %v, <2 x double> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}double5_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-COUNT-10: v_cndmask_b32
define amdgpu_kernel void @double5_inselt(<5 x double> addrspace(1)* %out, <5 x double> %vec, i32 %sel) {
entry:
  %v = insertelement <5 x double> %vec, double 1.000000e+00, i32 %sel
  store <5 x double> %v, <5 x double> addrspace(1)* %out
  ret void
}

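; Wider double vectors switch to indirect VGPR writes: m0 is set once and the
; two halves of the inserted element are written with consecutive
; v_movreld_b32 instructions.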
; GCN-LABEL: {{^}}double8_inselt:
; GCN-NOT: v_cndmask
; GCN-NOT: buffer_
; GCN-NOT: s_or_b32
; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]]
; GCN-DAG: v_movreld_b32_e32 v[[#BASE:]], 0
; GCN-NOT: s_mov_b32 m0
; GCN:     v_movreld_b32_e32 v[[#BASE+1]],
define amdgpu_kernel void @double8_inselt(<8 x double> addrspace(1)* %out, <8 x double> %vec, i32 %sel) {
entry:
  %v = insertelement <8 x double> %vec, double 1.000000e+00, i32 %sel
  store <8 x double> %v, <8 x double> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}double7_inselt:
; GCN-NOT: v_cndmask
; GCN-NOT: buffer_
; GCN-NOT: s_or_b32
; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]]
; GCN-DAG: v_movreld_b32_e32 v[[#BASE:]], 0
; GCN-NOT: s_mov_b32 m0
; GCN:     v_movreld_b32_e32 v[[#BASE+1]],
define amdgpu_kernel void @double7_inselt(<7 x double> addrspace(1)* %out, <7 x double> %vec, i32 %sel) {
entry:
  %v = insertelement <7 x double> %vec, double 1.000000e+00, i32 %sel
  store <7 x double> %v, <7 x double> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}double16_inselt:
; GCN-NOT: v_cndmask
; GCN-NOT: buffer_
; GCN-NOT: s_or_b32
; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]]
; GCN-DAG: v_movreld_b32_e32 v[[#BASE:]], 0
; GCN-NOT: s_mov_b32 m0
; GCN:     v_movreld_b32_e32 v[[#BASE+1]],
define amdgpu_kernel void @double16_inselt(<16 x double> addrspace(1)* %out, <16 x double> %vec, i32 %sel) {
entry:
  %v = insertelement <16 x double> %vec, double 1.000000e+00, i32 %sel
  store <16 x double> %v, <16 x double> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}double15_inselt:
; GCN-NOT: v_cndmask
; GCN-NOT: buffer_
; GCN-NOT: s_or_b32
; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]]
; GCN-DAG: v_movreld_b32_e32 v[[#BASE:]], 0
; GCN-NOT: s_mov_b32 m0
; GCN:     v_movreld_b32_e32 v[[#BASE+1]],
define amdgpu_kernel void @double15_inselt(<15 x double> addrspace(1)* %out, <15 x double> %vec, i32 %sel) {
entry:
  %v = insertelement <15 x double> %vec, double 1.000000e+00, i32 %sel
  store <15 x double> %v, <15 x double> addrspace(1)* %out
  ret void
}

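; Dynamic insertion into <4 x i1> is currently expanded through scratch
; memory.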
; GCN-LABEL: {{^}}bit4_inselt:
; GCN: buffer_store_byte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
define amdgpu_kernel void @bit4_inselt(<4 x i1> addrspace(1)* %out, <4 x i1> %vec, i32 %sel) {
entry:
  %v = insertelement <4 x i1> %vec, i1 1, i32 %sel
  store <4 x i1> %v, <4 x i1> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}bit128_inselt:
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], s{{[0-9]+}}, 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}, [[CC1]]
; GCN-DAG: v_mov_b32_e32 [[LASTIDX:v[0-9]+]], 0x7f
; GCN-DAG: v_cmp_ne_u32_e32 [[CCL:[^,]+]], s{{[0-9]+}}, [[LASTIDX]]
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}, [[CCL]]
define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i1> %vec, i32 %sel) {
entry:
  %v = insertelement <128 x i1> %vec, i1 1, i32 %sel
  store <128 x i1> %v, <128 x i1> addrspace(1)* %out
  ret void
}

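; When the source vector arrives in VGPRs (non-kernel calling conventions),
; the insert is expanded into a compare and select per 32-bit element, even
; for wide vectors.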
; GCN-LABEL: {{^}}float32_inselt_vec:
; GCN-NOT: buffer_
; GCN-COUNT-32: v_cmp_ne_u32
; GCN-COUNT-32: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, 1.0,
define amdgpu_ps <32 x float> @float32_inselt_vec(<32 x float> %vec, i32 %sel) {
entry:
  %v = insertelement <32 x float> %vec, float 1.000000e+00, i32 %sel
  ret <32 x float> %v
}

; GCN-LABEL: {{^}}double8_inselt_vec:
; GCN-NOT: buffer_
; GCN:         v_cmp_eq_u32
; GCN-COUNT-2: v_cndmask_b32
; GCN:         v_cmp_eq_u32
; GCN-COUNT-2: v_cndmask_b32
; GCN:         v_cmp_eq_u32
; GCN-COUNT-2: v_cndmask_b32
; GCN:         v_cmp_eq_u32
; GCN-COUNT-2: v_cndmask_b32
; GCN:         v_cmp_eq_u32
; GCN-COUNT-2: v_cndmask_b32
; GCN:         v_cmp_eq_u32
; GCN-COUNT-2: v_cndmask_b32
; GCN:         v_cmp_eq_u32
; GCN-COUNT-2: v_cndmask_b32
; GCN:         v_cmp_eq_u32
; GCN-COUNT-2: v_cndmask_b32
define <8 x double> @double8_inselt_vec(<8 x double> %vec, i32 %sel) {
entry:
  %v = insertelement <8 x double> %vec, double 1.000000e+00, i32 %sel
  ret <8 x double> %v
}