; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s

; Code generation tests for insertelement with a dynamic (non-constant) index.
; Every function below inserts a constant into a vector at position %sel and
; then either stores the vector to %out or returns it. The check lines pin the
; instruction sequence that is expected for each element type / vector width.
;
; FileCheck runs with -enable-var-scope, so pattern variables captured in one
; function are not visible after the next label directive; each test has to
; capture the variables it uses.

; <4 x float>, vector passed as kernel argument: one v_cmp_ne_u32 against each
; constant lane index feeding one v_cndmask_b32 that selects 1.0, and a single
; 4-dword flat store of the result. No v_movrel and no buffer_ instructions.
; GCN-LABEL: {{^}}float4_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 3
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC1]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 2
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC2]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX]], 1
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC3]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC4:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC4]]
; GCN: flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST]]:[[ELT_LAST]]]
define amdgpu_kernel void @float4_inselt(<4 x float> addrspace(1)* %out, <4 x float> %vec, i32 %sel) {
entry:
  %v = insertelement <4 x float> %vec, float 1.000000e+00, i32 %sel
  store <4 x float> %v, <4 x float> addrspace(1)* %out
  ret void
}

; Insert into an undef <4 x float>: no compare or select is expected at all,
; only 1.0 materialized once and copied into three more registers.
; GCN-LABEL: {{^}}float4_inselt_undef:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-NOT: v_cmp_
; GCN-NOT: v_cndmask_
; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
; GCN: v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
; GCN: v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
; GCN: v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
define amdgpu_kernel void @float4_inselt_undef(<4 x float> addrspace(1)* %out, i32 %sel) {
entry:
  %v = insertelement <4 x float> undef, float 1.000000e+00, i32 %sel
  store <4 x float> %v, <4 x float> addrspace(1)* %out
  ret void
}

; <4 x i32>: the per-lane selection is expected on the scalar unit
; (s_cmp_lg_u32 + s_cselect_b32 with the constant 1), followed by copies of the
; four scalar results into consecutive VGPRs for the 4-dword flat store.
; GCN-LABEL: {{^}}int4_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-DAG: s_cmp_lg_u32 [[IDX:s[0-9]+]], 3
; GCN-DAG: s_cselect_b32 s[[ELT_3:[0-9]+]], s{{[0-9]+}}, 1
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 2
; GCN-DAG: s_cselect_b32 s[[ELT_2:[0-9]+]], s{{[0-9]+}}, 1
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 1
; GCN-DAG: s_cselect_b32 s[[ELT_1:[0-9]+]], s{{[0-9]+}}, 1
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 0
; GCN-DAG: s_cselect_b32 s[[ELT_0:[0-9]+]], s{{[0-9]+}}, 1
; GCN-DAG: v_mov_b32_e32 v[[VELT_0:[0-9]+]], s[[ELT_0]]
; GCN-DAG: v_mov_b32_e32 v[[VELT_1:[0-9]+]], s[[ELT_1]]
; GCN-DAG: v_mov_b32_e32 v[[VELT_2:[0-9]+]], s[[ELT_2]]
; GCN-DAG: v_mov_b32_e32 v[[VELT_3:[0-9]+]], s[[ELT_3]]
; GCN: flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[VELT_0]]:[[VELT_3]]]
define amdgpu_kernel void @int4_inselt(<4 x i32> addrspace(1)* %out, <4 x i32> %vec, i32 %sel) {
entry:
  %v = insertelement <4 x i32> %vec, i32 1, i32 %sel
  store <4 x i32> %v, <4 x i32> addrspace(1)* %out
  ret void
}

; <2 x float>: same compare/select shape as the 4-element case, with two lanes
; and a 2-dword flat store.
; GCN-LABEL: {{^}}float2_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 1
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC1]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC2]]
; GCN: flat_store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST]]:[[ELT_LAST]]]
define amdgpu_kernel void @float2_inselt(<2 x float> addrspace(1)* %out, <2 x float> %vec, i32 %sel) {
entry:
  %v = insertelement <2 x float> %vec, float 1.000000e+00, i32 %sel
  store <2 x float> %v, <2 x float> addrspace(1)* %out
  ret void
}

; <8 x float>: eight compare/select pairs (lane indices 0..7) and two 4-dword
; flat stores, one per half of the vector.
; NOTE(review): IDX is captured a second time on the compare against 7, so the
; upper-half checks are tied to that capture rather than to the first one;
; presumably both are the same SGPR - confirm before tightening.
; GCN-LABEL: {{^}}float8_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 3
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST0:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC1]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 2
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC2]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX]], 1
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC3]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC4:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST0:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC4]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC5:[^,]+]], [[IDX:s[0-9]+]], 7
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST1:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC5]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC6:[^,]+]], [[IDX]], 6
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC6]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC7:[^,]+]], [[IDX]], 5
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC7]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC8:[^,]+]], [[IDX]], 4
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST1:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC8]]
; GCN-DAG: flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST0]]:[[ELT_LAST0]]]
; GCN-DAG: flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST1]]:[[ELT_LAST1]]]
define amdgpu_kernel void @float8_inselt(<8 x float> addrspace(1)* %out, <8 x float> %vec, i32 %sel) {
entry:
  %v = insertelement <8 x float> %vec, float 1.000000e+00, i32 %sel
  store <8 x float> %v, <8 x float> addrspace(1)* %out
  ret void
}

; <16 x float>: an indexed register write (v_movreld_b32) is expected instead
; of a compare/select chain.
; GCN-LABEL: {{^}}float16_inselt:
; GCN: v_movreld_b32
define amdgpu_kernel void @float16_inselt(<16 x float> addrspace(1)* %out, <16 x float> %vec, i32 %sel) {
entry:
  %v = insertelement <16 x float> %vec, float 1.000000e+00, i32 %sel
  store <16 x float> %v, <16 x float> addrspace(1)* %out
  ret void
}

; <32 x float>: likewise expects v_movreld_b32.
; GCN-LABEL: {{^}}float32_inselt:
; GCN: v_movreld_b32
define amdgpu_kernel void @float32_inselt(<32 x float> addrspace(1)* %out, <32 x float> %vec, i32 %sel) {
entry:
  %v = insertelement <32 x float> %vec, float 1.000000e+00, i32 %sel
  store <32 x float> %v, <32 x float> addrspace(1)* %out
  ret void
}

; <4 x half>: expected as 64-bit scalar bit manipulation. The index is shifted
; left by 4, used as the shift amount of a 64-bit shift, and the result is
; combined via andn2 / and / or with the constant 0x3c003c00 replicated into
; both halves of a 64-bit register pair. No v_cndmask, v_movrel or buffer_.
; GCN-LABEL: {{^}}half4_inselt:
; GCN-NOT: v_cndmask_b32
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
; GCN: s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]]
; GCN: s_andn2_b64
; GCN: s_mov_b32 s[[KLO:[0-9]+]], 0x3c003c00
; GCN: s_mov_b32 s[[KHI:[0-9]+]], s[[KLO]]
; GCN: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s{{\[}}[[KLO]]:[[KHI]]]
; GCN: s_or_b64
define amdgpu_kernel void @half4_inselt(<4 x half> addrspace(1)* %out, <4 x half> %vec, i32 %sel) {
entry:
  %v = insertelement <4 x half> %vec, half 1.000000e+00, i32 %sel
  store <4 x half> %v, <4 x half> addrspace(1)* %out
  ret void
}

; <2 x half>: the 32-bit form of the same sequence, with 0xffff shifted by the
; scaled index and 0x3c003c00 used as an inline operand of s_and_b32.
; GCN-LABEL: {{^}}half2_inselt:
; GCN-NOT: v_cndmask_b32
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
; GCN: s_lshl_b32 [[V:s[0-9]+]], 0xffff, [[SEL]]
; GCN: s_andn2_b32
; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x3c003c00
; GCN: s_or_b32
define amdgpu_kernel void @half2_inselt(<2 x half> addrspace(1)* %out, <2 x half> %vec, i32 %sel) {
entry:
  %v = insertelement <2 x half> %vec, half 1.000000e+00, i32 %sel
  store <2 x half> %v, <2 x half> addrspace(1)* %out
  ret void
}

; <8 x half>: eight compares (lane indices 0..7), eight v_cndmask_b32 and four
; v_or_b32_sdwa are expected, in any order.
; GCN-LABEL: {{^}}half8_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 0
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 1
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 2
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 3
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 4
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 5
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 6
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 7
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
define amdgpu_kernel void @half8_inselt(<8 x half> addrspace(1)* %out, <8 x half> %vec, i32 %sel) {
entry:
  %v = insertelement <8 x half> %vec, half 1.000000e+00, i32 %sel
  store <8 x half> %v, <8 x half> addrspace(1)* %out
  ret void
}

; <2 x i16>: same 32-bit scalar sequence as the <2 x half> case, with the
; constant 0x10001.
; GCN-LABEL: {{^}}short2_inselt:
; GCN-NOT: v_cndmask_b32
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
; GCN: s_lshl_b32 [[V:s[0-9]+]], 0xffff, [[SEL]]
; GCN: s_andn2_b32
; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x10001
; GCN: s_or_b32
define amdgpu_kernel void @short2_inselt(<2 x i16> addrspace(1)* %out, <2 x i16> %vec, i32 %sel) {
entry:
  %v = insertelement <2 x i16> %vec, i16 1, i32 %sel
  store <2 x i16> %v, <2 x i16> addrspace(1)* %out
  ret void
}

; <4 x i16>: same 64-bit scalar sequence as the <4 x half> case, with the
; constant 0x10001 replicated into a 64-bit register pair.
; GCN-LABEL: {{^}}short4_inselt:
; GCN-NOT: v_cndmask_b32
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
; GCN: s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]]
; GCN: s_andn2_b64
; GCN: s_mov_b32 s[[KLO:[0-9]+]], 0x10001
; GCN: s_mov_b32 s[[KHI:[0-9]+]], s[[KLO]]
; GCN: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s{{\[}}[[KLO]]:[[KHI]]]
; GCN: s_or_b64
define amdgpu_kernel void @short4_inselt(<4 x i16> addrspace(1)* %out, <4 x i16> %vec, i32 %sel) {
entry:
  %v = insertelement <4 x i16> %vec, i8 1, i32 %sel
  store <4 x i16> %v, <4 x i16> addrspace(1)* %out
  ret void
}

; <8 x i8>: the index is shifted left by 3 and used for a 64-bit shift; the
; constant 0x1010101 is and-ed in with two 32-bit operations before the 64-bit
; andn2 / or. Register numbers on the two s_and_b32 lines are matched with a
; regex rather than pinned to specific SGPRs, so the check does not depend on
; register allocation.
; GCN-LABEL: {{^}}byte8_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 3
; GCN: s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]]
; GCN: s_mov_b32 [[K:s[0-9]+]], 0x1010101
; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[K]]
; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[K]]
; GCN: s_andn2_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]
; GCN: s_or_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]
define amdgpu_kernel void @byte8_inselt(<8 x i8> addrspace(1)* %out, <8 x i8> %vec, i32 %sel) {
entry:
  %v = insertelement <8 x i8> %vec, i8 1, i32 %sel
  store <8 x i8> %v, <8 x i8> addrspace(1)* %out
  ret void
}

; <16 x i8>: compares against the first and last lane index (0 and 15) are
; checked explicitly, together with sixteen v_cndmask_b32 and eight
; v_or_b32_sdwa, in any order.
; GCN-LABEL: {{^}}byte16_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 0
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 15
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
define amdgpu_kernel void @byte16_inselt(<16 x i8> addrspace(1)* %out, <16 x i8> %vec, i32 %sel) {
entry:
  %v = insertelement <16 x i8> %vec, i8 1, i32 %sel
  store <16 x i8> %v, <16 x i8> addrspace(1)* %out
  ret void
}

; <2 x double>: one v_cmp_eq_u32 per lane index, each feeding two v_cndmask_b32
; (an e32 form with register operands and an e64 form with a 0 operand).
; GCN-LABEL: {{^}}double2_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 1
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CC1]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC1]]
; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CC2]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC2]]
define amdgpu_kernel void @double2_inselt(<2 x double> addrspace(1)* %out, <2 x double> %vec, i32 %sel) {
entry:
  %v = insertelement <2 x double> %vec, double 1.000000e+00, i32 %sel
  store <2 x double> %v, <2 x double> addrspace(1)* %out
  ret void
}

; <5 x double>: ten v_cndmask_b32 are expected; no v_movrel, no buffer_.
; GCN-LABEL: {{^}}double5_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-COUNT-10: v_cndmask_b32
define amdgpu_kernel void @double5_inselt(<5 x double> addrspace(1)* %out, <5 x double> %vec, i32 %sel) {
entry:
  %v = insertelement <5 x double> %vec, double 1.000000e+00, i32 %sel
  store <5 x double> %v, <5 x double> addrspace(1)* %out
  ret void
}

; <8 x double>: m0 is written once, then two v_movreld_b32 write consecutive
; VGPRs (BASE and BASE+1) with no second write to m0 in between. No v_cndmask,
; buffer_ or s_or_b32 before that sequence.
; GCN-LABEL: {{^}}double8_inselt:
; GCN-NOT: v_cndmask
; GCN-NOT: buffer_
; GCN-NOT: s_or_b32
; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]]
; GCN-DAG: v_movreld_b32_e32 v[[#BASE:]], 0
; GCN-NOT: s_mov_b32 m0
; GCN: v_movreld_b32_e32 v[[#BASE+1]],
define amdgpu_kernel void @double8_inselt(<8 x double> addrspace(1)* %out, <8 x double> %vec, i32 %sel) {
entry:
  %v = insertelement <8 x double> %vec, double 1.000000e+00, i32 %sel
  store <8 x double> %v, <8 x double> addrspace(1)* %out
  ret void
}

; <7 x double>: same expectations as the 8-element case. BASE is captured here
; with [[#BASE:]] like in the sibling tests; with -enable-var-scope a value
; captured in a previous function is not available in this one.
; GCN-LABEL: {{^}}double7_inselt:
; GCN-NOT: v_cndmask
; GCN-NOT: buffer_
; GCN-NOT: s_or_b32
; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]]
; GCN-DAG: v_movreld_b32_e32 v[[#BASE:]], 0
; GCN-NOT: s_mov_b32 m0
; GCN: v_movreld_b32_e32 v[[#BASE+1]],
define amdgpu_kernel void @double7_inselt(<7 x double> addrspace(1)* %out, <7 x double> %vec, i32 %sel) {
entry:
  %v = insertelement <7 x double> %vec, double 1.000000e+00, i32 %sel
  store <7 x double> %v, <7 x double> addrspace(1)* %out
  ret void
}

; <16 x double>: same expectations as the 8-element case.
; GCN-LABEL: {{^}}double16_inselt:
; GCN-NOT: v_cndmask
; GCN-NOT: buffer_
; GCN-NOT: s_or_b32
; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]]
; GCN-DAG: v_movreld_b32_e32 v[[#BASE:]], 0
; GCN-NOT: s_mov_b32 m0
; GCN: v_movreld_b32_e32 v[[#BASE+1]],
define amdgpu_kernel void @double16_inselt(<16 x double> addrspace(1)* %out, <16 x double> %vec, i32 %sel) {
entry:
  %v = insertelement <16 x double> %vec, double 1.000000e+00, i32 %sel
  store <16 x double> %v, <16 x double> addrspace(1)* %out
  ret void
}

; <15 x double>: same expectations as the 8-element case.
; GCN-LABEL: {{^}}double15_inselt:
; GCN-NOT: v_cndmask
; GCN-NOT: buffer_
; GCN-NOT: s_or_b32
; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]]
; GCN-DAG: v_movreld_b32_e32 v[[#BASE:]], 0
; GCN-NOT: s_mov_b32 m0
; GCN: v_movreld_b32_e32 v[[#BASE+1]],
define amdgpu_kernel void @double15_inselt(<15 x double> addrspace(1)* %out, <15 x double> %vec, i32 %sel) {
entry:
  %v = insertelement <15 x double> %vec, double 1.000000e+00, i32 %sel
  store <15 x double> %v, <15 x double> addrspace(1)* %out
  ret void
}

; <4 x i1>: the only test here that expects buffer_ instructions - one byte
; store followed by four unsigned byte loads.
; GCN-LABEL: {{^}}bit4_inselt:
; GCN: buffer_store_byte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
define amdgpu_kernel void @bit4_inselt(<4 x i1> addrspace(1)* %out, <4 x i1> %vec, i32 %sel) {
entry:
  %v = insertelement <4 x i1> %vec, i1 1, i32 %sel
  store <4 x i1> %v, <4 x i1> addrspace(1)* %out
  ret void
}

; <128 x i1>: no buffer_ instructions; the compare/select pair is checked for
; lane index 0 and for the last lane index, where 0x7f is first moved into a
; VGPR and compared with the e32 form.
; GCN-LABEL: {{^}}bit128_inselt:
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], s{{[0-9]+}}, 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}, [[CC1]]
; GCN-DAG: v_mov_b32_e32 [[LASTIDX:v[0-9]+]], 0x7f
; GCN-DAG: v_cmp_ne_u32_e32 [[CCL:[^,]+]], s{{[0-9]+}}, [[LASTIDX]]
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}, [[CCL]]
define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i1> %vec, i32 %sel) {
entry:
  %v = insertelement <128 x i1> %vec, i1 1, i32 %sel
  store <128 x i1> %v, <128 x i1> addrspace(1)* %out
  ret void
}

; <32 x float> passed and returned by value in an amdgpu_ps function: 32
; compares followed by 32 v_cndmask_b32 selecting 1.0, and no buffer_
; instructions.
; GCN-LABEL: {{^}}float32_inselt_vec:
; GCN-NOT: buffer_
; GCN-COUNT-32: v_cmp_ne_u32
; GCN-COUNT-32: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, 1.0,
define amdgpu_ps <32 x float> @float32_inselt_vec(<32 x float> %vec, i32 %sel) {
entry:
  %v = insertelement <32 x float> %vec, float 1.000000e+00, i32 %sel
  ret <32 x float> %v
}

; <8 x double> passed and returned by value: eight groups of one v_cmp_eq_u32
; followed by two v_cndmask_b32, and no buffer_ instructions.
; GCN-LABEL: {{^}}double8_inselt_vec:
; GCN-NOT: buffer_
; GCN: v_cmp_eq_u32
; GCN-COUNT-2: v_cndmask_b32
; GCN: v_cmp_eq_u32
; GCN-COUNT-2: v_cndmask_b32
; GCN: v_cmp_eq_u32
; GCN-COUNT-2: v_cndmask_b32
; GCN: v_cmp_eq_u32
; GCN-COUNT-2: v_cndmask_b32
; GCN: v_cmp_eq_u32
; GCN-COUNT-2: v_cndmask_b32
; GCN: v_cmp_eq_u32
; GCN-COUNT-2: v_cndmask_b32
; GCN: v_cmp_eq_u32
; GCN-COUNT-2: v_cndmask_b32
; GCN: v_cmp_eq_u32
; GCN-COUNT-2: v_cndmask_b32
define <8 x double> @double8_inselt_vec(<8 x double> %vec, i32 %sel) {
entry:
  %v = insertelement <8 x double> %vec, double 1.000000e+00, i32 %sel
  ret <8 x double> %v
}