; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL,PREGFX9 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL,PREGFX9 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE,PREGFX9 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE,GFX9 %s

; Tests for indirect addressing on SI, which is implemented using dynamic
; indexing of vectors.
;
; Check prefixes used below:
;   GCN     - common to every run line.
;   MOVREL  - configurations that lower dynamic indexing through m0 and
;             v_movrels_b32 / v_movreld_b32.
;   IDXMODE - configurations that lower dynamic indexing through
;             s_set_gpr_idx_on / s_set_gpr_idx_off.
;   PREGFX9 / GFX9 - generation-specific differences.

; GCN-LABEL: {{^}}extract_w_offset:
; GCN-DAG: s_load_dword [[IN0:s[0-9]+]]
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 4.0
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0
; GCN-DAG: v_mov_b32_e32 [[BASEREG:v[0-9]+]], 1.0
; GCN-DAG: s_add_i32 [[IN:s[0-9]+]], [[IN0]], 1

; MOVREL-DAG: s_mov_b32 m0, [[IN]]
; MOVREL: v_movrels_b32_e32 v{{[0-9]+}}, [[BASEREG]]

; IDXMODE: s_set_gpr_idx_on [[IN]], gpr_idx(SRC0){{$}}
; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, [[BASEREG]]
; IDXMODE-NEXT: s_set_gpr_idx_off
define amdgpu_kernel void @extract_w_offset(float addrspace(1)* %out, i32 %in) {
entry:
  %idx = add i32 %in, 1
  %elt = extractelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, i32 %idx
  store float %elt, float addrspace(1)* %out
  ret void
}

; XXX: Could do v_or_b32 directly
; GCN-LABEL: {{^}}extract_w_offset_salu_use_vector:
; GCN-DAG: s_or_b32
; GCN-DAG: s_or_b32
; GCN-DAG: s_or_b32
; GCN-DAG: s_or_b32
; MOVREL: s_mov_b32 m0
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}


; MOVREL: v_movrels_b32_e32

; IDXMODE: s_set_gpr_idx_on s{{[0-9]+}}, gpr_idx(SRC0){{$}}
; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; IDXMODE-NEXT: s_set_gpr_idx_off
define amdgpu_kernel void @extract_w_offset_salu_use_vector(i32 addrspace(1)* %out, i32 %in, <16 x i32> %or.val) {
entry:
  %idx = add i32 %in, 1
  %vec = or <16 x i32> %or.val, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
  %elt = extractelement <16 x i32> %vec, i32 %idx
  store i32 %elt, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extract_wo_offset:
; GCN-DAG: s_load_dword [[IN:s[0-9]+]]
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 4.0
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0
; GCN-DAG: v_mov_b32_e32 [[BASEREG:v[0-9]+]], 1.0

; MOVREL-DAG: s_mov_b32 m0, [[IN]]
; MOVREL: v_movrels_b32_e32 v{{[0-9]+}}, [[BASEREG]]

; IDXMODE: s_set_gpr_idx_on [[IN]], gpr_idx(SRC0){{$}}
; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, [[BASEREG]]
; IDXMODE-NEXT: s_set_gpr_idx_off
define amdgpu_kernel void @extract_wo_offset(float addrspace(1)* %out, i32 %in) {
entry:
  %elt = extractelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, i32 %in
  store float %elt, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extract_neg_offset_sgpr:
; The offset depends on the register that holds the first element of the vector.
; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
; MOVREL: v_movrels_b32_e32 v{{[0-9]}}, v0

; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
; IDXMODE: v_mov_b32_e32 v14, 15
; IDXMODE: v_mov_b32_e32 v15, 16
; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(SRC0){{$}}
; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; IDXMODE-NEXT: s_set_gpr_idx_off
define amdgpu_kernel void @extract_neg_offset_sgpr(i32 addrspace(1)* %out, i32 %offset) {
entry:
  %index = add i32 %offset, -512
  %value = extractelement <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 %index
  store i32 %value, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extract_neg_offset_sgpr_loaded:
; The offset depends on the register that holds the first element of the vector.
; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
; MOVREL: v_movrels_b32_e32 v{{[0-9]}}, v0

; IDXMODE-DAG: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
; IDXMODE-DAG: v_mov_b32_e32 v0,
; IDXMODE: v_mov_b32_e32 v1,
; IDXMODE: v_mov_b32_e32 v2,
; IDXMODE: v_mov_b32_e32 v3,
; IDXMODE: v_mov_b32_e32 v4,
; IDXMODE: v_mov_b32_e32 v5,
; IDXMODE: v_mov_b32_e32 v6,
; IDXMODE: v_mov_b32_e32 v7,
; IDXMODE: v_mov_b32_e32 v8,
; IDXMODE: v_mov_b32_e32 v9,
; IDXMODE: v_mov_b32_e32 v10,
; IDXMODE: v_mov_b32_e32 v11,
; IDXMODE: v_mov_b32_e32 v12,
; IDXMODE: v_mov_b32_e32 v13,
; IDXMODE: v_mov_b32_e32 v14,
; IDXMODE: v_mov_b32_e32 v15,
; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(SRC0){{$}}
; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; IDXMODE-NEXT: s_set_gpr_idx_off
define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(i32 addrspace(1)* %out, <16 x i32> %vec0, <16 x i32> %vec1, i32 %offset) {
entry:
  %index = add i32 %offset, -512
  %or = or <16 x i32> %vec0, %vec1
  %value = extractelement <16 x i32> %or, i32 %index
  store i32 %value, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extract_neg_offset_vgpr:
; The offset depends on the register that holds the first element of the vector.

; GCN: v_cmp_eq_u32_e32
; GCN-COUNT-14: v_cndmask_b32
; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], 16
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @extract_neg_offset_vgpr(i32 addrspace(1)* %out) {
entry:
  %id = call i32 @llvm.amdgcn.workitem.id.x() #1
  %index = add i32 %id, -512
  %value = extractelement <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 %index
  store i32 %value, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extract_undef_offset_sgpr:
; undefined behavior, but shouldn't crash compiler
define amdgpu_kernel void @extract_undef_offset_sgpr(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
entry:
  %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
  %value = extractelement <4 x i32> %ld, i32 undef
  store i32 %value, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}insert_undef_offset_sgpr_vector_src:
; undefined behavior, but shouldn't crash compiler
define amdgpu_kernel void @insert_undef_offset_sgpr_vector_src(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
entry:
  %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
  %value = insertelement <4 x i32> %ld, i32 5, i32 undef
  store <4 x i32> %value, <4 x i32> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}insert_w_offset:
; GCN-DAG: s_load_dword [[IN0:s[0-9]+]]
; MOVREL-DAG: s_add_i32 [[IN:s[0-9]+]], [[IN0]], 1
; MOVREL-DAG: s_mov_b32 m0, [[IN]]
; GCN-DAG: v_mov_b32_e32 v[[ELT0:[0-9]+]], 1.0
; GCN-DAG: v_mov_b32_e32 v[[ELT1:[0-9]+]], 2.0
; GCN-DAG: v_mov_b32_e32 v[[ELT2:[0-9]+]], 0x40400000
; GCN-DAG: v_mov_b32_e32 v[[ELT3:[0-9]+]], 4.0
; GCN-DAG: v_mov_b32_e32 v[[ELT15:[0-9]+]], 0x41800000
; GCN-DAG: v_mov_b32_e32 v[[INS:[0-9]+]], 0x41880000

; MOVREL: v_movreld_b32_e32 v[[ELT0]], v[[INS]]
; MOVREL: buffer_store_dwordx4 v{{\[}}[[ELT0]]:[[ELT3]]{{\]}}
define amdgpu_kernel void @insert_w_offset(<16 x float> addrspace(1)* %out, i32 %in) {
entry:
  %add = add i32 %in, 1
  %ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %add
  store <16 x float> %ins, <16 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}insert_unsigned_base_plus_offset:
; GCN-DAG: s_load_dword [[IN:s[0-9]+]]
; GCN-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 1.0
; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 2.0
; GCN-DAG: s_and_b32 [[BASE:s[0-9]+]], [[IN]], 0xffff

; MOVREL: s_mov_b32 m0, [[BASE]]
; MOVREL: v_movreld_b32_e32 [[ELT1]], v{{[0-9]+}}

; IDXMODE: s_set_gpr_idx_on [[BASE]], gpr_idx(DST)
; IDXMODE-NEXT: v_mov_b32_e32 [[ELT1]], v{{[0-9]+}}
; IDXMODE-NEXT: s_set_gpr_idx_off
define amdgpu_kernel void @insert_unsigned_base_plus_offset(<16 x float> addrspace(1)* %out, i16 %in) {
entry:
  %base = zext i16 %in to i32
  %add = add i32 %base, 1
  %ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %add
  store <16 x float> %ins, <16 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}insert_signed_base_plus_offset:
; GCN-DAG: s_load_dword [[IN:s[0-9]+]]
; GCN-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 1.0
; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 2.0

; GCN-DAG: s_sext_i32_i16 [[BASE:s[0-9]+]], [[IN]]
; GCN-DAG: s_add_i32 [[BASE_PLUS_OFFSET:s[0-9]+]], [[BASE]], 1

; MOVREL: s_mov_b32 m0, [[BASE_PLUS_OFFSET]]
; MOVREL: v_movreld_b32_e32 [[ELT0]], v{{[0-9]+}}

; IDXMODE: s_set_gpr_idx_on [[BASE_PLUS_OFFSET]], gpr_idx(DST)
; IDXMODE-NEXT: v_mov_b32_e32 [[ELT0]], v{{[0-9]+}}
; IDXMODE-NEXT: s_set_gpr_idx_off
define amdgpu_kernel void @insert_signed_base_plus_offset(<16 x float> addrspace(1)* %out, i16 %in) {
entry:
  %base = sext i16 %in to i32
  %add = add i32 %base, 1
  %ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %add
  store <16 x float> %ins, <16 x float> addrspace(1)* %out
  ret void
}


; GCN-LABEL: {{^}}insert_wo_offset:
; GCN: s_load_dword [[IN:s[0-9]+]]

; MOVREL: s_mov_b32 m0, [[IN]]
; MOVREL: v_movreld_b32_e32 v[[ELT0:[0-9]+]]

; IDXMODE: s_set_gpr_idx_on [[IN]], gpr_idx(DST)
; IDXMODE-NEXT: v_mov_b32_e32 v[[ELT0:[0-9]+]], v{{[0-9]+}}
; IDXMODE-NEXT: s_set_gpr_idx_off

; GCN: buffer_store_dwordx4 v{{\[}}[[ELT0]]:
define amdgpu_kernel void @insert_wo_offset(<16 x float> addrspace(1)* %out, i32 %in) {
entry:
  %ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %in
  store <16 x float> %ins, <16 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}insert_neg_offset_sgpr:
; The offset depends on the register that holds the first element of the vector.
; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
; MOVREL: v_movreld_b32_e32 v0, 16

; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(DST)
; IDXMODE-NEXT: v_mov_b32_e32 v0, 16
; IDXMODE-NEXT: s_set_gpr_idx_off
define amdgpu_kernel void @insert_neg_offset_sgpr(i32 addrspace(1)* %in, <16 x i32> addrspace(1)* %out, i32 %offset) {
entry:
  %index = add i32 %offset, -512
  %value = insertelement <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, i32 16, i32 %index
  store <16 x i32> %value, <16 x i32> addrspace(1)* %out
  ret void
}

; The vector indexed into is originally loaded into an SGPR rather
; than built with a reg_sequence

; GCN-LABEL: {{^}}insert_neg_offset_sgpr_loadreg:
; The offset depends on the register that holds the first element of the vector.
; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
; MOVREL: v_movreld_b32_e32 v0, 5

; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(DST)
; IDXMODE-NEXT: v_mov_b32_e32 v0, 5
; IDXMODE-NEXT: s_set_gpr_idx_off
define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(i32 addrspace(1)* %in, <16 x i32> addrspace(1)* %out, <16 x i32> %vec, i32 %offset) {
entry:
  %index = add i32 %offset, -512
  %value = insertelement <16 x i32> %vec, i32 5, i32 %index
  store <16 x i32> %value, <16 x i32> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}insert_neg_offset_vgpr:
; The offset depends on the register that holds the first element of the vector.

; GCN: v_cmp_eq_u32_e32
; GCN-COUNT-16: v_cndmask_b32
; GCN-COUNT-4: buffer_store_dwordx4
define amdgpu_kernel void @insert_neg_offset_vgpr(i32 addrspace(1)* %in, <16 x i32> addrspace(1)* %out) {
entry:
  %id = call i32 @llvm.amdgcn.workitem.id.x() #1
  %index = add i32 %id, -512
  %value = insertelement <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 33, i32 %index
  store <16 x i32> %value, <16 x i32> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}insert_neg_inline_offset_vgpr:

; GCN: v_cmp_eq_u32_e32
; GCN-COUNT-16: v_cndmask_b32
; GCN-COUNT-4: buffer_store_dwordx4
define amdgpu_kernel void @insert_neg_inline_offset_vgpr(i32 addrspace(1)* %in, <16 x i32> addrspace(1)* %out) {
entry:
  %id = call i32 @llvm.amdgcn.workitem.id.x() #1
  %index = add i32 %id, -16
  %value = insertelement <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 500, i32 %index
  store <16 x i32> %value, <16 x i32> addrspace(1)* %out
  ret void
}

; When the block is split to insert the loop, make sure any other
; places that need to be expanded in the same block are also handled.

; GCN-LABEL: {{^}}extract_vgpr_offset_multiple_in_block:

; GCN-DAG: {{buffer|flat|global}}_load_dword [[IDX0:v[0-9]+]]
; GCN: v_cmp_eq_u32
; GCN: v_cndmask_b32_e64 [[RESULT0:v[0-9]+]], 16,
; GCN: v_cndmask_b32_e64 [[RESULT1:v[0-9]+]], 16,

; GCN: buffer_store_dword [[RESULT0]]
; GCN: buffer_store_dword [[RESULT1]]
define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 {
entry:
  %id = call i32 @llvm.amdgcn.workitem.id.x() #1
  %id.ext = zext i32 %id to i64
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id.ext
  %idx0 = load volatile i32, i32 addrspace(1)* %gep
  %idx1 = add i32 %idx0, 1
  %val0 = extractelement <16 x i32> <i32 7, i32 9, i32 11, i32 13, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 %idx0
  %live.out.reg = call i32 asm sideeffect "s_mov_b32 $0, 17", "={s4}" ()
  %val1 = extractelement <16 x i32> <i32 7, i32 9, i32 11, i32 13, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 %idx1
  store volatile i32 %val0, i32 addrspace(1)* %out0
  store volatile i32 %val1, i32 addrspace(1)* %out0
  %cmp = icmp eq i32 %id, 0
  br i1 %cmp, label %bb1, label %bb2

bb1:
  store volatile i32 %live.out.reg, i32 addrspace(1)* undef
  br label %bb2

bb2:
  ret void
}

; Moved subtest for insert_vgpr_offset_multiple_in_block to separate file to
; avoid very different schedule-induced issues with gfx9.
; test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll


; GCN-LABEL: {{^}}insert_adjacent_blocks:
define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) #0 {
bb:
  %tmp = icmp eq i32 %arg, 0
  br i1 %tmp, label %bb1, label %bb4

bb1:                                              ; preds = %bb
  %tmp2 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
  %tmp3 = insertelement <4 x float> %tmp2, float %val0, i32 undef
  call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp3) #0 ; Prevent block optimize out
  br label %bb7

bb4:                                              ; preds = %bb
  %tmp5 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
  %tmp6 = insertelement <4 x float> %tmp5, float %val0, i32 undef
  call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp6) #0 ; Prevent block optimize out
  br label %bb7

bb7:                                              ; preds = %bb4, %bb1
  %tmp8 = phi <4 x float> [ %tmp3, %bb1 ], [ %tmp6, %bb4 ]
  store volatile <4 x float> %tmp8, <4 x float> addrspace(1)* undef
  ret void
}

; FIXME: Should be able to fold zero input to movreld to inline imm?

; GCN-LABEL: {{^}}multi_same_block:

; GCN: s_load_dword [[ARG:s[0-9]+]]

; MOVREL: v_mov_b32_e32 v{{[0-9]+}}, 0x41900000
; MOVREL: s_waitcnt
; MOVREL: s_add_i32 m0, [[ARG]], -16
; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, 4.0
; MOVREL: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd
; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, -4.0
; MOVREL: s_mov_b32 m0, -1


; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 0x41900000
; IDXMODE: s_waitcnt
; IDXMODE: s_add_i32 [[ARG]], [[ARG]], -16
; IDXMODE: s_set_gpr_idx_on [[ARG]], gpr_idx(DST)
; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 4.0
; IDXMODE: s_set_gpr_idx_off
; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd
; IDXMODE: s_set_gpr_idx_on [[ARG]], gpr_idx(DST)
; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, -4.0
; IDXMODE: s_set_gpr_idx_off

; GCN: ds_write_b32
; GCN: ds_write_b32
; GCN: s_endpgm
define amdgpu_kernel void @multi_same_block(i32 %arg) #0 {
bb:
  %tmp1 = add i32 %arg, -16
  %tmp2 = insertelement <9 x float> <float 1.700000e+01, float 1.800000e+01, float 1.900000e+01, float 2.000000e+01, float 2.100000e+01, float 2.200000e+01, float 2.300000e+01, float 2.400000e+01, float 2.500000e+01>, float 4.000000e+00, i32 %tmp1
  %tmp3 = add i32 %arg, -16
  %tmp4 = insertelement <9 x float> <float 0x40311999A0000000, float 0x40321999A0000000, float 0x40331999A0000000, float 0x40341999A0000000, float 0x40351999A0000000, float 0x40361999A0000000, float 0x40371999A0000000, float 0x40381999A0000000, float 0x40391999A0000000>, float -4.0, i32 %tmp3
  %tmp5 = bitcast <9 x float> %tmp2 to <9 x i32>
  %tmp6 = extractelement <9 x i32> %tmp5, i32 1
  %tmp7 = bitcast <9 x float> %tmp4 to <9 x i32>
  %tmp8 = extractelement <9 x i32> %tmp7, i32 5
  store volatile i32 %tmp6, i32 addrspace(3)* undef, align 4
  store volatile i32 %tmp8, i32 addrspace(3)* undef, align 4
  ret void
}

; offset puts outside of superregister boundaries, so clamp to 1st element.
; GCN-LABEL: {{^}}extract_largest_inbounds_offset:
; GCN-DAG: buffer_load_dwordx4 v{{\[}}[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]
; GCN-DAG: s_load_dword [[IDX0:s[0-9]+]]
; GCN-DAG: s_add_i32 [[IDX:s[0-9]+]], [[IDX0]], 15

; MOVREL: s_mov_b32 m0, [[IDX]]
; MOVREL: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]]

; IDXMODE: s_set_gpr_idx_on [[IDX]], gpr_idx(SRC0)
; IDXMODE: v_mov_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]]
; IDXMODE: s_set_gpr_idx_off

; GCN: buffer_store_dword [[EXTRACT]]
define amdgpu_kernel void @extract_largest_inbounds_offset(i32 addrspace(1)* %out, <16 x i32> addrspace(1)* %in, i32 %idx) {
entry:
  %ld = load volatile <16 x i32>, <16 x i32> addrspace(1)* %in
  %offset = add i32 %idx, 15
  %value = extractelement <16 x i32> %ld, i32 %offset
  store i32 %value, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extract_out_of_bounds_offset:
; GCN-DAG: buffer_load_dwordx4 v{{\[}}[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]{{\]}}
; GCN-DAG: s_load_dword [[IDX:s[0-9]+]]
; GCN: s_add_i32 [[ADD_IDX:s[0-9]+]], [[IDX]], 16

; MOVREL: s_mov_b32 m0, [[ADD_IDX]]
; MOVREL: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]]

; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(SRC0)
; IDXMODE: v_mov_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]]
; IDXMODE: s_set_gpr_idx_off

; GCN: buffer_store_dword [[EXTRACT]]
define amdgpu_kernel void @extract_out_of_bounds_offset(i32 addrspace(1)* %out, <16 x i32> addrspace(1)* %in, i32 %idx) {
entry:
  %ld = load volatile <16 x i32>, <16 x i32> addrspace(1)* %in
  %offset = add i32 %idx, 16
  %value = extractelement <16 x i32> %ld, i32 %offset
  store i32 %value, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extractelement_v16i32_or_index:
; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]

; MOVREL: s_mov_b32 m0, [[IDX_SHL]]
; MOVREL: v_movrels_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}

; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], gpr_idx(SRC0)
; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; IDXMODE: s_set_gpr_idx_off
define amdgpu_kernel void @extractelement_v16i32_or_index(i32 addrspace(1)* %out, <16 x i32> addrspace(1)* %in, i32 %idx.in) {
entry:
  %ld = load volatile <16 x i32>, <16 x i32> addrspace(1)* %in
  %idx.shl = shl i32 %idx.in, 2
  %idx = or i32 %idx.shl, 1
  %value = extractelement <16 x i32> %ld, i32 %idx
  store i32 %value, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}insertelement_v16f32_or_index:
; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]

; MOVREL: s_mov_b32 m0, [[IDX_SHL]]
; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}

; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], gpr_idx(DST)
; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; IDXMODE: s_set_gpr_idx_off
define amdgpu_kernel void @insertelement_v16f32_or_index(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %idx.in) nounwind {
  %idx.shl = shl i32 %idx.in, 2
  %idx = or i32 %idx.shl, 1
  %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %idx
  store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64
  ret void
}

; GCN-LABEL: {{^}}broken_phi_bb:
; GCN: v_mov_b32_e32 [[PHIREG:v[0-9]+]], 8

; GCN: {{BB[0-9]+_[0-9]+}}:
; GCN: [[BB2:BB[0-9]+_[0-9]+]]:
; GCN: v_cmp_le_i32_e32 vcc, s{{[0-9]+}}, [[PHIREG]]
; GCN: buffer_load_dword

; GCN: [[REGLOOP:BB[0-9]+_[0-9]+]]:
; MOVREL: v_movreld_b32_e32

; IDXMODE: s_set_gpr_idx_on
; IDXMODE: v_mov_b32_e32
; IDXMODE: s_set_gpr_idx_off

; GCN: s_cbranch_execnz [[REGLOOP]]

; GCN: {{^; %bb.[0-9]}}:
; GCN: s_mov_b64 exec,
; GCN: s_cbranch_execnz [[BB2]]

define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) #0 {
bb:
  br label %bb2

bb2:                                              ; preds = %bb4, %bb
  %tmp = phi i32 [ 8, %bb ], [ %tmp7, %bb4 ]
  %tmp3 = icmp slt i32 %tmp, %arg
  br i1 %tmp3, label %bb4, label %bb8

bb4:                                              ; preds = %bb2
  %vgpr = load volatile i32, i32 addrspace(1)* undef
  %tmp5 = insertelement <16 x i32> undef, i32 undef, i32 %vgpr
  %tmp6 = insertelement <16 x i32> %tmp5, i32 %arg1, i32 %vgpr
  %tmp7 = extractelement <16 x i32> %tmp6, i32 0
  br label %bb2

bb8:                                              ; preds = %bb2
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1
declare void @llvm.amdgcn.s.barrier() #2

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind convergent }