; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global -denormal-fp-math=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX9 %s
; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,CIVI,VI %s
; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s

; Inserts the constant 999 (0x3e7) into element 0 of a <2 x i16> loaded
; through an addrspace(4) pointer and stores the result to %out.
define amdgpu_kernel void @s_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr) #0 {
; GFX9-LABEL: s_insertelement_v2i16_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_pack_lh_b32_b16 s2, 0x3e7, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; CIVI-LABEL: s_insertelement_v2i16_0:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s0
; CIVI-NEXT: s_load_dword s0, s[2:3], 0x0
; CIVI-NEXT: v_mov_b32_e32 v1, s1
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_and_b32 s0, s0, 0xffff0000
; CIVI-NEXT: s_or_b32 s0, s0, 0x3e7
; CIVI-NEXT: v_mov_b32_e32 v2, s0
; CIVI-NEXT: flat_store_dword v[0:1], v2
; CIVI-NEXT: s_endpgm
  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  %vecins = insertelement <2 x i16> %vec, i16 999, i32 0
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
  ret void
}


; Inserts the i16 kernel argument %elt into element 0 of the loaded <2 x i16>.
define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 {
; GFX9-LABEL: s_insertelement_v2i16_0_reg:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_pack_lh_b32_b16 s2, s4, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: s_insertelement_v2i16_0_reg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dword s4, s[4:5], 0x30
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_and_b32 s1, s4, 0xffff
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s0, s0, 0xffff0000
; VI-NEXT: s_or_b32 s0, s1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: s_insertelement_v2i16_0_reg:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0xc
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_and_b32 s1, s4, 0xffff
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s0, s2, 0xffff0000
; CI-NEXT: s_or_b32 s0, s1, s0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
  ret void
}

; Inserts %elt into element 0 while element 1 of the loaded vector has a
; second use: it is extracted, zero-extended and passed to inline asm.
define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 {
; GFX9-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshr_b32 s2, s2, 16
; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use s2
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dword s4, s[4:5], 0x30
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_and_b32 s1, s4, 0xffff
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s2, s0, 16
; VI-NEXT: s_and_b32 s0, s0, 0xffff0000
; VI-NEXT: s_or_b32 s0, s1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: ;;#ASMSTART
; VI-NEXT: ; use s2
; VI-NEXT: ;;#ASMEND
; VI-NEXT: s_endpgm
;
; CI-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0xc
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: s_and_b32 s0, s4, 0xffff
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s1, s2, 16
; CI-NEXT: s_lshl_b32 s2, s1, 16
; CI-NEXT: s_or_b32 s0, s0, s2
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: ;;#ASMSTART
; CI-NEXT: ; use s1
; CI-NEXT: ;;#ASMEND
; CI-NEXT: s_endpgm
  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  %elt1 = extractelement <2 x i16> %vec, i32 1
  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
  %use1 = zext i16 %elt1 to i32
  call void asm sideeffect "; use $0", "s"(i32 %use1) #0
  ret void
}

; The value inserted at element 0 is the high 16 bits of the i32 argument
; %elt.arg (lshr by 16, then trunc to i16).
define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i32 %elt.arg) #0 {
; GFX9-LABEL: s_insertelement_v2i16_0_reghi:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_pack_hh_b32_b16 s2, s4, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: s_insertelement_v2i16_0_reghi:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dword s4, s[4:5], 0x30
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_lshr_b32 s1, s4, 16
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s0, s0, 0xffff0000
; VI-NEXT: s_or_b32 s0, s1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: s_insertelement_v2i16_0_reghi:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0xc
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_lshr_b32 s1, s4, 16
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s0, s2, 0xffff0000
; CI-NEXT: s_or_b32 s0, s1, s0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  %elt.hi = lshr i32 %elt.arg, 16
  %elt = trunc i32 %elt.hi to i16
  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
  ret void
}

; High half of %elt.arg is inserted at element 0, and the same truncated value
; is additionally zero-extended and passed to inline asm.
define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %elt.arg) #0 {
; GFX9-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_lshr_b32 s3, s4, 16
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_pack_lh_b32_b16 s2, s3, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use s3
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dword s4, s[4:5], 0x10
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_lshr_b32 s0, s4, 16
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s1, s2, 0xffff0000
; VI-NEXT: s_or_b32 s1, s0, s1
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: ;;#ASMSTART
; VI-NEXT: ; use s0
; VI-NEXT: ;;#ASMEND
; VI-NEXT: s_endpgm
;
; CI-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0x4
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_lshr_b32 s0, s4, 16
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s1, s2, 0xffff0000
; CI-NEXT: s_or_b32 s1, s0, s1
; CI-NEXT: v_mov_b32_e32 v2, s1
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: ;;#ASMSTART
; CI-NEXT: ; use s0
; CI-NEXT: ;;#ASMEND
; CI-NEXT: s_endpgm
  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  %elt.hi = lshr i32 %elt.arg, 16
  %elt = trunc i32 %elt.hi to i16
  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
  %use1 = zext i16 %elt to i32
  call void asm sideeffect "; use $0", "s"(i32 %use1) #0
  ret void
}

; High half of %elt.arg is inserted at element 0; both that value and element 1
; of the loaded vector are zero-extended and passed to separate inline asm uses.
define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %elt.arg) #0 {
; GFX9-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_lshr_b32 s3, s4, 16
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshr_b32 s2, s2, 16
; GFX9-NEXT: s_pack_ll_b32_b16 s4, s3, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use s3
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use s2
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dword s4, s[4:5], 0x10
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_lshr_b32 s1, s4, 16
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s2, s0, 16
; VI-NEXT: s_and_b32 s0, s0, 0xffff0000
; VI-NEXT: s_or_b32 s0, s1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: ;;#ASMSTART
; VI-NEXT: ; use s1
; VI-NEXT: ;;#ASMEND
; VI-NEXT: ;;#ASMSTART
; VI-NEXT: ; use s2
; VI-NEXT: ;;#ASMEND
; VI-NEXT: s_endpgm
;
; CI-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0x4
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: s_load_dword s0, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_lshr_b32 s1, s4, 16
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s0, s0, 16
; CI-NEXT: v_alignbit_b32 v2, s0, v2, 16
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: ;;#ASMSTART
; CI-NEXT: ; use s1
; CI-NEXT: ;;#ASMEND
; CI-NEXT: ;;#ASMSTART
; CI-NEXT: ; use s0
; CI-NEXT: ;;#ASMEND
; CI-NEXT: s_endpgm
  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  %elt.hi = lshr i32 %elt.arg, 16
  %elt = trunc i32 %elt.hi to i16
  %vec.hi = extractelement <2 x i16> %vec, i32 1
  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
  %use1 = zext i16 %elt to i32
  %vec.hi.use1 = zext i16 %vec.hi to i32

  call void asm sideeffect "; use $0", "s"(i32 %use1) #0
  call void asm sideeffect "; use $0", "s"(i32 %vec.hi.use1) #0
  ret void
}

; Inserts the constant 999 (0x3e7) into element 1 of the loaded <2 x i16>.
define amdgpu_kernel void @s_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr) #0 {
; GFX9-LABEL: s_insertelement_v2i16_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, 0x3e7
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; CIVI-LABEL: s_insertelement_v2i16_1:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s0
; CIVI-NEXT: s_load_dword s0, s[2:3], 0x0
; CIVI-NEXT: v_mov_b32_e32 v1, s1
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_and_b32 s0, s0, 0xffff
; CIVI-NEXT: s_or_b32 s0, s0, 0x3e70000
; CIVI-NEXT: v_mov_b32_e32 v2, s0
; CIVI-NEXT: flat_store_dword v[0:1], v2
; CIVI-NEXT: s_endpgm
  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  %vecins = insertelement <2 x i16> %vec, i16 999, i32 1
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
  ret void
}

; Inserts the i16 kernel argument %elt into element 1 of the loaded <2 x i16>.
define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 {
; GFX9-LABEL: s_insertelement_v2i16_1_reg:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: s_insertelement_v2i16_1_reg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dword s4, s[4:5], 0x30
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_lshl_b32 s1, s4, 16
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s0, s0, 0xffff
; VI-NEXT: s_or_b32 s0, s0, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: s_insertelement_v2i16_1_reg:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0xc
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_lshl_b32 s1, s4, 16
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s0, s2, 0xffff
; CI-NEXT: s_or_b32 s0, s0, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 1
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
  ret void
}

; Inserts the half constant 5.0 (0x4500 in the checks) into element 0 of a
; <2 x half> loaded through an addrspace(4) pointer.
define amdgpu_kernel void @s_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(4)* %vec.ptr) #0 {
; GFX9-LABEL: s_insertelement_v2f16_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshr_b32 s2, s2, 16
; GFX9-NEXT: s_pack_ll_b32_b16 s2, 0x4500, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; CIVI-LABEL: s_insertelement_v2f16_0:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s0
; CIVI-NEXT: s_load_dword s0, s[2:3], 0x0
; CIVI-NEXT: v_mov_b32_e32 v1, s1
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_and_b32 s0, s0, 0xffff0000
; CIVI-NEXT: s_or_b32 s0, s0, 0x4500
; CIVI-NEXT: v_mov_b32_e32 v2, s0
; CIVI-NEXT: flat_store_dword v[0:1], v2
; CIVI-NEXT: s_endpgm
  %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr
  %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0
  store <2 x half> %vecins, <2 x half> addrspace(1)* %out
  ret void
}

; Inserts the half constant 5.0 into element 1 of the loaded <2 x half>.
define amdgpu_kernel void @s_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(4)* %vec.ptr) #0 {
; GFX9-LABEL: s_insertelement_v2f16_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, 0x4500
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; CIVI-LABEL: s_insertelement_v2f16_1:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s0
; CIVI-NEXT: s_load_dword s0, s[2:3], 0x0
; CIVI-NEXT: v_mov_b32_e32 v1, s1
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_and_b32 s0, s0, 0xffff
; CIVI-NEXT: s_or_b32 s0, s0, 0x45000000
; CIVI-NEXT: v_mov_b32_e32 v2, s0
; CIVI-NEXT: flat_store_dword v[0:1], v2
; CIVI-NEXT: s_endpgm
  %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr
  %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1
  store <2 x half> %vecins, <2 x half> addrspace(1)* %out
  ret void
}

; Loads a <2 x i16> from %in indexed by the workitem id, inserts the constant
; 999 into element 0 and stores it to %out at the same index.
define amdgpu_kernel void @v_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: v_insertelement_v2i16_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_movk_i32 s2, 0x3e7
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_bfi_b32 v1, v2, s2, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v2i16_0:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; VI-NEXT: v_or_b32_e32 v0, 0x3e7, v0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v2i16_0:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dword v0, v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; CI-NEXT: v_or_b32_e32 v0, 0x3e7, v0
; CI-NEXT: flat_store_dword v[2:3], v0
; CI-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %vecins = insertelement <2 x i16> %vec, i16 999, i32 0
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
  ret void
}

; Workitem-indexed load/store; the value inserted at element 0 is the high
; 16 bits of the i32 argument %elt.arg.
define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %elt.arg) #0 {
; GFX9-LABEL: v_insertelement_v2i16_0_reghi:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff0000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: v_lshrrev_b32_e64 v2, 16, s4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_or_b32 v1, v1, v3, v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v2i16_0_reghi:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dword s4, s[4:5], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: s_lshr_b32 s0, s4, 16
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; VI-NEXT: v_or_b32_e32 v0, s0, v0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v2i16_0_reghi:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dword v3, v[0:1]
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_lshr_b32 s0, s4, 16
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; CI-NEXT: v_or_b32_e32 v2, s0, v2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %elt.hi = lshr i32 %elt.arg, 16
  %elt = trunc i32 %elt.hi to i16
  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
  ret void
}

; Workitem-indexed load/store; inserts the constant 53 into element 0. The
; checks show 53 used directly as an instruction operand.
define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: v_insertelement_v2i16_0_inlineimm:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_bfi_b32 v1, v2, 53, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v2i16_0_inlineimm:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; VI-NEXT: v_or_b32_e32 v0, 53, v0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v2i16_0_inlineimm:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dword v0, v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; CI-NEXT: v_or_b32_e32 v0, 53, v0
; CI-NEXT: flat_store_dword v[2:3], v0
; CI-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %vecins = insertelement <2 x i16> %vec, i16 53, i32 0
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
  ret void
}

; Workitem-indexed load/store; inserts the constant 999 into element 1.
; FIXME: fold lshl_or c0, c1, v0 -> or (c0 << c1), v0
define amdgpu_kernel void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: v_insertelement_v2i16_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_movk_i32 s2, 0x3e7
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: v_lshl_or_b32 v1, s2, 16, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v2i16_1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v1, 0x3e70000
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v2i16_1:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dword v0, v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; CI-NEXT: v_or_b32_e32 v0, 0x3e70000, v0
; CI-NEXT: flat_store_dword v[2:3], v0
; CI-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %vecins = insertelement <2 x i16> %vec, i16 999, i32 1
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
  ret void
}

; Workitem-indexed load/store; inserts the constant -15 into element 1.
define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: v_insertelement_v2i16_1_inlineimm:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: v_lshl_or_b32 v1, -15, 16, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v2i16_1_inlineimm:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v1, 0xfff10000
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v2i16_1_inlineimm:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dword v0, v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; CI-NEXT: v_or_b32_e32 v0, 0xfff10000, v0
; CI-NEXT: flat_store_dword v[2:3], v0
; CI-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %vecins = insertelement <2 x i16> %vec, i16 -15, i32 1
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
  ret void
}

; Workitem-indexed load/store of a <2 x half>; inserts the half constant 5.0
; (0x4500 in the checks) into element 0.
define amdgpu_kernel void @v_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
; GFX9-LABEL: v_insertelement_v2f16_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0x4500
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v2f16_0:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; VI-NEXT: v_or_b32_e32 v0, 0x4500, v0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v2f16_0:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dword v0, v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; CI-NEXT: v_or_b32_e32 v0, 0x4500, v0
; CI-NEXT: flat_store_dword v[2:3], v0
; CI-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
  %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
  %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0
  store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
  ret void
}

define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
; GFX9-LABEL: v_insertelement_v2f16_0_inlineimm:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, 53
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v2f16_0_inlineimm:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; VI-NEXT: v_or_b32_e32 v0, 53, v0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v2f16_0_inlineimm:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1,
vcc 901; CI-NEXT: flat_load_dword v0, v[0:1] 902; CI-NEXT: v_mov_b32_e32 v3, s1 903; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 904; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 905; CI-NEXT: s_waitcnt vmcnt(0) 906; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 907; CI-NEXT: v_or_b32_e32 v0, 53, v0 908; CI-NEXT: flat_store_dword v[2:3], v0 909; CI-NEXT: s_endpgm 910 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 911 %tid.ext = sext i32 %tid to i64 912 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext 913 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext 914 %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep 915 %vecins = insertelement <2 x half> %vec, half 0xH0035, i32 0 916 store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep 917 ret void 918} 919 920define amdgpu_kernel void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { 921; GFX9-LABEL: v_insertelement_v2f16_1: 922; GFX9: ; %bb.0: 923; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 924; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 925; GFX9-NEXT: s_waitcnt lgkmcnt(0) 926; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 927; GFX9-NEXT: s_movk_i32 s2, 0x4500 928; GFX9-NEXT: s_waitcnt vmcnt(0) 929; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 930; GFX9-NEXT: v_lshl_or_b32 v1, s2, 16, v1 931; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 932; GFX9-NEXT: s_endpgm 933; 934; VI-LABEL: v_insertelement_v2f16_1: 935; VI: ; %bb.0: 936; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 937; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 938; VI-NEXT: s_waitcnt lgkmcnt(0) 939; VI-NEXT: v_mov_b32_e32 v1, s3 940; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 941; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 942; VI-NEXT: flat_load_dword v0, v[0:1] 943; VI-NEXT: v_mov_b32_e32 v1, 0x45000000 944; VI-NEXT: v_mov_b32_e32 v3, s1 945; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 946; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 947; VI-NEXT: s_waitcnt 
vmcnt(0) 948; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 949; VI-NEXT: flat_store_dword v[2:3], v0 950; VI-NEXT: s_endpgm 951; 952; CI-LABEL: v_insertelement_v2f16_1: 953; CI: ; %bb.0: 954; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 955; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 956; CI-NEXT: s_waitcnt lgkmcnt(0) 957; CI-NEXT: v_mov_b32_e32 v1, s3 958; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 959; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 960; CI-NEXT: flat_load_dword v0, v[0:1] 961; CI-NEXT: v_mov_b32_e32 v3, s1 962; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 963; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 964; CI-NEXT: s_waitcnt vmcnt(0) 965; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 966; CI-NEXT: v_or_b32_e32 v0, 0x45000000, v0 967; CI-NEXT: flat_store_dword v[2:3], v0 968; CI-NEXT: s_endpgm 969 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 970 %tid.ext = sext i32 %tid to i64 971 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext 972 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext 973 %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep 974 %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1 975 store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep 976 ret void 977} 978 979define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { 980; GFX9-LABEL: v_insertelement_v2f16_1_inlineimm: 981; GFX9: ; %bb.0: 982; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 983; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 984; GFX9-NEXT: s_waitcnt lgkmcnt(0) 985; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 986; GFX9-NEXT: s_waitcnt vmcnt(0) 987; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 988; GFX9-NEXT: v_lshl_or_b32 v1, 35, 16, v1 989; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 990; GFX9-NEXT: s_endpgm 991; 992; VI-LABEL: v_insertelement_v2f16_1_inlineimm: 993; VI: ; %bb.0: 
994; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 995; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 996; VI-NEXT: s_waitcnt lgkmcnt(0) 997; VI-NEXT: v_mov_b32_e32 v1, s3 998; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 999; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1000; VI-NEXT: flat_load_dword v0, v[0:1] 1001; VI-NEXT: v_mov_b32_e32 v1, 0x230000 1002; VI-NEXT: v_mov_b32_e32 v3, s1 1003; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 1004; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1005; VI-NEXT: s_waitcnt vmcnt(0) 1006; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 1007; VI-NEXT: flat_store_dword v[2:3], v0 1008; VI-NEXT: s_endpgm 1009; 1010; CI-LABEL: v_insertelement_v2f16_1_inlineimm: 1011; CI: ; %bb.0: 1012; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1013; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1014; CI-NEXT: s_waitcnt lgkmcnt(0) 1015; CI-NEXT: v_mov_b32_e32 v1, s3 1016; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 1017; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1018; CI-NEXT: flat_load_dword v0, v[0:1] 1019; CI-NEXT: v_mov_b32_e32 v3, s1 1020; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 1021; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1022; CI-NEXT: s_waitcnt vmcnt(0) 1023; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 1024; CI-NEXT: v_or_b32_e32 v0, 0x230000, v0 1025; CI-NEXT: flat_store_dword v[2:3], v0 1026; CI-NEXT: s_endpgm 1027 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1028 %tid.ext = sext i32 %tid to i64 1029 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext 1030 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext 1031 %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep 1032 %vecins = insertelement <2 x half> %vec, half 0xH0023, i32 1 1033 store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep 1034 ret void 1035} 1036 1037; FIXME: Enable for others when argument load not split 1038define amdgpu_kernel void @s_insertelement_v2i16_dynamic(<2 x 
i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 addrspace(4)* %idx.ptr) #0 { 1039; GFX9-LABEL: s_insertelement_v2i16_dynamic: 1040; GFX9: ; %bb.0: 1041; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1042; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 1043; GFX9-NEXT: v_mov_b32_e32 v0, 0 1044; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1045; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 1046; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 1047; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1048; GFX9-NEXT: s_lshl_b32 s3, s4, 4 1049; GFX9-NEXT: s_lshl_b32 s3, 0xffff, s3 1050; GFX9-NEXT: s_andn2_b32 s2, s2, s3 1051; GFX9-NEXT: s_and_b32 s3, s3, 0x3e703e7 1052; GFX9-NEXT: s_or_b32 s2, s3, s2 1053; GFX9-NEXT: v_mov_b32_e32 v1, s2 1054; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1055; GFX9-NEXT: s_endpgm 1056; 1057; VI-LABEL: s_insertelement_v2i16_dynamic: 1058; VI: ; %bb.0: 1059; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1060; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 1061; VI-NEXT: s_waitcnt lgkmcnt(0) 1062; VI-NEXT: v_mov_b32_e32 v0, s0 1063; VI-NEXT: s_load_dword s0, s[4:5], 0x0 1064; VI-NEXT: s_load_dword s2, s[2:3], 0x0 1065; VI-NEXT: v_mov_b32_e32 v1, s1 1066; VI-NEXT: s_waitcnt lgkmcnt(0) 1067; VI-NEXT: s_lshl_b32 s0, s0, 4 1068; VI-NEXT: s_lshl_b32 s0, 0xffff, s0 1069; VI-NEXT: s_andn2_b32 s1, s2, s0 1070; VI-NEXT: s_and_b32 s0, s0, 0x3e703e7 1071; VI-NEXT: s_or_b32 s0, s0, s1 1072; VI-NEXT: v_mov_b32_e32 v2, s0 1073; VI-NEXT: flat_store_dword v[0:1], v2 1074; VI-NEXT: s_endpgm 1075; 1076; CI-LABEL: s_insertelement_v2i16_dynamic: 1077; CI: ; %bb.0: 1078; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1079; CI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 1080; CI-NEXT: s_waitcnt lgkmcnt(0) 1081; CI-NEXT: v_mov_b32_e32 v0, s0 1082; CI-NEXT: s_load_dword s0, s[4:5], 0x0 1083; CI-NEXT: s_load_dword s2, s[2:3], 0x0 1084; CI-NEXT: v_mov_b32_e32 v1, s1 1085; CI-NEXT: s_waitcnt lgkmcnt(0) 1086; CI-NEXT: s_lshl_b32 s0, s0, 4 1087; CI-NEXT: s_lshl_b32 s0, 0xffff, s0 1088; CI-NEXT: s_andn2_b32 
s1, s2, s0 1089; CI-NEXT: s_and_b32 s0, s0, 0x3e703e7 1090; CI-NEXT: s_or_b32 s0, s0, s1 1091; CI-NEXT: v_mov_b32_e32 v2, s0 1092; CI-NEXT: flat_store_dword v[0:1], v2 1093; CI-NEXT: s_endpgm 1094 %idx = load volatile i32, i32 addrspace(4)* %idx.ptr 1095 %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr 1096 %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx 1097 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out 1098 ret void 1099} 1100 1101define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %idx) #0 { 1102; GFX9-LABEL: v_insertelement_v2i16_dynamic_sgpr: 1103; GFX9: ; %bb.0: 1104; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1105; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 1106; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1107; GFX9-NEXT: v_mov_b32_e32 v2, 0x3e703e7 1108; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1109; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 1110; GFX9-NEXT: s_lshl_b32 s2, s4, 4 1111; GFX9-NEXT: s_lshl_b32 s2, 0xffff, s2 1112; GFX9-NEXT: s_waitcnt vmcnt(0) 1113; GFX9-NEXT: v_bfi_b32 v1, s2, v2, v1 1114; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1115; GFX9-NEXT: s_endpgm 1116; 1117; VI-LABEL: v_insertelement_v2i16_dynamic_sgpr: 1118; VI: ; %bb.0: 1119; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1120; VI-NEXT: s_load_dword s4, s[4:5], 0x10 1121; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1122; VI-NEXT: s_waitcnt lgkmcnt(0) 1123; VI-NEXT: v_mov_b32_e32 v1, s3 1124; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1125; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1126; VI-NEXT: flat_load_dword v0, v[0:1] 1127; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 1128; VI-NEXT: s_lshl_b32 s0, s4, 4 1129; VI-NEXT: v_mov_b32_e32 v3, s1 1130; VI-NEXT: s_lshl_b32 s0, 0xffff, s0 1131; VI-NEXT: v_mov_b32_e32 v1, 0x3e703e7 1132; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1133; VI-NEXT: s_waitcnt vmcnt(0) 1134; VI-NEXT: v_bfi_b32 v0, s0, v1, v0 1135; VI-NEXT: flat_store_dword v[2:3], v0 1136; VI-NEXT: 
s_endpgm 1137; 1138; CI-LABEL: v_insertelement_v2i16_dynamic_sgpr: 1139; CI: ; %bb.0: 1140; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1141; CI-NEXT: s_load_dword s4, s[4:5], 0x4 1142; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1143; CI-NEXT: s_waitcnt lgkmcnt(0) 1144; CI-NEXT: v_mov_b32_e32 v1, s3 1145; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 1146; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1147; CI-NEXT: flat_load_dword v0, v[0:1] 1148; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 1149; CI-NEXT: s_lshl_b32 s0, s4, 4 1150; CI-NEXT: v_mov_b32_e32 v3, s1 1151; CI-NEXT: s_lshl_b32 s0, 0xffff, s0 1152; CI-NEXT: v_mov_b32_e32 v1, 0x3e703e7 1153; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1154; CI-NEXT: s_waitcnt vmcnt(0) 1155; CI-NEXT: v_bfi_b32 v0, s0, v1, v0 1156; CI-NEXT: flat_store_dword v[2:3], v0 1157; CI-NEXT: s_endpgm 1158 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1159 %tid.ext = sext i32 %tid to i64 1160 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext 1161 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext 1162 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep 1163 %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx 1164 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep 1165 ret void 1166} 1167 1168define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 { 1169; GFX9-LABEL: v_insertelement_v2f16_dynamic_vgpr: 1170; GFX9: ; %bb.0: 1171; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1172; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 1173; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1174; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1175; GFX9-NEXT: global_load_dword v1, v0, s[4:5] 1176; GFX9-NEXT: global_load_dword v2, v0, s[2:3] 1177; GFX9-NEXT: s_mov_b32 s2, 0xffff 1178; GFX9-NEXT: s_waitcnt vmcnt(1) 1179; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 1180; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s2 
1181; GFX9-NEXT: s_mov_b32 s2, 0x12341234 1182; GFX9-NEXT: s_waitcnt vmcnt(0) 1183; GFX9-NEXT: v_bfi_b32 v1, v1, s2, v2 1184; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1185; GFX9-NEXT: s_endpgm 1186; 1187; VI-LABEL: v_insertelement_v2f16_dynamic_vgpr: 1188; VI: ; %bb.0: 1189; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1190; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 1191; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 1192; VI-NEXT: s_waitcnt lgkmcnt(0) 1193; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 1194; VI-NEXT: v_mov_b32_e32 v1, s3 1195; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1196; VI-NEXT: v_mov_b32_e32 v3, s5 1197; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 1198; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1199; VI-NEXT: flat_load_dword v2, v[2:3] 1200; VI-NEXT: flat_load_dword v3, v[0:1] 1201; VI-NEXT: s_mov_b32 s2, 0xffff 1202; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 1203; VI-NEXT: v_mov_b32_e32 v1, s1 1204; VI-NEXT: s_mov_b32 s0, 0x12341234 1205; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1206; VI-NEXT: s_waitcnt vmcnt(1) 1207; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v2 1208; VI-NEXT: v_lshlrev_b32_e64 v2, v2, s2 1209; VI-NEXT: s_waitcnt vmcnt(0) 1210; VI-NEXT: v_bfi_b32 v2, v2, s0, v3 1211; VI-NEXT: flat_store_dword v[0:1], v2 1212; VI-NEXT: s_endpgm 1213; 1214; CI-LABEL: v_insertelement_v2f16_dynamic_vgpr: 1215; CI: ; %bb.0: 1216; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1217; CI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 1218; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 1219; CI-NEXT: s_waitcnt lgkmcnt(0) 1220; CI-NEXT: v_mov_b32_e32 v1, s3 1221; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 1222; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1223; CI-NEXT: v_mov_b32_e32 v3, s5 1224; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4 1225; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1226; CI-NEXT: flat_load_dword v2, v[2:3] 1227; CI-NEXT: flat_load_dword v3, v[0:1] 1228; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v4 1229; CI-NEXT: v_mov_b32_e32 v1, s1 1230; CI-NEXT: s_mov_b32 s0, 0x12341234 1231; 
CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1232; CI-NEXT: s_waitcnt vmcnt(1) 1233; CI-NEXT: v_lshlrev_b32_e32 v2, 4, v2 1234; CI-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 1235; CI-NEXT: s_waitcnt vmcnt(0) 1236; CI-NEXT: v_bfi_b32 v2, v2, s0, v3 1237; CI-NEXT: flat_store_dword v[0:1], v2 1238; CI-NEXT: s_endpgm 1239 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1240 %tid.ext = sext i32 %tid to i64 1241 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext 1242 %idx.gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext 1243 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext 1244 %idx = load i32, i32 addrspace(1)* %idx.gep 1245 %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep 1246 %vecins = insertelement <2 x half> %vec, half 0xH1234, i32 %idx 1247 store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep 1248 ret void 1249} 1250 1251define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, [8 x i32], i32 %val) #0 { 1252; GFX9-LABEL: v_insertelement_v4f16_0: 1253; GFX9: ; %bb.0: 1254; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1255; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 1256; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1257; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff 1258; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1259; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] 1260; GFX9-NEXT: s_waitcnt vmcnt(0) 1261; GFX9-NEXT: v_bfi_b32 v0, v3, s4, v0 1262; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1263; GFX9-NEXT: s_endpgm 1264; 1265; VI-LABEL: v_insertelement_v4f16_0: 1266; VI: ; %bb.0: 1267; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1268; VI-NEXT: s_load_dword s4, s[4:5], 0x30 1269; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1270; VI-NEXT: s_waitcnt lgkmcnt(0) 1271; VI-NEXT: v_mov_b32_e32 v1, s3 1272; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1273; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1274; VI-NEXT: flat_load_dwordx2 v[0:1], 
v[0:1] 1275; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 1276; VI-NEXT: v_mov_b32_e32 v3, s1 1277; VI-NEXT: s_mov_b32 s0, 0xffff 1278; VI-NEXT: v_mov_b32_e32 v4, s4 1279; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1280; VI-NEXT: s_waitcnt vmcnt(0) 1281; VI-NEXT: v_bfi_b32 v0, s0, v4, v0 1282; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1283; VI-NEXT: s_endpgm 1284; 1285; CI-LABEL: v_insertelement_v4f16_0: 1286; CI: ; %bb.0: 1287; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1288; CI-NEXT: s_load_dword s4, s[4:5], 0xc 1289; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1290; CI-NEXT: s_waitcnt lgkmcnt(0) 1291; CI-NEXT: v_mov_b32_e32 v1, s3 1292; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 1293; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1294; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1295; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 1296; CI-NEXT: v_mov_b32_e32 v3, s1 1297; CI-NEXT: s_mov_b32 s0, 0xffff 1298; CI-NEXT: v_mov_b32_e32 v4, s4 1299; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1300; CI-NEXT: s_waitcnt vmcnt(0) 1301; CI-NEXT: v_bfi_b32 v0, s0, v4, v0 1302; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1303; CI-NEXT: s_endpgm 1304 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1305 %tid.ext = sext i32 %tid to i64 1306 %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext 1307 %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext 1308 %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep 1309 %val.trunc = trunc i32 %val to i16 1310 %val.cvt = bitcast i16 %val.trunc to half 1311 %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 0 1312 store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep 1313 ret void 1314} 1315 1316define amdgpu_kernel void @v_insertelement_v4f16_1(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 { 1317; GFX9-LABEL: v_insertelement_v4f16_1: 1318; GFX9: ; %bb.0: 1319; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1320; GFX9-NEXT: s_load_dword s4, s[4:5], 
0x10 1321; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1322; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1323; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] 1324; GFX9-NEXT: s_waitcnt vmcnt(0) 1325; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 1326; GFX9-NEXT: v_lshl_or_b32 v0, s4, 16, v0 1327; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1328; GFX9-NEXT: s_endpgm 1329; 1330; VI-LABEL: v_insertelement_v4f16_1: 1331; VI: ; %bb.0: 1332; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1333; VI-NEXT: s_load_dword s4, s[4:5], 0x10 1334; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1335; VI-NEXT: s_waitcnt lgkmcnt(0) 1336; VI-NEXT: v_mov_b32_e32 v1, s3 1337; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1338; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1339; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1340; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 1341; VI-NEXT: s_lshl_b32 s0, s4, 16 1342; VI-NEXT: v_mov_b32_e32 v3, s1 1343; VI-NEXT: v_mov_b32_e32 v4, s0 1344; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1345; VI-NEXT: s_waitcnt vmcnt(0) 1346; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 1347; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1348; VI-NEXT: s_endpgm 1349; 1350; CI-LABEL: v_insertelement_v4f16_1: 1351; CI: ; %bb.0: 1352; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1353; CI-NEXT: s_load_dword s4, s[4:5], 0x4 1354; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1355; CI-NEXT: s_waitcnt lgkmcnt(0) 1356; CI-NEXT: v_mov_b32_e32 v1, s3 1357; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 1358; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1359; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1360; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 1361; CI-NEXT: v_mov_b32_e32 v3, s1 1362; CI-NEXT: s_lshl_b32 s0, s4, 16 1363; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1364; CI-NEXT: s_waitcnt vmcnt(0) 1365; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 1366; CI-NEXT: v_or_b32_e32 v0, s0, v0 1367; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1368; CI-NEXT: s_endpgm 1369 %tid = call i32 
@llvm.amdgcn.workitem.id.x() #1 1370 %tid.ext = sext i32 %tid to i64 1371 %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext 1372 %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext 1373 %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep 1374 %val.trunc = trunc i32 %val to i16 1375 %val.cvt = bitcast i16 %val.trunc to half 1376 %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 1 1377 store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep 1378 ret void 1379} 1380 1381define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, [8 x i32], i32 %val) #0 { 1382; GFX9-LABEL: v_insertelement_v4f16_2: 1383; GFX9: ; %bb.0: 1384; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1385; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 1386; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1387; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff 1388; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1389; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] 1390; GFX9-NEXT: s_waitcnt vmcnt(0) 1391; GFX9-NEXT: v_bfi_b32 v1, v3, s4, v1 1392; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1393; GFX9-NEXT: s_endpgm 1394; 1395; VI-LABEL: v_insertelement_v4f16_2: 1396; VI: ; %bb.0: 1397; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1398; VI-NEXT: s_load_dword s4, s[4:5], 0x30 1399; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1400; VI-NEXT: s_waitcnt lgkmcnt(0) 1401; VI-NEXT: v_mov_b32_e32 v1, s3 1402; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1403; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1404; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1405; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 1406; VI-NEXT: v_mov_b32_e32 v3, s1 1407; VI-NEXT: s_mov_b32 s0, 0xffff 1408; VI-NEXT: v_mov_b32_e32 v4, s4 1409; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1410; VI-NEXT: s_waitcnt vmcnt(0) 1411; VI-NEXT: v_bfi_b32 v1, s0, v4, v1 1412; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1413; VI-NEXT: s_endpgm 1414; 1415; 
CI-LABEL: v_insertelement_v4f16_2: 1416; CI: ; %bb.0: 1417; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1418; CI-NEXT: s_load_dword s4, s[4:5], 0xc 1419; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1420; CI-NEXT: s_waitcnt lgkmcnt(0) 1421; CI-NEXT: v_mov_b32_e32 v1, s3 1422; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 1423; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1424; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1425; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 1426; CI-NEXT: v_mov_b32_e32 v3, s1 1427; CI-NEXT: s_mov_b32 s0, 0xffff 1428; CI-NEXT: v_mov_b32_e32 v4, s4 1429; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1430; CI-NEXT: s_waitcnt vmcnt(0) 1431; CI-NEXT: v_bfi_b32 v1, s0, v4, v1 1432; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1433; CI-NEXT: s_endpgm 1434 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1435 %tid.ext = sext i32 %tid to i64 1436 %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext 1437 %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext 1438 %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep 1439 %val.trunc = trunc i32 %val to i16 1440 %val.cvt = bitcast i16 %val.trunc to half 1441 %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 2 1442 store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep 1443 ret void 1444} 1445 1446define amdgpu_kernel void @v_insertelement_v4f16_3(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 { 1447; GFX9-LABEL: v_insertelement_v4f16_3: 1448; GFX9: ; %bb.0: 1449; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1450; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 1451; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1452; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1453; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] 1454; GFX9-NEXT: s_waitcnt vmcnt(0) 1455; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 1456; GFX9-NEXT: v_lshl_or_b32 v1, s4, 16, v1 1457; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1458; GFX9-NEXT: s_endpgm 1459; 1460; 
VI-LABEL: v_insertelement_v4f16_3: 1461; VI: ; %bb.0: 1462; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1463; VI-NEXT: s_load_dword s4, s[4:5], 0x10 1464; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1465; VI-NEXT: s_waitcnt lgkmcnt(0) 1466; VI-NEXT: v_mov_b32_e32 v1, s3 1467; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1468; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1469; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1470; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 1471; VI-NEXT: s_lshl_b32 s0, s4, 16 1472; VI-NEXT: v_mov_b32_e32 v3, s1 1473; VI-NEXT: v_mov_b32_e32 v4, s0 1474; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1475; VI-NEXT: s_waitcnt vmcnt(0) 1476; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 1477; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1478; VI-NEXT: s_endpgm 1479; 1480; CI-LABEL: v_insertelement_v4f16_3: 1481; CI: ; %bb.0: 1482; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1483; CI-NEXT: s_load_dword s4, s[4:5], 0x4 1484; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1485; CI-NEXT: s_waitcnt lgkmcnt(0) 1486; CI-NEXT: v_mov_b32_e32 v1, s3 1487; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 1488; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1489; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1490; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 1491; CI-NEXT: v_mov_b32_e32 v3, s1 1492; CI-NEXT: s_lshl_b32 s0, s4, 16 1493; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1494; CI-NEXT: s_waitcnt vmcnt(0) 1495; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 1496; CI-NEXT: v_or_b32_e32 v1, s0, v1 1497; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1498; CI-NEXT: s_endpgm 1499 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1500 %tid.ext = sext i32 %tid to i64 1501 %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext 1502 %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext 1503 %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep 1504 %val.trunc = trunc i32 %val to i16 1505 %val.cvt = 
bitcast i16 %val.trunc to half
  %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 3
  store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
  ret void
}

; Static insert into element 2 of a <4 x i16> vector.
; The vector is loaded from %in at an offset derived from the workitem id, and
; the inserted value is the low 16 bits of the i32 kernel argument %val.
; The assertions below expect, on every tested target, a single v_bfi_b32 with
; a 0xffff mask applied to the second dword of the loaded pair (v1); the first
; dword (v0) is stored back unmodified.
define amdgpu_kernel void @v_insertelement_v4i16_2(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %val) #0 {
; GFX9-LABEL: v_insertelement_v4i16_2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_bfi_b32 v1, v3, s4, v1
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v4i16_2:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dword s4, s[4:5], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: s_mov_b32 s0, 0xffff
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_bfi_b32 v1, s0, v4, v1
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v4i16_2:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: s_mov_b32 s0, 0xffff
; CI-NEXT: v_mov_b32_e32 v4, s4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_bfi_b32 v1, s0, v4, v1
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
  %vec = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
  %val.trunc = trunc i32 %val to i16
  ; i16 -> i16 bitcast leaves the value unchanged; presumably kept so this
  ; body mirrors the half-typed variants of the test (TODO confirm).
  %val.cvt = bitcast i16 %val.trunc to i16
  %vecins = insertelement <4 x i16> %vec, i16 %val.cvt, i32 2
  store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out.gep
  ret void
}

; Dynamic insert into a <4 x i16> vector where the element index is produced
; by a volatile load, so it is not a compile-time constant.
; The assertions below expect: the index shifted left by 4, a 64-bit 0xffff
; mask shifted left by that amount with a vector shift, the 16-bit value
; replicated into both halves of one dword, and one v_bfi_b32 per dword of
; the loaded vector.
; FIXME: Better code on CI?
define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %val) #0 {
; GFX9-LABEL: v_insertelement_v4i16_dynamic_vgpr:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dword v2, v[0:1], off
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
; GFX9-NEXT: s_mov_b32 s3, 0
; GFX9-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2
; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3]
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s4, s4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_bfi_b32 v1, v3, s2, v1
; GFX9-NEXT: v_bfi_b32 v0, v2, s2, v0
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v4i16_dynamic_vgpr:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dword s4, s[4:5], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: s_mov_b32 s2, 0xffff
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: s_mov_b32 s3, 0
; VI-NEXT: s_and_b32 s1, s4, s2
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: s_lshl_b32 s0, s1, 16
; VI-NEXT: s_or_b32 s0, s1, s0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v4
; VI-NEXT: v_lshlrev_b64 v[4:5], v4, s[2:3]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_bfi_b32 v1, v5, s0, v1
; VI-NEXT: v_bfi_b32 v0, v4, s0, v0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v4i16_dynamic_vgpr:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dword v4, v[0:1]
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: s_mov_b32 s3, 0
; CI-NEXT: s_mov_b32 s2, 0xffff
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: s_lshl_b32 s1, s4, 16
; CI-NEXT: s_and_b32 s4, s4, s2
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; CI-NEXT: s_or_b32 s0, s4, s1
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v4
; CI-NEXT: v_lshl_b64 v[4:5], s[2:3], v4
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_bfi_b32 v1, v5, s0, v1
; CI-NEXT: v_bfi_b32 v0, v4, s0, v0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
  ; Volatile load through an undef pointer: presumably only a way to obtain an
  ; index value that is opaque to the optimizer and lives in a vector register
  ; (TODO confirm intent).
  %idx.val = load volatile i32, i32 addrspace(1)* undef
  %vec = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
  %val.trunc = trunc i32 %val to i16
  ; i16 -> i16 bitcast leaves the value unchanged; presumably kept so this
  ; body mirrors the half-typed variants of the test (TODO confirm).
  %val.cvt = bitcast i16 %val.trunc to i16
  %vecins = insertelement <4 x i16> %vec, i16 %val.cvt, i32 %idx.val
  store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out.gep
  ret void
}

; Dynamic insert into a <4 x half> vector where the element index is the i32
; kernel argument %idxval and the inserted value is the low 16 bits of the
; i32 kernel argument %val reinterpreted as half.
; The assertions below expect the index scaling (shift left by 4) and the
; 64-bit 0xffff mask shift to be done with scalar instructions (s_lshl_b32,
; s_lshl_b64), followed by one v_bfi_b32 per dword of the loaded vector.
define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val, i32 %idxval) #0 {
; GFX9-LABEL: v_insertelement_v4f16_dynamic_sgpr:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4
; GFX9-NEXT: s_mov_b32 s3, 0
; GFX9-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NEXT: s_lshl_b32 s5, s5, 4
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s5
; GFX9-NEXT: v_mov_b32_e32 v3, s4
; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_bfi_b32 v1, s3, v3, v1
; GFX9-NEXT: v_bfi_b32 v0, s2, v4, v0
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v4f16_dynamic_sgpr:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: s_mov_b32 s2, 0xffff
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: s_mov_b32 s3, 0
; VI-NEXT: s_lshl_b32 s1, s5, 4
; VI-NEXT: s_and_b32 s4, s4, s2
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], s1
; VI-NEXT: s_lshl_b32 s2, s4, 16
; VI-NEXT: s_or_b32 s2, s4, s2
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: v_mov_b32_e32 v5, s2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_bfi_b32 v1, s1, v4, v1
; VI-NEXT: v_bfi_b32 v0, s0, v5, v0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v4f16_dynamic_sgpr:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: s_mov_b32 s2, 0xffff
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: s_and_b32 s6, s4, s2
; CI-NEXT: s_mov_b32 s3, 0
; CI-NEXT: s_lshl_b32 s1, s5, 4
; CI-NEXT: s_lshl_b32 s4, s4, 16
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; CI-NEXT: s_lshl_b64 s[0:1], s[2:3], s1
; CI-NEXT: s_or_b32 s2, s6, s4
; CI-NEXT: v_mov_b32_e32 v4, s2
; CI-NEXT: v_mov_b32_e32 v5, s2
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_bfi_b32 v1, s1, v4, v1
; CI-NEXT: v_bfi_b32 v0, s0, v5, v0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
  %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
  %val.trunc = trunc i32 %val to i16
  ; Reinterpret the truncated 16 bits as a half without changing the bits.
  %val.cvt = bitcast i16 %val.trunc to half
  %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 %idxval
  store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
  ret void
}

; Intrinsic called by the kernels above; its result is sign-extended and used
; as the element offset into %in and %out.
declare i32 @llvm.amdgcn.workitem.id.x() #1

; #0 is attached to every kernel definition above, #1 to the intrinsic
; declaration and its call sites.
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }