; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,SI
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,VI

; XXX - Why the packing?
define amdgpu_kernel void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
; SI-LABEL: scalar_to_vector_v2i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT:    v_alignbit_b32 v0, v1, v0, 16
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    v_mov_b32_e32 v1, v0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: scalar_to_vector_v2i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s6, s2
; VI-NEXT:    s_mov_b32 s7, s3
; VI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT:    v_alignbit_b32 v0, v1, v0, 16
; VI-NEXT:    v_mov_b32_e32 v1, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %tmp1 = load i32, i32 addrspace(1)* %in, align 4
  %bc = bitcast i32 %tmp1 to <2 x i16>
  %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  store <4 x i16> %tmp2, <4 x i16> addrspace(1)* %out, align 8
  ret void
}

define amdgpu_kernel void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
; SI-LABEL: scalar_to_vector_v2f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT:    v_alignbit_b32 v0, v1, v0, 16
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    v_mov_b32_e32 v1, v0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: scalar_to_vector_v2f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s6, s2
; VI-NEXT:    s_mov_b32 s7, s3
; VI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT:    v_alignbit_b32 v0, v1, v0, 16
; VI-NEXT:    v_mov_b32_e32 v1, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %tmp1 = load float, float addrspace(1)* %in, align 4
  %bc = bitcast float %tmp1 to <2 x i16>
  %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  store <4 x i16> %tmp2, <4 x i16> addrspace(1)* %out, align 8
  ret void
}

define amdgpu_kernel void @scalar_to_vector_v4i16() {
; SI-LABEL: scalar_to_vector_v4i16:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v1
; SI-NEXT:    v_or_b32_e32 v1, v1, v2
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
; SI-NEXT:    v_or_b32_e32 v1, v1, v2
; SI-NEXT:    v_or_b32_e32 v0, v0, v2
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: scalar_to_vector_v4i16:
; VI:       ; %bb.0: ; %bb
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v0
; VI-NEXT:    v_or_b32_e32 v0, v1, v0
; VI-NEXT:    v_lshrrev_b16_e32 v1, 8, v0
; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v1
; VI-NEXT:    v_or_b32_e32 v1, v1, v2
; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
bb:
  %tmp = load <2 x i8>, <2 x i8> addrspace(1)* undef, align 1
  %tmp1 = shufflevector <2 x i8> %tmp, <2 x i8> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 0, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9>
  store <8 x i8> %tmp2, <8 x i8> addrspace(1)* undef, align 8
  ret void
}

define amdgpu_kernel void @scalar_to_vector_v4f16() {
; SI-LABEL: scalar_to_vector_v4f16:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v1
; SI-NEXT:    v_or_b32_e32 v1, v1, v2
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
; SI-NEXT:    v_or_b32_e32 v1, v1, v2
; SI-NEXT:    v_or_b32_e32 v0, v0, v2
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: scalar_to_vector_v4f16:
; VI:       ; %bb.0: ; %bb
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v0
; VI-NEXT:    v_or_b32_e32 v0, v1, v0
; VI-NEXT:    v_lshrrev_b16_e32 v1, 8, v0
; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v1
; VI-NEXT:    v_or_b32_e32 v1, v1, v2
; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
bb:
  %load = load half, half addrspace(1)* undef, align 1
  %tmp = bitcast half %load to <2 x i8>
  %tmp1 = shufflevector <2 x i8> %tmp, <2 x i8> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 0, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9>
  store <8 x i8> %tmp2, <8 x i8> addrspace(1)* undef, align 8
  ret void
}

; Getting a SCALAR_TO_VECTOR seems to be tricky. These cases managed
; to produce one, but for some reason never made it to selection.


; define amdgpu_kernel void @scalar_to_vector_test2(<8 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
;   %tmp1 = load i32, i32 addrspace(1)* %in, align 4
;   %bc = bitcast i32 %tmp1 to <4 x i8>

;   %tmp2 = shufflevector <4 x i8> %bc, <4 x i8> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
;   store <8 x i8> %tmp2, <8 x i8> addrspace(1)* %out, align 4
;   ret void
; }

; define amdgpu_kernel void @scalar_to_vector_test3(<4 x i32> addrspace(1)* %out) nounwind {
;   %newvec0 = insertelement <2 x i64> undef, i64 12345, i32 0
;   %newvec1 = insertelement <2 x i64> %newvec0, i64 undef, i32 1
;   %bc = bitcast <2 x i64> %newvec1 to <4 x i32>
;   %add = add <4 x i32> %bc, <i32 1, i32 2, i32 3, i32 4>
;   store <4 x i32> %add, <4 x i32> addrspace(1)* %out, align 16
;   ret void
; }

; define amdgpu_kernel void @scalar_to_vector_test4(<8 x i16> addrspace(1)* %out) nounwind {
;   %newvec0 = insertelement <4 x i32> undef, i32 12345, i32 0
;   %bc = bitcast <4 x i32> %newvec0 to <8 x i16>
;   %add = add <8 x i16> %bc, <i16 1, i16 2, i16 3, i16 4, i16 1, i16 2, i16 3, i16 4>
;   store <8 x i16> %add, <8 x i16> addrspace(1)* %out, align 16
;   ret void
; }

; define amdgpu_kernel void @scalar_to_vector_test5(<4 x i16> addrspace(1)* %out) nounwind {
;   %newvec0 = insertelement <2 x i32> undef, i32 12345, i32 0
;   %bc = bitcast <2 x i32> %newvec0 to <4 x i16>
;   %add = add <4 x i16> %bc, <i16 1, i16 2, i16 3, i16 4>
;   store <4 x i16> %add, <4 x i16> addrspace(1)* %out, align 16
;   ret void
; }

define amdgpu_kernel void @scalar_to_vector_test6(<2 x half> addrspace(1)* %out, i8 zeroext %val) nounwind {
; SI-LABEL: scalar_to_vector_test6:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: scalar_to_vector_test6:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %newvec0 = insertelement <4 x i8> undef, i8 %val, i32 0
  %bc = bitcast <4 x i8> %newvec0 to <2 x half>
  store <2 x half> %bc, <2 x half> addrspace(1)* %out
  ret void
}