; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s

declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
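
; Patterns of the form (uitofp (and x, 255)) should select the
; v_cvt_f32_ubyte[0-3] byte conversion instructions. On VI the byte can be
; picked with an SDWA byte select; SI has to materialize the mask first.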

define float @v_uitofp_i32_to_f32_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_i32_to_f32_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_i32_to_f32_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i32 %arg0, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}

define float @v_sitofp_i32_to_f32_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_sitofp_i32_to_f32_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_sitofp_i32_to_f32_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i32 %arg0, 255
  %cvt = sitofp i32 %masked to float
  ret float %cvt
}
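
; The shift amount is not a multiple of 8, so the shift cannot be folded
; into a byte select and must be emitted separately.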
define float @v_uitofp_to_f32_lshr7_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_to_f32_lshr7_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v0, 7, v0
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f32_lshr7_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v0, 7, v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %lshr.7 = lshr i32 %arg0, 7
  %masked = and i32 %lshr.7, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}

define float @v_uitofp_to_f32_lshr8_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_to_f32_lshr8_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f32_lshr8_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %lshr.8 = lshr i32 %arg0, 8
  %masked = and i32 %lshr.8, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}
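
; The shifted value has a second use (the store), so the shift still has to
; be emitted.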
define float @v_uitofp_to_f32_multi_use_lshr8_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
; VI-NEXT:    flat_store_dword v[0:1], v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_setpc_b64 s[30:31]
  %lshr.8 = lshr i32 %arg0, 8
  store i32 %lshr.8, i32 addrspace(1)* undef
  %masked = and i32 %lshr.8, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}

define float @v_uitofp_to_f32_lshr16_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_to_f32_lshr16_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f32_lshr16_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, 0xff
; VI-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %lshr.16 = lshr i32 %arg0, 16
  %masked = and i32 %lshr.16, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}

define float @v_uitofp_to_f32_lshr24_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_to_f32_lshr24_mask255:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %lshr.24 = lshr i32 %arg0, 24
  %masked = and i32 %lshr.24, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}

define float @v_uitofp_i8_to_f32(i8 %arg0) nounwind {
; SI-LABEL: v_uitofp_i8_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_i8_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %cvt = uitofp i8 %arg0 to float
  ret float %cvt
}

define <2 x float> @v_uitofp_v2i8_to_v2f32(i16 %arg0) nounwind {
; SI-LABEL: v_uitofp_v2i8_to_v2f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; SI-NEXT:    s_movk_i32 s4, 0xff
; SI-NEXT:    v_and_b32_e32 v0, s4, v0
; SI-NEXT:    v_and_b32_e32 v1, s4, v1
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_v2i8_to_v2f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %val = bitcast i16 %arg0 to <2 x i8>
  %cvt = uitofp <2 x i8> %val to <2 x float>
  ret <2 x float> %cvt
}

define <3 x float> @v_uitofp_v3i8_to_v3f32(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_v3i8_to_v3f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; SI-NEXT:    s_movk_i32 s4, 0xff
; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; SI-NEXT:    v_and_b32_e32 v0, s4, v0
; SI-NEXT:    v_and_b32_e32 v1, s4, v1
; SI-NEXT:    v_and_b32_e32 v2, s4, v2
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_v3i8_to_v3f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    s_movk_i32 s4, 0xff
; VI-NEXT:    v_mov_b32_e32 v2, s4
; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_mov_b32_e32 v0, v3
; VI-NEXT:    s_setpc_b64 s[30:31]
  %trunc = trunc i32 %arg0 to i24
  %val = bitcast i24 %trunc to <3 x i8>
  %cvt = uitofp <3 x i8> %val to <3 x float>
  ret <3 x float> %cvt
}

define <4 x float> @v_uitofp_v4i8_to_v4f32(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_v4i8_to_v4f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_movk_i32 s4, 0xff
; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; SI-NEXT:    v_and_b32_e32 v3, s4, v0
; SI-NEXT:    v_and_b32_e32 v1, s4, v1
; SI-NEXT:    v_and_b32_e32 v2, s4, v2
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v3
; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
; SI-NEXT:    v_mov_b32_e32 v0, v4
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_v4i8_to_v4f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    s_movk_i32 s4, 0xff
; VI-NEXT:    v_mov_b32_e32 v2, s4
; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; VI-NEXT:    v_and_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
; VI-NEXT:    v_mov_b32_e32 v0, v4
; VI-NEXT:    s_setpc_b64 s[30:31]
  %val = bitcast i32 %arg0 to <4 x i8>
  %cvt = uitofp <4 x i8> %val to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @v_uitofp_unpack_i32_to_v4f32(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_unpack_i32_to_v4f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_movk_i32 s4, 0xff
; SI-NEXT:    v_and_b32_e32 v1, s4, v0
; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v1
; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; SI-NEXT:    v_and_b32_e32 v1, s4, v1
; SI-NEXT:    v_and_b32_e32 v2, s4, v2
; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
; SI-NEXT:    v_mov_b32_e32 v0, v4
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_unpack_i32_to_v4f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    s_movk_i32 s4, 0xff
; VI-NEXT:    v_mov_b32_e32 v2, s4
; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; VI-NEXT:    v_and_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
; VI-NEXT:    v_mov_b32_e32 v0, v4
; VI-NEXT:    s_setpc_b64 s[30:31]
  %mask.arg0 = and i32 %arg0, 255
  %cvt0 = uitofp i32 %mask.arg0 to float

  %lshr.8 = lshr i32 %arg0, 8
  %mask.lshr.8 = and i32 %lshr.8, 255
  %cvt1 = uitofp i32 %mask.lshr.8 to float

  %lshr.16 = lshr i32 %arg0, 16
  %mask.lshr.16 = and i32 %lshr.16, 255
  %cvt2 = uitofp i32 %mask.lshr.16 to float

  %lshr.24 = lshr i32 %arg0, 24
  %mask.lshr.24 = and i32 %lshr.24, 255
  %cvt3 = uitofp i32 %mask.lshr.24 to float

  %ins.0 = insertelement <4 x float> undef, float %cvt0, i32 0
  %ins.1 = insertelement <4 x float> %ins.0, float %cvt1, i32 1
  %ins.2 = insertelement <4 x float> %ins.1, float %cvt2, i32 2
  %ins.3 = insertelement <4 x float> %ins.2, float %cvt3, i32 3
  ret <4 x float> %ins.3
}
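
; There is no byte to f16 conversion instruction, so the f16 cases convert
; to f32 first and then round with v_cvt_f16_f32.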
define half @v_uitofp_i32_to_f16_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_i32_to_f16_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_i32_to_f16_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i32 %arg0, 255
  %cvt = uitofp i32 %masked to half
  ret half %cvt
}

define half @v_sitofp_i32_to_f16_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_sitofp_i32_to_f16_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_sitofp_i32_to_f16_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i32 %arg0, 255
  %cvt = sitofp i32 %masked to half
  ret half %cvt
}

define half @v_uitofp_to_f16_lshr8_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_to_f16_lshr8_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f16_lshr8_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %lshr.8 = lshr i32 %arg0, 8
  %masked = and i32 %lshr.8, 255
  %cvt = uitofp i32 %masked to half
  ret half %cvt
}

define half @v_uitofp_to_f16_lshr16_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_to_f16_lshr16_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f16_lshr16_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, 0xff
; VI-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %lshr.16 = lshr i32 %arg0, 16
  %masked = and i32 %lshr.16, 255
  %cvt = uitofp i32 %masked to half
  ret half %cvt
}

define half @v_uitofp_to_f16_lshr24_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_to_f16_lshr24_mask255:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %lshr.24 = lshr i32 %arg0, 24
  %masked = and i32 %lshr.24, 255
  %cvt = uitofp i32 %masked to half
  ret half %cvt
}

define half @v_uitofp_i8_to_f16(i8 %arg0) nounwind {
; SI-LABEL: v_uitofp_i8_to_f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_i8_to_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %cvt = uitofp i8 %arg0 to half
  ret half %cvt
}
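
; v_cvt_f64_u32 has no byte-select variants, so for f64 the mask (and any
; shift) remains explicit.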
define double @v_uitofp_i32_to_f64_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_i32_to_f64_mask255:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i32 %arg0, 255
  %cvt = uitofp i32 %masked to double
  ret double %cvt
}

define double @v_uitofp_to_f64_lshr8_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_to_f64_lshr8_mask255:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
; GCN-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %lshr.8 = lshr i32 %arg0, 8
  %masked = and i32 %lshr.8, 255
  %cvt = uitofp i32 %masked to double
  ret double %cvt
}

define double @v_uitofp_to_f64_lshr16_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_to_f64_lshr16_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f64_lshr16_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, 0xff
; VI-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %lshr.16 = lshr i32 %arg0, 16
  %masked = and i32 %lshr.16, 255
  %cvt = uitofp i32 %masked to double
  ret double %cvt
}

define double @v_uitofp_to_f64_lshr24_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_to_f64_lshr24_mask255:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %lshr.24 = lshr i32 %arg0, 24
  %masked = and i32 %lshr.24, 255
  %cvt = uitofp i32 %masked to double
  ret double %cvt
}

define double @v_uitofp_i8_to_f64(i8 %arg0) nounwind {
; GCN-LABEL: v_uitofp_i8_to_f64:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %cvt = uitofp i8 %arg0 to double
  ret double %cvt
}
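
; The same folds should apply when the byte source comes from memory.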
define amdgpu_kernel void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: load_i8_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: load_i8_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v2, s1
; VI-NEXT:    v_mov_b32_e32 v1, s0
; VI-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v3, vcc
; VI-NEXT:    flat_load_ubyte v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid
  %load = load i8, i8 addrspace(1)* %gep, align 1
  %cvt = uitofp i8 %load to float
  store float %cvt, float addrspace(1)* %out, align 4
  ret void
}

; FIXME:
; define amdgpu_kernel void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
;   %tid = call i32 @llvm.amdgcn.workitem.id.x()
;   %gep = getelementptr <2 x i8>, <2 x i8> addrspace(1)* %in, i32 %tid
;   %load = load <2 x i8>, <2 x i8> addrspace(1)* %gep, align 2
;   %cvt = uitofp <2 x i8> %load to <2 x float>
;   store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16
;   ret void
; }

; FIXME:
; define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
;   %tid = call i32 @llvm.amdgcn.workitem.id.x()
;   %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid
;   %load = load <3 x i8>, <3 x i8> addrspace(1)* %gep, align 4
;   %cvt = uitofp <3 x i8> %load to <3 x float>
;   store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16
;   ret void
; }

; define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
;   %tid = call i32 @llvm.amdgcn.workitem.id.x()
;   %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
;   %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
;   %cvt = uitofp <4 x i8> %load to <4 x float>
;   store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
;   ret void
; }

; This should not be adding instructions to shift into the correct
; position in the word for the component.

; FIXME: Packing bytes
define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
; SI-LABEL: load_v4i8_to_v4f32_unaligned:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1
; SI-NEXT:    buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:2
; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3
; SI-NEXT:    s_movk_i32 s0, 0xff
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; SI-NEXT:    s_waitcnt vmcnt(3)
; SI-NEXT:    v_and_b32_e32 v1, s0, v2
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_and_b32_e32 v2, s0, v3
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_and_b32_e32 v3, s0, v4
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_and_b32_e32 v4, s0, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v1
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v2
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v3
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v3, v4
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: load_v4i8_to_v4f32_unaligned:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v4, vcc, 2, v0
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v6, vcc, 3, v0
; VI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_ubyte v0, v[0:1]
; VI-NEXT:    flat_load_ubyte v1, v[2:3]
; VI-NEXT:    flat_load_ubyte v2, v[4:5]
; VI-NEXT:    flat_load_ubyte v3, v[6:7]
; VI-NEXT:    v_mov_b32_e32 v5, s3
; VI-NEXT:    v_mov_b32_e32 v4, s2
; VI-NEXT:    s_waitcnt vmcnt(3)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
  %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1
  %cvt = uitofp <4 x i8> %load to <4 x float>
  store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
; Instructions still emitted to repack bytes for add use.
; define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
;   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
;   %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
;   %load = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4
;   %cvt = uitofp <4 x i8> %load to <4 x float>
;   store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
;   %add = add <4 x i8> %load, <i8 9, i8 9, i8 9, i8 9> ; Second use of %load
;   store <4 x i8> %add, <4 x i8> addrspace(1)* %out2, align 4
;   ret void
; }

; Make sure this doesn't crash.
; FIXME:
; define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
;   %tid = call i32 @llvm.amdgcn.workitem.id.x()
;   %gep = getelementptr <7 x i8>, <7 x i8> addrspace(1)* %in, i32 %tid
;   %load = load <7 x i8>, <7 x i8> addrspace(1)* %gep, align 1
;   %cvt = uitofp <7 x i8> %load to <7 x float>
;   store <7 x float> %cvt, <7 x float> addrspace(1)* %out, align 16
;   ret void
; }

; FIXME
; define amdgpu_kernel void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
;   %tid = call i32 @llvm.amdgcn.workitem.id.x()
;   %gep = getelementptr <8 x i8>, <8 x i8> addrspace(1)* %in, i32 %tid
;   %load = load <8 x i8>, <8 x i8> addrspace(1)* %gep, align 8
;   %cvt = uitofp <8 x i8> %load to <8 x float>
;   store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16
;   ret void
; }

define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: i8_zext_inreg_i32_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_i32_e32 v0, vcc, 2, v0
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: i8_zext_inreg_i32_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_u32_e32 v0, vcc, 2, v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %load = load i32, i32 addrspace(1)* %gep, align 4
  %add = add i32 %load, 2
  %inreg = and i32 %add, 255
  %cvt = uitofp i32 %inreg to float
  store float %cvt, float addrspace(1)* %out, align 4
  ret void
}

define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: i8_zext_inreg_hi1_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff00, v0
; SI-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: i8_zext_inreg_hi1_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_and_b32_e32 v0, 0xff00, v0
; VI-NEXT:    v_cvt_f32_ubyte1_e32 v2, v0
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %load = load i32, i32 addrspace(1)* %gep, align 4
  %inreg = and i32 %load, 65280
  %shr = lshr i32 %inreg, 8
  %cvt = uitofp i32 %shr to float
  store float %cvt, float addrspace(1)* %out, align 4
  ret void
}

; We don't get these ones because of the zext, but instcombine removes
; them so it shouldn't really matter.
define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: i8_zext_i32_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: i8_zext_i32_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v2, s1
; VI-NEXT:    v_mov_b32_e32 v1, s0
; VI-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v3, vcc
; VI-NEXT:    flat_load_ubyte v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid
  %load = load i8, i8 addrspace(1)* %gep, align 1
  %ext = zext i8 %load to i32
  %cvt = uitofp i32 %ext to float
  store float %cvt, float addrspace(1)* %out, align 4
  ret void
}

define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
; SI-LABEL: v4i8_zext_v4i32_to_v4f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1
; SI-NEXT:    buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:2
; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3
; SI-NEXT:    s_movk_i32 s0, 0xff
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; SI-NEXT:    s_waitcnt vmcnt(3)
; SI-NEXT:    v_and_b32_e32 v1, s0, v2
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_and_b32_e32 v2, s0, v3
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_and_b32_e32 v3, s0, v4
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_and_b32_e32 v4, s0, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v1
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v2
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v3
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v3, v4
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v4i8_zext_v4i32_to_v4f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v4, vcc, 2, v0
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v6, vcc, 3, v0
; VI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_ubyte v0, v[0:1]
; VI-NEXT:    flat_load_ubyte v1, v[2:3]
; VI-NEXT:    flat_load_ubyte v2, v[4:5]
; VI-NEXT:    flat_load_ubyte v3, v[6:7]
; VI-NEXT:    v_mov_b32_e32 v5, s3
; VI-NEXT:    v_mov_b32_e32 v4, s2
; VI-NEXT:    s_waitcnt vmcnt(3)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
  %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1
  %ext = zext <4 x i8> %load to <4 x i32>
  %cvt = uitofp <4 x i32> %ext to <4 x float>
  store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
  ret void
}

define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: extract_byte0_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: extract_byte0_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %val = load i32, i32 addrspace(1)* %gep
  %and = and i32 %val, 255
  %cvt = uitofp i32 %and to float
  store float %cvt, float addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: extract_byte1_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: extract_byte1_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %val = load i32, i32 addrspace(1)* %gep
  %srl = lshr i32 %val, 8
  %and = and i32 %srl, 255
  %cvt = uitofp i32 %and to float
  store float %cvt, float addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: extract_byte2_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: extract_byte2_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, 0xff
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %val = load i32, i32 addrspace(1)* %gep
  %srl = lshr i32 %val, 16
  %and = and i32 %srl, 255
  %cvt = uitofp i32 %and to float
  store float %cvt, float addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: extract_byte3_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: extract_byte3_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte3_e32 v2, v0
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %val = load i32, i32 addrspace(1)* %gep
  %srl = lshr i32 %val, 24
  %and = and i32 %srl, 255
  %cvt = uitofp i32 %and to float
  store float %cvt, float addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @cvt_ubyte0_or_multiuse(i32 addrspace(1)* %in, float addrspace(1)* %out) {
; SI-LABEL: cvt_ubyte0_or_multiuse:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_or_b32_e32 v0, 0x80000001, v0
; SI-NEXT:    v_and_b32_e32 v1, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
; SI-NEXT:    v_add_f32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: cvt_ubyte0_or_multiuse:
; VI:       ; %bb.0: ; %bb
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_or_b32_e32 v0, 0x80000001, v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_add_f32_e32 v2, v0, v1
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %lid
  %load = load i32, i32 addrspace(1)* %gep
  %or = or i32 %load, -2147483647
  %and = and i32 %or, 255
  %uitofp = uitofp i32 %and to float
  %cast = bitcast i32 %or to float
  %add = fadd float %cast, %uitofp
  store float %add, float addrspace(1)* %out
  ret void
}
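
; The i64 source is converted with the generic inline expansion; the byte
; mask is not folded into it, so the full sequence is emitted.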
define float @v_test_sitofp_i64_byte_to_f32(i64 %arg0) {
; SI-LABEL: v_test_sitofp_i64_byte_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_movk_i32 s6, 0xff
; SI-NEXT:    v_and_b32_e32 v2, s6, v0
; SI-NEXT:    v_add_i32_e32 v2, vcc, 0, v2
; SI-NEXT:    v_ffbh_u32_e32 v4, v2
; SI-NEXT:    v_addc_u32_e64 v3, s[4:5], 0, 0, vcc
; SI-NEXT:    v_add_i32_e32 v4, vcc, 32, v4
; SI-NEXT:    v_ffbh_u32_e32 v5, v3
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
; SI-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v5, 0xbe
; SI-NEXT:    v_sub_i32_e32 v6, vcc, v5, v4
; SI-NEXT:    v_lshl_b64 v[4:5], v[2:3], v4
; SI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; SI-NEXT:    v_and_b32_e32 v3, 0x7fffffff, v5
; SI-NEXT:    v_cndmask_b32_e32 v2, 0, v6, vcc
; SI-NEXT:    v_and_b32_e32 v5, s6, v3
; SI-NEXT:    v_lshrrev_b32_e32 v3, 8, v3
; SI-NEXT:    v_lshlrev_b32_e32 v2, 23, v2
; SI-NEXT:    s_mov_b32 s4, 0
; SI-NEXT:    s_movk_i32 s5, 0x80
; SI-NEXT:    v_or_b32_e32 v2, v2, v3
; SI-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[4:5]
; SI-NEXT:    v_and_b32_e32 v3, 1, v2
; SI-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
; SI-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[4:5]
; SI-NEXT:    v_mov_b32_e32 v0, 0
; SI-NEXT:    v_cndmask_b32_e64 v3, v3, 1, vcc
; SI-NEXT:    v_mov_b32_e32 v1, v0
; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
; SI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v0, v2, -v2, vcc
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_test_sitofp_i64_byte_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    s_movk_i32 s6, 0xff
; VI-NEXT:    v_and_b32_e32 v2, s6, v0
; VI-NEXT:    v_add_u32_e32 v2, vcc, 0, v2
; VI-NEXT:    v_ffbh_u32_e32 v4, v2
; VI-NEXT:    v_addc_u32_e64 v3, s[4:5], 0, 0, vcc
; VI-NEXT:    v_add_u32_e32 v4, vcc, 32, v4
; VI-NEXT:    v_ffbh_u32_e32 v5, v3
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
; VI-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
; VI-NEXT:    v_mov_b32_e32 v5, 0xbe
; VI-NEXT:    v_sub_u32_e32 v6, vcc, v5, v4
; VI-NEXT:    v_lshlrev_b64 v[4:5], v4, v[2:3]
; VI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; VI-NEXT:    v_and_b32_e32 v3, 0x7fffffff, v5
; VI-NEXT:    v_cndmask_b32_e32 v2, 0, v6, vcc
; VI-NEXT:    v_and_b32_e32 v5, s6, v3
; VI-NEXT:    v_lshrrev_b32_e32 v3, 8, v3
; VI-NEXT:    v_lshlrev_b32_e32 v2, 23, v2
; VI-NEXT:    s_mov_b32 s4, 0
; VI-NEXT:    s_movk_i32 s5, 0x80
; VI-NEXT:    v_or_b32_e32 v2, v2, v3
; VI-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[4:5]
; VI-NEXT:    v_and_b32_e32 v3, 1, v2
; VI-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
; VI-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[4:5]
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    v_cndmask_b32_e64 v3, v3, 1, vcc
; VI-NEXT:    v_mov_b32_e32 v1, v0
; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
; VI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; VI-NEXT:    v_cndmask_b32_e64 v0, v2, -v2, vcc
; VI-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i64 %arg0, 255
  %itofp = sitofp i64 %masked to float
  ret float %itofp
}

define float @v_test_uitofp_i64_byte_to_f32(i64 %arg0) {
; SI-LABEL: v_test_uitofp_i64_byte_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_movk_i32 s4, 0xff
; SI-NEXT:    v_and_b32_e32 v0, s4, v0
; SI-NEXT:    v_ffbh_u32_e32 v2, v0
; SI-NEXT:    v_add_i32_e32 v2, vcc, 32, v2
; SI-NEXT:    v_ffbh_u32_e32 v3, 0
; SI-NEXT:    v_cmp_eq_u32_e64 vcc, 0, 0
; SI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
; SI-NEXT:    v_mov_b32_e32 v3, 0xbe
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    v_sub_i32_e32 v4, vcc, v3, v2
; SI-NEXT:    v_lshl_b64 v[2:3], v[0:1], v2
; SI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; SI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v3
; SI-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
; SI-NEXT:    v_and_b32_e32 v3, s4, v1
; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
; SI-NEXT:    v_lshlrev_b32_e32 v0, 23, v0
; SI-NEXT:    s_mov_b32 s4, 0
; SI-NEXT:    s_movk_i32 s5, 0x80
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[2:3]
; SI-NEXT:    v_and_b32_e32 v1, 1, v0
; SI-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
; SI-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
; SI-NEXT:    v_cndmask_b32_e64 v1, v1, 1, vcc
; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_test_uitofp_i64_byte_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    s_movk_i32 s4, 0xff
; VI-NEXT:    v_and_b32_e32 v0, s4, v0
; VI-NEXT:    v_ffbh_u32_e32 v2, v0
; VI-NEXT:    v_add_u32_e32 v2, vcc, 32, v2
; VI-NEXT:    v_ffbh_u32_e32 v3, 0
; VI-NEXT:    v_cmp_eq_u32_e64 vcc, 0, 0
; VI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
; VI-NEXT:    v_mov_b32_e32 v3, 0xbe
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    v_sub_u32_e32 v4, vcc, v3, v2
; VI-NEXT:    v_lshlrev_b64 v[2:3], v2, v[0:1]
; VI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; VI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v3
; VI-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
; VI-NEXT:    v_and_b32_e32 v3, s4, v1
; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
; VI-NEXT:    v_lshlrev_b32_e32 v0, 23, v0
; VI-NEXT:    s_mov_b32 s4, 0
; VI-NEXT:    s_movk_i32 s5, 0x80
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[2:3]
; VI-NEXT:    v_and_b32_e32 v1, 1, v0
; VI-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
; VI-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
; VI-NEXT:    v_cndmask_b32_e64 v1, v1, 1, vcc
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
; VI-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i64 %arg0, 255
  %itofp = uitofp i64 %masked to float
  ret float %itofp
}
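
; For an i16 source, VI can still use SDWA (with sext for the signed case);
; SI emits the mask plus a bfe.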
define float @v_test_sitofp_i16_byte_to_f32(i16 %arg0) {
; SI-LABEL: v_test_sitofp_i16_byte_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_bfe_i32 v0, v0, 0, 16
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_test_sitofp_i16_byte_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i16 %arg0, 255
  %itofp = sitofp i16 %masked to float
  ret float %itofp
}

define float @v_test_uitofp_i16_byte_to_f32(i16 %arg0) {
; SI-LABEL: v_test_uitofp_i16_byte_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_bfe_u32 v0, v0, 0, 16
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_test_uitofp_i16_byte_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i16 %arg0, 255
  %itofp = uitofp i16 %masked to float
  ret float %itofp
}