; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,SI
; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,VI

declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone

define float @v_uitofp_i32_to_f32_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_i32_to_f32_mask255:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
  %masked = and i32 %arg0, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}

define float @v_sitofp_i32_to_f32_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_sitofp_i32_to_f32_mask255:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
  %masked = and i32 %arg0, 255
  %cvt = sitofp i32 %masked to float
  ret float %cvt
}

define float @v_uitofp_to_f32_lshr7_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_to_f32_lshr7_mask255:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v0, 7, v0
; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
  %lshr.7 = lshr i32 %arg0, 7
  %masked = and i32 %lshr.7, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}

define float @v_uitofp_to_f32_lshr8_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_to_f32_lshr8_mask255:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
  %lshr.8 = lshr i32 %arg0, 8
  %masked = and i32 %lshr.8, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}

define float @v_uitofp_to_f32_multi_use_lshr8_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; SI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
  %lshr.8 = lshr i32 %arg0, 8
  store i32 %lshr.8, i32 addrspace(1)* undef
  %masked = and i32 %lshr.8, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}

define float @v_uitofp_to_f32_lshr16_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_to_f32_lshr16_mask255:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_ubyte2_e32 v0, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
  %lshr.16 = lshr i32 %arg0, 16
  %masked = and i32 %lshr.16, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}

define float @v_uitofp_to_f32_lshr24_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_to_f32_lshr24_mask255:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
  %lshr.24 = lshr i32 %arg0, 24
  %masked = and i32 %lshr.24, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}

define float @v_uitofp_i8_to_f32(i8 %arg0) nounwind {
; GCN-LABEL: v_uitofp_i8_to_f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
  %cvt = uitofp i8 %arg0 to float
  ret float %cvt
}

define <2 x float> @v_uitofp_v2i8_to_v2f32(i16 %arg0) nounwind {
; GCN-LABEL: v_uitofp_v2i8_to_v2f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_ubyte0_e32 v2, v0
; GCN-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
; GCN-NEXT: v_mov_b32_e32 v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
  %val = bitcast i16 %arg0 to <2 x i8>
  %cvt = uitofp <2 x i8> %val to <2 x float>
  ret <2 x float> %cvt
}

define <3 x float> @v_uitofp_v3i8_to_v3f32(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_v3i8_to_v3f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_ubyte0_e32 v3, v0
; GCN-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
; GCN-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
; GCN-NEXT: v_mov_b32_e32 v0, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
  %trunc = trunc i32 %arg0 to i24
  %val = bitcast i24 %trunc to <3 x i8>
  %cvt = uitofp <3 x i8> %val to <3 x float>
  ret <3 x float> %cvt
}

define <4 x float> @v_uitofp_v4i8_to_v4f32(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_v4i8_to_v4f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_ubyte0_e32 v4, v0
; GCN-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
; GCN-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
; GCN-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
; GCN-NEXT: v_mov_b32_e32 v0, v4
; GCN-NEXT: s_setpc_b64 s[30:31]
  %val = bitcast i32 %arg0 to <4 x i8>
  %cvt = uitofp <4 x i8> %val to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @v_uitofp_unpack_i32_to_v4f32(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_unpack_i32_to_v4f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_ubyte0_e32 v4, v0
; GCN-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
; GCN-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
; GCN-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
; GCN-NEXT: v_mov_b32_e32 v0, v4
; GCN-NEXT: s_setpc_b64 s[30:31]
  %mask.arg0 = and i32 %arg0, 255
  %cvt0 = uitofp i32 %mask.arg0 to float

  %lshr.8 = lshr i32 %arg0, 8
  %mask.lshr.8 = and i32 %lshr.8, 255
  %cvt1 = uitofp i32 %mask.lshr.8 to float

  %lshr.16 = lshr i32 %arg0, 16
  %mask.lshr.16 = and i32 %lshr.16, 255
  %cvt2 = uitofp i32 %mask.lshr.16 to float

  %lshr.24 = lshr i32 %arg0, 24
  %mask.lshr.24 = and i32 %lshr.24, 255
  %cvt3 = uitofp i32 %mask.lshr.24 to float

  %ins.0 = insertelement <4 x float> undef, float %cvt0, i32 0
  %ins.1 = insertelement <4 x float> %ins.0, float %cvt1, i32 1
  %ins.2 = insertelement <4 x float> %ins.1, float %cvt2, i32 2
  %ins.3 = insertelement <4 x float> %ins.2, float %cvt3, i32 3
  ret <4 x float> %ins.3
}
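
; For f16 results the byte is still converted with v_cvt_f32_ubyteN and then
; rounded with v_cvt_f16_f32. half is not a legal result type on SI, so the
; SI output converts back to f32 (hence the extra v_cvt_f32_f16).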
define half @v_uitofp_i32_to_f16_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_i32_to_f16_mask255:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_i32_to_f16_mask255:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-NEXT: s_setpc_b64 s[30:31]
  %masked = and i32 %arg0, 255
  %cvt = uitofp i32 %masked to half
  ret half %cvt
}

define half @v_sitofp_i32_to_f16_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_sitofp_i32_to_f16_mask255:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_sitofp_i32_to_f16_mask255:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-NEXT: s_setpc_b64 s[30:31]
  %masked = and i32 %arg0, 255
  %cvt = sitofp i32 %masked to half
  ret half %cvt
}

define half @v_uitofp_to_f16_lshr8_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_to_f16_lshr8_mask255:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f16_lshr8_mask255:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-NEXT: s_setpc_b64 s[30:31]
  %lshr.8 = lshr i32 %arg0, 8
  %masked = and i32 %lshr.8, 255
  %cvt = uitofp i32 %masked to half
  ret half %cvt
}

define half @v_uitofp_to_f16_lshr16_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_to_f16_lshr16_mask255:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f32_ubyte2_e32 v0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f16_lshr16_mask255:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte2_e32 v0, v0
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-NEXT: s_setpc_b64 s[30:31]
  %lshr.16 = lshr i32 %arg0, 16
  %masked = and i32 %lshr.16, 255
  %cvt = uitofp i32 %masked to half
  ret half %cvt
}

define half @v_uitofp_to_f16_lshr24_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_to_f16_lshr24_mask255:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f16_lshr24_mask255:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-NEXT: s_setpc_b64 s[30:31]
  %lshr.24 = lshr i32 %arg0, 24
  %masked = and i32 %lshr.24, 255
  %cvt = uitofp i32 %masked to half
  ret half %cvt
}

define half @v_uitofp_i8_to_f16(i8 %arg0) nounwind {
; SI-LABEL: v_uitofp_i8_to_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_i8_to_f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: s_setpc_b64 s[30:31]
  %cvt = uitofp i8 %arg0 to half
  ret half %cvt
}
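
; There is no byte-select variant of the f64 conversion, so the byte is
; extracted explicitly (v_and_b32 / v_bfe_u32 / v_lshrrev_b32) before
; v_cvt_f64_u32.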
define double @v_uitofp_i32_to_f64_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_i32_to_f64_mask255:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0
; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
; GCN-NEXT: s_setpc_b64 s[30:31]
  %masked = and i32 %arg0, 255
  %cvt = uitofp i32 %masked to double
  ret double %cvt
}

define double @v_uitofp_to_f64_lshr8_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_to_f64_lshr8_mask255:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_bfe_u32 v0, v0, 8, 8
; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
; GCN-NEXT: s_setpc_b64 s[30:31]
  %lshr.8 = lshr i32 %arg0, 8
  %masked = and i32 %lshr.8, 255
  %cvt = uitofp i32 %masked to double
  ret double %cvt
}

define double @v_uitofp_to_f64_lshr16_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_to_f64_lshr16_mask255:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_bfe_u32 v0, v0, 16, 8
; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
; GCN-NEXT: s_setpc_b64 s[30:31]
  %lshr.16 = lshr i32 %arg0, 16
  %masked = and i32 %lshr.16, 255
  %cvt = uitofp i32 %masked to double
  ret double %cvt
}

define double @v_uitofp_to_f64_lshr24_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_to_f64_lshr24_mask255:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
; GCN-NEXT: s_setpc_b64 s[30:31]
  %lshr.24 = lshr i32 %arg0, 24
  %masked = and i32 %lshr.24, 255
  %cvt = uitofp i32 %masked to double
  ret double %cvt
}

define double @v_uitofp_i8_to_f64(i8 %arg0) nounwind {
; SI-LABEL: v_uitofp_i8_to_f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_i8_to_f64:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, 0xffff
; VI-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
; VI-NEXT: s_setpc_b64 s[30:31]
  %cvt = uitofp i8 %arg0 to double
  ret double %cvt
}
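
; The same folds should apply when the byte source is a global load rather
; than a function argument.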
define amdgpu_kernel void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: load_i8_to_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: s_mov_b32 s3, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: load_i8_to_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid
  %load = load i8, i8 addrspace(1)* %gep, align 1
  %cvt = uitofp i8 %load to float
  store float %cvt, float addrspace(1)* %out, align 4
  ret void
}

define amdgpu_kernel void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
; SI-LABEL: load_v2i8_to_v2f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: s_mov_b32 s3, s7
; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: load_v2i8_to_v2f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <2 x i8>, <2 x i8> addrspace(1)* %in, i32 %tid
  %load = load <2 x i8>, <2 x i8> addrspace(1)* %gep, align 2
  %cvt = uitofp <2 x i8> %load to <2 x float>
  store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16
  ret void
}

define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
; SI-LABEL: load_v3i8_to_v3f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: s_mov_b32 s3, s7
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v2
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2
; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v2
; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: load_v3i8_to_v3f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
; VI-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid
  %load = load <3 x i8>, <3 x i8> addrspace(1)* %gep, align 4
  %cvt = uitofp <3 x i8> %load to <3 x float>
  store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16
  ret void
}

define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
; SI-LABEL: load_v4i8_to_v4f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: s_mov_b32 s3, s7
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: load_v4i8_to_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
  %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
  %cvt = uitofp <4 x i8> %load to <4 x float>
  store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
  ret void
}

; This should not be adding instructions to shift into the correct
; position in the word for the component.
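; Currently the SI path repacks two of the loaded bytes into a dword with
; shift/or and converts them with v_cvt_f32_ubyte2/3 instead of feeding each
; loaded byte to v_cvt_f32_ubyte0 directly.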

; FIXME: Packing bytes
define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
; SI-LABEL: load_v4i8_to_v4f32_unaligned:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: s_mov_b32 s3, s7
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64
; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:1
; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:2
; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cvt_f32_ubyte2_e32 v1, v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; SI-NEXT: v_or_b32_e32 v0, v0, v3
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: load_v4i8_to_v4f32_unaligned:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v0
; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ubyte v4, v[4:5]
; VI-NEXT: flat_load_ubyte v5, v[6:7]
; VI-NEXT: flat_load_ubyte v6, v[2:3]
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v5
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v6
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
  %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1
  %cvt = uitofp <4 x i8> %load to <4 x float>
  store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
; Instructions still emitted to repack bytes for add use.
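; The conversions themselves still come straight from the loaded dword; the
; repacking only rebuilds the <4 x i8> add result for the second store.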
define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
; SI-LABEL: load_v4i8_to_v4f32_2_uses:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: s_mov_b32 s3, s11
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v4, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_movk_i32 s0, 0xff
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4
; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v4
; SI-NEXT: v_and_b32_e32 v7, 0xff00, v4
; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v4
; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v4
; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v4
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v0, s0, v4
; SI-NEXT: v_add_i32_e32 v2, vcc, 9, v5
; SI-NEXT: v_or_b32_e32 v0, v7, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v6
; SI-NEXT: v_and_b32_e32 v2, s0, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x900, v0
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x9000000, v0
; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: load_v4i8_to_v4f32_2_uses:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: v_mov_b32_e32 v5, 9
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: s_mov_b32 s6, s10
; VI-NEXT: s_mov_b32 s7, s11
; VI-NEXT: s_movk_i32 s0, 0x900
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v6, 24, v4
; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v4
; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v4
; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4
; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: v_and_b32_e32 v7, 0xffffff00, v4
; VI-NEXT: v_add_u16_e32 v8, 9, v4
; VI-NEXT: v_add_u16_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v6
; VI-NEXT: v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_add_u16_e32 v0, s0, v0
; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
  %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
  %load = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4
  %cvt = uitofp <4 x i8> %load to <4 x float>
  store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
  %add = add <4 x i8> %load, <i8 9, i8 9, i8 9, i8 9> ; Second use of %load
  store <4 x i8> %add, <4 x i8> addrspace(1)* %out2, align 4
  ret void
}

; Make sure this doesn't crash.
define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
; SI-LABEL: load_v7i8_to_v7f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: s_mov_b32 s3, s7
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1
; SI-NEXT: buffer_load_ubyte v6, v[0:1], s[0:3], 0 addr64 offset:2
; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:3
; SI-NEXT: buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:4
; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:5
; SI-NEXT: buffer_load_ubyte v8, v[0:1], s[0:3], 0 addr64 offset:6
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2
; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_cvt_f32_ubyte2_e32 v1, v3
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v4
; SI-NEXT: v_or_b32_e32 v3, v9, v6
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_ubyte2_e32 v5, v5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v8
; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:24
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v7
; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v2
; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v2
; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: load_v7i8_to_v7f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ubyte v12, v[4:5]
; VI-NEXT: v_add_u32_e32 v4, vcc, 6, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v6, vcc, 4, v0
; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v8, vcc, 5, v0
; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v10, vcc, 1, v0
; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ubyte v8, v[8:9]
; VI-NEXT: flat_load_ubyte v9, v[10:11]
; VI-NEXT: flat_load_ubyte v6, v[6:7]
; VI-NEXT: flat_load_ubyte v7, v[4:5]
; VI-NEXT: flat_load_ubyte v2, v[2:3]
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_cvt_f32_ubyte2_e32 v5, v8
; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v9
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v6
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v7
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v2, v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v2
; VI-NEXT: buffer_store_dwordx3 v[4:6], off, s[4:7], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <7 x i8>, <7 x i8> addrspace(1)* %in, i32 %tid
  %load = load <7 x i8>, <7 x i8> addrspace(1)* %gep, align 1
  %cvt = uitofp <7 x i8> %load to <7 x float>
  store <7 x float> %cvt, <7 x float> addrspace(1)* %out, align 16
  ret void
}

define amdgpu_kernel void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
; SI-LABEL: load_v8i8_to_v8f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: s_mov_b32 s3, s7
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dwordx2 v[7:8], v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v7
; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v7
; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v7
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v7
; SI-NEXT: v_cvt_f32_ubyte3_e32 v7, v8
; SI-NEXT: v_cvt_f32_ubyte2_e32 v6, v8
; SI-NEXT: v_cvt_f32_ubyte1_e32 v5, v8
; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v8
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: load_v8i8_to_v8f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[7:8], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v7
; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v7
; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v7
; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v7
; VI-NEXT: v_cvt_f32_ubyte3_e32 v7, v8
; VI-NEXT: v_cvt_f32_ubyte2_e32 v6, v8
; VI-NEXT: v_cvt_f32_ubyte1_e32 v5, v8
; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v8
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <8 x i8>, <8 x i8> addrspace(1)* %in, i32 %tid
  %load = load <8 x i8>, <8 x i8> addrspace(1)* %gep, align 8
  %cvt = uitofp <8 x i8> %load to <8 x float>
  store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16
  ret void
}
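
; zext_inreg patterns: (and (add x, 2), 255) and (lshr (and x, 0xff00), 8)
; should also select the byte-select conversions.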
define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: i8_zext_inreg_i32_to_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: s_mov_b32 s3, s7
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v0, vcc, 2, v0
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: i8_zext_inreg_i32_to_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0
; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %load = load i32, i32 addrspace(1)* %gep, align 4
  %add = add i32 %load, 2
  %inreg = and i32 %add, 255
  %cvt = uitofp i32 %inreg to float
  store float %cvt, float addrspace(1)* %out, align 4
  ret void
}

define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: i8_zext_inreg_hi1_to_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: s_mov_b32 s3, s7
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: i8_zext_inreg_hi1_to_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %load = load i32, i32 addrspace(1)* %gep, align 4
  %inreg = and i32 %load, 65280
  %shr = lshr i32 %inreg, 8
  %cvt = uitofp i32 %shr to float
  store float %cvt, float addrspace(1)* %out, align 4
  ret void
}

; We don't get these ones because of the zext, but instcombine removes
; them so it shouldn't really matter.
define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: i8_zext_i32_to_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: s_mov_b32 s3, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: i8_zext_i32_to_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid
  %load = load i8, i8 addrspace(1)* %gep, align 1
  %ext = zext i8 %load to i32
  %cvt = uitofp i32 %ext to float
  store float %cvt, float addrspace(1)* %out, align 4
  ret void
}

define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
; SI-LABEL: v4i8_zext_v4i32_to_v4f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: s_mov_b32 s3, s7
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64
; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:1
; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:2
; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cvt_f32_ubyte2_e32 v1, v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; SI-NEXT: v_or_b32_e32 v0, v0, v3
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v4i8_zext_v4i32_to_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v6, vcc, 1, v0
; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ubyte v4, v[4:5]
; VI-NEXT: flat_load_ubyte v5, v[6:7]
; VI-NEXT: flat_load_ubyte v2, v[2:3]
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v1
; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v1
; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v5
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
  %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1
  %ext = zext <4 x i8> %load to <4 x i32>
  %cvt = uitofp <4 x i32> %ext to <4 x float>
  store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
  ret void
}

define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: extract_byte0_to_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: s_mov_b32 s3, s7
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: extract_byte0_to_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %val = load i32, i32 addrspace(1)* %gep
  %and = and i32 %val, 255
  %cvt = uitofp i32 %and to float
  store float %cvt, float addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: extract_byte1_to_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: s_mov_b32 s3, s7
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: extract_byte1_to_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %val = load i32, i32 addrspace(1)* %gep
  %srl = lshr i32 %val, 8
  %and = and i32 %srl, 255
  %cvt = uitofp i32 %and to float
  store float %cvt, float addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: extract_byte2_to_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: s_mov_b32 s3, s7
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_ubyte2_e32 v0, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: extract_byte2_to_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte2_e32 v0, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %val = load i32, i32 addrspace(1)* %gep
  %srl = lshr i32 %val, 16
  %and = and i32 %srl, 255
  %cvt = uitofp i32 %and to float
  store float %cvt, float addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: extract_byte3_to_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: s_mov_b32 s3, s7
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: extract_byte3_to_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %val = load i32, i32 addrspace(1)* %gep
  %srl = lshr i32 %val, 24
  %and = and i32 %srl, 255
  %cvt = uitofp i32 %and to float
  store float %cvt, float addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @cvt_ubyte0_or_multiuse(i32 addrspace(1)* %in, float addrspace(1)* %out) {
; SI-LABEL: cvt_ubyte0_or_multiuse:
; SI: ; %bb.0: ; %bb
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s0, s6
; SI-NEXT: s_mov_b32 s1, s7
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v0, 0x80000001, v0
; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v0
; SI-NEXT: v_add_f32_e32 v0, v0, v1
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: cvt_ubyte0_or_multiuse:
; VI: ; %bb.0: ; %bb
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_mov_b32 s0, s6
; VI-NEXT: s_mov_b32 s1, s7
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_e32 v0, 0x80000001, v0
; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v0
; VI-NEXT: v_add_f32_e32 v0, v0, v1
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %lid
  %load = load i32, i32 addrspace(1)* %gep
  %or = or i32 %load, -2147483647
  %and = and i32 %or, 255
  %uitofp = uitofp i32 %and to float
  %cast = bitcast i32 %or to float
  %add = fadd float %cast, %uitofp
  store float %add, float addrspace(1)* %out
  ret void
}