; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,VI %s

; Codegen tests for the llvm.amdgcn.ubfe.i32 intrinsic (unsigned bit-field
; extract): operand selection (register vs. immediate src/offset/width) and
; the DAG combines that fold trivial extracts into and/shift/constant.

; All-register operands; note %src1 is passed as both offset and width
; (%src2 is unused), so v_bfe_u32 repeats the same SGPR.
define amdgpu_kernel void @bfe_u32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #0 {
; SI-LABEL: bfe_u32_arg_arg_arg:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_bfe_u32 v0, v0, s3, s3
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: bfe_u32_arg_arg_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_bfe_u32 v0, v0, s1, s1
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 %src1)
  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
  ret void
}

; Immediate width (123): materialized into a VGPR since it does not fit the
; instruction's inline operand position.
define amdgpu_kernel void @bfe_u32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 {
; SI-LABEL: bfe_u32_arg_arg_imm:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, 0x7b
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_bfe_u32 v0, s2, v1, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: bfe_u32_arg_arg_imm:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: v_mov_b32_e32 v1, 0x7b
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_bfe_u32 v0, s0, v0, v1
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 123)
  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
  ret void
}

; Immediate offset (123), register source and width.
define amdgpu_kernel void @bfe_u32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #0 {
; SI-LABEL: bfe_u32_arg_imm_arg:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, 0x7b
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_bfe_u32 v0, s2, v0, v1
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: bfe_u32_arg_imm_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: v_mov_b32_e32 v0, 0x7b
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_bfe_u32 v0, s0, v0, v1
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 123, i32 %src2)
  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
  ret void
}

; Immediate source (123) with register offset/width; the constant goes into
; an SGPR via s_movk_i32.
define amdgpu_kernel void @bfe_u32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) #0 {
; SI-LABEL: bfe_u32_imm_arg_arg:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_movk_i32 s0, 0x7b
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_bfe_u32 v0, s0, v0, v1
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: bfe_u32_imm_arg_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: s_movk_i32 s2, 0x7b
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_bfe_u32 v0, s2, v0, v1
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 123, i32 %src1, i32 %src2)
  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
  ret void
}

; Width 0 (register offset): the extract folds to constant 0; only the store
; remains.
define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 {
; SI-LABEL: bfe_u32_arg_0_width_reg_offset:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: bfe_u32_arg_0_width_reg_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 0)
  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
  ret void
}

; Width 0 (immediate offset): likewise folds to a store of 0.
define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 {
; SI-LABEL: bfe_u32_arg_0_width_imm_offset:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: bfe_u32_arg_0_width_imm_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 8, i32 0)
  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
  ret void
}

; ubfe(zext i8, 0, 8) is a no-op on a zero-extended byte load: no v_bfe is
; emitted, the loaded value is stored directly.
define amdgpu_kernel void @bfe_u32_zextload_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
; SI-LABEL: bfe_u32_zextload_i8:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: bfe_u32_zextload_i8:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
  %load = load i8, i8 addrspace(1)* %in
  %ext = zext i8 %load to i32
  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 0, i32 8)
  store i32 %bfe, i32 addrspace(1)* %out, align 4
  ret void
}

; ubfe(x & 0xff, 0, 8) folds away, leaving only the add + mask.
; FIXME: Should be using s_add_i32
define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
; SI-LABEL: bfe_u32_zext_in_reg_i8:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v0
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: bfe_u32_zext_in_reg_i8:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0
; VI-NEXT: v_and_b32_e32 v0, 0xff, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
  %load = load i32, i32 addrspace(1)* %in, align 4
  %add = add i32 %load, 1
  %ext = and i32 %add, 255
  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 0, i32 8)
  store i32 %bfe, i32 addrspace(1)* %out, align 4
  ret void
}

; Same as above for a 16-bit in-register zext: ubfe(x & 0xffff, 0, 16)
; folds away.
define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
; SI-LABEL: bfe_u32_zext_in_reg_i16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v0
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: bfe_u32_zext_in_reg_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0
; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
  %load = load i32, i32 addrspace(1)* %in, align 4
  %add = add i32 %load, 1
  %ext = and i32 %add, 65535
  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 0, i32 16)
  store i32 %bfe, i32 addrspace(1)* %out, align 4
  ret void
}

; Non-zero offset 1 into the masked byte: the mask narrows to 0xfe and the
; v_bfe_u32 is kept.
define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
; SI-LABEL: bfe_u32_zext_in_reg_i8_offset_1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v0
; SI-NEXT: v_and_b32_e32 v0, 0xfe, v0
; SI-NEXT: v_bfe_u32 v0, v0, 1, 8
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: bfe_u32_zext_in_reg_i8_offset_1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0
; VI-NEXT: v_and_b32_e32 v0, 0xfe, v0
; VI-NEXT: v_bfe_u32 v0, v0, 1, 8
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
  %load = load i32, i32 addrspace(1)* %in, align 4
  %add = add i32 %load, 1
  %ext = and i32 %add, 255
  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 1, i32 8)
  store i32 %bfe, i32 addrspace(1)* %out, align 4
  ret void
}

; Offset 3: mask narrows to 0xf8, v_bfe_u32 kept.
define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
; SI-LABEL: bfe_u32_zext_in_reg_i8_offset_3:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v0
; SI-NEXT: v_and_b32_e32 v0, 0xf8, v0
; SI-NEXT: v_bfe_u32 v0, v0, 3, 8
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: bfe_u32_zext_in_reg_i8_offset_3:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0
; VI-NEXT: v_and_b32_e32 v0, 0xf8, v0
; VI-NEXT: v_bfe_u32 v0, v0, 3, 8
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
  %load = load i32, i32 addrspace(1)* %in, align 4
  %add = add i32 %load, 1
  %ext = and i32 %add, 255
  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 3, i32 8)
  store i32 %bfe, i32 addrspace(1)* %out, align 4
  ret void
}

; Offset 7: only the top bit of the byte survives the mask (0x80).
define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
; SI-LABEL: bfe_u32_zext_in_reg_i8_offset_7:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v0
; SI-NEXT: v_and_b32_e32 v0, 0x80, v0
; SI-NEXT: v_bfe_u32 v0, v0, 7, 8
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: bfe_u32_zext_in_reg_i8_offset_7:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0
; VI-NEXT: v_and_b32_e32 v0, 0x80, v0
; VI-NEXT: v_bfe_u32 v0, v0, 7, 8
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
  %load = load i32, i32 addrspace(1)* %in, align 4
  %add = add i32 %load, 1
  %ext = and i32 %add, 255
  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 7, i32 8)
  store i32 %bfe, i32 addrspace(1)* %out, align 4
  ret void
}

; ubfe(x & 0xffff, 8, 8): the mask is absorbed into the bfe (8,8); no
; separate v_and is emitted.
define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
; SI-LABEL: bfe_u32_zext_in_reg_i16_offset_8:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v0
; SI-NEXT: v_bfe_u32 v0, v0, 8, 8
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: bfe_u32_zext_in_reg_i16_offset_8:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0
; VI-NEXT: v_bfe_u32 v0, v0, 8, 8
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
  %load = load i32, i32 addrspace(1)* %in, align 4
  %add = add i32 %load, 1
  %ext = and i32 %add, 65535
  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 8, i32 8)
  store i32 %bfe, i32 addrspace(1)* %out, align 4
  ret void
}

; ubfe(x, 0, 1) lowers to a plain and-with-1.
define amdgpu_kernel void @bfe_u32_test_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
; SI-LABEL: bfe_u32_test_1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v0, 1, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: bfe_u32_test_1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v0, 1, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
  %x = load i32, i32 addrspace(1)* %in, align 4
  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 0, i32 1)
  store i32 %bfe, i32 addrspace(1)* %out, align 4
  ret void
}

; ubfe(x << 31, 0, 8): the extracted bits are all zero, so the whole
; computation (including the load of %x) folds to storing 0.
define amdgpu_kernel void @bfe_u32_test_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
; SI-LABEL: bfe_u32_test_2:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, 0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: bfe_u32_test_2:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
  %x = load i32, i32 addrspace(1)* %in, align 4
  %shl = shl i32 %x, 31
  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 0, i32 8)
  store i32 %bfe, i32 addrspace(1)* %out, align 4
  ret void
}

; ubfe(x << 31, 0, 1): bit 0 of the shifted value is always 0; folds to 0.
define amdgpu_kernel void @bfe_u32_test_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
; SI-LABEL: bfe_u32_test_3:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, 0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: bfe_u32_test_3:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
  %x = load i32, i32 addrspace(1)* %in, align 4
  %shl = shl i32 %x, 31
  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 0, i32 1)
  store i32 %bfe, i32 addrspace(1)* %out, align 4
  ret void
}

; ubfe((x << 31) >> 31, 31, 1): bit 31 of the shifted-back value is known
; zero, so the result folds to 0.
define amdgpu_kernel void @bfe_u32_test_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
; SI-LABEL: bfe_u32_test_4:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, 0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: bfe_u32_test_4:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
  %x = load i32, i32 addrspace(1)* %in, align 4
  %shl = shl i32 %x, 31
  %shr = lshr i32 %shl, 31
  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shr, i32 31, i32 1)
  store i32 %bfe, i32 addrspace(1)* %out, align 4
  ret void
}

; ubfe of an ashr-based sign-extension: lowers to the signed extract
; v_bfe_i32 (0,1) instead of shifts.
define amdgpu_kernel void @bfe_u32_test_5(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
; SI-LABEL: bfe_u32_test_5:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_bfe_i32 v0, v0, 0, 1
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: bfe_u32_test_5:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_bfe_i32 v0, v0, 0, 1
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
  %x = load i32, i32 addrspace(1)* %in, align 4
  %shl = shl i32 %x, 31
  %shr = ashr i32 %shl, 31
  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shr, i32 0, i32 1)
  store i32 %bfe, i32 addrspace(1)* %out, align 4
  ret void
}

; ubfe(x << 31, 1, 31) lowers to the shl followed by a plain lshr by 1.
define amdgpu_kernel void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
; SI-LABEL: bfe_u32_test_6:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 31, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, 1, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: bfe_u32_test_6:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v0, 31, v0
; VI-NEXT: v_lshrrev_b32_e32 v0, 1, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
  %x = load i32, i32 addrspace(1)* %in, align 4
  %shl = shl i32 %x, 31
  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 1, i32 31)
  store i32 %bfe, i32 addrspace(1)* %out, align 4
  ret void
}

; ubfe(x << 31, 0, 31): the low 31 bits of the shl are its own value except
; bit 31, which is dropped — only the shl remains.
define amdgpu_kernel void @bfe_u32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
; SI-LABEL: bfe_u32_test_7:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 31, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: bfe_u32_test_7:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v0, 31, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
  %x = load i32, i32 addrspace(1)* %in, align 4
  %shl = shl i32 %x, 31
  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 0, i32 31)
  store i32 %bfe, i32 addrspace(1)* %out, align 4
  ret void
}

; ubfe(x << 31, 31, 1): shift-then-extract of the same bit folds to
; and-with-1 of the original value.
define amdgpu_kernel void @bfe_u32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
; SI-LABEL: bfe_u32_test_8:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v0, 1, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: bfe_u32_test_8:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v0, 1, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
  %x = load i32, i32 addrspace(1)* %in, align 4
  %shl = shl i32 %x, 31
  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 31, i32 1)
  store i32 %bfe, i32 addrspace(1)* %out, align 4
  ret void
}

; ubfe(x, 31, 1) lowers to lshr by 31.
define amdgpu_kernel void @bfe_u32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
; SI-LABEL: bfe_u32_test_9:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v0, 31, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: bfe_u32_test_9:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v0, 31, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
  %x = load i32, i32 addrspace(1)* %in, align 4
  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 31, i32 1)
  store i32 %bfe, i32 addrspace(1)* %out, align 4
  ret void
}

; ubfe(x, 1, 31) lowers to lshr by 1 (extract reaches the top bit).
define amdgpu_kernel void @bfe_u32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
; SI-LABEL: bfe_u32_test_10:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v0, 1, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: bfe_u32_test_10:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v0, 1, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
  %x = load i32, i32 addrspace(1)* %in, align 4
  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 1, i32 31)
  store i32 %bfe, i32 addrspace(1)* %out, align 4
  ret void
}

; ubfe(x, 8, 24) lowers to lshr by 8.
define amdgpu_kernel void @bfe_u32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
; SI-LABEL: bfe_u32_test_11:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: bfe_u32_test_11:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
  %x = load i32, i32 addrspace(1)* %in, align 4
  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 8, i32 24)
  store i32 %bfe, i32 addrspace(1)* %out, align 4
  ret void
}

; ubfe(x, 24, 8) lowers to lshr by 24.
define amdgpu_kernel void @bfe_u32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
; SI-LABEL: bfe_u32_test_12:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: bfe_u32_test_12:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
  %x = load i32, i32 addrspace(1)* %in, align 4
  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 24, i32 8)
  store i32 %bfe, i32 addrspace(1)* %out, align 4
  ret void
}

; V_ASHRREV_U32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}}
; NOTE(review): this function's IR body is truncated in the visible source;
; only the portion present here is reproduced.
define amdgpu_kernel void @bfe_u32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
; SI-LABEL: bfe_u32_test_13:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v0, 31, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: bfe_u32_test_13:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v0, 31, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
  %x = load i32, i32 addrspace(1)* %in, align 4
1008 %shl = ashr i32 %x, 31 1009 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 31, i32 1) 1010 store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void 1011} 1012 1013define amdgpu_kernel void @bfe_u32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { 1014; SI-LABEL: bfe_u32_test_14: 1015; SI: ; %bb.0: 1016; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1017; SI-NEXT: s_waitcnt lgkmcnt(0) 1018; SI-NEXT: s_mov_b32 s3, 0xf000 1019; SI-NEXT: s_mov_b32 s2, -1 1020; SI-NEXT: v_mov_b32_e32 v0, 0 1021; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1022; SI-NEXT: s_endpgm 1023; 1024; VI-LABEL: bfe_u32_test_14: 1025; VI: ; %bb.0: 1026; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1027; VI-NEXT: s_waitcnt lgkmcnt(0) 1028; VI-NEXT: s_mov_b32 s3, 0xf000 1029; VI-NEXT: s_mov_b32 s2, -1 1030; VI-NEXT: v_mov_b32_e32 v0, 0 1031; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1032; VI-NEXT: s_endpgm 1033 %x = load i32, i32 addrspace(1)* %in, align 4 1034 %shl = lshr i32 %x, 31 1035 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 31, i32 1) 1036 store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void 1037} 1038 1039; EG-NOT: BFE 1040define amdgpu_kernel void @bfe_u32_constant_fold_test_0(i32 addrspace(1)* %out) #0 { 1041; SI-LABEL: bfe_u32_constant_fold_test_0: 1042; SI: ; %bb.0: 1043; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1044; SI-NEXT: s_mov_b32 s3, 0xf000 1045; SI-NEXT: s_mov_b32 s2, -1 1046; SI-NEXT: v_mov_b32_e32 v0, 0 1047; SI-NEXT: s_waitcnt lgkmcnt(0) 1048; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1049; SI-NEXT: s_endpgm 1050; 1051; VI-LABEL: bfe_u32_constant_fold_test_0: 1052; VI: ; %bb.0: 1053; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1054; VI-NEXT: s_mov_b32 s3, 0xf000 1055; VI-NEXT: s_mov_b32 s2, -1 1056; VI-NEXT: v_mov_b32_e32 v0, 0 1057; VI-NEXT: s_waitcnt lgkmcnt(0) 1058; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1059; VI-NEXT: s_endpgm 1060 %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 0, i32 0, i32 0) 1061 store i32 %bfe_u32, 
i32 addrspace(1)* %out, align 4 1062 ret void 1063} 1064 1065; EG-NOT: BFE 1066define amdgpu_kernel void @bfe_u32_constant_fold_test_1(i32 addrspace(1)* %out) #0 { 1067; SI-LABEL: bfe_u32_constant_fold_test_1: 1068; SI: ; %bb.0: 1069; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1070; SI-NEXT: s_mov_b32 s3, 0xf000 1071; SI-NEXT: s_mov_b32 s2, -1 1072; SI-NEXT: v_mov_b32_e32 v0, 0 1073; SI-NEXT: s_waitcnt lgkmcnt(0) 1074; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1075; SI-NEXT: s_endpgm 1076; 1077; VI-LABEL: bfe_u32_constant_fold_test_1: 1078; VI: ; %bb.0: 1079; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1080; VI-NEXT: s_mov_b32 s3, 0xf000 1081; VI-NEXT: s_mov_b32 s2, -1 1082; VI-NEXT: v_mov_b32_e32 v0, 0 1083; VI-NEXT: s_waitcnt lgkmcnt(0) 1084; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1085; VI-NEXT: s_endpgm 1086 %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 12334, i32 0, i32 0) 1087 store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 1088 ret void 1089} 1090 1091; EG-NOT: BFE 1092define amdgpu_kernel void @bfe_u32_constant_fold_test_2(i32 addrspace(1)* %out) #0 { 1093; SI-LABEL: bfe_u32_constant_fold_test_2: 1094; SI: ; %bb.0: 1095; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1096; SI-NEXT: s_mov_b32 s3, 0xf000 1097; SI-NEXT: s_mov_b32 s2, -1 1098; SI-NEXT: v_mov_b32_e32 v0, 0 1099; SI-NEXT: s_waitcnt lgkmcnt(0) 1100; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1101; SI-NEXT: s_endpgm 1102; 1103; VI-LABEL: bfe_u32_constant_fold_test_2: 1104; VI: ; %bb.0: 1105; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1106; VI-NEXT: s_mov_b32 s3, 0xf000 1107; VI-NEXT: s_mov_b32 s2, -1 1108; VI-NEXT: v_mov_b32_e32 v0, 0 1109; VI-NEXT: s_waitcnt lgkmcnt(0) 1110; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1111; VI-NEXT: s_endpgm 1112 %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 0, i32 0, i32 1) 1113 store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 1114 ret void 1115} 1116 1117; EG-NOT: BFE 1118define amdgpu_kernel void @bfe_u32_constant_fold_test_3(i32 
addrspace(1)* %out) #0 { 1119; SI-LABEL: bfe_u32_constant_fold_test_3: 1120; SI: ; %bb.0: 1121; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1122; SI-NEXT: s_mov_b32 s3, 0xf000 1123; SI-NEXT: s_mov_b32 s2, -1 1124; SI-NEXT: v_mov_b32_e32 v0, 1 1125; SI-NEXT: s_waitcnt lgkmcnt(0) 1126; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1127; SI-NEXT: s_endpgm 1128; 1129; VI-LABEL: bfe_u32_constant_fold_test_3: 1130; VI: ; %bb.0: 1131; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1132; VI-NEXT: s_mov_b32 s3, 0xf000 1133; VI-NEXT: s_mov_b32 s2, -1 1134; VI-NEXT: v_mov_b32_e32 v0, 1 1135; VI-NEXT: s_waitcnt lgkmcnt(0) 1136; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1137; VI-NEXT: s_endpgm 1138 %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 1, i32 0, i32 1) 1139 store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 1140 ret void 1141} 1142 1143; EG-NOT: BFE 1144define amdgpu_kernel void @bfe_u32_constant_fold_test_4(i32 addrspace(1)* %out) #0 { 1145; SI-LABEL: bfe_u32_constant_fold_test_4: 1146; SI: ; %bb.0: 1147; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1148; SI-NEXT: s_mov_b32 s3, 0xf000 1149; SI-NEXT: s_mov_b32 s2, -1 1150; SI-NEXT: v_mov_b32_e32 v0, -1 1151; SI-NEXT: s_waitcnt lgkmcnt(0) 1152; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1153; SI-NEXT: s_endpgm 1154; 1155; VI-LABEL: bfe_u32_constant_fold_test_4: 1156; VI: ; %bb.0: 1157; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1158; VI-NEXT: s_mov_b32 s3, 0xf000 1159; VI-NEXT: s_mov_b32 s2, -1 1160; VI-NEXT: v_mov_b32_e32 v0, -1 1161; VI-NEXT: s_waitcnt lgkmcnt(0) 1162; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1163; VI-NEXT: s_endpgm 1164 %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 4294967295, i32 0, i32 1) 1165 store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 1166 ret void 1167} 1168 1169; EG-NOT: BFE 1170define amdgpu_kernel void @bfe_u32_constant_fold_test_5(i32 addrspace(1)* %out) #0 { 1171; SI-LABEL: bfe_u32_constant_fold_test_5: 1172; SI: ; %bb.0: 1173; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 
0x9 1174; SI-NEXT: s_mov_b32 s3, 0xf000 1175; SI-NEXT: s_mov_b32 s2, -1 1176; SI-NEXT: v_mov_b32_e32 v0, 1 1177; SI-NEXT: s_waitcnt lgkmcnt(0) 1178; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1179; SI-NEXT: s_endpgm 1180; 1181; VI-LABEL: bfe_u32_constant_fold_test_5: 1182; VI: ; %bb.0: 1183; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1184; VI-NEXT: s_mov_b32 s3, 0xf000 1185; VI-NEXT: s_mov_b32 s2, -1 1186; VI-NEXT: v_mov_b32_e32 v0, 1 1187; VI-NEXT: s_waitcnt lgkmcnt(0) 1188; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1189; VI-NEXT: s_endpgm 1190 %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 128, i32 7, i32 1) 1191 store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 1192 ret void 1193} 1194 1195; EG-NOT: BFE 1196define amdgpu_kernel void @bfe_u32_constant_fold_test_6(i32 addrspace(1)* %out) #0 { 1197; SI-LABEL: bfe_u32_constant_fold_test_6: 1198; SI: ; %bb.0: 1199; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1200; SI-NEXT: s_mov_b32 s3, 0xf000 1201; SI-NEXT: s_mov_b32 s2, -1 1202; SI-NEXT: v_mov_b32_e32 v0, 0x80 1203; SI-NEXT: s_waitcnt lgkmcnt(0) 1204; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1205; SI-NEXT: s_endpgm 1206; 1207; VI-LABEL: bfe_u32_constant_fold_test_6: 1208; VI: ; %bb.0: 1209; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1210; VI-NEXT: s_mov_b32 s3, 0xf000 1211; VI-NEXT: s_mov_b32 s2, -1 1212; VI-NEXT: v_mov_b32_e32 v0, 0x80 1213; VI-NEXT: s_waitcnt lgkmcnt(0) 1214; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1215; VI-NEXT: s_endpgm 1216 %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 128, i32 0, i32 8) 1217 store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 1218 ret void 1219} 1220 1221; EG-NOT: BFE 1222define amdgpu_kernel void @bfe_u32_constant_fold_test_7(i32 addrspace(1)* %out) #0 { 1223; SI-LABEL: bfe_u32_constant_fold_test_7: 1224; SI: ; %bb.0: 1225; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1226; SI-NEXT: s_mov_b32 s3, 0xf000 1227; SI-NEXT: s_mov_b32 s2, -1 1228; SI-NEXT: v_mov_b32_e32 v0, 0x7f 1229; SI-NEXT: s_waitcnt 
lgkmcnt(0) 1230; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1231; SI-NEXT: s_endpgm 1232; 1233; VI-LABEL: bfe_u32_constant_fold_test_7: 1234; VI: ; %bb.0: 1235; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1236; VI-NEXT: s_mov_b32 s3, 0xf000 1237; VI-NEXT: s_mov_b32 s2, -1 1238; VI-NEXT: v_mov_b32_e32 v0, 0x7f 1239; VI-NEXT: s_waitcnt lgkmcnt(0) 1240; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1241; VI-NEXT: s_endpgm 1242 %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 127, i32 0, i32 8) 1243 store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 1244 ret void 1245} 1246 1247; EG-NOT: BFE 1248define amdgpu_kernel void @bfe_u32_constant_fold_test_8(i32 addrspace(1)* %out) #0 { 1249; SI-LABEL: bfe_u32_constant_fold_test_8: 1250; SI: ; %bb.0: 1251; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1252; SI-NEXT: s_mov_b32 s3, 0xf000 1253; SI-NEXT: s_mov_b32 s2, -1 1254; SI-NEXT: v_mov_b32_e32 v0, 1 1255; SI-NEXT: s_waitcnt lgkmcnt(0) 1256; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1257; SI-NEXT: s_endpgm 1258; 1259; VI-LABEL: bfe_u32_constant_fold_test_8: 1260; VI: ; %bb.0: 1261; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1262; VI-NEXT: s_mov_b32 s3, 0xf000 1263; VI-NEXT: s_mov_b32 s2, -1 1264; VI-NEXT: v_mov_b32_e32 v0, 1 1265; VI-NEXT: s_waitcnt lgkmcnt(0) 1266; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1267; VI-NEXT: s_endpgm 1268 %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 127, i32 6, i32 8) 1269 store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 1270 ret void 1271} 1272 1273; EG-NOT: BFE 1274define amdgpu_kernel void @bfe_u32_constant_fold_test_9(i32 addrspace(1)* %out) #0 { 1275; SI-LABEL: bfe_u32_constant_fold_test_9: 1276; SI: ; %bb.0: 1277; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1278; SI-NEXT: s_mov_b32 s3, 0xf000 1279; SI-NEXT: s_mov_b32 s2, -1 1280; SI-NEXT: v_mov_b32_e32 v0, 1 1281; SI-NEXT: s_waitcnt lgkmcnt(0) 1282; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1283; SI-NEXT: s_endpgm 1284; 1285; VI-LABEL: bfe_u32_constant_fold_test_9: 
1286; VI: ; %bb.0: 1287; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1288; VI-NEXT: s_mov_b32 s3, 0xf000 1289; VI-NEXT: s_mov_b32 s2, -1 1290; VI-NEXT: v_mov_b32_e32 v0, 1 1291; VI-NEXT: s_waitcnt lgkmcnt(0) 1292; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1293; VI-NEXT: s_endpgm 1294 %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 65536, i32 16, i32 8) 1295 store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 1296 ret void 1297} 1298 1299; EG-NOT: BFE 1300define amdgpu_kernel void @bfe_u32_constant_fold_test_10(i32 addrspace(1)* %out) #0 { 1301; SI-LABEL: bfe_u32_constant_fold_test_10: 1302; SI: ; %bb.0: 1303; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1304; SI-NEXT: s_mov_b32 s3, 0xf000 1305; SI-NEXT: s_mov_b32 s2, -1 1306; SI-NEXT: v_mov_b32_e32 v0, 0 1307; SI-NEXT: s_waitcnt lgkmcnt(0) 1308; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1309; SI-NEXT: s_endpgm 1310; 1311; VI-LABEL: bfe_u32_constant_fold_test_10: 1312; VI: ; %bb.0: 1313; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1314; VI-NEXT: s_mov_b32 s3, 0xf000 1315; VI-NEXT: s_mov_b32 s2, -1 1316; VI-NEXT: v_mov_b32_e32 v0, 0 1317; VI-NEXT: s_waitcnt lgkmcnt(0) 1318; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1319; VI-NEXT: s_endpgm 1320 %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 65535, i32 16, i32 16) 1321 store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 1322 ret void 1323} 1324 1325; EG-NOT: BFE 1326define amdgpu_kernel void @bfe_u32_constant_fold_test_11(i32 addrspace(1)* %out) #0 { 1327; SI-LABEL: bfe_u32_constant_fold_test_11: 1328; SI: ; %bb.0: 1329; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1330; SI-NEXT: s_mov_b32 s3, 0xf000 1331; SI-NEXT: s_mov_b32 s2, -1 1332; SI-NEXT: v_mov_b32_e32 v0, 10 1333; SI-NEXT: s_waitcnt lgkmcnt(0) 1334; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1335; SI-NEXT: s_endpgm 1336; 1337; VI-LABEL: bfe_u32_constant_fold_test_11: 1338; VI: ; %bb.0: 1339; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1340; VI-NEXT: s_mov_b32 s3, 0xf000 1341; VI-NEXT: 
s_mov_b32 s2, -1 1342; VI-NEXT: v_mov_b32_e32 v0, 10 1343; VI-NEXT: s_waitcnt lgkmcnt(0) 1344; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1345; VI-NEXT: s_endpgm 1346 %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 4, i32 4) 1347 store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 1348 ret void 1349} 1350 1351; EG-NOT: BFE 1352define amdgpu_kernel void @bfe_u32_constant_fold_test_12(i32 addrspace(1)* %out) #0 { 1353; SI-LABEL: bfe_u32_constant_fold_test_12: 1354; SI: ; %bb.0: 1355; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1356; SI-NEXT: s_mov_b32 s3, 0xf000 1357; SI-NEXT: s_mov_b32 s2, -1 1358; SI-NEXT: v_mov_b32_e32 v0, 0 1359; SI-NEXT: s_waitcnt lgkmcnt(0) 1360; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1361; SI-NEXT: s_endpgm 1362; 1363; VI-LABEL: bfe_u32_constant_fold_test_12: 1364; VI: ; %bb.0: 1365; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1366; VI-NEXT: s_mov_b32 s3, 0xf000 1367; VI-NEXT: s_mov_b32 s2, -1 1368; VI-NEXT: v_mov_b32_e32 v0, 0 1369; VI-NEXT: s_waitcnt lgkmcnt(0) 1370; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1371; VI-NEXT: s_endpgm 1372 %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 31, i32 1) 1373 store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 1374 ret void 1375} 1376 1377; EG-NOT: BFE 1378define amdgpu_kernel void @bfe_u32_constant_fold_test_13(i32 addrspace(1)* %out) #0 { 1379; SI-LABEL: bfe_u32_constant_fold_test_13: 1380; SI: ; %bb.0: 1381; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1382; SI-NEXT: s_mov_b32 s3, 0xf000 1383; SI-NEXT: s_mov_b32 s2, -1 1384; SI-NEXT: v_mov_b32_e32 v0, 1 1385; SI-NEXT: s_waitcnt lgkmcnt(0) 1386; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1387; SI-NEXT: s_endpgm 1388; 1389; VI-LABEL: bfe_u32_constant_fold_test_13: 1390; VI: ; %bb.0: 1391; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1392; VI-NEXT: s_mov_b32 s3, 0xf000 1393; VI-NEXT: s_mov_b32 s2, -1 1394; VI-NEXT: v_mov_b32_e32 v0, 1 1395; VI-NEXT: s_waitcnt lgkmcnt(0) 1396; VI-NEXT: buffer_store_dword v0, off, 
s[0:3], 0 1397; VI-NEXT: s_endpgm 1398 %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 131070, i32 16, i32 16) 1399 store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 1400 ret void 1401} 1402 1403; EG-NOT: BFE 1404define amdgpu_kernel void @bfe_u32_constant_fold_test_14(i32 addrspace(1)* %out) #0 { 1405; SI-LABEL: bfe_u32_constant_fold_test_14: 1406; SI: ; %bb.0: 1407; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1408; SI-NEXT: s_mov_b32 s3, 0xf000 1409; SI-NEXT: s_mov_b32 s2, -1 1410; SI-NEXT: v_mov_b32_e32 v0, 40 1411; SI-NEXT: s_waitcnt lgkmcnt(0) 1412; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1413; SI-NEXT: s_endpgm 1414; 1415; VI-LABEL: bfe_u32_constant_fold_test_14: 1416; VI: ; %bb.0: 1417; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1418; VI-NEXT: s_mov_b32 s3, 0xf000 1419; VI-NEXT: s_mov_b32 s2, -1 1420; VI-NEXT: v_mov_b32_e32 v0, 40 1421; VI-NEXT: s_waitcnt lgkmcnt(0) 1422; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1423; VI-NEXT: s_endpgm 1424 %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 2, i32 30) 1425 store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 1426 ret void 1427} 1428 1429; EG-NOT: BFE 1430define amdgpu_kernel void @bfe_u32_constant_fold_test_15(i32 addrspace(1)* %out) #0 { 1431; SI-LABEL: bfe_u32_constant_fold_test_15: 1432; SI: ; %bb.0: 1433; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1434; SI-NEXT: s_mov_b32 s3, 0xf000 1435; SI-NEXT: s_mov_b32 s2, -1 1436; SI-NEXT: v_mov_b32_e32 v0, 10 1437; SI-NEXT: s_waitcnt lgkmcnt(0) 1438; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1439; SI-NEXT: s_endpgm 1440; 1441; VI-LABEL: bfe_u32_constant_fold_test_15: 1442; VI: ; %bb.0: 1443; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1444; VI-NEXT: s_mov_b32 s3, 0xf000 1445; VI-NEXT: s_mov_b32 s2, -1 1446; VI-NEXT: v_mov_b32_e32 v0, 10 1447; VI-NEXT: s_waitcnt lgkmcnt(0) 1448; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1449; VI-NEXT: s_endpgm 1450 %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 4, i32 28) 1451 store i32 
%bfe_u32, i32 addrspace(1)* %out, align 4 1452 ret void 1453} 1454 1455; EG-NOT: BFE 1456define amdgpu_kernel void @bfe_u32_constant_fold_test_16(i32 addrspace(1)* %out) #0 { 1457; SI-LABEL: bfe_u32_constant_fold_test_16: 1458; SI: ; %bb.0: 1459; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1460; SI-NEXT: s_mov_b32 s3, 0xf000 1461; SI-NEXT: s_mov_b32 s2, -1 1462; SI-NEXT: v_mov_b32_e32 v0, 0x7f 1463; SI-NEXT: s_waitcnt lgkmcnt(0) 1464; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1465; SI-NEXT: s_endpgm 1466; 1467; VI-LABEL: bfe_u32_constant_fold_test_16: 1468; VI: ; %bb.0: 1469; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1470; VI-NEXT: s_mov_b32 s3, 0xf000 1471; VI-NEXT: s_mov_b32 s2, -1 1472; VI-NEXT: v_mov_b32_e32 v0, 0x7f 1473; VI-NEXT: s_waitcnt lgkmcnt(0) 1474; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1475; VI-NEXT: s_endpgm 1476 %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 4294967295, i32 1, i32 7) 1477 store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 1478 ret void 1479} 1480 1481; EG-NOT: BFE 1482define amdgpu_kernel void @bfe_u32_constant_fold_test_17(i32 addrspace(1)* %out) #0 { 1483; SI-LABEL: bfe_u32_constant_fold_test_17: 1484; SI: ; %bb.0: 1485; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1486; SI-NEXT: s_mov_b32 s3, 0xf000 1487; SI-NEXT: s_mov_b32 s2, -1 1488; SI-NEXT: v_mov_b32_e32 v0, 0x7f 1489; SI-NEXT: s_waitcnt lgkmcnt(0) 1490; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1491; SI-NEXT: s_endpgm 1492; 1493; VI-LABEL: bfe_u32_constant_fold_test_17: 1494; VI: ; %bb.0: 1495; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1496; VI-NEXT: s_mov_b32 s3, 0xf000 1497; VI-NEXT: s_mov_b32 s2, -1 1498; VI-NEXT: v_mov_b32_e32 v0, 0x7f 1499; VI-NEXT: s_waitcnt lgkmcnt(0) 1500; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1501; VI-NEXT: s_endpgm 1502 %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 255, i32 1, i32 31) 1503 store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 1504 ret void 1505} 1506 1507; EG-NOT: BFE 1508define amdgpu_kernel void 
@bfe_u32_constant_fold_test_18(i32 addrspace(1)* %out) #0 { 1509; SI-LABEL: bfe_u32_constant_fold_test_18: 1510; SI: ; %bb.0: 1511; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1512; SI-NEXT: s_mov_b32 s3, 0xf000 1513; SI-NEXT: s_mov_b32 s2, -1 1514; SI-NEXT: v_mov_b32_e32 v0, 0 1515; SI-NEXT: s_waitcnt lgkmcnt(0) 1516; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1517; SI-NEXT: s_endpgm 1518; 1519; VI-LABEL: bfe_u32_constant_fold_test_18: 1520; VI: ; %bb.0: 1521; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1522; VI-NEXT: s_mov_b32 s3, 0xf000 1523; VI-NEXT: s_mov_b32 s2, -1 1524; VI-NEXT: v_mov_b32_e32 v0, 0 1525; VI-NEXT: s_waitcnt lgkmcnt(0) 1526; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1527; VI-NEXT: s_endpgm 1528 %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 255, i32 31, i32 1) 1529 store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 1530 ret void 1531} 1532 1533; Make sure that SimplifyDemandedBits doesn't cause the and to be 1534; reduced to the bits demanded by the bfe. 1535 1536; XXX: The operand to v_bfe_u32 could also just directly be the load register. 
; The 'and' has a second user (stored to %out1), so SimplifyDemandedBits must
; not narrow it to just the two bits the bfe reads; the bfe operand is the
; full 'and' result (v_and_b32 followed by v_bfe_u32 on the same register).
define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0,
; SI-LABEL: simplify_bfe_u32_multi_use_arg:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s6, s2
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
; SI-NEXT:    s_mov_b32 s0, s8
; SI-NEXT:    s_mov_b32 s1, s9
; SI-NEXT:    s_mov_b32 s4, s10
; SI-NEXT:    s_mov_b32 s5, s11
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 63, v0
; SI-NEXT:    v_bfe_u32 v1, v0, 2, 2
; SI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: simplify_bfe_u32_multi_use_arg:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s2, s10
; VI-NEXT:    s_mov_b32 s3, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    buffer_load_dword v0, off, s[0:3], 0
; VI-NEXT:    s_mov_b32 s8, s4
; VI-NEXT:    s_mov_b32 s9, s5
; VI-NEXT:    s_mov_b32 s0, s6
; VI-NEXT:    s_mov_b32 s1, s7
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_and_b32_e32 v0, 63, v0
; VI-NEXT:    v_bfe_u32 v1, v0, 2, 2
; VI-NEXT:    buffer_store_dword v1, off, s[8:11], 0
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
                                                          i32 addrspace(1)* %out1,
                                                          i32 addrspace(1)* %in) #0 {
  %src = load i32, i32 addrspace(1)* %in, align 4
  %and = and i32 %src, 63
  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %and, i32 2, i32 2)
  store i32 %bfe_u32, i32 addrspace(1)* %out0, align 4
  store i32 %and, i32 addrspace(1)* %out1, align 4
  ret void
}

; (%a >> 6) & 7 is matched into a single s_bfe_u32: immediate 0x30006
; encodes width 3 in the high half and offset 6 in the low half
; (cross-check: shl_lshr below uses 0x150002 = width 21, offset 2).
define amdgpu_kernel void @lshr_and(i32 addrspace(1)* %out, i32 %a) #0 {
; SI-LABEL: lshr_and:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_bfe_u32 s0, s2, 0x30006
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: lshr_and:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_bfe_u32 s0, s0, 0x30006
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %b = lshr i32 %a, 6
  %c = and i32 %b, 7
  store i32 %c, i32 addrspace(1)* %out, align 8
  ret void
}

; With a variable shift amount no bfe can be formed; the lshr+and pair is
; kept as separate s_lshr_b32 / s_and_b32 instructions.
define amdgpu_kernel void @v_lshr_and(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
; SI-LABEL: v_lshr_and:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshr_b32 s0, s2, s3
; SI-NEXT:    s_and_b32 s0, s0, 7
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_lshr_and:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s0, s0, s1
; VI-NEXT:    s_and_b32 s0, s0, 7
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %c = lshr i32 %a, %b
  %d = and i32 %c, 7
  store i32 %d, i32 addrspace(1)* %out, align 8
  ret void
}

; The commuted form: (%a & 448) >> 6 with 448 = 0b111000000 selects the same
; 3-bit field at offset 6, so the same s_bfe_u32 0x30006 is produced.
define amdgpu_kernel void @and_lshr(i32 addrspace(1)* %out, i32 %a) #0 {
; SI-LABEL: and_lshr:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_bfe_u32 s0, s2, 0x30006
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: and_lshr:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_bfe_u32 s0, s0, 0x30006
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %b = and i32 %a, 448
  %c = lshr i32 %b, 6
  store i32 %c, i32 addrspace(1)* %out, align 8
  ret void
}

; Mask 511 = 0b111111111 covers 9 bits, but after the shift by 6 only the
; top 3 of them remain, so this still folds to the same s_bfe_u32 0x30006.
define amdgpu_kernel void @and_lshr2(i32 addrspace(1)* %out, i32 %a) #0 {
; SI-LABEL: and_lshr2:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_bfe_u32 s0, s2, 0x30006
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: and_lshr2:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_bfe_u32 s0, s0, 0x30006
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %b = and i32 %a, 511
  %c = lshr i32 %b, 6
  store i32 %c, i32 addrspace(1)* %out, align 8
  ret void
}

; (%a << 9) >> 11 extracts a 21-bit field starting at bit 2: s_bfe_u32 with
; immediate 0x150002 (width 21 in the high half, offset 2 in the low half).
define amdgpu_kernel void @shl_lshr(i32 addrspace(1)* %out, i32 %a) #0 {
; SI-LABEL: shl_lshr:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_bfe_u32 s0, s2, 0x150002
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: shl_lshr:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_bfe_u32 s0, s0, 0x150002
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %b = shl i32 %a, 9
  %c = lshr i32 %b, 11
  store i32 %c, i32 addrspace(1)* %out, align 8
  ret void
}

declare i32 @llvm.amdgcn.ubfe.i32(i32, i32, i32) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }