; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=R600 %s
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=GFX8 %s

; Each kernel below performs a udiv and a urem on the same pair of operands,
; for i32, <2 x i32> and <4 x i32>. The expected instruction sequences are
; checked per target under the R600, GFX6 and GFX8 prefixes; regenerate them
; with the script named on the first line instead of editing them by hand.

; Scalar case. The quotient of %x and %y is stored through %out0 and the
; remainder through %out1, so both results are observable. Unnamed [8 x i32]
; parameters sit between the named kernel arguments (presumably to space out
; their kernarg offsets so each one is loaded on its own -- to be confirmed).
define amdgpu_kernel void @test_udivrem(i32 addrspace(1)* %out0, [8 x i32], i32 addrspace(1)* %out1, [8 x i32], i32 %x, [8 x i32], i32 %y) {
; R600-LABEL: test_udivrem:
; R600:       ; %bb.0:
; R600-NEXT:    ALU 21, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:    SUB_INT T0.W, 0.0, KC0[9].X,
; R600-NEXT:    RECIP_UINT * T0.X, KC0[9].X,
; R600-NEXT:    MULLO_INT * T0.Y, PV.W, PS,
; R600-NEXT:    MULHI * T0.Y, T0.X, PS,
; R600-NEXT:    ADD_INT * T0.W, T0.X, PS,
; R600-NEXT:    MULHI * T0.X, KC0[6].W, PV.W,
; R600-NEXT:    MULLO_INT * T0.Y, PS, KC0[9].X,
; R600-NEXT:    SUB_INT * T0.W, KC0[6].W, PS,
; R600-NEXT:    SUB_INT T1.W, PV.W, KC0[9].X,
; R600-NEXT:    SETGE_UINT * T2.W, PV.W, KC0[9].X,
; R600-NEXT:    CNDE_INT * T0.W, PS, T0.W, PV.W,
; R600-NEXT:    ADD_INT T0.Z, T0.X, 1,
; R600-NEXT:    SUB_INT T1.W, PV.W, KC0[9].X,
; R600-NEXT:    SETGE_UINT * T3.W, PV.W, KC0[9].X,
; R600-NEXT:    CNDE_INT T1.X, PS, T0.W, PV.W,
; R600-NEXT:    CNDE_INT T0.W, T2.W, T0.X, PV.Z,
; R600-NEXT:    LSHR * T0.X, KC0[4].Z, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; R600-NEXT:    ADD_INT * T1.W, PV.W, 1,
; R600-NEXT:    CNDE_INT T2.X, T3.W, T0.W, PV.W,
; R600-NEXT:    LSHR * T3.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX6-LABEL: test_udivrem:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dword s3, s[0:1], 0x26
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x13
; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x1d
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_mov_b32 s10, s6
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
; GFX6-NEXT:    s_sub_i32 s2, 0, s3
; GFX6-NEXT:    s_mov_b32 s11, s7
; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT:    v_mul_lo_u32 v1, s2, v0
; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
; GFX6-NEXT:    v_mul_hi_u32 v0, s0, v0
; GFX6-NEXT:    v_mul_lo_u32 v1, v0, s3
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s0, v1
; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_waitcnt expcnt(0)
; GFX6-NEXT:    v_cndmask_b32_e64 v0, v1, v2, s[0:1]
; GFX6-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: test_udivrem:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dword s7, s[0:1], 0x98
; GFX8-NEXT:    s_load_dword s6, s[0:1], 0x74
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s7
; GFX8-NEXT:    s_sub_i32 s2, 0, s7
; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX8-NEXT:    v_mul_lo_u32 v1, s2, v0
; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x4c
; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT:    v_mul_hi_u32 v2, s6, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_mul_lo_u32 v3, v2, s7
; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s6, v3
; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s7, v3
; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s7, v3
; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s7, v3
; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s7, v3
; GFX8-NEXT:    v_mov_b32_e32 v0, s4
; GFX8-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
  %result0 = udiv i32 %x, %y
  store i32 %result0, i32 addrspace(1)* %out0
  %result1 = urem i32 %x, %y
  store i32 %result1, i32 addrspace(1)* %out1
  ret void
}

; <2 x i32> case. Both the udiv and the urem result are stored through the
; same pointer %out, so the urem store overwrites the udiv store. The
; assertions below contain a single vector store for each target.
define amdgpu_kernel void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
; R600-LABEL: test_udivrem_v2:
; R600:       ; %bb.0:
; R600-NEXT:    ALU 29, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:    SUB_INT T0.W, 0.0, KC0[3].Z,
; R600-NEXT:    RECIP_UINT * T0.X, KC0[3].Z,
; R600-NEXT:    MULLO_INT * T0.Y, PV.W, PS,
; R600-NEXT:    SUB_INT T0.W, 0.0, KC0[3].Y,
; R600-NEXT:    RECIP_UINT * T0.Z, KC0[3].Y,
; R600-NEXT:    MULLO_INT * T0.W, PV.W, PS,
; R600-NEXT:    MULHI * T0.W, T0.Z, PS,
; R600-NEXT:    ADD_INT T0.W, T0.Z, PS,
; R600-NEXT:    MULHI * T0.Y, T0.X, T0.Y,
; R600-NEXT:    ADD_INT T1.W, T0.X, PS,
; R600-NEXT:    MULHI * T0.X, KC0[2].W, PV.W,
; R600-NEXT:    MULHI * T0.Y, KC0[3].X, PV.W,
; R600-NEXT:    MULLO_INT * T0.Y, PS, KC0[3].Z,
; R600-NEXT:    SUB_INT T0.W, KC0[3].X, PS,
; R600-NEXT:    MULLO_INT * T0.X, T0.X, KC0[3].Y,
; R600-NEXT:    SUB_INT T0.Z, KC0[2].W, PS,
; R600-NEXT:    SETGE_UINT T1.W, PV.W, KC0[3].Z,
; R600-NEXT:    SUB_INT * T2.W, PV.W, KC0[3].Z,
; R600-NEXT:    CNDE_INT T1.Z, PV.W, T0.W, PS,
; R600-NEXT:    SETGE_UINT T0.W, PV.Z, KC0[3].Y,
; R600-NEXT:    SUB_INT * T1.W, PV.Z, KC0[3].Y,
; R600-NEXT:    CNDE_INT T0.Z, PV.W, T0.Z, PS,
; R600-NEXT:    SETGE_UINT T0.W, PV.Z, KC0[3].Z,
; R600-NEXT:    SUB_INT * T1.W, PV.Z, KC0[3].Z,
; R600-NEXT:    CNDE_INT T0.Y, PV.W, T1.Z, PS,
; R600-NEXT:    SETGE_UINT T0.W, PV.Z, KC0[3].Y,
; R600-NEXT:    SUB_INT * T1.W, PV.Z, KC0[3].Y,
; R600-NEXT:    CNDE_INT T0.X, PV.W, T0.Z, PS,
; R600-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX6-LABEL: test_udivrem_v2:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
; GFX6-NEXT:    s_mov_b32 s2, 0x4f7ffffe
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s6
; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s7
; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
; GFX6-NEXT:    v_mul_f32_e32 v0, s2, v0
; GFX6-NEXT:    v_mul_f32_e32 v1, s2, v1
; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
; GFX6-NEXT:    s_sub_i32 s2, 0, s6
; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v0
; GFX6-NEXT:    s_sub_i32 s2, 0, s7
; GFX6-NEXT:    v_mul_lo_u32 v3, s2, v1
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s6
; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s7
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s6, v0
; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s6, v0
; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s7, v1
; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s7, v1
; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: test_udivrem_v2:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX8-NEXT:    s_mov_b32 s3, 0x4f7ffffe
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s6
; GFX8-NEXT:    s_sub_i32 s2, 0, s6
; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s7
; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX8-NEXT:    v_rcp_iflag_f32_e32 v1, v1
; GFX8-NEXT:    v_mul_f32_e32 v0, s3, v0
; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX8-NEXT:    v_mul_f32_e32 v1, s3, v1
; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
; GFX8-NEXT:    v_mul_lo_u32 v2, s2, v0
; GFX8-NEXT:    s_sub_i32 s2, 0, s7
; GFX8-NEXT:    v_mul_hi_u32 v2, v0, v2
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
; GFX8-NEXT:    v_mul_hi_u32 v0, s4, v0
; GFX8-NEXT:    v_mul_lo_u32 v2, s2, v1
; GFX8-NEXT:    v_mul_lo_u32 v0, v0, s6
; GFX8-NEXT:    v_mul_hi_u32 v2, v1, v2
; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s4, v0
; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s6, v0
; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s6, v0
; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
; GFX8-NEXT:    v_mul_hi_u32 v1, s5, v1
; GFX8-NEXT:    v_mul_lo_u32 v1, v1, s7
; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, s5, v1
; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s7, v1
; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s7, v1
; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX8-NEXT:    v_mov_b32_e32 v3, s1
; GFX8-NEXT:    v_mov_b32_e32 v2, s0
; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT:    s_endpgm
  %result0 = udiv <2 x i32> %x, %y
  store <2 x i32> %result0, <2 x i32> addrspace(1)* %out
  %result1 = urem <2 x i32> %x, %y
  store <2 x i32> %result1, <2 x i32> addrspace(1)* %out
  ret void
}

; <4 x i32> case. As in the <2 x i32> kernel, both results are stored through
; the same pointer %out, so the urem store overwrites the udiv store. The
; assertions below contain a single vector store for each target.
define amdgpu_kernel void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
; R600-LABEL: test_udivrem_v4:
; R600:       ; %bb.0:
; R600-NEXT:    ALU 57, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T0.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:    SUB_INT T0.W, 0.0, KC0[5].X,
; R600-NEXT:    RECIP_UINT * T0.X, KC0[5].X,
; R600-NEXT:    MULLO_INT * T0.Y, PV.W, PS,
; R600-NEXT:    SUB_INT T0.W, 0.0, KC0[4].Z,
; R600-NEXT:    RECIP_UINT * T0.Z, KC0[4].Z,
; R600-NEXT:    MULLO_INT * T0.W, PV.W, PS,
; R600-NEXT:    MULHI * T0.W, T0.Z, PS,
; R600-NEXT:    ADD_INT T0.W, T0.Z, PS,
; R600-NEXT:    MULHI * T0.Y, T0.X, T0.Y,
; R600-NEXT:    ADD_INT T1.W, T0.X, PS,
; R600-NEXT:    MULHI * T0.X, KC0[3].Z, PV.W,
; R600-NEXT:    MULHI * T0.Y, KC0[4].X, PV.W,
; R600-NEXT:    MULLO_INT * T0.Y, PS, KC0[5].X,
; R600-NEXT:    RECIP_UINT * T0.Z, KC0[4].Y,
; R600-NEXT:    SUB_INT T0.W, 0.0, KC0[4].W,
; R600-NEXT:    RECIP_UINT * T1.X, KC0[4].W,
; R600-NEXT:    MULLO_INT * T0.W, PV.W, PS,
; R600-NEXT:    SUB_INT T1.W, 0.0, KC0[4].Y,
; R600-NEXT:    MULHI * T0.W, T1.X, PS,
; R600-NEXT:    ADD_INT T0.W, T1.X, PS,
; R600-NEXT:    MULLO_INT * T1.X, PV.W, T0.Z,
; R600-NEXT:    MULHI * T0.W, KC0[3].W, PV.W,
; R600-NEXT:    MULLO_INT * T0.W, PS, KC0[4].W,
; R600-NEXT:    SUB_INT T0.W, KC0[3].W, PS,
; R600-NEXT:    MULHI * T1.X, T0.Z, T1.X,
; R600-NEXT:    SETGE_UINT T1.Y, PV.W, KC0[4].W,
; R600-NEXT:    ADD_INT T0.Z, T0.Z, PS,
; R600-NEXT:    SUB_INT T1.W, KC0[4].X, T0.Y,
; R600-NEXT:    MULLO_INT * T0.X, T0.X, KC0[4].Z,
; R600-NEXT:    SUB_INT T0.Y, KC0[3].Z, PS,
; R600-NEXT:    SETGE_UINT T1.Z, PV.W, KC0[5].X,
; R600-NEXT:    SUB_INT * T2.W, PV.W, KC0[5].X,
; R600-NEXT:    MULHI * T0.X, KC0[3].Y, T0.Z,
; R600-NEXT:    SUB_INT T1.X, T0.W, KC0[4].W,
; R600-NEXT:    CNDE_INT T2.Y, T1.Z, T1.W, T2.W,
; R600-NEXT:    SETGE_UINT T0.Z, T0.Y, KC0[4].Z,
; R600-NEXT:    SUB_INT T1.W, T0.Y, KC0[4].Z,
; R600-NEXT:    MULLO_INT * T0.X, PS, KC0[4].Y,
; R600-NEXT:    CNDE_INT T2.X, PV.Z, T0.Y, PV.W,
; R600-NEXT:    SETGE_UINT T0.Y, PV.Y, KC0[5].X,
; R600-NEXT:    SUB_INT T0.Z, PV.Y, KC0[5].X,
; R600-NEXT:    SUB_INT T1.W, KC0[3].Y, PS,
; R600-NEXT:    CNDE_INT * T0.W, T1.Y, T0.W, PV.X,
; R600-NEXT:    SETGE_UINT T0.X, PS, KC0[4].W,
; R600-NEXT:    SUB_INT T1.Y, PS, KC0[4].W,
; R600-NEXT:    SETGE_UINT T1.Z, PV.W, KC0[4].Y,
; R600-NEXT:    SUB_INT T2.W, PV.W, KC0[4].Y,
; R600-NEXT:    CNDE_INT * T3.W, PV.Y, T2.Y, PV.Z,
; R600-NEXT:    CNDE_INT T0.Y, PV.Z, T1.W, PV.W,
; R600-NEXT:    CNDE_INT T3.Z, PV.X, T0.W, PV.Y, BS:VEC_021/SCL_122
; R600-NEXT:    SETGE_UINT T0.W, T2.X, KC0[4].Z,
; R600-NEXT:    SUB_INT * T1.W, T2.X, KC0[4].Z,
; R600-NEXT:    CNDE_INT T3.Y, PV.W, T2.X, PS,
; R600-NEXT:    SETGE_UINT T0.W, PV.Y, KC0[4].Y,
; R600-NEXT:    SUB_INT * T1.W, PV.Y, KC0[4].Y,
; R600-NEXT:    CNDE_INT T3.X, PV.W, T0.Y, PS,
; R600-NEXT:    LSHR * T0.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX6-LABEL: test_udivrem_v4:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
; GFX6-NEXT:    s_mov_b32 s13, 0x4f7ffffe
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
; GFX6-NEXT:    s_sub_i32 s2, 0, s8
; GFX6-NEXT:    s_sub_i32 s12, 0, s9
; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s10
; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s11
; GFX6-NEXT:    v_mul_f32_e32 v0, s13, v0
; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT:    v_mul_f32_e32 v1, s13, v1
; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v3
; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v0
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    v_mul_lo_u32 v4, s12, v1
; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
; GFX6-NEXT:    v_mul_f32_e32 v2, s13, v3
; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s8
; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s9
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v0
; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v0
; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
; GFX6-NEXT:    s_sub_i32 s4, 0, s10
; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX6-NEXT:    v_mul_lo_u32 v3, s4, v2
; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s9, v1
; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v3
; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v5
; GFX6-NEXT:    s_sub_i32 s4, 0, s11
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
; GFX6-NEXT:    v_mul_f32_e32 v3, s13, v4
; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s9, v1
; GFX6-NEXT:    v_mul_hi_u32 v2, s6, v2
; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
; GFX6-NEXT:    v_mul_lo_u32 v5, s4, v3
; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s10
; GFX6-NEXT:    v_mul_hi_u32 v4, v3, v5
; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s10, v2
; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
; GFX6-NEXT:    v_mul_hi_u32 v3, s7, v3
; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s10, v2
; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s11
; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s7, v3
; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s11, v3
; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s11, v3
; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: test_udivrem_v4:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
; GFX8-NEXT:    s_mov_b32 s12, 0x4f7ffffe
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s8
; GFX8-NEXT:    s_sub_i32 s2, 0, s8
; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s9
; GFX8-NEXT:    v_cvt_f32_u32_e32 v4, s11
; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX8-NEXT:    s_sub_i32 s3, 0, s9
; GFX8-NEXT:    v_rcp_iflag_f32_e32 v1, v1
; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, s10
; GFX8-NEXT:    v_mul_f32_e32 v0, s12, v0
; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX8-NEXT:    v_mul_f32_e32 v1, s12, v1
; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
; GFX8-NEXT:    v_rcp_iflag_f32_e32 v2, v2
; GFX8-NEXT:    v_mul_lo_u32 v3, s2, v0
; GFX8-NEXT:    s_sub_i32 s2, 0, s10
; GFX8-NEXT:    v_mul_f32_e32 v2, s12, v2
; GFX8-NEXT:    v_mul_hi_u32 v3, v0, v3
; GFX8-NEXT:    v_cvt_u32_f32_e32 v2, v2
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v3, v0
; GFX8-NEXT:    v_mul_hi_u32 v0, s4, v0
; GFX8-NEXT:    v_rcp_iflag_f32_e32 v3, v4
; GFX8-NEXT:    v_mul_lo_u32 v4, s3, v1
; GFX8-NEXT:    v_mul_lo_u32 v0, v0, s8
; GFX8-NEXT:    v_mul_f32_e32 v3, s12, v3
; GFX8-NEXT:    v_mul_hi_u32 v4, v1, v4
; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v3
; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s4, v0
; GFX8-NEXT:    v_subrev_u32_e32 v5, vcc, s8, v0
; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
; GFX8-NEXT:    v_subrev_u32_e32 v5, vcc, s8, v0
; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v4, v1
; GFX8-NEXT:    v_mul_hi_u32 v1, s5, v1
; GFX8-NEXT:    v_mul_lo_u32 v4, s2, v2
; GFX8-NEXT:    s_sub_i32 s2, 0, s11
; GFX8-NEXT:    v_mul_lo_u32 v1, v1, s9
; GFX8-NEXT:    v_mul_hi_u32 v4, v2, v4
; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, s5, v1
; GFX8-NEXT:    v_subrev_u32_e32 v5, vcc, s9, v1
; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
; GFX8-NEXT:    v_subrev_u32_e32 v5, vcc, s9, v1
; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v4, v2
; GFX8-NEXT:    v_mul_hi_u32 v2, s6, v2
; GFX8-NEXT:    v_mul_lo_u32 v4, s2, v3
; GFX8-NEXT:    v_mul_lo_u32 v2, v2, s10
; GFX8-NEXT:    v_mul_hi_u32 v4, v3, v4
; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s6, v2
; GFX8-NEXT:    v_subrev_u32_e32 v5, vcc, s10, v2
; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
; GFX8-NEXT:    v_subrev_u32_e32 v5, vcc, s10, v2
; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v4, v3
; GFX8-NEXT:    v_mul_hi_u32 v3, s7, v3
; GFX8-NEXT:    v_mul_lo_u32 v3, v3, s11
; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s7, v3
; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s11, v3
; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s11, v3
; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX8-NEXT:    v_mov_b32_e32 v5, s1
; GFX8-NEXT:    v_mov_b32_e32 v4, s0
; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT:    s_endpgm
  %result0 = udiv <4 x i32> %x, %y
  store <4 x i32> %result0, <4 x i32> addrspace(1)* %out
  %result1 = urem <4 x i32> %x, %y
  store <4 x i32> %result1, <4 x i32> addrspace(1)* %out
  ret void
}