; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s

; Unsigned division and remainder on operands of limited bit width.
;
; Every kernel loads the numerator from in[0] and the denominator from in[1],
; optionally clears the high bits of each with a shl/lshr pair, performs the
; udiv or urem, and stores the result to out[0]. The checks distinguish between
; output that contains the float-reciprocal sequence (int-to-float conversions,
; reciprocal, float-to-int conversion) and output that must not contain it.

; 8-bit udiv: both amdgcn runs must show two v_cvt_f32_ubyte, a v_rcp_iflag_f32
; and a v_cvt_u32_f32; the r600 run must show two UINT_TO_FLT, a RECIP_IEEE and
; a FLT_TO_UINT.
; FUNC-LABEL: {{^}}udiv24_i8:
; SI: v_cvt_f32_ubyte
; SI: v_cvt_f32_ubyte
; SI: v_rcp_iflag_f32
; SI: v_cvt_u32_f32

; EG: UINT_TO_FLT
; EG-DAG: UINT_TO_FLT
; EG-DAG: RECIP_IEEE
; EG: FLT_TO_UINT
define amdgpu_kernel void @udiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
  %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
  %num = load i8, i8 addrspace(1)* %in
  %den = load i8, i8 addrspace(1)* %den_ptr
  %result = udiv i8 %num, %den
  store i8 %result, i8 addrspace(1)* %out
  ret void
}

; 16-bit udiv: same reciprocal sequence as the 8-bit case, but the amdgcn
; conversions are expected to be v_cvt_f32_u32.
; FUNC-LABEL: {{^}}udiv24_i16:
; SI: v_cvt_f32_u32
; SI: v_cvt_f32_u32
; SI: v_rcp_iflag_f32
; SI: v_cvt_u32_f32

; EG: UINT_TO_FLT
; EG-DAG: UINT_TO_FLT
; EG-DAG: RECIP_IEEE
; EG: FLT_TO_UINT
define amdgpu_kernel void @udiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
  %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
  %num = load i16, i16 addrspace(1)* %in, align 2
  %den = load i16, i16 addrspace(1)* %den_ptr, align 2
  %result = udiv i16 %num, %den
  store i16 %result, i16 addrspace(1)* %out, align 2
  ret void
}

; 32-bit udiv with both operands reduced to their low 23 bits (shl 9 / lshr 9).
; The float-reciprocal sequence is expected on all runs.
; FUNC-LABEL: {{^}}udiv23_i32:
; SI: v_cvt_f32_u32
; SI-DAG: v_cvt_f32_u32
; SI-DAG: v_rcp_iflag_f32
; SI: v_cvt_u32_f32

; EG: UINT_TO_FLT
; EG-DAG: UINT_TO_FLT
; EG-DAG: RECIP_IEEE
; EG: FLT_TO_UINT
define amdgpu_kernel void @udiv23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %num = load i32, i32 addrspace(1)* %in, align 4
  %den = load i32, i32 addrspace(1)* %den_ptr, align 4
  %num.i23.0 = shl i32 %num, 9
  %den.i23.0 = shl i32 %den, 9
  %num.i23 = lshr i32 %num.i23.0, 9
  %den.i23 = lshr i32 %den.i23.0, 9
  %result = udiv i32 %num.i23, %den.i23
  store i32 %result, i32 addrspace(1)* %out, align 4
  ret void
}

; 32-bit udiv with both operands reduced to their low 24 bits (shl 8 / lshr 8).
; The amdgcn output must contain v_rcp_iflag and no v_rcp_f32 after it; the
; r600 output must not contain RECIP_IEEE.
; FUNC-LABEL: {{^}}udiv24_i32:
; SI: v_rcp_iflag
; SI-NOT: v_rcp_f32
; EG-NOT: RECIP_IEEE
define amdgpu_kernel void @udiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %num = load i32, i32 addrspace(1)* %in, align 4
  %den = load i32, i32 addrspace(1)* %den_ptr, align 4
  %num.i24.0 = shl i32 %num, 8
  %den.i24.0 = shl i32 %den, 8
  %num.i24 = lshr i32 %num.i24.0, 8
  %den.i24 = lshr i32 %den.i24.0, 8
  %result = udiv i32 %num.i24, %den.i24
  store i32 %result, i32 addrspace(1)* %out, align 4
  ret void
}

; 23-bit numerator divided by a 24-bit denominator. Same expectations as
; udiv24_i32.
; FUNC-LABEL: {{^}}no_udiv24_u23_u24_i32:
; SI: v_rcp_iflag
; SI-NOT: v_rcp_f32
; EG-NOT: RECIP_IEEE
define amdgpu_kernel void @no_udiv24_u23_u24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %num = load i32, i32 addrspace(1)* %in, align 4
  %den = load i32, i32 addrspace(1)* %den_ptr, align 4
  %num.i23.0 = shl i32 %num, 9
  %den.i24.0 = shl i32 %den, 8
  %num.i23 = lshr i32 %num.i23.0, 9
  %den.i24 = lshr i32 %den.i24.0, 8
  %result = udiv i32 %num.i23, %den.i24
  store i32 %result, i32 addrspace(1)* %out, align 4
  ret void
}

; 24-bit numerator divided by a 23-bit denominator. Same expectations as
; udiv24_i32.
; FUNC-LABEL: {{^}}no_udiv24_u24_u23_i32:
; SI: v_rcp_iflag
; SI-NOT: v_rcp_f32
; EG-NOT: RECIP_IEEE
define amdgpu_kernel void @no_udiv24_u24_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %num = load i32, i32 addrspace(1)* %in, align 4
  %den = load i32, i32 addrspace(1)* %den_ptr, align 4
  %num.i24.0 = shl i32 %num, 8
  %den.i23.0 = shl i32 %den, 9
  %num.i24 = lshr i32 %num.i24.0, 8
  %den.i23 = lshr i32 %den.i23.0, 9
  %result = udiv i32 %num.i24, %den.i23
  store i32 %result, i32 addrspace(1)* %out, align 4
  ret void
}

; 32-bit udiv with both operands reduced to their low 25 bits (shl 7 / lshr 7).
; The r600 output must contain neither UINT_TO_FLT nor RECIP_IEEE.
; FUNC-LABEL: {{^}}udiv25_i32:
; RCP_IFLAG is for URECIP in the full 32b alg
; SI: v_rcp_iflag
; SI-NOT: v_rcp_f32

; EG-NOT: UINT_TO_FLT
; EG-NOT: RECIP_IEEE
define amdgpu_kernel void @udiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %num = load i32, i32 addrspace(1)* %in, align 4
  %den = load i32, i32 addrspace(1)* %den_ptr, align 4
  %num.i25.0 = shl i32 %num, 7
  %den.i25.0 = shl i32 %den, 7
  %num.i25 = lshr i32 %num.i25.0, 7
  %den.i25 = lshr i32 %den.i25.0, 7
  %result = udiv i32 %num.i25, %den.i25
  store i32 %result, i32 addrspace(1)* %out, align 4
  ret void
}

; 24-bit numerator divided by a 25-bit denominator. Same expectations as
; udiv25_i32.
; FUNC-LABEL: {{^}}test_no_udiv24_i32_1:
; RCP_IFLAG is for URECIP in the full 32b alg
; SI: v_rcp_iflag
; SI-NOT: v_rcp_f32

; EG-NOT: UINT_TO_FLT
; EG-NOT: RECIP_IEEE
define amdgpu_kernel void @test_no_udiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %num = load i32, i32 addrspace(1)* %in, align 4
  %den = load i32, i32 addrspace(1)* %den_ptr, align 4
  %num.i24.0 = shl i32 %num, 8
  %den.i25.0 = shl i32 %den, 7
  %num.i24 = lshr i32 %num.i24.0, 8
  %den.i25 = lshr i32 %den.i25.0, 7
  %result = udiv i32 %num.i24, %den.i25
  store i32 %result, i32 addrspace(1)* %out, align 4
  ret void
}

; 25-bit numerator divided by a 24-bit denominator. Same expectations as
; udiv25_i32.
; FUNC-LABEL: {{^}}test_no_udiv24_i32_2:
; RCP_IFLAG is for URECIP in the full 32b alg
; SI: v_rcp_iflag
; SI-NOT: v_rcp_f32

; EG-NOT: UINT_TO_FLT
; EG-NOT: RECIP_IEEE
define amdgpu_kernel void @test_no_udiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %num = load i32, i32 addrspace(1)* %in, align 4
  %den = load i32, i32 addrspace(1)* %den_ptr, align 4
  %num.i25.0 = shl i32 %num, 7
  %den.i24.0 = shl i32 %den, 8
  %num.i25 = lshr i32 %num.i25.0, 7
  %den.i24 = lshr i32 %den.i24.0, 8
  %result = udiv i32 %num.i25, %den.i24
  store i32 %result, i32 addrspace(1)* %out, align 4
  ret void
}

; 8-bit urem: the same instruction sequence is expected as for udiv24_i8.
; FUNC-LABEL: {{^}}urem24_i8:
; SI: v_cvt_f32_ubyte
; SI: v_cvt_f32_ubyte
; SI: v_rcp_iflag_f32
; SI: v_cvt_u32_f32

; EG: UINT_TO_FLT
; EG-DAG: UINT_TO_FLT
; EG-DAG: RECIP_IEEE
; EG: FLT_TO_UINT
define amdgpu_kernel void @urem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
  %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
  %num = load i8, i8 addrspace(1)* %in
  %den = load i8, i8 addrspace(1)* %den_ptr
  %result = urem i8 %num, %den
  store i8 %result, i8 addrspace(1)* %out
  ret void
}

; 16-bit urem: the same instruction sequence is expected as for udiv24_i16.
; FUNC-LABEL: {{^}}urem24_i16:
; SI: v_cvt_f32_u32
; SI: v_cvt_f32_u32
; SI: v_rcp_iflag_f32
; SI: v_cvt_u32_f32

; EG: UINT_TO_FLT
; EG-DAG: UINT_TO_FLT
; EG-DAG: RECIP_IEEE
; EG: FLT_TO_UINT
define amdgpu_kernel void @urem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
  %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
  %num = load i16, i16 addrspace(1)* %in, align 2
  %den = load i16, i16 addrspace(1)* %den_ptr, align 2
  %result = urem i16 %num, %den
  store i16 %result, i16 addrspace(1)* %out, align 2
  ret void
}

; 32-bit urem with both operands reduced to their low 24 bits. Only negative
; checks are made: no v_rcp_f32 on amdgcn and no RECIP_IEEE on r600.
; FUNC-LABEL: {{^}}urem24_i32:
; SI-NOT: v_rcp_f32
; EG-NOT: RECIP_IEEE
define amdgpu_kernel void @urem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %num = load i32, i32 addrspace(1)* %in, align 4
  %den = load i32, i32 addrspace(1)* %den_ptr, align 4
  %num.i24.0 = shl i32 %num, 8
  %den.i24.0 = shl i32 %den, 8
  %num.i24 = lshr i32 %num.i24.0, 8
  %den.i24 = lshr i32 %den.i24.0, 8
  %result = urem i32 %num.i24, %den.i24
  store i32 %result, i32 addrspace(1)* %out, align 4
  ret void
}

; 32-bit urem with both operands reduced to their low 25 bits (shl 7 / lshr 7).
; The r600 output must contain neither UINT_TO_FLT nor RECIP_IEEE.
; FUNC-LABEL: {{^}}urem25_i32:
; RCP_IFLAG is for URECIP in the full 32b alg
; SI: v_rcp_iflag
; SI-NOT: v_rcp_f32

; EG-NOT: UINT_TO_FLT
; EG-NOT: RECIP_IEEE
define amdgpu_kernel void @urem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %num = load i32, i32 addrspace(1)* %in, align 4
  %den = load i32, i32 addrspace(1)* %den_ptr, align 4
  %num.i25.0 = shl i32 %num, 7
  %den.i25.0 = shl i32 %den, 7
  %num.i25 = lshr i32 %num.i25.0, 7
  %den.i25 = lshr i32 %den.i25.0, 7
  %result = urem i32 %num.i25, %den.i25
  store i32 %result, i32 addrspace(1)* %out, align 4
  ret void
}

; urem of a 24-bit numerator by a 25-bit denominator. Same expectations as
; urem25_i32.
; FUNC-LABEL: {{^}}test_no_urem24_i32_1:
; RCP_IFLAG is for URECIP in the full 32b alg
; SI: v_rcp_iflag
; SI-NOT: v_rcp_f32

; EG-NOT: UINT_TO_FLT
; EG-NOT: RECIP_IEEE
define amdgpu_kernel void @test_no_urem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %num = load i32, i32 addrspace(1)* %in, align 4
  %den = load i32, i32 addrspace(1)* %den_ptr, align 4
  %num.i24.0 = shl i32 %num, 8
  %den.i25.0 = shl i32 %den, 7
  %num.i24 = lshr i32 %num.i24.0, 8
  %den.i25 = lshr i32 %den.i25.0, 7
  %result = urem i32 %num.i24, %den.i25
  store i32 %result, i32 addrspace(1)* %out, align 4
  ret void
}

; urem of a 25-bit numerator by a 24-bit denominator. Same expectations as
; urem25_i32.
; FUNC-LABEL: {{^}}test_no_urem24_i32_2:
; RCP_IFLAG is for URECIP in the full 32b alg
; SI: v_rcp_iflag
; SI-NOT: v_rcp_f32

; EG-NOT: UINT_TO_FLT
; EG-NOT: RECIP_IEEE
define amdgpu_kernel void @test_no_urem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %num = load i32, i32 addrspace(1)* %in, align 4
  %den = load i32, i32 addrspace(1)* %den_ptr, align 4
  %num.i25.0 = shl i32 %num, 7
  %den.i24.0 = shl i32 %den, 8
  %num.i25 = lshr i32 %num.i25.0, 7
  %den.i24 = lshr i32 %den.i24.0, 8
  %result = urem i32 %num.i25, %den.i24
  store i32 %result, i32 addrspace(1)* %out, align 4
  ret void
}

; udiv of a 16-bit numerator by a 23-bit denominator. The amdgcn output must
; contain v_rcp_iflag_f32 and a v_and_b32 using a scalar register that holds
; 0x7fffff (the low-23-bit mask); the r600 output must contain RECIP_IEEE.
; FUNC-LABEL: {{^}}test_udiv24_u16_u23_i32:
; SI-DAG: v_rcp_iflag_f32
; SI-DAG: s_mov_b32 [[MASK:s[0-9]+]], 0x7fffff{{$}}
; SI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]],

; EG: RECIP_IEEE
define amdgpu_kernel void @test_udiv24_u16_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %num = load i32, i32 addrspace(1)* %in, align 4
  %den = load i32, i32 addrspace(1)* %den_ptr, align 4
  %num.i16.0 = shl i32 %num, 16
  %den.i23.0 = shl i32 %den, 9
  %num.i16 = lshr i32 %num.i16.0, 16
  %den.i23 = lshr i32 %den.i23.0, 9
  %result = udiv i32 %num.i16, %den.i23
  store i32 %result, i32 addrspace(1)* %out, align 4
  ret void
}

; udiv of a 23-bit numerator by a 16-bit denominator. Same expectations as
; test_udiv24_u16_u23_i32.
; FUNC-LABEL: {{^}}test_udiv24_u23_u16_i32:
; SI-DAG: v_rcp_iflag_f32
; SI-DAG: s_mov_b32 [[MASK:s[0-9]+]], 0x7fffff{{$}}
; SI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]],

; EG: RECIP_IEEE
define amdgpu_kernel void @test_udiv24_u23_u16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %num = load i32, i32 addrspace(1)* %in, align 4
  %den = load i32, i32 addrspace(1)* %den_ptr, align 4
  %num.i23.0 = shl i32 %num, 9
  %den.i16.0 = shl i32 %den, 16
  %num.i23 = lshr i32 %num.i23.0, 9
  %den.i16 = lshr i32 %den.i16.0, 16
  %result = udiv i32 %num.i23, %den.i16
  store i32 %result, i32 addrspace(1)* %out, align 4
  ret void
}