; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s
; RUN: llc -march=r600 -mtriple=r600-- -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s

; Checks that icmp {slt,sle,ult,ule} + select pairs are selected as integer
; min instructions: v_min_{i,u}*/s_min_{i,u}32 (and packed/sdwa forms) on the
; AMDGCN targets, MIN_INT/MIN_UINT on R600.  Functions prefixed v_* load their
; operands per-lane (VALU min expected); s_* take kernel arguments (SALU min
; expected).

; FUNC-LABEL: {{^}}v_test_imin_sle_i32:
; GCN: v_min_i32_e32

; EG: MIN_INT
define amdgpu_kernel void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.r600.read.tidig.x()
  %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i32 %tid
  %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i32 %tid
  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
  %a = load i32, i32 addrspace(1)* %a.gep, align 4
  %b = load i32, i32 addrspace(1)* %b.gep, align 4
  %cmp = icmp sle i32 %a, %b
  %val = select i1 %cmp, i32 %a, i32 %b
  store i32 %val, i32 addrspace(1)* %out.gep, align 4
  ret void
}

; FUNC-LABEL: {{^}}s_test_imin_sle_i32:
; GCN: s_min_i32

; EG: MIN_INT
define amdgpu_kernel void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
  %cmp = icmp sle i32 %a, %b
  %val = select i1 %cmp, i32 %a, i32 %b
  store i32 %val, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}s_test_imin_sle_v1i32:
; GCN: s_min_i32

; EG: MIN_INT
define amdgpu_kernel void @s_test_imin_sle_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) #0 {
  %cmp = icmp sle <1 x i32> %a, %b
  %val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b
  store <1 x i32> %val, <1 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}s_test_imin_sle_v4i32:
; GCN: s_min_i32
; GCN: s_min_i32
; GCN: s_min_i32
; GCN: s_min_i32

; EG: MIN_INT
; EG: MIN_INT
; EG: MIN_INT
; EG: MIN_INT
define amdgpu_kernel void @s_test_imin_sle_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) #0 {
  %cmp = icmp sle <4 x i32> %a, %b
  %val = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b
  store <4 x i32> %val, <4 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}s_test_imin_sle_i8:
; GCN: s_load_dword
; GCN: s_load_dword
; GCN: s_sext_i32_i8
; GCN: s_sext_i32_i8
; GCN: s_min_i32
define amdgpu_kernel void @s_test_imin_sle_i8(i8 addrspace(1)* %out, [8 x i32], i8 %a, [8 x i32], i8 %b) #0 {
  %cmp = icmp sle i8 %a, %b
  %val = select i1 %cmp, i8 %a, i8 %b
  store i8 %val, i8 addrspace(1)* %out
  ret void
}

; FIXME: Why vector and sdwa for last element?
; FUNC-LABEL: {{^}}s_test_imin_sle_v4i8:
; GCN: s_load_dword s
; GCN: s_load_dword s
; GCN-NOT: _load_

; SI: s_min_i32
; SI: s_min_i32
; SI: s_min_i32
; SI: s_min_i32

; VI: s_min_i32
; VI: s_min_i32
; VI: s_min_i32
; VI: v_min_i32_sdwa

; GFX9: v_min_i16
; GFX9: v_min_i16
; GFX9: v_min_i16
; GFX9: v_min_i16

; EG: MIN_INT
; EG: MIN_INT
; EG: MIN_INT
; EG: MIN_INT
define amdgpu_kernel void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, [8 x i32], <4 x i8> %a, [8 x i32], <4 x i8> %b) #0 {
  %cmp = icmp sle <4 x i8> %a, %b
  %val = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
  store <4 x i8> %val, <4 x i8> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}s_test_imin_sle_v2i16:
; GCN: s_load_dword s
; GCN: s_load_dword s

; SI: s_ashr_i32
; SI: s_sext_i32_i16
; SI: s_ashr_i32
; SI: s_sext_i32_i16
; SI: s_min_i32
; SI: s_min_i32

; VI: s_sext_i32_i16
; VI: s_sext_i32_i16
; VI: s_min_i32
; VI: s_min_i32

; GFX9: v_pk_min_i16

; EG: MIN_INT
; EG: MIN_INT
define amdgpu_kernel void @s_test_imin_sle_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #0 {
  %cmp = icmp sle <2 x i16> %a, %b
  %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
  store <2 x i16> %val, <2 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}s_test_imin_sle_v4i16:
; SI-NOT: buffer_load
; SI: s_min_i32
; SI: s_min_i32
; SI: s_min_i32
; SI: s_min_i32

; VI: s_min_i32
; VI: s_min_i32
; VI: s_min_i32
; VI: s_min_i32

; GFX9: v_pk_min_i16
; GFX9: v_pk_min_i16

; EG: MIN_INT
; EG: MIN_INT
; EG: MIN_INT
; EG: MIN_INT
define amdgpu_kernel void @s_test_imin_sle_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) #0 {
  %cmp = icmp sle <4 x i16> %a, %b
  %val = select <4 x i1> %cmp, <4 x i16> %a, <4 x i16> %b
  store <4 x i16> %val, <4 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: @v_test_imin_slt_i32
; GCN: v_min_i32_e32

; EG: MIN_INT
define amdgpu_kernel void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) #0 {
  %tid = call i32 @llvm.r600.read.tidig.x()
  %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %aptr, i32 %tid
  %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %bptr, i32 %tid
  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
  %a = load i32, i32 addrspace(1)* %a.gep, align 4
  %b = load i32, i32 addrspace(1)* %b.gep, align 4
  %cmp = icmp slt i32 %a, %b
  %val = select i1 %cmp, i32 %a, i32 %b
  store i32 %val, i32 addrspace(1)* %out.gep, align 4
  ret void
}

; FUNC-LABEL: @v_test_imin_slt_i16
; SI: v_min_i32_e32

; GFX89: v_min_i16_e32

; EG: MIN_INT
define amdgpu_kernel void @v_test_imin_slt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) #0 {
  %tid = call i32 @llvm.r600.read.tidig.x()
  %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %aptr, i32 %tid
  %b.gep = getelementptr inbounds i16, i16 addrspace(1)* %bptr, i32 %tid
  %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid

  %a = load i16, i16 addrspace(1)* %a.gep
  %b = load i16, i16 addrspace(1)* %b.gep
  %cmp = icmp slt i16 %a, %b
  %val = select i1 %cmp, i16 %a, i16 %b
  store i16 %val, i16 addrspace(1)* %out.gep
  ret void
}

; FUNC-LABEL: @s_test_imin_slt_i32
; GCN: s_min_i32

; EG: MIN_INT
define amdgpu_kernel void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
  %cmp = icmp slt i32 %a, %b
  %val = select i1 %cmp, i32 %a, i32 %b
  store i32 %val, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}s_test_imin_slt_v2i32:
; GCN: s_min_i32
; GCN: s_min_i32

; EG: MIN_INT
; EG: MIN_INT
define amdgpu_kernel void @s_test_imin_slt_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 {
  %cmp = icmp slt <2 x i32> %a, %b
  %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %b
  store <2 x i32> %val, <2 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}s_test_imin_slt_imm_i32:
; GCN: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8

; EG: MIN_INT {{.*}}literal.{{[xyzw]}}
define amdgpu_kernel void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) #0 {
  %cmp = icmp slt i32 %a, 8
  %val = select i1 %cmp, i32 %a, i32 8
  store i32 %val, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}s_test_imin_sle_imm_i32:
; GCN: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8

; EG: MIN_INT {{.*}}literal.{{[xyzw]}}
define amdgpu_kernel void @s_test_imin_sle_imm_i32(i32 addrspace(1)* %out, i32 %a) #0 {
  %cmp = icmp sle i32 %a, 8
  %val = select i1 %cmp, i32 %a, i32 8
  store i32 %val, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: @v_test_umin_ule_i32
; GCN: v_min_u32_e32

; EG: MIN_UINT
define amdgpu_kernel void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.r600.read.tidig.x()
  %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i32 %tid
  %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i32 %tid
  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
  %a = load i32, i32 addrspace(1)* %a.gep, align 4
  %b = load i32, i32 addrspace(1)* %b.gep, align 4
  %cmp = icmp ule i32 %a, %b
  %val = select i1 %cmp, i32 %a, i32 %b
  store i32 %val, i32 addrspace(1)* %out.gep, align 4
  ret void
}

; FUNC-LABEL: @v_test_umin_ule_v3i32
; GCN: v_min_u32_e32
; GCN: v_min_u32_e32
; GCN: v_min_u32_e32
; GCN-NOT: v_min_u32_e32
; GCN: s_endpgm

; EG: MIN_UINT
; EG: MIN_UINT
; EG: MIN_UINT
define amdgpu_kernel void @v_test_umin_ule_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %a.ptr, <3 x i32> addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.r600.read.tidig.x()
  %a.gep = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %a.ptr, i32 %tid
  %b.gep = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %b.ptr, i32 %tid
  %out.gep = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %out, i32 %tid

  %a = load <3 x i32>, <3 x i32> addrspace(1)* %a.gep
  %b = load <3 x i32>, <3 x i32> addrspace(1)* %b.gep
  %cmp = icmp ule <3 x i32> %a, %b
  %val = select <3 x i1> %cmp, <3 x i32> %a, <3 x i32> %b
  store <3 x i32> %val, <3 x i32> addrspace(1)* %out.gep
  ret void
}

; FIXME: Reduce unused packed component to scalar
; FUNC-LABEL: @v_test_umin_ule_v3i16{{$}}
; SI: v_min_u32_e32
; SI: v_min_u32_e32
; SI: v_min_u32_e32
; SI-NOT: v_min_u32_e32

; VI: v_min_u16_e32
; VI: v_min_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI: v_min_u16_e32
; VI-NOT: v_min_u16

; GFX9: v_pk_min_u16
; GFX9: v_pk_min_u16

; GCN: s_endpgm

; EG: MIN_UINT
; EG: MIN_UINT
; EG: MIN_UINT
define amdgpu_kernel void @v_test_umin_ule_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %a.ptr, <3 x i16> addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.r600.read.tidig.x()
  %a.gep = getelementptr inbounds <3 x i16>, <3 x i16> addrspace(1)* %a.ptr, i32 %tid
  %b.gep = getelementptr inbounds <3 x i16>, <3 x i16> addrspace(1)* %b.ptr, i32 %tid
  %out.gep = getelementptr inbounds <3 x i16>, <3 x i16> addrspace(1)* %out, i32 %tid

  %a = load <3 x i16>, <3 x i16> addrspace(1)* %a.gep
  %b = load <3 x i16>, <3 x i16> addrspace(1)* %b.gep
  %cmp = icmp ule <3 x i16> %a, %b
  %val = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
  store <3 x i16> %val, <3 x i16> addrspace(1)* %out.gep
  ret void
}

; FUNC-LABEL: @s_test_umin_ule_i32
; GCN: s_min_u32

; EG: MIN_UINT
define amdgpu_kernel void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
  %cmp = icmp ule i32 %a, %b
  %val = select i1 %cmp, i32 %a, i32 %b
  store i32 %val, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: @v_test_umin_ult_i32
; GCN: v_min_u32_e32

; EG: MIN_UINT
define amdgpu_kernel void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.r600.read.tidig.x()
  %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i32 %tid
  %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i32 %tid
  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
  %a = load i32, i32 addrspace(1)* %a.gep, align 4
  %b = load i32, i32 addrspace(1)* %b.gep, align 4
  %cmp = icmp ult i32 %a, %b
  %val = select i1 %cmp, i32 %a, i32 %b
  store i32 %val, i32 addrspace(1)* %out.gep, align 4
  ret void
}

; FUNC-LABEL: {{^}}v_test_umin_ult_i8:
; SI: {{buffer|flat|global}}_load_ubyte
; SI: {{buffer|flat|global}}_load_ubyte
; SI: v_min_u32_e32

; GFX89: {{flat|global}}_load_ubyte
; GFX89: {{flat|global}}_load_ubyte
; GFX89: v_min_u16_e32

; EG: MIN_UINT
define amdgpu_kernel void @v_test_umin_ult_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %a.ptr, i8 addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.r600.read.tidig.x()
  %a.gep = getelementptr inbounds i8, i8 addrspace(1)* %a.ptr, i32 %tid
  %b.gep = getelementptr inbounds i8, i8 addrspace(1)* %b.ptr, i32 %tid
  %out.gep = getelementptr inbounds i8, i8 addrspace(1)* %out, i32 %tid

  %a = load i8, i8 addrspace(1)* %a.gep, align 1
  %b = load i8, i8 addrspace(1)* %b.gep, align 1
  %cmp = icmp ult i8 %a, %b
  %val = select i1 %cmp, i8 %a, i8 %b
  store i8 %val, i8 addrspace(1)* %out.gep, align 1
  ret void
}

; FUNC-LABEL: @s_test_umin_ult_i32
; GCN: s_min_u32

; EG: MIN_UINT
define amdgpu_kernel void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
  %cmp = icmp ult i32 %a, %b
  %val = select i1 %cmp, i32 %a, i32 %b
  store i32 %val, i32 addrspace(1)* %out, align 4
  ret void
}

; The compare result has a second user, so the combine must not form a min.
; FUNC-LABEL: @v_test_umin_ult_i32_multi_use
; SI-NOT: v_min
; GCN: v_cmp_lt_u32
; SI-NEXT: v_cndmask_b32
; SI-NOT: v_min
; GCN: s_endpgm

; EG-NOT: MIN_UINT
define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0, i1 addrspace(1)* %out1, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) #0 {
  %a = load i32, i32 addrspace(1)* %aptr, align 4
  %b = load i32, i32 addrspace(1)* %bptr, align 4
  %cmp = icmp ult i32 %a, %b
  %val = select i1 %cmp, i32 %a, i32 %b
  store i32 %val, i32 addrspace(1)* %out0, align 4
  store i1 %cmp, i1 addrspace(1)* %out1
  ret void
}

; FUNC-LABEL: @v_test_umin_ult_i16_multi_use
; GCN-NOT: v_min
; GCN: v_cmp_lt_u32
; GCN-NEXT: v_cndmask_b32
; GCN-NOT: v_min
; GCN: s_endpgm

; EG-NOT: MIN_UINT
define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(i16 addrspace(1)* %out0, i1 addrspace(1)* %out1, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) #0 {
  %a = load i16, i16 addrspace(1)* %aptr, align 2
  %b = load i16, i16 addrspace(1)* %bptr, align 2
  %cmp = icmp ult i16 %a, %b
  %val = select i1 %cmp, i16 %a, i16 %b
  store i16 %val, i16 addrspace(1)* %out0, align 2
  store i1 %cmp, i1 addrspace(1)* %out1
  ret void
}


; FUNC-LABEL: @s_test_umin_ult_v1i32
; GCN: s_min_u32

; EG: MIN_UINT
define amdgpu_kernel void @s_test_umin_ult_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) #0 {
  %cmp = icmp ult <1 x i32> %a, %b
  %val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b
  store <1 x i32> %val, <1 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}s_test_umin_ult_v8i32:
; GCN: s_min_u32
; GCN: s_min_u32
; GCN: s_min_u32
; GCN: s_min_u32
; GCN: s_min_u32
; GCN: s_min_u32
; GCN: s_min_u32
; GCN: s_min_u32

; EG: MIN_UINT
; EG: MIN_UINT
; EG: MIN_UINT
; EG: MIN_UINT
; EG: MIN_UINT
; EG: MIN_UINT
; EG: MIN_UINT
; EG: MIN_UINT
define amdgpu_kernel void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) #0 {
  %cmp = icmp ult <8 x i32> %a, %b
  %val = select <8 x i1> %cmp, <8 x i32> %a, <8 x i32> %b
  store <8 x i32> %val, <8 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}s_test_umin_ult_v8i16:
; GCN-NOT: {{buffer|flat|global}}_load
; SI: s_min_u32
; SI: s_min_u32
; SI: s_min_u32
; SI: s_min_u32
; SI: s_min_u32
; SI: s_min_u32
; SI: s_min_u32
; SI: s_min_u32

; VI: s_min_u32
; VI: s_min_u32
; VI: s_min_u32
; VI: s_min_u32
; VI: s_min_u32
; VI: s_min_u32
; VI: s_min_u32
; VI: s_min_u32

; EG: MIN_UINT
; EG: MIN_UINT
; EG: MIN_UINT
; EG: MIN_UINT
; EG: MIN_UINT
; EG: MIN_UINT
; EG: MIN_UINT
; EG: MIN_UINT
define amdgpu_kernel void @s_test_umin_ult_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) #0 {
  %cmp = icmp ult <8 x i16> %a, %b
  %val = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b
  store <8 x i16> %val, <8 x i16> addrspace(1)* %out
  ret void
}

; Make sure redundant and removed
; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umin_ult_i16:
; GCN-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xa|0x28}}
; GCN-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
; GCN: s_min_u32 [[MIN:s[0-9]+]], [[A]], [[B]]
; GCN: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]]
; GCN: buffer_store_dword [[VMIN]]

; EG: MIN_UINT
define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, [8 x i32], i16 zeroext %a, [8 x i32], i16 zeroext %b) #0 {
  %a.ext = zext i16 %a to i32
  %b.ext = zext i16 %b to i32
  %cmp = icmp ult i32 %a.ext, %b.ext
  %val = select i1 %cmp, i32 %a.ext, i32 %b.ext
  %mask = and i32 %val, 65535
  store i32 %mask, i32 addrspace(1)* %out
  ret void
}

; Make sure redundant sign_extend_inreg removed.

; FUNC-LABEL: {{^}}simplify_demanded_bits_test_min_slt_i16:
; GCN-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xa|0x28}}
; GCN-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
; GCN-DAG: s_sext_i32_i16 [[EXT_A:s[0-9]+]], [[A]]
; GCN-DAG: s_sext_i32_i16 [[EXT_B:s[0-9]+]], [[B]]

; GCN: s_min_i32 [[MIN:s[0-9]+]], [[EXT_A]], [[EXT_B]]
; GCN: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]]
; GCN: buffer_store_dword [[VMIN]]

; EG: MIN_INT
define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, [8 x i32], i16 signext %a, [8 x i32], i16 signext %b) #0 {
  %a.ext = sext i16 %a to i32
  %b.ext = sext i16 %b to i32
  %cmp = icmp slt i32 %a.ext, %b.ext
  %val = select i1 %cmp, i32 %a.ext, i32 %b.ext
  %shl = shl i32 %val, 16
  %sextinreg = ashr i32 %shl, 16
  store i32 %sextinreg, i32 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}s_test_imin_sle_i16:
; GCN: s_min_i32

; EG: MIN_INT
define amdgpu_kernel void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) #0 {
  %cmp = icmp sle i16 %a, %b
  %val = select i1 %cmp, i16 %a, i16 %b
  store i16 %val, i16 addrspace(1)* %out
  ret void
}

; 64 bit
; FUNC-LABEL: {{^}}test_umin_ult_i64
; GCN: s_endpgm

; EG: MIN_UINT
; EG: MIN_UINT
define amdgpu_kernel void @test_umin_ult_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
  %tmp = icmp ult i64 %a, %b
  %val = select i1 %tmp, i64 %a, i64 %b
  store i64 %val, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}test_umin_ule_i64
; GCN: s_endpgm

; EG: MIN_UINT
; EG: MIN_UINT
define amdgpu_kernel void @test_umin_ule_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
  %tmp = icmp ule i64 %a, %b
  %val = select i1 %tmp, i64 %a, i64 %b
  store i64 %val, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}test_imin_slt_i64
; GCN: s_endpgm

; EG-DAG: MIN_UINT
; EG-DAG: MIN_INT
define amdgpu_kernel void @test_imin_slt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
  %tmp = icmp slt i64 %a, %b
  %val = select i1 %tmp, i64 %a, i64 %b
  store i64 %val, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}test_imin_sle_i64
; GCN: s_endpgm

; EG-DAG: MIN_UINT
; EG-DAG: MIN_INT
define amdgpu_kernel void @test_imin_sle_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
  %tmp = icmp sle i64 %a, %b
  %val = select i1 %tmp, i64 %a, i64 %b
  store i64 %val, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}v_test_imin_sle_v2i16:
; SI: v_min_i32
; SI: v_min_i32

; VI: v_min_i16
; VI: v_min_i16

; GFX9: v_pk_min_i16

; EG: MIN_INT
; EG: MIN_INT
define amdgpu_kernel void @v_test_imin_sle_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.r600.read.tidig.x()
  %a.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %a.ptr, i32 %tid
  %b.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %b.ptr, i32 %tid
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %a.gep
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %b.gep
  %cmp = icmp sle <2 x i16> %a, %b
  %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
  store <2 x i16> %val, <2 x i16> addrspace(1)* %out.gep
  ret void
}

; FIXME: i16 min
; FUNC-LABEL: {{^}}v_test_imin_ule_v2i16:
; SI: v_min_u32
; SI: v_min_u32

; VI: v_min_u16
; VI: v_min_u16

; GFX9: v_pk_min_u16

; EG: MIN_UINT
; EG: MIN_UINT
define amdgpu_kernel void @v_test_imin_ule_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.r600.read.tidig.x()
  %a.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %a.ptr, i32 %tid
  %b.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %b.ptr, i32 %tid
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %a.gep
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %b.gep
  %cmp = icmp ule <2 x i16> %a, %b
  %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
  store <2 x i16> %val, <2 x i16> addrspace(1)* %out.gep
  ret void
}

declare i32 @llvm.r600.read.tidig.x() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }