; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s

; FUNC-LABEL: {{^}}s_uaddo_i64_zext:
; GCN: s_add_u32
; GCN: s_addc_u32
; GCN: v_cmp_lt_u64_e32 vcc

; EG: ADDC_UINT
; EG: ADDC_UINT
define amdgpu_kernel void @s_uaddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
  %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
  %val = extractvalue { i64, i1 } %uadd, 0
  %carry = extractvalue { i64, i1 } %uadd, 1
  %ext = zext i1 %carry to i64
  %add2 = add i64 %val, %ext
  store i64 %add2, i64 addrspace(1)* %out, align 8
  ret void
}

; FIXME: Could do scalar

; FUNC-LABEL: {{^}}s_uaddo_i32:
; SI: v_add_i32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}
; VI: v_add_u32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}
; GFX9: v_add_co_u32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}

; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc

; EG: ADDC_UINT
; EG: ADD_INT
define amdgpu_kernel void @s_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) #0 {
  %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
  %val = extractvalue { i32, i1 } %uadd, 0
  %carry = extractvalue { i32, i1 } %uadd, 1
  store i32 %val, i32 addrspace(1)* %out, align 4
  store i1 %carry, i1 addrspace(1)* %carryout
  ret void
}

; FUNC-LABEL: {{^}}v_uaddo_i32:
; SI: v_add_i32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_add_u32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}
; GFX9: v_add_co_u32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}

; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc

; EG: ADDC_UINT
; EG: ADD_INT
define amdgpu_kernel void @v_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i64 %tid.ext
  %a = load i32, i32 addrspace(1)* %a.gep, align 4
  %b = load i32, i32 addrspace(1)* %b.gep, align 4
  %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
  %val = extractvalue { i32, i1 } %uadd, 0
  %carry = extractvalue { i32, i1 } %uadd, 1
  store i32 %val, i32 addrspace(1)* %out, align 4
  store i1 %carry, i1 addrspace(1)* %carryout
  ret void
}
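
; The inline asm in @v_uaddo_i32_novcc below clobbers VCC between producing
; and consuming the carry, so the carry bit must be kept live outside VCC.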
; FUNC-LABEL: {{^}}v_uaddo_i32_novcc:
; SI: v_add_i32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_add_u32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}
; GFX9: v_add_co_u32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}

; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc

; EG: ADDC_UINT
; EG: ADD_INT
define amdgpu_kernel void @v_uaddo_i32_novcc(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i64 %tid.ext
  %a = load i32, i32 addrspace(1)* %a.gep, align 4
  %b = load i32, i32 addrspace(1)* %b.gep, align 4
  %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
  %val = extractvalue { i32, i1 } %uadd, 0
  %carry = extractvalue { i32, i1 } %uadd, 1
  store volatile i32 %val, i32 addrspace(1)* %out, align 4
  call void asm sideeffect "", "~{vcc}"() #0
  store volatile i1 %carry, i1 addrspace(1)* %carryout
  ret void
}

; FUNC-LABEL: {{^}}s_uaddo_i64:
; GCN: s_add_u32
; GCN: s_addc_u32

; EG: ADDC_UINT
; EG: ADD_INT
define amdgpu_kernel void @s_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) #0 {
  %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
  %val = extractvalue { i64, i1 } %uadd, 0
  %carry = extractvalue { i64, i1 } %uadd, 1
  store i64 %val, i64 addrspace(1)* %out, align 8
  store i1 %carry, i1 addrspace(1)* %carryout
  ret void
}

; FUNC-LABEL: {{^}}v_uaddo_i64:
; SI: v_add_i32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}
; SI: v_addc_u32_e32 v{{[0-9]+}}, vcc,

; VI: v_add_u32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_addc_u32_e32 v{{[0-9]+}}, vcc,

; GFX9: v_add_co_u32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}
; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc,

; EG: ADDC_UINT
; EG: ADD_INT
define amdgpu_kernel void @v_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %a.ptr, i64 addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds i64, i64 addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds i64, i64 addrspace(1)* %b.ptr, i64 %tid.ext
  %a = load i64, i64 addrspace(1)* %a.gep
  %b = load i64, i64 addrspace(1)* %b.gep
  %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
  %val = extractvalue { i64, i1 } %uadd, 0
  %carry = extractvalue { i64, i1 } %uadd, 1
  store i64 %val, i64 addrspace(1)* %out
  store i1 %carry, i1 addrspace(1)* %carryout
  ret void
}

; FUNC-LABEL: {{^}}v_uaddo_i16:
; VI: v_add_u16_e32
; VI: v_cmp_lt_u16_e32

; GFX9: v_add_u16_e32
; GFX9: v_cmp_lt_u16_e32
define amdgpu_kernel void @v_uaddo_i16(i16 addrspace(1)* %out, i1 addrspace(1)* %carryout, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds i16, i16 addrspace(1)* %b.ptr, i64 %tid.ext
  %a = load i16, i16 addrspace(1)* %a.gep
  %b = load i16, i16 addrspace(1)* %b.gep
  %uadd = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 %a, i16 %b)
  %val = extractvalue { i16, i1 } %uadd, 0
  %carry = extractvalue { i16, i1 } %uadd, 1
  store i16 %val, i16 addrspace(1)* %out
  store i1 %carry, i1 addrspace(1)* %carryout
  ret void
}
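
; Vector uaddo has no single-instruction lowering; it is expected to be
; scalarized into a per-element add plus carry compare.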
; FUNC-LABEL: {{^}}v_uaddo_v2i32:
; SICIVI: v_cmp_lt_i32
; SICIVI: v_cmp_lt_i32
; SICIVI: v_add_{{[iu]}}32
; SICIVI: v_cmp_lt_i32
; SICIVI: v_cmp_lt_i32
; SICIVI: v_add_{{[iu]}}32
define amdgpu_kernel void @v_uaddo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %carryout, <2 x i32> addrspace(1)* %aptr, <2 x i32> addrspace(1)* %bptr) nounwind {
  %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4
  %b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4
  %sadd = call { <2 x i32>, <2 x i1> } @llvm.uadd.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind
  %val = extractvalue { <2 x i32>, <2 x i1> } %sadd, 0
  %carry = extractvalue { <2 x i32>, <2 x i1> } %sadd, 1
  store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4
  %carry.ext = zext <2 x i1> %carry to <2 x i32>
  store <2 x i32> %carry.ext, <2 x i32> addrspace(1)* %carryout
  ret void
}

; FUNC-LABEL: {{^}}s_uaddo_clamp_bit:
; GCN: v_add_{{i|u|co_u}}32_e32
; GCN: s_endpgm
define amdgpu_kernel void @s_uaddo_clamp_bit(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) #0 {
entry:
  %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
  %val = extractvalue { i32, i1 } %uadd, 0
  %carry = extractvalue { i32, i1 } %uadd, 1
  %c2 = icmp eq i1 %carry, false
  %cc = icmp eq i32 %a, %b
  br i1 %cc, label %exit, label %if

if:
  br label %exit

exit:
  %cout = phi i1 [false, %entry], [%c2, %if]
  store i32 %val, i32 addrspace(1)* %out, align 4
  store i1 %cout, i1 addrspace(1)* %carryout
  ret void
}

; FUNC-LABEL: {{^}}v_uaddo_clamp_bit:
; GCN: v_add_{{i|u|co_u}}32_e64
; GCN: s_endpgm
define amdgpu_kernel void @v_uaddo_clamp_bit(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i64 %tid.ext
  %a = load i32, i32 addrspace(1)* %a.gep
  %b = load i32, i32 addrspace(1)* %b.gep
  %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
  %val = extractvalue { i32, i1 } %uadd, 0
  %carry = extractvalue { i32, i1 } %uadd, 1
  %c2 = icmp eq i1 %carry, false
  %cc = icmp eq i32 %a, %b
  br i1 %cc, label %exit, label %if

if:
  br label %exit

exit:
  %cout = phi i1 [false, %entry], [%c2, %if]
  store i32 %val, i32 addrspace(1)* %out, align 4
  store i1 %cout, i1 addrspace(1)* %carryout
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1
declare { i16, i1 } @llvm.uadd.with.overflow.i16(i16, i16) #1
declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) #1
declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) #1
declare { <2 x i32>, <2 x i1> } @llvm.uadd.with.overflow.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }