; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
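
; Tests codegen for the llvm.uadd.with.overflow intrinsics at i16, i32,
; i64, and <2 x i32>, for both uniform (s_*) and divergent (v_*) operands,
; on SI, VI, and GFX9.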

; FUNC-LABEL: {{^}}s_uaddo_i64_zext:
; GCN: s_add_u32
; GCN: s_addc_u32
; GCN: v_cmp_lt_u64_e32 vcc

; EG: ADDC_UINT
; EG: ADDC_UINT
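; The zero-extended carry is added back into the sum, so the carry can be
; recovered with an unsigned compare (the add overflowed iff sum u< a),
; hence the v_cmp_lt_u64 expected above.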
define amdgpu_kernel void @s_uaddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
  %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
  %val = extractvalue { i64, i1 } %uadd, 0
  %carry = extractvalue { i64, i1 } %uadd, 1
  %ext = zext i1 %carry to i64
  %add2 = add i64 %val, %ext
  store i64 %add2, i64 addrspace(1)* %out, align 8
  ret void
}

; FIXME: Could do this with scalar instructions, since the operands are uniform.

; FUNC-LABEL: {{^}}s_uaddo_i32:
; SI: v_add_i32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}
; VI: v_add_u32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}
; GFX9: v_add_co_u32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}

; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc

; EG: ADDC_UINT
; EG: ADD_INT
define amdgpu_kernel void @s_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) #0 {
  %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
  %val = extractvalue { i32, i1 } %uadd, 0
  %carry = extractvalue { i32, i1 } %uadd, 1
  store i32 %val, i32 addrspace(1)* %out, align 4
  store i1 %carry, i1 addrspace(1)* %carryout
  ret void
}

; FUNC-LABEL: {{^}}v_uaddo_i32:
; SI: v_add_i32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_add_u32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}
; GFX9: v_add_co_u32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}

; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc

; EG: ADDC_UINT
; EG: ADD_INT
define amdgpu_kernel void @v_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i64 %tid.ext
  %a = load i32, i32 addrspace(1)* %a.gep, align 4
  %b = load i32, i32 addrspace(1)* %b.gep, align 4
  %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
  %val = extractvalue { i32, i1 } %uadd, 0
  %carry = extractvalue { i32, i1 } %uadd, 1
  store i32 %val, i32 addrspace(1)* %out, align 4
  store i1 %carry, i1 addrspace(1)* %carryout
  ret void
}

; FUNC-LABEL: {{^}}v_uaddo_i32_novcc:
; SI: v_add_i32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_add_u32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}
; GFX9: v_add_co_u32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}

; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc

; EG: ADDC_UINT
; EG: ADD_INT
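; The inline asm below clobbers VCC between the add and the store of the
; carry, so the carry bit must be copied out of VCC before the clobber.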
define amdgpu_kernel void @v_uaddo_i32_novcc(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i64 %tid.ext
  %a = load i32, i32 addrspace(1)* %a.gep, align 4
  %b = load i32, i32 addrspace(1)* %b.gep, align 4
  %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
  %val = extractvalue { i32, i1 } %uadd, 0
  %carry = extractvalue { i32, i1 } %uadd, 1
  store volatile i32 %val, i32 addrspace(1)* %out, align 4
  call void asm sideeffect "", "~{vcc}"() #0
  store volatile i1 %carry, i1 addrspace(1)* %carryout
  ret void
}

; FUNC-LABEL: {{^}}s_uaddo_i64:
; GCN: s_add_u32
; GCN: s_addc_u32

; EG: ADDC_UINT
; EG: ADD_INT
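; A uniform 64-bit add splits into s_add_u32 on the low half and s_addc_u32
; on the high half, which consumes the carry from SCC.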
define amdgpu_kernel void @s_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) #0 {
  %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
  %val = extractvalue { i64, i1 } %uadd, 0
  %carry = extractvalue { i64, i1 } %uadd, 1
  store i64 %val, i64 addrspace(1)* %out, align 8
  store i1 %carry, i1 addrspace(1)* %carryout
  ret void
}

; FUNC-LABEL: {{^}}v_uaddo_i64:
; SI: v_add_i32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}
; SI: v_addc_u32_e32 v{{[0-9]+}}, vcc,

; VI: v_add_u32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_addc_u32_e32 v{{[0-9]+}}, vcc,

; GFX9: v_add_co_u32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}
; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc,

; EG: ADDC_UINT
; EG: ADD_INT
define amdgpu_kernel void @v_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %a.ptr, i64 addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds i64, i64 addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds i64, i64 addrspace(1)* %b.ptr, i64 %tid.ext
  %a = load i64, i64 addrspace(1)* %a.gep
  %b = load i64, i64 addrspace(1)* %b.gep
  %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
  %val = extractvalue { i64, i1 } %uadd, 0
  %carry = extractvalue { i64, i1 } %uadd, 1
  store i64 %val, i64 addrspace(1)* %out
  store i1 %carry, i1 addrspace(1)* %carryout
  ret void
}

; FUNC-LABEL: {{^}}v_uaddo_i16:
; VI: v_add_u16_e32
; VI: v_cmp_lt_u16_e32

; GFX9: v_add_u16_e32
; GFX9: v_cmp_lt_u16_e32
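; 16-bit VALU instructions only exist on VI and later, so SI output is not
; checked here.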
define amdgpu_kernel void @v_uaddo_i16(i16 addrspace(1)* %out, i1 addrspace(1)* %carryout, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds i16, i16 addrspace(1)* %b.ptr, i64 %tid.ext
  %a = load i16, i16 addrspace(1)* %a.gep
  %b = load i16, i16 addrspace(1)* %b.gep
  %uadd = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 %a, i16 %b)
  %val = extractvalue { i16, i1 } %uadd, 0
  %carry = extractvalue { i16, i1 } %uadd, 1
  store i16 %val, i16 addrspace(1)* %out
  store i1 %carry, i1 addrspace(1)* %carryout
  ret void
}

; FUNC-LABEL: {{^}}v_uaddo_v2i32:
; SICIVI: v_cmp_lt_i32
; SICIVI: v_cmp_lt_i32
; SICIVI: v_add_{{[iu]}}32
; SICIVI: v_cmp_lt_i32
; SICIVI: v_cmp_lt_i32
; SICIVI: v_add_{{[iu]}}32
define amdgpu_kernel void @v_uaddo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %carryout, <2 x i32> addrspace(1)* %aptr, <2 x i32> addrspace(1)* %bptr) nounwind {
  %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4
  %b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4
  %sadd = call { <2 x i32>, <2 x i1> } @llvm.uadd.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind
  %val = extractvalue { <2 x i32>, <2 x i1> } %sadd, 0
  %carry = extractvalue { <2 x i32>, <2 x i1> } %sadd, 1
  store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4
  %carry.ext = zext <2 x i1> %carry to <2 x i32>
  store <2 x i32> %carry.ext, <2 x i32> addrspace(1)* %carryout
  ret void
}

; FUNC-LABEL: {{^}}s_uaddo_clamp_bit:
; GCN: v_add_{{i|u|co_u}}32_e32
; GCN: s_endpgm
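; With uniform inputs the branch condition is evaluated with a scalar
; compare, leaving VCC free for the implicit carry-out of the _e32 add.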
define amdgpu_kernel void @s_uaddo_clamp_bit(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) #0 {
entry:
  %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
  %val = extractvalue { i32, i1 } %uadd, 0
  %carry = extractvalue { i32, i1 } %uadd, 1
  %c2 = icmp eq i1 %carry, false
  %cc = icmp eq i32 %a, %b
  br i1 %cc, label %exit, label %if

if:
  br label %exit

exit:
  %cout = phi i1 [false, %entry], [%c2, %if]
  store i32 %val, i32 addrspace(1)* %out, align 4
  store i1 %cout, i1 addrspace(1)* %carryout
  ret void
}

; FUNC-LABEL: {{^}}v_uaddo_clamp_bit:
; GCN: v_add_{{i|u|co_u}}32_e64
; GCN: s_endpgm
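; Here the divergent compare for the branch occupies VCC, so the add must
; use the VOP3 (_e64) encoding, which can write its carry-out to an
; arbitrary SGPR pair instead.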
define amdgpu_kernel void @v_uaddo_clamp_bit(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i64 %tid.ext
  %a = load i32, i32 addrspace(1)* %a.gep
  %b = load i32, i32 addrspace(1)* %b.gep
  %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
  %val = extractvalue { i32, i1 } %uadd, 0
  %carry = extractvalue { i32, i1 } %uadd, 1
  %c2 = icmp eq i1 %carry, false
  %cc = icmp eq i32 %a, %b
  br i1 %cc, label %exit, label %if

if:
  br label %exit

exit:
  %cout = phi i1 [false, %entry], [%c2, %if]
  store i32 %val, i32 addrspace(1)* %out, align 4
  store i1 %cout, i1 addrspace(1)* %carryout
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1
declare { i16, i1 } @llvm.uadd.with.overflow.i16(i16, i16) #1
declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) #1
declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) #1
declare { <2 x i32>, <2 x i1> } @llvm.uadd.with.overflow.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }