; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-FLUSH -check-prefix=GCN %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-DENORM -check-prefix=GCN %s

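; v_mac_f32 accumulates into its addend: src2 is tied to the destination, and
; the VOP2 form takes no source modifiers. When the addend must be preserved,
; when it is not a VGPR, or when a source is negated, v_mad_f32 must be used
; instead. The tests below check that the right form is chosen.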
; GCN-LABEL: {{^}}mac_vvv:
; GCN: buffer_load_dword [[A:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0{{$}}
; GCN: buffer_load_dword [[B:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:4
; GCN: buffer_load_dword [[C:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:8
; GCN: v_mac_f32_e32 [[C]], [[A]], [[B]]
; GCN: buffer_store_dword [[C]]
define amdgpu_kernel void @mac_vvv(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
  %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
  %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2

  %a = load volatile float, float addrspace(1)* %in
  %b = load volatile float, float addrspace(1)* %b_ptr
  %c = load volatile float, float addrspace(1)* %c_ptr

  %tmp0 = fmul float %a, %b
  %tmp1 = fadd float %tmp0, %c
  store float %tmp1, float addrspace(1)* %out
  ret void
}

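; The addend 0.5 cannot be encoded in v_mac's src2, which is tied to the
; destination register, so v_mad is needed to fold both inline constants.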
; GCN-LABEL: {{^}}mad_inline_sgpr_inline:
; GCN-NOT: v_mac_f32
; GCN: v_mad_f32 v{{[0-9]}}, s{{[0-9]+}}, 0.5, 0.5
define amdgpu_kernel void @mad_inline_sgpr_inline(float addrspace(1)* %out, float %in) #0 {
entry:
  %tmp0 = fmul float 0.5, %in
  %tmp1 = fadd float %tmp0, 0.5
  store float %tmp1, float addrspace(1)* %out
  ret void
}

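; The addend %c is an SGPR, but v_mac's src2 is tied to the VGPR
; destination, so v_mad is used.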
; GCN-LABEL: {{^}}mad_vvs:
; GCN-NOT: v_mac_f32
; GCN: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
define amdgpu_kernel void @mad_vvs(float addrspace(1)* %out, float addrspace(1)* %in, float %c) #0 {
entry:
  %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1

  %a = load float, float addrspace(1)* %in
  %b = load float, float addrspace(1)* %b_ptr

  %tmp0 = fmul float %a, %b
  %tmp1 = fadd float %tmp0, %c
  store float %tmp1, float addrspace(1)* %out
  ret void
}

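; Both multiplicands read the same SGPR (a single constant bus use), so the
; VOP3 form of v_mac can be used, accumulating into the VGPR holding %c.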
; GCN-LABEL: {{^}}mac_ssv:
; GCN: v_mac_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
define amdgpu_kernel void @mac_ssv(float addrspace(1)* %out, float addrspace(1)* %in, float %a) #0 {
entry:
  %c = load float, float addrspace(1)* %in

  %tmp0 = fmul float %a, %a
  %tmp1 = fadd float %tmp0, %c
  store float %tmp1, float addrspace(1)* %out
  ret void
}

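; Two multiply-adds share the addend %c. Converting both to v_mac would
; clobber %c after the first, so only one of them may become a v_mac.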
; GCN-LABEL: {{^}}mac_mad_same_add:
; GCN: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]]
; GCN: v_mac_f32_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @mac_mad_same_add(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
  %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
  %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
  %d_ptr = getelementptr float, float addrspace(1)* %in, i32 3
  %e_ptr = getelementptr float, float addrspace(1)* %in, i32 4

  %a = load volatile float, float addrspace(1)* %in
  %b = load volatile float, float addrspace(1)* %b_ptr
  %c = load volatile float, float addrspace(1)* %c_ptr
  %d = load volatile float, float addrspace(1)* %d_ptr
  %e = load volatile float, float addrspace(1)* %e_ptr

  %tmp0 = fmul float %a, %b
  %tmp1 = fadd float %tmp0, %c

  %tmp2 = fmul float %d, %e
  %tmp3 = fadd float %tmp2, %c

  %out1 = getelementptr float, float addrspace(1)* %out, i32 1
  store float %tmp1, float addrspace(1)* %out
  store float %tmp3, float addrspace(1)* %out1
  ret void
}

; There is no advantage to using v_mac when one of the operands is negated,
; since v_mad accepts source modifiers and more operand types. The tests
; below negate each source operand, both as an explicit fneg (fsub -0.0, x)
; and as a 0.0 - x subtraction whose folding depends on the nsz attribute.

; GCN-LABEL: {{^}}mad_neg_src0:
; GCN-NOT: v_mac_f32
; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
define amdgpu_kernel void @mad_neg_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
  %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
  %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2

  %a = load float, float addrspace(1)* %in
  %b = load float, float addrspace(1)* %b_ptr
  %c = load float, float addrspace(1)* %c_ptr

  %neg_a = fsub float -0.0, %a
  %tmp0 = fmul float %neg_a, %b
  %tmp1 = fadd float %tmp0, %c

  store float %tmp1, float addrspace(1)* %out
  ret void
}

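; With no-signed-zeros (attribute #1), 0.0 - %a folds to a plain negate,
; giving the same code as mad_neg_src0.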
; GCN-LABEL: {{^}}nsz_mad_sub0_src0:
; GCN-NOT: v_mac_f32
; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
define amdgpu_kernel void @nsz_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
entry:
  %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
  %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2

  %a = load float, float addrspace(1)* %in
  %b = load float, float addrspace(1)* %b_ptr
  %c = load float, float addrspace(1)* %c_ptr

  %neg_a = fsub float 0.0, %a
  %tmp0 = fmul float %neg_a, %b
  %tmp1 = fadd float %tmp0, %c

  store float %tmp1, float addrspace(1)* %out
  ret void
}

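; Without no-signed-zeros, 0.0 - %a is not equivalent to a negate (the
; results differ for %a == +0.0), so an explicit subtract is emitted and the
; multiply-add can still become a v_mac.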
; GCN-LABEL: {{^}}safe_mad_sub0_src0:
; GCN: v_sub_f32_e32 [[SUB0:v[0-9]+]], 0,
; GCN: v_mac_f32_e32 v{{[0-9]+}}, [[SUB0]], v{{[0-9]+}}
define amdgpu_kernel void @safe_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
  %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
  %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2

  %a = load float, float addrspace(1)* %in
  %b = load float, float addrspace(1)* %b_ptr
  %c = load float, float addrspace(1)* %c_ptr

  %neg_a = fsub float 0.0, %a
  %tmp0 = fmul float %neg_a, %b
  %tmp1 = fadd float %tmp0, %c

  store float %tmp1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}mad_neg_src1:
; GCN-NOT: v_mac_f32
; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
define amdgpu_kernel void @mad_neg_src1(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
  %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
  %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2

  %a = load float, float addrspace(1)* %in
  %b = load float, float addrspace(1)* %b_ptr
  %c = load float, float addrspace(1)* %c_ptr

  %neg_b = fsub float -0.0, %b
  %tmp0 = fmul float %a, %neg_b
  %tmp1 = fadd float %tmp0, %c

  store float %tmp1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}nsz_mad_sub0_src1:
; GCN-NOT: v_mac_f32
; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
define amdgpu_kernel void @nsz_mad_sub0_src1(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
entry:
  %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
  %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2

  %a = load float, float addrspace(1)* %in
  %b = load float, float addrspace(1)* %b_ptr
  %c = load float, float addrspace(1)* %c_ptr

  %neg_b = fsub float 0.0, %b
  %tmp0 = fmul float %a, %neg_b
  %tmp1 = fadd float %tmp0, %c

  store float %tmp1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}mad_neg_src2:
; GCN-NOT: v_mac
; GCN: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}}
define amdgpu_kernel void @mad_neg_src2(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
  %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
  %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2

  %a = load float, float addrspace(1)* %in
  %b = load float, float addrspace(1)* %b_ptr
  %c = load float, float addrspace(1)* %c_ptr

  %neg_c = fsub float -0.0, %c
  %tmp0 = fmul float %a, %b
  %tmp1 = fadd float %tmp0, %neg_c

  store float %tmp1, float addrspace(1)* %out
  ret void
}

; Without special-casing the inline constant check for v_mac_f32's
; src2, this fails to fold the 1.0 into a mad.

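; Here 1.0 - 4.0 * %tmp2 becomes mad(%tmp2, -4.0, 1.0), with both inline
; constants folded into the instruction.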
; GCN-LABEL: {{^}}fold_inline_imm_into_mac_src2_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN: v_add_f32_e32 [[TMP2:v[0-9]+]], [[A]], [[A]]
; GCN: v_mad_f32 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0
define amdgpu_kernel void @fold_inline_imm_into_mac_src2_f32(float addrspace(1)* %out, float addrspace(1)* %a, float addrspace(1)* %b) #3 {
bb:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep.a = getelementptr inbounds float, float addrspace(1)* %a, i64 %tid.ext
  %gep.b = getelementptr inbounds float, float addrspace(1)* %b, i64 %tid.ext
  %gep.out = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %tmp = load volatile float, float addrspace(1)* %gep.a
  %tmp1 = load volatile float, float addrspace(1)* %gep.b
  %tmp2 = fadd float %tmp, %tmp
  %tmp3 = fmul float %tmp2, 4.0
  %tmp4 = fsub float 1.0, %tmp3
  %tmp5 = fadd float %tmp4, %tmp1
  %tmp6 = fadd float %tmp1, %tmp1
  %tmp7 = fmul float %tmp6, %tmp
  %tmp8 = fsub float 1.0, %tmp7
  %tmp9 = fmul float %tmp8, 8.0
  %tmp10 = fadd float %tmp5, %tmp9
  store float %tmp10, float addrspace(1)* %gep.out
  ret void
}

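; The f16 version of the same pattern. SI has no f16 arithmetic, so the
; inputs are extended to f32 and the f32 rules apply; on VI with f16
; denormals flushed, v_mad_f16 is formed directly.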
; GCN-LABEL: {{^}}fold_inline_imm_into_mac_src2_f16:
; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[B:v[0-9]+]]

; SI-DAG: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], [[A]]
; SI-DAG: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], [[B]]

; SI: v_add_f32_e32 [[TMP2:v[0-9]+]], [[CVT_A]], [[CVT_A]]
; SI: v_mad_f32 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0
; SI: v_mac_f32_e32 v{{[0-9]+}}, 0x41000000, v{{[0-9]+}}

; VI-FLUSH: v_add_f16_e32 [[TMP2:v[0-9]+]], [[A]], [[A]]
; VI-FLUSH: v_mad_f16 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0
define amdgpu_kernel void @fold_inline_imm_into_mac_src2_f16(half addrspace(1)* %out, half addrspace(1)* %a, half addrspace(1)* %b) #3 {
bb:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep.a = getelementptr inbounds half, half addrspace(1)* %a, i64 %tid.ext
  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
  %gep.out = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
  %tmp = load volatile half, half addrspace(1)* %gep.a
  %tmp1 = load volatile half, half addrspace(1)* %gep.b
  %tmp2 = fadd half %tmp, %tmp
  %tmp3 = fmul half %tmp2, 4.0
  %tmp4 = fsub half 1.0, %tmp3
  %tmp5 = fadd half %tmp4, %tmp1
  %tmp6 = fadd half %tmp1, %tmp1
  %tmp7 = fmul half %tmp6, %tmp
  %tmp8 = fsub half 1.0, %tmp7
  %tmp9 = fmul half %tmp8, 8.0
  %tmp10 = fadd half %tmp5, %tmp9
  store half %tmp10, half addrspace(1)* %gep.out
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #2

attributes #0 = { nounwind "no-signed-zeros-fp-math"="false" }
attributes #1 = { nounwind "no-signed-zeros-fp-math"="true" }
attributes #2 = { nounwind readnone }
attributes #3 = { nounwind }