; RUN: llc -march=amdgcn -mcpu=tahiti -start-after=sink -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-SAFE -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=tahiti -start-after=sink -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-NSZ -check-prefix=SI -check-prefix=FUNC %s
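; The first RUN line uses the default FP mode, where signed zeros must be
; preserved (GCN-SAFE checks); the second enables no-signed-zeros
; (GCN-NSZ checks), which permits folding the fneg into the source
; modifiers of the combined operation.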

; --------------------------------------------------------------------------------
; fadd tests
; --------------------------------------------------------------------------------
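; Without nsz, -(a + b) cannot be rewritten as (-a) - b because the sign of
; a zero result may change, so the fneg stays a separate sign-bit xor. With
; nsz the fneg folds into the add's source modifiers.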

; GCN-LABEL: {{^}}v_fneg_add_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]

; GCN-NSZ: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[A]], [[B]]
; GCN-NSZ-NEXT: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %add = fadd float %a, %b
  %fneg = fsub float -0.000000e+00, %add
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_add_store_use_add_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
; GCN-NEXT: buffer_store_dword [[NEG_ADD]]
; GCN-NEXT: buffer_store_dword [[ADD]]
define amdgpu_kernel void @v_fneg_add_store_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %add = fadd float %a, %b
  %fneg = fsub float -0.000000e+00, %add
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %add, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_add_multi_use_add_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
; GCN-SAFE: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]]

; GCN-NSZ: v_sub_f32_e64 [[NEG_ADD:v[0-9]+]], -[[A]], [[B]]
; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_ADD]]
; GCN: buffer_store_dword [[NEG_ADD]]
; GCN-NEXT: buffer_store_dword [[MUL]]
define amdgpu_kernel void @v_fneg_add_multi_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %add = fadd float %a, %b
  %fneg = fsub float -0.000000e+00, %add
  %use1 = fmul float %add, 4.0
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_add_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_sub_f32_e32
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000,

; GCN-NSZ: v_sub_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NSZ-NEXT: buffer_store_dword [[ADD]]
define amdgpu_kernel void @v_fneg_add_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %add = fadd float %fneg.a, %b
  %fneg = fsub float -0.000000e+00, %add
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_add_x_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_sub_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]

; GCN-NSZ: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; GCN-NSZ-NEXT: buffer_store_dword [[ADD]]
define amdgpu_kernel void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.b = fsub float -0.000000e+00, %b
  %add = fadd float %a, %fneg.b
  %fneg = fsub float -0.000000e+00, %add
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_add_fneg_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_sub_f32_e64 [[ADD:v[0-9]+]], -[[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]

; GCN-NSZ: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NSZ-NEXT: buffer_store_dword [[ADD]]
define amdgpu_kernel void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %fneg.b = fsub float -0.000000e+00, %b
  %add = fadd float %fneg.a, %fneg.b
  %fneg = fsub float -0.000000e+00, %add
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_add_store_use_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_bfrev_b32_e32 [[SIGNBIT:v[0-9]+]], 1{{$}}
; GCN-SAFE: v_xor_b32_e32 [[NEG_A:v[0-9]+]], [[A]], [[SIGNBIT]]
; GCN-SAFE: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], [[ADD]], [[SIGNBIT]]

; GCN-NSZ-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NSZ-NEXT: buffer_store_dword [[NEG_ADD]]
; GCN-NSZ-NEXT: buffer_store_dword [[NEG_A]]
define amdgpu_kernel void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %add = fadd float %fneg.a, %b
  %fneg = fsub float -0.000000e+00, %add
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %fneg.a, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_add_multi_use_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-SAFE-DAG: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]

; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NSZ-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-NSZ-NEXT: buffer_store_dword [[NEG_ADD]]
; GCN-NSZ-NEXT: buffer_store_dword [[MUL]]
define amdgpu_kernel void @v_fneg_add_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %add = fadd float %fneg.a, %b
  %fneg = fsub float -0.000000e+00, %add
  %use1 = fmul float %fneg.a, %c
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; --------------------------------------------------------------------------------
; fmul tests
; --------------------------------------------------------------------------------
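; fneg of an fmul always folds into a source modifier, since -(a * b) = a * (-b)
; holds even for signed zeros; no nsz flag is required here.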

; GCN-LABEL: {{^}}v_fneg_mul_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], [[A]], -[[B]]
; GCN-NEXT: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = fmul float %a, %b
  %fneg = fsub float -0.000000e+00, %mul
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_store_use_mul_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_xor_b32_e32 [[NEG_MUL:v[0-9]+]], 0x80000000, [[MUL]]
; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
; GCN: buffer_store_dword [[MUL]]
define amdgpu_kernel void @v_fneg_mul_store_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = fmul float %a, %b
  %fneg = fsub float -0.000000e+00, %mul
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %mul, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_multi_use_mul_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e64 [[MUL0:v[0-9]+]], [[A]], -[[B]]
; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MUL0]]
; GCN-NEXT: buffer_store_dword [[MUL0]]
; GCN-NEXT: buffer_store_dword [[MUL1]]
define amdgpu_kernel void @v_fneg_mul_multi_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = fmul float %a, %b
  %fneg = fsub float -0.000000e+00, %mul
  %use1 = fmul float %mul, 4.0
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
; GCN-NEXT: buffer_store_dword [[MUL]]
define amdgpu_kernel void @v_fneg_mul_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %mul = fmul float %fneg.a, %b
  %fneg = fsub float -0.000000e+00, %mul
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_x_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
; GCN-NEXT: buffer_store_dword [[MUL]]
define amdgpu_kernel void @v_fneg_mul_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.b = fsub float -0.000000e+00, %b
  %mul = fmul float %a, %fneg.b
  %fneg = fsub float -0.000000e+00, %mul
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_fneg_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
; GCN-NEXT: buffer_store_dword [[MUL]]
define amdgpu_kernel void @v_fneg_mul_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %fneg.b = fsub float -0.000000e+00, %b
  %mul = fmul float %fneg.a, %fneg.b
  %fneg = fsub float -0.000000e+00, %mul
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_store_use_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[A]], [[B]]
; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
; GCN: buffer_store_dword [[NEG_A]]
define amdgpu_kernel void @v_fneg_mul_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %mul = fmul float %fneg.a, %b
  %fneg = fsub float -0.000000e+00, %mul
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %fneg.a, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_multi_use_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
; GCN: buffer_store_dword [[MUL]]
define amdgpu_kernel void @v_fneg_mul_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %mul = fmul float %fneg.a, %b
  %fneg = fsub float -0.000000e+00, %mul
  %use1 = fmul float %fneg.a, %c
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; --------------------------------------------------------------------------------
; fminnum tests
; --------------------------------------------------------------------------------
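; fneg(minnum(a, b)) folds to maxnum(-a, -b), so these tests expect v_max
; with negated source modifiers.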

; GCN-LABEL: {{^}}v_fneg_minnum_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[B]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %min = call float @llvm.minnum.f32(float %a, float %b)
  %fneg = fsub float -0.000000e+00, %min
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_self_minnum_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[A]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_self_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %min = call float @llvm.minnum.f32(float %a, float %a)
  %min.fneg = fsub float -0.0, %min
  store float %min.fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_posk_minnum_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -4.0
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_posk_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %min = call float @llvm.minnum.f32(float 4.0, float %a)
  %fneg = fsub float -0.000000e+00, %min
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_negk_minnum_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 4.0
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_negk_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %min = call float @llvm.minnum.f32(float -4.0, float %a)
  %fneg = fsub float -0.000000e+00, %min
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_0_minnum_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[A]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_0_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %min = call float @llvm.minnum.f32(float 0.0, float %a)
  %fneg = fsub float -0.000000e+00, %min
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_neg0_minnum_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 0
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_neg0_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %min = call float @llvm.minnum.f32(float -0.0, float %a)
  %fneg = fsub float -0.000000e+00, %min
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_0_minnum_foldable_use_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, [[A]]
; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], [[B]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_0_minnum_foldable_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %min = call float @llvm.minnum.f32(float 0.0, float %a)
  %fneg = fsub float -0.000000e+00, %min
  %mul = fmul float %fneg, %b
  store float %mul, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_minnum_multi_use_minnum_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_max_f32_e64 [[MAX0:v[0-9]+]], -[[A]], -[[B]]
; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MAX0]]
; GCN-NEXT: buffer_store_dword [[MAX0]]
; GCN-NEXT: buffer_store_dword [[MUL1]]
define amdgpu_kernel void @v_fneg_minnum_multi_use_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %min = call float @llvm.minnum.f32(float %a, float %b)
  %fneg = fsub float -0.000000e+00, %min
  %use1 = fmul float %min, 4.0
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; --------------------------------------------------------------------------------
; fmaxnum tests
; --------------------------------------------------------------------------------
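; Mirror of the minnum cases: fneg(maxnum(a, b)) folds to minnum(-a, -b),
; expecting v_min with negated source modifiers.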

; GCN-LABEL: {{^}}v_fneg_maxnum_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[B]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %max = call float @llvm.maxnum.f32(float %a, float %b)
  %fneg = fsub float -0.000000e+00, %max
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_self_maxnum_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[A]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_self_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %max = call float @llvm.maxnum.f32(float %a, float %a)
  %max.fneg = fsub float -0.0, %max
  store float %max.fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_posk_maxnum_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -4.0
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_posk_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %max = call float @llvm.maxnum.f32(float 4.0, float %a)
  %fneg = fsub float -0.000000e+00, %max
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_negk_maxnum_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 4.0
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_negk_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %max = call float @llvm.maxnum.f32(float -4.0, float %a)
  %fneg = fsub float -0.000000e+00, %max
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_0_maxnum_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[A]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_0_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %max = call float @llvm.maxnum.f32(float 0.0, float %a)
  %fneg = fsub float -0.000000e+00, %max
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_neg0_maxnum_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 0
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_neg0_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %max = call float @llvm.maxnum.f32(float -0.0, float %a)
  %fneg = fsub float -0.000000e+00, %max
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_0_maxnum_foldable_use_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]]
; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MAX]], [[B]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_0_maxnum_foldable_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %max = call float @llvm.maxnum.f32(float 0.0, float %a)
  %fneg = fsub float -0.000000e+00, %max
  %mul = fmul float %fneg, %b
  store float %mul, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_maxnum_multi_use_maxnum_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_min_f32_e64 [[MAX0:v[0-9]+]], -[[A]], -[[B]]
; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MAX0]]
; GCN-NEXT: buffer_store_dword [[MAX0]]
; GCN-NEXT: buffer_store_dword [[MUL1]]
define amdgpu_kernel void @v_fneg_maxnum_multi_use_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %max = call float @llvm.maxnum.f32(float %a, float %b)
  %fneg = fsub float -0.000000e+00, %max
  %use1 = fmul float %max, 4.0
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; --------------------------------------------------------------------------------
; fma tests
; --------------------------------------------------------------------------------
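; As with fadd, folding the fneg through an fma requires nsz:
; -fma(a, b, c) becomes fma(a, -b, -c). In safe mode the fneg remains a
; sign-bit xor of the fma result.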

; GCN-LABEL: {{^}}v_fneg_fma_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[RESULT]]

; GCN-NSZ: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]
; GCN-NSZ-NEXT: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
  %fneg = fsub float -0.000000e+00, %fma
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fma_store_use_fma_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
; GCN-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
; GCN-DAG: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]]
; GCN-NEXT: buffer_store_dword [[NEG_FMA]]
; GCN-NEXT: buffer_store_dword [[FMA]]
define amdgpu_kernel void @v_fneg_fma_store_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
  %fneg = fsub float -0.000000e+00, %fma
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %fma, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fma_multi_use_fma_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
; GCN-SAFE: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]]
; GCN-SAFE: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[FMA]]

; GCN-NSZ: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], -[[B]], -[[C]]
; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_FMA]]

; GCN-NEXT: buffer_store_dword [[NEG_FMA]]
; GCN-NEXT: buffer_store_dword [[MUL]]
define amdgpu_kernel void @v_fneg_fma_multi_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
  %fneg = fsub float -0.000000e+00, %fma
  %use1 = fmul float %fma, 4.0
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fma_fneg_x_y_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], -[[A]], [[B]], [[C]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]

; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
define amdgpu_kernel void @v_fneg_fma_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
  %fneg = fsub float -0.000000e+00, %fma
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fma_x_fneg_y_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], [[C]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]

; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
define amdgpu_kernel void @v_fneg_fma_x_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fneg.b = fsub float -0.000000e+00, %b
  %fma = call float @llvm.fma.f32(float %a, float %fneg.b, float %c)
  %fneg = fsub float -0.000000e+00, %fma
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fma_fneg_fneg_y_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], -[[A]], -[[B]], [[C]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]

; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], -[[C]]
; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
define amdgpu_kernel void @v_fneg_fma_fneg_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %fneg.b = fsub float -0.000000e+00, %b
  %fma = call float @llvm.fma.f32(float %fneg.a, float %fneg.b, float %c)
  %fneg = fsub float -0.000000e+00, %fma
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fma_fneg_x_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], -[[A]], [[B]], -[[C]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]

; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
define amdgpu_kernel void @v_fneg_fma_fneg_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %fneg.c = fsub float -0.000000e+00, %c
  %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %fneg.c)
  %fneg = fsub float -0.000000e+00, %fma
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fma_x_y_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]

; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], [[C]]
; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
define amdgpu_kernel void @v_fneg_fma_x_y_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fneg.c = fsub float -0.000000e+00, %c
  %fma = call float @llvm.fma.f32(float %a, float %b, float %fneg.c)
  %fneg = fsub float -0.000000e+00, %fma
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fma_store_use_fneg_x_y_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_xor_b32
; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], -[[A]],
; GCN-SAFE: v_xor_b32

; GCN-NSZ-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN-NSZ-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
; GCN-NSZ-NEXT: buffer_store_dword [[NEG_A]]
define amdgpu_kernel void @v_fneg_fma_store_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
  %fneg = fsub float -0.000000e+00, %fma
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %fneg.a, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fma_multi_use_fneg_x_y_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]

; GCN-NSZ-DAG: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
; GCN-NSZ-NEXT: buffer_store_dword [[NEG_FMA]]
; GCN-NSZ-NEXT: buffer_store_dword [[MUL]]
define amdgpu_kernel void @v_fneg_fma_multi_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float %d) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
  %fneg = fsub float -0.000000e+00, %fma
  %use1 = fmul float %fneg.a, %d
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; --------------------------------------------------------------------------------
; fmad tests
; --------------------------------------------------------------------------------
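; llvm.fmuladd selects to v_mac/v_mad here; under nsz the fneg folds into
; the mad's source modifiers just like the fma cases above.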

; GCN-LABEL: {{^}}v_fneg_fmad_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_mac_f32_e32 [[C]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[C]]

; GCN-NSZ: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]
; GCN-NSZ-NEXT: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fma = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
  %fneg = fsub float -0.000000e+00, %fma
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fmad_multi_use_fmad_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_mac_f32_e32 [[C]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 [[NEG_MAD:v[0-9]+]], 0x80000000, [[C]]
; GCN-SAFE-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[C]]

; GCN-NSZ: v_mad_f32 [[NEG_MAD:v[0-9]+]], -[[A]], [[B]], -[[C]]
; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_MAD]]

; GCN: buffer_store_dword [[NEG_MAD]]
; GCN-NEXT: buffer_store_dword [[MUL]]
define amdgpu_kernel void @v_fneg_fmad_multi_use_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fma = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
  %fneg = fsub float -0.000000e+00, %fma
  %use1 = fmul float %fma, 4.0
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; --------------------------------------------------------------------------------
; fp_extend tests
; --------------------------------------------------------------------------------
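; fneg folds into the conversion's source modifier for f32->f64
; (v_cvt_f64_f32_e64 with a negated source); an inner fneg cancels instead.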
1033
; GCN-LABEL: {{^}}v_fneg_fp_extend_f32_to_f64:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_cvt_f64_f32_e64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]]
; GCN: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @v_fneg_fp_extend_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fpext = fpext float %a to double
  %fneg = fsub double -0.000000e+00, %fpext
  store double %fneg, double addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_extend_fneg_f32_to_f64:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]]
; GCN: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @v_fneg_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %fpext = fpext float %fneg.a to double
  %fneg = fsub double -0.000000e+00, %fpext
  store double %fneg, double addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_extend_store_use_fneg_f32_to_f64:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]]
; GCN-DAG: v_xor_b32_e32 [[FNEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN: buffer_store_dwordx2 [[RESULT]]
; GCN: buffer_store_dword [[FNEG_A]]
define amdgpu_kernel void @v_fneg_fp_extend_store_use_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %fpext = fpext float %fneg.a to double
  %fneg = fsub double -0.000000e+00, %fpext
  store volatile double %fneg, double addrspace(1)* %out.gep
  store volatile float %fneg.a, float addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}v_fneg_multi_use_fp_extend_fneg_f32_to_f64:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT_LO:[0-9]+]]:[[CVT_HI:[0-9]+]]{{\]}}, [[A]]
; GCN-DAG: v_xor_b32_e32 v[[FNEG_A:[0-9]+]], 0x80000000, v[[CVT_HI]]
; GCN: buffer_store_dwordx2 v{{\[[0-9]+}}:[[FNEG_A]]{{\]}}
; GCN: buffer_store_dwordx2 v{{\[}}[[CVT_LO]]:[[CVT_HI]]{{\]}}
define amdgpu_kernel void @v_fneg_multi_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fpext = fpext float %a to double
  %fneg = fsub double -0.000000e+00, %fpext
  store volatile double %fneg, double addrspace(1)* %out.gep
  store volatile double %fpext, double addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT_LO:[0-9]+]]:[[CVT_HI:[0-9]+]]{{\]}}, [[A]]
; GCN-DAG: v_xor_b32_e32 v[[FNEG_A:[0-9]+]], 0x80000000, v[[CVT_HI]]
; GCN-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[CVT_LO]]:[[CVT_HI]]{{\]}}, 4.0
; GCN: buffer_store_dwordx2 v{{\[[0-9]+}}:[[FNEG_A]]{{\]}}
; GCN: buffer_store_dwordx2 [[MUL]]
define amdgpu_kernel void @v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fpext = fpext float %a to double
  %fneg = fsub double -0.000000e+00, %fpext
  %mul = fmul double %fpext, 4.0
  store volatile double %fneg, double addrspace(1)* %out.gep
  store volatile double %mul, double addrspace(1)* %out.gep
  ret void
}

; FIXME: Source modifiers not folded for f16->f32
; GCN-LABEL: {{^}}v_fneg_multi_use_fp_extend_fneg_f16_to_f32:
define amdgpu_kernel void @v_fneg_multi_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %a.gep
  %fpext = fpext half %a to float
  %fneg = fsub float -0.000000e+00, %fpext
  store volatile float %fneg, float addrspace(1)* %out.gep
  store volatile float %fpext, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32:
define amdgpu_kernel void @v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %a.gep
  %fpext = fpext half %a to float
  %fneg = fsub float -0.000000e+00, %fpext
  %mul = fmul float %fpext, 4.0
  store volatile float %fneg, float addrspace(1)* %out.gep
  store volatile float %mul, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; fp_round tests
; --------------------------------------------------------------------------------

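; As with fp_extend, fptrunc lowers to a conversion (v_cvt_f32_f64 or
; v_cvt_f16_f32) that accepts source modifiers, so the fneg is expected to
; fold into the convert, and a negated input cancels a negated result.
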
; GCN-LABEL: {{^}}v_fneg_fp_round_f64_to_f32:
; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN: v_cvt_f32_f64_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_fp_round_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile double, double addrspace(1)* %a.gep
  %fpround = fptrunc double %a to float
  %fneg = fsub float -0.000000e+00, %fpround
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_round_fneg_f64_to_f32:
; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile double, double addrspace(1)* %a.gep
  %fneg.a = fsub double -0.000000e+00, %a
  %fpround = fptrunc double %fneg.a to float
  %fneg = fsub float -0.000000e+00, %fpround
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_round_store_use_fneg_f64_to_f32:
; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
; GCN-DAG: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], v{{\[}}[[A_LO]]:[[A_HI]]{{\]}}
; GCN-DAG: v_xor_b32_e32 v[[NEG_A_HI:[0-9]+]], 0x80000000, v[[A_HI]]
; GCN: buffer_store_dword [[RESULT]]
; GCN: buffer_store_dwordx2 v{{\[}}[[A_LO]]:[[NEG_A_HI]]{{\]}}
define amdgpu_kernel void @v_fneg_fp_round_store_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile double, double addrspace(1)* %a.gep
  %fneg.a = fsub double -0.000000e+00, %a
  %fpround = fptrunc double %fneg.a to float
  %fneg = fsub float -0.000000e+00, %fpround
  store volatile float %fneg, float addrspace(1)* %out.gep
  store volatile double %fneg.a, double addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_round_multi_use_fneg_f64_to_f32:
; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN-DAG: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN-DAG: v_mul_f64 [[USE1:v\[[0-9]+:[0-9]+\]]], -[[A]], s{{\[}}
; GCN: buffer_store_dword [[RESULT]]
; GCN: buffer_store_dwordx2 [[USE1]]
define amdgpu_kernel void @v_fneg_fp_round_multi_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr, double %c) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile double, double addrspace(1)* %a.gep
  %fneg.a = fsub double -0.000000e+00, %a
  %fpround = fptrunc double %fneg.a to float
  %fneg = fsub float -0.000000e+00, %fpround
  %use1 = fmul double %fneg.a, %c
  store volatile float %fneg, float addrspace(1)* %out.gep
  store volatile double %use1, double addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_round_f32_to_f16:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_cvt_f16_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: buffer_store_short [[RESULT]]
define amdgpu_kernel void @v_fneg_fp_round_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fpround = fptrunc float %a to half
  %fneg = fsub half -0.000000e+00, %fpround
  store half %fneg, half addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_round_fneg_f32_to_f16:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN: buffer_store_short [[RESULT]]
define amdgpu_kernel void @v_fneg_fp_round_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %fpround = fptrunc float %fneg.a to half
  %fneg = fsub half -0.000000e+00, %fpround
  store half %fneg, half addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_multi_use_fp_round_fneg_f64_to_f32:
; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN-DAG: v_cvt_f32_f64_e32 [[CVT:v[0-9]+]], [[A]]
; GCN-DAG: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[CVT]]
; GCN: buffer_store_dword [[NEG]]
; GCN: buffer_store_dword [[CVT]]
define amdgpu_kernel void @v_fneg_multi_use_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile double, double addrspace(1)* %a.gep
  %fpround = fptrunc double %a to float
  %fneg = fsub float -0.000000e+00, %fpround
  store volatile float %fneg, float addrspace(1)* %out.gep
  store volatile float %fpround, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_round_store_use_fneg_f32_to_f16:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN: buffer_store_short [[RESULT]]
; GCN: buffer_store_dword [[NEG_A]]
define amdgpu_kernel void @v_fneg_fp_round_store_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %fpround = fptrunc float %fneg.a to half
  %fneg = fsub half -0.000000e+00, %fpround
  store volatile half %fneg, half addrspace(1)* %out.gep
  store volatile float %fneg.a, float addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_round_multi_use_fneg_f32_to_f16:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN-DAG: v_mul_f32_e64 [[USE1:v[0-9]+]], -[[A]], s
; GCN: buffer_store_short [[RESULT]]
; GCN: buffer_store_dword [[USE1]]
define amdgpu_kernel void @v_fneg_fp_round_multi_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %fpround = fptrunc float %fneg.a to half
  %fneg = fsub half -0.000000e+00, %fpround
  %use1 = fmul float %fneg.a, %c
  store volatile half %fneg, half addrspace(1)* %out.gep
  store volatile float %use1, float addrspace(1)* undef
  ret void
}

; --------------------------------------------------------------------------------
; rcp tests
; --------------------------------------------------------------------------------

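; Reciprocal is an odd function (1/-x == -(1/x), an exact sign-bit flip),
; so folding the fneg into the v_rcp_f32 source modifier is always
; value-preserving.
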
; GCN-LABEL: {{^}}v_fneg_rcp_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_rcp_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_rcp_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %rcp = call float @llvm.amdgcn.rcp.f32(float %a)
  %fneg = fsub float -0.000000e+00, %rcp
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_rcp_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_rcp_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a)
  %fneg = fsub float -0.000000e+00, %rcp
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_rcp_store_use_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN: buffer_store_dword [[RESULT]]
; GCN: buffer_store_dword [[NEG_A]]
define amdgpu_kernel void @v_fneg_rcp_store_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a)
  %fneg = fsub float -0.000000e+00, %rcp
  store volatile float %fneg, float addrspace(1)* %out.gep
  store volatile float %fneg.a, float addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}v_fneg_rcp_multi_use_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN: buffer_store_dword [[RESULT]]
; GCN: buffer_store_dword [[MUL]]
define amdgpu_kernel void @v_fneg_rcp_multi_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a)
  %fneg = fsub float -0.000000e+00, %rcp
  %use1 = fmul float %fneg.a, %c
  store volatile float %fneg, float addrspace(1)* %out.gep
  store volatile float %use1, float addrspace(1)* undef
  ret void
}

; --------------------------------------------------------------------------------
; rcp_legacy tests
; --------------------------------------------------------------------------------

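; rcp_legacy is expected to fold source modifiers the same way as
; llvm.amdgcn.rcp above.
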
; GCN-LABEL: {{^}}v_fneg_rcp_legacy_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_rcp_legacy_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_rcp_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %rcp = call float @llvm.amdgcn.rcp.legacy(float %a)
  %fneg = fsub float -0.000000e+00, %rcp
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; fmul_legacy tests
; --------------------------------------------------------------------------------

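; v_mul_legacy_f32 uses DX9 multiply rules (0 * anything == 0), but a
; negation of the result can still be folded into one operand's source
; modifier, mirroring the plain fmul combines.
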
; GCN-LABEL: {{^}}v_fneg_mul_legacy_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_legacy_f32_e64 [[RESULT:v[0-9]+]], [[A]], -[[B]]
; GCN-NEXT: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
  %fneg = fsub float -0.000000e+00, %mul
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_legacy_store_use_mul_legacy_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_xor_b32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], 0x80000000, [[MUL]]
; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
; GCN: buffer_store_dword [[MUL]]
define amdgpu_kernel void @v_fneg_mul_legacy_store_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
  %fneg = fsub float -0.000000e+00, %mul
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %mul, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_legacy_multi_use_mul_legacy_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_legacy_f32_e64 [[MUL0:v[0-9]+]], [[A]], -[[B]]
; GCN-NEXT: v_mul_legacy_f32_e64 [[MUL1:v[0-9]+]], -[[MUL0]], 4.0
; GCN-NEXT: buffer_store_dword [[MUL0]]
; GCN-NEXT: buffer_store_dword [[MUL1]]
define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
  %fneg = fsub float -0.000000e+00, %mul
  %use1 = call float @llvm.amdgcn.fmul.legacy(float %mul, float 4.0)
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_legacy_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
; GCN-NEXT: buffer_store_dword [[MUL]]
define amdgpu_kernel void @v_fneg_mul_legacy_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b)
  %fneg = fsub float -0.000000e+00, %mul
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_legacy_x_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
; GCN-NEXT: buffer_store_dword [[MUL]]
define amdgpu_kernel void @v_fneg_mul_legacy_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.b = fsub float -0.000000e+00, %b
  %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %fneg.b)
  %fneg = fsub float -0.000000e+00, %mul
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_legacy_fneg_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
; GCN-NEXT: buffer_store_dword [[MUL]]
define amdgpu_kernel void @v_fneg_mul_legacy_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %fneg.b = fsub float -0.000000e+00, %b
  %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %fneg.b)
  %fneg = fsub float -0.000000e+00, %mul
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_legacy_store_use_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[A]], [[B]]
; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
; GCN: buffer_store_dword [[NEG_A]]
define amdgpu_kernel void @v_fneg_mul_legacy_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b)
  %fneg = fsub float -0.000000e+00, %mul
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %fneg.a, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_legacy_multi_use_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
; GCN: buffer_store_dword [[MUL]]
define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b)
  %fneg = fsub float -0.000000e+00, %mul
  %use1 = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %c)
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; --------------------------------------------------------------------------------
; sin tests
; --------------------------------------------------------------------------------

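; llvm.sin.f32 lowers to a scale by 1/(2*pi), a v_fract, then v_sin; the
; fneg folds into the scale constant, which is why the first check below
; expects 0xbe22f983 (approximately -1/(2*pi)) instead of a separate
; negate.
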
; GCN-LABEL: {{^}}v_fneg_sin_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 0xbe22f983, [[A]]
; GCN: v_fract_f32_e32 [[FRACT:v[0-9]+]], [[MUL]]
; GCN: v_sin_f32_e32 [[RESULT:v[0-9]+]], [[FRACT]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %sin = call float @llvm.sin.f32(float %a)
  %fneg = fsub float -0.000000e+00, %sin
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_amdgcn_sin_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_sin_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_amdgcn_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %sin = call float @llvm.amdgcn.sin.f32(float %a)
  %fneg = fsub float -0.0, %sin
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; ftrunc tests
; --------------------------------------------------------------------------------

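; trunc is an odd function, so fneg(trunc(x)) == trunc(fneg(x)) exactly and
; the negation can always move into the v_trunc_f32 source modifier.
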
; GCN-LABEL: {{^}}v_fneg_trunc_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_trunc_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_trunc_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %trunc = call float @llvm.trunc.f32(float %a)
  %fneg = fsub float -0.0, %trunc
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; fround tests
; --------------------------------------------------------------------------------

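; llvm.round has no single GCN instruction; it expands into a
; trunc/sub/cndmask sequence, so the fneg lands on the final add of the
; expansion: negated with a xor in the safe case, folded into a subtract
; under nsz.
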
; GCN-LABEL: {{^}}v_fneg_round_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_trunc_f32_e32
; GCN: v_sub_f32_e32
; GCN: v_cndmask_b32

; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; GCN-SAFE: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x80000000, [[ADD]]

; GCN-NSZ: v_sub_f32_e64 [[RESULT:v[0-9]+]], -v{{[0-9]+}}, v{{[0-9]+}}
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_round_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %round = call float @llvm.round.f32(float %a)
  %fneg = fsub float -0.0, %round
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; rint tests
; --------------------------------------------------------------------------------

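; rint maps to v_rndne_f32 (round to nearest even), which accepts source
; modifiers, so the fneg folds into the operand.
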
; GCN-LABEL: {{^}}v_fneg_rint_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_rndne_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_rint_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %rint = call float @llvm.rint.f32(float %a)
  %fneg = fsub float -0.0, %rint
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; nearbyint tests
; --------------------------------------------------------------------------------

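; With FP exceptions ignored, nearbyint lowers to the same v_rndne_f32 as
; rint and the fneg folds identically.
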
; GCN-LABEL: {{^}}v_fneg_nearbyint_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_rndne_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_nearbyint_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %nearbyint = call float @llvm.nearbyint.f32(float %a)
  %fneg = fsub float -0.0, %nearbyint
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; fcanonicalize tests
; --------------------------------------------------------------------------------

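; canonicalize is implemented as a multiply by 1.0, so folding the fneg
; simply flips the constant to -1.0, as checked below.
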
; GCN-LABEL: {{^}}v_fneg_canonicalize_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], -1.0, [[A]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_canonicalize_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %trunc = call float @llvm.canonicalize.f32(float %a)
  %fneg = fsub float -0.0, %trunc
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; vintrp tests
; --------------------------------------------------------------------------------

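; The VINTRP encoding has no source-modifier bits, so the fneg cannot fold
; into the interpolation itself; instead it is pushed into the multiply
; that feeds both interp uses.
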
; GCN-LABEL: {{^}}v_fneg_interp_p1_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
; GCN: v_interp_p1_f32 v{{[0-9]+}}, [[MUL]]
; GCN: v_interp_p1_f32 v{{[0-9]+}}, [[MUL]]
define amdgpu_kernel void @v_fneg_interp_p1_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = fmul float %a, %b
  %fneg = fsub float -0.0, %mul
  %intrp0 = call float @llvm.amdgcn.interp.p1(float %fneg, i32 0, i32 0, i32 0)
  %intrp1 = call float @llvm.amdgcn.interp.p1(float %fneg, i32 1, i32 0, i32 0)
  store volatile float %intrp0, float addrspace(1)* %out.gep
  store volatile float %intrp1, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_interp_p2_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
; GCN: v_interp_p2_f32 v{{[0-9]+}}, [[MUL]]
; GCN: v_interp_p2_f32 v{{[0-9]+}}, [[MUL]]
define amdgpu_kernel void @v_fneg_interp_p2_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = fmul float %a, %b
  %fneg = fsub float -0.0, %mul
  %intrp0 = call float @llvm.amdgcn.interp.p2(float 4.0, float %fneg, i32 0, i32 0, i32 0)
  %intrp1 = call float @llvm.amdgcn.interp.p2(float 4.0, float %fneg, i32 1, i32 0, i32 0)
  store volatile float %intrp0, float addrspace(1)* %out.gep
  store volatile float %intrp1, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; CopyToReg tests
; --------------------------------------------------------------------------------

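; When the negated value is live across a block boundary (a CopyToReg),
; the combine should leave the multiply intact and materialize the fneg as
; a v_xor only on the path that actually uses the negated value.
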
; GCN-LABEL: {{^}}v_fneg_copytoreg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
; GCN: v_mul_f32_e32 [[MUL0:v[0-9]+]], [[A]], [[B]]
; GCN: s_cbranch_scc1

; GCN: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x80000000, [[MUL0]]
; GCN: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[XOR]], [[C]]
; GCN: buffer_store_dword [[MUL1]]

; GCN: buffer_store_dword [[MUL0]]
define amdgpu_kernel void @v_fneg_copytoreg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %mul = fmul float %a, %b
  %fneg = fsub float -0.0, %mul
  %cmp0 = icmp eq i32 %d, 0
  br i1 %cmp0, label %if, label %endif

if:
  %mul1 = fmul float %fneg, %c
  store volatile float %mul1, float addrspace(1)* %out.gep
  br label %endif

endif:
  store volatile float %mul, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; inlineasm tests
; --------------------------------------------------------------------------------

; Can't fold into use, so should fold into source
; GCN-LABEL: {{^}}v_fneg_inlineasm_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
; GCN: ; use [[MUL]]
; GCN: buffer_store_dword [[MUL]]
define amdgpu_kernel void @v_fneg_inlineasm_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %mul = fmul float %a, %b
  %fneg = fsub float -0.0, %mul
  call void asm sideeffect "; use $0", "v"(float %fneg) #0
  store volatile float %fneg, float addrspace(1)* %out.gep
  ret void
}

; Can't fold the fneg into the use, and the mul has a second user, so the
; negation must be materialized with a xor instead of folding into the source.
; GCN-LABEL: {{^}}v_fneg_inlineasm_multi_use_src_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
; GCN: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[MUL]]
; GCN: ; use [[NEG]]
; GCN: buffer_store_dword [[MUL]]
define amdgpu_kernel void @v_fneg_inlineasm_multi_use_src_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %mul = fmul float %a, %b
  %fneg = fsub float -0.0, %mul
  call void asm sideeffect "; use $0", "v"(float %fneg) #0
  store volatile float %mul, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; code size regression tests
; --------------------------------------------------------------------------------

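; VOP2 encodings are 32 bits and have no source-modifier bits, while VOP3
; encodings are 64 bits but provide neg/abs modifiers for free, so the
; profitability of folding an fneg into its users depends on which
; encodings those users already require.
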
; There are multiple users of the fneg, but each one already requires a
; VOP3 encoding, so folding the modifier into both carries no penalty
; GCN-LABEL: {{^}}multiuse_fneg_2_vop3_users_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[A]], [[B]], [[C]]
; GCN-NEXT: v_fma_f32 [[FMA1:v[0-9]+]], -[[A]], [[C]], 2.0
; GCN-NEXT: buffer_store_dword [[FMA0]]
; GCN-NEXT: buffer_store_dword [[FMA1]]
define amdgpu_kernel void @multiuse_fneg_2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep

  %fneg.a = fsub float -0.0, %a
  %fma0 = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
  %fma1 = call float @llvm.fma.f32(float %fneg.a, float %c, float 2.0)

  store volatile float %fma0, float addrspace(1)* %out
  store volatile float %fma1, float addrspace(1)* %out
  ret void
}

; There are multiple users, but both require using a larger encoding
; for the modifier.

; GCN-LABEL: {{^}}multiuse_fneg_2_vop2_users_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN: v_mul_f32_e64 [[MUL0:v[0-9]+]], -[[A]], [[B]]
; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[A]], [[C]]
; GCN-NEXT: buffer_store_dword [[MUL0]]
; GCN-NEXT: buffer_store_dword [[MUL1]]
define amdgpu_kernel void @multiuse_fneg_2_vop2_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep

  %fneg.a = fsub float -0.0, %a
  %mul0 = fmul float %fneg.a, %b
  %mul1 = fmul float %fneg.a, %c

  store volatile float %mul0, float addrspace(1)* %out
  store volatile float %mul1, float addrspace(1)* %out
  ret void
}

; One user is VOP3, so folding the modifier there is free; the other user
; is VOP2 and would need the larger encoding.
; GCN-LABEL: {{^}}multiuse_fneg_vop2_vop3_users_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[A]], [[B]], 2.0
; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[A]], [[C]]

; GCN: buffer_store_dword [[FMA0]]
; GCN-NEXT: buffer_store_dword [[MUL1]]
define amdgpu_kernel void @multiuse_fneg_vop2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep

  %fneg.a = fsub float -0.0, %a
  %fma0 = call float @llvm.fma.f32(float %fneg.a, float %b, float 2.0)
  %mul1 = fmul float %fneg.a, %c

  store volatile float %fma0, float addrspace(1)* %out
  store volatile float %mul1, float addrspace(1)* %out
  ret void
}

; The use of the fneg requires a code size increase, but folding into
; the source does not

; GCN-LABEL: {{^}}free_fold_src_code_size_cost_use_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[D:v[0-9]+]]

; GCN-SAFE: v_fma_f32 [[FMA0:v[0-9]+]], [[A]], [[B]], 2.0
; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[FMA0]], [[C]]
; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL2:v[0-9]+]], -[[FMA0]], [[D]]

; GCN-NSZ: v_fma_f32 [[FMA0:v[0-9]+]], [[A]], -[[B]], -2.0
; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[FMA0]], [[C]]
; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL2:v[0-9]+]], [[FMA0]], [[D]]

; GCN: buffer_store_dword [[MUL1]]
; GCN-NEXT: buffer_store_dword [[MUL2]]
define amdgpu_kernel void @free_fold_src_code_size_cost_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %d.gep = getelementptr inbounds float, float addrspace(1)* %d.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %d = load volatile float, float addrspace(1)* %d.gep

  %fma0 = call float @llvm.fma.f32(float %a, float %b, float 2.0)
  %fneg.fma0 = fsub float -0.0, %fma0
  %mul1 = fmul float %fneg.fma0, %c
  %mul2 = fmul float %fneg.fma0, %d

  store volatile float %mul1, float addrspace(1)* %out
  store volatile float %mul2, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}free_fold_src_code_size_cost_use_f64:
; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN: {{buffer|flat}}_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]]
; GCN: {{buffer|flat}}_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]]
; GCN: {{buffer|flat}}_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]]

; GCN: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], 2.0
; GCN-DAG: v_mul_f64 [[MUL0:v\[[0-9]+:[0-9]+\]]], -[[FMA0]], [[C]]
; GCN-DAG: v_mul_f64 [[MUL1:v\[[0-9]+:[0-9]+\]]], -[[FMA0]], [[D]]

; GCN: buffer_store_dwordx2 [[MUL0]]
; GCN: buffer_store_dwordx2 [[MUL1]]
define amdgpu_kernel void @free_fold_src_code_size_cost_use_f64(double addrspace(1)* %out, double addrspace(1)* %a.ptr, double addrspace(1)* %b.ptr, double addrspace(1)* %c.ptr, double addrspace(1)* %d.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds double, double addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds double, double addrspace(1)* %c.ptr, i64 %tid.ext
  %d.gep = getelementptr inbounds double, double addrspace(1)* %d.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %a = load volatile double, double addrspace(1)* %a.gep
  %b = load volatile double, double addrspace(1)* %b.gep
  %c = load volatile double, double addrspace(1)* %c.gep
  %d = load volatile double, double addrspace(1)* %d.gep

  %fma0 = call double @llvm.fma.f64(double %a, double %b, double 2.0)
  %fneg.fma0 = fsub double -0.0, %fma0
  %mul1 = fmul double %fneg.fma0, %c
  %mul2 = fmul double %fneg.fma0, %d

  store volatile double %mul1, double addrspace(1)* %out
  store volatile double %mul2, double addrspace(1)* %out
  ret void
}

; %trunc.a has one fneg use, but it requires a code size increase and
; the fneg can instead be folded for free into the fma.

; GCN-LABEL: {{^}}one_use_cost_to_fold_into_src_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
; GCN: v_trunc_f32_e32 [[TRUNC_A:v[0-9]+]], [[A]]
; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[TRUNC_A]], [[B]], [[C]]
; GCN: buffer_store_dword [[FMA0]]
define amdgpu_kernel void @one_use_cost_to_fold_into_src_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %d.gep = getelementptr inbounds float, float addrspace(1)* %d.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %d = load volatile float, float addrspace(1)* %d.gep

  %trunc.a = call float @llvm.trunc.f32(float %a)
  %trunc.fneg.a = fsub float -0.0, %trunc.a
  %fma0 = call float @llvm.fma.f32(float %trunc.fneg.a, float %b, float %c)
  store volatile float %fma0, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}multi_use_cost_to_fold_into_src:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[D:v[0-9]+]]
; GCN: v_trunc_f32_e32 [[TRUNC_A:v[0-9]+]], [[A]]
; GCN-DAG: v_fma_f32 [[FMA0:v[0-9]+]], -[[TRUNC_A]], [[B]], [[C]]
; GCN-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[TRUNC_A]], [[D]]
; GCN: buffer_store_dword [[FMA0]]
; GCN: buffer_store_dword [[MUL1]]
define amdgpu_kernel void @multi_use_cost_to_fold_into_src(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %d.gep = getelementptr inbounds float, float addrspace(1)* %d.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %d = load volatile float, float addrspace(1)* %d.gep

  %trunc.a = call float @llvm.trunc.f32(float %a)
  %trunc.fneg.a = fsub float -0.0, %trunc.a
  %fma0 = call float @llvm.fma.f32(float %trunc.fneg.a, float %b, float %c)
  %mul1 = fmul float %trunc.a, %d
  store volatile float %fma0, float addrspace(1)* %out
  store volatile float %mul1, float addrspace(1)* %out
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1
declare float @llvm.fma.f32(float, float, float) #1
declare float @llvm.fmuladd.f32(float, float, float) #1
declare float @llvm.sin.f32(float) #1
declare float @llvm.trunc.f32(float) #1
declare float @llvm.round.f32(float) #1
declare float @llvm.rint.f32(float) #1
declare float @llvm.nearbyint.f32(float) #1
declare float @llvm.canonicalize.f32(float) #1
declare float @llvm.minnum.f32(float, float) #1
declare float @llvm.maxnum.f32(float, float) #1

declare double @llvm.fma.f64(double, double, double) #1

declare float @llvm.amdgcn.sin.f32(float) #1
declare float @llvm.amdgcn.rcp.f32(float) #1
declare float @llvm.amdgcn.rcp.legacy(float) #1
declare float @llvm.amdgcn.fmul.legacy(float, float) #1
declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #0
declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #0

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }