1; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6,GFX6_8_9,MAD %s
2; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX6_8_9,GFX8_9,GFX8_9_10,MAD %s
3; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX6_8_9,GFX8_9,GFX8_9_10,MAD %s
4; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX8_9_10,GFX10-MAD %s
5; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GFX10,GFX8_9_10,FMA %s
6
7declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
8declare float @llvm.fabs.f32(float) nounwind readnone
9
; Baseline case: both multiplicands are loaded per work-item (indexed by the
; workitem id) and the product is added to the constant 10.0, whose float bit
; pattern is 0x41200000. The checks expect that constant to show up as the
; trailing literal operand of a single madak (or fmaak on the FMA run).
10; GCN-LABEL: {{^}}madak_f32:
11; GFX6:   buffer_load_dword [[VA:v[0-9]+]]
12; GFX6:   buffer_load_dword [[VB:v[0-9]+]]
13; GFX8: {{flat|global}}_load_dword [[VA:v[0-9]+]]
14; GFX8: {{flat|global}}_load_dword [[VB:v[0-9]+]]
15; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]]
16; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]]
17; GFX10: {{flat|global}}_load_dword [[VA:v[0-9]+]]
18; GFX10: {{flat|global}}_load_dword [[VB:v[0-9]+]]
19; MAD:   v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
20; GFX10-MAD:   v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
21; FMA:   v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
22define amdgpu_kernel void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) #0 {
23  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  ; All three pointers are indexed by the workitem id.
24  %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
25  %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
26  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
27
28  %a = load float, float addrspace(1)* %in.a.gep, align 4
29  %b = load float, float addrspace(1)* %in.b.gep, align 4
30
  ; a * b + 10.0, with the multiply having exactly one use.
31  %mul = fmul float %a, %b
32  %madak = fadd float %mul, 10.0
33  store float %madak, float addrspace(1)* %out.gep, align 4
34  ret void
35}
36
37; Make sure this is only folded with one use. This is a code size
38; optimization and if we fold the immediate multiple times, we'll undo
39; it.
;
; Here the constant 10.0 (0x41200000) is the addend of two separate
; multiply-adds that share the operand %a. The checks expect one
; madak/fmaak carrying the literal plus one mac/fmac that accumulates into a
; register holding the same constant, materialized by a v_mov_b32.
40
41; GCN-LABEL: {{^}}madak_2_use_f32:
42; GFX9:         v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
43; GFX10:        v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
44; GFX6-DAG:     buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
45; GFX6-DAG:     buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
46; GFX6-DAG:     buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
47; GFX8_9_10:    {{flat|global}}_load_dword [[VA:v[0-9]+]],
48; GFX8_9_10:    {{flat|global}}_load_dword [[VB:v[0-9]+]],
49; GFX8_9_10:    {{flat|global}}_load_dword [[VC:v[0-9]+]],
50; GFX6-DAG:     v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
51; GFX8-DAG:     v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
52; GFX6_8_9-DAG: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
53; GFX10-MAD-DAG:v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
54; FMA-DAG:      v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
55; MAD-DAG:      v_mac_f32_e32 [[VK]], [[VA]], [[VC]]
56; FMA-DAG:      v_fmac_f32_e32 [[VK]], [[VA]], [[VC]]
57; GCN:          s_endpgm
58define amdgpu_kernel void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #0 {
59  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
60
  ; Three consecutive floats starting at in[tid]; the GFX6 checks expect the
  ; second and third loads at offset:4 and offset:8.
61  %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
62  %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
63  %in.gep.2 = getelementptr float, float addrspace(1)* %in.gep.0, i32 2
64
65  %out.gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  ; NOTE(review): %out.gep.1 is derived from %in.gep.0, not from %out or
  ; %out.gep.0, so the second store below writes to the input buffer. This
  ; looks unintentional but is left untouched because altering the address
  ; could perturb the checked instruction sequence - confirm before changing.
66  %out.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
67
  ; Volatile loads: each one must be emitted as written.
68  %a = load volatile float, float addrspace(1)* %in.gep.0, align 4
69  %b = load volatile float, float addrspace(1)* %in.gep.1, align 4
70  %c = load volatile float, float addrspace(1)* %in.gep.2, align 4
71
  ; Two multiply-adds sharing %a and the same constant addend 10.0.
72  %mul0 = fmul float %a, %b
73  %mul1 = fmul float %a, %c
74  %madak0 = fadd float %mul0, 10.0
75  %madak1 = fadd float %mul1, 10.0
76
77  store volatile float %madak0, float addrspace(1)* %out.gep.0, align 4
78  store volatile float %madak1, float addrspace(1)* %out.gep.1, align 4
79  ret void
80}
81
; One multiplicand is the constant 4.0 and the addend is 10.0 (0x41200000).
; The checks expect 4.0 to be written directly as the first source operand of
; the madak/fmaak, next to the loaded value and the trailing literal.
82; GCN-LABEL: {{^}}madak_m_inline_imm_f32:
83; GCN: {{buffer|flat|global}}_load_dword [[VA:v[0-9]+]]
84; MAD: v_madak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000
85; GFX10-MAD: v_madak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000
86; FMA: v_fmaak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000
87define amdgpu_kernel void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a) #0 {
88  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
89  %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
90  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
91
92  %a = load float, float addrspace(1)* %in.a.gep, align 4
93
  ; 4.0 * a + 10.0; the constant is the first fmul operand.
94  %mul = fmul float 4.0, %a
95  %madak = fadd float %mul, 10.0
96  store float %madak, float addrspace(1)* %out.gep, align 4
97  ret void
98}
99
100; Make sure nothing weird happens with a value that is also allowed as
101; an inline immediate.
;
; The addend here is 4.0 instead of 10.0. The checks expect a plain
; v_mad_f32 / v_fma_f32 with 4.0 as the third source operand rather than the
; madak/fmaak form with a trailing literal.
102
103; GCN-LABEL: {{^}}madak_inline_imm_f32:
104; GFX6:   buffer_load_dword [[VA:v[0-9]+]]
105; GFX6:   buffer_load_dword [[VB:v[0-9]+]]
106; GFX8: {{flat|global}}_load_dword [[VA:v[0-9]+]]
107; GFX8: {{flat|global}}_load_dword [[VB:v[0-9]+]]
108; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]]
109; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]]
110; GFX10: {{flat|global}}_load_dword [[VA:v[0-9]+]]
111; GFX10: {{flat|global}}_load_dword [[VB:v[0-9]+]]
112; MAD:   v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0
113; GFX10-MAD:   v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0
114; FMA:   v_fma_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0
115define amdgpu_kernel void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) #0 {
116  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
117  %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
118  %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
119  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
120
121  %a = load float, float addrspace(1)* %in.a.gep, align 4
122  %b = load float, float addrspace(1)* %in.b.gep, align 4
123
  ; a * b + 4.0
124  %mul = fmul float %a, %b
125  %madak = fadd float %mul, 4.0
126  store float %madak, float addrspace(1)* %out.gep, align 4
127  ret void
128}
129
130; We can't use an SGPR when forming madak
;
; %a is loaded per work-item while %b is a scalar kernel argument (the checks
; expect it to arrive via s_load_dword). No run may emit v_madak_f32 here.
; The tahiti/tonga/gfx900 runs expect the constant 0x41200000 to be moved
; into a VGPR and accumulated into with v_mac_f32; the gfx1010 runs expect
; v_mad_f32 / v_fma_f32 taking the scalar and the literal as operands.
131; GCN-LABEL: {{^}}s_v_madak_f32:
132; GCN-DAG:      s_load_dword [[SB:s[0-9]+]]
133; GFX6_8_9-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
134; GCN-DAG:      {{buffer|flat|global}}_load_dword{{(_addtid)?}} [[VA:v[0-9]+]]
135; GCN-NOT:      v_madak_f32
136; GFX6_8_9:     v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
137; GFX10-MAD:    v_mad_f32 v{{[0-9]+}}, [[VA]], [[SB]], 0x41200000
138; FMA:          v_fma_f32 v{{[0-9]+}}, [[VA]], [[SB]], 0x41200000
139define amdgpu_kernel void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) #0 {
140  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
141  %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
142  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
143
144  %a = load float, float addrspace(1)* %in.a.gep, align 4
145
  ; Loaded value is the first multiplicand, the kernel argument the second.
146  %mul = fmul float %a, %b
147  %madak = fadd float %mul, 10.0
148  store float %madak, float addrspace(1)* %out.gep, align 4
149  ret void
150}
151
; Mirror of s_v_madak_f32 with the operand roles swapped: %a is the scalar
; kernel argument and %b is loaded per work-item. The tahiti/tonga/gfx900
; runs must not emit v_madak_f32 and expect a v_mov_b32 of 0x41200000
; followed by v_mac_f32. The gfx1010 runs expect v_madak_f32 / v_fmaak_f32
; with the scalar as the first source operand.
;
; The label directive is anchored with {{^}} and a trailing colon, like every
; other test in this file, so it matches only the function's label line.
152; GCN-LABEL: {{^}}v_s_madak_f32:
153; GCN-DAG:       s_load_dword [[SB:s[0-9]+]]
154; GFX6_8_9-DAG:  v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
155; GCN-DAG:       {{buffer|flat|global}}_load_dword{{(_addtid)?}} [[VA:v[0-9]+]]
156; GFX6_8_9-NOT:  v_madak_f32
157; GFX6_8_9:      v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
158; GFX10-MAD:     v_madak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000
159; FMA:           v_fmaak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000
160define amdgpu_kernel void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float addrspace(1)* noalias %in.b) #0 {
161  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
162  %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
163  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
164
165  %b = load float, float addrspace(1)* %in.b.gep, align 4
166
  ; Kernel argument is the first multiplicand, the loaded value the second.
167  %mul = fmul float %a, %b
168  %madak = fadd float %mul, 10.0
169  store float %madak, float addrspace(1)* %out.gep, align 4
170  ret void
171}
172
; Both multiplicands are scalar kernel arguments; nothing is loaded from
; memory. No run may emit v_madak_f32. The tonga/gfx900 runs expect a
; v_mac_f32_e32 with one SGPR and one VGPR source, while the gfx1010 runs
; expect the e64 encoding of mac/fmac with two SGPR sources.
173; GCN-LABEL: {{^}}s_s_madak_f32:
174; GCN-NOT: v_madak_f32
175; GFX8_9:  v_mac_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
176; GFX10-MAD: v_mac_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
177; FMA:       v_fmac_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
178define amdgpu_kernel void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) #0 {
  ; a * b + 10.0, stored straight to %out (no workitem-id indexing here).
179  %mul = fmul float %a, %b
180  %madak = fadd float %mul, 10.0
181  store float %madak, float addrspace(1)* %out, align 4
182  ret void
183}
184
; The first multiplicand goes through llvm.fabs before the multiply. The
; checks expect a v_mad_f32 / v_fma_f32 with the |...| modifier on the first
; source operand instead of the madak/fmaak form. On the gfx1010 runs the
; constant is expected as a literal operand; on the older targets it is
; expected in a register.
185; GCN-LABEL: {{^}}no_madak_src0_modifier_f32:
186; GFX6:      buffer_load_dword [[VA:v[0-9]+]]
187; GFX6:      buffer_load_dword [[VB:v[0-9]+]]
188; GFX8_9_10: {{flat|global}}_load_dword [[VB:v[0-9]+]]
189; GFX8_9_10: {{flat|global}}_load_dword [[VA:v[0-9]+]]
190; GFX6_8_9:  v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}}
191; GFX10-MAD: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, 0x41200000
192; FMA:       v_fma_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, 0x41200000
193; GCN:       s_endpgm
194define amdgpu_kernel void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) #0 {
195  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
196  %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
197  %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
198  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
199
200  %a = load float, float addrspace(1)* %in.a.gep, align 4
201  %b = load float, float addrspace(1)* %in.b.gep, align 4
202
  ; Absolute value applied to the first multiplicand only.
203  %a.fabs = call float @llvm.fabs.f32(float %a) nounwind readnone
204
  ; |a| * b + 10.0
205  %mul = fmul float %a.fabs, %b
206  %madak = fadd float %mul, 10.0
207  store float %madak, float addrspace(1)* %out.gep, align 4
208  ret void
209}
210
; Same as no_madak_src0_modifier_f32 except that llvm.fabs is applied to the
; second multiplicand. The checks expect a v_mad_f32 / v_fma_f32 with the
; |...| modifier on the second source operand instead of the madak/fmaak
; form.
211; GCN-LABEL: {{^}}no_madak_src1_modifier_f32:
212; GFX6:      buffer_load_dword [[VA:v[0-9]+]]
213; GFX6:      buffer_load_dword [[VB:v[0-9]+]]
214; GFX8_9_10: {{flat|global}}_load_dword [[VB:v[0-9]+]]
215; GFX8_9_10: {{flat|global}}_load_dword [[VA:v[0-9]+]]
216; GFX6_8_9:  v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{[sv][0-9]+}}
217; GFX10-MAD: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, 0x41200000
218; FMA:       v_fma_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, 0x41200000
219; GCN:       s_endpgm
220define amdgpu_kernel void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) #0 {
221  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
222  %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
223  %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
224  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
225
226  %a = load float, float addrspace(1)* %in.a.gep, align 4
227  %b = load float, float addrspace(1)* %in.b.gep, align 4
228
  ; Absolute value applied to the second multiplicand only.
229  %b.fabs = call float @llvm.fabs.f32(float %b) nounwind readnone
230
  ; a * |b| + 10.0
231  %mul = fmul float %a, %b.fabs
232  %madak = fadd float %mul, 10.0
233  store float %madak, float addrspace(1)* %out.gep, align 4
234  ret void
235}
236
237; SIFoldOperands should not fold the SGPR copy into the instruction before GFX10
238; because the implicit immediate already uses the constant bus.
239; On GFX10+ we can use two scalar operands.
;
; The kernel computes (%sgpr0 * 0.5 + 42.0) * <loaded value>; 42.0 has the
; float bit pattern 0x42280000 that the checks look for. The tahiti, tonga
; and gfx900 runs expect the constant in a VGPR that a v_mac_f32_e64
; accumulates into, reading the SGPR directly. The gfx1010 runs expect the
; SGPR to be copied into a VGPR first, then a madak/fmaak with the literal.
240; GCN-LABEL: {{^}}madak_constant_bus_violation:
241; GCN:       s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x12|0x48}}
242
243; GCN:       {{buffer|flat|global}}_load_dword [[VGPR:v[0-9]+]]
244; MAD:       v_mov_b32_e32 [[MADAK:v[0-9]+]], 0x42280000
245; MAD:       v_mac_f32_e64 [[MADAK]], [[SGPR0]], 0.5
246; GFX10:     v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]]
247; GFX10-MAD: v_madak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000
248; FMA:       v_fmaak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000
249; GCN:       v_mul_f32_e32 [[MUL:v[0-9]+]], [[MADAK]], [[VGPR]]
250; GFX6:      buffer_store_dword [[MUL]]
251; GFX8_9_10: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[MUL]]
; An unnamed [8 x i32] argument sits between %arg1 and %sgpr0 (presumably to
; place %sgpr0 at the kernarg offset matched above - TODO confirm). %sgpr1 is
; not used in the body.
252define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], float %sgpr0, float %sgpr1) #0 {
253bb:
  ; Branch on the scalar argument so the function has more than one block.
254  %tmp = icmp eq i32 %arg1, 0
255  br i1 %tmp, label %bb3, label %bb4
256
257bb3:
  ; Volatile store through an undef pointer; only its presence matters.
258  store volatile float 0.0, float addrspace(1)* undef
259  br label %bb4
260
261bb4:
  ; Volatile load supplies the vector operand of the final multiply.
262  %vgpr = load volatile float, float addrspace(1)* undef
  ; sgpr0 * 0.5 + 42.0, then scaled by the loaded value.
263  %tmp0 = fmul float %sgpr0, 0.5
264  %tmp1 = fadd float %tmp0, 42.0
265  %tmp2 = fmul float %tmp1, %vgpr
266  store volatile float %tmp2, float addrspace(1)* undef, align 4
267  ret void
268}
269
270attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
271