1; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
2; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
3
; Kernel test: out[tid] = (aptr[tid] + 1.0) * 0.5 under attribute #4, which
; sets "no-signed-zeros-fp-math"="false". The checks expect a v_add_f32
; followed by a separate v_mul_f32 by 0.5, with no output modifier on the add.
4; The IEEE bit is enabled for compute kernels, so omod should not be used.
5; GCN-LABEL: {{^}}v_omod_div2_f32_enable_ieee_signed_zeros:
6; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
7; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]{{$}}
8; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
9define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
  ; Index both the input and the output buffer by the workitem id.
10  %tid = call i32 @llvm.amdgcn.workitem.id.x()
11  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
12  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
13  %a = load float, float addrspace(1)* %gep0
  ; The fadd/fmul-by-0.5 pair is the pattern that could otherwise become div:2.
14  %add = fadd float %a, 1.0
15  %div2 = fmul float %add, 0.5
16  store float %div2, float addrspace(1)* %out.gep
17  ret void
18}
19
; Same kernel body as the signed-zeros variant, but with attribute #0, which
; sets "no-signed-zeros-fp-math"="true". The checks still expect a separate
; v_add_f32 and v_mul_f32.
20; The IEEE bit is enabled for compute kernels, so omod should not be used
; even though nsz is allowed.
21; GCN-LABEL: {{^}}v_omod_div2_f32_enable_ieee_nsz:
22; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
23; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]{{$}}
24; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
25define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  ; Index both the input and the output buffer by the workitem id.
26  %tid = call i32 @llvm.amdgcn.workitem.id.x()
27  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
28  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
29  %a = load float, float addrspace(1)* %gep0
  ; The fadd/fmul-by-0.5 pair is the pattern that could otherwise become div:2.
30  %add = fadd float %a, 1.0
31  %div2 = fmul float %add, 0.5
32  store float %div2, float addrspace(1)* %out.gep
33  ret void
34}
35
36; Without the IEEE bit, omod is only allowed if signed zeros are not
; significant. This function uses attribute #4
; ("no-signed-zeros-fp-math"="false"), so the checks expect the multiply by
; 0.5 to remain a separate v_mul_f32.
37; GCN-LABEL: {{^}}v_omod_div2_f32_signed_zeros:
38; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
39; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
40define amdgpu_ps void @v_omod_div2_f32_signed_zeros(float %a) #4 {
41  %add = fadd float %a, 1.0
42  %div2 = fmul float %add, 0.5
  ; The store to an undef address only keeps the result live.
43  store float %div2, float addrspace(1)* undef
44  ret void
45}
46
; Basic positive case: (a + 1.0) * 0.5 in an amdgpu_ps function with
; attribute #0 (f32 denormal mode "preserve-sign", no signed zeros). The check
; expects a single v_add_f32_e64 carrying the div:2 output modifier.
47; GCN-LABEL: {{^}}v_omod_div2_f32:
48; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 div:2{{$}}
49define amdgpu_ps void @v_omod_div2_f32(float %a) #0 {
50  %add = fadd float %a, 1.0
51  %div2 = fmul float %add, 0.5
  ; The store to an undef address only keeps the result live.
52  store float %div2, float addrspace(1)* undef
53  ret void
54}
55
; Positive case for a multiply by 2.0: the check expects a single
; v_add_f32_e64 carrying the mul:2 output modifier.
56; GCN-LABEL: {{^}}v_omod_mul2_f32:
57; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 mul:2{{$}}
58define amdgpu_ps void @v_omod_mul2_f32(float %a) #0 {
59  %add = fadd float %a, 1.0
  ; Despite its name, %div2 holds the product with 2.0 here.
60  %div2 = fmul float %add, 2.0
61  store float %div2, float addrspace(1)* undef
62  ret void
63}
64
; Positive case for a multiply by 4.0: the check expects a single
; v_add_f32_e64 carrying the mul:4 output modifier.
65; GCN-LABEL: {{^}}v_omod_mul4_f32:
66; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 mul:4{{$}}
67define amdgpu_ps void @v_omod_mul4_f32(float %a) #0 {
68  %add = fadd float %a, 1.0
  ; Despite its name, %div2 holds the product with 4.0 here.
69  %div2 = fmul float %add, 4.0
70  store float %div2, float addrspace(1)* undef
71  ret void
72}
73
; Negative case: %add has a second use (the volatile store), so the
; unmultiplied sum must stay available. The checks expect a v_add_f32
; followed by a separate v_mul_f32 by 4.0.
74; GCN-LABEL: {{^}}v_omod_mul4_multi_use_f32:
75; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
76; GCN: v_mul_f32_e32 v{{[0-9]+}}, 4.0, [[ADD]]{{$}}
77define amdgpu_ps void @v_omod_mul4_multi_use_f32(float %a) #0 {
78  %add = fadd float %a, 1.0
  ; Despite its name, %div2 holds the product with 4.0 here.
79  %div2 = fmul float %add, 4.0
80  store float %div2, float addrspace(1)* undef
  ; Extra, non-removable use of the unscaled sum.
81  store volatile float %add, float addrspace(1)* undef
82  ret void
83}
84
; %add is additionally referenced by an llvm.dbg.value call. The check still
; expects a single v_add_f32_e64 with mul:4, i.e. the debug use is expected
; not to count as an extra use that blocks the fold.
85; GCN-LABEL: {{^}}v_omod_mul4_dbg_use_f32:
86; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 mul:4{{$}}
87define amdgpu_ps void @v_omod_mul4_dbg_use_f32(float %a) #0 {
88  %add = fadd float %a, 1.0
  ; Debug-only use of %add; !4, !9 and !10 are defined at the end of the file.
89  call void @llvm.dbg.value(metadata float %add, i64 0, metadata !4, metadata !9), !dbg !10
  ; Despite its name, %div2 holds the product with 4.0 here.
90  %div2 = fmul float %add, 4.0
91  store float %div2, float addrspace(1)* undef
92  ret void
93}
94
95; Clamp is applied after omod, so folding both into one instruction is OK.
; The IR multiplies by 0.5 first and then clamps to [0.0, 1.0] via
; maxnum/minnum. The check expects one v_add_f32_e64 with both clamp and div:2.
96; GCN-LABEL: {{^}}v_clamp_omod_div2_f32:
97; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 clamp div:2{{$}}
98define amdgpu_ps void @v_clamp_omod_div2_f32(float %a) #0 {
99  %add = fadd float %a, 1.0
100  %div2 = fmul float %add, 0.5
101
  ; max(x, 0.0) followed by min(x, 1.0) forms the clamp pattern.
102  %max = call float @llvm.maxnum.f32(float %div2, float 0.0)
103  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
104  store float %clamp, float addrspace(1)* undef
105  ret void
106}
107
108; Cannot fold omod into an instruction that already clamps: here the clamp
; is applied to the sum before the multiply by 0.5, the reverse of
; v_clamp_omod_div2_f32. The checks expect the clamp on the add and a
; separate v_mul_f32 by 0.5.
109; GCN-LABEL: {{^}}v_omod_div2_clamp_f32:
110; GCN: v_add_f32_e64 [[ADD:v[0-9]+]], v0, 1.0 clamp{{$}}
111; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
112define amdgpu_ps void @v_omod_div2_clamp_f32(float %a) #0 {
113  %add = fadd float %a, 1.0
  ; max(x, 0.0) followed by min(x, 1.0) forms the clamp pattern.
114  %max = call float @llvm.maxnum.f32(float %add, float 0.0)
115  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
116  %div2 = fmul float %clamp, 0.5
117  store float %div2, float addrspace(1)* undef
118  ret void
119}
120
; Negative case: the multiply by 0.5 consumes fabs(%add), not %add itself.
; The checks expect a plain v_add_f32 and a separate v_mul_f32_e64 that
; applies the |...| source modifier to the sum.
121; GCN-LABEL: {{^}}v_omod_div2_abs_src_f32:
122; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
123; GCN: v_mul_f32_e64 v{{[0-9]+}}, |[[ADD]]|, 0.5{{$}}
124define amdgpu_ps void @v_omod_div2_abs_src_f32(float %a) #0 {
125  %add = fadd float %a, 1.0
126  %abs.add = call float @llvm.fabs.f32(float %add)
127  %div2 = fmul float %abs.add, 0.5
128  store float %div2, float addrspace(1)* undef
129  ret void
130}
131
; a + a followed by a clamp to [0.0, 1.0]. The check expects a single
; v_add_f32_e64 v0, v0 carrying only the clamp modifier.
132; GCN-LABEL: {{^}}v_omod_add_self_clamp_f32:
133; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, v0 clamp{{$}}
134define amdgpu_ps void @v_omod_add_self_clamp_f32(float %a) #0 {
135  %add = fadd float %a, %a
  ; max(x, 0.0) followed by min(x, 1.0) forms the clamp pattern.
136  %max = call float @llvm.maxnum.f32(float %add, float 0.0)
137  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
138  store float %clamp, float addrspace(1)* undef
139  ret void
140}
141
; Clamp first, then add the clamped value to itself. The checks expect the
; clamp as v_max_f32_e64 v0, v0 clamp and the doubling as a separate
; v_add_f32, not as an output modifier on the clamping instruction.
142; GCN-LABEL: {{^}}v_omod_add_clamp_self_f32:
143; GCN: v_max_f32_e64 [[CLAMP:v[0-9]+]], v0, v0 clamp{{$}}
144; GCN: v_add_f32_e32 v{{[0-9]+}}, [[CLAMP]], [[CLAMP]]{{$}}
145define amdgpu_ps void @v_omod_add_clamp_self_f32(float %a) #0 {
  ; max(x, 0.0) followed by min(x, 1.0) forms the clamp pattern.
146  %max = call float @llvm.maxnum.f32(float %a, float 0.0)
147  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
148  %add = fadd float %clamp, %clamp
149  store float %add, float addrspace(1)* undef
150  ret void
151}
152
; |x| + |x| where x = a + 1.0. The checks expect the first add to stay
; unmodified and the second add to take |x| as both source operands.
153; GCN-LABEL: {{^}}v_omod_add_abs_self_f32:
154; GCN: v_add_f32_e32 [[X:v[0-9]+]], 1.0, v0
155; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[X]]|, |[[X]]|{{$}}
156define amdgpu_ps void @v_omod_add_abs_self_f32(float %a) #0 {
157  %x = fadd float %a, 1.0
158  %abs.x = call float @llvm.fabs.f32(float %x)
159  %add = fadd float %abs.x, %abs.x
160  store float %add, float addrspace(1)* undef
161  ret void
162}
163
; |x| + x where x = a + 1.0. The two operands of the second add differ by the
; abs modifier, and the checks expect a v_add_f32_e64 with |x| as the first
; source and x as the second.
164; GCN-LABEL: {{^}}v_omod_add_abs_x_x_f32:
165
166; GCN: v_add_f32_e32 [[X:v[0-9]+]], 1.0, v0
167; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[X]]|, [[X]]{{$}}
168define amdgpu_ps void @v_omod_add_abs_x_x_f32(float %a) #0 {
169  %x = fadd float %a, 1.0
170  %abs.x = call float @llvm.fabs.f32(float %x)
171  %add = fadd float %abs.x, %x
172  store float %add, float addrspace(1)* undef
173  ret void
174}
175
; x + |x| where x = a + 1.0: the operand-swapped counterpart of
; v_omod_add_abs_x_x_f32. The checks expect a v_add_f32_e64 with x as the
; first source and |x| as the second.
176; GCN-LABEL: {{^}}v_omod_add_x_abs_x_f32:
177; GCN: v_add_f32_e32 [[X:v[0-9]+]], 1.0, v0
178; GCN: v_add_f32_e64 v{{[0-9]+}}, [[X]], |[[X]]|{{$}}
179define amdgpu_ps void @v_omod_add_x_abs_x_f32(float %a) #0 {
180  %x = fadd float %a, 1.0
181  %abs.x = call float @llvm.fabs.f32(float %x)
182  %add = fadd float %x, %abs.x
183  store float %add, float addrspace(1)* undef
184  ret void
185}
186
187; Don't fold an omod into an instruction that already has an omod.
; Two chained multiplies by 0.5: the checks expect the first to become div:2
; on the add and the second to remain a separate v_mul_f32.
188; GCN-LABEL: {{^}}v_omod_div2_omod_div2_f32:
189; GCN: v_add_f32_e64 [[ADD:v[0-9]+]], v0, 1.0 div:2{{$}}
190; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
191define amdgpu_ps void @v_omod_div2_omod_div2_f32(float %a) #0 {
192  %add = fadd float %a, 1.0
193  %div2.0 = fmul float %add, 0.5
194  %div2.1 = fmul float %div2.0, 0.5
195  store float %div2.1, float addrspace(1)* undef
196  ret void
197}
198
199; Don't fold omod if f32 denormals are enabled.
; Attribute #2 sets "denormal-fp-math-f32"="ieee,ieee". The checks expect a
; v_add_f32 followed by a separate v_mul_f32 by 0.5.
200; GCN-LABEL: {{^}}v_omod_div2_f32_denormals:
201; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
202; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
203define amdgpu_ps void @v_omod_div2_f32_denormals(float %a) #2 {
204  %add = fadd float %a, 1.0
205  %div2 = fmul float %add, 0.5
206  store float %div2, float addrspace(1)* undef
207  ret void
208}
209
210; Don't fold omod if f32 denormals are enabled, for the add form.
; The doubling is written as x + x rather than x * 2.0, and attribute #2 sets
; "denormal-fp-math-f32"="ieee,ieee". The checks expect two separate
; v_add_f32 instructions.
211; GCN-LABEL: {{^}}v_omod_mul2_f32_denormals:
212; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
213; GCN: v_add_f32_e32 v{{[0-9]+}}, [[ADD]], [[ADD]]{{$}}
214define amdgpu_ps void @v_omod_mul2_f32_denormals(float %a) #2 {
215  %add = fadd float %a, 1.0
216  %mul2 = fadd float %add, %add
217  store float %mul2, float addrspace(1)* undef
218  ret void
219}
220
221; Don't fold omod if f16 denormals are enabled.
; Attribute #0 only overrides the f32 denormal mode ("denormal-fp-math-f32"),
; so it does not request flushing for half. Instruction checks exist only for
; the fiji run (VI prefix); the tahiti run checks just the label.
222; GCN-LABEL: {{^}}v_omod_div2_f16_denormals:
223; VI: v_add_f16_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
224; VI: v_mul_f16_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
225define amdgpu_ps void @v_omod_div2_f16_denormals(half %a) #0 {
226  %add = fadd half %a, 1.0
227  %div2 = fmul half %add, 0.5
228  store half %div2, half addrspace(1)* undef
229  ret void
230}
231
232; Don't fold omod if f16 denormals are enabled, for the add form.
; The doubling is written as x + x. Attribute #0 only overrides the f32
; denormal mode, so it does not request flushing for half. Instruction checks
; exist only for the fiji run (VI prefix).
233; GCN-LABEL: {{^}}v_omod_mul2_f16_denormals:
234; VI: v_add_f16_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
235; VI: v_add_f16_e32 v{{[0-9]+}}, [[ADD]], [[ADD]]{{$}}
236define amdgpu_ps void @v_omod_mul2_f16_denormals(half %a) #0 {
237  %add = fadd half %a, 1.0
238  %mul2 = fadd half %add, %add
239  store half %mul2, half addrspace(1)* undef
240  ret void
241}
242
; Positive f16 case: attribute #3 sets the generic "denormal-fp-math" mode to
; "preserve-sign,preserve-sign". On the fiji run the checks expect a single
; v_add_f16_e64 with div:2, and the preceding NOT directive requires that no
; text containing "v0" appears between the label and that instruction.
243; GCN-LABEL: {{^}}v_omod_div2_f16_no_denormals:
244; VI-NOT: v0
245; VI: v_add_f16_e64 [[ADD:v[0-9]+]], v0, 1.0 div:2{{$}}
246define amdgpu_ps void @v_omod_div2_f16_no_denormals(half %a) #3 {
247  %add = fadd half %a, 1.0
248  %div2 = fmul half %add, 0.5
249  store half %div2, half addrspace(1)* undef
250  ret void
251}
252
; (a * a + b) * 2.0, with the result multiplied by %b again so that %b has a
; use after the multiply-add. The check expects the multiply-add to be emitted
; as the three-operand v_mad_f32 form carrying the mul:2 output modifier.
; Every register operand is matched with v{{[0-9]+}} so the check does not
; depend on the allocator picking single-digit registers.
253; GCN-LABEL: {{^}}v_omod_mac_to_mad:
254; GCN: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} mul:2{{$}}
255define amdgpu_ps void @v_omod_mac_to_mad(float %b, float %a) #0 {
256  %mul = fmul float %a, %a
257  %add = fadd float %mul, %b
  ; Multiply by 2.0 is the candidate for the mul:2 output modifier.
258  %mad = fmul float %add, 2.0
  ; Second use of %b after the multiply-add.
259  %res = fmul float %mad, %b
260  store float %res, float addrspace(1)* undef
261  ret void
262}
263
; Intrinsic declarations. All use attribute group #1 (nounwind readnone).
; NOTE(review): llvm.floor.f32, llvm.amdgcn.fmed3.f32 and the f64/f16
; fabs/minnum/maxnum variants are not referenced by any function above;
; confirm whether they are still needed.
264declare i32 @llvm.amdgcn.workitem.id.x() #1
265declare float @llvm.fabs.f32(float) #1
266declare float @llvm.floor.f32(float) #1
267declare float @llvm.minnum.f32(float, float) #1
268declare float @llvm.maxnum.f32(float, float) #1
269declare float @llvm.amdgcn.fmed3.f32(float, float, float) #1
270declare double @llvm.fabs.f64(double) #1
271declare double @llvm.minnum.f64(double, double) #1
272declare double @llvm.maxnum.f64(double, double) #1
273declare half @llvm.fabs.f16(half) #1
274declare half @llvm.minnum.f16(half, half) #1
275declare half @llvm.maxnum.f16(half, half) #1
; Four-operand form (value, i64 offset, variable, expression), matching the
; call in v_omod_mul4_dbg_use_f32.
276declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
277
; Attribute groups used by the tests above.
; #0 - f32 denormal mode "preserve-sign", signed zeros may be ignored.
278attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-signed-zeros-fp-math"="true" }
; #1 - used only on the intrinsic declarations.
279attributes #1 = { nounwind readnone }
; #2 - f32 denormal mode "ieee", signed zeros may be ignored.
280attributes #2 = { nounwind "denormal-fp-math-f32"="ieee,ieee" "no-signed-zeros-fp-math"="true" }
; #3 - generic (not f32-specific) denormal mode "preserve-sign", signed zeros
; may be ignored; used by the f16 no-denormals test.
281attributes #3 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" "no-signed-zeros-fp-math"="true" }
; #4 - signed zeros are significant; no denormal mode is specified.
282attributes #4 = { nounwind "no-signed-zeros-fp-math"="false" }
283
; Minimal debug-info metadata backing the llvm.dbg.value call in
; v_omod_mul4_dbg_use_f32, which references !4 (variable), !9 (empty
; expression) and !10 (location).
284!llvm.dbg.cu = !{!0}
285!llvm.module.flags = !{!2, !3}
286
287!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug)
288!1 = !DIFile(filename: "/tmp/foo.cl", directory: "/dev/null")
289!2 = !{i32 2, !"Dwarf Version", i32 4}
290!3 = !{i32 2, !"Debug Info Version", i32 3}
; Local variable "add" described by the dbg.value call.
291!4 = !DILocalVariable(name: "add", arg: 1, scope: !5, file: !1, line: 1)
292!5 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
; Subroutine type for !5, built from the type list !7 = {null, !8 (float)}.
293!6 = !DISubroutineType(types: !7)
294!7 = !{null, !8}
295!8 = !DIBasicType(name: "float", size: 32, align: 32)
296!9 = !DIExpression()
297!10 = !DILocation(line: 1, column: 42, scope: !5)
298