1; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
2; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,VI %s
3; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX9 %s
4
5; GCN-LABEL: {{^}}v_clamp_f32:
6; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
7; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
; Per-workitem clamp of a loaded float to [0.0, 1.0], written as
; minnum(maxnum(x, 0.0), 1.0).
define amdgpu_kernel void @v_clamp_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %dst = getelementptr float, float addrspace(1)* %out, i32 %lane
  %x = load float, float addrspace(1)* %src
  %floored = call float @llvm.maxnum.f32(float %x, float 0.0)
  %clamped = call float @llvm.minnum.f32(float %floored, float 1.0)
  store float %clamped, float addrspace(1)* %dst
  ret void
}
19
20; GCN-LABEL: {{^}}v_clamp_neg_f32:
21; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
22; GCN: v_max_f32_e64 v{{[0-9]+}}, -[[A]], -[[A]] clamp{{$}}
; Same [0.0, 1.0] clamp pattern, but the clamped value is the negation of
; the loaded float (negation spelled as fsub from -0.0).
define amdgpu_kernel void @v_clamp_neg_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %dst = getelementptr float, float addrspace(1)* %out, i32 %lane
  %x = load float, float addrspace(1)* %src
  %neg.x = fsub float -0.0, %x
  %floored = call float @llvm.maxnum.f32(float %neg.x, float 0.0)
  %clamped = call float @llvm.minnum.f32(float %floored, float 1.0)
  store float %clamped, float addrspace(1)* %dst
  ret void
}
35
36; GCN-LABEL: {{^}}v_clamp_negabs_f32:
37; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
38; GCN: v_max_f32_e64 v{{[0-9]+}}, -|[[A]]|, -|[[A]]| clamp{{$}}
; Clamp of -|x| to [0.0, 1.0]: fabs followed by an fsub-from-negative-zero
; negation feeds the maxnum/minnum pair.
define amdgpu_kernel void @v_clamp_negabs_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %dst = getelementptr float, float addrspace(1)* %out, i32 %lane
  %x = load float, float addrspace(1)* %src
  %abs.x = call float @llvm.fabs.f32(float %x)
  %neg.abs.x = fsub float -0.0, %abs.x
  %floored = call float @llvm.maxnum.f32(float %neg.abs.x, float 0.0)
  %clamped = call float @llvm.minnum.f32(float %floored, float 1.0)
  store float %clamped, float addrspace(1)* %dst
  ret void
}
53
54; GCN-LABEL: {{^}}v_clamp_negzero_f32:
55; GCN-DAG: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
56; GCN-DAG: v_bfrev_b32_e32 [[SIGNBIT:v[0-9]+]], 1
57; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[SIGNBIT]], 1.0
; Variant whose lower bound is -0.0 rather than +0.0; the upper bound is
; still 1.0.
define amdgpu_kernel void @v_clamp_negzero_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %dst = getelementptr float, float addrspace(1)* %out, i32 %lane
  %x = load float, float addrspace(1)* %src
  %floored = call float @llvm.maxnum.f32(float %x, float -0.0)
  %clamped = call float @llvm.minnum.f32(float %floored, float 1.0)
  store float %clamped, float addrspace(1)* %dst
  ret void
}
69
70; GCN-LABEL: {{^}}v_clamp_multi_use_max_f32:
71; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
72; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]]
73; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]]
; The intermediate maxnum result has a second user: besides feeding the
; minnum, it is also written out through a volatile store.
define amdgpu_kernel void @v_clamp_multi_use_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %dst = getelementptr float, float addrspace(1)* %out, i32 %lane
  %x = load float, float addrspace(1)* %src
  %floored = call float @llvm.maxnum.f32(float %x, float 0.0)
  %clamped = call float @llvm.minnum.f32(float %floored, float 1.0)
  store float %clamped, float addrspace(1)* %dst
  ; Extra use of the maxnum value.
  store volatile float %floored, float addrspace(1)* undef
  ret void
}
86
87; GCN-LABEL: {{^}}v_clamp_f16:
88; GCN: {{buffer|flat|global}}_load_ushort [[A:v[0-9]+]]
89; GFX89: v_max_f16_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
90
91; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], [[A]] clamp{{$}}
92; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]]
; Half-precision form of the [0.0, 1.0] clamp: minnum(maxnum(x, 0.0), 1.0)
; on a per-workitem loaded half.
define amdgpu_kernel void @v_clamp_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src = getelementptr half, half addrspace(1)* %aptr, i32 %lane
  %dst = getelementptr half, half addrspace(1)* %out, i32 %lane
  %x = load half, half addrspace(1)* %src
  %floored = call half @llvm.maxnum.f16(half %x, half 0.0)
  %clamped = call half @llvm.minnum.f16(half %floored, half 1.0)
  store half %clamped, half addrspace(1)* %dst
  ret void
}
104
105; GCN-LABEL: {{^}}v_clamp_neg_f16:
106; GCN: {{buffer|flat|global}}_load_ushort [[A:v[0-9]+]]
107; GFX89: v_max_f16_e64 v{{[0-9]+}}, -[[A]], -[[A]] clamp{{$}}
108
109; FIXME: Better to fold neg into max
110; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]] clamp{{$}}
111; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]]
; Half-precision clamp to [0.0, 1.0] applied to the negated input.
define amdgpu_kernel void @v_clamp_neg_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src = getelementptr half, half addrspace(1)* %aptr, i32 %lane
  %dst = getelementptr half, half addrspace(1)* %out, i32 %lane
  %x = load half, half addrspace(1)* %src
  %neg.x = fsub half -0.0, %x
  %floored = call half @llvm.maxnum.f16(half %neg.x, half 0.0)
  %clamped = call half @llvm.minnum.f16(half %floored, half 1.0)
  store half %clamped, half addrspace(1)* %dst
  ret void
}
124
125; GCN-LABEL: {{^}}v_clamp_negabs_f16:
126; GCN: {{buffer|flat|global}}_load_ushort [[A:v[0-9]+]]
127; GFX89: v_max_f16_e64 v{{[0-9]+}}, -|[[A]]|, -|[[A]]| clamp{{$}}
128
129; FIXME: Better to fold neg/abs into max
130
131; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -|[[A]]| clamp{{$}}
132; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]]
; Half-precision clamp to [0.0, 1.0] applied to -|x|.
define amdgpu_kernel void @v_clamp_negabs_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src = getelementptr half, half addrspace(1)* %aptr, i32 %lane
  %dst = getelementptr half, half addrspace(1)* %out, i32 %lane
  %x = load half, half addrspace(1)* %src
  %abs.x = call half @llvm.fabs.f16(half %x)
  %neg.abs.x = fsub half -0.0, %abs.x
  %floored = call half @llvm.maxnum.f16(half %neg.abs.x, half 0.0)
  %clamped = call half @llvm.minnum.f16(half %floored, half 1.0)
  store half %clamped, half addrspace(1)* %dst
  ret void
}
147
148; FIXME: Do f64 instructions support clamp?
149; GCN-LABEL: {{^}}v_clamp_f64:
150; GCN: {{buffer|flat|global}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
151; GCN: v_max_f64 v{{\[[0-9]+:[0-9]+\]}}, [[A]], [[A]] clamp{{$}}
; Double-precision form of the [0.0, 1.0] clamp: minnum(maxnum(x, 0.0), 1.0)
; on a per-workitem loaded double.
define amdgpu_kernel void @v_clamp_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src = getelementptr double, double addrspace(1)* %aptr, i32 %lane
  %dst = getelementptr double, double addrspace(1)* %out, i32 %lane
  %x = load double, double addrspace(1)* %src
  %floored = call double @llvm.maxnum.f64(double %x, double 0.0)
  %clamped = call double @llvm.minnum.f64(double %floored, double 1.0)
  store double %clamped, double addrspace(1)* %dst
  ret void
}
163
164; GCN-LABEL: {{^}}v_clamp_neg_f64:
165; GCN: {{buffer|flat|global}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
166; GCN: v_max_f64 v{{\[[0-9]+:[0-9]+\]}}, -[[A]], -[[A]] clamp{{$}}
; Double-precision clamp to [0.0, 1.0] applied to the negated input.
define amdgpu_kernel void @v_clamp_neg_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src = getelementptr double, double addrspace(1)* %aptr, i32 %lane
  %dst = getelementptr double, double addrspace(1)* %out, i32 %lane
  %x = load double, double addrspace(1)* %src
  %neg.x = fsub double -0.0, %x
  %floored = call double @llvm.maxnum.f64(double %neg.x, double 0.0)
  %clamped = call double @llvm.minnum.f64(double %floored, double 1.0)
  store double %clamped, double addrspace(1)* %dst
  ret void
}
179
180; GCN-LABEL: {{^}}v_clamp_negabs_f64:
181; GCN: {{buffer|flat|global}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
182; GCN: v_max_f64 v{{\[[0-9]+:[0-9]+\]}}, -|[[A]]|, -|[[A]]| clamp{{$}}
; Double-precision clamp to [0.0, 1.0] applied to -|x|.
define amdgpu_kernel void @v_clamp_negabs_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src = getelementptr double, double addrspace(1)* %aptr, i32 %lane
  %dst = getelementptr double, double addrspace(1)* %out, i32 %lane
  %x = load double, double addrspace(1)* %src
  %abs.x = call double @llvm.fabs.f64(double %x)
  %neg.abs.x = fsub double -0.0, %abs.x
  %floored = call double @llvm.maxnum.f64(double %neg.abs.x, double 0.0)
  %clamped = call double @llvm.minnum.f64(double %floored, double 1.0)
  store double %clamped, double addrspace(1)* %dst
  ret void
}
197
198; GCN-LABEL: {{^}}v_clamp_med3_aby_negzero_f32:
199; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
200; GCN: v_med3_f32
; fmed3 intrinsic with operands (-0.0, 1.0, x): the zero bound is negative
; zero here.
define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %dst = getelementptr float, float addrspace(1)* %out, i32 %lane
  %x = load float, float addrspace(1)* %src
  %mid = call float @llvm.amdgcn.fmed3.f32(float -0.0, float 1.0, float %x)
  store float %mid, float addrspace(1)* %dst
  ret void
}
210
211; GCN-LABEL: {{^}}v_clamp_med3_aby_f32:
212; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
213; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
; fmed3 intrinsic with operand order (0.0, 1.0, x).
define amdgpu_kernel void @v_clamp_med3_aby_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %dst = getelementptr float, float addrspace(1)* %out, i32 %lane
  %x = load float, float addrspace(1)* %src
  %mid = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float %x)
  store float %mid, float addrspace(1)* %dst
  ret void
}
223
224; GCN-LABEL: {{^}}v_clamp_med3_bay_f32:
225; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
226; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
; fmed3 intrinsic with operand order (1.0, 0.0, x).
define amdgpu_kernel void @v_clamp_med3_bay_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %dst = getelementptr float, float addrspace(1)* %out, i32 %lane
  %x = load float, float addrspace(1)* %src
  %mid = call float @llvm.amdgcn.fmed3.f32(float 1.0, float 0.0, float %x)
  store float %mid, float addrspace(1)* %dst
  ret void
}
236
237; GCN-LABEL: {{^}}v_clamp_med3_yab_f32:
238; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
239; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
; fmed3 intrinsic with operand order (x, 0.0, 1.0).
define amdgpu_kernel void @v_clamp_med3_yab_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %dst = getelementptr float, float addrspace(1)* %out, i32 %lane
  %x = load float, float addrspace(1)* %src
  %mid = call float @llvm.amdgcn.fmed3.f32(float %x, float 0.0, float 1.0)
  store float %mid, float addrspace(1)* %dst
  ret void
}
249
250; GCN-LABEL: {{^}}v_clamp_med3_yba_f32:
251; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
252; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
; fmed3 intrinsic with operand order (x, 1.0, 0.0).
define amdgpu_kernel void @v_clamp_med3_yba_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %dst = getelementptr float, float addrspace(1)* %out, i32 %lane
  %x = load float, float addrspace(1)* %src
  %mid = call float @llvm.amdgcn.fmed3.f32(float %x, float 1.0, float 0.0)
  store float %mid, float addrspace(1)* %dst
  ret void
}
262
263; GCN-LABEL: {{^}}v_clamp_med3_ayb_f32:
264; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
265; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
; fmed3 intrinsic with operand order (0.0, x, 1.0).
define amdgpu_kernel void @v_clamp_med3_ayb_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %dst = getelementptr float, float addrspace(1)* %out, i32 %lane
  %x = load float, float addrspace(1)* %src
  %mid = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %x, float 1.0)
  store float %mid, float addrspace(1)* %dst
  ret void
}
275
276; GCN-LABEL: {{^}}v_clamp_med3_bya_f32:
277; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
278; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
; fmed3 intrinsic with operand order (1.0, x, 0.0).
define amdgpu_kernel void @v_clamp_med3_bya_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %dst = getelementptr float, float addrspace(1)* %out, i32 %lane
  %x = load float, float addrspace(1)* %src
  %mid = call float @llvm.amdgcn.fmed3.f32(float 1.0, float %x, float 0.0)
  store float %mid, float addrspace(1)* %dst
  ret void
}
288
289; GCN-LABEL: {{^}}v_clamp_constants_to_one_f32:
290; GCN: v_mov_b32_e32 v{{[0-9]+}}, 1.0
; All-constant fmed3(0.0, 1.0, 4.0): the third operand lies above the
; [0.0, 1.0] range.
define amdgpu_kernel void @v_clamp_constants_to_one_f32(float addrspace(1)* %out) #0 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %dst = getelementptr float, float addrspace(1)* %out, i32 %lane
  %mid = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 4.0)
  store float %mid, float addrspace(1)* %dst
  ret void
}
298
299; GCN-LABEL: {{^}}v_clamp_constants_to_zero_f32:
300; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
; All-constant fmed3(0.0, 1.0, -4.0): the third operand lies below the
; [0.0, 1.0] range.
define amdgpu_kernel void @v_clamp_constants_to_zero_f32(float addrspace(1)* %out) #0 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %dst = getelementptr float, float addrspace(1)* %out, i32 %lane
  %mid = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float -4.0)
  store float %mid, float addrspace(1)* %dst
  ret void
}
308
309; GCN-LABEL: {{^}}v_clamp_constant_preserve_f32:
310; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0.5
; All-constant fmed3(0.0, 1.0, 0.5): the third operand is already inside
; the [0.0, 1.0] range.
define amdgpu_kernel void @v_clamp_constant_preserve_f32(float addrspace(1)* %out) #0 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %dst = getelementptr float, float addrspace(1)* %out, i32 %lane
  %mid = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0.5)
  store float %mid, float addrspace(1)* %dst
  ret void
}
318
319; GCN-LABEL: {{^}}v_clamp_constant_preserve_denorm_f32:
320; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7fffff{{$}}
; All-constant fmed3 whose third operand is the float with bit pattern
; 8388607 (0x7fffff), i.e. a zero exponent field and an all-ones mantissa.
define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(float addrspace(1)* %out) #0 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %dst = getelementptr float, float addrspace(1)* %out, i32 %lane
  %mid = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 8388607 to float))
  store float %mid, float addrspace(1)* %dst
  ret void
}
328
329; GCN-LABEL: {{^}}v_clamp_constant_qnan_f32:
330; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
; All-constant fmed3 whose third operand is a quiet NaN
; (0x7FF8000000000000 in IR hexadecimal float notation).
define amdgpu_kernel void @v_clamp_constant_qnan_f32(float addrspace(1)* %out) #0 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %dst = getelementptr float, float addrspace(1)* %out, i32 %lane
  %mid = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0x7FF8000000000000)
  store float %mid, float addrspace(1)* %dst
  ret void
}
338
339; GCN-LABEL: {{^}}v_clamp_constant_snan_f32:
340; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
; All-constant fmed3 whose third operand is the float with bit pattern
; 2139095041 (0x7f800001), a signaling NaN encoding.
define amdgpu_kernel void @v_clamp_constant_snan_f32(float addrspace(1)* %out) #0 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %dst = getelementptr float, float addrspace(1)* %out, i32 %lane
  %mid = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 2139095041 to float))
  store float %mid, float addrspace(1)* %dst
  ret void
}
348
349; ---------------------------------------------------------------------
350; Test non-default behaviors enabling snans and disabling dx10_clamp
351; ---------------------------------------------------------------------
352
353; GCN-LABEL: {{^}}v_clamp_f32_no_dx10_clamp:
354; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
355; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 0, 1.0
; minnum(maxnum(x, 0.0), 1.0) compiled under attribute group #2, which
; turns off both the dx10-clamp and fp-exceptions target features.
define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %dst = getelementptr float, float addrspace(1)* %out, i32 %lane
  %x = load float, float addrspace(1)* %src
  %floored = call float @llvm.maxnum.f32(float %x, float 0.0)
  %clamped = call float @llvm.minnum.f32(float %floored, float 1.0)
  store float %clamped, float addrspace(1)* %dst
  ret void
}
367
368; GCN-LABEL: {{^}}v_clamp_f32_snan_dx10clamp:
369; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
370; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
; minnum(maxnum(x, 0.0), 1.0) compiled under attribute group #3, which
; turns on both the dx10-clamp and fp-exceptions target features.
define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #3 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %dst = getelementptr float, float addrspace(1)* %out, i32 %lane
  %x = load float, float addrspace(1)* %src
  %floored = call float @llvm.maxnum.f32(float %x, float 0.0)
  %clamped = call float @llvm.minnum.f32(float %floored, float 1.0)
  store float %clamped, float addrspace(1)* %dst
  ret void
}
382
383; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp:
384; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
385; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]]
386; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]]
; minnum(maxnum(x, 0.0), 1.0) compiled under attribute group #4, which
; turns dx10-clamp off while keeping fp-exceptions on.
define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %dst = getelementptr float, float addrspace(1)* %out, i32 %lane
  %x = load float, float addrspace(1)* %src
  %floored = call float @llvm.maxnum.f32(float %x, float 0.0)
  %clamped = call float @llvm.minnum.f32(float %floored, float 1.0)
  store float %clamped, float addrspace(1)* %dst
  ret void
}
398
399; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp_nnan_src:
400; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
401; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]
402; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 0, 1.0
; Attribute group #4 again (dx10-clamp off, fp-exceptions on), but the
; clamped value comes from an fadd carrying the nnan fast-math flag.
define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %dst = getelementptr float, float addrspace(1)* %out, i32 %lane
  %x = load float, float addrspace(1)* %src
  %x.plus.one = fadd nnan float %x, 1.0
  %floored = call float @llvm.maxnum.f32(float %x.plus.one, float 0.0)
  %clamped = call float @llvm.minnum.f32(float %floored, float 1.0)
  store float %clamped, float addrspace(1)* %dst
  ret void
}
415
416; GCN-LABEL: {{^}}v_clamp_med3_aby_f32_no_dx10_clamp:
417; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
418; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
; fmed3 with operand order (0.0, 1.0, x), compiled under attribute group #2
; (dx10-clamp and fp-exceptions both off).
define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %dst = getelementptr float, float addrspace(1)* %out, i32 %lane
  %x = load float, float addrspace(1)* %src
  %mid = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float %x)
  store float %mid, float addrspace(1)* %dst
  ret void
}
428
429; GCN-LABEL: {{^}}v_clamp_med3_bay_f32_no_dx10_clamp:
430; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
431; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
; fmed3 with operand order (1.0, 0.0, x), compiled under attribute group #2
; (dx10-clamp and fp-exceptions both off).
define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %dst = getelementptr float, float addrspace(1)* %out, i32 %lane
  %x = load float, float addrspace(1)* %src
  %mid = call float @llvm.amdgcn.fmed3.f32(float 1.0, float 0.0, float %x)
  store float %mid, float addrspace(1)* %dst
  ret void
}
441
442; GCN-LABEL: {{^}}v_clamp_med3_yab_f32_no_dx10_clamp:
443; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
444; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 0, 1.0
; fmed3 with operand order (x, 0.0, 1.0), compiled under attribute group #2
; (dx10-clamp and fp-exceptions both off).
define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %dst = getelementptr float, float addrspace(1)* %out, i32 %lane
  %x = load float, float addrspace(1)* %src
  %mid = call float @llvm.amdgcn.fmed3.f32(float %x, float 0.0, float 1.0)
  store float %mid, float addrspace(1)* %dst
  ret void
}
454
455; GCN-LABEL: {{^}}v_clamp_med3_yba_f32_no_dx10_clamp:
456; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
457; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 1.0, 0
; fmed3 with operand order (x, 1.0, 0.0), compiled under attribute group #2
; (dx10-clamp and fp-exceptions both off).
define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %dst = getelementptr float, float addrspace(1)* %out, i32 %lane
  %x = load float, float addrspace(1)* %src
  %mid = call float @llvm.amdgcn.fmed3.f32(float %x, float 1.0, float 0.0)
  store float %mid, float addrspace(1)* %dst
  ret void
}
467
468; GCN-LABEL: {{^}}v_clamp_med3_ayb_f32_no_dx10_clamp:
469; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
470; GCN: v_med3_f32 v{{[0-9]+}}, 0, [[A]], 1.0
; fmed3 with operand order (0.0, x, 1.0), compiled under attribute group #2
; (dx10-clamp and fp-exceptions both off).
define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %dst = getelementptr float, float addrspace(1)* %out, i32 %lane
  %x = load float, float addrspace(1)* %src
  %mid = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %x, float 1.0)
  store float %mid, float addrspace(1)* %dst
  ret void
}
480
481; GCN-LABEL: {{^}}v_clamp_med3_bya_f32_no_dx10_clamp:
482; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
483; GCN: v_med3_f32 v{{[0-9]+}}, 1.0, [[A]], 0
; fmed3 with operand order (1.0, x, 0.0), compiled under attribute group #2
; (dx10-clamp and fp-exceptions both off).
define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %dst = getelementptr float, float addrspace(1)* %out, i32 %lane
  %x = load float, float addrspace(1)* %src
  %mid = call float @llvm.amdgcn.fmed3.f32(float 1.0, float %x, float 0.0)
  store float %mid, float addrspace(1)* %dst
  ret void
}
493
494; GCN-LABEL: {{^}}v_clamp_constant_qnan_f32_no_dx10_clamp:
495; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7fc00000
; All-constant fmed3 with a quiet NaN third operand, compiled under
; attribute group #2 (dx10-clamp and fp-exceptions both off).
define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(float addrspace(1)* %out) #2 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %dst = getelementptr float, float addrspace(1)* %out, i32 %lane
  %mid = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0x7FF8000000000000)
  store float %mid, float addrspace(1)* %dst
  ret void
}
503
504; GCN-LABEL: {{^}}v_clamp_constant_snan_f32_no_dx10_clamp:
505; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7f800001
; All-constant fmed3 whose third operand has bit pattern 2139095041
; (0x7f800001, a signaling NaN encoding), compiled under attribute group #2
; (dx10-clamp and fp-exceptions both off).
define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(float addrspace(1)* %out) #2 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %dst = getelementptr float, float addrspace(1)* %out, i32 %lane
  %mid = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 2139095041 to float))
  store float %mid, float addrspace(1)* %dst
  ret void
}
513
514; GCN-LABEL: {{^}}v_clamp_v2f16:
515; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
516; GFX9-NOT: [[A]]
517; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] clamp{{$}}
; Packed <2 x half> clamp: both lanes use 0.0 as the lower bound and 1.0 as
; the upper bound.
define amdgpu_kernel void @v_clamp_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %lane
  %dst = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %lane
  %x = load <2 x half>, <2 x half> addrspace(1)* %src
  %floored = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %x, <2 x half> zeroinitializer)
  %clamped = call <2 x half> @llvm.minnum.v2f16(<2 x half> %floored, <2 x half> <half 1.0, half 1.0>)
  store <2 x half> %clamped, <2 x half> addrspace(1)* %dst
  ret void
}
529
530; GCN-LABEL: {{^}}v_clamp_v2f16_undef_elt:
531; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
532; GFX9-NOT: [[A]]
533; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] clamp{{$}}
; Packed clamp where each bound vector has one undef element: the lower
; bound is <undef, 0.0> and the upper bound is <1.0, undef>.
define amdgpu_kernel void @v_clamp_v2f16_undef_elt(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %lane
  %dst = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %lane
  %x = load <2 x half>, <2 x half> addrspace(1)* %src
  %floored = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %x, <2 x half> <half undef, half 0.0>)
  %clamped = call <2 x half> @llvm.minnum.v2f16(<2 x half> %floored, <2 x half> <half 1.0, half undef>)
  store <2 x half> %clamped, <2 x half> addrspace(1)* %dst
  ret void
}
545
546; GCN-LABEL: {{^}}v_clamp_v2f16_not_zero:
547; GFX9: v_pk_max_f16
548; GFX9: v_pk_min_f16
; Packed min/max pair whose lower bound is <2.0, 0.0>, so element 0 is not
; bounded below by zero.
define amdgpu_kernel void @v_clamp_v2f16_not_zero(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %lane
  %dst = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %lane
  %x = load <2 x half>, <2 x half> addrspace(1)* %src
  %floored = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %x, <2 x half> <half 2.0, half 0.0>)
  %capped = call <2 x half> @llvm.minnum.v2f16(<2 x half> %floored, <2 x half> <half 1.0, half 1.0>)
  store <2 x half> %capped, <2 x half> addrspace(1)* %dst
  ret void
}
560
561; GCN-LABEL: {{^}}v_clamp_v2f16_not_one:
562; GFX9: v_pk_max_f16
563; GFX9: v_pk_min_f16
; Packed min/max pair whose upper bound is <0.0, 1.0>, so element 0 is not
; bounded above by one.
define amdgpu_kernel void @v_clamp_v2f16_not_one(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %lane
  %dst = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %lane
  %x = load <2 x half>, <2 x half> addrspace(1)* %src
  %floored = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %x, <2 x half> <half 0.0, half 0.0>)
  %capped = call <2 x half> @llvm.minnum.v2f16(<2 x half> %floored, <2 x half> <half 0.0, half 1.0>)
  store <2 x half> %capped, <2 x half> addrspace(1)* %dst
  ret void
}
575
576; GCN-LABEL: {{^}}v_clamp_neg_v2f16:
577; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
578; GFX9-NOT: [[A]]
579; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] neg_lo:[1,1] neg_hi:[1,1] clamp{{$}}
; Packed clamp to [0.0, 1.0] of the input with both elements negated
; (vector fsub from <-0.0, -0.0>).
define amdgpu_kernel void @v_clamp_neg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %lane
  %dst = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %lane
  %x = load <2 x half>, <2 x half> addrspace(1)* %src
  %neg.x = fsub <2 x half> <half -0.0, half -0.0>, %x
  %floored = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %neg.x, <2 x half> zeroinitializer)
  %clamped = call <2 x half> @llvm.minnum.v2f16(<2 x half> %floored, <2 x half> <half 1.0, half 1.0>)
  store <2 x half> %clamped, <2 x half> addrspace(1)* %dst
  ret void
}
592
593; GCN-LABEL: {{^}}v_clamp_negabs_v2f16:
594; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
595; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, [[A]]
596; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[ABS]], [[ABS]] neg_lo:[1,1] neg_hi:[1,1] clamp{{$}}
; Packed clamp to [0.0, 1.0] of -|x|, with fabs and the negation applied to
; both elements.
define amdgpu_kernel void @v_clamp_negabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %lane
  %dst = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %lane
  %x = load <2 x half>, <2 x half> addrspace(1)* %src
  %abs.x = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x)
  %neg.abs.x = fsub <2 x half> <half -0.0, half -0.0>, %abs.x
  %floored = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %neg.abs.x, <2 x half> zeroinitializer)
  %clamped = call <2 x half> @llvm.minnum.v2f16(<2 x half> %floored, <2 x half> <half 1.0, half 1.0>)
  store <2 x half> %clamped, <2 x half> addrspace(1)* %dst
  ret void
}
611
612; GCN-LABEL: {{^}}v_clamp_neglo_v2f16:
613; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
614; GFX9-NOT: [[A]]
615; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] neg_lo:[1,1] clamp{{$}}
; Packed clamp to [0.0, 1.0] where only element 0 of the input is negated;
; element 1 passes through unchanged.
define amdgpu_kernel void @v_clamp_neglo_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %lane
  %dst = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %lane
  %x = load <2 x half>, <2 x half> addrspace(1)* %src
  %elt0 = extractelement <2 x half> %x, i32 0
  %neg.elt0 = fsub half -0.0, %elt0
  %x.neglo = insertelement <2 x half> %x, half %neg.elt0, i32 0
  %floored = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %x.neglo, <2 x half> zeroinitializer)
  %clamped = call <2 x half> @llvm.minnum.v2f16(<2 x half> %floored, <2 x half> <half 1.0, half 1.0>)
  store <2 x half> %clamped, <2 x half> addrspace(1)* %dst
  ret void
}
630
631; GCN-LABEL: {{^}}v_clamp_neghi_v2f16:
632; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
633; GFX9-NOT: [[A]]
634; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] neg_hi:[1,1] clamp{{$}}
; Packed clamp to [0.0, 1.0] where only element 1 of the input is negated;
; element 0 passes through unchanged.
define amdgpu_kernel void @v_clamp_neghi_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %lane
  %dst = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %lane
  %x = load <2 x half>, <2 x half> addrspace(1)* %src
  %elt1 = extractelement <2 x half> %x, i32 1
  %neg.elt1 = fsub half -0.0, %elt1
  %x.neghi = insertelement <2 x half> %x, half %neg.elt1, i32 1
  %floored = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %x.neghi, <2 x half> zeroinitializer)
  %clamped = call <2 x half> @llvm.minnum.v2f16(<2 x half> %floored, <2 x half> <half 1.0, half 1.0>)
  store <2 x half> %clamped, <2 x half> addrspace(1)* %dst
  ret void
}
649
650; GCN-LABEL: {{^}}v_clamp_v2f16_shuffle:
651; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
652; GFX9-NOT: [[A]]
653; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] op_sel:[1,1] op_sel_hi:[0,0] clamp{{$}}
; Packed clamp to [0.0, 1.0] applied after a shufflevector that swaps the
; two elements of the loaded vector (mask <1, 0>).
define amdgpu_kernel void @v_clamp_v2f16_shuffle(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %lane
  %dst = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %lane
  %x = load <2 x half>, <2 x half> addrspace(1)* %src
  %swapped = shufflevector <2 x half> %x, <2 x half> undef, <2 x i32> <i32 1, i32 0>
  %floored = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %swapped, <2 x half> zeroinitializer)
  %clamped = call <2 x half> @llvm.minnum.v2f16(<2 x half> %floored, <2 x half> <half 1.0, half 1.0>)
  store <2 x half> %clamped, <2 x half> addrspace(1)* %dst
  ret void
}
666
667; GCN-LABEL: {{^}}v_clamp_diff_source_f32:
668; GCN: v_add_f32_e32 [[A:v[0-9]+]]
669; GCN: v_add_f32_e32 [[B:v[0-9]+]]
670; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[B]] clamp{{$}}
; The value being clamped is itself a maxnum of two different nsz fadd
; results, so the maxnum feeding the [0.0, 1.0] clamp has two distinct
; sources. Loads use fixed offsets 0, 1 and 2 from %aptr; the result is
; stored at offset 3 from %out.
define amdgpu_kernel void @v_clamp_diff_source_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %p0 = getelementptr float, float addrspace(1)* %aptr, i32 0
  %p1 = getelementptr float, float addrspace(1)* %aptr, i32 1
  %p2 = getelementptr float, float addrspace(1)* %aptr, i32 2
  %v0 = load float, float addrspace(1)* %p0
  %v1 = load float, float addrspace(1)* %p1
  %v2 = load float, float addrspace(1)* %p2
  %sum01 = fadd nsz float %v0, %v1
  %sum02 = fadd nsz float %v0, %v2
  %larger = call nsz float @llvm.maxnum.f32(float %sum01, float %sum02)
  %floored = call nsz float @llvm.maxnum.f32(float %larger, float 0.0)
  %clamped = call nsz float @llvm.minnum.f32(float %floored, float 1.0)
  %dst = getelementptr float, float addrspace(1)* %out, i32 3
  store float %clamped, float addrspace(1)* %dst
  ret void
}
688
; Workitem id used by the kernels to index their input and output buffers.
declare i32 @llvm.amdgcn.workitem.id.x() #1

; Three-operand median intrinsic used by the med3 tests.
declare float @llvm.amdgcn.fmed3.f32(float, float, float) #1

; fabs for every element type exercised above.
declare half @llvm.fabs.f16(half) #1
declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1
declare float @llvm.fabs.f32(float) #1
declare double @llvm.fabs.f64(double) #1

; minnum for every element type exercised above.
declare half @llvm.minnum.f16(half, half) #1
declare <2 x half> @llvm.minnum.v2f16(<2 x half>, <2 x half>) #1
declare float @llvm.minnum.f32(float, float) #1
declare double @llvm.minnum.f64(double, double) #1

; maxnum for every element type exercised above.
declare half @llvm.maxnum.f16(half, half) #1
declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>) #1
declare float @llvm.maxnum.f32(float, float) #1
declare double @llvm.maxnum.f64(double, double) #1

; Group 0 is the default for the kernels; group 1 is for the intrinsics.
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
; Group 2 turns off dx10-clamp and fp-exceptions.
attributes #2 = { nounwind "target-features"="-dx10-clamp,-fp-exceptions" "no-nans-fp-math"="false" }
; Group 3 turns on dx10-clamp and fp-exceptions.
attributes #3 = { nounwind "target-features"="+dx10-clamp,+fp-exceptions" "no-nans-fp-math"="false" }
; Group 4 turns off dx10-clamp but keeps fp-exceptions on.
attributes #4 = { nounwind "target-features"="-dx10-clamp,+fp-exceptions" "no-nans-fp-math"="false" }
709