; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,VI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX9 %s

; Codegen checks for selecting a [0.0, 1.0] clamp onto the "clamp" output
; modifier. The clamp is written in IR either as minnum(maxnum(x, 0.0), 1.0)
; (f32, f16, f64 and <2 x half>) or as llvm.amdgcn.fmed3.f32 with 0.0 and 1.0
; operands. Unless noted otherwise each kernel loads one element indexed by the
; workitem id, clamps it, and stores it back at the same index.

; ---------------------------------------------------------------------
; f32 minnum/maxnum patterns
; ---------------------------------------------------------------------

; GCN-LABEL: {{^}}v_clamp_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %max = call float @llvm.maxnum.f32(float %a, float 0.0)
  %med = call float @llvm.minnum.f32(float %max, float 1.0)

  store float %med, float addrspace(1)* %out.gep
  ret void
}

; The fneg of the source is expected to appear as a source modifier on both
; operands of the clamping max.
; GCN-LABEL: {{^}}v_clamp_neg_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, -[[A]], -[[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_neg_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %fneg.a = fsub float -0.0, %a
  %max = call float @llvm.maxnum.f32(float %fneg.a, float 0.0)
  %med = call float @llvm.minnum.f32(float %max, float 1.0)

  store float %med, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_clamp_negabs_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, -|[[A]]|, -|[[A]]| clamp{{$}}
define amdgpu_kernel void @v_clamp_negabs_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %fabs.a = call float @llvm.fabs.f32(float %a)
  %fneg.fabs.a = fsub float -0.0, %fabs.a

  %max = call float @llvm.maxnum.f32(float %fneg.fabs.a, float 0.0)
  %med = call float @llvm.minnum.f32(float %max, float 1.0)

  store float %med, float addrspace(1)* %out.gep
  ret void
}

; The lower bound here is -0.0 rather than 0.0; the checks expect a v_med3_f32
; against a materialized sign bit instead of the clamp modifier.
; GCN-LABEL: {{^}}v_clamp_negzero_f32:
; GCN-DAG: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_bfrev_b32_e32 [[SIGNBIT:v[0-9]+]], 1
; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[SIGNBIT]], 1.0
define amdgpu_kernel void @v_clamp_negzero_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %max = call float @llvm.maxnum.f32(float %a, float -0.0)
  %med = call float @llvm.minnum.f32(float %max, float 1.0)

  store float %med, float addrspace(1)* %out.gep
  ret void
}

; The intermediate max has a second use (the volatile store), and the checks
; expect separate max and min instructions.
; GCN-LABEL: {{^}}v_clamp_multi_use_max_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]]
define amdgpu_kernel void @v_clamp_multi_use_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %max = call float @llvm.maxnum.f32(float %a, float 0.0)
  %med = call float @llvm.minnum.f32(float %max, float 1.0)

  store float %med, float addrspace(1)* %out.gep
  store volatile float %max, float addrspace(1)* undef
  ret void
}

; ---------------------------------------------------------------------
; f16 minnum/maxnum patterns
; ---------------------------------------------------------------------

; GCN-LABEL: {{^}}v_clamp_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[A:v[0-9]+]]
; GFX89: v_max_f16_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}

; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], [[A]] clamp{{$}}
; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]]
define amdgpu_kernel void @v_clamp_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
  %a = load half, half addrspace(1)* %gep0
  %max = call half @llvm.maxnum.f16(half %a, half 0.0)
  %med = call half @llvm.minnum.f16(half %max, half 1.0)

  store half %med, half addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_clamp_neg_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[A:v[0-9]+]]
; GFX89: v_max_f16_e64 v{{[0-9]+}}, -[[A]], -[[A]] clamp{{$}}

; FIXME: Better to fold neg into max
; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]] clamp{{$}}
; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]]
define amdgpu_kernel void @v_clamp_neg_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
  %a = load half, half addrspace(1)* %gep0
  %fneg.a = fsub half -0.0, %a
  %max = call half @llvm.maxnum.f16(half %fneg.a, half 0.0)
  %med = call half @llvm.minnum.f16(half %max, half 1.0)

  store half %med, half addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_clamp_negabs_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[A:v[0-9]+]]
; GFX89: v_max_f16_e64 v{{[0-9]+}}, -|[[A]]|, -|[[A]]| clamp{{$}}

; FIXME: Better to fold neg/abs into max

; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -|[[A]]| clamp{{$}}
; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]]
define amdgpu_kernel void @v_clamp_negabs_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
  %a = load half, half addrspace(1)* %gep0
  %fabs.a = call half @llvm.fabs.f16(half %a)
  %fneg.fabs.a = fsub half -0.0, %fabs.a

  %max = call half @llvm.maxnum.f16(half %fneg.fabs.a, half 0.0)
  %med = call half @llvm.minnum.f16(half %max, half 1.0)

  store half %med, half addrspace(1)* %out.gep
  ret void
}

; ---------------------------------------------------------------------
; f64 minnum/maxnum patterns
; ---------------------------------------------------------------------

; FIXME: Do f64 instructions support clamp?
; GCN-LABEL: {{^}}v_clamp_f64:
; GCN: {{buffer|flat|global}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN: v_max_f64 v{{\[[0-9]+:[0-9]+\]}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
  %a = load double, double addrspace(1)* %gep0
  %max = call double @llvm.maxnum.f64(double %a, double 0.0)
  %med = call double @llvm.minnum.f64(double %max, double 1.0)

  store double %med, double addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_clamp_neg_f64:
; GCN: {{buffer|flat|global}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN: v_max_f64 v{{\[[0-9]+:[0-9]+\]}}, -[[A]], -[[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_neg_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
  %a = load double, double addrspace(1)* %gep0
  %fneg.a = fsub double -0.0, %a
  %max = call double @llvm.maxnum.f64(double %fneg.a, double 0.0)
  %med = call double @llvm.minnum.f64(double %max, double 1.0)

  store double %med, double addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_clamp_negabs_f64:
; GCN: {{buffer|flat|global}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN: v_max_f64 v{{\[[0-9]+:[0-9]+\]}}, -|[[A]]|, -|[[A]]| clamp{{$}}
define amdgpu_kernel void @v_clamp_negabs_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
  %a = load double, double addrspace(1)* %gep0
  %fabs.a = call double @llvm.fabs.f64(double %a)
  %fneg.fabs.a = fsub double -0.0, %fabs.a

  %max = call double @llvm.maxnum.f64(double %fneg.fabs.a, double 0.0)
  %med = call double @llvm.minnum.f64(double %max, double 1.0)

  store double %med, double addrspace(1)* %out.gep
  ret void
}

; ---------------------------------------------------------------------
; llvm.amdgcn.fmed3.f32 with 0.0 / 1.0 operands
;
; In the kernel names below "a" is the 0.0 operand, "b" is the 1.0 operand and
; "y" is the loaded value; the three letters give the operand order.
; ---------------------------------------------------------------------

; Uses -0.0 instead of 0.0; the checks only require that a v_med3_f32 remains.
; GCN-LABEL: {{^}}v_clamp_med3_aby_negzero_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_med3_f32
define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float -0.0, float 1.0, float %a)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_clamp_med3_aby_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_med3_aby_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float %a)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_clamp_med3_bay_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_med3_bay_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float 0.0, float %a)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_clamp_med3_yab_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_med3_yab_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 0.0, float 1.0)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_clamp_med3_yba_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_med3_yba_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 1.0, float 0.0)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_clamp_med3_ayb_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_med3_ayb_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %a, float 1.0)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_clamp_med3_bya_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_med3_bya_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float %a, float 0.0)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; ---------------------------------------------------------------------
; fmed3 with all-constant operands: the checks expect a single v_mov of the
; already-clamped value. These kernels take no input buffer.
; ---------------------------------------------------------------------

; GCN-LABEL: {{^}}v_clamp_constants_to_one_f32:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 1.0
define amdgpu_kernel void @v_clamp_constants_to_one_f32(float addrspace(1)* %out) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 4.0)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_clamp_constants_to_zero_f32:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
define amdgpu_kernel void @v_clamp_constants_to_zero_f32(float addrspace(1)* %out) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float -4.0)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_clamp_constant_preserve_f32:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0.5
define amdgpu_kernel void @v_clamp_constant_preserve_f32(float addrspace(1)* %out) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0.5)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; 8388607 is 0x7fffff; the checks expect that bit pattern to be kept as-is.
; GCN-LABEL: {{^}}v_clamp_constant_preserve_denorm_f32:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7fffff{{$}}
define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(float addrspace(1)* %out) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 8388607 to float))
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_clamp_constant_qnan_f32:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
define amdgpu_kernel void @v_clamp_constant_qnan_f32(float addrspace(1)* %out) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0x7FF8000000000000)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; 2139095041 is 0x7f800001.
; GCN-LABEL: {{^}}v_clamp_constant_snan_f32:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
define amdgpu_kernel void @v_clamp_constant_snan_f32(float addrspace(1)* %out) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 2139095041 to float))
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; ---------------------------------------------------------------------
; Test non-default behaviors enabling snans and disabling dx10_clamp
; ---------------------------------------------------------------------

; GCN-LABEL: {{^}}v_clamp_f32_no_dx10_clamp:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 0, 1.0
define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %max = call float @llvm.maxnum.f32(float %a, float 0.0)
  %med = call float @llvm.minnum.f32(float %max, float 1.0)

  store float %med, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_clamp_f32_snan_dx10clamp:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #3 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %max = call float @llvm.maxnum.f32(float %a, float 0.0)
  %med = call float @llvm.minnum.f32(float %max, float 1.0)

  store float %med, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]]
define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %max = call float @llvm.maxnum.f32(float %a, float 0.0)
  %med = call float @llvm.minnum.f32(float %max, float 1.0)

  store float %med, float addrspace(1)* %out.gep
  ret void
}

; Same attribute set as the previous kernel, but the clamped value comes from
; an fadd carrying the nnan flag; the checks then expect v_med3_f32.
; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp_nnan_src:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]
; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 0, 1.0
define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %add = fadd nnan float %a, 1.0
  %max = call float @llvm.maxnum.f32(float %add, float 0.0)
  %med = call float @llvm.minnum.f32(float %max, float 1.0)

  store float %med, float addrspace(1)* %out.gep
  ret void
}

; fmed3 operand orders again, now with attribute set #2. The checks expect the
; clamp modifier only for the two orders that place the loaded value last
; (aby, bay); the remaining orders expect a v_med3_f32 with the operands in
; source order.

; GCN-LABEL: {{^}}v_clamp_med3_aby_f32_no_dx10_clamp:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float %a)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_clamp_med3_bay_f32_no_dx10_clamp:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float 0.0, float %a)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_clamp_med3_yab_f32_no_dx10_clamp:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 0, 1.0
define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 0.0, float 1.0)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_clamp_med3_yba_f32_no_dx10_clamp:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 1.0, 0
define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 1.0, float 0.0)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_clamp_med3_ayb_f32_no_dx10_clamp:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, 0, [[A]], 1.0
define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %a, float 1.0)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_clamp_med3_bya_f32_no_dx10_clamp:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, 1.0, [[A]], 0
define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float %a, float 0.0)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; With attribute set #2 the NaN constants are expected to be kept as their own
; bit patterns rather than folded to 0 as in the default-attribute variants.

; GCN-LABEL: {{^}}v_clamp_constant_qnan_f32_no_dx10_clamp:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7fc00000
define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(float addrspace(1)* %out) #2 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0x7FF8000000000000)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_clamp_constant_snan_f32_no_dx10_clamp:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7f800001
define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(float addrspace(1)* %out) #2 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 2139095041 to float))
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; ---------------------------------------------------------------------
; <2 x half> patterns. Only the gfx900 run has instruction checks here; the
; other runs check just the label and, where present, the load.
; ---------------------------------------------------------------------

; GCN-LABEL: {{^}}v_clamp_v2f16:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GFX9-NOT: [[A]]
; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> zeroinitializer)
  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)

  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
  ret void
}

; One element of each bound vector is undef; the clamp is still expected.
; GCN-LABEL: {{^}}v_clamp_v2f16_undef_elt:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GFX9-NOT: [[A]]
; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_v2f16_undef_elt(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half undef, half 0.0>)
  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half undef>)

  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
  ret void
}

; The lower bound is <2.0, 0.0>, not all zeros; separate packed max and min
; instructions are expected.
; GCN-LABEL: {{^}}v_clamp_v2f16_not_zero:
; GFX9: v_pk_max_f16
; GFX9: v_pk_min_f16
define amdgpu_kernel void @v_clamp_v2f16_not_zero(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half 2.0, half 0.0>)
  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)

  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
  ret void
}

; The upper bound is <0.0, 1.0>, not all ones; separate packed max and min
; instructions are expected.
; GCN-LABEL: {{^}}v_clamp_v2f16_not_one:
; GFX9: v_pk_max_f16
; GFX9: v_pk_min_f16
define amdgpu_kernel void @v_clamp_v2f16_not_one(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half 0.0, half 0.0>)
  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 0.0, half 1.0>)

  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_clamp_neg_v2f16:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GFX9-NOT: [[A]]
; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] neg_lo:[1,1] neg_hi:[1,1] clamp{{$}}
define amdgpu_kernel void @v_clamp_neg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
  %fneg.a = fsub <2 x half> <half -0.0, half -0.0>, %a
  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %fneg.a, <2 x half> zeroinitializer)
  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)

  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
  ret void
}

; The fabs is expected as a separate v_and with 0x7fff7fff; only the negation
; is expected on the packed max.
; GCN-LABEL: {{^}}v_clamp_negabs_v2f16:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, [[A]]
; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[ABS]], [[ABS]] neg_lo:[1,1] neg_hi:[1,1] clamp{{$}}
define amdgpu_kernel void @v_clamp_negabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
  %fabs.a = call <2 x half> @llvm.fabs.v2f16(<2 x half> %a)
  %fneg.fabs.a = fsub <2 x half> <half -0.0, half -0.0>, %fabs.a

  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %fneg.fabs.a, <2 x half> zeroinitializer)
  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)

  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
  ret void
}

; Only element 0 is negated (extract, fneg, insert).
; GCN-LABEL: {{^}}v_clamp_neglo_v2f16:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GFX9-NOT: [[A]]
; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] neg_lo:[1,1] clamp{{$}}
define amdgpu_kernel void @v_clamp_neglo_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
  %lo = extractelement <2 x half> %a, i32 0
  %neg.lo = fsub half -0.0, %lo
  %neg.lo.vec = insertelement <2 x half> %a, half %neg.lo, i32 0
  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %neg.lo.vec, <2 x half> zeroinitializer)
  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)

  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
  ret void
}

; Only element 1 is negated (extract, fneg, insert).
; GCN-LABEL: {{^}}v_clamp_neghi_v2f16:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GFX9-NOT: [[A]]
; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] neg_hi:[1,1] clamp{{$}}
define amdgpu_kernel void @v_clamp_neghi_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
  %hi = extractelement <2 x half> %a, i32 1
  %neg.hi = fsub half -0.0, %hi
  %neg.hi.vec = insertelement <2 x half> %a, half %neg.hi, i32 1
  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %neg.hi.vec, <2 x half> zeroinitializer)
  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)

  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
  ret void
}

; The two halves are swapped by a shufflevector before the clamp; the swap is
; expected to show up as op_sel / op_sel_hi on the packed max.
; GCN-LABEL: {{^}}v_clamp_v2f16_shuffle:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GFX9-NOT: [[A]]
; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] op_sel:[1,1] op_sel_hi:[0,0] clamp{{$}}
define amdgpu_kernel void @v_clamp_v2f16_shuffle(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
  %shuf = shufflevector <2 x half> %a, <2 x half> undef, <2 x i32> <i32 1, i32 0>
  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %shuf, <2 x half> zeroinitializer)
  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)

  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
  ret void
}

; ---------------------------------------------------------------------
; Clamp applied to the max of two different values
; ---------------------------------------------------------------------

; The clamped value is itself maxnum(%a, %b) of two distinct sums; the checks
; expect a single clamping max that takes both sums as operands. This kernel
; uses constant element indices instead of the workitem id.
; GCN-LABEL: {{^}}v_clamp_diff_source_f32:
; GCN: v_add_f32_e32 [[A:v[0-9]+]]
; GCN: v_add_f32_e32 [[B:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[B]] clamp{{$}}
define amdgpu_kernel void @v_clamp_diff_source_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 0
  %gep1 = getelementptr float, float addrspace(1)* %aptr, i32 1
  %gep2 = getelementptr float, float addrspace(1)* %aptr, i32 2
  %l0 = load float, float addrspace(1)* %gep0
  %l1 = load float, float addrspace(1)* %gep1
  %l2 = load float, float addrspace(1)* %gep2
  %a = fadd nsz float %l0, %l1
  %b = fadd nsz float %l0, %l2
  %res = call nsz float @llvm.maxnum.f32(float %a, float %b)
  %max = call nsz float @llvm.maxnum.f32(float %res, float 0.0)
  %min = call nsz float @llvm.minnum.f32(float %max, float 1.0)
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 3
  store float %min, float addrspace(1)* %out.gep
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1
declare float @llvm.fabs.f32(float) #1
declare float @llvm.minnum.f32(float, float) #1
declare float @llvm.maxnum.f32(float, float) #1
declare float @llvm.amdgcn.fmed3.f32(float, float, float) #1
declare double @llvm.fabs.f64(double) #1
declare double @llvm.minnum.f64(double, double) #1
declare double @llvm.maxnum.f64(double, double) #1
declare half @llvm.fabs.f16(half) #1
declare half @llvm.minnum.f16(half, half) #1
declare half @llvm.maxnum.f16(half, half) #1
declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1
declare <2 x half> @llvm.minnum.v2f16(<2 x half>, <2 x half>) #1
declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>) #1

; Attribute sets:
;   #0  default kernels
;   #1  intrinsic declarations
;   #2  dx10-clamp off, fp-exceptions off
;   #3  dx10-clamp on,  fp-exceptions on   (the "snan_dx10clamp" kernel)
;   #4  dx10-clamp off, fp-exceptions on   (the "snan_no_dx10clamp" kernels)
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind "target-features"="-dx10-clamp,-fp-exceptions" "no-nans-fp-math"="false" }
attributes #3 = { nounwind "target-features"="+dx10-clamp,+fp-exceptions" "no-nans-fp-math"="false" }
attributes #4 = { nounwind "target-features"="-dx10-clamp,+fp-exceptions" "no-nans-fp-math"="false" }