; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s

; Tests for folding a multiply by 0.5 / 2.0 / 4.0 (or an fadd of a value with
; itself) into the output modifier (div:2 / mul:2 / mul:4) of the instruction
; that produces the multiplied value. Each test either checks for the folded
; form (a single instruction carrying the modifier) or checks that the
; separate v_mul / v_add is still emitted.
;
; Attribute groups used below:
;   #0 - "no-signed-zeros-fp-math"="true"
;   #2 - #0 plus +fp32-denormals
;   #3 - #0 plus -fp64-fp16-denormals
;   #4 - "no-signed-zeros-fp-math"="false"

; IEEE bit enabled for compute kernel, so omod shouldn't be used.
; GCN-LABEL: {{^}}v_omod_div2_f32_enable_ieee_signed_zeros:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]{{$}}
; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %add = fadd float %a, 1.0
  %div2 = fmul float %add, 0.5
  store float %div2, float addrspace(1)* %out.gep
  ret void
}

; IEEE bit enabled for compute kernel, so omod shouldn't be used even though
; nsz is allowed.
; GCN-LABEL: {{^}}v_omod_div2_f32_enable_ieee_nsz:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]{{$}}
; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %add = fadd float %a, 1.0
  %div2 = fmul float %add, 0.5
  store float %div2, float addrspace(1)* %out.gep
  ret void
}

; Even without the IEEE bit, don't use omod if signed zeros are significant.
; GCN-LABEL: {{^}}v_omod_div2_f32_signed_zeros:
; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
define amdgpu_ps void @v_omod_div2_f32_signed_zeros(float %a) #4 {
  %add = fadd float %a, 1.0
  %div2 = fmul float %add, 0.5
  store float %div2, float addrspace(1)* undef
  ret void
}

; Basic fold of a multiply by 0.5 into div:2.
; GCN-LABEL: {{^}}v_omod_div2_f32:
; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 div:2{{$}}
define amdgpu_ps void @v_omod_div2_f32(float %a) #0 {
  %add = fadd float %a, 1.0
  %div2 = fmul float %add, 0.5
  store float %div2, float addrspace(1)* undef
  ret void
}

; Basic fold of a multiply by 2.0 into mul:2.
; GCN-LABEL: {{^}}v_omod_mul2_f32:
; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 mul:2{{$}}
define amdgpu_ps void @v_omod_mul2_f32(float %a) #0 {
  %add = fadd float %a, 1.0
  %mul2 = fmul float %add, 2.0
  store float %mul2, float addrspace(1)* undef
  ret void
}

; Basic fold of a multiply by 4.0 into mul:4.
; GCN-LABEL: {{^}}v_omod_mul4_f32:
; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 mul:4{{$}}
define amdgpu_ps void @v_omod_mul4_f32(float %a) #0 {
  %add = fadd float %a, 1.0
  %mul4 = fmul float %add, 4.0
  store float %mul4, float addrspace(1)* undef
  ret void
}

; The add has a second (volatile store) use, so the multiply is not folded.
; GCN-LABEL: {{^}}v_omod_mul4_multi_use_f32:
; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
; GCN: v_mul_f32_e32 v{{[0-9]+}}, 4.0, [[ADD]]{{$}}
define amdgpu_ps void @v_omod_mul4_multi_use_f32(float %a) #0 {
  %add = fadd float %a, 1.0
  %mul4 = fmul float %add, 4.0
  store float %mul4, float addrspace(1)* undef
  store volatile float %add, float addrspace(1)* undef
  ret void
}

; A debug-value use of the add must not block the fold.
; GCN-LABEL: {{^}}v_omod_mul4_dbg_use_f32:
; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 mul:4{{$}}
define amdgpu_ps void @v_omod_mul4_dbg_use_f32(float %a) #0 {
  %add = fadd float %a, 1.0
  call void @llvm.dbg.value(metadata float %add, i64 0, metadata !4, metadata !9), !dbg !10
  %mul4 = fmul float %add, 4.0
  store float %mul4, float addrspace(1)* undef
  ret void
}

; Clamp is applied after omod, folding both into instruction is OK.
; GCN-LABEL: {{^}}v_clamp_omod_div2_f32:
; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 clamp div:2{{$}}
define amdgpu_ps void @v_clamp_omod_div2_f32(float %a) #0 {
  %add = fadd float %a, 1.0
  %div2 = fmul float %add, 0.5

  %max = call float @llvm.maxnum.f32(float %div2, float 0.0)
  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
  store float %clamp, float addrspace(1)* undef
  ret void
}

; Cannot fold omod into clamp
; GCN-LABEL: {{^}}v_omod_div2_clamp_f32:
; GCN: v_add_f32_e64 [[ADD:v[0-9]+]], v0, 1.0 clamp{{$}}
; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
define amdgpu_ps void @v_omod_div2_clamp_f32(float %a) #0 {
  %add = fadd float %a, 1.0
  %max = call float @llvm.maxnum.f32(float %add, float 0.0)
  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
  %div2 = fmul float %clamp, 0.5
  store float %div2, float addrspace(1)* undef
  ret void
}

; The multiply reads |add| (source modifier), so it is not folded as omod.
; GCN-LABEL: {{^}}v_omod_div2_abs_src_f32:
; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
; GCN: v_mul_f32_e64 v{{[0-9]+}}, |[[ADD]]|, 0.5{{$}}
define amdgpu_ps void @v_omod_div2_abs_src_f32(float %a) #0 {
  %add = fadd float %a, 1.0
  %abs.add = call float @llvm.fabs.f32(float %add)
  %div2 = fmul float %abs.add, 0.5
  store float %div2, float addrspace(1)* undef
  ret void
}

; fadd x, x of an argument followed by a clamp: only the clamp is folded.
; GCN-LABEL: {{^}}v_omod_add_self_clamp_f32:
; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, v0 clamp{{$}}
define amdgpu_ps void @v_omod_add_self_clamp_f32(float %a) #0 {
  %add = fadd float %a, %a
  %max = call float @llvm.maxnum.f32(float %add, float 0.0)
  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
  store float %clamp, float addrspace(1)* undef
  ret void
}

; fadd x, x of a clamped value: the add stays separate from the clamp.
; GCN-LABEL: {{^}}v_omod_add_clamp_self_f32:
; GCN: v_max_f32_e64 [[CLAMP:v[0-9]+]], v0, v0 clamp{{$}}
; GCN: v_add_f32_e32 v{{[0-9]+}}, [[CLAMP]], [[CLAMP]]{{$}}
define amdgpu_ps void @v_omod_add_clamp_self_f32(float %a) #0 {
  %max = call float @llvm.maxnum.f32(float %a, float 0.0)
  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
  %add = fadd float %clamp, %clamp
  store float %add, float addrspace(1)* undef
  ret void
}

; fadd |x|, |x|: both operands carry a source modifier, so no mul:2 fold.
; GCN-LABEL: {{^}}v_omod_add_abs_self_f32:
; GCN: v_add_f32_e32 [[X:v[0-9]+]], 1.0, v0
; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[X]]|, |[[X]]|{{$}}
define amdgpu_ps void @v_omod_add_abs_self_f32(float %a) #0 {
  %x = fadd float %a, 1.0
  %abs.x = call float @llvm.fabs.f32(float %x)
  %add = fadd float %abs.x, %abs.x
  store float %add, float addrspace(1)* undef
  ret void
}

; fadd |x|, x: operands differ once modifiers are considered, so no fold.
; GCN-LABEL: {{^}}v_omod_add_abs_x_x_f32:

; GCN: v_add_f32_e32 [[X:v[0-9]+]], 1.0, v0
; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[X]]|, [[X]]{{$}}
define amdgpu_ps void @v_omod_add_abs_x_x_f32(float %a) #0 {
  %x = fadd float %a, 1.0
  %abs.x = call float @llvm.fabs.f32(float %x)
  %add = fadd float %abs.x, %x
  store float %add, float addrspace(1)* undef
  ret void
}

; fadd x, |x|: same as above with the operands swapped.
; GCN-LABEL: {{^}}v_omod_add_x_abs_x_f32:
; GCN: v_add_f32_e32 [[X:v[0-9]+]], 1.0, v0
; GCN: v_add_f32_e64 v{{[0-9]+}}, [[X]], |[[X]]|{{$}}
define amdgpu_ps void @v_omod_add_x_abs_x_f32(float %a) #0 {
  %x = fadd float %a, 1.0
  %abs.x = call float @llvm.fabs.f32(float %x)
  %add = fadd float %x, %abs.x
  store float %add, float addrspace(1)* undef
  ret void
}

; Don't fold omod into another omod.
; GCN-LABEL: {{^}}v_omod_div2_omod_div2_f32:
; GCN: v_add_f32_e64 [[ADD:v[0-9]+]], v0, 1.0 div:2{{$}}
; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
define amdgpu_ps void @v_omod_div2_omod_div2_f32(float %a) #0 {
  %add = fadd float %a, 1.0
  %div2.0 = fmul float %add, 0.5
  %div2.1 = fmul float %div2.0, 0.5
  store float %div2.1, float addrspace(1)* undef
  ret void
}

; Don't fold omod if denorms enabled
; GCN-LABEL: {{^}}v_omod_div2_f32_denormals:
; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
define amdgpu_ps void @v_omod_div2_f32_denormals(float %a) #2 {
  %add = fadd float %a, 1.0
  %div2 = fmul float %add, 0.5
  store float %div2, float addrspace(1)* undef
  ret void
}

; Don't fold omod if denorms enabled for add form.
; GCN-LABEL: {{^}}v_omod_mul2_f32_denormals:
; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
; GCN: v_add_f32_e32 v{{[0-9]+}}, [[ADD]], [[ADD]]{{$}}
define amdgpu_ps void @v_omod_mul2_f32_denormals(float %a) #2 {
  %add = fadd float %a, 1.0
  %mul2 = fadd float %add, %add
  store float %mul2, float addrspace(1)* undef
  ret void
}

; Don't fold omod if denorms enabled
; NOTE(review): this uses #0, which does not mention f16 denormals; presumably
; they are enabled by default for the VI run line - confirm.
; GCN-LABEL: {{^}}v_omod_div2_f16_denormals:
; VI: v_add_f16_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
; VI: v_mul_f16_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
define amdgpu_ps void @v_omod_div2_f16_denormals(half %a) #0 {
  %add = fadd half %a, 1.0
  %div2 = fmul half %add, 0.5
  store half %div2, half addrspace(1)* undef
  ret void
}

; Don't fold omod if denorms enabled for add form.
; GCN-LABEL: {{^}}v_omod_mul2_f16_denormals:
; VI: v_add_f16_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
; VI: v_add_f16_e32 v{{[0-9]+}}, [[ADD]], [[ADD]]{{$}}
define amdgpu_ps void @v_omod_mul2_f16_denormals(half %a) #0 {
  %add = fadd half %a, 1.0
  %mul2 = fadd half %add, %add
  store half %mul2, half addrspace(1)* undef
  ret void
}

; With f16 denormals disabled (#3) the fold is allowed. The VI-NOT line
; rejects any earlier mention of v0 before the folded add.
; GCN-LABEL: {{^}}v_omod_div2_f16_no_denormals:
; VI-NOT: v0
; VI: v_add_f16_e64 [[ADD:v[0-9]+]], v0, 1.0 div:2{{$}}
define amdgpu_ps void @v_omod_div2_f16_no_denormals(half %a) #3 {
  %add = fadd half %a, 1.0
  %div2 = fmul half %add, 0.5
  store half %div2, half addrspace(1)* undef
  ret void
}

; The multiply-add is expected to be emitted as v_mad_f32 carrying mul:2.
; GCN-LABEL: {{^}}v_omod_mac_to_mad:
; GCN: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} mul:2{{$}}
define amdgpu_ps void @v_omod_mac_to_mad(float %b, float %a) #0 {
  %mul = fmul float %a, %a
  %add = fadd float %mul, %b
  %mad = fmul float %add, 2.0
  %res = fmul float %mad, %b
  store float %res, float addrspace(1)* undef
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1
declare float @llvm.fabs.f32(float) #1
declare float @llvm.floor.f32(float) #1
declare float @llvm.minnum.f32(float, float) #1
declare float @llvm.maxnum.f32(float, float) #1
declare float @llvm.amdgcn.fmed3.f32(float, float, float) #1
declare double @llvm.fabs.f64(double) #1
declare double @llvm.minnum.f64(double, double) #1
declare double @llvm.maxnum.f64(double, double) #1
declare half @llvm.fabs.f16(half) #1
declare half @llvm.minnum.f16(half, half) #1
declare half @llvm.maxnum.f16(half, half) #1
declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1

; #0: signed zeros may be ignored.
attributes #0 = { nounwind "no-signed-zeros-fp-math"="true" }
attributes #1 = { nounwind readnone }
; #2: as #0, with f32 denormals enabled.
attributes #2 = { nounwind "target-features"="+fp32-denormals" "no-signed-zeros-fp-math"="true" }
; #3: as #0, with f64/f16 denormals disabled.
attributes #3 = { nounwind "target-features"="-fp64-fp16-denormals" "no-signed-zeros-fp-math"="true" }
; #4: signed zeros are significant.
attributes #4 = { nounwind "no-signed-zeros-fp-math"="false" }

!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!2, !3}

!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug)
!1 = !DIFile(filename: "/tmp/foo.cl", directory: "/dev/null")
!2 = !{i32 2, !"Dwarf Version", i32 4}
!3 = !{i32 2, !"Debug Info Version", i32 3}
!4 = !DILocalVariable(name: "add", arg: 1, scope: !5, file: !1, line: 1)
!5 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
!6 = !DISubroutineType(types: !7)
!7 = !{null, !8}
!8 = !DIBasicType(name: "float", size: 32, align: 32)
!9 = !DIExpression()
!10 = !DILocation(line: 1, column: 42, scope: !5)