; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-STRICT,GCN-FLUSH-MAD,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-STRICT,GCN-FLUSH-MAD,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-STRICT,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-STRICT,SI %s

; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-CONTRACT,GCN-FLUSH-MAD,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-CONTRACT,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-CONTRACT,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-CONTRACT,GCN-FLUSH-MAD,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-CONTRACT,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-CONTRACT,SI %s


; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx900 -mattr=-fp32-denormals -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-STRICT,GCN-FLUSH-MAD,GFX9-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,GFX900 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx900 -mattr=+fp32-denormals -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,GFX9-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,GFX900 %s

; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx906 -mattr=-fp32-denormals -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-STRICT,GCN-FLUSH-FMAC,GFX9-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,GFX906 %s

; FIXME: Should probably test this, but sometimes selecting fmac is painful to match.
; XUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx906 -mattr=+fp32-denormals -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,GFX9-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,GFX906 %s


; Test all permutations of: fp32 denormals, fast fp contract, fp contract enabled for fmuladd, fmaf fast/slow.

; Module-level setup shared by every kernel below: the target triple and the
; intrinsics the kernels call.
;
; NOTE(review): several kernels carry store checks under the VI, VI-FLUSH and
; VI-DENORM prefixes, but no lit run line in this file passes any of those
; prefixes to FileCheck, so those checks are currently inert. They are kept
; as written; confirm whether they should be renamed to a GFX9 prefix.
target triple = "amdgcn--"


declare i32 @llvm.amdgcn.workitem.id.x() #1
declare float @llvm.fmuladd.f32(float, float, float) #1
declare half @llvm.fmuladd.f16(half, half, half) #1
declare float @llvm.fabs.f32(float) #1

; llvm.fmuladd.f32 applied to three values loaded from %in1, %in2 and %in3.
; GCN-LABEL: {{^}}fmuladd_f32:
; GCN-FLUSH-MAD: v_mac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
; GCN-FLUSH-FMAC: v_fmac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; GCN-DENORM-FASTFMA: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; GCN-DENORM-SLOWFMA: v_mul_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
; GCN-DENORM-SLOWFMA: v_add_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
                                       float addrspace(1)* %in2, float addrspace(1)* %in3) #0 {
  %r0 = load float, float addrspace(1)* %in1
  %r1 = load float, float addrspace(1)* %in2
  %r2 = load float, float addrspace(1)* %in3
  %r3 = tail call float @llvm.fmuladd.f32(float %r0, float %r1, float %r2)
  store float %r3, float addrspace(1)* %out
  ret void
}

; Separate fmul + fadd (no intrinsic) on three volatile loads.
; GCN-LABEL: {{^}}fmul_fadd_f32:
; GCN-FLUSH: v_mac_f32

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32
; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32

; GCN-DENORM-STRICT: v_mul_f32_e32
; GCN-DENORM-STRICT: v_add_f32_e32
define amdgpu_kernel void @fmul_fadd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
                                         float addrspace(1)* %in2, float addrspace(1)* %in3) #0 {
  %r0 = load volatile float, float addrspace(1)* %in1
  %r1 = load volatile float, float addrspace(1)* %in2
  %r2 = load volatile float, float addrspace(1)* %in3
  %mul = fmul float %r0, %r1
  %add = fadd float %mul, %r2
  store float %add, float addrspace(1)* %out
  ret void
}

; fmuladd(2.0, a, b): the constant 2.0 is the first multiplicand.
; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], 2.0, [[R1]]
; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; fmuladd(a, 2.0, b): the constant 2.0 is the second multiplicand.
; GCN-LABEL: {{^}}fmuladd_a_2.0_b_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], 2.0, [[R1]]

; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r3 = tail call float @llvm.fmuladd.f32(float %r1, float 2.0, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; (a + a) + b written as two fadd instructions.
; GCN-LABEL: {{^}}fadd_a_a_b_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]]

; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fadd_a_a_b_f32(float addrspace(1)* %out,
                                          float addrspace(1)* %in1,
                                          float addrspace(1)* %in2) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r0 = load volatile float, float addrspace(1)* %gep.0
  %r1 = load volatile float, float addrspace(1)* %gep.1

  %add.0 = fadd float %r0, %r0
  %add.1 = fadd float %add.0, %r1
  store float %add.1, float addrspace(1)* %gep.out
  ret void
}

; b + (a + a): same as the previous kernel with the outer fadd operands swapped.
; GCN-LABEL: {{^}}fadd_b_a_a_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]]

; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fadd_b_a_a_f32(float addrspace(1)* %out,
                                          float addrspace(1)* %in1,
                                          float addrspace(1)* %in2) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r0 = load volatile float, float addrspace(1)* %gep.0
  %r1 = load volatile float, float addrspace(1)* %gep.1

  %add.0 = fadd float %r0, %r0
  %add.1 = fadd float %r1, %add.0
  store float %add.1, float addrspace(1)* %gep.out
  ret void
}

; fmuladd(-2.0, a, b): negative constant multiplicand.
; GCN-LABEL: {{^}}fmuladd_neg_2.0_a_b_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], -2.0, [[R1]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; fmuladd(-2.0, -a, b): both multiplicands negated; the checks expect the two
; negations to cancel into a +2.0 multiply.
; XXX
; GCN-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], 2.0, [[R1]]

; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r1.fneg = fsub float -0.000000e+00, %r1

  %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1.fneg, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; fmuladd(2.0, -a, b): only the loaded multiplicand is negated.
; GCN-LABEL: {{^}}fmuladd_2.0_neg_a_b_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], -2.0, [[R1]]

; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r1.fneg = fsub float -0.000000e+00, %r1

  %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1.fneg, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; fmuladd(2.0, a, -b): the addend is negated.
; GCN-LABEL: {{^}}fmuladd_2.0_a_neg_b_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
; GCN-FLUSH-MAD: v_mad_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
; GCN-FLUSH-FMAC: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]

; SI-FLUSH: buffer_store_dword [[RESULT]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r2.fneg = fsub float -0.000000e+00, %r2

  %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2.fneg)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; (a * b) - c written as fmul + fsub.
; GCN-LABEL: {{^}}mad_sub_f32:
; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]
; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; SI: buffer_store_dword [[RESULT]]
; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %mul = fmul float %a, %b
  %sub = fsub float %mul, %c
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; c - (a * b): the product is the subtrahend.
; GCN-LABEL: {{^}}mad_sub_inv_f32:
; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]

; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]

; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]

; SI: buffer_store_dword [[RESULT]]
; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %mul = fmul float %a, %b
  %sub = fsub float %c, %mul
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; (a * b) - |c|: the subtrahend goes through llvm.fabs.f32.
; GCN-LABEL: {{^}}mad_sub_fabs_f32:
; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]
; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|

; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|

; SI: buffer_store_dword [[RESULT]]
; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %c.abs = call float @llvm.fabs.f32(float %c) #0
  %mul = fmul float %a, %b
  %sub = fsub float %mul, %c.abs
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; |c| - (a * b).
; NOTE(review): the GCN-FLUSH-FMA prefix on the v_fma_f32 check below is not
; passed to FileCheck by any lit run line in this file (the gfx906 line uses
; GCN-FLUSH-FMAC), so that check is currently inert. Left unchanged because
; the expected gfx906 output cannot be confirmed from this file alone.
; GCN-LABEL: {{^}}mad_sub_fabs_inv_f32:
; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]
; GCN-FLUSH-MAD: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
; GCN-FLUSH-FMA: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]

; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]

; SI: buffer_store_dword [[RESULT]]
; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %c.abs = call float @llvm.fabs.f32(float %c) #0
  %mul = fmul float %a, %b
  %sub = fsub float %c.abs, %mul
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; (-a * -b) + c: both multiplicands are negated before the fmul.
; GCN-LABEL: {{^}}neg_neg_mad_f32:
; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]

; GCN-FLUSH: v_mac_f32_e32 [[REGC]], [[REGA]], [[REGB]]
; SI-FLUSH: buffer_store_dword [[REGC]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]

; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %nega = fsub float -0.000000e+00, %a
  %negb = fsub float -0.000000e+00, %b
  %mul = fmul float %nega, %negb
  %sub = fadd float %mul, %c
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; (a * |b|) - c: one multiplicand goes through llvm.fabs.f32.
; GCN-LABEL: {{^}}mad_fabs_sub_f32:
; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]
; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; GCN-DENORM-STRICT: v_mul_f32_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; SI: buffer_store_dword [[RESULT]]
; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %b.abs = call float @llvm.fabs.f32(float %b) #0
  %mul = fmul float %a, %b.abs
  %sub = fsub float %mul, %c
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; c - (a + a) written as fadd + fsub.
; GCN-LABEL: {{^}}fsub_c_fadd_a_a_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
; GCN-FLUSH: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fsub_c_fadd_a_a_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %add = fadd float %r1, %r1
  %r3 = fsub float %r2, %add

  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; (a + a) - c written as fadd + fsub.
; GCN-LABEL: {{^}}fsub_fadd_a_a_c_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; SI: buffer_store_dword [[RESULT]]
; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fsub_fadd_a_a_c_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %add = fadd float %r1, %r1
  %r3 = fsub float %add, %r2

  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; Attribute groups: #0 is used on the kernels, #1 on the intrinsic declarations.
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }