; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -mattr=+fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-STRICT,GCN-FLUSH-MAD,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -denormal-fp-math-f32=ieee -mattr=+fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -denormal-fp-math-f32=preserve-sign -mattr=-fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-STRICT,GCN-FLUSH-MAD,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-STRICT,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -denormal-fp-math-f32=ieee -mattr=-fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-STRICT,SI %s

; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -mattr=+fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-CONTRACT,GCN-FLUSH-MAD,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-CONTRACT,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -denormal-fp-math-f32=ieee -mattr=+fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-CONTRACT,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -denormal-fp-math-f32=preserve-sign -mattr=-fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-CONTRACT,GCN-FLUSH-MAD,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-CONTRACT,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -denormal-fp-math-f32=ieee -mattr=-fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-CONTRACT,SI %s


; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-STRICT,GCN-FLUSH-MAD,GFX9-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,GFX900 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx900 -denormal-fp-math-f32=ieee -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,GFX9-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,GFX900 %s

; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-STRICT,GCN-FLUSH-FMAC,GFX9-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,GFX906 %s

; FIXME: Should probably test this, but sometimes selecting fmac is painful to match.
; XUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx906 -denormal-fp-math-f32=ieee -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,GFX9-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,GFX906 %s

; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx1030 -denormal-fp-math-f32=preserve-sign -mattr=+mad-mac-f32-insts -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-STRICT,GCN-FLUSH-FMAC,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,GFX1030 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx1030 -denormal-fp-math-f32=ieee -mattr=+mad-mac-f32-insts -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,GCN-DENORM-FASTFMA-STRICT,GFX1030 %s

; Test all permutations of: fp32 denormals, fast fp contract, fp contract enabled for fmuladd, fmaf fast/slow.

; FIXME: No RUN line in this file enables the VI, VI-FLUSH, VI-DENORM or
; GCN-FLUSH-FMA prefixes, so the check lines that use them are never evaluated.

target triple = "amdgcn--"


declare i32 @llvm.amdgcn.workitem.id.x() #1
declare float @llvm.fmuladd.f32(float, float, float) #1
declare half @llvm.fmuladd.f16(half, half, half) #1
declare float @llvm.fabs.f32(float) #1

; llvm.fmuladd.f32 applied to three values loaded from separate pointers.
; GCN-LABEL: {{^}}fmuladd_f32:
; GCN-FLUSH-MAD: v_mac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
; GCN-FLUSH-FMAC: v_fmac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; GCN-DENORM-FASTFMA: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; GCN-DENORM-SLOWFMA: v_mul_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
; GCN-DENORM-SLOWFMA: v_add_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
                         float addrspace(1)* %in2, float addrspace(1)* %in3) #0 {
  %r0 = load float, float addrspace(1)* %in1
  %r1 = load float, float addrspace(1)* %in2
  %r2 = load float, float addrspace(1)* %in3
  %r3 = tail call float @llvm.fmuladd.f32(float %r0, float %r1, float %r2)
  store float %r3, float addrspace(1)* %out
  ret void
}

; Separate fmul and fadd with no fast-math flags on either instruction.
; GCN-LABEL: {{^}}fmul_fadd_f32:
; GCN-FLUSH: v_mac_f32

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32
; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32

; GCN-DENORM-STRICT: v_mul_f32_e32
; GCN-DENORM-STRICT: v_add_f32_e32
define amdgpu_kernel void @fmul_fadd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
                           float addrspace(1)* %in2, float addrspace(1)* %in3) #0 {
  %r0 = load volatile float, float addrspace(1)* %in1
  %r1 = load volatile float, float addrspace(1)* %in2
  %r2 = load volatile float, float addrspace(1)* %in3
  %mul = fmul float %r0, %r1
  %add = fadd float %mul, %r2
  store float %add, float addrspace(1)* %out
  ret void
}

; Separate fmul and fadd that both carry the contract flag, so fusing them is
; permitted by the instructions themselves regardless of the -fp-contract mode.
; GCN-LABEL: {{^}}fmul_fadd_contract_f32:
; GCN-FLUSH-FMAC: v_fmac_f32_e32

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32
; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32

; GCN-DENORM-FASTFMA: v_fma_f32
define amdgpu_kernel void @fmul_fadd_contract_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
                           float addrspace(1)* %in2, float addrspace(1)* %in3) #0 {
  %r0 = load volatile float, float addrspace(1)* %in1
  %r1 = load volatile float, float addrspace(1)* %in2
  %r2 = load volatile float, float addrspace(1)* %in3
  ; Both halves of the candidate pair need the flag for contraction to apply.
  %mul = fmul contract float %r0, %r1
  %add = fadd contract float %mul, %r2
  store float %add, float addrspace(1)* %out
  ret void
}

; fmuladd(2.0, r1, r2) with the constant as the first multiplicand.
; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], 2.0, [[R1]]
; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; fmuladd(r1, 2.0, r2) with the constant as the second multiplicand.
; GCN-LABEL: {{^}}fmuladd_a_2.0_b_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], 2.0, [[R1]]

; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r3 = tail call float @llvm.fmuladd.f32(float %r1, float 2.0, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; (r0 + r0) + r1 written with two plain fadds.
; GCN-LABEL: {{^}}fadd_a_a_b_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]]

; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fadd_a_a_b_f32(float addrspace(1)* %out,
                            float addrspace(1)* %in1,
                            float addrspace(1)* %in2) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r0 = load volatile float, float addrspace(1)* %gep.0
  %r1 = load volatile float, float addrspace(1)* %gep.1

  %add.0 = fadd float %r0, %r0
  %add.1 = fadd float %add.0, %r1
  store float %add.1, float addrspace(1)* %gep.out
  ret void
}

; r1 + (r0 + r0): same as fadd_a_a_b_f32 with the final fadd operands swapped.
; GCN-LABEL: {{^}}fadd_b_a_a_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]]

; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fadd_b_a_a_f32(float addrspace(1)* %out,
                            float addrspace(1)* %in1,
                            float addrspace(1)* %in2) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r0 = load volatile float, float addrspace(1)* %gep.0
  %r1 = load volatile float, float addrspace(1)* %gep.1

  %add.0 = fadd float %r0, %r0
  %add.1 = fadd float %r1, %add.0
  store float %add.1, float addrspace(1)* %gep.out
  ret void
}

; fmuladd(-2.0, r1, r2) with a negative constant multiplicand.
; GCN-LABEL: {{^}}fmuladd_neg_2.0_a_b_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], -2.0, [[R1]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; fmuladd(-2.0, -r1, r2): the fneg on the multiplicand and the negative
; constant cancel, so the checks below expect a positive 2.0 operand.
; GCN-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], 2.0, [[R1]]

; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r1.fneg = fneg float %r1

  %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1.fneg, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; fmuladd(2.0, -r1, r2): fneg applied to the non-constant multiplicand.
; GCN-LABEL: {{^}}fmuladd_2.0_neg_a_b_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], -2.0, [[R1]]

; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r1.fneg = fneg float %r1

  %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1.fneg, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; fmuladd(2.0, r1, -r2): fneg applied to the addend.
; GCN-LABEL: {{^}}fmuladd_2.0_a_neg_b_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
; GCN-FLUSH-MAD: v_mad_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
; GCN-FLUSH-FMAC: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]

; SI-FLUSH: buffer_store_dword [[RESULT]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r2.fneg = fneg float %r2

  %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2.fneg)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; (a * b) - c with plain fmul/fsub.
; GCN-LABEL: {{^}}mad_sub_f32:
; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]
; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; SI: buffer_store_dword [[RESULT]]
; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %mul = fmul float %a, %b
  %sub = fsub float %mul, %c
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; c - (a * b) with plain fmul/fsub.
; GCN-LABEL: {{^}}mad_sub_inv_f32:
; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]

; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]

; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]

; SI: buffer_store_dword [[RESULT]]
; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %mul = fmul float %a, %b
  %sub = fsub float %c, %mul
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; (a * b) - |c| with plain fmul/fsub and llvm.fabs on the subtrahend.
; GCN-LABEL: {{^}}mad_sub_fabs_f32:
; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]
; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|

; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|

; SI: buffer_store_dword [[RESULT]]
; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %c.abs = call float @llvm.fabs.f32(float %c) #0
  %mul = fmul float %a, %b
  %sub = fsub float %mul, %c.abs
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; |c| - (a * b) with plain fmul/fsub and llvm.fabs on the minuend.
; GCN-LABEL: {{^}}mad_sub_fabs_inv_f32:
; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]
; GCN-FLUSH-MAD: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
; GCN-FLUSH-FMA: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]

; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]

; SI: buffer_store_dword [[RESULT]]
; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %c.abs = call float @llvm.fabs.f32(float %c) #0
  %mul = fmul float %a, %b
  %sub = fsub float %c.abs, %mul
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; (-a * -b) + c: both multiplicands are negated before the fmul.
; GCN-LABEL: {{^}}neg_neg_mad_f32:
; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]

; GCN-FLUSH: v_mac_f32_e32 [[REGC]], [[REGA]], [[REGB]]
; SI-FLUSH: buffer_store_dword [[REGC]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]

; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %nega = fneg float %a
  %negb = fneg float %b
  %mul = fmul float %nega, %negb
  %add = fadd float %mul, %c
  store float %add, float addrspace(1)* %outgep, align 4
  ret void
}

; (a * |b|) - c with llvm.fabs on one multiplicand.
; GCN-LABEL: {{^}}mad_fabs_sub_f32:
; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]
; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; GCN-DENORM-STRICT: v_mul_f32_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; SI: buffer_store_dword [[RESULT]]
; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %b.abs = call float @llvm.fabs.f32(float %b) #0
  %mul = fmul float %a, %b.abs
  %sub = fsub float %mul, %c
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; GCN-LABEL: {{^}}fsub_c_fadd_a_a_f32:
; r2 - (r1 + r1) written with a plain fadd feeding a plain fsub.
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
; GCN-FLUSH: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fsub_c_fadd_a_a_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %add = fadd float %r1, %r1
  %r3 = fsub float %r2, %add

  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; (r1 + r1) - r2 written with a plain fadd feeding a plain fsub.
; GCN-LABEL: {{^}}fsub_fadd_a_a_c_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; SI: buffer_store_dword [[RESULT]]
; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fsub_fadd_a_a_c_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %add = fadd float %r1, %r1
  %r3 = fsub float %add, %r2

  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; Attribute groups referenced as #0 (kernels and call sites) and #1 (intrinsic declarations).
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }