; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math -mattr=+fp32-denormals < %s | FileCheck -enable-var-scope -check-prefix=SI-FMA -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s

; Note: The SI-FMA conversions of type x * (y + 1) --> x * y + x would be
; beneficial even without fp32 denormals, but they do require no-infs-fp-math
; for correctness.
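;
; Illustrative counterexample (not part of the checked output): with
; x = +inf and y = 0.0,
;   x * (y + 1.0) = inf * 1.0       = +inf
;   x * y + x     = inf * 0.0 + inf = nan
; so the rewrite is only sound when infinities are excluded.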

declare i32 @llvm.amdgcn.workitem.id.x() #0
declare double @llvm.fabs.f64(double) #0
declare double @llvm.fma.f64(double, double, double) #0
declare float @llvm.fma.f32(float, float, float) #0

; (fadd (fmul x, y), z) -> (fma x, y, z)
; FUNC-LABEL: {{^}}combine_to_fma_f64_0:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %fma = fadd double %mul, %c
  store double %fma, double addrspace(1)* %gep.out
  ret void
}

; (fadd (fmul x, y), z) -> (fma x, y, z)
; FUNC-LABEL: {{^}}combine_to_fma_f64_0_2use:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %fma0 = fadd double %mul, %c
  %fma1 = fadd double %mul, %d
  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}

; (fadd x, (fmul y, z)) -> (fma y, z, x)
; FUNC-LABEL: {{^}}combine_to_fma_f64_1:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %fma = fadd double %c, %mul
  store double %fma, double addrspace(1)* %gep.out
  ret void
}
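
; In the fsub variants below, each fneg is expected to fold into a VOP3
; source modifier of v_fma_f64 (the "-" on an operand in the checks), so
; no separate negate instruction should be emitted.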
; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_0_f64:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %fma = fsub double %mul, %c
  store double %fma, double addrspace(1)* %gep.out
  ret void
}

; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_f64_0_2use:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %fma0 = fsub double %mul, %c
  %fma1 = fsub double %mul, %d
  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}

; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %fma = fsub double %c, %mul
  store double %fma, double addrspace(1)* %gep.out
  ret void
}

; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64_2use:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %fma0 = fsub double %c, %mul
  %fma1 = fsub double %d, %mul
  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}
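
; The fsub_2 variants below negate the product first; the negation is
; written in its canonical IR form, fsub -0.0, x.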
; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %mul.neg = fsub double -0.0, %mul
  %fma = fsub double %mul.neg, %c

  store double %fma, double addrspace(1)* %gep.out
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_neg:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %mul.neg = fsub double -0.0, %mul
  %fma0 = fsub double %mul.neg, %c
  %fma1 = fsub double %mul.neg, %d

  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_mul:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %mul.neg = fsub double -0.0, %mul
  %fma0 = fsub double %mul.neg, %c
  %fma1 = fsub double %mul, %d

  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}

; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, (fneg z)))
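;
; This fold reassociates the outer fsub, so it is only expected in the
; unsafe-fp-math run: the SI-SAFE checks keep the mul/fma/add sequence,
; while the SI-UNSAFE checks form two nested fmas.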

; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_0_f64:
; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}}

; SI-SAFE: v_mul_f64 [[TMP0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]]
; SI-SAFE: v_fma_f64 [[TMP1:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[TMP0]]
; SI-SAFE: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[TMP1]], -[[Z]]

; SI-UNSAFE: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]], -[[Z]]
; SI-UNSAFE: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[FMA0]]

; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %x = load volatile double, double addrspace(1)* %gep.0
  %y = load volatile double, double addrspace(1)* %gep.1
  %z = load volatile double, double addrspace(1)* %gep.2
  %u = load volatile double, double addrspace(1)* %gep.3
  %v = load volatile double, double addrspace(1)* %gep.4

  %tmp0 = fmul double %u, %v
  %tmp1 = call double @llvm.fma.f64(double %x, double %y, double %tmp0) #0
  %tmp2 = fsub double %tmp1, %z

  store double %tmp2, double addrspace(1)* %gep.out
  ret void
}

; fold (fsub x, (fma y, z, (fmul u, v)))
; -> (fma (fneg y), z, (fma (fneg u), v, x))

; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_1_f64:
; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}}

; SI-SAFE: v_mul_f64 [[TMP0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]]
; SI-SAFE: v_fma_f64 [[TMP1:v\[[0-9]+:[0-9]+\]]], [[Y]], [[Z]], [[TMP0]]
; SI-SAFE: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], -[[TMP1]]

; SI-UNSAFE: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], -[[U]], [[V]], [[X]]
; SI-UNSAFE: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[Y]], [[Z]], [[FMA0]]

; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %x = load volatile double, double addrspace(1)* %gep.0
  %y = load volatile double, double addrspace(1)* %gep.1
  %z = load volatile double, double addrspace(1)* %gep.2
  %u = load volatile double, double addrspace(1)* %gep.3
  %v = load volatile double, double addrspace(1)* %gep.4

  %tmp0 = fmul double %u, %v
  %tmp1 = call double @llvm.fma.f64(double %y, double %z, double %tmp0) #0
  %tmp2 = fsub double %x, %tmp1

  store double %tmp2, double addrspace(1)* %gep.out
  ret void
}

;
; Patterns (+ fneg variants): mul(add(1.0,x),y), mul(sub(1.0,x),y), mul(sub(x,1.0),y)
;
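; Expected algebra for these folds (valid only given no-infs-fp-math):
;   (x + 1.0) * y  = x*y + y   -> fma(x, y, y)
;   (x - 1.0) * y  = x*y - y   -> fma(x, y, -y)
;   (1.0 - x) * y  = y - x*y   -> fma(-x, y, y)
;   (-1.0 - x) * y = -y - x*y  -> fma(-x, y, -y)
;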

; FUNC-LABEL: {{^}}test_f32_mul_add_x_one_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_add_x_one_y(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) {
  %x = load volatile float, float addrspace(1)* %in1
  %y = load volatile float, float addrspace(1)* %in2
  %a = fadd float %x, 1.0
  %m = fmul float %a, %y
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_one:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_add_x_one(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) {
  %x = load volatile float, float addrspace(1)* %in1
  %y = load volatile float, float addrspace(1)* %in2
  %a = fadd float %x, 1.0
  %m = fmul float %y, %a
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_add_x_negone_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %a = fadd float %x, -1.0
  %m = fmul float %a, %y
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_negone:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_add_x_negone(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %a = fadd float %x, -1.0
  %m = fmul float %y, %a
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_sub_one_x_y:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float 1.0, %x
  %m = fmul float %s, %y
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_y_sub_one_x:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float 1.0, %x
  %m = fmul float %y, %s
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_sub_negone_x_y:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float -1.0, %x
  %m = fmul float %s, %y
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_y_sub_negone_x:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float -1.0, %x
  %m = fmul float %y, %s
  store float %m, float addrspace(1)* %out
  ret void
}
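
; In the remaining variants the constant is subtracted from x; x - c is
; lowered as x + (-c), so the SI-NOFMA checks below expect v_add_f32
; with an inline constant rather than v_sub_f32.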
; FUNC-LABEL: {{^}}test_f32_mul_sub_x_one_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float %x, 1.0
  %m = fmul float %s, %y
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_one:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float %x, 1.0
  %m = fmul float %y, %s
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_sub_x_negone_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float %x, -1.0
  %m = fmul float %s, %y
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_negone:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float %x, -1.0
  %m = fmul float %y, %s
  store float %m, float addrspace(1)* %out
  ret void
}

;
; Interpolation Patterns: add(mul(x,t),mul(sub(1.0,t),y))
;
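; The lerp is expected to fuse as
;   x*t + y*(1.0 - t) = x*t + (y - y*t) -> fma(x, t, fma(-t, y, y))
; which, like the folds above, relies on no-infs for the inner step.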

; FUNC-LABEL: {{^}}test_f32_interp:
; SI-NOFMA: v_sub_f32_e32 [[VT1:v[0-9]]], 1.0, [[VT:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 [[VTY:v[0-9]]], [[VY:v[0-9]]], [[VT1]]
; SI-NOFMA: v_mac_f32_e32 [[VTY]], [[VX:v[0-9]]], [[VT]]
;
; SI-FMA: v_fma_f32 [[VR:v[0-9]]], -[[VT:v[0-9]]], [[VY:v[0-9]]], [[VY]]
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VT]], [[VR]]
define amdgpu_kernel void @test_f32_interp(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2, float addrspace(1)* %in3) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %t = load float, float addrspace(1)* %in3
  %t1 = fsub float 1.0, %t
  %tx = fmul float %x, %t
  %ty = fmul float %y, %t1
  %r = fadd float %tx, %ty
  store float %r, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f64_interp:
; SI-NOFMA: v_add_f64 [[VT1:v\[[0-9]+:[0-9]+\]]], -[[VT:v\[[0-9]+:[0-9]+\]]], 1.0
; SI-NOFMA: v_mul_f64 [[VTY:v\[[0-9]+:[0-9]+\]]], [[VY:v\[[0-9]+:[0-9]+\]]], [[VT1]]
; SI-NOFMA: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VTY]]
;
; SI-FMA: v_fma_f64 [[VR:v\[[0-9]+:[0-9]+\]]], -[[VT:v\[[0-9]+:[0-9]+\]]], [[VY:v\[[0-9]+:[0-9]+\]]], [[VY]]
; SI-FMA: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VR]]
define amdgpu_kernel void @test_f64_interp(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2, double addrspace(1)* %in3) {
  %x = load double, double addrspace(1)* %in1
  %y = load double, double addrspace(1)* %in2
  %t = load double, double addrspace(1)* %in3
  %t1 = fsub double 1.0, %t
  %tx = fmul double %x, %t
  %ty = fmul double %y, %t1
  %r = fadd double %tx, %ty
  store double %r, double addrspace(1)* %out
  ret void
}

; Make sure the negative constant cancels out the fneg
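; fma(-2.0, fneg(a), b) is equivalent to fma(2.0, a, b): the fneg folds
; into the sign of the immediate, so no negate of the loaded value
; should remain.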
; GCN-LABEL: {{^}}fma_neg_2.0_neg_a_b_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN-NOT: [[A]]
; GCN-NOT: [[B]]
; GCN: v_fma_f32 v{{[0-9]+}}, [[A]], 2.0, [[B]]
define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r1.fneg = fsub float -0.000000e+00, %r1

  %r3 = tail call float @llvm.fma.f32(float -2.0, float %r1.fneg, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; GCN-LABEL: {{^}}fma_2.0_neg_a_b_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN-NOT: [[A]]
; GCN-NOT: [[B]]
; GCN: v_fma_f32 v{{[0-9]+}}, [[A]], -2.0, [[B]]
define amdgpu_kernel void @fma_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r1.fneg = fsub float -0.000000e+00, %r1

  %r3 = tail call float @llvm.fma.f32(float 2.0, float %r1.fneg, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }