; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefix=SI-FMA -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s

; FIXME: Remove enable-unsafe-fp-math in RUN line and add flags to IR instrs

; Note: The SI-FMA conversions of type x * (y + 1) --> x * y + x would be
; beneficial even without fp32 denormals, but they do require no-infs-fp-math
; for correctness.
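;
; An illustrative counterexample (not checked below): with x = inf and
; y = 0.0, x * (y + 1) evaluates to inf * 1.0 = inf, but the contracted
; form fma(x, y, x) computes fma(inf, 0.0, inf), which raises invalid and
; returns nan. The f64 mul/add pairs below, by contrast, are contracted to
; v_fma_f64 in every RUN configuration, so those checks use the shared SI
; prefix.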

declare i32 @llvm.amdgcn.workitem.id.x() #0
declare double @llvm.fabs.f64(double) #0
declare double @llvm.fma.f64(double, double, double) #0
declare float @llvm.fma.f32(float, float, float) #0
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #0

; (fadd (fmul x, y), z) -> (fma x, y, z)
; FUNC-LABEL: {{^}}combine_to_fma_f64_0:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %fma = fadd double %mul, %c
  store double %fma, double addrspace(1)* %gep.out
  ret void
}

; (fadd (fmul x, y), z) -> (fma x, y, z)
; FUNC-LABEL: {{^}}combine_to_fma_f64_0_2use:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %fma0 = fadd double %mul, %c
  %fma1 = fadd double %mul, %d
  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}

; (fadd x, (fmul y, z)) -> (fma y, z, x)
; FUNC-LABEL: {{^}}combine_to_fma_f64_1:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %fma = fadd double %c, %mul
  store double %fma, double addrspace(1)* %gep.out
  ret void
}

; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_0_f64:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %fma = fsub double %mul, %c
  store double %fma, double addrspace(1)* %gep.out
  ret void
}

; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_f64_0_2use:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %fma0 = fsub double %mul, %c
  %fma1 = fsub double %mul, %d
  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}

; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %fma = fsub double %c, %mul
  store double %fma, double addrspace(1)* %gep.out
  ret void
}

; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64_2use:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %fma0 = fsub double %c, %mul
  %fma1 = fsub double %d, %mul
  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
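; In the IR below, the fneg of the mul is written with the canonical
; idiom fsub double -0.0, %mul, which the combiner recognizes as a
; negation.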
; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %mul.neg = fsub double -0.0, %mul
  %fma = fsub double %mul.neg, %c

  store double %fma, double addrspace(1)* %gep.out
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_neg:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %mul.neg = fsub double -0.0, %mul
  %fma0 = fsub double %mul.neg, %c
  %fma1 = fsub double %mul.neg, %d

  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_mul:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %mul.neg = fsub double -0.0, %mul
  %fma0 = fsub double %mul.neg, %c
  %fma1 = fsub double %mul, %d

  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}

; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, (fneg z)))
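; Note that only the unsafe run reassociates here: SI-UNSAFE folds the fsub
; into the inner operand as fma(u, v, -z), while SI-SAFE keeps the separate
; v_mul_f64/v_fma_f64 pair and only folds the fsub into v_add_f64 with a
; negated source modifier.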

; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_0_f64:
; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}}

; SI-SAFE: v_mul_f64 [[TMP0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]]
; SI-SAFE: v_fma_f64 [[TMP1:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[TMP0]]
; SI-SAFE: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[TMP1]], -[[Z]]

; SI-UNSAFE: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]], -[[Z]]
; SI-UNSAFE: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[FMA0]]

; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %x = load volatile double, double addrspace(1)* %gep.0
  %y = load volatile double, double addrspace(1)* %gep.1
  %z = load volatile double, double addrspace(1)* %gep.2
  %u = load volatile double, double addrspace(1)* %gep.3
  %v = load volatile double, double addrspace(1)* %gep.4

  %tmp0 = fmul double %u, %v
  %tmp1 = call double @llvm.fma.f64(double %x, double %y, double %tmp0) #0
  %tmp2 = fsub double %tmp1, %z

  store double %tmp2, double addrspace(1)* %gep.out
  ret void
}

; fold (fsub x, (fma y, z, (fmul u, v)))
;   -> (fma (fneg y), z, (fma (fneg u), v, x))

; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_1_f64:
; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}}

; SI-SAFE: v_mul_f64 [[TMP0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]]
; SI-SAFE: v_fma_f64 [[TMP1:v\[[0-9]+:[0-9]+\]]], [[Y]], [[Z]], [[TMP0]]
; SI-SAFE: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], -[[TMP1]]

; SI-UNSAFE: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], -[[U]], [[V]], [[X]]
; SI-UNSAFE: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[Y]], [[Z]], [[FMA0]]

; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %x = load volatile double, double addrspace(1)* %gep.0
  %y = load volatile double, double addrspace(1)* %gep.1
  %z = load volatile double, double addrspace(1)* %gep.2
  %u = load volatile double, double addrspace(1)* %gep.3
  %v = load volatile double, double addrspace(1)* %gep.4

  ; The nsz flag is needed since this combine may change the sign of zero
  %tmp0 = fmul nsz double %u, %v
  %tmp1 = call nsz double @llvm.fma.f64(double %y, double %z, double %tmp0) #0
  %tmp2 = fsub nsz double %x, %tmp1

  store double %tmp2, double addrspace(1)* %gep.out
  ret void
}

;
; Patterns (+ fneg variants): mul(add(1.0,x),y), mul(sub(1.0,x),y), mul(sub(x,1.0),y)
;
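;
; With FMA enabled, the checks below expect these to fold as:
;   (x + 1.0) * y --> fma(x, y, y)
;   (1.0 - x) * y --> fma(-x, y, y)
;   (x - 1.0) * y --> fma(x, y, -y)
; with the -1.0 variants flipping the sign of the trailing y operand.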

; FUNC-LABEL: {{^}}test_f32_mul_add_x_one_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_add_x_one_y(float addrspace(1)* %out,
                                                    float addrspace(1)* %in1,
                                                    float addrspace(1)* %in2) {
  %x = load volatile float, float addrspace(1)* %in1
  %y = load volatile float, float addrspace(1)* %in2
  %a = fadd float %x, 1.0
  %m = fmul float %a, %y
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_one:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_add_x_one(float addrspace(1)* %out,
                                                    float addrspace(1)* %in1,
                                                    float addrspace(1)* %in2) {
  %x = load volatile float, float addrspace(1)* %in1
  %y = load volatile float, float addrspace(1)* %in2
  %a = fadd float %x, 1.0
  %m = fmul float %y, %a
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_add_x_negone_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out,
                                                       float addrspace(1)* %in1,
                                                       float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %a = fadd float %x, -1.0
  %m = fmul float %a, %y
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_negone:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_add_x_negone(float addrspace(1)* %out,
                                                       float addrspace(1)* %in1,
                                                       float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %a = fadd float %x, -1.0
  %m = fmul float %y, %a
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_sub_one_x_y:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out,
                                                    float addrspace(1)* %in1,
                                                    float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float 1.0, %x
  %m = fmul float %s, %y
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_y_sub_one_x:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out,
                                                    float addrspace(1)* %in1,
                                                    float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float 1.0, %x
  %m = fmul float %y, %s
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_sub_negone_x_y:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out,
                                                       float addrspace(1)* %in1,
                                                       float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float -1.0, %x
  %m = fmul float %s, %y
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_y_sub_negone_x:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out,
                                                       float addrspace(1)* %in1,
                                                       float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float -1.0, %x
  %m = fmul float %y, %s
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_sub_x_one_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out,
                                                    float addrspace(1)* %in1,
                                                    float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float %x, 1.0
  %m = fmul float %s, %y
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_one:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out,
                                                    float addrspace(1)* %in1,
                                                    float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float %x, 1.0
  %m = fmul float %y, %s
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_sub_x_negone_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out,
                                                       float addrspace(1)* %in1,
                                                       float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float %x, -1.0
  %m = fmul float %s, %y
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_negone:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out,
                                                       float addrspace(1)* %in1,
                                                       float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float %x, -1.0
  %m = fmul float %y, %s
  store float %m, float addrspace(1)* %out
  ret void
}

;
; Interpolation Patterns: add(mul(x,t),mul(sub(1.0,t),y))
;
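;
; This is the linear interpolation x*t + y*(1 - t). With FMA the checks
; expect it to factor as fma(x, t, fma(-t, y, y)); without FMA the (1 - t)
; term stays a separate subtract (an add with a negated source for f64)
; followed by a multiply, with a mac (f32) or fma (f64) for the final add.
;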

; FUNC-LABEL: {{^}}test_f32_interp:
; SI-NOFMA: v_sub_f32_e32 [[VT1:v[0-9]]], 1.0, [[VT:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 [[VTY:v[0-9]]], [[VY:v[0-9]]], [[VT1]]
; SI-NOFMA: v_mac_f32_e32 [[VTY]], [[VX:v[0-9]]], [[VT]]
;
; SI-FMA: v_fma_f32 [[VR:v[0-9]]], -[[VT:v[0-9]]], [[VY:v[0-9]]], [[VY]]
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VT]], [[VR]]
define amdgpu_kernel void @test_f32_interp(float addrspace(1)* %out,
                                           float addrspace(1)* %in1,
                                           float addrspace(1)* %in2,
                                           float addrspace(1)* %in3) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %t = load float, float addrspace(1)* %in3
  %t1 = fsub float 1.0, %t
  %tx = fmul float %x, %t
  %ty = fmul float %y, %t1
  %r = fadd float %tx, %ty
  store float %r, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f64_interp:
; SI-NOFMA: v_add_f64 [[VT1:v\[[0-9]+:[0-9]+\]]], -[[VT:v\[[0-9]+:[0-9]+\]]], 1.0
; SI-NOFMA: v_mul_f64 [[VTY:v\[[0-9]+:[0-9]+\]]], [[VY:v\[[0-9]+:[0-9]+\]]], [[VT1]]
; SI-NOFMA: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VTY]]
;
; SI-FMA: v_fma_f64 [[VR:v\[[0-9]+:[0-9]+\]]], -[[VT:v\[[0-9]+:[0-9]+\]]], [[VY:v\[[0-9]+:[0-9]+\]]], [[VY]]
; SI-FMA: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VR]]
define amdgpu_kernel void @test_f64_interp(double addrspace(1)* %out,
                                           double addrspace(1)* %in1,
                                           double addrspace(1)* %in2,
                                           double addrspace(1)* %in3) {
  %x = load double, double addrspace(1)* %in1
  %y = load double, double addrspace(1)* %in2
  %t = load double, double addrspace(1)* %in3
  %t1 = fsub double 1.0, %t
  %tx = fmul double %x, %t
  %ty = fmul double %y, %t1
  %r = fadd double %tx, %ty
  store double %r, double addrspace(1)* %out
  ret void
}

; Make sure negative constant cancels out fneg
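; fma(-2.0, -a, b) is expected to fold to fma(a, 2.0, b) (and fma(2.0, -a, b)
; to fma(a, -2.0, b)) with no separate negate of the loaded value, which is
; what the SI-NOT lines verify.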
; SI-LABEL: {{^}}fma_neg_2.0_neg_a_b_f32:
; SI: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; SI: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; SI-NOT: [[A]]
; SI-NOT: [[B]]
; SI: v_fma_f32 v{{[0-9]+}}, [[A]], 2.0, [[B]]
define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r1.fneg = fsub float -0.000000e+00, %r1

  %r3 = tail call float @llvm.fma.f32(float -2.0, float %r1.fneg, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; SI-LABEL: {{^}}fma_2.0_neg_a_b_f32:
; SI: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; SI: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; SI-NOT: [[A]]
; SI-NOT: [[B]]
; SI: v_fma_f32 v{{[0-9]+}}, [[A]], -2.0, [[B]]
define amdgpu_kernel void @fma_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r1.fneg = fsub float -0.000000e+00, %r1

  %r3 = tail call float @llvm.fma.f32(float 2.0, float %r1.fneg, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; SI-LABEL: {{^}}fma_neg_b_c_v4f32:
; SI: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
; SI: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
; SI: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
; SI: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
define amdgpu_kernel void @fma_neg_b_c_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #2 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr <4 x float>, <4 x float> addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %gep.1, i32 2
  %gep.out = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i32 %tid

  %tmp0 = load <4 x float>, <4 x float> addrspace(1)* %gep.0
  %tmp1 = load <4 x float>, <4 x float> addrspace(1)* %gep.1
  %tmp2 = load <4 x float>, <4 x float> addrspace(1)* %gep.2

  %fneg0 = fneg fast <4 x float> %tmp0
  %fneg1 = fneg fast <4 x float> %tmp1
  %fma0 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %tmp2, <4 x float> %fneg0, <4 x float> %fneg1)

  store <4 x float> %fma0, <4 x float> addrspace(1)* %gep.out
  ret void
}

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
attributes #2 = { nounwind "no-signed-zeros-fp-math"="true" }