; Make sure we still form mad even when unsafe math or fp-contract is allowed instead of fma.

; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s

; Make sure we don't form mad with denormals
; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=verde -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-DENORM-SLOWFMAF -check-prefix=FUNC %s

declare i32 @llvm.amdgcn.workitem.id.x() #0
declare float @llvm.fabs.f32(float) #0
declare float @llvm.fma.f32(float, float, float) #0
declare float @llvm.fmuladd.f32(float, float, float) #0

; (fadd (fmul x, y), z) -> (fma x, y, z)
; FUNC-LABEL: {{^}}combine_to_mad_f32_0:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mac_f32_e32 [[C]], [[B]], [[A]]

; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]

; SI-DENORM-SLOWFMAF-NOT: v_fma
; SI-DENORM-SLOWFMAF-NOT: v_mad

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; SI-STD: buffer_store_dword [[C]]
define void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2

  %mul = fmul float %a, %b
  %fma = fadd float %mul, %c
  store float %fma, float addrspace(1)* %gep.out
  ret void
}

; (fadd (fmul x, y), z) -> (fma x, y, z)
; FUNC-LABEL: {{^}}combine_to_mad_f32_0_2use:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mac_f32_e32 [[C]], [[B]], [[A]]
; SI-STD-DAG: v_mac_f32_e32 [[D]], [[B]], [[A]]

; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]]
; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]

; SI-DENORM-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DENORM-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-STD-DAG: buffer_store_dword [[C]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-STD-DAG: buffer_store_dword [[D]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2
  %d = load volatile float, float addrspace(1)* %gep.3

  %mul = fmul float %a, %b
  %fma0 = fadd float %mul, %c
  %fma1 = fadd float %mul, %d

  store volatile float %fma0, float addrspace(1)* %gep.out.0
  store volatile float %fma1, float addrspace(1)* %gep.out.1
  ret void
}

; (fadd x, (fmul y, z)) -> (fma y, z, x)
; FUNC-LABEL: {{^}}combine_to_mad_f32_1:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mac_f32_e32 [[C]], [[B]], [[A]]
; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; SI-STD: buffer_store_dword [[C]]
define void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2

  %mul = fmul float %a, %b
  %fma = fadd float %c, %mul
  store float %fma, float addrspace(1)* %gep.out
  ret void
}

; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]
; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]

; SI: buffer_store_dword [[RESULT]]
define void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2

  %mul = fmul float %a, %b
  %fma = fsub float %mul, %c
  store float %fma, float addrspace(1)* %gep.out
  ret void
}

; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32_2use:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]]
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]

; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]]
; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]

; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2
  %d = load volatile float, float addrspace(1)* %gep.3

  %mul = fmul float %a, %b
  %fma0 = fsub float %mul, %c
  %fma1 = fsub float %mul, %d
  store volatile float %fma0, float addrspace(1)* %gep.out.0
  store volatile float %fma1, float addrspace(1)* %gep.out.1
  ret void
}

; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]
; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]

; SI: buffer_store_dword [[RESULT]]
define void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2

  %mul = fmul float %a, %b
  %fma = fsub float %c, %mul
  store float %fma, float addrspace(1)* %gep.out
  ret void
}

; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32_2use:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]]
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]]

; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]]
; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]

; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2
  %d = load volatile float, float addrspace(1)* %gep.3

  %mul = fmul float %a, %b
  %fma0 = fsub float %c, %mul
  %fma1 = fsub float %d, %mul
  store volatile float %fma0, float addrspace(1)* %gep.out.0
  store volatile float %fma1, float addrspace(1)* %gep.out.1
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]]

; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
; SI-DENORM-SLOWFMAF: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[TMP]], [[C]]

; SI: buffer_store_dword [[RESULT]]
define void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2

  %mul = fmul float %a, %b
  %mul.neg = fsub float -0.0, %mul
  %fma = fsub float %mul.neg, %c

  store float %fma, float addrspace(1)* %gep.out
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_neg:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]]

; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT1:v[0-9]+]], -[[TMP]], [[D]]

; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2
  %d = load volatile float, float addrspace(1)* %gep.3

  %mul = fmul float %a, %b
  %mul.neg = fsub float -0.0, %mul
  %fma0 = fsub float %mul.neg, %c
  %fma1 = fsub float %mul.neg, %d

  store volatile float %fma0, float addrspace(1)* %gep.out.0
  store volatile float %fma1, float addrspace(1)* %gep.out.1
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_mul:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]

; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]]
; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]

; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2
  %d = load volatile float, float addrspace(1)* %gep.3

  %mul = fmul float %a, %b
  %mul.neg = fsub float -0.0, %mul
  %fma0 = fsub float %mul.neg, %c
  %fma1 = fsub float %mul, %d

  store volatile float %fma0, float addrspace(1)* %gep.out.0
  store volatile float %fma1, float addrspace(1)* %gep.out.1
  ret void
}

; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, (fneg z)))

; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_0_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}

; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
; SI-STD: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP1]]

; SI-DENORM: v_fma_f32 [[TMP0:v[0-9]+]], [[D]], [[E]], -[[C]]
; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP0]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
; SI-DENORM-SLOWFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP1]]

; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
define void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %x = load volatile float, float addrspace(1)* %gep.0
  %y = load volatile float, float addrspace(1)* %gep.1
  %z = load volatile float, float addrspace(1)* %gep.2
  %u = load volatile float, float addrspace(1)* %gep.3
  %v = load volatile float, float addrspace(1)* %gep.4

  %tmp0 = fmul float %u, %v
  %tmp1 = call float @llvm.fma.f32(float %x, float %y, float %tmp0) #0
  %tmp2 = fsub float %tmp1, %z

  store float %tmp2, float addrspace(1)* %gep.out
  ret void
}

; fold (fsub x, (fma y, z, (fmul u, v)))
; -> (fma (fneg y), z, (fma (fneg u), v, x))

; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_1_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}

; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
; SI-STD: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]]

; SI-DENORM: v_fma_f32 [[TMP0:v[0-9]+]], -[[D]], [[E]], [[A]]
; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP0]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
; SI-DENORM-SLOWFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]]

; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
define void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %x = load volatile float, float addrspace(1)* %gep.0
  %y = load volatile float, float addrspace(1)* %gep.1
  %z = load volatile float, float addrspace(1)* %gep.2
  %u = load volatile float, float addrspace(1)* %gep.3
  %v = load volatile float, float addrspace(1)* %gep.4

  %tmp0 = fmul float %u, %v
  %tmp1 = call float @llvm.fma.f32(float %y, float %z, float %tmp0) #0
  %tmp2 = fsub float %x, %tmp1

  store float %tmp2, float addrspace(1)* %gep.out
  ret void
}

; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, (fneg z)))

; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_2_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}

; SI-STD: v_mad_f32 [[TMP:v[0-9]+]], [[D]], [[E]], -[[C]]
; SI-STD: v_mac_f32_e32 [[TMP]], [[B]], [[A]]

; SI-DENORM: v_fma_f32 [[TMP:v[0-9]+]], [[D]], [[E]], -[[C]]
; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP]]

; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[B]], [[A]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]]
; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP2]]

; SI-DENORM: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-STD: buffer_store_dword [[TMP]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
define void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %x = load volatile float, float addrspace(1)* %gep.0
  %y = load volatile float, float addrspace(1)* %gep.1
  %z = load volatile float, float addrspace(1)* %gep.2
  %u = load volatile float, float addrspace(1)* %gep.3
  %v = load volatile float, float addrspace(1)* %gep.4

  %tmp0 = fmul float %u, %v
  %tmp1 = call float @llvm.fmuladd.f32(float %x, float %y, float %tmp0) #0
  %tmp2 = fsub float %tmp1, %z

  store float %tmp2, float addrspace(1)* %gep.out
  ret void
}

; fold (fsub x, (fmuladd y, z, (fmul u, v)))
; -> (fmuladd (fneg y), z, (fmuladd (fneg u), v, x))

; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_3_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}

; SI-STD: v_mad_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]]
; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]]

; SI-DENORM: v_fma_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]]
; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]]

; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[C]], [[B]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]]
; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP2]], [[A]]

; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
define void @aggressive_combine_to_mad_fsub_3_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %x = load volatile float, float addrspace(1)* %gep.0
  %y = load volatile float, float addrspace(1)* %gep.1
  %z = load volatile float, float addrspace(1)* %gep.2
  %u = load volatile float, float addrspace(1)* %gep.3
  %v = load volatile float, float addrspace(1)* %gep.4

  %tmp0 = fmul float %u, %v
  %tmp1 = call float @llvm.fmuladd.f32(float %y, float %z, float %tmp0) #0
  %tmp2 = fsub float %x, %tmp1

  store float %tmp2, float addrspace(1)* %gep.out
  ret void
}

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }