; RUN: llc -march=amdgcn -mattr=+fast-fmaf,-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s
; RUN: llc -march=amdgcn -mattr=-fast-fmaf,-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s

; RUN: llc -march=amdgcn -mattr=+fast-fmaf,+fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FASTFMA %s
; RUN: llc -march=amdgcn -mattr=-fast-fmaf,+fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-SLOWFMA %s

; FIXME: This should also fold when fma is actually fast if an FMA
; exists in the original program.

; (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, z))

; GCN-LABEL: {{^}}fast_add_fmuladd_fmul:
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: buffer_load_dword [[Z:v[0-9]+]]
; GCN: buffer_load_dword [[U:v[0-9]+]]
; GCN: buffer_load_dword [[V:v[0-9]+]]

; GCN-FLUSH: v_mac_f32_e32 [[Z]], [[U]], [[V]]
; GCN-FLUSH-NEXT: v_mac_f32_e32 [[Z]], [[X]], [[Y]]
; GCN-FLUSH-NEXT: buffer_store_dword [[Z]]

; GCN-FASTFMA: v_fma_f32 [[FMA0:v[0-9]+]], [[U]], [[V]], [[Z]]
; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[FMA0]]
; GCN-FASTFMA: buffer_store_dword [[FMA1]]

; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
define amdgpu_kernel void @fast_add_fmuladd_fmul() #0 {
  %x = load volatile float, float addrspace(1)* undef
  %y = load volatile float, float addrspace(1)* undef
  %z = load volatile float, float addrspace(1)* undef
  %u = load volatile float, float addrspace(1)* undef
  %v = load volatile float, float addrspace(1)* undef
  %mul.u.v = fmul fast float %u, %v
  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
  %add = fadd fast float %fma, %z
  store volatile float %add, float addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}fast_sub_fmuladd_fmul:
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: buffer_load_dword [[Z:v[0-9]+]]
; GCN: buffer_load_dword [[U:v[0-9]+]]
; GCN: buffer_load_dword [[V:v[0-9]+]]

; NOTE(review): the final result lives in the mac destination captured as
; TMP; storing the Z capture below only matches if llc reuses Z's register
; as the mad/mac destination -- confirm against actual llc output.
; GCN-FLUSH: v_mad_f32 [[TMP:v[0-9]]], [[U]], [[V]], -[[Z]]
; GCN-FLUSH-NEXT: v_mac_f32_e32 [[TMP]], [[X]], [[Y]]
; GCN-FLUSH-NEXT: buffer_store_dword [[Z]]

; GCN-FASTFMA: v_fma_f32 [[FMA0:v[0-9]+]], [[U]], [[V]], -[[Z]]
; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[FMA0]]
; GCN-FASTFMA: buffer_store_dword [[FMA1]]
define amdgpu_kernel void @fast_sub_fmuladd_fmul() #0 {
  %x = load volatile float, float addrspace(1)* undef
  %y = load volatile float, float addrspace(1)* undef
  %z = load volatile float, float addrspace(1)* undef
  %u = load volatile float, float addrspace(1)* undef
  %v = load volatile float, float addrspace(1)* undef
  %mul.u.v = fmul fast float %u, %v
  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
  %add = fsub fast float %fma, %z
  store volatile float %add, float addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}fast_add_fmuladd_fmul_multi_use_mul:
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: buffer_load_dword [[Z:v[0-9]+]]
; GCN: buffer_load_dword [[U:v[0-9]+]]
; GCN: buffer_load_dword [[V:v[0-9]+]]

; GCN-FLUSH-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
; GCN-FLUSH-DAG: v_mac_f32_e32 [[MUL]], [[X]], [[Y]]
; GCN-FLUSH: v_add_f32_e32 v{{[0-9]+}}, [[U]], [[Z]]

; GCN-FASTFMA: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[MUL]]
; GCN-FASTFMA: v_add_f32_e32 v{{[0-9]+}}, [[FMA1]], [[Z]]

; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul() #0 {
  %x = load volatile float, float addrspace(1)* undef
  %y = load volatile float, float addrspace(1)* undef
  %z = load volatile float, float addrspace(1)* undef
  %u = load volatile float, float addrspace(1)* undef
  %v = load volatile float, float addrspace(1)* undef
  %mul.u.v = fmul fast float %u, %v
  store volatile float %mul.u.v, float addrspace(1)* undef
  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
  %add = fadd fast float %fma, %z
  store volatile float %add, float addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}fast_add_fmuladd_fmul_multi_use_mul_commute:
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: buffer_load_dword [[Z:v[0-9]+]]
; GCN: buffer_load_dword [[U:v[0-9]+]]
; GCN: buffer_load_dword [[V:v[0-9]+]]

; GCN-FLUSH-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
; GCN-FLUSH-DAG: v_mac_f32_e32 [[MUL]], [[X]], [[Y]]
; GCN-FLUSH: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[U]]

; GCN-FASTFMA: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[MUL]]
; GCN-FASTFMA: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[FMA1]]

; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul_commute() #0 {
  %x = load volatile float, float addrspace(1)* undef
  %y = load volatile float, float addrspace(1)* undef
  %z = load volatile float, float addrspace(1)* undef
  %u = load volatile float, float addrspace(1)* undef
  %v = load volatile float, float addrspace(1)* undef
  %mul.u.v = fmul fast float %u, %v
  store volatile float %mul.u.v, float addrspace(1)* undef
  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
  %add = fadd fast float %z, %fma
  store volatile float %add, float addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}fast_add_fmuladd_fmul_multi_use_fmuladd:
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: buffer_load_dword [[Z:v[0-9]+]]
; GCN: buffer_load_dword [[U:v[0-9]+]]
; GCN: buffer_load_dword [[V:v[0-9]+]]

; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_fmuladd() #0 {
  %x = load volatile float, float addrspace(1)* undef
  %y = load volatile float, float addrspace(1)* undef
  %z = load volatile float, float addrspace(1)* undef
  %u = load volatile float, float addrspace(1)* undef
  %v = load volatile float, float addrspace(1)* undef
  %mul.u.v = fmul fast float %u, %v
  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
  store volatile float %fma, float addrspace(1)* undef
  %add = fadd fast float %fma, %z
  store volatile float %add, float addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}fast_add_fmuladd_fmul_multi_use_fmuladd_commute:
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: buffer_load_dword [[Z:v[0-9]+]]
; GCN: buffer_load_dword [[U:v[0-9]+]]
; GCN: buffer_load_dword [[V:v[0-9]+]]

; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_fmuladd_commute() #0 {
  %x = load volatile float, float addrspace(1)* undef
  %y = load volatile float, float addrspace(1)* undef
  %z = load volatile float, float addrspace(1)* undef
  %u = load volatile float, float addrspace(1)* undef
  %v = load volatile float, float addrspace(1)* undef
  %mul.u.v = fmul fast float %u, %v
  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
  store volatile float %fma, float addrspace(1)* undef
  %add = fadd fast float %z, %fma
  store volatile float %add, float addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}fast_sub_fmuladd_fmul_multi_use_mul:
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: buffer_load_dword [[Z:v[0-9]+]]
; GCN: buffer_load_dword [[U:v[0-9]+]]
; GCN: buffer_load_dword [[V:v[0-9]+]]

; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]

; GCN-FLUSH: v_mad_f32 [[MAD:v[0-9]+]], [[X]], [[Y]], [[MUL]]
; GCN-FLUSH: v_sub_f32_e32 [[SUB:v[0-9]+]], [[MAD]], [[Z]]

; GCN-FASTFMA: v_fma_f32 [[MAD:v[0-9]+]], [[X]], [[Y]], [[MUL]]
; GCN-FASTFMA: v_sub_f32_e32 [[SUB:v[0-9]+]], [[MAD]], [[Z]]

; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]]
; GCN-SLOWFMA: v_add_f32_e32
; GCN-SLOWFMA: v_sub_f32_e32 [[MAD:v[0-9]+]]

; GCN: buffer_store_dword [[MUL]]
; GCN: buffer_store_dword [[MAD]]
define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_mul() #0 {
  %x = load volatile float, float addrspace(1)* undef
  %y = load volatile float, float addrspace(1)* undef
  %z = load volatile float, float addrspace(1)* undef
  %u = load volatile float, float addrspace(1)* undef
  %v = load volatile float, float addrspace(1)* undef
  %mul.u.v = fmul fast float %u, %v
  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
  %add = fsub fast float %fma, %z
  store volatile float %mul.u.v, float addrspace(1)* undef
  store volatile float %add, float addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}fast_sub_fmuladd_fmul_multi_use_fmuladd:
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: buffer_load_dword [[Z:v[0-9]+]]
; GCN: buffer_load_dword [[U:v[0-9]+]]
; GCN: buffer_load_dword [[V:v[0-9]+]]

; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]

; GCN-FLUSH-NEXT: v_mac_f32_e32 [[MUL]], [[X]], [[Y]]
; GCN-FLUSH-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[MUL]], [[Z]]
; GCN-FLUSH-NEXT: buffer_store_dword [[MUL]]
; GCN-FLUSH-NEXT: buffer_store_dword [[SUB]]

; NOTE(review): the fma addend below is the U capture; the IR adds the
; u*v product (captured as MUL), so this only matches if llc reuses U's
; register for the multiply result -- verify against actual llc output.
; GCN-FASTFMA-NEXT: v_fma_f32 [[FMA:v[0-9]+]], [[X]], [[Y]], [[U]]
; GCN-FASTFMA-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[FMA]], [[Z]]
; GCN-FASTFMA-NEXT: buffer_store_dword [[FMA]]
; GCN-FASTFMA-NEXT: buffer_store_dword [[SUB]]

; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]]
; GCN-SLOWFMA: v_add_f32_e32
; GCN-SLOWFMA: v_sub_f32_e32
define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_fmuladd() #0 {
  %x = load volatile float, float addrspace(1)* undef
  %y = load volatile float, float addrspace(1)* undef
  %z = load volatile float, float addrspace(1)* undef
  %u = load volatile float, float addrspace(1)* undef
  %v = load volatile float, float addrspace(1)* undef
  %mul.u.v = fmul fast float %u, %v
  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
  %add = fsub fast float %fma, %z
  store volatile float %fma, float addrspace(1)* undef
  store volatile float %add, float addrspace(1)* undef
  ret void
}

declare float @llvm.fma.f32(float, float, float) #1
declare float @llvm.fmuladd.f32(float, float, float) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }