; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-FLUSH -check-prefix=GCN %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-DENORM -check-prefix=GCN %s

; GCN-LABEL: {{^}}mac_vvv:
; GCN: buffer_load_dword [[A:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0{{$}}
; GCN: buffer_load_dword [[B:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:4
; GCN: buffer_load_dword [[C:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:8
; GCN: v_mac_f32_e32 [[C]], [[A]], [[B]]
; GCN: buffer_store_dword [[C]]
define amdgpu_kernel void @mac_vvv(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
  %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
  %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2

  %a = load volatile float, float addrspace(1)* %in
  %b = load volatile float, float addrspace(1)* %b_ptr
  %c = load volatile float, float addrspace(1)* %c_ptr

  %tmp0 = fmul float %a, %b
  %tmp1 = fadd float %tmp0, %c
  store float %tmp1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}mad_inline_sgpr_inline:
; GCN-NOT: v_mac_f32
; GCN: v_mad_f32 v{{[0-9]}}, s{{[0-9]+}}, 0.5, 0.5
define amdgpu_kernel void @mad_inline_sgpr_inline(float addrspace(1)* %out, float %in) #0 {
entry:
  %tmp0 = fmul float 0.5, %in
  %tmp1 = fadd float %tmp0, 0.5
  store float %tmp1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}mad_vvs:
; GCN-NOT: v_mac_f32
; GCN: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
define amdgpu_kernel void @mad_vvs(float addrspace(1)* %out, float addrspace(1)* %in, float %c) #0 {
entry:
  %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1

  %a = load float, float addrspace(1)* %in
  %b = load float, float addrspace(1)* %b_ptr

  %tmp0 = fmul float %a, %b
  %tmp1 = fadd float %tmp0, %c
  store float %tmp1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}mac_ssv:
; GCN: v_mac_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
define amdgpu_kernel void @mac_ssv(float addrspace(1)* %out, float addrspace(1)* %in, float %a) #0 {
entry:
  %c = load float, float addrspace(1)* %in

  %tmp0 = fmul float %a, %a
  %tmp1 = fadd float %tmp0, %c
  store float %tmp1, float addrspace(1)* %out
  ret void
}

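; %c is the addend of both multiply-adds below, and v_mac clobbers its addend
; (src2 is tied to the destination). The first FMA is therefore selected as a
; v_mad writing a fresh register, which leaves %c's register free to serve as
; the v_mac accumulator for the second FMA.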
; GCN-LABEL: {{^}}mac_mad_same_add:
; GCN: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]]
; GCN: v_mac_f32_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @mac_mad_same_add(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
  %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
  %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
  %d_ptr = getelementptr float, float addrspace(1)* %in, i32 3
  %e_ptr = getelementptr float, float addrspace(1)* %in, i32 4

  %a = load volatile float, float addrspace(1)* %in
  %b = load volatile float, float addrspace(1)* %b_ptr
  %c = load volatile float, float addrspace(1)* %c_ptr
  %d = load volatile float, float addrspace(1)* %d_ptr
  %e = load volatile float, float addrspace(1)* %e_ptr

  %tmp0 = fmul float %a, %b
  %tmp1 = fadd float %tmp0, %c

  %tmp2 = fmul float %d, %e
  %tmp3 = fadd float %tmp2, %c

  %out1 = getelementptr float, float addrspace(1)* %out, i32 1
  store float %tmp1, float addrspace(1)* %out
  store float %tmp3, float addrspace(1)* %out1
  ret void
}

; There is no advantage to using v_mac when one of the operands is negated
; and v_mad accepts more operand types.
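;
; Illustrative encodings (a sketch, not CHECK lines; register numbers are
; arbitrary): the VOP2 mac has no source modifiers, while the VOP3 mad does.
;   v_mac_f32_e32 v0, v1, v2      ; v0 += v1 * v2 (no way to negate a source)
;   v_mad_f32 v0, -v1, v2, v3     ; v0 = -v1 * v2 + v3 (negation is free)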

; GCN-LABEL: {{^}}mad_neg_src0:
; GCN-NOT: v_mac_f32
; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
define amdgpu_kernel void @mad_neg_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
  %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
  %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2

  %a = load float, float addrspace(1)* %in
  %b = load float, float addrspace(1)* %b_ptr
  %c = load float, float addrspace(1)* %c_ptr

  %neg_a = fsub float -0.0, %a
  %tmp0 = fmul float %neg_a, %b
  %tmp1 = fadd float %tmp0, %c

  store float %tmp1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}nsz_mad_sub0_src0:
; GCN-NOT: v_mac_f32
; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
define amdgpu_kernel void @nsz_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
entry:
  %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
  %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2

  %a = load float, float addrspace(1)* %in
  %b = load float, float addrspace(1)* %b_ptr
  %c = load float, float addrspace(1)* %c_ptr

  %neg_a = fsub float 0.0, %a
  %tmp0 = fmul float %neg_a, %b
  %tmp1 = fadd float %tmp0, %c

  store float %tmp1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}safe_mad_sub0_src0:
; GCN: v_sub_f32_e32 [[SUB0:v[0-9]+]], 0,
; GCN: v_mac_f32_e32 v{{[0-9]+}}, [[SUB0]], v{{[0-9]+}}
define amdgpu_kernel void @safe_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
  %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
  %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2

  %a = load float, float addrspace(1)* %in
  %b = load float, float addrspace(1)* %b_ptr
  %c = load float, float addrspace(1)* %c_ptr

  %neg_a = fsub float 0.0, %a
  %tmp0 = fmul float %neg_a, %b
  %tmp1 = fadd float %tmp0, %c

  store float %tmp1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}mad_neg_src1:
; GCN-NOT: v_mac_f32
; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
define amdgpu_kernel void @mad_neg_src1(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
  %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
  %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2

  %a = load float, float addrspace(1)* %in
  %b = load float, float addrspace(1)* %b_ptr
  %c = load float, float addrspace(1)* %c_ptr

  %neg_b = fsub float -0.0, %b
  %tmp0 = fmul float %a, %neg_b
  %tmp1 = fadd float %tmp0, %c

  store float %tmp1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}nsz_mad_sub0_src1:
; GCN-NOT: v_mac_f32
; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
define amdgpu_kernel void @nsz_mad_sub0_src1(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
entry:
  %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
  %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2

  %a = load float, float addrspace(1)* %in
  %b = load float, float addrspace(1)* %b_ptr
  %c = load float, float addrspace(1)* %c_ptr

  %neg_b = fsub float 0.0, %b
  %tmp0 = fmul float %a, %neg_b
  %tmp1 = fadd float %tmp0, %c

  store float %tmp1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}mad_neg_src2:
; GCN-NOT: v_mac
; GCN: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}}
define amdgpu_kernel void @mad_neg_src2(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
  %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
  %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2

  %a = load float, float addrspace(1)* %in
  %b = load float, float addrspace(1)* %b_ptr
  %c = load float, float addrspace(1)* %c_ptr

  %neg_c = fsub float -0.0, %c
  %tmp0 = fmul float %a, %b
  %tmp1 = fadd float %tmp0, %neg_c

  store float %tmp1, float addrspace(1)* %out
  ret void
}

; Without special casing the inline constant check for v_mac_f32's
; src2, this fails to fold the 1.0 into a mad.
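; (v_mac_f32's src2 is tied to its destination, so src2 must be a VGPR and can
; never be an immediate; using the inline constant 1.0 means selecting
; v_mad_f32 instead, which accepts an inline constant as src2.)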

; GCN-LABEL: {{^}}fold_inline_imm_into_mac_src2_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN: v_add_f32_e32 [[TMP2:v[0-9]+]], [[A]], [[A]]
; GCN: v_mad_f32 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0
define amdgpu_kernel void @fold_inline_imm_into_mac_src2_f32(float addrspace(1)* %out, float addrspace(1)* %a, float addrspace(1)* %b) #3 {
bb:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep.a = getelementptr inbounds float, float addrspace(1)* %a, i64 %tid.ext
  %gep.b = getelementptr inbounds float, float addrspace(1)* %b, i64 %tid.ext
  %gep.out = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %tmp = load volatile float, float addrspace(1)* %gep.a
  %tmp1 = load volatile float, float addrspace(1)* %gep.b
  %tmp2 = fadd float %tmp, %tmp
  %tmp3 = fmul float %tmp2, 4.0
  %tmp4 = fsub float 1.0, %tmp3
  %tmp5 = fadd float %tmp4, %tmp1
  %tmp6 = fadd float %tmp1, %tmp1
  %tmp7 = fmul float %tmp6, %tmp
  %tmp8 = fsub float 1.0, %tmp7
  %tmp9 = fmul float %tmp8, 8.0
  %tmp10 = fadd float %tmp5, %tmp9
  store float %tmp10, float addrspace(1)* %gep.out
  ret void
}

; GCN-LABEL: {{^}}fold_inline_imm_into_mac_src2_f16:
; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[B:v[0-9]+]]

; SI-DAG: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], [[A]]
; SI-DAG: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], [[B]]

; SI: v_add_f32_e32 [[TMP2:v[0-9]+]], [[CVT_A]], [[CVT_A]]
; SI: v_mad_f32 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0
; SI: v_mac_f32_e32 v{{[0-9]+}}, 0x41000000, v{{[0-9]+}}

; VI-FLUSH: v_add_f16_e32 [[TMP2:v[0-9]+]], [[A]], [[A]]
; VI-FLUSH: v_mad_f16 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0
define amdgpu_kernel void @fold_inline_imm_into_mac_src2_f16(half addrspace(1)* %out, half addrspace(1)* %a, half addrspace(1)* %b) #3 {
bb:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep.a = getelementptr inbounds half, half addrspace(1)* %a, i64 %tid.ext
  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
  %gep.out = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
  %tmp = load volatile half, half addrspace(1)* %gep.a
  %tmp1 = load volatile half, half addrspace(1)* %gep.b
  %tmp2 = fadd half %tmp, %tmp
  %tmp3 = fmul half %tmp2, 4.0
  %tmp4 = fsub half 1.0, %tmp3
  %tmp5 = fadd half %tmp4, %tmp1
  %tmp6 = fadd half %tmp1, %tmp1
  %tmp7 = fmul half %tmp6, %tmp
  %tmp8 = fsub half 1.0, %tmp7
  %tmp9 = fmul half %tmp8, 8.0
  %tmp10 = fadd half %tmp5, %tmp9
  store half %tmp10, half addrspace(1)* %gep.out
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #2

attributes #0 = { nounwind "no-signed-zeros-fp-math"="false" }
attributes #1 = { nounwind "no-signed-zeros-fp-math"="true" }
attributes #2 = { nounwind readnone }
attributes #3 = { nounwind }