; Codegen test for f16 fmuladd/fadd contraction on VI (gfx8 / fiji).
; Four RUN configurations cover the cross product of
;   denormal mode  - f16 denormals flushed (VI-FLUSH prefixes) vs. preserved (VI-DENORM prefixes)
;   fp contraction - -fp-contract=on (GCN-STRICT) vs. -fp-contract=fast (GCN-CONTRACT)
; When denormals are flushed, fmuladd may lower to v_mac_f16/v_mad_f16; when
; denormals are preserved it must use v_fma_f16 (or stay as separate mul/add
; in strict mode, since mad/mac would flush).
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,VI-FLUSH,VI %s
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,VI-FLUSH,VI %s

; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,VI-DENORM-STRICT,VI-DENORM,VI %s
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,VI-DENORM-CONTRACT,VI-DENORM,VI %s

declare i32 @llvm.amdgcn.workitem.id.x() #1
declare half @llvm.fmuladd.f16(half, half, half) #1
declare half @llvm.fabs.f16(half) #1

; Plain fmuladd with three loaded operands; mac when flushing, fma when not.
; GCN-LABEL: {{^}}fmuladd_f16:
; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; VI-DENORM: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @fmuladd_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
                                       half addrspace(1)* %in2, half addrspace(1)* %in3) #0 {
  %r0 = load half, half addrspace(1)* %in1
  %r1 = load half, half addrspace(1)* %in2
  %r2 = load half, half addrspace(1)* %in3
  %r3 = tail call half @llvm.fmuladd.f16(half %r0, half %r1, half %r2)
  store half %r3, half addrspace(1)* %out
  ret void
}

; fmuladd(2.0, a, b) - the inline-constant 2.0 folds into the mac/fma operand.
; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f16
; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}

; Same as above with the constant as the second multiplicand; commuted form
; must produce identical code.
; GCN-LABEL: {{^}}fmuladd_a_2.0_b_f16
; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r3 = tail call half @llvm.fmuladd.f16(half %r1, half 2.0, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}

; (a + a) + b: contractible to a*2.0 + b only when contraction is allowed or
; denormals are flushed; strict+denorm must keep the two adds.
; GCN-LABEL: {{^}}fadd_a_a_b_f16:
; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fadd_a_a_b_f16(half addrspace(1)* %out,
                                          half addrspace(1)* %in1,
                                          half addrspace(1)* %in2) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r0 = load volatile half, half addrspace(1)* %gep.0
  %r1 = load volatile half, half addrspace(1)* %gep.1

  %add.0 = fadd half %r0, %r0
  %add.1 = fadd half %add.0, %r1
  store half %add.1, half addrspace(1)* %gep.out
  ret void
}

; b + (a + a): commuted variant of the previous test.
; GCN-LABEL: {{^}}fadd_b_a_a_f16:
; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fadd_b_a_a_f16(half addrspace(1)* %out,
                                          half addrspace(1)* %in1,
                                          half addrspace(1)* %in2) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r0 = load volatile half, half addrspace(1)* %gep.0
  %r1 = load volatile half, half addrspace(1)* %gep.1

  %add.0 = fadd half %r0, %r0
  %add.1 = fadd half %r1, %add.0
  store half %add.1, half addrspace(1)* %gep.out
  ret void
}

; fmuladd(-2.0, a, b): negative inline constant folds directly.
; GCN-LABEL: {{^}}fmuladd_neg_2.0_a_b_f16
; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
; VI-DENORM: v_fma_f16 [[R2:v[0-9]+]], [[R1]], -2.0, [[R2]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r3 = tail call half @llvm.fmuladd.f16(half -2.0, half %r1, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}

; fmuladd(-2.0, -a, b): the two negations cancel, so codegen should use +2.0
; with the unnegated operand.
; GCN-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f16
; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  ; fneg expressed as subtraction from -0.0
  %r1.fneg = fsub half -0.000000e+00, %r1

  %r3 = tail call half @llvm.fmuladd.f16(half -2.0, half %r1.fneg, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}

; fmuladd(2.0, -a, b): the fneg folds into the constant, giving -2.0 * a + b.
; GCN-LABEL: {{^}}fmuladd_2.0_neg_a_b_f16
; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  ; fneg expressed as subtraction from -0.0
  %r1.fneg = fsub half -0.000000e+00, %r1

  %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1.fneg, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}

; fmuladd(2.0, a, -b): the addend's fneg becomes a source-modifier on the
; third operand, so mac (which cannot negate its accumulator) is not usable
; and mad/fma is required in both denormal modes.
; GCN-LABEL: {{^}}fmuladd_2.0_a_neg_b_f16
; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  ; fneg expressed as subtraction from -0.0
  %r2.fneg = fsub half -0.000000e+00, %r2

  %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1, half %r2.fneg)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}
; a*b - c: separate mul/sub contract into mad (flush) or fma (denorm+fast);
; strict+denorm keeps the explicit mul and sub.
; GCN-LABEL: {{^}}mad_sub_f16:
; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]

; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]

; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %mul = fmul half %a, %b
  %sub = fsub half %mul, %c
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}

; c - a*b: contracted form negates the product operand instead of the addend.
; GCN-LABEL: {{^}}mad_sub_inv_f16:
; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]
; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]

; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %mul = fmul half %a, %b
  %sub = fsub half %c, %mul
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}

; a*b - |c|: fabs on the subtrahend becomes a -| | source-modifier.
; GCN-LABEL: {{^}}mad_sub_fabs_f16:
; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]
; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|

; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; VI-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %c.abs = call half @llvm.fabs.f16(half %c) #0
  %mul = fmul half %a, %b
  %sub = fsub half %mul, %c.abs
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}

; |c| - a*b: combined fabs modifier on the addend and fneg on the product.
; GCN-LABEL: {{^}}mad_sub_fabs_inv_f16:
; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]

; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|

; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; VI-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %c.abs = call half @llvm.fabs.f16(half %c) #0
  %mul = fmul half %a, %b
  %sub = fsub half %c.abs, %mul
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}

; (-a)*(-b) + c: the double negation cancels; plain mac/fma with unmodified
; sources is expected.
; GCN-LABEL: {{^}}neg_neg_mad_f16:
; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]

; VI-FLUSH: v_mac_f16_e32 [[REGC]], [[REGA]], [[REGB]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]]

; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  ; fnegs expressed as subtraction from -0.0
  %nega = fsub half -0.000000e+00, %a
  %negb = fsub half -0.000000e+00, %b
  %mul = fmul half %nega, %negb
  %sub = fadd half %mul, %c
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}

; a*|b| - c: fabs modifier on a multiplicand combined with a negated addend.
; GCN-LABEL: {{^}}mad_fabs_sub_f16:
; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]

; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]

; VI-DENORM-STRICT: v_mul_f16_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %b.abs = call half @llvm.fabs.f16(half %b) #0
  %mul = fmul half %a, %b.abs
  %sub = fsub half %mul, %c
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}

; c - (a + a): folds to a*(-2.0) + c when contraction is permitted.
; GCN-LABEL: {{^}}fsub_c_fadd_a_a_f16:
; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]

; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %add = fadd half %r1, %r1
  %r3 = fsub half %r2, %add

  store half %r3, half addrspace(1)* %gep.out
  ret void
}

; (a + a) - c: folds to a*2.0 - c when contraction is permitted.
; GCN-LABEL: {{^}}fsub_fadd_a_a_c_f16:
; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],

; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]

; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fsub_fadd_a_a_c_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %add = fadd half %r1, %r1
  %r3 = fsub half %add, %r2

  store half %r3, half addrspace(1)* %gep.out
  ret void
}

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }