; Tests instruction selection for llvm.fmuladd.f16 and separate fmul/fadd
; pairs on fiji (VI) and gfx1010, covering flush vs. denormal f16 modes and
; strict (-fp-contract=on) vs. fast (-fp-contract=fast) contraction.

; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,VI-FLUSH,VI %s
; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,VI-FLUSH,VI %s

; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,GCN-DENORM,GCN-DENORM-STRICT,VI-DENORM-STRICT,VI-DENORM,VI %s
; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,GCN-DENORM,GCN-DENORM-CONTRACT,VI-DENORM-CONTRACT,VI-DENORM,VI %s

; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,GFX10-FLUSH,GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,GFX10-FLUSH,GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,GCN-DENORM,GCN-DENORM-STRICT,GFX10-DENORM-STRICT,GFX10-DENORM,GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,GCN-DENORM,GCN-DENORM-CONTRACT,GFX10-DENORM-CONTRACT,GFX10-DENORM,GFX10 %s

declare i32 @llvm.amdgcn.workitem.id.x() #1
declare half @llvm.fmuladd.f16(half, half, half) #1
declare half @llvm.fabs.f16(half) #1

; GCN-LABEL: {{^}}fmuladd_f16:
; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; VI-DENORM: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; GFX10-FLUSH: v_mul_f16_e32
; GFX10-FLUSH: v_add_f16_e32
; GFX10-DENORM: v_fmac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}

define amdgpu_kernel void @fmuladd_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
                                       half addrspace(1)* %in2, half addrspace(1)* %in3) #0 {
  %r0 = load half, half addrspace(1)* %in1
  %r1 = load half, half addrspace(1)* %in2
  %r2 = load half, half addrspace(1)* %in3
  %r3 = tail call half @llvm.fmuladd.f16(half %r0, half %r1, half %r2)
  store half %r3, half addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}fmul_fadd_f16:
; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; VI-DENORM-CONTRACT: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; GFX10-FLUSH: v_mul_f16_e32
; GFX10-FLUSH: v_add_f16_e32
; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}

define amdgpu_kernel void @fmul_fadd_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
                                         half addrspace(1)* %in2, half addrspace(1)* %in3) #0 {
  %r0 = load half, half addrspace(1)* %in1
  %r1 = load half, half addrspace(1)* %in2
  %r2 = load half, half addrspace(1)* %in3
  %mul = fmul half %r0, %r1
  %add = fadd half %mul, %r2
  store half %add, half addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}fmul_fadd_contract_f16:
; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; VI-DENORM: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; GFX10-FLUSH: v_mul_f16_e32
; GFX10-FLUSH: v_add_f16_e32
; GFX10-DENORM: v_fmac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}

define amdgpu_kernel void @fmul_fadd_contract_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
                                                  half addrspace(1)* %in2, half addrspace(1)* %in3) #0 {
  %r0 = load half, half addrspace(1)* %in1
  %r1 = load half, half addrspace(1)* %in2
  %r2 = load half, half addrspace(1)* %in3
  %mul = fmul half %r0, %r1
  %add = fadd contract half %mul, %r2
  store half %add, half addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f16
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; GFX10-DENORM: v_fmac_f16_e32 [[R2:v[0-9]+]], 2.0, [[R1]]

; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]]

; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-DENORM: global_store_short v{{[0-9]+}}, [[R2]]
; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}}

define amdgpu_kernel void @fmuladd_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}

; GCN-LABEL: {{^}}fmuladd_a_2.0_b_f16
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; GFX10-DENORM: v_fmac_f16_e32 [[R2]], 2.0, [[R1]]

; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]]

; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-DENORM: global_store_short v{{[0-9]+}}, [[R2]]
; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]]

define amdgpu_kernel void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r3 = tail call half @llvm.fmuladd.f16(half %r1, half 2.0, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}

; GCN-LABEL: {{^}}fadd_a_a_b_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[R2]], 2.0, [[R1]]

; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]]
; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]]
; GFX10-DENORM-STRICT: global_store_short v{{[0-9]+}}, [[RESULT]]
; GFX10-DENORM-CONTRACT: global_store_short v{{[0-9]+}}, [[R2]]

define amdgpu_kernel void @fadd_a_a_b_f16(half addrspace(1)* %out,
                                          half addrspace(1)* %in1,
                                          half addrspace(1)* %in2) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r0 = load volatile half, half addrspace(1)* %gep.0
  %r1 = load volatile half, half addrspace(1)* %gep.1

  %add.0 = fadd half %r0, %r0
  %add.1 = fadd half %add.0, %r1
  store half %add.1, half addrspace(1)* %gep.out
  ret void
}

; GCN-LABEL: {{^}}fadd_b_a_a_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[R2]], 2.0, [[R1]]

; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]]
; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]]
; GFX10-DENORM-STRICT: global_store_short v{{[0-9]+}}, [[RESULT]]
; GFX10-DENORM-CONTRACT: global_store_short v{{[0-9]+}}, [[R2]]

define amdgpu_kernel void @fadd_b_a_a_f16(half addrspace(1)* %out,
                                          half addrspace(1)* %in1,
                                          half addrspace(1)* %in2) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r0 = load volatile half, half addrspace(1)* %gep.0
  %r1 = load volatile half, half addrspace(1)* %gep.1

  %add.0 = fadd half %r0, %r0
  %add.1 = fadd half %r1, %add.0
  store half %add.1, half addrspace(1)* %gep.out
  ret void
}

; GCN-LABEL: {{^}}fmuladd_neg_2.0_a_b_f16
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
; GFX10-DENORM: v_fmac_f16_e32 [[R2]], -2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]]
; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]]
; GFX10-DENORM: global_store_short v{{[0-9]+}}, [[R2]]
define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r3 = tail call half @llvm.fmuladd.f16(half -2.0, half %r1, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}

; GCN-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f16
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]]
; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]]

; GFX10-DENORM: v_fmac_f16_e32 [[R2]], 2.0, [[R1]]
; GFX10-DENORM: global_store_short v{{[0-9]+}}, [[R2]]
define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r1.fneg = fneg half %r1

  %r3 = tail call half @llvm.fmuladd.f16(half -2.0, half %r1.fneg, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}

; GCN-LABEL: {{^}}fmuladd_2.0_neg_a_b_f16
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]]
; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]]

; GFX10-DENORM: v_fmac_f16_e32 [[R2]], -2.0, [[R1]]
; GFX10-DENORM: global_store_short v{{[0-9]+}}, [[R2]]
define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r1.fneg = fneg half %r1

  %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1.fneg, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}

; GCN-LABEL: {{^}}fmuladd_2.0_a_neg_b_f16
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
; GCN-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]]
; GFX10: global_store_short v{{[0-9]+}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r2.fneg = fneg half %r2

  %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1, half %r2.fneg)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}

; GCN-LABEL: {{^}}mad_sub_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]

; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]

; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]

; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
; GFX10: global_store_short v{{[0-9]+}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %mul = fmul half %a, %b
  %sub = fsub half %mul, %c
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}

; GCN-LABEL: {{^}}mad_sub_inv_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
; GFX10-DENORM-CONTRACT: v_fmac_f16_e64 [[REGC]], -[[REGA]], [[REGB]]

; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]]
; GFX10-DENORM-STRICT: global_store_short v{{[0-9]+}}, [[RESULT]]
; GFX10-DENORM-CONTRACT: global_store_short v{{[0-9]+}}, [[REGC]]
define amdgpu_kernel void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %mul = fmul half %a, %b
  %sub = fsub half %c, %mul
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}

; GCN-LABEL: {{^}}mad_sub_fabs_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|

; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|

; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GFX10-FLUSH: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|
; GFX10: global_store_short v{{[0-9]+}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %c.abs = call half @llvm.fabs.f16(half %c) #0
  %mul = fmul half %a, %b
  %sub = fsub half %mul, %c.abs
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}

; GCN-LABEL: {{^}}mad_sub_fabs_inv_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]

; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|

; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|

; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GFX10-FLUSH: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
; GFX10: global_store_short v{{[0-9]+}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %c.abs = call half @llvm.fabs.f16(half %c) #0
  %mul = fmul half %a, %b
  %sub = fsub half %c.abs, %mul
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}

; GCN-LABEL: {{^}}neg_neg_mad_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]

; VI-FLUSH: v_mac_f16_e32 [[REGC]], [[REGA]], [[REGB]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]]
; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[REGC]], [[REGA]], [[REGB]]

; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]]
; GFX10-DENORM-STRICT: global_store_short v{{[0-9]+}}, [[RESULT]]
; GFX10-DENORM-CONTRACT: global_store_short v{{[0-9]+}}, [[REGC]]
define amdgpu_kernel void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %nega = fneg half %a
  %negb = fneg half %b
  %mul = fmul half %nega, %negb
  %sub = fadd half %mul, %c
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}

; GCN-LABEL: {{^}}mad_fabs_sub_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]

; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]

; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]

; GCN-DENORM-STRICT: v_mul_f16_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_mul_f16_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
; GFX10: global_store_short v{{[0-9]+}}, [[RESULT]]
define amdgpu_kernel void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %b.abs = call half @llvm.fabs.f16(half %b) #0
  %mul = fmul half %a, %b.abs
  %sub = fsub half %mul, %c
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}

; GCN-LABEL: {{^}}fsub_c_fadd_a_a_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[R2]], -2.0, [[R1]]

; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]]
; GFX10-DENORM-STRICT: global_store_short v{{[0-9]+}}, [[RESULT]]
; GFX10-DENORM-CONTRACT: global_store_short v{{[0-9]+}}, [[R2]]
define amdgpu_kernel void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %add = fadd half %r1, %r1
  %r3 = fsub half %r2, %add

  store half %r3, half addrspace(1)* %gep.out
  ret void
}

; GCN-LABEL: {{^}}fsub_fadd_a_a_c_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],

; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]

; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]

; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
; GFX10: global_store_short v{{[0-9]+}}, [[RESULT]]
define amdgpu_kernel void @fsub_fadd_a_a_c_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %add = fadd half %r1, %r1
  %r3 = fsub half %add, %r2

  store half %r3, half addrspace(1)* %gep.out
  ret void
}

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }