; Checks instruction selection for packed-half (<2 x half>) multiply-add
; patterns on gfx900 under four llc configurations, each listed exactly once:
;
;   -denormal-fp-math=preserve-sign, -fp-contract=on    (GFX9-FLUSH, strict)
;   -denormal-fp-math=preserve-sign, -fp-contract=fast  (GFX9-FLUSH, contract)
;   -denormal-fp-math=ieee,          -fp-contract=on    (GFX9-DENORM, strict)
;   -denormal-fp-math=ieee,          -fp-contract=fast  (GFX9-DENORM, contract)
;
; The checks distinguish a separate v_pk_mul_f16 + v_pk_add_f16 pair from a
; single fused v_pk_fma_f16.

; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s

; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s

declare i32 @llvm.amdgcn.workitem.id.x() #1
declare <2 x half> @llvm.fmuladd.v2f16(<2 x half>, <2 x half>, <2 x half>) #1
declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1

; llvm.fmuladd on three loaded vectors: expected as mul + add in the flush
; configurations and as a single fma in the ieee-denormal configurations.
; GCN-LABEL: {{^}}fmuladd_v2f16:
; GFX9-FLUSH: v_pk_mul_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
; GFX9-FLUSH: v_pk_add_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; GFX9-DENORM: v_pk_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @fmuladd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in1,
                                         <2 x half> addrspace(1)* %in2, <2 x half> addrspace(1)* %in3) #0 {
  %r0 = load <2 x half>, <2 x half> addrspace(1)* %in1
  %r1 = load <2 x half>, <2 x half> addrspace(1)* %in2
  %r2 = load <2 x half>, <2 x half> addrspace(1)* %in3
  %r3 = tail call <2 x half> @llvm.fmuladd.v2f16(<2 x half> %r0, <2 x half> %r1, <2 x half> %r2)
  store <2 x half> %r3, <2 x half> addrspace(1)* %out
  ret void
}

; Plain fmul followed by fadd with no fast-math flags. Only the ieee-denormal
; configurations are checked here: mul + add under -fp-contract=on, a single
; fma under -fp-contract=fast.
; GCN-LABEL: {{^}}fmul_fadd_v2f16:
; GFX9-DENORM-STRICT: v_pk_mul_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
; GFX9-DENORM-STRICT: v_pk_add_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; GFX9-DENORM-CONTRACT: v_pk_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @fmul_fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in1,
                                           <2 x half> addrspace(1)* %in2, <2 x half> addrspace(1)* %in3) #0 {
  %r0 = load <2 x half>, <2 x half> addrspace(1)* %in1
  %r1 = load <2 x half>, <2 x half> addrspace(1)* %in2
  %r2 = load <2 x half>, <2 x half> addrspace(1)* %in3
  %r3 = fmul <2 x half> %r0, %r1
  %r4 = fadd <2 x half> %r3, %r2
  store <2 x half> %r4, <2 x half> addrspace(1)* %out
  ret void
}

; Same as fmul_fadd_v2f16, except the fadd carries the 'contract' fast-math
; flag (the fmul does not). Expected as mul + add in the flush configurations
; and as a single fma in both ieee-denormal configurations.
; GCN-LABEL: {{^}}fmul_fadd_contract_v2f16:
; GFX9-FLUSH: v_pk_mul_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
; GFX9-FLUSH: v_pk_add_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; GFX9-DENORM: v_pk_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @fmul_fadd_contract_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in1,
                                                    <2 x half> addrspace(1)* %in2, <2 x half> addrspace(1)* %in3) #0 {
  %r0 = load <2 x half>, <2 x half> addrspace(1)* %in1
  %r1 = load <2 x half>, <2 x half> addrspace(1)* %in2
  %r2 = load <2 x half>, <2 x half> addrspace(1)* %in3
  %r3 = fmul <2 x half> %r0, %r1
  %r4 = fadd contract <2 x half> %r3, %r2
  store <2 x half> %r4, <2 x half> addrspace(1)* %out
  ret void
}


; llvm.fmuladd with the splat constant 2.0 as the FIRST multiplicand.
; Flush configurations expect (R1 + R1) + R2 as two packed adds; ieee-denormal
; configurations expect one fma with an inline 2.0 operand.
; Note: both inputs are read through %out (indexed by the workitem id); the
; %in parameter is unused. The loads are volatile, presumably so the two
; loads stay separate and in order for the R1/R2 captures.
; GCN-LABEL: {{^}}fmuladd_2.0_a_b_v2f16:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
; GFX9-FLUSH: v_pk_add_f16 [[ADD0:v[0-9]+]], [[R1]], [[R1]]
; GFX9-FLUSH: v_pk_add_f16 [[RESULT:v[0-9]+]], [[ADD0]], [[R2]]

; GFX9-FLUSH: global_store_dword v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}}

; GFX9-DENORM: v_pk_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; GFX9-DENORM: global_store_dword v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @fmuladd_2.0_a_b_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr <2 x half>, <2 x half> addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid

  %r1 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.0
  %r2 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.1

  %r3 = tail call <2 x half> @llvm.fmuladd.v2f16(<2 x half> <half 2.0, half 2.0>, <2 x half> %r1, <2 x half> %r2)
  store <2 x half> %r3, <2 x half> addrspace(1)* %gep.out
  ret void
}

; Same as fmuladd_2.0_a_b_v2f16, but the splat constant 2.0 is the SECOND
; multiplicand. The expected output is identical.
; Note: %in is unused here as well; inputs are read through %out.
; GCN-LABEL: {{^}}fmuladd_a_2.0_b_v2f16:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
; GFX9-FLUSH: v_pk_add_f16 [[ADD0:v[0-9]+]], [[R1]], [[R1]]
; GFX9-FLUSH: v_pk_add_f16 [[RESULT:v[0-9]+]], [[ADD0]], [[R2]]

; GFX9-FLUSH: global_store_dword v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}}

; GFX9-DENORM: v_pk_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; GFX9-DENORM: global_store_dword v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @fmuladd_a_2.0_b_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr <2 x half>, <2 x half> addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid

  %r1 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.0
  %r2 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.1

  %r3 = tail call <2 x half> @llvm.fmuladd.v2f16(<2 x half> %r1, <2 x half> <half 2.0, half 2.0>, <2 x half> %r2)
  store <2 x half> %r3, <2 x half> addrspace(1)* %gep.out
  ret void
}

; (a + a) + b written with plain fadds and no fast-math flags. Expected as two
; packed adds in the flush configurations and in ieee-denormal with
; -fp-contract=on; expected as fma(a, 2.0, b) in ieee-denormal with
; -fp-contract=fast.
; Note: %in1 and %in2 are unused; inputs are read through %out.
; GCN-LABEL: {{^}}fadd_a_a_b_v2f16:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
; GFX9-FLUSH: v_pk_add_f16 [[ADD0:v[0-9]+]], [[R1]], [[R1]]
; GFX9-FLUSH: v_pk_add_f16 [[RESULT:v[0-9]+]], [[ADD0]], [[R2]]

; GFX9-DENORM-STRICT: v_pk_add_f16 [[ADD0:v[0-9]+]], [[R1]], [[R1]]
; GFX9-DENORM-STRICT: v_pk_add_f16 [[RESULT:v[0-9]+]], [[ADD0]], [[R2]]

; GFX9-DENORM-CONTRACT: v_pk_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; GCN: {{flat|global}}_store_dword v{{.+}}, [[RESULT]]
define amdgpu_kernel void @fadd_a_a_b_v2f16(<2 x half> addrspace(1)* %out,
                                            <2 x half> addrspace(1)* %in1,
                                            <2 x half> addrspace(1)* %in2) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr <2 x half>, <2 x half> addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid

  %r0 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.0
  %r1 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.1

  %add.0 = fadd <2 x half> %r0, %r0
  %add.1 = fadd <2 x half> %add.0, %r1
  store <2 x half> %add.1, <2 x half> addrspace(1)* %gep.out
  ret void
}

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }