; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -mattr=+fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9 %s
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -mattr=-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9 %s

; On targets without native f16 support (SI), make sure fdiv is promoted
; to f32 and uses the full-precision f32 division expansion. On GFX8/9
; the f16 rcp/div_fixup path is used instead.

; GCN-LABEL: {{^}}v_fdiv_f16:
; SI: v_cvt_f32_f16
; SI: v_cvt_f32_f16
; SI: v_div_scale_f32
; SI-DAG: v_div_scale_f32
; SI-DAG: v_rcp_f32
; SI: v_fma_f32
; SI: v_fma_f32
; SI: v_mul_f32
; SI: v_fma_f32
; SI: v_fma_f32
; SI: v_fma_f32
; SI: v_div_fmas_f32
; SI: v_div_fixup_f32
; SI: v_cvt_f16_f32

; GFX8_9: {{flat|global}}_load_ushort [[LHS:v[0-9]+]]
; GFX8_9: {{flat|global}}_load_ushort [[RHS:v[0-9]+]]

; GFX8_9-DAG: v_cvt_f32_f16_e32 [[CVT_LHS:v[0-9]+]], [[LHS]]
; GFX8_9-DAG: v_cvt_f32_f16_e32 [[CVT_RHS:v[0-9]+]], [[RHS]]

; GFX8_9-DAG: v_rcp_f32_e32 [[RCP_RHS:v[0-9]+]], [[CVT_RHS]]
; GFX8_9: v_mul_f32_e32 [[MUL:v[0-9]+]], [[CVT_LHS]], [[RCP_RHS]]
; GFX8_9: v_cvt_f16_f32_e32 [[CVT_BACK:v[0-9]+]], [[MUL]]
; GFX8_9: v_div_fixup_f16 [[RESULT:v[0-9]+]], [[CVT_BACK]], [[RHS]], [[LHS]]
; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fdiv_f16(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) #0 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep.a = getelementptr inbounds half, half addrspace(1)* %a, i64 %tid.ext
  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
  %a.val = load volatile half, half addrspace(1)* %gep.a
  %b.val = load volatile half, half addrspace(1)* %gep.b
  %r.val = fdiv half %a.val, %b.val
  store half %r.val, half addrspace(1)* %gep.r
  ret void
}

; GCN-LABEL: {{^}}v_rcp_f16:
; GFX8_9: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
; GFX8_9-NOT: [[VAL]]
; GFX8_9: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
; GFX8_9-NOT: [[RESULT]]
; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_rcp_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
  %b.val = load volatile half, half addrspace(1)* %gep.b
  %r.val = fdiv half 1.0, %b.val
  store half %r.val, half addrspace(1)* %gep.r
  ret void
}

; GCN-LABEL: {{^}}v_rcp_f16_abs:
; GFX8_9: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
; GFX8_9-NOT: [[VAL]]
; GFX8_9: v_rcp_f16_e64 [[RESULT:v[0-9]+]], |[[VAL]]|
; GFX8_9-NOT: [[RESULT]]
; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_rcp_f16_abs(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
  %b.val = load volatile half, half addrspace(1)* %gep.b
  %b.abs = call half @llvm.fabs.f16(half %b.val)
  %r.val = fdiv half 1.0, %b.abs
  store half %r.val, half addrspace(1)* %gep.r
  ret void
}

; GCN-LABEL: {{^}}v_rcp_f16_arcp:
; GFX8_9: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
; GFX8_9-NOT: [[VAL]]
; GFX8_9: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
; GFX8_9-NOT: [[RESULT]]
; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_rcp_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
  %b.val = load volatile half, half addrspace(1)* %gep.b
  %r.val = fdiv arcp half 1.0, %b.val
  store half %r.val, half addrspace(1)* %gep.r
  ret void
}

; GCN-LABEL: {{^}}v_rcp_f16_neg:
; GFX8_9: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
; GFX8_9-NOT: [[VAL]]
; GFX8_9: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[VAL]]
; GFX8_9-NOT: [[RESULT]]
; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_rcp_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
  %b.val = load volatile half, half addrspace(1)* %gep.b
  %r.val = fdiv half -1.0, %b.val
  store half %r.val, half addrspace(1)* %gep.r
  ret void
}

; GCN-LABEL: {{^}}v_rsq_f16:
; GFX8_9: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
; GFX8_9-NOT: [[VAL]]
; GFX8_9: v_rsq_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
; GFX8_9-NOT: [[RESULT]]
; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_rsq_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
  %b.val = load volatile half, half addrspace(1)* %gep.b
  %b.sqrt = call half @llvm.sqrt.f16(half %b.val)
  %r.val = fdiv half 1.0, %b.sqrt
  store half %r.val, half addrspace(1)* %gep.r
  ret void
}

; The negation cannot be folded into rsq, so expect sqrt followed by a
; negated rcp.
; GCN-LABEL: {{^}}v_rsq_f16_neg:
; GFX8_9: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
; GFX8_9-NOT: [[VAL]]
; GFX8_9: v_sqrt_f16_e32 [[SQRT:v[0-9]+]], [[VAL]]
; GFX8_9-NEXT: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[SQRT]]
; GFX8_9-NOT: [[RESULT]]
; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_rsq_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
  %b.val = load volatile half, half addrspace(1)* %gep.b
  %b.sqrt = call half @llvm.sqrt.f16(half %b.val)
  %r.val = fdiv half -1.0, %b.sqrt
  store half %r.val, half addrspace(1)* %gep.r
  ret void
}

; GCN-LABEL: {{^}}v_fdiv_f16_arcp:
; GFX8_9: {{flat|global}}_load_ushort [[LHS:v[0-9]+]]
; GFX8_9: {{flat|global}}_load_ushort [[RHS:v[0-9]+]]

; GFX8_9: v_rcp_f16_e32 [[RCP:v[0-9]+]], [[RHS]]
; GFX8_9: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[LHS]], [[RCP]]

; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fdiv_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep.a = getelementptr inbounds half, half addrspace(1)* %a, i64 %tid.ext
  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
  %a.val = load volatile half, half addrspace(1)* %gep.a
  %b.val = load volatile half, half addrspace(1)* %gep.b
  %r.val = fdiv arcp half %a.val, %b.val
  store half %r.val, half addrspace(1)* %gep.r
  ret void
}

; Same expansion as arcp, but triggered by the function-level
; "unsafe-fp-math" attribute (#2) instead of an instruction flag.
; GCN-LABEL: {{^}}v_fdiv_f16_unsafe:
; GFX8_9: {{flat|global}}_load_ushort [[LHS:v[0-9]+]]
; GFX8_9: {{flat|global}}_load_ushort [[RHS:v[0-9]+]]

; GFX8_9: v_rcp_f16_e32 [[RCP:v[0-9]+]], [[RHS]]
; GFX8_9: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[LHS]], [[RCP]]

; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fdiv_f16_unsafe(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #2 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep.a = getelementptr inbounds half, half addrspace(1)* %a, i64 %tid.ext
  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
  %a.val = load volatile half, half addrspace(1)* %gep.a
  %b.val = load volatile half, half addrspace(1)* %gep.b
  %r.val = fdiv half %a.val, %b.val
  store half %r.val, half addrspace(1)* %gep.r
  ret void
}

; arcp division by a power-of-two constant folds to a multiply by the
; reciprocal (0.5).
; GCN-LABEL: {{^}}div_arcp_2_x_pat_f16:
; SI: v_mul_f32_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}}

; GFX8_9: v_mul_f16_e32 [[MUL:v[0-9]+]], 0.5, v{{[0-9]+}}
; GFX8_9: buffer_store_short [[MUL]]
define amdgpu_kernel void @div_arcp_2_x_pat_f16(half addrspace(1)* %out) #0 {
  %x = load half, half addrspace(1)* undef
  %rcp = fdiv arcp half %x, 2.0
  store half %rcp, half addrspace(1)* %out, align 4
  ret void
}

; arcp division by 10.0 folds to a multiply by 1/10 (0x2e66 = half 0.1).
; GCN-LABEL: {{^}}div_arcp_k_x_pat_f16:
; SI: v_mul_f32_e32 v{{[0-9]+}}, 0x3dccc000, v{{[0-9]+}}

; GFX8_9: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x2e66, v{{[0-9]+}}
; GFX8_9: buffer_store_short [[MUL]]
define amdgpu_kernel void @div_arcp_k_x_pat_f16(half addrspace(1)* %out) #0 {
  %x = load half, half addrspace(1)* undef
  %rcp = fdiv arcp half %x, 10.0
  store half %rcp, half addrspace(1)* %out, align 4
  ret void
}

; Negative constant divisor: multiply by -1/10 (0xae66 = half -0.1).
; GCN-LABEL: {{^}}div_arcp_neg_k_x_pat_f16:
; SI: v_mul_f32_e32 v{{[0-9]+}}, 0xbdccc000, v{{[0-9]+}}

; GFX8_9: v_mul_f16_e32 [[MUL:v[0-9]+]], 0xae66, v{{[0-9]+}}
; GFX8_9: buffer_store_short [[MUL]]
define amdgpu_kernel void @div_arcp_neg_k_x_pat_f16(half addrspace(1)* %out) #0 {
  %x = load half, half addrspace(1)* undef
  %rcp = fdiv arcp half %x, -10.0
  store half %rcp, half addrspace(1)* %out, align 4
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1
declare half @llvm.sqrt.f16(half) #1
declare half @llvm.fabs.f16(half) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind "unsafe-fp-math"="true" }