; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s

; Codegen checks for extending half-precision values (fpext from half) on
; three amdgcn targets. Prefix usage, as set up by the three llc invocations
; above - GCN is checked on every run, SI only on the tahiti run, and GFX89 on
; both the fiji and gfx900 runs. The SIVI, VI and GFX9 prefixes are passed to
; FileCheck but no check line in the visible part of this file uses them.
;
; Keep comments free of any "prefix immediately followed by a colon" text,
; otherwise FileCheck would treat the comment as a directive.

; Scalar half -> float - one 16-bit load, one conversion, one dword store.
; GCN-LABEL: {{^}}fpext_f16_to_f32:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: v_cvt_f32_f16_e32 v[[R_F32:[0-9]+]], v[[A_F16]]
; GCN: buffer_store_dword v[[R_F32]]
; GCN: s_endpgm
define amdgpu_kernel void @fpext_f16_to_f32(
    float addrspace(1)* %r,
    half addrspace(1)* %a) #0 {
entry:
  %a.val = load half, half addrspace(1)* %a
  %r.val = fpext half %a.val to float
  store float %r.val, float addrspace(1)* %r
  ret void
}

; Scalar half -> double - expected to go through an intermediate f32
; conversion before the f32 -> f64 conversion.
; GCN-LABEL: {{^}}fpext_f16_to_f64:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; GCN: v_cvt_f64_f32_e32 v{{\[}}[[R_F64_0:[0-9]+]]:[[R_F64_1:[0-9]+]]{{\]}}, v[[A_F32]]
; GCN: buffer_store_dwordx2 v{{\[}}[[R_F64_0]]:[[R_F64_1]]{{\]}}
; GCN: s_endpgm
define amdgpu_kernel void @fpext_f16_to_f64(
    double addrspace(1)* %r,
    half addrspace(1)* %a) #0 {
entry:
  %a.val = load half, half addrspace(1)* %a
  %r.val = fpext half %a.val to double
  store double %r.val, double addrspace(1)* %r
  ret void
}

; <2 x half> -> <2 x float>. Both halves arrive in one dword. The tahiti run
; expects a 16-bit right shift followed by a second e32 conversion for the
; upper element; the fiji/gfx900 runs expect a single sdwa conversion that
; selects WORD_1 of the loaded dword.
; GCN-LABEL: {{^}}fpext_v2f16_to_v2f32:
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; GCN-DAG: v_cvt_f32_f16_e32 v[[R_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[R_F32_1:[0-9]+]], v[[A_F16_1]]
; GFX89: v_cvt_f32_f16_sdwa v[[R_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GCN: buffer_store_dwordx2 v{{\[}}[[R_F32_0]]:[[R_F32_1]]{{\]}}
; GCN: s_endpgm

define amdgpu_kernel void @fpext_v2f16_to_v2f32(
    <2 x float> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a) #0 {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %r.val = fpext <2 x half> %a.val to <2 x float>
  store <2 x float> %r.val, <2 x float> addrspace(1)* %r
  ret void
}

; <2 x half> -> <2 x double>. Only the instruction mix is checked here (no
; register captures) - two half -> f32 conversions, two f32 -> f64
; conversions, and one 128-bit store.
; GCN-LABEL: {{^}}fpext_v2f16_to_v2f64:
; GCN: buffer_load_dword
; SI-DAG: v_lshrrev_b32_e32
; SI-DAG: v_cvt_f32_f16_e32
; GFX89: v_cvt_f32_f16_sdwa
; GCN: v_cvt_f32_f16_e32

; GCN: v_cvt_f64_f32_e32
; GCN: v_cvt_f64_f32_e32
; GCN: buffer_store_dwordx4
; GCN: s_endpgm

define amdgpu_kernel void @fpext_v2f16_to_v2f64(
    <2 x double> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %r.val = fpext <2 x half> %a.val to <2 x double>
  store <2 x double> %r.val, <2 x double> addrspace(1)* %r
  ret void
}

; Half value taken from the low 16 bits of an i32 kernel argument; the check
; expects the conversion to read an s-prefixed register directly.
; NOTE(review) - despite the "fneg" in the name, this IR contains no negation
; and the check expects no source modifier. Confirm whether an fneg was meant
; to be part of this test before relying on it for modifier coverage.
; GCN-LABEL: {{^}}s_fneg_fpext_f16_to_f32:
; GCN: v_cvt_f32_f16_e32 v{{[0-9]+}}, s{{[0-9]+}}
define amdgpu_kernel void @s_fneg_fpext_f16_to_f32(float addrspace(1)* %r, i32 %a) {
entry:
  %a.trunc = trunc i32 %a to i16
  %a.val = bitcast i16 %a.trunc to half
  %r.val = fpext half %a.val to float
  store float %r.val, float addrspace(1)* %r
  ret void
}

; fneg (written as fsub -0.0, x) with a single use - expected to fold into
; the conversion as a negate source modifier on every target.
; GCN-LABEL: {{^}}fneg_fpext_f16_to_f32:
; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
; GCN: v_cvt_f32_f16_e64 v{{[0-9]+}}, -[[A]]
define amdgpu_kernel void @fneg_fpext_f16_to_f32(
    float addrspace(1)* %r,
    half addrspace(1)* %a) {
entry:
  %a.val = load half, half addrspace(1)* %a
  %a.neg = fsub half -0.0, %a.val
  %r.val = fpext half %a.neg to float
  store float %r.val, float addrspace(1)* %r
  ret void
}

; fabs with a single use - expected to fold as an absolute-value source
; modifier on the conversion.
; GCN-LABEL: {{^}}fabs_fpext_f16_to_f32:
; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
; GCN: v_cvt_f32_f16_e64 v{{[0-9]+}}, |[[A]]|
define amdgpu_kernel void @fabs_fpext_f16_to_f32(
    float addrspace(1)* %r,
    half addrspace(1)* %a) {
entry:
  %a.val = load half, half addrspace(1)* %a
  %a.fabs = call half @llvm.fabs.f16(half %a.val)
  %r.val = fpext half %a.fabs to float
  store float %r.val, float addrspace(1)* %r
  ret void
}

; fneg(fabs(x)) with a single use - expected to fold as a combined
; negate + absolute-value source modifier.
; GCN-LABEL: {{^}}fneg_fabs_fpext_f16_to_f32:
; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
; GCN: v_cvt_f32_f16_e64 v{{[0-9]+}}, -|[[A]]|
define amdgpu_kernel void @fneg_fabs_fpext_f16_to_f32(
    float addrspace(1)* %r,
    half addrspace(1)* %a) {
entry:
  %a.val = load half, half addrspace(1)* %a
  %a.fabs = call half @llvm.fabs.f16(half %a.val)
  %a.fneg.fabs = fsub half -0.0, %a.fabs
  %r.val = fpext half %a.fneg.fabs to float
  store float %r.val, float addrspace(1)* %r
  ret void
}

; The negated value is both extended and stored as a half, so the negation
; must also be materialized (xor with the half sign bit 0x8000). The volatile
; stores keep both results and their order.
; GCN-LABEL: {{^}}fneg_multi_use_fpext_f16_to_f32:
; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
; GCN-DAG: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x8000, [[A]]

; FIXME: Using the source modifier here only wastes code size
; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[A]]
; GFX89-DAG: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]]

; GCN: store_dword [[CVT]]
; GCN: store_short [[XOR]]
define amdgpu_kernel void @fneg_multi_use_fpext_f16_to_f32(
    float addrspace(1)* %r,
    half addrspace(1)* %a) {
entry:
  %a.val = load half, half addrspace(1)* %a
  %a.neg = fsub half -0.0, %a.val
  %r.val = fpext half %a.neg to float
  store volatile float %r.val, float addrspace(1)* %r
  store volatile half %a.neg, half addrspace(1)* undef
  ret void
}

; The negated value feeds both the extension and an fmul, i.e. two users that
; can each absorb a negate modifier. The tahiti run expects the multiply to be
; done in f32 and converted back; the fiji/gfx900 runs expect a native f16
; multiply with the modifier on its first operand.
; The second e64 conversion check for the fiji/gfx900 runs captures CVT_NEGA,
; which no later line references; the stored value is checked via CVTA_NEG.
; GCN-LABEL: {{^}}fneg_multi_foldable_use_fpext_f16_to_f32:
; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
; GCN-DAG: v_cvt_f32_f16_e64 [[CVTA_NEG:v[0-9]+]], -[[A]]
; SI-DAG: v_cvt_f32_f16_e32 [[CVTA:v[0-9]+]], [[A]]
; SI: v_mul_f32_e32 [[MUL_F32:v[0-9]+]], [[CVTA_NEG]], [[CVTA]]
; SI: v_cvt_f16_f32_e32 [[MUL:v[0-9]+]], [[MUL_F32]]

; GFX89-DAG: v_cvt_f32_f16_e64 [[CVT_NEGA:v[0-9]+]], -[[A]]
; GFX89: v_mul_f16_e64 [[MUL:v[0-9]+]], -[[A]], [[A]]

; GCN: buffer_store_dword [[CVTA_NEG]]
; GCN: buffer_store_short [[MUL]]
define amdgpu_kernel void @fneg_multi_foldable_use_fpext_f16_to_f32(
    float addrspace(1)* %r,
    half addrspace(1)* %a) {
entry:
  %a.val = load half, half addrspace(1)* %a
  %a.neg = fsub half -0.0, %a.val
  %r.val = fpext half %a.neg to float
  %mul = fmul half %a.neg, %a.val
  store volatile float %r.val, float addrspace(1)* %r
  store volatile half %mul, half addrspace(1)* undef
  ret void
}

; The fabs result is both extended and stored as a half, so it is also
; materialized with an and-mask of 0x7fff (clears the half sign bit).
; GCN-LABEL: {{^}}fabs_multi_use_fpext_f16_to_f32:
; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
; GCN-DAG: v_and_b32_e32 [[AND:v[0-9]+]], 0x7fff, [[A]]

; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[A]]
; GFX89-DAG: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], |[[A]]|

; GCN: store_dword [[CVT]]
; GCN: store_short [[AND]]
define amdgpu_kernel void @fabs_multi_use_fpext_f16_to_f32(
    float addrspace(1)* %r,
    half addrspace(1)* %a) {
entry:
  %a.val = load half, half addrspace(1)* %a
  %a.fabs = call half @llvm.fabs.f16(half %a.val)
  %r.val = fpext half %a.fabs to float
  store volatile float %r.val, float addrspace(1)* %r
  store volatile half %a.fabs, half addrspace(1)* undef
  ret void
}

; The fabs result feeds both the extension and an fmul. The tahiti run expects
; a single conversion of the raw value, an f32 multiply using the
; absolute-value modifier, and the extended result produced by masking the
; f32 sign bit (0x7fffffff). The fiji/gfx900 runs expect the modifier on both
; the conversion and a native f16 multiply.
; GCN-LABEL: {{^}}fabs_multi_foldable_use_fpext_f16_to_f32:
; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
; SI: v_cvt_f32_f16_e32 [[CVTA:v[0-9]+]], [[A]]
; SI: v_mul_f32_e64 [[MUL_F32:v[0-9]+]], |[[CVTA]]|, [[CVTA]]
; SI: v_cvt_f16_f32_e32 [[MUL:v[0-9]+]], [[MUL_F32]]
; SI: v_and_b32_e32 [[ABS_A:v[0-9]+]], 0x7fffffff, [[CVTA]]

; GFX89-DAG: v_cvt_f32_f16_e64 [[ABS_A:v[0-9]+]], |[[A]]|
; GFX89: v_mul_f16_e64 [[MUL:v[0-9]+]], |[[A]]|, [[A]]

; GCN: buffer_store_dword [[ABS_A]]
; GCN: buffer_store_short [[MUL]]
define amdgpu_kernel void @fabs_multi_foldable_use_fpext_f16_to_f32(
    float addrspace(1)* %r,
    half addrspace(1)* %a) {
entry:
  %a.val = load half, half addrspace(1)* %a
  %a.fabs = call half @llvm.fabs.f16(half %a.val)
  %r.val = fpext half %a.fabs to float
  %mul = fmul half %a.fabs, %a.val
  store volatile float %r.val, float addrspace(1)* %r
  store volatile half %mul, half addrspace(1)* undef
  ret void
}

; fneg(fabs(x)) is both extended and stored as a half, so it is materialized
; by or-ing in the half sign bit (0x8000); the conversion then reads that
; materialized value.
; GCN-LABEL: {{^}}fabs_fneg_multi_use_fpext_f16_to_f32:
; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
; GCN-DAG: v_or_b32_e32 [[OR:v[0-9]+]], 0x8000, [[A]]

; SI: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[OR]]
; GFX89-DAG: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -|[[OR]]|

; GCN: buffer_store_dword [[CVT]]
; GCN: buffer_store_short [[OR]]
define amdgpu_kernel void @fabs_fneg_multi_use_fpext_f16_to_f32(
    float addrspace(1)* %r,
    half addrspace(1)* %a) {
entry:
  %a.val = load half, half addrspace(1)* %a
  %a.fabs = call half @llvm.fabs.f16(half %a.val)
  %a.fneg.fabs = fsub half -0.0, %a.fabs
  %r.val = fpext half %a.fneg.fabs to float
  store volatile float %r.val, float addrspace(1)* %r
  store volatile half %a.fneg.fabs, half addrspace(1)* undef
  ret void
}

; fneg(fabs(x)) feeds both the extension and an fmul. The tahiti run expects
; the f32 multiply to carry the combined modifier and the extended result to
; be formed by or-ing the f32 sign bit (0x80000000); the fiji/gfx900 runs
; expect the combined modifier on both the conversion and the f16 multiply.
; GCN-LABEL: {{^}}fabs_fneg_multi_foldable_use_fpext_f16_to_f32:
; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
; SI: v_cvt_f32_f16_e32 [[CVTA:v[0-9]+]], [[A]]
; SI: v_mul_f32_e64 [[MUL_F32:v[0-9]+]], -|[[CVTA]]|, [[CVTA]]
; SI: v_cvt_f16_f32_e32 [[MUL:v[0-9]+]], [[MUL_F32]]
; SI: v_or_b32_e32 [[FABS_FNEG:v[0-9]+]], 0x80000000, [[CVTA]]

; GFX89-DAG: v_cvt_f32_f16_e64 [[FABS_FNEG:v[0-9]+]], -|[[A]]|
; GFX89-DAG: v_mul_f16_e64 [[MUL:v[0-9]+]], -|[[A]]|, [[A]]

; GCN: buffer_store_dword [[FABS_FNEG]]
; GCN: buffer_store_short [[MUL]]
define amdgpu_kernel void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32(
    float addrspace(1)* %r,
    half addrspace(1)* %a) {
entry:
  %a.val = load half, half addrspace(1)* %a
  %a.fabs = call half @llvm.fabs.f16(half %a.val)
  %a.fneg.fabs = fsub half -0.0, %a.fabs
  %r.val = fpext half %a.fneg.fabs to float
  %mul = fmul half %a.fneg.fabs, %a.val
  store volatile float %r.val, float addrspace(1)* %r
  store volatile half %mul, half addrspace(1)* undef
  ret void
}

declare half @llvm.fabs.f16(half) #1

; Group #0 is referenced by the first three kernels and was previously left
; undefined; define it explicitly so the reference is not dangling.
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }