; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX8 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s

; Codegen checks for llvm.copysign where at least one operand or the result is
; half (or a vector of half). Each kernel is compiled for tahiti, tonga and
; gfx900 by the three invocations above; checks shared by all three use the GCN
; prefix, tahiti-only checks use SI, and checks shared by tonga and gfx900 use
; GFX89.

declare half @llvm.copysign.f16(half, half)
declare float @llvm.copysign.f32(float, float)
declare double @llvm.copysign.f64(double, double)
declare <2 x half> @llvm.copysign.v2f16(<2 x half>, <2 x half>)
declare <3 x half> @llvm.copysign.v3f16(<3 x half>, <3 x half>)
declare <4 x half> @llvm.copysign.v4f16(<4 x half>, <4 x half>)

declare i32 @llvm.amdgcn.workitem.id.x()

; half copysign(half, half). Both operands come from volatile loads. The SI
; checks expect both inputs to be converted to f32 around the v_bfi_b32, while
; the GFX89 checks expect the v_bfi_b32 to consume the loaded values directly
; with a 0x7fff mask.
; GCN-LABEL: {{^}}test_copysign_f16:
; SI: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
; SI: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]]
; SI: s_brev_b32 s[[CONST:[0-9]+]], -2
; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG]]
; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]]
; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN_F32]]
; SI: v_cvt_f16_f32_e32 v[[OUT:[0-9]+]], v[[OUT_F32]]
; GFX89: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
; GFX89: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]]
; GFX89: s_movk_i32 s[[CONST:[0-9]+]], 0x7fff
; GFX89: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN]]
; GCN: buffer_store_short v[[OUT]]
; GCN: s_endpgm
define amdgpu_kernel void @test_copysign_f16(
  half addrspace(1)* %arg_out,
  half addrspace(1)* %arg_mag,
  half addrspace(1)* %arg_sign) {
entry:
  %mag = load volatile half, half addrspace(1)* %arg_mag
  %sign = load volatile half, half addrspace(1)* %arg_sign
  %out = call half @llvm.copysign.f16(half %mag, half %sign)
  store half %out, half addrspace(1)* %arg_out
  ret void
}

; float copysign(fpext(half), float). The checks expect the magnitude to be
; converted to f32 and the loaded sign dword to feed the v_bfi_b32 unchanged.
; GCN-LABEL: {{^}}test_copysign_out_f32_mag_f16_sign_f32:
; GCN-DAG: {{buffer|flat|global}}_load_ushort v[[MAG:[0-9]+]]
; GCN-DAG: {{buffer|flat|global}}_load_dword v[[SIGN:[0-9]+]]
; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
; GCN-DAG: v_cvt_f32_f16_e32 v[[MAG_EXT:[0-9]+]], v[[MAG]]
; GCN: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG_EXT]], v[[SIGN]]
; GCN: buffer_store_dword v[[OUT]]
; GCN: s_endpgm
define amdgpu_kernel void @test_copysign_out_f32_mag_f16_sign_f32(
  float addrspace(1)* %arg_out,
  half addrspace(1)* %arg_mag,
  float addrspace(1)* %arg_sign) {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %arg_mag_gep = getelementptr half, half addrspace(1)* %arg_mag, i32 %tid
  %mag = load half, half addrspace(1)* %arg_mag_gep
  %mag.ext = fpext half %mag to float
  %arg_sign_gep = getelementptr float, float addrspace(1)* %arg_sign, i32 %tid
  %sign = load float, float addrspace(1)* %arg_sign_gep
  %out = call float @llvm.copysign.f32(float %mag.ext, float %sign)
  store float %out, float addrspace(1)* %arg_out
  ret void
}

; double copysign(fpext(half), double). The checks expect the v_bfi_b32 to be
; applied only to the high dwords; the stored pair reuses the low dword of the
; extended magnitude.
; GCN-LABEL: {{^}}test_copysign_out_f64_mag_f16_sign_f64:
; GCN-DAG: {{buffer|flat|global}}_load_ushort v[[MAG:[0-9]+]]
; GCN-DAG: {{buffer|flat|global}}_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}}
; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
; GCN-DAG: v_cvt_f32_f16_e32 v[[MAG_EXT:[0-9]+]], v[[MAG]]
; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[MAG_EXT_LO:[0-9]+]]:[[MAG_EXT_HI:[0-9]+]]{{\]}}, v[[MAG_EXT]]
; GCN: v_bfi_b32 v[[OUT_HI:[0-9]+]], s[[CONST]], v[[MAG_EXT_HI]], v[[SIGN_HI]]
; GCN: buffer_store_dwordx2 v{{\[}}[[MAG_EXT_LO]]:[[OUT_HI]]{{\]}}
; GCN: s_endpgm
define amdgpu_kernel void @test_copysign_out_f64_mag_f16_sign_f64(
  double addrspace(1)* %arg_out,
  half addrspace(1)* %arg_mag,
  double addrspace(1)* %arg_sign) {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %arg_mag_gep = getelementptr half, half addrspace(1)* %arg_mag, i32 %tid
  %mag = load half, half addrspace(1)* %arg_mag_gep
  %mag.ext = fpext half %mag to double
  %arg_sign_gep = getelementptr double, double addrspace(1)* %arg_sign, i32 %tid
  %sign = load double, double addrspace(1)* %arg_sign_gep
  %out = call double @llvm.copysign.f64(double %mag.ext, double %sign)
  store double %out, double addrspace(1)* %arg_out
  ret void
}

; float copysign(float, fpext(half)). The SI checks expect the sign to be
; converted to f32; the GFX89 checks expect it to be shifted left by 16 instead.
; GCN-LABEL: {{^}}test_copysign_out_f32_mag_f32_sign_f16:
; GCN-DAG: {{buffer|flat|global}}_load_dword v[[MAG:[0-9]+]]
; GCN-DAG: {{buffer|flat|global}}_load_ushort v[[SIGN:[0-9]+]]
; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]]
; SI: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN_F32]]
; GFX89-DAG: v_lshlrev_b32_e32 v[[SIGN_SHIFT:[0-9]+]], 16, v[[SIGN]]
; GFX89: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN_SHIFT]]
; GCN: buffer_store_dword v[[OUT]]
; GCN: s_endpgm
define amdgpu_kernel void @test_copysign_out_f32_mag_f32_sign_f16(
  float addrspace(1)* %arg_out,
  float addrspace(1)* %arg_mag,
  half addrspace(1)* %arg_sign) {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %arg_mag_gep = getelementptr float, float addrspace(1)* %arg_mag, i32 %tid
  %mag = load float, float addrspace(1)* %arg_mag_gep
  %arg_sign_gep = getelementptr half, half addrspace(1)* %arg_sign, i32 %tid
  %sign = load half, half addrspace(1)* %arg_sign_gep
  %sign.ext = fpext half %sign to float
  %out = call float @llvm.copysign.f32(float %mag, float %sign.ext)
  store float %out, float addrspace(1)* %arg_out
  ret void
}

; double copysign(double, fpext(half)). Same sign handling as the f32 variant
; above, with the v_bfi_b32 applied to the high dword of the magnitude only.
; GCN-LABEL: {{^}}test_copysign_out_f64_mag_f64_sign_f16:
; GCN-DAG: {{buffer|flat|global}}_load_dwordx2 v{{\[}}[[MAG_LO:[0-9]+]]:[[MAG_HI:[0-9]+]]{{\]}}
; GCN-DAG: {{buffer|flat|global}}_load_ushort v[[SIGN:[0-9]+]]
; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]]
; SI: v_bfi_b32 v[[OUT_HI:[0-9]+]], s[[CONST]], v[[MAG_HI]], v[[SIGN_F32]]
; GFX89-DAG: v_lshlrev_b32_e32 v[[SIGN_SHIFT:[0-9]+]], 16, v[[SIGN]]
; GFX89: v_bfi_b32 v[[OUT_HI:[0-9]+]], s[[CONST]], v[[MAG_HI]], v[[SIGN_SHIFT]]
; GCN: buffer_store_dwordx2 v{{\[}}[[MAG_LO]]:[[OUT_HI]]{{\]}}
; GCN: s_endpgm
define amdgpu_kernel void @test_copysign_out_f64_mag_f64_sign_f16(
  double addrspace(1)* %arg_out,
  double addrspace(1)* %arg_mag,
  half addrspace(1)* %arg_sign) {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %arg_mag_gep = getelementptr double, double addrspace(1)* %arg_mag, i32 %tid
  %mag = load double, double addrspace(1)* %arg_mag_gep
  %arg_sign_gep = getelementptr half, half addrspace(1)* %arg_sign, i32 %tid
  %sign = load half, half addrspace(1)* %arg_sign_gep
  %sign.ext = fpext half %sign to double
  %out = call double @llvm.copysign.f64(double %mag, double %sign.ext)
  store double %out, double addrspace(1)* %arg_out
  ret void
}

; half copysign(half, fptrunc(float)). The SI checks expect the f32 sign to be
; used as-is with the magnitude widened to f32; the GFX89 checks expect the
; sign to be shifted right by 16 and combined with a 0x7fff mask.
; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f16_sign_f32:
; GCN-DAG: {{buffer|flat|global}}_load_ushort v[[MAG:[0-9]+]]
; GCN-DAG: {{buffer|flat|global}}_load_dword v[[SIGN:[0-9]+]]
; SI-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG]]
; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN]]
; SI: v_cvt_f16_f32_e32 v[[OUT:[0-9]+]], v[[OUT_F32]]
; GFX89-DAG: s_movk_i32 s[[CONST:[0-9]+]], 0x7fff
; GFX89-DAG: v_lshrrev_b32_e32 v[[SIGN_SHIFT:[0-9]+]], 16, v[[SIGN]]
; GFX89: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN_SHIFT]]
; GCN: buffer_store_short v[[OUT]]
; GCN: s_endpgm
define amdgpu_kernel void @test_copysign_out_f16_mag_f16_sign_f32(
  half addrspace(1)* %arg_out,
  half addrspace(1)* %arg_mag,
  float addrspace(1)* %arg_sign) {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %arg_mag_gep = getelementptr half, half addrspace(1)* %arg_mag, i32 %tid
  %mag = load half, half addrspace(1)* %arg_mag_gep
  %arg_sign_gep = getelementptr float, float addrspace(1)* %arg_sign, i32 %tid
  %sign = load float, float addrspace(1)* %arg_sign_gep
  %sign.trunc = fptrunc float %sign to half
  %out = call half @llvm.copysign.f16(half %mag, half %sign.trunc)
  store half %out, half addrspace(1)* %arg_out
  ret void
}

; half copysign(half, fptrunc(double)). Like the f32-sign variant above, but
; the checks expect the high dword of the loaded double to supply the sign.
; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f16_sign_f64:
; GCN-DAG: {{buffer|flat|global}}_load_ushort v[[MAG:[0-9]+]]
; GCN-DAG: {{buffer|flat|global}}_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}}
; SI-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG]]
; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN_HI]]
; SI: v_cvt_f16_f32_e32 v[[OUT:[0-9]+]], v[[OUT_F32]]
; GFX89-DAG: s_movk_i32 s[[CONST:[0-9]+]], 0x7fff
; GFX89-DAG: v_lshrrev_b32_e32 v[[SIGN_SHIFT:[0-9]+]], 16, v[[SIGN_HI]]
; GFX89: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN_SHIFT]]
; GCN: buffer_store_short v[[OUT]]
; GCN: s_endpgm
define amdgpu_kernel void @test_copysign_out_f16_mag_f16_sign_f64(
  half addrspace(1)* %arg_out,
  half addrspace(1)* %arg_mag,
  double addrspace(1)* %arg_sign) {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %arg_mag_gep = getelementptr half, half addrspace(1)* %arg_mag, i32 %tid
  ; Load through the per-workitem GEP, matching the sibling tests; the GEP was
  ; previously computed but left unused because the load read %arg_mag.
  %mag = load half, half addrspace(1)* %arg_mag_gep
  %arg_sign_gep = getelementptr double, double addrspace(1)* %arg_sign, i32 %tid
  %sign = load double, double addrspace(1)* %arg_sign_gep
  %sign.trunc = fptrunc double %sign to half
  %out = call half @llvm.copysign.f16(half %mag, half %sign.trunc)
  store half %out, half addrspace(1)* %arg_out
  ret void
}

; half copysign(fptrunc(float), half). The SI checks expect the truncated
; magnitude to be widened back to f32 before the v_bfi_b32; the GFX89 checks
; expect the truncated magnitude and the loaded sign to be used directly.
; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f32_sign_f16:
; GCN-DAG: {{buffer|flat|global}}_load_dword v[[MAG:[0-9]+]]
; GCN-DAG: {{buffer|flat|global}}_load_ushort v[[SIGN:[0-9]+]]
; SI-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
; SI-DAG: v_cvt_f16_f32_e32 v[[MAG_TRUNC:[0-9]+]], v[[MAG]]
; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]]
; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG_TRUNC]]
; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN_F32]]
; SI: v_cvt_f16_f32_e32 v[[OUT:[0-9]+]], v[[OUT_F32]]
; GFX89-DAG: s_movk_i32 s[[CONST:[0-9]+]], 0x7fff
; GFX89-DAG: v_cvt_f16_f32_e32 v[[MAG_TRUNC:[0-9]+]], v[[MAG]]
; GFX89: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG_TRUNC]], v[[SIGN]]
; GCN: buffer_store_short v[[OUT]]
; GCN: s_endpgm
define amdgpu_kernel void @test_copysign_out_f16_mag_f32_sign_f16(
  half addrspace(1)* %arg_out,
  float addrspace(1)* %arg_mag,
  half addrspace(1)* %arg_sign) {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %arg_mag_gep = getelementptr float, float addrspace(1)* %arg_mag, i32 %tid
  %mag = load float, float addrspace(1)* %arg_mag_gep
  %mag.trunc = fptrunc float %mag to half
  %arg_sign_gep = getelementptr half, half addrspace(1)* %arg_sign, i32 %tid
  %sign = load half, half addrspace(1)* %arg_sign_gep
  %out = call half @llvm.copysign.f16(half %mag.trunc, half %sign)
  store half %out, half addrspace(1)* %arg_out
  ret void
}

; half copysign(fptrunc(double), half). Only the presence of a v_bfi_b32 is
; checked here; the operands are loaded straight from the kernel arguments.
; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f64_sign_f16:
; GCN: v_bfi_b32
; GCN: s_endpgm
define amdgpu_kernel void @test_copysign_out_f16_mag_f64_sign_f16(
  half addrspace(1)* %arg_out,
  double addrspace(1)* %arg_mag,
  half addrspace(1)* %arg_sign) {
entry:
  %mag = load double, double addrspace(1)* %arg_mag
  %mag.trunc = fptrunc double %mag to half
  %sign = load half, half addrspace(1)* %arg_sign
  %out = call half @llvm.copysign.f16(half %mag.trunc, half %sign)
  store half %out, half addrspace(1)* %arg_out
  ret void
}

; <2 x half> copysign with both vectors passed by value as kernel arguments.
; Two v_bfi_b32 instructions are expected.
; FIXME(review): the "VI" prefix on the v_or_b32_sdwa check below is not
; enabled by any of the FileCheck invocations at the top of this file (they
; enable GCN, SI, GFX89, GFX8 and GFX9 only), so that line is never evaluated.
; Confirm against current tonga output before renaming the prefix to GFX8.
; GCN-LABEL: {{^}}test_copysign_v2f16:
; GCN: v_bfi_b32
; GCN: v_bfi_b32
; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GCN: s_endpgm
define amdgpu_kernel void @test_copysign_v2f16(
  <2 x half> addrspace(1)* %arg_out,
  <2 x half> %arg_mag,
  <2 x half> %arg_sign) {
entry:
  %out = call <2 x half> @llvm.copysign.v2f16(<2 x half> %arg_mag, <2 x half> %arg_sign)
  store <2 x half> %out, <2 x half> addrspace(1)* %arg_out
  ret void
}

; <3 x half> copysign with by-value vector arguments. Three v_bfi_b32
; instructions are expected.
; GCN-LABEL: {{^}}test_copysign_v3f16:
; GCN: v_bfi_b32
; GCN: v_bfi_b32
; GCN: v_bfi_b32
; GCN: s_endpgm
define amdgpu_kernel void @test_copysign_v3f16(
  <3 x half> addrspace(1)* %arg_out,
  <3 x half> %arg_mag,
  <3 x half> %arg_sign) {
entry:
  %out = call <3 x half> @llvm.copysign.v3f16(<3 x half> %arg_mag, <3 x half> %arg_sign)
  store <3 x half> %out, <3 x half> addrspace(1)* %arg_out
  ret void
}

; <4 x half> copysign with by-value vector arguments. Four v_bfi_b32
; instructions are expected.
; GCN-LABEL: {{^}}test_copysign_v4f16:
; GCN: v_bfi_b32
; GCN: v_bfi_b32
; GCN: v_bfi_b32
; GCN: v_bfi_b32
; GCN: s_endpgm
define amdgpu_kernel void @test_copysign_v4f16(
  <4 x half> addrspace(1)* %arg_out,
  <4 x half> %arg_mag,
  <4 x half> %arg_sign) {
entry:
  %out = call <4 x half> @llvm.copysign.v4f16(<4 x half> %arg_mag, <4 x half> %arg_sign)
  store <4 x half> %out, <4 x half> addrspace(1)* %arg_out
  ret void
}