; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89,SIVI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX9 %s

; Code generation test for the llvm.minnum intrinsic on half, <2 x half>,
; <3 x half> and <4 x half> operands. The file is compiled three times
; (tahiti, fiji, gfx900); each invocation enables its own set of check
; prefixes, so a directive only applies to the targets whose prefix list
; contains it.

declare half @llvm.minnum.f16(half %a, half %b)
declare <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
declare <3 x half> @llvm.minnum.v3f16(<3 x half> %a, <3 x half> %b)
declare <4 x half> @llvm.minnum.v4f16(<4 x half> %a, <4 x half> %b)

; Scalar half, both operands loaded (volatile) from addrspace(1).
; The tahiti checks expect a convert-to-f32 / v_min_f32 / convert-back
; sequence; the fiji and gfx900 checks expect a single v_min_f16.
; GCN-LABEL: {{^}}minnum_f16:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_min_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; GFX89: v_min_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @minnum_f16(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = call half @llvm.minnum.f16(half %a.val, half %b.val)
  store half %r.val, half addrspace(1)* %r
  ret void
}

; Scalar half with the constant 3.0 as the first operand. The checks expect
; the constant as the literal 0x40400000 (f32 path) or 0x4200 (f16 path).
; GCN-LABEL: {{^}}minnum_f16_imm_a:
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_min_f32_e32 v[[R_F32:[0-9]+]], 0x40400000, v[[B_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; GFX89: v_min_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @minnum_f16_imm_a(
    half addrspace(1)* %r,
    half addrspace(1)* %b) {
entry:
  %b.val = load half, half addrspace(1)* %b
  %r.val = call half @llvm.minnum.f16(half 3.0, half %b.val)
  store half %r.val, half addrspace(1)* %r
  ret void
}

; Scalar half with the constant 4.0 as the second operand. The checks expect
; 4.0 to appear directly as the first source operand of the min instruction.
; GCN-LABEL: {{^}}minnum_f16_imm_b:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_min_f32_e32 v[[R_F32:[0-9]+]], 4.0, v[[A_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; GFX89: v_min_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @minnum_f16_imm_b(
    half addrspace(1)* %r,
    half addrspace(1)* %a) {
entry:
  %a.val = load half, half addrspace(1)* %a
  %r.val = call half @llvm.minnum.f16(half %a.val, half 4.0)
  store half %r.val, half addrspace(1)* %r
  ret void
}

; <2 x half>, both operands loaded from memory. Expected shapes per target:
; tahiti splits the lanes, works in f32 and repacks with shift + or;
; fiji uses one v_min_f16 plus one sdwa v_min_f16 joined by an or;
; gfx900 uses a single v_pk_min_f16.
; GCN-LABEL: {{^}}minnum_v2f16:
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]

; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI-DAG: v_min_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI-NOT: and
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]

; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
; VI-DAG: v_min_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NOT: and
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]

; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]

; GCN: buffer_store_dword v[[R_V2_F16]]
define amdgpu_kernel void @minnum_v2f16(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a.val, <2 x half> %b.val)
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; <2 x half> with the constant vector <3.0, 4.0> as the first operand.
; The gfx900 checks expect both lanes packed into the scalar constant
; 0x44004200; the other targets handle the two lanes separately.
; GCN-LABEL: {{^}}minnum_v2f16_imm_a:
; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI-DAG: v_min_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400
; VI-DAG: v_min_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]

; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SIVI-NOT: and
; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]


; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x44004200
; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], [[K]]

; GCN: buffer_store_dword v[[R_V2_F16]]
define amdgpu_kernel void @minnum_v2f16_imm_a(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %b) {
entry:
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> <half 3.0, half 4.0>, <2 x half> %b.val)
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; <2 x half> with the constant vector <4.0, 3.0> as the second operand.
; Mirrors the imm_a case with the lane constants swapped; the gfx900 checks
; expect the packed scalar constant 0x42004400.
; GCN-LABEL: {{^}}minnum_v2f16_imm_b:
; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG: v_min_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]

; VI-DAG: v_mov_b32_e32 [[CONST3:v[0-9]+]], 0x4200
; VI-DAG: v_min_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], [[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]

; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]


; SIVI-NOT: and
; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]

; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x42004400
; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]]

; GCN: buffer_store_dword v[[R_V2_F16]]
define amdgpu_kernel void @minnum_v2f16_imm_b(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a.val, <2 x half> <half 4.0, half 3.0>)
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; <3 x half>. Only the gfx900 output is checked beyond the label: two
; v_pk_min_f16 instructions are expected.
; FIXME: Scalarize with undef half
; GCN-LABEL: {{^}}minnum_v3f16:
; GFX9: v_pk_min_f16
; GFX9: v_pk_min_f16
define amdgpu_kernel void @minnum_v3f16(
    <3 x half> addrspace(1)* %r,
    <3 x half> addrspace(1)* %a,
    <3 x half> addrspace(1)* %b) {
entry:
  %a.val = load <3 x half>, <3 x half> addrspace(1)* %a
  %b.val = load <3 x half>, <3 x half> addrspace(1)* %b
  %r.val = call <3 x half> @llvm.minnum.v3f16(<3 x half> %a.val, <3 x half> %b.val)
  store <3 x half> %r.val, <3 x half> addrspace(1)* %r
  ret void
}

; <4 x half>, both operands loaded from memory. The fiji and gfx900 checks
; expect two 64-bit loads; the gfx900 checks additionally expect one
; v_pk_min_f16 per 32-bit half of the vector and a 64-bit store of the pair.
; GCN-LABEL: {{^}}minnum_v4f16:
; GFX89: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
; GFX89: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}
; GFX9-DAG: v_pk_min_f16 v[[MIN_LO:[0-9]+]], v[[B_LO]], v[[A_LO]]
; GFX9-DAG: v_pk_min_f16 v[[MIN_HI:[0-9]+]], v[[B_HI]], v[[A_HI]]
; GFX9: buffer_store_dwordx2 v{{\[}}[[MIN_LO]]:[[MIN_HI]]{{\]}}
define amdgpu_kernel void @minnum_v4f16(
    <4 x half> addrspace(1)* %r,
    <4 x half> addrspace(1)* %a,
    <4 x half> addrspace(1)* %b) {
entry:
  %a.val = load <4 x half>, <4 x half> addrspace(1)* %a
  %b.val = load <4 x half>, <4 x half> addrspace(1)* %b
  %r.val = call <4 x half> @llvm.minnum.v4f16(<4 x half> %a.val, <4 x half> %b.val)
  store <4 x half> %r.val, <4 x half> addrspace(1)* %r
  ret void
}

; <4 x half> with the constant vector <8.0, 2.0, 3.0, 4.0> as the first
; operand. The gfx900 checks expect the constants packed pairwise into
; 0x40004800 (low half) and 0x44004200 (high half); the fiji checks expect
; four per-lane min instructions recombined with two or instructions.
; Note that the loaded operand is %b although the capture names use A_.
; GCN-LABEL: {{^}}fmin_v4f16_imm_a:
; GFX89-DAG: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
; GFX9-DAG: s_mov_b32 [[K1:s[0-9]+]], 0x44004200
; GFX9-DAG: s_mov_b32 [[K0:s[0-9]+]], 0x40004800

; GFX9-DAG: v_pk_min_f16 v[[MIN_LO:[0-9]+]], v[[A_LO]], [[K0]]
; GFX9-DAG: v_pk_min_f16 v[[MIN_HI:[0-9]+]], v[[A_HI]], [[K1]]
; GFX9: buffer_store_dwordx2 v{{\[}}[[MIN_LO]]:[[MIN_HI]]{{\]}}

; VI-DAG: v_mov_b32_e32 [[K2:v[0-9]+]], 0x4000
; VI-DAG: v_mov_b32_e32 [[K4:v[0-9]+]], 0x4400

; VI-DAG: v_min_f16_sdwa v[[MIN_HI_HI:[0-9]+]], v[[A_HI]], [[K4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_min_f16_e32 v[[MIN_HI_LO:[0-9]+]], 0x4200, v[[A_HI]]
; VI-DAG: v_min_f16_sdwa v[[MIN_LO_HI:[0-9]+]], v[[A_LO]], [[K2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_min_f16_e32 v[[MIN_LO_LO:[0-9]+]], 0x4800, v[[A_LO]]

; VI-DAG: v_or_b32_e32 v[[OR0:[0-9]+]], v[[MIN_LO_LO]], v[[MIN_LO_HI]]
; VI-DAG: v_or_b32_e32 v[[OR1:[0-9]+]], v[[MIN_HI_LO]], v[[MIN_HI_HI]]

; VI: buffer_store_dwordx2 v{{\[}}[[OR0]]:[[OR1]]{{\]}}
define amdgpu_kernel void @fmin_v4f16_imm_a(
    <4 x half> addrspace(1)* %r,
    <4 x half> addrspace(1)* %b) {
entry:
  %b.val = load <4 x half>, <4 x half> addrspace(1)* %b
  %r.val = call <4 x half> @llvm.minnum.v4f16(<4 x half> <half 8.0, half 2.0, half 3.0, half 4.0>, <4 x half> %b.val)
  store <4 x half> %r.val, <4 x half> addrspace(1)* %r
  ret void
}