; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89,SIVI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s

; Codegen test for the llvm.maxnum intrinsic on half, <2 x half>, <3 x half>
; and <4 x half> operands. The three llc invocations above each enable their
; own set of FileCheck prefixes
;   -mcpu=tahiti  enables GCN, SI, SIVI
;   -mcpu=fiji    enables GCN, VI, GFX89, SIVI
;   -mcpu=gfx900  enables GCN, GFX9, GFX89
; FileCheck runs with -enable-var-scope, so pattern variables captured in one
; kernel's checks are not visible after the next LABEL directive.

declare half @llvm.maxnum.f16(half %a, half %b)
declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
declare <3 x half> @llvm.maxnum.v3f16(<3 x half> %a, <3 x half> %b)
declare <4 x half> @llvm.maxnum.v4f16(<4 x half> %a, <4 x half> %b)

; Scalar maxnum with both operands loaded from memory. The IR loads are
; volatile and the checks expect them in source order (a, then b). The SI
; checks expect both inputs converted to f32, a v_max_f32, and a conversion
; back to f16; the GFX89 checks expect a single v_max_f16.
; GCN-LABEL: {{^}}maxnum_f16:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_max_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; GFX89: v_max_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @maxnum_f16(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = call half @llvm.maxnum.f16(half %a.val, half %b.val)
  store half %r.val, half addrspace(1)* %r
  ret void
}

; Scalar maxnum with the constant 3.0 as the first operand. The expected
; literal is 0x40400000 in the f32 max (SI checks) and 0x4200 in the f16 max
; (GFX89 checks).
; GCN-LABEL: {{^}}maxnum_f16_imm_a:
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_max_f32_e32 v[[R_F32:[0-9]+]], 0x40400000, v[[B_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; GFX89: v_max_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @maxnum_f16_imm_a(
    half addrspace(1)* %r,
    half addrspace(1)* %b) {
entry:
  %b.val = load half, half addrspace(1)* %b
  %r.val = call half @llvm.maxnum.f16(half 3.0, half %b.val)
  store half %r.val, half addrspace(1)* %r
  ret void
}

; Scalar maxnum with the constant 4.0 as the second operand. The checks
; expect the constant printed as 4.0 and placed in the first source slot of
; the max instruction, ahead of the loaded value.
; GCN-LABEL: {{^}}maxnum_f16_imm_b:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_max_f32_e32 v[[R_F32:[0-9]+]], 4.0, v[[A_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; GFX89: v_max_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @maxnum_f16_imm_b(
    half addrspace(1)* %r,
    half addrspace(1)* %a) {
entry:
  %a.val = load half, half addrspace(1)* %a
  %r.val = call half @llvm.maxnum.f16(half %a.val, half 4.0)
  store half %r.val, half addrspace(1)* %r
  ret void
}

; Two-element vector maxnum with both operands loaded. Expected lowering per
; target
;   SI checks   - each half is shifted out, converted to f32, maxed and
;                 converted back; the results are repacked with a shift and
;                 an or, with no "and" in between.
;   VI checks   - v_max_f16 for the low half plus an SDWA v_max_f16 that
;                 reads and writes WORD_1 for the high half, then an or.
;   GFX9 checks - a single v_pk_max_f16.
; GCN-LABEL: {{^}}maxnum_v2f16:
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]

; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI-DAG: v_max_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI-NOT: and
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]

; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
; VI-DAG: v_max_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NOT: and
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]

; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]

; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @maxnum_v2f16(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a.val, <2 x half> %b.val)
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; Two-element vector maxnum with the constant <3.0, 4.0> as the first
; operand. The VI checks expect 0x4400 (element 1) moved into a v register
; for the SDWA max and 0x4200 (element 0) used as a literal; the GFX9 checks
; expect both packed into 0x44004200, moved into an s register and fed to
; v_pk_max_f16.
; GCN-LABEL: {{^}}maxnum_v2f16_imm_a:
; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI-DAG: v_max_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400
; VI-DAG: v_max_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]

; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SIVI-NOT: and
; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]


; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x44004200
; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], [[K]]

; GCN: buffer_store_dword v[[R_V2_F16]]
define amdgpu_kernel void @maxnum_v2f16_imm_a(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %b) {
entry:
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = call <2 x half> @llvm.maxnum.v2f16(<2 x half> <half 3.0, half 4.0>, <2 x half> %b.val)
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; Two-element vector maxnum with the constant <4.0, 3.0> as the second
; operand. This mirrors the imm_a kernel with the elements swapped, so the
; VI checks expect 0x4200 in the v register and 4.0 in the plain max, and
; the GFX9 checks expect the packed constant 0x42004400.
; GCN-LABEL: {{^}}maxnum_v2f16_imm_b:
; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG: v_max_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]

; VI-DAG: v_mov_b32_e32 [[CONST3:v[0-9]+]], 0x4200
; VI-DAG: v_max_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], [[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]

; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]


; SIVI-NOT: and
; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]

; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x42004400
; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]]

; GCN: buffer_store_dword v[[R_V2_F16]]
define amdgpu_kernel void @maxnum_v2f16_imm_b(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %r.val = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a.val, <2 x half> <half 4.0, half 3.0>)
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; Three-element vector maxnum. Only the GFX9 prefix has checks here; they
; expect two v_pk_max_f16 instructions. The other targets are covered by the
; label match alone.
; FIXME: Scalarize with undef half
; GCN-LABEL: {{^}}maxnum_v3f16:
; GFX9: v_pk_max_f16
; GFX9: v_pk_max_f16
define amdgpu_kernel void @maxnum_v3f16(
    <3 x half> addrspace(1)* %r,
    <3 x half> addrspace(1)* %a,
    <3 x half> addrspace(1)* %b) {
entry:
  %a.val = load <3 x half>, <3 x half> addrspace(1)* %a
  %b.val = load <3 x half>, <3 x half> addrspace(1)* %b
  %r.val = call <3 x half> @llvm.maxnum.v3f16(<3 x half> %a.val, <3 x half> %b.val)
  store <3 x half> %r.val, <3 x half> addrspace(1)* %r
  ret void
}

; Four-element vector maxnum with both operands loaded. The two dwordx2
; loads are checked under GFX89; the pair of v_pk_max_f16 (one per dword) and
; the dwordx2 store of their results are checked under GFX9 only.
; GCN-LABEL: {{^}}maxnum_v4f16:
; GFX89: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
; GFX89: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}
; GFX9-DAG: v_pk_max_f16 v[[MAX_LO:[0-9]+]], v[[B_LO]], v[[A_LO]]
; GFX9-DAG: v_pk_max_f16 v[[MAX_HI:[0-9]+]], v[[B_HI]], v[[A_HI]]
; GFX9: buffer_store_dwordx2 v{{\[}}[[MAX_LO]]:[[MAX_HI]]{{\]}}
define amdgpu_kernel void @maxnum_v4f16(
    <4 x half> addrspace(1)* %r,
    <4 x half> addrspace(1)* %a,
    <4 x half> addrspace(1)* %b) {
entry:
  %a.val = load <4 x half>, <4 x half> addrspace(1)* %a
  %b.val = load <4 x half>, <4 x half> addrspace(1)* %b
  %r.val = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %a.val, <4 x half> %b.val)
  store <4 x half> %r.val, <4 x half> addrspace(1)* %r
  ret void
}

; Four-element vector maxnum with the constant <8.0, 2.0, 3.0, 4.0> as the
; first operand. The GFX9 checks expect the constant split into two packed
; dwords, 0x40004800 for elements 0-1 and 0x44004200 for elements 2-3. The VI
; checks expect one plain and one SDWA v_max_f16 per dword, or-ed together.
; The captures are named A_LO/A_HI although the kernel's only load is from
; %b.
; GCN-LABEL: {{^}}fmax_v4f16_imm_a:
; GFX89-DAG: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
; GFX9-DAG: s_mov_b32 [[K1:s[0-9]+]], 0x44004200
; GFX9-DAG: s_mov_b32 [[K0:s[0-9]+]], 0x40004800

; GFX9-DAG: v_pk_max_f16 v[[MAX_LO:[0-9]+]], v[[A_LO]], [[K0]]
; GFX9-DAG: v_pk_max_f16 v[[MAX_HI:[0-9]+]], v[[A_HI]], [[K1]]
; GFX9: buffer_store_dwordx2 v{{\[}}[[MAX_LO]]:[[MAX_HI]]{{\]}}

; VI-DAG: v_mov_b32_e32 [[K2:v[0-9]+]], 0x4000
; VI-DAG: v_mov_b32_e32 [[K4:v[0-9]+]], 0x4400

; VI-DAG: v_max_f16_sdwa v[[MAX_HI_HI:[0-9]+]], v[[A_HI]], [[K4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_max_f16_e32 v[[MAX_HI_LO:[0-9]+]], 0x4200, v[[A_HI]]
; VI-DAG: v_max_f16_sdwa v[[MAX_LO_HI:[0-9]+]], v[[A_LO]], [[K2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_max_f16_e32 v[[MAX_LO_LO:[0-9]+]], 0x4800, v[[A_LO]]

; VI-DAG: v_or_b32_e32 v[[OR0:[0-9]+]], v[[MAX_LO_LO]], v[[MAX_LO_HI]]
; VI-DAG: v_or_b32_e32 v[[OR1:[0-9]+]], v[[MAX_HI_LO]], v[[MAX_HI_HI]]

; VI: buffer_store_dwordx2 v{{\[}}[[OR0]]:[[OR1]]{{\]}}
define amdgpu_kernel void @fmax_v4f16_imm_a(
    <4 x half> addrspace(1)* %r,
    <4 x half> addrspace(1)* %b) {
entry:
  %b.val = load <4 x half>, <4 x half> addrspace(1)* %b
  %r.val = call <4 x half> @llvm.maxnum.v4f16(<4 x half> <half 8.0, half 2.0, half 3.0, half 4.0>, <4 x half> %b.val)
  store <4 x half> %r.val, <4 x half> addrspace(1)* %r
  ret void
}