; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s

; Codegen test for the llvm.fma.f16 / llvm.fma.v2f16 intrinsics:
; SI has no f16 FMA, so operands are converted to f32 (v_cvt_f32_f16) and
; combined with v_fma_f32; VI uses the native v_fma_f16 directly.

declare half @llvm.fma.f16(half %a, half %b, half %c)
declare <2 x half> @llvm.fma.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)

; GCN-LABEL: {{^}}fma_f16
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], v[[B_F32:[0-9]]], v[[C_F32:[0-9]]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fma_f16(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) {
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c
  %r.val = call half @llvm.fma.f16(half %a.val, half %b.val, half %c.val)
  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fma_f16_imm_a
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]

; SI: v_mov_b32_e32 v[[A_F32:[0-9]+]], 0x40400000{{$}}
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], v[[B_F32:[0-9]]], v[[C_F32:[0-9]]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; VI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x4200{{$}}
; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fma_f16_imm_a(
    half addrspace(1)* %r,
    half addrspace(1)* %b,
    half addrspace(1)* %c) {
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c
  %r.val = call half @llvm.fma.f16(half 3.0, half %b.val, half %c.val)
  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fma_f16_imm_b
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
; SI: v_mov_b32_e32 v[[B_F32:[0-9]+]], 0x40400000{{$}}
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], v[[B_F32:[0-9]]], v[[C_F32:[0-9]]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; VI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x4200{{$}}
; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fma_f16_imm_b(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %c) {
  %a.val = load half, half addrspace(1)* %a
  %c.val = load half, half addrspace(1)* %c
  %r.val = call half @llvm.fma.f16(half %a.val, half 3.0, half %c.val)
  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fma_f16_imm_c
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_mov_b32_e32 v[[C_F32:[0-9]+]], 0x40400000{{$}}
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], v[[B_F32:[0-9]]], v[[C_F32:[0-9]]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x4200{{$}}
; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fma_f16_imm_c(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %r.val = call half @llvm.fma.f16(half %a.val, half %b.val, half 3.0)
  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fma_v2f16
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]]

; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]

; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]


; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], v[[C_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]

; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; VI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; VI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]]
; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]]

; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN-NOT: and
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fma_v2f16(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) {
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %r.val = call <2 x half> @llvm.fma.v2f16(<2 x half> %a.val, <2 x half> %b.val, <2 x half> %c.val)
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fma_v2f16_imm_a:
; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]


; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]]


; SI: v_mov_b32_e32 v[[A_F32:[0-9]+]], 0x40400000{{$}}
; VI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x4200{{$}}
; GCN-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; GCN-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]

; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]

; SI: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32]], v[[C_F32_1]]
; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32]], v[[C_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]

; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[C_F16_1]], v[[A_F16]], v[[B_F16_1]]
; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[C_V2_F16]], v[[A_F16]], v[[B_V2_F16]]

; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN-NOT: and
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fma_v2f16_imm_a(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) {
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %r.val = call <2 x half> @llvm.fma.v2f16(<2 x half> <half 3.0, half 3.0>, <2 x half> %b.val, <2 x half> %c.val)
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fma_v2f16_imm_b:
; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]

; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]]

; SI: v_mov_b32_e32 v[[B_F32:[0-9]+]], 0x40400000{{$}}
; VI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x4200{{$}}

; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]

; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32]], v[[C_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32]], v[[C_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]

; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; VI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_F16]], v[[C_V2_F16]]
; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16]], v[[C_F16_1]]

; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN-NOT: and
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fma_v2f16_imm_b(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %c) {
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %r.val = call <2 x half> @llvm.fma.v2f16(<2 x half> %a.val, <2 x half> <half 3.0, half 3.0>, <2 x half> %c.val)
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fma_v2f16_imm_c:
; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]

; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]]

; SI: v_mov_b32_e32 v[[C_F32:[0-9]+]], 0x40400000{{$}}
; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x4200{{$}}

; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]

; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]

; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]

; SI: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32]]
; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], v[[C_F32]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN-NOT: and
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]

; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; VI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_F16]]
; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16]]
; GCN-NOT: and
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]


; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fma_v2f16_imm_c(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = call <2 x half> @llvm.fma.v2f16(<2 x half> %a.val, <2 x half> %b.val, <2 x half> <half 3.0, half 3.0>)
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}