; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=VI %s

; SI has no native f16 arithmetic, so the f16 multiply-add is promoted to f32
; and converted back; VI selects v_mac_f16 directly.

; GCN-LABEL: {{^}}mac_f16:
; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort v[[B_F16:[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort v[[C_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI: v_mac_f32_e32 v[[C_F32]], v[[A_F32]], v[[B_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[C_F32]]
; SI: buffer_store_short v[[R_F16]]
; VI: v_mac_f16_e32 v[[C_F16]], v[[A_F16]], v[[B_F16]]
; VI: buffer_store_short v[[C_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @mac_f16(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) #0 {
entry:
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c

  %t.val = fmul half %a.val, %b.val
  %r.val = fadd half %t.val, %c.val

  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}mac_f16_same_add:
; SI: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]]
; SI: v_mac_f32_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}}

; VI: v_mad_f16 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]]
; VI: v_mac_f16_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}}
; GCN: s_endpgm
define amdgpu_kernel void @mac_f16_same_add(
    half addrspace(1)* %r0,
    half addrspace(1)* %r1,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c,
    half addrspace(1)* %d,
    half addrspace(1)* %e) #0 {
entry:
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c
  %d.val = load half, half addrspace(1)* %d
  %e.val = load half, half addrspace(1)* %e

  %t0.val = fmul half %a.val, %b.val
  %r0.val = fadd half %t0.val, %c.val

  %t1.val = fmul half %d.val, %e.val
  %r1.val = fadd half %t1.val, %c.val

  store half %r0.val, half addrspace(1)* %r0
  store half %r1.val, half addrspace(1)* %r1
  ret void
}

; GCN-LABEL: {{^}}mac_f16_neg_a:
; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}}
; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]]

; VI-NOT: v_mac_f16
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN: s_endpgm
define amdgpu_kernel void @mac_f16_neg_a(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) #0 {
entry:
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c

  %a.neg = fsub half -0.0, %a.val
  %t.val = fmul half %a.neg, %b.val
  %r.val = fadd half %t.val, %c.val

  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}mac_f16_neg_b:
; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}}
; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]]

; VI-NOT: v_mac_f16
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN: s_endpgm
define amdgpu_kernel void @mac_f16_neg_b(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) #0 {
entry:
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c

  %b.neg = fsub half -0.0, %b.val
  %t.val = fmul half %a.val, %b.neg
  %r.val = fadd half %t.val, %c.val

  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}mac_f16_neg_c:
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}

; VI-NOT: v_mac_f16
; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
; GCN: s_endpgm
define amdgpu_kernel void @mac_f16_neg_c(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) #0 {
entry:
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c

  %c.neg = fsub half -0.0, %c.val
  %t.val = fmul half %a.val, %b.val
  %r.val = fadd half %t.val, %c.neg

  store half %r.val, half addrspace(1)* %r
  ret void
}

; Without no-signed-zeros (attribute #0), fsub 0.0, x is not a free negation
; (it differs from -x for x = +0.0), so an explicit subtract is emitted and
; v_mac can still be used.

; GCN-LABEL: {{^}}mac_f16_neg_a_safe_fp_math:
; SI: v_sub_f32_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
; SI: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}}
; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
; VI: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}}
; GCN: s_endpgm
define amdgpu_kernel void @mac_f16_neg_a_safe_fp_math(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) #0 {
entry:
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c

  %a.neg = fsub half 0.0, %a.val
  %t.val = fmul half %a.neg, %b.val
  %r.val = fadd half %t.val, %c.val

  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}mac_f16_neg_b_safe_fp_math:
; SI: v_sub_f32_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A]]
; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
; VI: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A]]
; GCN: s_endpgm
define amdgpu_kernel void @mac_f16_neg_b_safe_fp_math(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) #0 {
entry:
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c

  %b.neg = fsub half 0.0, %b.val
  %t.val = fmul half %a.val, %b.neg
  %r.val = fadd half %t.val, %c.val

  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}mac_f16_neg_c_safe_fp_math:
; SI: v_sub_f32_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
; SI: v_mac_f32_e32 v[[NEG_A]], v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
; VI: v_mac_f16_e32 v[[NEG_A]], v{{[0-9]+}}, v{{[0-9]+}}
; GCN: s_endpgm
define amdgpu_kernel void @mac_f16_neg_c_safe_fp_math(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) #0 {
entry:
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c

  %c.neg = fsub half 0.0, %c.val
  %t.val = fmul half %a.val, %b.val
  %r.val = fadd half %t.val, %c.neg

  store half %r.val, half addrspace(1)* %r
  ret void
}

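; With "no-signed-zeros-fp-math"="true" (attribute #1), fsub 0.0, x is a free
; negation and folds into the multiply-add as a source modifier, so no
; explicit subtract is expected below.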
; GCN-LABEL: {{^}}mac_f16_neg_a_nsz_fp_math:
; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}}
; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]]

; VI-NOT: v_mac_f16
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}}
; GCN: s_endpgm
define amdgpu_kernel void @mac_f16_neg_a_nsz_fp_math(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) #1 {
entry:
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c

  %a.neg = fsub half 0.0, %a.val
  %t.val = fmul half %a.neg, %b.val
  %r.val = fadd half %t.val, %c.val

  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}mac_f16_neg_b_nsz_fp_math:
; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}}
; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]]

; VI-NOT: v_mac_f16
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}}
; GCN: s_endpgm
define amdgpu_kernel void @mac_f16_neg_b_nsz_fp_math(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) #1 {
entry:
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c

  %b.neg = fsub half 0.0, %b.val
  %t.val = fmul half %a.val, %b.neg
  %r.val = fadd half %t.val, %c.val

  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}mac_f16_neg_c_nsz_fp_math:
; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}}
; SI: v_mad_f32 v{{[0-9]+}}, [[CVT_A]], [[CVT_B]], -[[CVT_C]]

; VI-NOT: v_mac_f16
; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]}}
; GCN: s_endpgm
define amdgpu_kernel void @mac_f16_neg_c_nsz_fp_math(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) #1 {
entry:
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c

  %c.neg = fsub half 0.0, %c.val
  %t.val = fmul half %a.val, %b.val
  %r.val = fadd half %t.val, %c.neg

  store half %r.val, half addrspace(1)* %r
  ret void
}

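; Packed <2 x half>: SI scalarizes each half through f32 (shift out the high
; half, convert, mac, convert back, repack), while VI operates on the high
; half in place with SDWA.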
; GCN-LABEL: {{^}}mac_v2f16:
; GCN: {{buffer|flat}}_load_dword v[[A_V2_F16:[0-9]+]]
; GCN: {{buffer|flat}}_load_dword v[[B_V2_F16:[0-9]+]]
; GCN: {{buffer|flat}}_load_dword v[[C_V2_F16:[0-9]+]]

; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]

; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]

; SI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]

; SI-DAG: v_mac_f32_e32 v[[C_F32_0]], v[[A_F32_0]], v[[B_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]]
; SI-DAG: v_mac_f32_e32 v[[C_F32_1]], v[[A_F32_1]], v[[B_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]]
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; VI-NOT: and
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]]

; VI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
; VI-DAG: v_mac_f16_sdwa v[[C_F16_1]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_mac_f16_e32 v[[C_V2_F16]], v[[A_V2_F16]], v[[B_V2_F16]]
; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[C_F16_1]]
; VI-NOT: and
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[C_V2_F16]], v[[R_F16_HI]]

; GCN: {{buffer|flat}}_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @mac_v2f16(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) #0 {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  call void @llvm.amdgcn.s.barrier() #2
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  call void @llvm.amdgcn.s.barrier() #2
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c

  %t.val = fmul <2 x half> %a.val, %b.val
  %r.val = fadd <2 x half> %t.val, %c.val

  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}mac_v2f16_same_add:
; SI-DAG: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; SI-DAG: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

; VI-DAG: v_mac_f16_sdwa v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_mad_f16 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

; GCN: s_endpgm
define amdgpu_kernel void @mac_v2f16_same_add(
    <2 x half> addrspace(1)* %r0,
    <2 x half> addrspace(1)* %r1,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c,
    <2 x half> addrspace(1)* %d,
    <2 x half> addrspace(1)* %e) #0 {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %e.val = load <2 x half>, <2 x half> addrspace(1)* %e

  %t0.val = fmul <2 x half> %a.val, %b.val
  %r0.val = fadd <2 x half> %t0.val, %c.val

  %t1.val = fmul <2 x half> %d.val, %e.val
  %r1.val = fadd <2 x half> %t1.val, %c.val

  store <2 x half> %r0.val, <2 x half> addrspace(1)* %r0
  store <2 x half> %r1.val, <2 x half> addrspace(1)* %r1
  ret void
}

; GCN-LABEL: {{^}}mac_v2f16_neg_a:
; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}

; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

; VI-NOT: v_mac_f16
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN: s_endpgm
define amdgpu_kernel void @mac_v2f16_neg_a(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) #0 {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c

  %a.neg = fsub <2 x half> <half -0.0, half -0.0>, %a.val
  %t.val = fmul <2 x half> %a.neg, %b.val
  %r.val = fadd <2 x half> %t.val, %c.val

  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}mac_v2f16_neg_b:
; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

; VI-NOT: v_mac_f16
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN: s_endpgm
define amdgpu_kernel void @mac_v2f16_neg_b(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) #0 {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c

  %b.neg = fsub <2 x half> <half -0.0, half -0.0>, %b.val
  %t.val = fmul <2 x half> %a.val, %b.neg
  %r.val = fadd <2 x half> %t.val, %c.val

  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}mac_v2f16_neg_c:
; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}}

; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}

; VI-NOT: v_mac_f16
; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
; GCN: s_endpgm
define amdgpu_kernel void @mac_v2f16_neg_c(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) #0 {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c

  %c.neg = fsub <2 x half> <half -0.0, half -0.0>, %c.val
  %t.val = fmul <2 x half> %a.val, %b.val
  %r.val = fadd <2 x half> %t.val, %c.neg

  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}mac_v2f16_neg_a_safe_fp_math:

; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}}
; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}}

; VI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
; VI-DAG: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
; VI-DAG: v_sub_f16_sdwa v[[NEG_A0:[0-9]+]], [[ZERO]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}}

; GCN: s_endpgm
define amdgpu_kernel void @mac_v2f16_neg_a_safe_fp_math(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) #0 {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c

  %a.neg = fsub <2 x half> <half 0.0, half 0.0>, %a.val
  %t.val = fmul <2 x half> %a.neg, %b.val
  %r.val = fadd <2 x half> %t.val, %c.val

  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}mac_v2f16_neg_b_safe_fp_math:

; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]]
; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]]

; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
; VI: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
; VI: v_sub_f16_sdwa v[[NEG_A0:[0-9]+]], [[ZERO]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]]

; GCN: s_endpgm
define amdgpu_kernel void @mac_v2f16_neg_b_safe_fp_math(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) #0 {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c

  %b.neg = fsub <2 x half> <half 0.0, half 0.0>, %b.val
  %t.val = fmul <2 x half> %a.val, %b.neg
  %r.val = fadd <2 x half> %t.val, %c.val

  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}mac_v2f16_neg_c_safe_fp_math:

; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
; SI-DAG: v_mac_f32_e32 v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}}
; SI-DAG: v_mac_f32_e32 v[[NEG_A1]], v{{[0-9]+}}, v{{[0-9]+}}

; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
; VI: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
; VI: v_sub_f16_sdwa v[[NEG_A0:[0-9]+]], [[ZERO]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-DAG: v_mac_f16_sdwa v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_mac_f16_e32 v[[NEG_A1]], v{{[0-9]+}}, v{{[0-9]+}}

; GCN: s_endpgm
define amdgpu_kernel void @mac_v2f16_neg_c_safe_fp_math(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) #0 {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c

  %c.neg = fsub <2 x half> <half 0.0, half 0.0>, %c.val
  %t.val = fmul <2 x half> %a.val, %b.val
  %r.val = fadd <2 x half> %t.val, %c.neg

  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

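; As in the scalar nsz tests, the negation folds into the multiply-add for
; both halves, so v_mac is not selected.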
; GCN-LABEL: {{^}}mac_v2f16_neg_a_nsz_fp_math:
; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}}

; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

; VI-NOT: v_mac_f16
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
; GCN: s_endpgm
define amdgpu_kernel void @mac_v2f16_neg_a_nsz_fp_math(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) #1 {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c

  %a.neg = fsub <2 x half> <half 0.0, half 0.0>, %a.val
  %t.val = fmul <2 x half> %a.neg, %b.val
  %r.val = fadd <2 x half> %t.val, %c.val

  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}mac_v2f16_neg_b_nsz_fp_math:
; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}}

; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

; VI-NOT: v_mac_f16
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
; GCN: s_endpgm
define amdgpu_kernel void @mac_v2f16_neg_b_nsz_fp_math(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) #1 {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c

  %b.neg = fsub <2 x half> <half 0.0, half 0.0>, %b.val
  %t.val = fmul <2 x half> %a.val, %b.neg
  %r.val = fadd <2 x half> %t.val, %c.val

  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}mac_v2f16_neg_c_nsz_fp_math:
; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}}

; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}

; VI-NOT: v_mac_f16
; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}}
; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}}
; GCN: s_endpgm
define amdgpu_kernel void @mac_v2f16_neg_c_nsz_fp_math(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) #1 {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c

  %c.neg = fsub <2 x half> <half 0.0, half 0.0>, %c.val
  %t.val = fmul <2 x half> %a.val, %b.val
  %r.val = fadd <2 x half> %t.val, %c.neg

  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

declare void @llvm.amdgcn.s.barrier() #2

attributes #0 = { nounwind "no-signed-zeros-fp-math"="false" "denormal-fp-math"="preserve-sign,preserve-sign" }
attributes #1 = { nounwind "no-signed-zeros-fp-math"="true" "denormal-fp-math"="preserve-sign,preserve-sign" }
attributes #2 = { nounwind convergent }