; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=VI %s

; GCN-LABEL: {{^}}mac_f16:
; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort v[[B_F16:[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort v[[C_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI: v_mac_f32_e32 v[[C_F32]], v[[A_F32]], v[[B_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[C_F32]]
; SI: buffer_store_short v[[R_F16]]
; VI: v_mac_f16_e32 v[[C_F16]], v[[A_F16]], v[[B_F16]]
; VI: buffer_store_short v[[C_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @mac_f16(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) #0 {
entry:
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c

  %t.val = fmul half %a.val, %b.val
  %r.val = fadd half %t.val, %c.val

  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}mac_f16_same_add:
; SI: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]]
; SI: v_mac_f32_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}}

; VI: v_mad_f16 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]]
; VI: v_mac_f16_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}}
; GCN: s_endpgm
define amdgpu_kernel void @mac_f16_same_add(
    half addrspace(1)* %r0,
    half addrspace(1)* %r1,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c,
    half addrspace(1)* %d,
    half addrspace(1)* %e) #0 {
entry:
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c
  %d.val = load half, half addrspace(1)* %d
  %e.val = load half, half addrspace(1)* %e

  %t0.val = fmul half %a.val, %b.val
  %r0.val = fadd half %t0.val, %c.val

  %t1.val = fmul half %d.val, %e.val
  %r1.val = fadd half %t1.val, %c.val

  store half %r0.val, half addrspace(1)* %r0
  store half %r1.val, half addrspace(1)* %r1
  ret void
}

; GCN-LABEL: {{^}}mac_f16_neg_a:
; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}}
; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]]

; VI-NOT: v_mac_f16
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN: s_endpgm
define amdgpu_kernel void @mac_f16_neg_a(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) #0 {
entry:
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c

  %a.neg = fsub half -0.0, %a.val
  %t.val = fmul half %a.neg, %b.val
  %r.val = fadd half %t.val, %c.val

  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}mac_f16_neg_b:
; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}}
; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]]

; VI-NOT: v_mac_f16
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN: s_endpgm
define amdgpu_kernel void @mac_f16_neg_b(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) #0 {
entry:
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c

  %b.neg = fsub half -0.0, %b.val
  %t.val = fmul half %a.val, %b.neg
  %r.val = fadd half %t.val, %c.val

  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}mac_f16_neg_c:
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}

; VI-NOT: v_mac_f16
; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
; GCN: s_endpgm
define amdgpu_kernel void @mac_f16_neg_c(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) #0 {
entry:
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c

  %c.neg = fsub half -0.0, %c.val
  %t.val = fmul half %a.val, %b.val
  %r.val = fadd half %t.val, %c.neg

  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}mac_f16_neg_a_safe_fp_math:
; SI: v_sub_f32_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
; SI: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}}
; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
; VI: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}}
; GCN: s_endpgm
define amdgpu_kernel void @mac_f16_neg_a_safe_fp_math(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) #0 {
entry:
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c

  %a.neg = fsub half 0.0, %a.val
  %t.val = fmul half %a.neg, %b.val
  %r.val = fadd half %t.val, %c.val

  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}mac_f16_neg_b_safe_fp_math:
; SI: v_sub_f32_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A]]
; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
; VI: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A]]
; GCN: s_endpgm
define amdgpu_kernel void @mac_f16_neg_b_safe_fp_math(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) #0 {
entry:
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c

  %b.neg = fsub half 0.0, %b.val
  %t.val = fmul half %a.val, %b.neg
  %r.val = fadd half %t.val, %c.val

  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}mac_f16_neg_c_safe_fp_math:
; SI: v_sub_f32_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
; SI: v_mac_f32_e32 v[[NEG_A]], v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
; VI: v_mac_f16_e32 v[[NEG_A]], v{{[0-9]+}}, v{{[0-9]+}}
; GCN: s_endpgm
define amdgpu_kernel void @mac_f16_neg_c_safe_fp_math(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) #0 {
entry:
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c

  %c.neg = fsub half 0.0, %c.val
  %t.val = fmul half %a.val, %b.val
  %r.val = fadd half %t.val, %c.neg

  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}mac_f16_neg_a_nsz_fp_math:
; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}}
; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]]

; VI-NOT: v_mac_f16
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}}
; GCN: s_endpgm
define amdgpu_kernel void @mac_f16_neg_a_nsz_fp_math(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) #1 {
entry:
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c

  %a.neg = fsub half 0.0, %a.val
  %t.val = fmul half %a.neg, %b.val
  %r.val = fadd half %t.val, %c.val

  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}mac_f16_neg_b_nsz_fp_math:
; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}}
; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]]

; VI-NOT: v_mac_f16
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}}
; GCN: s_endpgm
define amdgpu_kernel void @mac_f16_neg_b_nsz_fp_math(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) #1 {
entry:
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c

  %b.neg = fsub half 0.0, %b.val
  %t.val = fmul half %a.val, %b.neg
  %r.val = fadd half %t.val, %c.val

  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}mac_f16_neg_c_nsz_fp_math:
; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}}
; SI: v_mad_f32 v{{[0-9]+}}, [[CVT_A]], [[CVT_B]], -[[CVT_C]]

; VI-NOT: v_mac_f16
; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]}}
; GCN: s_endpgm
define amdgpu_kernel void @mac_f16_neg_c_nsz_fp_math(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) #1 {
entry:
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c

  %c.neg = fsub half 0.0, %c.val
  %t.val = fmul half %a.val, %b.val
  %r.val = fadd half %t.val, %c.neg

  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}mac_v2f16:
; GCN: {{buffer|flat}}_load_dword v[[A_V2_F16:[0-9]+]]
; GCN: {{buffer|flat}}_load_dword v[[B_V2_F16:[0-9]+]]
; GCN: {{buffer|flat}}_load_dword v[[C_V2_F16:[0-9]+]]

; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
; SI-DAG: v_mac_f32_e32 v[[C_F32_0]], v[[A_F32_0]], v[[B_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]]
; SI-DAG: v_mac_f32_e32 v[[C_F32_1]], v[[A_F32_1]], v[[B_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]]
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI-NOT: and
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]]

; VI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
; VI-DAG: v_mac_f16_sdwa v[[C_F16_1]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_mac_f16_e32 v[[C_V2_F16]], v[[A_V2_F16]], v[[B_V2_F16]]
; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[C_F16_1]]
; VI-NOT: and
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[C_V2_F16]], v[[R_F16_HI]]

; GCN: {{buffer|flat}}_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @mac_v2f16(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) #0 {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  call void @llvm.amdgcn.s.barrier() #2
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  call void @llvm.amdgcn.s.barrier() #2
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c

  %t.val = fmul <2 x half> %a.val, %b.val
  %r.val = fadd <2 x half> %t.val, %c.val

  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}mac_v2f16_same_add:
; SI: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; SI: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

; VI-DAG: v_mac_f16_sdwa v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_mad_f16 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

; GCN: s_endpgm
define amdgpu_kernel void @mac_v2f16_same_add(
    <2 x half> addrspace(1)* %r0,
    <2 x half> addrspace(1)* %r1,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c,
    <2 x half> addrspace(1)* %d,
    <2 x half> addrspace(1)* %e) #0 {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %e.val = load <2 x half>, <2 x half> addrspace(1)* %e

  %t0.val = fmul <2 x half> %a.val, %b.val
  %r0.val = fadd <2 x half> %t0.val, %c.val

  %t1.val = fmul <2 x half> %d.val, %e.val
  %r1.val = fadd <2 x half> %t1.val, %c.val

  store <2 x half> %r0.val, <2 x half> addrspace(1)* %r0
  store <2 x half> %r1.val, <2 x half> addrspace(1)* %r1
  ret void
}

; GCN-LABEL: {{^}}mac_v2f16_neg_a:
; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}

; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

; VI-NOT: v_mac_f16
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN: s_endpgm
define amdgpu_kernel void @mac_v2f16_neg_a(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) #0 {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c

  %a.neg = fsub <2 x half> <half -0.0, half -0.0>, %a.val
  %t.val = fmul <2 x half> %a.neg, %b.val
  %r.val = fadd <2 x half> %t.val, %c.val

  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}mac_v2f16_neg_b:
; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

; VI-NOT: v_mac_f16
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN: s_endpgm
define amdgpu_kernel void @mac_v2f16_neg_b(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) #0 {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c

  %b.neg = fsub <2 x half> <half -0.0, half -0.0>, %b.val
  %t.val = fmul <2 x half> %a.val, %b.neg
  %r.val = fadd <2 x half> %t.val, %c.val

  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}mac_v2f16_neg_c:
; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}}

; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}

; VI-NOT: v_mac_f16
; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
; GCN: s_endpgm
define amdgpu_kernel void @mac_v2f16_neg_c(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) #0 {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c

  %c.neg = fsub <2 x half> <half -0.0, half -0.0>, %c.val
  %t.val = fmul <2 x half> %a.val, %b.val
  %r.val = fadd <2 x half> %t.val, %c.neg

  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}mac_v2f16_neg_a_safe_fp_math:

; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}}
; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}}

; VI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
; VI-DAG: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
; VI-DAG: v_sub_f16_sdwa v[[NEG_A0:[0-9]+]], [[ZERO]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}}

; GCN: s_endpgm
define amdgpu_kernel void @mac_v2f16_neg_a_safe_fp_math(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) #0 {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c

  %a.neg = fsub <2 x half> <half 0.0, half 0.0>, %a.val
  %t.val = fmul <2 x half> %a.neg, %b.val
  %r.val = fadd <2 x half> %t.val, %c.val

  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}mac_v2f16_neg_b_safe_fp_math:

; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]]
; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]]

; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
; VI: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
; VI: v_sub_f16_sdwa v[[NEG_A0:[0-9]+]], [[ZERO]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]]

; GCN: s_endpgm
define amdgpu_kernel void @mac_v2f16_neg_b_safe_fp_math(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) #0 {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c

  %b.neg = fsub <2 x half> <half 0.0, half 0.0>, %b.val
  %t.val = fmul <2 x half> %a.val, %b.neg
  %r.val = fadd <2 x half> %t.val, %c.val

  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}mac_v2f16_neg_c_safe_fp_math:

; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
; SI-DAG: v_mac_f32_e32 v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}}
; SI-DAG: v_mac_f32_e32 v[[NEG_A1]], v{{[0-9]+}}, v{{[0-9]+}}

; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
; VI: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
; VI: v_sub_f16_sdwa v[[NEG_A0:[0-9]+]], [[ZERO]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-DAG: v_mac_f16_sdwa v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_mac_f16_e32 v[[NEG_A1]], v{{[0-9]+}}, v{{[0-9]+}}

; GCN: s_endpgm
define amdgpu_kernel void @mac_v2f16_neg_c_safe_fp_math(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) #0 {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c

  %c.neg = fsub <2 x half> <half 0.0, half 0.0>, %c.val
  %t.val = fmul <2 x half> %a.val, %b.val
  %r.val = fadd <2 x half> %t.val, %c.neg

  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}mac_v2f16_neg_a_nsz_fp_math:
; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}}

; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

; VI-NOT: v_mac_f16
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
; GCN: s_endpgm
define amdgpu_kernel void @mac_v2f16_neg_a_nsz_fp_math(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) #1 {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c

  %a.neg = fsub <2 x half> <half 0.0, half 0.0>, %a.val
  %t.val = fmul <2 x half> %a.neg, %b.val
  %r.val = fadd <2 x half> %t.val, %c.val

  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}mac_v2f16_neg_b_nsz_fp_math:
; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}}

; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

; VI-NOT: v_mac_f16
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
; GCN: s_endpgm
define amdgpu_kernel void @mac_v2f16_neg_b_nsz_fp_math(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) #1 {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c

  %b.neg = fsub <2 x half> <half 0.0, half 0.0>, %b.val
  %t.val = fmul <2 x half> %a.val, %b.neg
  %r.val = fadd <2 x half> %t.val, %c.val

  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}mac_v2f16_neg_c_nsz_fp_math:
; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}}

; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}

; VI-NOT: v_mac_f16
; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}}
; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}}
; GCN: s_endpgm
define amdgpu_kernel void @mac_v2f16_neg_c_nsz_fp_math(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) #1 {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c

  %c.neg = fsub <2 x half> <half 0.0, half 0.0>, %c.val
  %t.val = fmul <2 x half> %a.val, %b.val
  %r.val = fadd <2 x half> %t.val, %c.neg

  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

declare void @llvm.amdgcn.s.barrier() #2

attributes #0 = { nounwind "no-signed-zeros-fp-math"="false" }
attributes #1 = { nounwind "no-signed-zeros-fp-math"="true" }
attributes #2 = { nounwind convergent }