; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX900,GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX906,GFX9 %s
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,VI %s
; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s

; Tests selection of the mixed-precision multiply-add (v_mad_mix_f32 on
; gfx900, v_fma_mix_f32 on gfx906) when the operands of a float llvm.fmuladd
; are extended from half. The fiji and hawaii runs check the plain f32
; mac/mad instruction that is selected instead.

; All three operands are scalar half arguments extended to float.
; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_f16lo:
; GFX900: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] ; encoding: [0x00,0x40,0xa0,0xd3,0x00,0x03,0x0a,0x1c]
; GFX906: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] ; encoding: [0x00,0x40,0xa0,0xd3,0x00,0x03,0x0a,0x1c]
; VI: v_mac_f32
; CI: v_mad_f32
define float @v_mad_mix_f32_f16lo_f16lo_f16lo(half %src0, half %src1, half %src2) #0 {
  %src0.ext = fpext half %src0 to float
  %src1.ext = fpext half %src1 to float
  %src2.ext = fpext half %src2 to float
  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
  ret float %result
}

; Each operand is the upper 16 bits of an i32 argument, obtained with
; lshr + trunc + bitcast before the extension to float.
; GCN-LABEL: {{^}}v_mad_mix_f32_f16hi_f16hi_f16hi_int:
; GFX900: v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] ; encoding
; GFX906: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] ; encoding
; CIVI: v_mac_f32
define float @v_mad_mix_f32_f16hi_f16hi_f16hi_int(i32 %src0, i32 %src1, i32 %src2) #0 {
  %src0.hi = lshr i32 %src0, 16
  %src1.hi = lshr i32 %src1, 16
  %src2.hi = lshr i32 %src2, 16
  %src0.i16 = trunc i32 %src0.hi to i16
  %src1.i16 = trunc i32 %src1.hi to i16
  %src2.i16 = trunc i32 %src2.hi to i16
  %src0.fp16 = bitcast i16 %src0.i16 to half
  %src1.fp16 = bitcast i16 %src1.i16 to half
  %src2.fp16 = bitcast i16 %src2.i16 to half
  %src0.ext = fpext half %src0.fp16 to float
  %src1.ext = fpext half %src1.fp16 to float
  %src2.ext = fpext half %src2.fp16 to float
  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
  ret float %result
}

; Each operand is element 1 of a <2 x half> argument.
; GCN-LABEL: {{^}}v_mad_mix_f32_f16hi_f16hi_f16hi_elt:
; GFX900: v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] ; encoding
; GFX906: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] ; encoding
; VI: v_mac_f32
; CI: v_mad_f32
define float @v_mad_mix_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
  %src0.hi = extractelement <2 x half> %src0, i32 1
  %src1.hi = extractelement <2 x half> %src1, i32 1
  %src2.hi = extractelement <2 x half> %src2, i32 1
  %src0.ext = fpext half %src0.hi to float
  %src1.ext = fpext half %src1.hi to float
  %src2.ext = fpext half %src2.hi to float
  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
  ret float %result
}

; Vector form: whole <2 x half> operands are extended to <2 x float>.
; GCN-LABEL: {{^}}v_mad_mix_v2f32:
; GFX900: v_mad_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1]
; GFX900-NEXT: v_mov_b32_e32 v1, v3

; GFX906: v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1]
; GFX906-NEXT: v_mov_b32_e32 v1, v3

; CIVI: v_mac_f32
define <2 x float> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
  %src0.ext = fpext <2 x half> %src0 to <2 x float>
  %src1.ext = fpext <2 x half> %src1 to <2 x float>
  %src2.ext = fpext <2 x half> %src2 to <2 x float>
  %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
  ret <2 x float> %result
}

; Vector form where every operand goes through a shufflevector first
; (masks <1,0>, <0,1> and <1,1>). The gfx900 and gfx906 sequences are both
; required to follow the s_waitcnt directly.
; GCN-LABEL: {{^}}v_mad_mix_v2f32_shuffle:
; GCN: s_waitcnt
; GFX900-NEXT: v_mad_mix_f32 v3, v0, v1, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1]
; GFX900-NEXT: v_mad_mix_f32 v1, v0, v1, v2 op_sel:[0,1,1] op_sel_hi:[1,1,1]
; GFX900-NEXT: v_mov_b32_e32 v0, v3
; GFX900-NEXT: s_setpc_b64

; GFX906-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1]
; GFX906-NEXT: v_fma_mix_f32 v1, v0, v1, v2 op_sel:[0,1,1] op_sel_hi:[1,1,1]
; GFX906-NEXT: v_mov_b32_e32 v0, v3
; GFX906-NEXT: s_setpc_b64

; CIVI: v_mac_f32
define <2 x float> @v_mad_mix_v2f32_shuffle(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
  %src0.shuf = shufflevector <2 x half> %src0, <2 x half> undef, <2 x i32> <i32 1, i32 0>
  %src1.shuf = shufflevector <2 x half> %src1, <2 x half> undef, <2 x i32> <i32 0, i32 1>
  %src2.shuf = shufflevector <2 x half> %src2, <2 x half> undef, <2 x i32> <i32 1, i32 1>
  %src0.ext = fpext <2 x half> %src0.shuf to <2 x float>
  %src1.ext = fpext <2 x half> %src1.shuf to <2 x float>
  %src2.ext = fpext <2 x half> %src2.shuf to <2 x float>
  %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
  ret <2 x float> %result
}

; src0 is negated (fsub from -0.0) after the extension to float.
; GCN-LABEL: {{^}}v_mad_mix_f32_negf16lo_f16lo_f16lo:
; GFX900: s_waitcnt
; GFX900-NEXT: v_mad_mix_f32 v0, -v0, v1, v2 op_sel_hi:[1,1,1] ; encoding
; GFX900-NEXT: s_setpc_b64

; GFX906: s_waitcnt
; GFX906-NEXT: v_fma_mix_f32 v0, -v0, v1, v2 op_sel_hi:[1,1,1] ; encoding
; GFX906-NEXT: s_setpc_b64

; CIVI: v_mad_f32
define float @v_mad_mix_f32_negf16lo_f16lo_f16lo(half %src0, half %src1, half %src2) #0 {
  %src0.ext = fpext half %src0 to float
  %src1.ext = fpext half %src1 to float
  %src2.ext = fpext half %src2 to float
  %src0.ext.neg = fsub float -0.0, %src0.ext
  %result = tail call float @llvm.fmuladd.f32(float %src0.ext.neg, float %src1.ext, float %src2.ext)
  ret float %result
}

; GCN-LABEL: {{^}}v_mad_mix_f32_absf16lo_f16lo_f16lo:
; GFX900: v_mad_mix_f32 v0, |v0|, v1, v2 op_sel_hi:[1,1,1]
; GFX906: v_fma_mix_f32 v0, |v0|, v1, v2 op_sel_hi:[1,1,1]

; CIVI: v_mad_f32
; src0 goes through llvm.fabs.f32 after the extension to float.
define float @v_mad_mix_f32_absf16lo_f16lo_f16lo(half %src0, half %src1, half %src2) #0 {
  %src0.ext = fpext half %src0 to float
  %src1.ext = fpext half %src1 to float
  %src2.ext = fpext half %src2 to float
  %src0.ext.abs = call float @llvm.fabs.f32(float %src0.ext)
  %result = tail call float @llvm.fmuladd.f32(float %src0.ext.abs, float %src1.ext, float %src2.ext)
  ret float %result
}

; src0 is extended, then fabs, then negated (fsub from -0.0).
; GCN-LABEL: {{^}}v_mad_mix_f32_negabsf16lo_f16lo_f16lo:
; GFX900: s_waitcnt
; GFX900-NEXT: v_mad_mix_f32 v0, -|v0|, v1, v2 op_sel_hi:[1,1,1]
; GFX900-NEXT: s_setpc_b64

; GFX906: s_waitcnt
; GFX906-NEXT: v_fma_mix_f32 v0, -|v0|, v1, v2 op_sel_hi:[1,1,1]
; GFX906-NEXT: s_setpc_b64

; CIVI: v_mad_f32
define float @v_mad_mix_f32_negabsf16lo_f16lo_f16lo(half %src0, half %src1, half %src2) #0 {
  %src0.ext = fpext half %src0 to float
  %src1.ext = fpext half %src1 to float
  %src2.ext = fpext half %src2 to float
  %src0.ext.abs = call float @llvm.fabs.f32(float %src0.ext)
  %src0.ext.neg.abs = fsub float -0.0, %src0.ext.abs
  %result = tail call float @llvm.fmuladd.f32(float %src0.ext.neg.abs, float %src1.ext, float %src2.ext)
  ret float %result
}

; Mixed operand types: src0 and src1 are extended from half, src2 is
; already a float argument.
; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_f32:
; GCN: s_waitcnt
; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding
; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding
; GFX9-NEXT: s_setpc_b64

; CIVI: v_mad_f32
define float @v_mad_mix_f32_f16lo_f16lo_f32(half %src0, half %src1, float %src2) #0 {
  %src0.ext = fpext half %src0 to float
  %src1.ext = fpext half %src1 to float
  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2)
  ret float %result
}

; As above, with the float src2 negated (fsub from -0.0).
; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_negf32:
; GCN: s_waitcnt
; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, -v2 op_sel_hi:[1,1,0] ; encoding
; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, -v2 op_sel_hi:[1,1,0] ; encoding
; GFX9-NEXT: s_setpc_b64

; CIVI: v_mad_f32
define float @v_mad_mix_f32_f16lo_f16lo_negf32(half %src0, half %src1, float %src2) #0 {
  %src0.ext = fpext half %src0 to float
  %src1.ext = fpext half %src1 to float
  %src2.neg = fsub float -0.0, %src2
  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.neg)
  ret float %result
}

; As above, with llvm.fabs.f32 applied to the float src2.
; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_absf32:
; GCN: s_waitcnt
; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, |v2| op_sel_hi:[1,1,0] ; encoding
; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, |v2| op_sel_hi:[1,1,0] ; encoding
; GFX9-NEXT: s_setpc_b64

; CIVI: v_mad_f32
define float @v_mad_mix_f32_f16lo_f16lo_absf32(half %src0, half %src1, float %src2) #0 {
  %src0.ext = fpext half %src0 to float
  %src1.ext = fpext half %src1 to float
  %src2.abs = call float @llvm.fabs.f32(float %src2)
  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.abs)
  ret float %result
}

; As above, with the float src2 passed through fabs and then negated.
; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_negabsf32:
; GCN: s_waitcnt
; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, -|v2| op_sel_hi:[1,1,0] ; encoding
; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, -|v2| op_sel_hi:[1,1,0] ; encoding
; GFX9-NEXT: s_setpc_b64

; CIVI: v_mad_f32
define float @v_mad_mix_f32_f16lo_f16lo_negabsf32(half %src0, half %src1, float %src2) #0 {
  %src0.ext = fpext half %src0 to float
  %src1.ext = fpext half %src1 to float
  %src2.abs = call float @llvm.fabs.f32(float %src2)
  %src2.neg.abs = fsub float -0.0, %src2.abs
  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.neg.abs)
  ret float %result
}

; TODO: Fold inline immediates. Need to be careful because it is an
; f16 inline immediate that may be converted to f32, not an actual f32
; inline immediate.
; src2 is the float constant 1.0. The gfx9 checks expect it to be moved
; into v2 first rather than used as an operand of the mix instruction.
; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_f32imm1:
; GCN: s_waitcnt
; GFX9: v_mov_b32_e32 v2, 1.0
; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding
; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding

; CIVI: v_mad_f32 v0, v0, v1, 1.0
; GCN-NEXT: s_setpc_b64
define float @v_mad_mix_f32_f16lo_f16lo_f32imm1(half %src0, half %src1) #0 {
  %src0.ext = fpext half %src0 to float
  %src1.ext = fpext half %src1 to float
  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float 1.0)
  ret float %result
}

; src2 is the float constant 0x3FC45F3060000000, which the checks below
; print as 0.15915494.
; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_f32imminv2pi:
; GCN: s_waitcnt
; GFX9: v_mov_b32_e32 v2, 0.15915494
; GFX900: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding
; GFX906: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding
; VI: v_mad_f32 v0, v0, v1, 0.15915494
define float @v_mad_mix_f32_f16lo_f16lo_f32imminv2pi(half %src0, half %src1) #0 {
  %src0.ext = fpext half %src0 to float
  %src1.ext = fpext half %src1 to float
  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float 0x3FC45F3060000000)
  ret float %result
}

; Attempt to break inline immediate folding. If the operand is
; interpreted as f32, the inline immediate is really the f16 inline
; imm value converted to f32.
; fpext f16 1/2pi = 0x3e230000
;       f32 1/2pi = 0x3e22f983
; src2 is the half constant 0xH3118 extended to float.
; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi:
; GFX9: v_mov_b32_e32 v2, 0x3e230000
; GFX900: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding
; GFX906: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding

; CIVI: v_madak_f32 v0, v0, v1, 0x3e230000
define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi(half %src0, half %src1) #0 {
  %src0.ext = fpext half %src0 to float
  %src1.ext = fpext half %src1 to float
  %src2 = fpext half 0xH3118 to float
  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2)
  ret float %result
}

; src2 is the half constant 0xH003F extended to float.
; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_cvtf16imm63:
; GFX9: v_mov_b32_e32 v2, 0x367c0000
; GFX900: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding
; GFX906: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding

; CIVI: v_madak_f32 v0, v0, v1, 0x367c0000
define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imm63(half %src0, half %src1) #0 {
  %src0.ext = fpext half %src0 to float
  %src1.ext = fpext half %src1 to float
  %src2 = fpext half 0xH003F to float
  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2)
  ret float %result
}

; Vector form with a <1.0, 1.0> float splat as src2.
; GCN-LABEL: {{^}}v_mad_mix_v2f32_f32imm1:
; GFX9: v_mov_b32_e32 v3, 1.0
; GFX900: v_mad_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
; GFX900: v_mad_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding
; GFX900: v_mov_b32_e32 v1, v2

; GFX906: v_fma_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
; GFX906: v_fma_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding
; GFX906: v_mov_b32_e32 v1, v2
define <2 x float> @v_mad_mix_v2f32_f32imm1(<2 x half> %src0, <2 x half> %src1) #0 {
  %src0.ext = fpext <2 x half> %src0 to <2 x float>
  %src1.ext = fpext <2 x half> %src1 to <2 x float>
  %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> <float 1.0, float 1.0>)
  ret <2 x float> %result
}

; Vector form where src2 is a splat of half 0xH3118 extended to float.
; GCN-LABEL: {{^}}v_mad_mix_v2f32_cvtf16imminv2pi:
; GFX9: v_mov_b32_e32 v3, 0x3e230000

; GFX900: v_mad_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
; GFX900: v_mad_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding
; GFX900: v_mov_b32_e32 v1, v2

; GFX906: v_fma_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
; GFX906: v_fma_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding
; GFX906: v_mov_b32_e32 v1, v2
define <2 x float> @v_mad_mix_v2f32_cvtf16imminv2pi(<2 x half> %src0, <2 x half> %src1) #0 {
  %src0.ext = fpext <2 x half> %src0 to <2 x float>
  %src1.ext = fpext <2 x half> %src1 to <2 x float>
  %src2 = fpext <2 x half> <half 0xH3118, half 0xH3118> to <2 x float>
  %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2)
  ret <2 x float> %result
}

; Vector form where src2 is a float splat of 0x3FC45F3060000000. The
; constant is passed to the call directly; no extended-half value is used.
; GCN-LABEL: {{^}}v_mad_mix_v2f32_f32imminv2pi:
; GFX9: v_mov_b32_e32 v3, 0.15915494

; GFX900: v_mad_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
; GFX900: v_mad_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding
; GFX900: v_mov_b32_e32 v1, v2

; GFX906: v_fma_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
; GFX906: v_fma_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding
; GFX906: v_mov_b32_e32 v1, v2
define <2 x float> @v_mad_mix_v2f32_f32imminv2pi(<2 x half> %src0, <2 x half> %src1) #0 {
  %src0.ext = fpext <2 x half> %src0 to <2 x float>
  %src1.ext = fpext <2 x half> %src1 to <2 x float>
  %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> <float 0x3FC45F3060000000, float 0x3FC45F3060000000>)
  ret <2 x float> %result
}

; The fmuladd result is clamped with maxnum(x, 0.0) followed by
; minnum(x, 1.0); the checks expect a clamp modifier on the instruction.
; GCN-LABEL: {{^}}v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt:
; GFX900: v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp ; encoding
; GFX906: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp ; encoding
; VI: v_mac_f32_e64 v{{[0-9]}}, v{{[0-9]}}, v{{[0-9]}} clamp{{$}}
; CI: v_mad_f32 v{{[0-9]}}, v{{[0-9]}}, v{{[0-9]}}, v{{[0-9]}} clamp{{$}}
define float @v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
  %src0.hi = extractelement <2 x half> %src0, i32 1
  %src1.hi = extractelement <2 x half> %src1, i32 1
  %src2.hi = extractelement <2 x half> %src2, i32 1
  %src0.ext = fpext half %src0.hi to float
  %src1.ext = fpext half %src1.hi to float
  %src2.ext = fpext half %src2.hi to float
  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
  %max = call float @llvm.maxnum.f32(float %result, float 0.0)
  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
  ret float %clamp
}

; All operands are already float, so the checks expect a plain f32 mad/fma.
; GCN-LABEL: {{^}}no_mix_simple:
; GCN: s_waitcnt
; GCN-NEXT: v_{{mad|fma}}_f32 v0, v0, v1, v2
; GCN-NEXT: s_setpc_b64
define float @no_mix_simple(float %src0, float %src1, float %src2) #0 {
  %result = call float @llvm.fmuladd.f32(float %src0, float %src1, float %src2)
  ret float %result
}

; Same as no_mix_simple, with llvm.fabs.f32 applied to src0.
; GCN-LABEL: {{^}}no_mix_simple_fabs:
; GCN: s_waitcnt
; CIVI-NEXT: v_mad_f32 v0, |v0|, v1, v2
; GFX900-NEXT: v_mad_f32 v0, |v0|, v1, v2
; GFX906-NEXT: v_fma_f32 v0, v1, |v0|, v2
; GCN-NEXT: s_setpc_b64
define float @no_mix_simple_fabs(float %src0, float %src1, float %src2) #0 {
  %src0.fabs = call float @llvm.fabs.f32(float %src0)
  %result = call float @llvm.fmuladd.f32(float %src0.fabs, float %src1, float %src2)
  ret float %result
}

; FIXME: Should be able to select in this case
; All sources are converted from
; f16, so it doesn't matter
; v_mad_mix_f32 flushes.

; Attribute group #1 enables fp32 denormals. On gfx900 the checks expect
; three explicit conversions followed by v_fma_f32.
; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals:
; GFX900: v_cvt_f32_f16
; GFX900: v_cvt_f32_f16
; GFX900: v_cvt_f32_f16
; GFX900: v_fma_f32
define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals(half %src0, half %src1, half %src2) #1 {
  %src0.ext = fpext half %src0 to float
  %src1.ext = fpext half %src1 to float
  %src2.ext = fpext half %src2 to float
  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
  ret float %result
}

; Denormals enabled with a float src2. gfx900 is expected to convert and
; use v_fma_f32, while gfx906 is expected to still select v_fma_mix_f32.
; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_f32_denormals:
; GFX900: v_cvt_f32_f16
; GFX900: v_cvt_f32_f16
; GFX900: v_fma_f32

; GFX906-NOT: v_cvt_f32_f16
; GFX906: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]
define float @v_mad_mix_f32_f16lo_f16lo_f32_denormals(half %src0, half %src1, float %src2) #1 {
  %src0.ext = fpext half %src0 to float
  %src1.ext = fpext half %src1 to float
  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2)
  ret float %result
}

; Denormals enabled, separate fmul and fadd without the contract flag.
; The checks expect them to stay as v_mul_f32 + v_add_f32.
; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd:
; GFX9: v_cvt_f32_f16
; GFX9: v_cvt_f32_f16
; GFX9: v_cvt_f32_f16
; GFX9: v_mul_f32
; GFX9: v_add_f32
define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd(half %src0, half %src1, half %src2) #1 {
  %src0.ext = fpext half %src0 to float
  %src1.ext = fpext half %src1 to float
  %src2.ext = fpext half %src2 to float
  %mul = fmul float %src0.ext, %src1.ext
  %result = fadd float %mul, %src2.ext
  ret float %result
}

; Same as the previous test, with a float src2.
; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd:
; GFX9: v_cvt_f32_f16
; GFX9: v_cvt_f32_f16
; GFX9: v_mul_f32
; GFX9: v_add_f32
define float @v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd(half %src0, half %src1, float %src2) #1 {
  %src0.ext = fpext half %src0 to float
  %src1.ext = fpext half %src1 to float
  %mul = fmul float %src0.ext, %src1.ext
  %result = fadd float %mul, %src2
  ret float %result
}

; Denormals disabled (attribute group #0) with contract-flagged fmul and
; fadd. The checks expect the pair to be combined into the mix instruction.
; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd:
; GCN: s_waitcnt
; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] ; encoding
; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] ; encoding
; GFX9-NEXT: s_setpc_b64
define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd(half %src0, half %src1, half %src2) #0 {
  %src0.ext = fpext half %src0 to float
  %src1.ext = fpext half %src1 to float
  %src2.ext = fpext half %src2 to float
  %mul = fmul contract float %src0.ext, %src1.ext
  %result = fadd contract float %mul, %src2.ext
  ret float %result
}

; Same as the previous test, with a float src2.
; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_f32_flush_fmulfadd:
; GCN: s_waitcnt
; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding
; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding
; GFX9-NEXT: s_setpc_b64
define float @v_mad_mix_f32_f16lo_f16lo_f32_flush_fmulfadd(half %src0, half %src1, float %src2) #0 {
  %src0.ext = fpext half %src0 to float
  %src1.ext = fpext half %src1 to float
  %mul = fmul contract float %src0.ext, %src1.ext
  %result = fadd contract float %mul, %src2
  ret float %result
}

; src0 is negated as a half, before the extension to float.
; GCN-LABEL: {{^}}v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo:
; GFX9: s_waitcnt
; GFX900-NEXT: v_mad_mix_f32 v0, -v0, v1, v2 op_sel_hi:[1,1,1] ; encoding
; GFX906-NEXT: v_fma_mix_f32 v0, -v0, v1, v2 op_sel_hi:[1,1,1] ; encoding
; GFX9-NEXT: s_setpc_b64

; CIVI: v_mad_f32
define float @v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 {
  %src0.arg.bc = bitcast i32 %src0.arg to <2 x half>
  %src0 = extractelement <2 x half> %src0.arg.bc, i32 0
  %src0.neg = fsub half -0.0, %src0
  %src0.ext = fpext half %src0.neg to float
  %src1.ext = fpext half %src1 to float
  %src2.ext = fpext half %src2 to float
  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
  ret float %result
}

; Make sure we don't fold pre-cvt fneg if we already have a fabs
; NOTE(review): only the leading s_waitcnt is checked here, and only on
; gfx900, so nothing verifies which instruction is selected - confirm
; whether stronger checks were intended.
; GCN-LABEL: {{^}}v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo:
; GFX900: s_waitcnt
define float @v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 {
  %src0.arg.bc = bitcast i32 %src0.arg to <2 x half>
  %src0 = extractelement <2 x half> %src0.arg.bc, i32 1
  %src0.neg = fsub half -0.0, %src0
  %src0.ext = fpext half %src0.neg to float
  %src0.ext.abs = call float @llvm.fabs.f32(float %src0.ext)
  %src1.ext = fpext half %src1 to float
  %src2.ext = fpext half %src2 to float
  %result = tail call float @llvm.fmuladd.f32(float %src0.ext.abs, float %src1.ext, float %src2.ext)
  ret float %result
}

; llvm.fabs.f16 is applied to the high half element before the extension.
; GCN-LABEL: {{^}}v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo:
; GFX9: s_waitcnt
; GFX900-NEXT: v_mad_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
; GFX906-NEXT: v_fma_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
; GFX9-NEXT: s_setpc_b64
define float @v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 {
  %src0.arg.bc = bitcast i32 %src0.arg to <2 x half>
  %src0 = extractelement <2 x half> %src0.arg.bc, i32 1
  %src0.abs = call half @llvm.fabs.f16(half %src0)
  %src0.ext = fpext half %src0.abs to float
  %src1.ext = fpext half %src1 to float
  %src2.ext = fpext half %src2 to float
  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
  ret float %result
}

; The whole <2 x half> vector is negated before element 1 is extracted.
; GCN-LABEL: {{^}}v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo:
; GFX9: s_waitcnt
; GFX900-NEXT: v_mad_mix_f32 v0, -v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
; GFX906-NEXT: v_fma_mix_f32 v0, -v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
; GFX9-NEXT: s_setpc_b64
define float @v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 {
  %src0.arg.bc = bitcast i32 %src0.arg to <2 x half>
  %fneg = fsub <2 x half> <half -0.0, half -0.0>, %src0.arg.bc
  %src0 = extractelement <2 x half> %fneg, i32 1
  %src0.ext = fpext half %src0 to float
  %src1.ext = fpext half %src1 to float
  %src2.ext = fpext half %src2 to float
  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
  ret float %result
}

; llvm.fabs.v2f16 is applied to the whole vector before element 1 is
; extracted.
; GCN-LABEL: {{^}}v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo:
; GFX9: s_waitcnt
; GFX900-NEXT: v_mad_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
; GFX906-NEXT: v_fma_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
; GFX9-NEXT: s_setpc_b64
define float @v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 {
  %src0.arg.bc = bitcast i32 %src0.arg to <2 x half>
  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %src0.arg.bc)
  %src0 = extractelement <2 x half> %fabs, i32 1
  %src0.ext = fpext half %src0 to float
  %src1.ext = fpext half %src1 to float
  %src2.ext = fpext half %src2 to float
  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
  ret float %result
}

; Vector fabs followed by vector negation, then element 1 is extracted.
; GCN-LABEL: {{^}}v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo:
; GFX9: s_waitcnt
; GFX900-NEXT: v_mad_mix_f32 v0, -|v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
; GFX906-NEXT: v_fma_mix_f32 v0, -|v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
; GFX9-NEXT: s_setpc_b64
define float @v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 {
  %src0.arg.bc = bitcast i32 %src0.arg to <2 x half>
  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %src0.arg.bc)
  %fneg.fabs = fsub <2 x half> <half -0.0, half -0.0>, %fabs
  %src0 = extractelement <2 x half> %fneg.fabs, i32 1
  %src0.ext = fpext half %src0 to float
  %src1.ext = fpext half %src1 to float
  %src2.ext = fpext half %src2 to float
  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
  ret float %result
}

; Intrinsics used by the tests above.
declare half @llvm.fabs.f16(half) #2
declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #2
declare float @llvm.fabs.f32(float) #2
declare float @llvm.minnum.f32(float, float) #2
declare float @llvm.maxnum.f32(float, float) #2
declare float @llvm.fmuladd.f32(float, float, float) #2
declare <2 x float> @llvm.fmuladd.v2f32(<2 x float>, <2 x float>, <2 x float>) #2

; #0 disables fp32 denormals, #1 enables them, #2 is for the intrinsics.
attributes #0 = { nounwind "target-features"="-fp32-denormals" }
attributes #1 = { nounwind "target-features"="+fp32-denormals" }
attributes #2 = { nounwind readnone speculatable }