; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -mtriple=x86_64-pc-linux < %s | FileCheck %s --check-prefix=NOFMA
; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+fma < %s | FileCheck %s --check-prefixes=FMA,FMA-AVX1
; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+fma4 < %s | FileCheck %s --check-prefix=FMA4
; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+avx512f < %s | FileCheck %s --check-prefixes=FMA,FMA-AVX512

; Tests lowering of the constrained (strict-FP) fma and fmuladd intrinsics on
; x86-64.  Without FMA support the intrinsics become fma/fmaf libcalls (with
; any fneg done via xorps against a sign-bit mask); with FMA or FMA4 an fneg
; of an operand or of the result folds into the matching
; vfnmadd / vfmsub / vfnmsub instruction form.

; fneg of the first multiplicand folds into vfnmadd (scalar float).
define float @f1(float %0, float %1, float %2) #0 {
; NOFMA-LABEL: f1:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    xorps {{.*}}(%rip), %xmm0
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f1:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
; FMA-NEXT:    retq
;
; FMA4-LABEL: f1:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfnmaddss {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm2
; FMA4-NEXT:    retq
entry:
  %3 = fneg float %0
  %result = call float @llvm.experimental.constrained.fma.f32(float %3, float %1, float %2,
                                                              metadata !"round.dynamic",
                                                              metadata !"fpexcept.strict") #0
  ret float %result
}

; fneg of the first multiplicand folds into vfnmadd (scalar double).
define double @f2(double %0, double %1, double %2) #0 {
; NOFMA-LABEL: f2:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    xorps {{.*}}(%rip), %xmm0
; NOFMA-NEXT:    callq fma
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f2:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
; FMA-NEXT:    retq
;
; FMA4-LABEL: f2:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfnmaddsd {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm2
; FMA4-NEXT:    retq
entry:
  %3 = fneg double %0
  %result = call double @llvm.experimental.constrained.fma.f64(double %3, double %1, double %2,
                                                               metadata !"round.dynamic",
                                                               metadata !"fpexcept.strict") #0
  ret double %result
}

; fneg of the addend folds into vfmsub (scalar float).
define float @f3(float %0, float %1, float %2) #0 {
; NOFMA-LABEL: f3:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    xorps {{.*}}(%rip), %xmm2
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f3:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
; FMA-NEXT:    retq
;
; FMA4-LABEL: f3:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmsubss {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
; FMA4-NEXT:    retq
entry:
  %3 = fneg float %2
  %result = call float @llvm.experimental.constrained.fma.f32(float %0, float %1, float %3,
                                                              metadata !"round.dynamic",
                                                              metadata !"fpexcept.strict") #0
  ret float %result
}

; fneg of the addend folds into vfmsub (scalar double).
define double @f4(double %0, double %1, double %2) #0 {
; NOFMA-LABEL: f4:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    xorps {{.*}}(%rip), %xmm2
; NOFMA-NEXT:    callq fma
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f4:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
; FMA-NEXT:    retq
;
; FMA4-LABEL: f4:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmsubsd {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
; FMA4-NEXT:    retq
entry:
  %3 = fneg double %2
  %result = call double @llvm.experimental.constrained.fma.f64(double %0, double %1, double %3,
                                                               metadata !"round.dynamic",
                                                               metadata !"fpexcept.strict") #0
  ret double %result
}

; fneg of both multiplicand and addend folds into vfnmsub (scalar float).
define float @f5(float %0, float %1, float %2) #0 {
; NOFMA-LABEL: f5:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; NOFMA-NEXT:    xorps %xmm3, %xmm0
; NOFMA-NEXT:    xorps %xmm3, %xmm2
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f5:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; FMA-NEXT:    retq
;
; FMA4-LABEL: f5:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfnmsubss {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
; FMA4-NEXT:    retq
entry:
  %3 = fneg float %0
  %4 = fneg float %2
  %result = call float @llvm.experimental.constrained.fma.f32(float %3, float %1, float %4,
                                                              metadata !"round.dynamic",
                                                              metadata !"fpexcept.strict") #0
  ret float %result
}

; fneg of both multiplicand and addend folds into vfnmsub (scalar double).
define double @f6(double %0, double %1, double %2) #0 {
; NOFMA-LABEL: f6:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0]
; NOFMA-NEXT:    xorps %xmm3, %xmm0
; NOFMA-NEXT:    xorps %xmm3, %xmm2
; NOFMA-NEXT:    callq fma
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f6:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; FMA-NEXT:    retq
;
; FMA4-LABEL: f6:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfnmsubsd {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
; FMA4-NEXT:    retq
entry:
  %3 = fneg double %0
  %4 = fneg double %2
  %result = call double @llvm.experimental.constrained.fma.f64(double %3, double %1, double %4,
                                                               metadata !"round.dynamic",
                                                               metadata !"fpexcept.strict") #0
  ret double %result
}

; fneg of the fma result stays a separate sign-bit xor after the fma
; (scalar float).
define float @f7(float %0, float %1, float %2) #0 {
; NOFMA-LABEL: f7:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    xorps {{.*}}(%rip), %xmm0
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-AVX1-LABEL: f7:
; FMA-AVX1:       # %bb.0: # %entry
; FMA-AVX1-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; FMA-AVX1-NEXT:    vxorps {{.*}}(%rip), %xmm0, %xmm0
; FMA-AVX1-NEXT:    retq
;
; FMA4-LABEL: f7:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmaddss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
; FMA4-NEXT:    vxorps {{.*}}(%rip), %xmm0, %xmm0
; FMA4-NEXT:    retq
;
; FMA-AVX512-LABEL: f7:
; FMA-AVX512:       # %bb.0: # %entry
; FMA-AVX512-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; FMA-AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; FMA-AVX512-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; FMA-AVX512-NEXT:    retq
entry:
  %3 = call float @llvm.experimental.constrained.fma.f32(float %0, float %1, float %2,
                                                         metadata !"round.dynamic",
                                                         metadata !"fpexcept.strict") #0
  %result = fneg float %3
  ret float %result
}

; fneg of the fma result stays a separate sign-bit xor after the fma
; (scalar double).
define double @f8(double %0, double %1, double %2) #0 {
; NOFMA-LABEL: f8:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    callq fma
; NOFMA-NEXT:    xorps {{.*}}(%rip), %xmm0
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f8:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; FMA-NEXT:    vxorpd {{.*}}(%rip), %xmm0, %xmm0
; FMA-NEXT:    retq
;
; FMA4-LABEL: f8:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmaddsd {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
; FMA4-NEXT:    vxorpd {{.*}}(%rip), %xmm0, %xmm0
; FMA4-NEXT:    retq
entry:
  %3 = call double @llvm.experimental.constrained.fma.f64(double %0, double %1, double %2,
                                                          metadata !"round.dynamic",
                                                          metadata !"fpexcept.strict") #0
  %result = fneg double %3
  ret double %result
}

; fneg of the operands folds into vfnmsub, while the fneg of the result
; remains a separate xor (scalar float).
define float @f9(float %0, float %1, float %2) #0 {
; NOFMA-LABEL: f9:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; NOFMA-NEXT:    xorps %xmm3, %xmm0
; NOFMA-NEXT:    xorps %xmm3, %xmm2
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    xorps {{.*}}(%rip), %xmm0
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-AVX1-LABEL: f9:
; FMA-AVX1:       # %bb.0: # %entry
; FMA-AVX1-NEXT:    vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; FMA-AVX1-NEXT:    vxorps {{.*}}(%rip), %xmm0, %xmm0
; FMA-AVX1-NEXT:    retq
;
; FMA4-LABEL: f9:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfnmsubss {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
; FMA4-NEXT:    vxorps {{.*}}(%rip), %xmm0, %xmm0
; FMA4-NEXT:    retq
;
; FMA-AVX512-LABEL: f9:
; FMA-AVX512:       # %bb.0: # %entry
; FMA-AVX512-NEXT:    vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; FMA-AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; FMA-AVX512-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; FMA-AVX512-NEXT:    retq
entry:
  %3 = fneg float %0
  %4 = fneg float %2
  %5 = call float @llvm.experimental.constrained.fma.f32(float %3, float %1, float %4,
                                                         metadata !"round.dynamic",
                                                         metadata !"fpexcept.strict") #0
  %result = fneg float %5
  ret float %result
}

; fneg of the operands folds into vfnmsub, while the fneg of the result
; remains a separate xor (scalar double).
define double @f10(double %0, double %1, double %2) #0 {
; NOFMA-LABEL: f10:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0]
; NOFMA-NEXT:    xorps %xmm3, %xmm0
; NOFMA-NEXT:    xorps %xmm3, %xmm2
; NOFMA-NEXT:    callq fma
; NOFMA-NEXT:    xorps {{.*}}(%rip), %xmm0
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f10:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; FMA-NEXT:    vxorpd {{.*}}(%rip), %xmm0, %xmm0
; FMA-NEXT:    retq
;
; FMA4-LABEL: f10:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfnmsubsd {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
; FMA4-NEXT:    vxorpd {{.*}}(%rip), %xmm0, %xmm0
; FMA4-NEXT:    retq
entry:
  %3 = fneg double %0
  %4 = fneg double %2
  %5 = call double @llvm.experimental.constrained.fma.f64(double %3, double %1, double %4,
                                                          metadata !"round.dynamic",
                                                          metadata !"fpexcept.strict") #0
  %result = fneg double %5
  ret double %result
}

; Verify constrained fmul and fadd aren't fused.
define float @f11(float %0, float %1, float %2) #0 {
; NOFMA-LABEL: f11:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    mulss %xmm1, %xmm0
; NOFMA-NEXT:    addss %xmm2, %xmm0
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f11:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; FMA-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; FMA-NEXT:    retq
;
; FMA4-LABEL: f11:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; FMA4-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; FMA4-NEXT:    retq
entry:
  %3 = call float @llvm.experimental.constrained.fmul.f32(float %0, float %1,
                                                          metadata !"round.dynamic",
                                                          metadata !"fpexcept.strict") #0
  %4 = call float @llvm.experimental.constrained.fadd.f32(float %3, float %2,
                                                          metadata !"round.dynamic",
                                                          metadata !"fpexcept.strict") #0
  ret float %4
}

; Verify constrained fmul and fadd aren't fused.
define double @f12(double %0, double %1, double %2) #0 {
; NOFMA-LABEL: f12:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    mulsd %xmm1, %xmm0
; NOFMA-NEXT:    addsd %xmm2, %xmm0
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f12:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; FMA-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; FMA-NEXT:    retq
;
; FMA4-LABEL: f12:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; FMA4-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; FMA4-NEXT:    retq
entry:
  %3 = call double @llvm.experimental.constrained.fmul.f64(double %0, double %1,
                                                           metadata !"round.dynamic",
                                                           metadata !"fpexcept.strict") #0
  %4 = call double @llvm.experimental.constrained.fadd.f64(double %3, double %2,
                                                           metadata !"round.dynamic",
                                                           metadata !"fpexcept.strict") #0
  ret double %4
}

; Verify that fmuladd(3.5) isn't simplified when the rounding mode is
; unknown.
define float @f15() #0 {
; NOFMA-LABEL: f15:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; NOFMA-NEXT:    movaps %xmm1, %xmm0
; NOFMA-NEXT:    mulss %xmm1, %xmm0
; NOFMA-NEXT:    addss %xmm1, %xmm0
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f15:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; FMA-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
; FMA-NEXT:    retq
;
; FMA4-LABEL: f15:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; FMA4-NEXT:    vfmaddss {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
; FMA4-NEXT:    retq
entry:
  %result = call float @llvm.experimental.constrained.fmuladd.f32(
                                               float 3.5,
                                               float 3.5,
                                               float 3.5,
                                               metadata !"round.dynamic",
                                               metadata !"fpexcept.strict") #0
  ret float %result
}

; Verify that fmuladd(42.1) isn't simplified when the rounding mode is
; unknown.
define double @f16() #0 {
; NOFMA-LABEL: f16:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; NOFMA-NEXT:    movapd %xmm1, %xmm0
; NOFMA-NEXT:    mulsd %xmm1, %xmm0
; NOFMA-NEXT:    addsd %xmm1, %xmm0
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f16:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; FMA-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
; FMA-NEXT:    retq
;
; FMA4-LABEL: f16:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; FMA4-NEXT:    vfmaddsd {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
; FMA4-NEXT:    retq
entry:
  %result = call double @llvm.experimental.constrained.fmuladd.f64(
                                               double 42.1,
                                               double 42.1,
                                               double 42.1,
                                               metadata !"round.dynamic",
                                               metadata !"fpexcept.strict") #0
  ret double %result
}

; Verify that fma(3.5) isn't simplified when the rounding mode is
; unknown.
define float @f17() #0 {
; NOFMA-LABEL: f17:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; NOFMA-NEXT:    movaps %xmm0, %xmm1
; NOFMA-NEXT:    movaps %xmm0, %xmm2
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f17:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; FMA-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
; FMA-NEXT:    retq
;
; FMA4-LABEL: f17:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; FMA4-NEXT:    vfmaddss {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
; FMA4-NEXT:    retq
entry:
  %result = call float @llvm.experimental.constrained.fma.f32(
                                               float 3.5,
                                               float 3.5,
                                               float 3.5,
                                               metadata !"round.dynamic",
                                               metadata !"fpexcept.strict") #0
  ret float %result
}

; Verify that fma(42.1) isn't simplified when the rounding mode is
; unknown.
define double @f18() #0 {
; NOFMA-LABEL: f18:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; NOFMA-NEXT:    movaps %xmm0, %xmm1
; NOFMA-NEXT:    movaps %xmm0, %xmm2
; NOFMA-NEXT:    callq fma
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f18:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; FMA-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
; FMA-NEXT:    retq
;
; FMA4-LABEL: f18:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; FMA4-NEXT:    vfmaddsd {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
; FMA4-NEXT:    retq
entry:
  %result = call double @llvm.experimental.constrained.fma.f64(
                                               double 42.1,
                                               double 42.1,
                                               double 42.1,
                                               metadata !"round.dynamic",
                                               metadata !"fpexcept.strict") #0
  ret double %result
}

; Vector form of f1: fneg of the first multiplicand folds into vfnmadd;
; without FMA the vector is scalarized into four fmaf libcalls.
define <4 x float> @f19(<4 x float> %0, <4 x float> %1, <4 x float> %2) #0 {
; NOFMA-LABEL: f19:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    subq $88, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 96
; NOFMA-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    pxor {{.*}}(%rip), %xmm0
; NOFMA-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; NOFMA-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; NOFMA-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
; NOFMA-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm0 = mem[2,3,2,3]
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    punpckldq (%rsp), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; NOFMA-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; NOFMA-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,1,1]
; NOFMA-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm0 = mem[1,1,1,1]
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; NOFMA-NEXT:    punpcklqdq (%rsp), %xmm1 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm1 = xmm1[0],mem[0]
; NOFMA-NEXT:    movdqa %xmm1, %xmm0
; NOFMA-NEXT:    addq $88, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f19:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
; FMA-NEXT:    retq
;
; FMA4-LABEL: f19:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfnmaddps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm2
; FMA4-NEXT:    retq
entry:
  %3 = fneg <4 x float> %0
  %result = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %3, <4 x float> %1, <4 x float> %2,
                                                                      metadata !"round.dynamic",
                                                                      metadata !"fpexcept.strict") #0
  ret <4 x float> %result
}

; Vector form of f2: fneg of the first multiplicand folds into vfnmadd;
; without FMA the vector is scalarized into two fma libcalls.
define <2 x double> @f20(<2 x double> %0, <2 x double> %1, <2 x double> %2) #0 {
; NOFMA-LABEL: f20:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    subq $72, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 80
; NOFMA-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps %xmm1, (%rsp) # 16-byte Spill
; NOFMA-NEXT:    xorps {{.*}}(%rip), %xmm0
; NOFMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    callq fma
; NOFMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
; NOFMA-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm0 = mem[2,3,2,3]
; NOFMA-NEXT:    callq fma
; NOFMA-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; NOFMA-NEXT:    movdqa %xmm1, %xmm0
; NOFMA-NEXT:    addq $72, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f20:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfnmadd213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
; FMA-NEXT:    retq
;
; FMA4-LABEL: f20:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfnmaddpd {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm2
; FMA4-NEXT:    retq
entry:
  %3 = fneg <2 x double> %0
  %result = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %3, <2 x double> %1, <2 x double> %2,
                                                                       metadata !"round.dynamic",
                                                                       metadata !"fpexcept.strict") #0
  ret <2 x double> %result
}

; Vector form of f3: fneg of the addend folds into vfmsub.
define <4 x float> @f21(<4 x float> %0, <4 x float> %1, <4 x float> %2) #0 {
; NOFMA-LABEL: f21:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    subq $88, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 96
; NOFMA-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    pxor {{.*}}(%rip), %xmm2
; NOFMA-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; NOFMA-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; NOFMA-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; NOFMA-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm2 = mem[2,3,2,3]
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    unpcklps (%rsp), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; NOFMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; NOFMA-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; NOFMA-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm2 = mem[1,1,1,1]
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; NOFMA-NEXT:    unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm1 = xmm1[0],mem[0]
; NOFMA-NEXT:    movaps %xmm1, %xmm0
; NOFMA-NEXT:    addq $88, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f21:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
; FMA-NEXT:    retq
;
; FMA4-LABEL: f21:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
; FMA4-NEXT:    retq
entry:
  %3 = fneg <4 x float> %2
  %result = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %3,
                                                                      metadata !"round.dynamic",
                                                                      metadata !"fpexcept.strict") #0
  ret <4 x float> %result
}

; Vector form of f4: fneg of the addend folds into vfmsub.
define <2 x double> @f22(<2 x double> %0, <2 x double> %1, <2 x double> %2) #0 {
; NOFMA-LABEL: f22:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    subq $72, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 80
; NOFMA-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT:    xorps {{.*}}(%rip), %xmm2
; NOFMA-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    callq fma
; NOFMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; NOFMA-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm2 = mem[2,3,2,3]
; NOFMA-NEXT:    callq fma
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; NOFMA-NEXT:    movaps %xmm1, %xmm0
; NOFMA-NEXT:    addq $72, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f22:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfmsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
; FMA-NEXT:    retq
;
; FMA4-LABEL: f22:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmsubpd {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
; FMA4-NEXT:    retq
entry:
  %3 = fneg <2 x double> %2
  %result = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %0, <2 x double> %1, <2 x double> %3,
                                                                       metadata !"round.dynamic",
                                                                       metadata !"fpexcept.strict") #0
  ret <2 x double> %result
}

; Vector form of f5: fneg of multiplicand and addend folds into vfnmsub.
define <4 x float> @f23(<4 x float> %0, <4 x float> %1, <4 x float> %2) #0 {
; NOFMA-LABEL: f23:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    subq $88, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 96
; NOFMA-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movdqa {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; NOFMA-NEXT:    pxor %xmm3, %xmm0
; NOFMA-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    pxor %xmm3, %xmm2
; NOFMA-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; NOFMA-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
; NOFMA-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm0 = mem[2,3,2,3]
; NOFMA-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm2 = mem[2,3,2,3]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    punpckldq (%rsp), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; NOFMA-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm0 = mem[1,1,1,1]
; NOFMA-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm2 = mem[1,1,1,1]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; NOFMA-NEXT:    punpcklqdq (%rsp), %xmm1 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm1 = xmm1[0],mem[0]
; NOFMA-NEXT:    movdqa %xmm1, %xmm0
; NOFMA-NEXT:    addq $88, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f23:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; FMA-NEXT:    retq
;
; FMA4-LABEL: f23:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfnmsubps {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
; FMA4-NEXT:    retq
entry:
  %3 = fneg <4 x float> %0
  %4 = fneg <4 x float> %2
  %result = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %3, <4 x float> %1, <4 x float> %4,
                                                                      metadata !"round.dynamic",
                                                                      metadata !"fpexcept.strict") #0
  ret <4 x float> %result
}

; Vector form of f6: fneg of multiplicand and addend folds into vfnmsub.
define <2 x double> @f24(<2 x double> %0, <2 x double> %1, <2 x double> %2) #0 {
; NOFMA-LABEL: f24:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    subq $72, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 80
; NOFMA-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0]
; NOFMA-NEXT:    xorps %xmm3, %xmm0
; NOFMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT:    xorps %xmm3, %xmm2
; NOFMA-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    callq fma
; NOFMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    pshufd $238, (%rsp), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm0 = mem[2,3,2,3]
; NOFMA-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm2 = mem[2,3,2,3]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; NOFMA-NEXT:    callq fma
; NOFMA-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; NOFMA-NEXT:    movdqa %xmm1, %xmm0
; NOFMA-NEXT:    addq $72, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f24:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfnmsub213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; FMA-NEXT:    retq
;
; FMA4-LABEL: f24:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfnmsubpd {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
; FMA4-NEXT:    retq
entry:
  %3 = fneg <2 x double> %0
  %4 = fneg <2 x double> %2
  %result = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %3, <2 x double> %1, <2 x double> %4,
                                                                       metadata !"round.dynamic",
                                                                       metadata !"fpexcept.strict") #0
  ret <2 x double> %result
}

; Vector form of f7: fneg of the fma result stays a separate xor.
define <4 x float> @f25(<4 x float> %0, <4 x float> %1, <4 x float> %2) #0 {
; NOFMA-LABEL: f25:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    subq $88, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 96
; NOFMA-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; NOFMA-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; NOFMA-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    unpcklps (%rsp), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; NOFMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; NOFMA-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; NOFMA-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,1,1]
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; NOFMA-NEXT:    unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm1 = xmm1[0],mem[0]
; NOFMA-NEXT:    xorps {{.*}}(%rip), %xmm1
; NOFMA-NEXT:    movaps %xmm1, %xmm0
; NOFMA-NEXT:    addq $88, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-AVX1-LABEL: f25:
; FMA-AVX1:       # %bb.0: # %entry
; FMA-AVX1-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; FMA-AVX1-NEXT:    vxorps {{.*}}(%rip), %xmm0, %xmm0
; FMA-AVX1-NEXT:    retq
;
; FMA4-LABEL: f25:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmaddps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
; FMA4-NEXT:    vxorps {{.*}}(%rip), %xmm0, %xmm0
; FMA4-NEXT:    retq
;
; FMA-AVX512-LABEL: f25:
; FMA-AVX512:       # %bb.0: # %entry
; FMA-AVX512-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; FMA-AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; FMA-AVX512-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; FMA-AVX512-NEXT:    retq
entry:
  %3 = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2,
                                                                 metadata !"round.dynamic",
                                                                 metadata !"fpexcept.strict") #0
  %result = fneg <4 x float> %3
  ret <4 x float> %result
}

; Vector form of f8: fneg of the fma result stays a separate xor.
define <2 x double> @f26(<2 x double> %0, <2 x double> %1, <2 x double> %2) #0 {
; NOFMA-LABEL: f26:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    subq $72, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 80
; NOFMA-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT:    callq fma
; NOFMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
; NOFMA-NEXT:    callq fma
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; NOFMA-NEXT:    xorps {{.*}}(%rip), %xmm1
; NOFMA-NEXT:    movaps %xmm1, %xmm0
; NOFMA-NEXT:    addq $72, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f26:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; FMA-NEXT:    vxorpd {{.*}}(%rip), %xmm0, %xmm0
; FMA-NEXT:    retq
;
; FMA4-LABEL: f26:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmaddpd {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
; FMA4-NEXT:    vxorpd {{.*}}(%rip), %xmm0, %xmm0
; FMA4-NEXT:    retq
entry:
  %3 = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %0, <2 x double> %1, <2 x double> %2,
                                                                  metadata !"round.dynamic",
                                                                  metadata !"fpexcept.strict") #0
  %result = fneg <2 x double> %3
  ret <2 x double> %result
}

; Vector form of f9 (truncated in this chunk of the file).
define <4 x float> @f27(<4 x float> %0, <4 x float> %1, <4 x float> %2) #0 {
; NOFMA-LABEL: f27:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    subq $88, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 96
; NOFMA-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movdqa {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; NOFMA-NEXT:    pxor %xmm3, %xmm0
; NOFMA-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    pxor %xmm3, %xmm2
; NOFMA-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; NOFMA-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
; NOFMA-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm0 = mem[2,3,2,3]
; NOFMA-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm2 = mem[2,3,2,3]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    punpckldq (%rsp), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; NOFMA-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT:    movaps
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 979; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 980; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 981; NOFMA-NEXT: callq fmaf 982; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 983; NOFMA-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 984; NOFMA-NEXT: # xmm0 = mem[1,1,1,1] 985; NOFMA-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 986; NOFMA-NEXT: # xmm2 = mem[1,1,1,1] 987; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 988; NOFMA-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] 989; NOFMA-NEXT: callq fmaf 990; NOFMA-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 991; NOFMA-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 992; NOFMA-NEXT: punpcklqdq (%rsp), %xmm1 # 16-byte Folded Reload 993; NOFMA-NEXT: # xmm1 = xmm1[0],mem[0] 994; NOFMA-NEXT: pxor {{.*}}(%rip), %xmm1 995; NOFMA-NEXT: movdqa %xmm1, %xmm0 996; NOFMA-NEXT: addq $88, %rsp 997; NOFMA-NEXT: .cfi_def_cfa_offset 8 998; NOFMA-NEXT: retq 999; 1000; FMA-AVX1-LABEL: f27: 1001; FMA-AVX1: # %bb.0: # %entry 1002; FMA-AVX1-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 1003; FMA-AVX1-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0 1004; FMA-AVX1-NEXT: retq 1005; 1006; FMA4-LABEL: f27: 1007; FMA4: # %bb.0: # %entry 1008; FMA4-NEXT: vfnmsubps {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2 1009; FMA4-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0 1010; FMA4-NEXT: retq 1011; 1012; FMA-AVX512-LABEL: f27: 1013; FMA-AVX512: # %bb.0: # %entry 1014; FMA-AVX512-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 1015; FMA-AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 1016; FMA-AVX512-NEXT: vxorps %xmm1, %xmm0, %xmm0 1017; FMA-AVX512-NEXT: retq 1018entry: 1019 %3 = fneg <4 x float> %0 1020 %4 = fneg <4 x float> %2 1021 %5 = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x 
float> %3, <4 x float> %1, <4 x float> %4, 1022 metadata !"round.dynamic", 1023 metadata !"fpexcept.strict") #0 1024 %result = fneg <4 x float> %5 1025 ret <4 x float> %result 1026} 1027 1028define <2 x double> @f28(<2 x double> %0, <2 x double> %1, <2 x double> %2) #0 { 1029; NOFMA-LABEL: f28: 1030; NOFMA: # %bb.0: # %entry 1031; NOFMA-NEXT: subq $72, %rsp 1032; NOFMA-NEXT: .cfi_def_cfa_offset 80 1033; NOFMA-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1034; NOFMA-NEXT: movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0] 1035; NOFMA-NEXT: xorps %xmm3, %xmm0 1036; NOFMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill 1037; NOFMA-NEXT: xorps %xmm3, %xmm2 1038; NOFMA-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1039; NOFMA-NEXT: callq fma 1040; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1041; NOFMA-NEXT: pshufd $238, (%rsp), %xmm0 # 16-byte Folded Reload 1042; NOFMA-NEXT: # xmm0 = mem[2,3,2,3] 1043; NOFMA-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 1044; NOFMA-NEXT: # xmm2 = mem[2,3,2,3] 1045; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 1046; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] 1047; NOFMA-NEXT: callq fma 1048; NOFMA-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 1049; NOFMA-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] 1050; NOFMA-NEXT: pxor {{.*}}(%rip), %xmm1 1051; NOFMA-NEXT: movdqa %xmm1, %xmm0 1052; NOFMA-NEXT: addq $72, %rsp 1053; NOFMA-NEXT: .cfi_def_cfa_offset 8 1054; NOFMA-NEXT: retq 1055; 1056; FMA-LABEL: f28: 1057; FMA: # %bb.0: # %entry 1058; FMA-NEXT: vfnmsub213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 1059; FMA-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0 1060; FMA-NEXT: retq 1061; 1062; FMA4-LABEL: f28: 1063; FMA4: # %bb.0: # %entry 1064; FMA4-NEXT: vfnmsubpd {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2 1065; FMA4-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0 1066; FMA4-NEXT: retq 1067entry: 1068 %3 = fneg <2 x double> %0 1069 %4 = fneg <2 x double> 
%2 1070 %5 = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %3, <2 x double> %1, <2 x double> %4, 1071 metadata !"round.dynamic", 1072 metadata !"fpexcept.strict") #0 1073 %result = fneg <2 x double> %5 1074 ret <2 x double> %result 1075} 1076 1077attributes #0 = { strictfp } 1078 1079declare float @llvm.experimental.constrained.fmul.f32(float, float, metadata, metadata) 1080declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata) 1081declare double @llvm.experimental.constrained.fmul.f64(double, double, metadata, metadata) 1082declare double @llvm.experimental.constrained.fadd.f64(double, double, metadata, metadata) 1083declare float @llvm.experimental.constrained.fma.f32(float, float, float, metadata, metadata) 1084declare double @llvm.experimental.constrained.fma.f64(double, double, double, metadata, metadata) 1085declare <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float>, <4 x float>, <4 x float>, metadata, metadata) 1086declare <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double>, <2 x double>, <2 x double>, metadata, metadata) 1087declare float @llvm.experimental.constrained.fmuladd.f32(float, float, float, metadata, metadata) 1088declare double @llvm.experimental.constrained.fmuladd.f64(double, double, double, metadata, metadata) 1089