; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512

;
; vXf32 (accum)
;

define float @test_v2f32(float %a0, <2 x float> %a1) {
; SSE2-LABEL: test_v2f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
; SSE2-NEXT:    mulss %xmm1, %xmm2
; SSE2-NEXT:    mulss %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT:    mulss %xmm1, %xmm2
; SSE41-NEXT:    mulss %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v2f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v2f32(float %a0, <2 x float> %a1)
  ret float %1
}

define float @test_v4f32(float %a0, <4 x float> %a1) {
; SSE2-LABEL: test_v4f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT:    mulps %xmm1, %xmm2
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
; SSE2-NEXT:    mulss %xmm2, %xmm1
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movaps %xmm1, %xmm2
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT:    mulps %xmm1, %xmm2
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT:    mulss %xmm2, %xmm1
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float %a0, <4 x float> %a1)
  ret float %1
}

define float @test_v8f32(float %a0, <8 x float> %a1) {
; SSE2-LABEL: test_v8f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    mulps %xmm2, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT:    mulps %xmm1, %xmm2
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
; SSE2-NEXT:    mulss %xmm2, %xmm1
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    mulps %xmm2, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm2
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT:    mulps %xmm1, %xmm2
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT:    mulss %xmm2, %xmm1
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v8f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float %a0, <8 x float> %a1)
  ret float %1
}

define float @test_v16f32(float %a0, <16 x float> %a1) {
; SSE2-LABEL: test_v16f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    mulps %xmm4, %xmm2
; SSE2-NEXT:    mulps %xmm3, %xmm1
; SSE2-NEXT:    mulps %xmm2, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT:    mulps %xmm1, %xmm2
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
; SSE2-NEXT:    mulss %xmm2, %xmm1
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    mulps %xmm4, %xmm2
; SSE41-NEXT:    mulps %xmm3, %xmm1
; SSE41-NEXT:    mulps %xmm2, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm2
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT:    mulps %xmm1, %xmm2
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT:    mulss %xmm2, %xmm1
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v16f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulps %ymm2, %ymm1, %ymm1
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
; AVX512-NEXT:    vmulps %zmm2, %zmm1, %zmm1
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v16f32(float %a0, <16 x float> %a1)
  ret float %1
}
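
; Note on the accumulator form checked above: with the 'fast' flags the
; reduction is reassociable, so it is equivalent to the scalar expression
;   %res = %a0 * (%a1[0] * %a1[1] * ... * %a1[N-1])
; which the backend may evaluate as a log2(N) shuffle/multiply tree over the
; vector operand followed by a single scalar multiply with %a0.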

;
; vXf32 (one)
;

define float @test_v2f32_zero(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    mulss %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2f32_zero:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v2f32_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f32_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %a0)
  ret float %1
}

define float @test_v4f32_zero(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    mulps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4f32_zero:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT:    mulps %xmm0, %xmm1
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT:    mulss %xmm0, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4f32_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f32_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a0)
  ret float %1
}

define float @test_v8f32_zero(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    mulps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    mulps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f32_zero:
; SSE41:       # %bb.0:
; SSE41-NEXT:    mulps %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT:    mulps %xmm0, %xmm1
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT:    mulss %xmm0, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v8f32_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f32_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a0)
  ret float %1
}

define float @test_v16f32_zero(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    mulps %xmm3, %xmm1
; SSE2-NEXT:    mulps %xmm2, %xmm0
; SSE2-NEXT:    mulps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    mulps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f32_zero:
; SSE41:       # %bb.0:
; SSE41-NEXT:    mulps %xmm3, %xmm1
; SSE41-NEXT:    mulps %xmm2, %xmm0
; SSE41-NEXT:    mulps %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT:    mulps %xmm0, %xmm1
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT:    mulss %xmm0, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v16f32_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f32_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v16f32(float 1.0, <16 x float> %a0)
  ret float %1
}

;
; vXf32 (undef)
;

define float @test_v2f32_undef(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    mulss %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2f32_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v2f32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %a0)
  ret float %1
}

define float @test_v4f32_undef(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    mulps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4f32_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT:    mulps %xmm0, %xmm1
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT:    mulss %xmm0, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4f32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a0)
  ret float %1
}

define float @test_v8f32_undef(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    mulps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    mulps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f32_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    mulps %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT:    mulps %xmm0, %xmm1
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT:    mulss %xmm0, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v8f32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a0)
  ret float %1
}

define float @test_v16f32_undef(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    mulps %xmm3, %xmm1
; SSE2-NEXT:    mulps %xmm2, %xmm0
; SSE2-NEXT:    mulps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    mulps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f32_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    mulps %xmm3, %xmm1
; SSE41-NEXT:    mulps %xmm2, %xmm0
; SSE41-NEXT:    mulps %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT:    mulps %xmm0, %xmm1
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT:    mulss %xmm0, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v16f32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v16f32(float 1.0, <16 x float> %a0)
  ret float %1
}

;
; vXf64 (accum)
;

define double @test_v2f64(double %a0, <2 x double> %a1) {
; SSE-LABEL: test_v2f64:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm1, %xmm2
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE-NEXT:    mulsd %xmm1, %xmm2
; SSE-NEXT:    mulsd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v2f64(double %a0, <2 x double> %a1)
  ret double %1
}

define double @test_v4f64(double %a0, <4 x double> %a1) {
; SSE-LABEL: test_v4f64:
; SSE:       # %bb.0:
; SSE-NEXT:    mulpd %xmm2, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm2
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE-NEXT:    mulsd %xmm1, %xmm2
; SSE-NEXT:    mulsd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT:    vmulpd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vmulpd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v4f64(double %a0, <4 x double> %a1)
  ret double %1
}

define double @test_v8f64(double %a0, <8 x double> %a1) {
; SSE-LABEL: test_v8f64:
; SSE:       # %bb.0:
; SSE-NEXT:    mulpd %xmm4, %xmm2
; SSE-NEXT:    mulpd %xmm3, %xmm1
; SSE-NEXT:    mulpd %xmm2, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm2
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE-NEXT:    mulsd %xmm1, %xmm2
; SSE-NEXT:    mulsd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulpd %ymm2, %ymm1, %ymm1
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT:    vmulpd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
; AVX512-NEXT:    vmulpd %zmm2, %zmm1, %zmm1
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vmulpd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v8f64(double %a0, <8 x double> %a1)
  ret double %1
}

define double @test_v16f64(double %a0, <16 x double> %a1) {
; SSE-LABEL: test_v16f64:
; SSE:       # %bb.0:
; SSE-NEXT:    mulpd %xmm6, %xmm2
; SSE-NEXT:    mulpd %xmm7, %xmm3
; SSE-NEXT:    mulpd %xmm5, %xmm1
; SSE-NEXT:    mulpd %xmm3, %xmm1
; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    mulpd %xmm2, %xmm4
; SSE-NEXT:    mulpd %xmm1, %xmm4
; SSE-NEXT:    movapd %xmm4, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
; SSE-NEXT:    mulsd %xmm4, %xmm1
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulpd %ymm4, %ymm2, %ymm2
; AVX-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
; AVX-NEXT:    vmulpd %ymm2, %ymm1, %ymm1
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT:    vmulpd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulpd %zmm2, %zmm1, %zmm1
; AVX512-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
; AVX512-NEXT:    vmulpd %zmm2, %zmm1, %zmm1
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vmulpd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v16f64(double %a0, <16 x double> %a1)
  ret double %1
}

;
; vXf64 (one)
;

define double @test_v2f64_zero(<2 x double> %a0) {
; SSE-LABEL: test_v2f64_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    mulsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2f64_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f64_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v2f64(double 1.0, <2 x double> %a0)
  ret double %1
}

define double @test_v4f64_zero(<4 x double> %a0) {
; SSE-LABEL: test_v4f64_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    mulpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    mulsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4f64_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f64_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %a0)
  ret double %1
}

define double @test_v8f64_zero(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    mulpd %xmm3, %xmm1
; SSE-NEXT:    mulpd %xmm2, %xmm0
; SSE-NEXT:    mulpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    mulsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8f64_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f64_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v8f64(double 1.0, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_zero(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    mulpd %xmm6, %xmm2
; SSE-NEXT:    mulpd %xmm4, %xmm0
; SSE-NEXT:    mulpd %xmm2, %xmm0
; SSE-NEXT:    mulpd %xmm7, %xmm3
; SSE-NEXT:    mulpd %xmm5, %xmm1
; SSE-NEXT:    mulpd %xmm3, %xmm1
; SSE-NEXT:    mulpd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16f64_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
; AVX-NEXT:    vmulpd %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f64_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v16f64(double 1.0, <16 x double> %a0)
  ret double %1
}

;
; vXf64 (undef)
;

define double @test_v2f64_undef(<2 x double> %a0) {
; SSE-LABEL: test_v2f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    mulsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v2f64(double 1.0, <2 x double> %a0)
  ret double %1
}

define double @test_v4f64_undef(<4 x double> %a0) {
; SSE-LABEL: test_v4f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    mulpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    mulsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %a0)
  ret double %1
}

define double @test_v8f64_undef(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    mulpd %xmm3, %xmm1
; SSE-NEXT:    mulpd %xmm2, %xmm0
; SSE-NEXT:    mulpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    mulsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v8f64(double 1.0, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_undef(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    mulpd %xmm6, %xmm2
; SSE-NEXT:    mulpd %xmm4, %xmm0
; SSE-NEXT:    mulpd %xmm2, %xmm0
; SSE-NEXT:    mulpd %xmm7, %xmm3
; SSE-NEXT:    mulpd %xmm5, %xmm1
; SSE-NEXT:    mulpd %xmm3, %xmm1
; SSE-NEXT:    mulpd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
; AVX-NEXT:    vmulpd %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v16f64(double 1.0, <16 x double> %a0)
  ret double %1
}

declare float @llvm.vector.reduce.fmul.f32.v2f32(float, <2 x float>)
declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fmul.f32.v8f32(float, <8 x float>)
declare float @llvm.vector.reduce.fmul.f32.v16f32(float, <16 x float>)

declare double @llvm.vector.reduce.fmul.f64.v2f64(double, <2 x double>)
declare double @llvm.vector.reduce.fmul.f64.v4f64(double, <4 x double>)
declare double @llvm.vector.reduce.fmul.f64.v8f64(double, <8 x double>)
declare double @llvm.vector.reduce.fmul.f64.v16f64(double, <16 x double>)