1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX 6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512 7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 8 9; 10; vXf32 (accum) 11; 12 13define float @test_v2f32(float %a0, <2 x float> %a1) { 14; SSE2-LABEL: test_v2f32: 15; SSE2: # %bb.0: 16; SSE2-NEXT: mulss %xmm1, %xmm0 17; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] 18; SSE2-NEXT: mulss %xmm1, %xmm0 19; SSE2-NEXT: retq 20; 21; SSE41-LABEL: test_v2f32: 22; SSE41: # %bb.0: 23; SSE41-NEXT: mulss %xmm1, %xmm0 24; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm1[1,1,3,3] 25; SSE41-NEXT: mulss %xmm1, %xmm0 26; SSE41-NEXT: retq 27; 28; AVX-LABEL: test_v2f32: 29; AVX: # %bb.0: 30; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 31; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3] 32; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 33; AVX-NEXT: retq 34; 35; AVX512-LABEL: test_v2f32: 36; AVX512: # %bb.0: 37; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 38; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3] 39; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 40; AVX512-NEXT: retq 41 %1 = call float @llvm.vector.reduce.fmul.f32.v2f32(float %a0, <2 x float> %a1) 42 ret float %1 43} 44 45define float @test_v4f32(float %a0, <4 x float> %a1) { 46; SSE2-LABEL: test_v4f32: 47; SSE2: # %bb.0: 48; SSE2-NEXT: mulss %xmm1, %xmm0 49; SSE2-NEXT: movaps %xmm1, %xmm2 50; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1] 51; SSE2-NEXT: mulss %xmm2, %xmm0 52; SSE2-NEXT: movaps %xmm1, %xmm2 53; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] 54; SSE2-NEXT: mulss %xmm2, %xmm0 55; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 56; SSE2-NEXT: mulss %xmm1, %xmm0 57; SSE2-NEXT: retq 58; 59; SSE41-LABEL: test_v4f32: 60; SSE41: # %bb.0: 61; SSE41-NEXT: mulss %xmm1, %xmm0 62; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 63; SSE41-NEXT: mulss %xmm2, %xmm0 64; SSE41-NEXT: movaps %xmm1, %xmm2 65; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] 66; SSE41-NEXT: mulss %xmm2, %xmm0 67; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 68; SSE41-NEXT: mulss %xmm1, %xmm0 69; SSE41-NEXT: retq 70; 71; AVX-LABEL: test_v4f32: 72; AVX: # %bb.0: 73; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 74; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 75; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 76; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 77; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 78; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] 79; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 80; AVX-NEXT: retq 81; 82; AVX512-LABEL: test_v4f32: 83; AVX512: # %bb.0: 84; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 85; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 86; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 87; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 88; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 89; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] 90; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 91; AVX512-NEXT: retq 92 %1 = call float @llvm.vector.reduce.fmul.f32.v4f32(float %a0, <4 x float> %a1) 93 ret float %1 94} 95 96define 
float @test_v8f32(float %a0, <8 x float> %a1) { 97; SSE2-LABEL: test_v8f32: 98; SSE2: # %bb.0: 99; SSE2-NEXT: mulss %xmm1, %xmm0 100; SSE2-NEXT: movaps %xmm1, %xmm3 101; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1] 102; SSE2-NEXT: mulss %xmm3, %xmm0 103; SSE2-NEXT: movaps %xmm1, %xmm3 104; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] 105; SSE2-NEXT: mulss %xmm3, %xmm0 106; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 107; SSE2-NEXT: mulss %xmm1, %xmm0 108; SSE2-NEXT: mulss %xmm2, %xmm0 109; SSE2-NEXT: movaps %xmm2, %xmm1 110; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] 111; SSE2-NEXT: mulss %xmm1, %xmm0 112; SSE2-NEXT: movaps %xmm2, %xmm1 113; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] 114; SSE2-NEXT: mulss %xmm1, %xmm0 115; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] 116; SSE2-NEXT: mulss %xmm2, %xmm0 117; SSE2-NEXT: retq 118; 119; SSE41-LABEL: test_v8f32: 120; SSE41: # %bb.0: 121; SSE41-NEXT: mulss %xmm1, %xmm0 122; SSE41-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] 123; SSE41-NEXT: mulss %xmm3, %xmm0 124; SSE41-NEXT: movaps %xmm1, %xmm3 125; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] 126; SSE41-NEXT: mulss %xmm3, %xmm0 127; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 128; SSE41-NEXT: mulss %xmm1, %xmm0 129; SSE41-NEXT: mulss %xmm2, %xmm0 130; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] 131; SSE41-NEXT: mulss %xmm1, %xmm0 132; SSE41-NEXT: movaps %xmm2, %xmm1 133; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] 134; SSE41-NEXT: mulss %xmm1, %xmm0 135; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] 136; SSE41-NEXT: mulss %xmm2, %xmm0 137; SSE41-NEXT: retq 138; 139; AVX-LABEL: test_v8f32: 140; AVX: # %bb.0: 141; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 142; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 143; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 144; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 145; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 146; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] 147; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 148; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 149; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 150; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 151; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 152; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 153; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 154; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] 155; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 156; AVX-NEXT: vzeroupper 157; AVX-NEXT: retq 158; 159; AVX512-LABEL: test_v8f32: 160; AVX512: # %bb.0: 161; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 162; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 163; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 164; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 165; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 166; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] 167; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 168; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm1 169; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 170; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 171; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 172; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 173; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 174; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] 175; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 176; AVX512-NEXT: vzeroupper 177; AVX512-NEXT: retq 178 %1 = call float @llvm.vector.reduce.fmul.f32.v8f32(float %a0, <8 x float> %a1) 179 ret float %1 180} 181 182define float @test_v16f32(float %a0, <16 x float> %a1) { 183; SSE2-LABEL: test_v16f32: 184; SSE2: # %bb.0: 185; SSE2-NEXT: mulss 
%xmm1, %xmm0 186; SSE2-NEXT: movaps %xmm1, %xmm5 187; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] 188; SSE2-NEXT: mulss %xmm5, %xmm0 189; SSE2-NEXT: movaps %xmm1, %xmm5 190; SSE2-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] 191; SSE2-NEXT: mulss %xmm5, %xmm0 192; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 193; SSE2-NEXT: mulss %xmm1, %xmm0 194; SSE2-NEXT: mulss %xmm2, %xmm0 195; SSE2-NEXT: movaps %xmm2, %xmm1 196; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] 197; SSE2-NEXT: mulss %xmm1, %xmm0 198; SSE2-NEXT: movaps %xmm2, %xmm1 199; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] 200; SSE2-NEXT: mulss %xmm1, %xmm0 201; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] 202; SSE2-NEXT: mulss %xmm2, %xmm0 203; SSE2-NEXT: mulss %xmm3, %xmm0 204; SSE2-NEXT: movaps %xmm3, %xmm1 205; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1] 206; SSE2-NEXT: mulss %xmm1, %xmm0 207; SSE2-NEXT: movaps %xmm3, %xmm1 208; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] 209; SSE2-NEXT: mulss %xmm1, %xmm0 210; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] 211; SSE2-NEXT: mulss %xmm3, %xmm0 212; SSE2-NEXT: mulss %xmm4, %xmm0 213; SSE2-NEXT: movaps %xmm4, %xmm1 214; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[1,1] 215; SSE2-NEXT: mulss %xmm1, %xmm0 216; SSE2-NEXT: movaps %xmm4, %xmm1 217; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] 218; SSE2-NEXT: mulss %xmm1, %xmm0 219; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3,3,3] 220; SSE2-NEXT: mulss %xmm4, %xmm0 221; SSE2-NEXT: retq 222; 223; SSE41-LABEL: test_v16f32: 224; SSE41: # %bb.0: 225; SSE41-NEXT: mulss %xmm1, %xmm0 226; SSE41-NEXT: movshdup {{.*#+}} xmm5 = xmm1[1,1,3,3] 227; SSE41-NEXT: mulss %xmm5, %xmm0 228; SSE41-NEXT: movaps %xmm1, %xmm5 229; SSE41-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] 230; SSE41-NEXT: mulss %xmm5, %xmm0 231; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 232; SSE41-NEXT: mulss %xmm1, %xmm0 233; SSE41-NEXT: mulss %xmm2, %xmm0 234; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] 235; SSE41-NEXT: mulss %xmm1, %xmm0 236; SSE41-NEXT: movaps %xmm2, %xmm1 237; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] 238; SSE41-NEXT: mulss %xmm1, %xmm0 239; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] 240; SSE41-NEXT: mulss %xmm2, %xmm0 241; SSE41-NEXT: mulss %xmm3, %xmm0 242; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3] 243; SSE41-NEXT: mulss %xmm1, %xmm0 244; SSE41-NEXT: movaps %xmm3, %xmm1 245; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] 246; SSE41-NEXT: mulss %xmm1, %xmm0 247; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] 248; SSE41-NEXT: mulss %xmm3, %xmm0 249; SSE41-NEXT: mulss %xmm4, %xmm0 250; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm4[1,1,3,3] 251; SSE41-NEXT: mulss %xmm1, %xmm0 252; SSE41-NEXT: movaps %xmm4, %xmm1 253; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] 254; SSE41-NEXT: mulss %xmm1, %xmm0 255; SSE41-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3,3,3] 256; SSE41-NEXT: mulss %xmm4, %xmm0 257; SSE41-NEXT: retq 258; 259; AVX-LABEL: test_v16f32: 260; AVX: # %bb.0: 261; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 262; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] 263; AVX-NEXT: vmulss %xmm3, %xmm0, %xmm0 264; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] 265; AVX-NEXT: vmulss %xmm3, %xmm0, %xmm0 266; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,3,3,3] 267; AVX-NEXT: vmulss %xmm3, %xmm0, %xmm0 268; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 269; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 270; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] 271; AVX-NEXT: vmulss %xmm3, 
%xmm0, %xmm0 272; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] 273; AVX-NEXT: vmulss %xmm3, %xmm0, %xmm0 274; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] 275; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 276; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 277; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] 278; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 279; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] 280; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 281; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[3,3,3,3] 282; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 283; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1 284; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 285; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 286; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 287; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 288; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 289; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] 290; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 291; AVX-NEXT: vzeroupper 292; AVX-NEXT: retq 293; 294; AVX512-LABEL: test_v16f32: 295; AVX512: # %bb.0: 296; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 297; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 298; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 299; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 300; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 301; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] 302; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 303; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 304; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 305; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] 306; AVX512-NEXT: vmulss %xmm3, %xmm0, %xmm0 307; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] 308; AVX512-NEXT: vmulss %xmm3, %xmm0, %xmm0 309; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] 310; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 311; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2 312; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 313; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] 314; AVX512-NEXT: vmulss %xmm3, %xmm0, %xmm0 315; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] 316; AVX512-NEXT: vmulss %xmm3, %xmm0, %xmm0 317; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] 318; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 319; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1 320; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 321; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 322; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 323; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 324; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 325; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] 326; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 327; AVX512-NEXT: vzeroupper 328; AVX512-NEXT: retq 329 %1 = call float @llvm.vector.reduce.fmul.f32.v16f32(float %a0, <16 x float> %a1) 330 ret float %1 331} 332 333; 334; vXf32 (one) 335; 336 337define float @test_v2f32_one(<2 x float> %a0) { 338; SSE2-LABEL: test_v2f32_one: 339; SSE2: # %bb.0: 340; SSE2-NEXT: movaps %xmm0, %xmm1 341; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] 342; SSE2-NEXT: mulss %xmm0, %xmm1 343; SSE2-NEXT: movaps %xmm1, %xmm0 344; SSE2-NEXT: retq 345; 346; SSE41-LABEL: test_v2f32_one: 347; SSE41: # %bb.0: 348; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 349; SSE41-NEXT: mulss %xmm1, %xmm0 350; SSE41-NEXT: retq 351; 352; AVX-LABEL: test_v2f32_one: 353; AVX: # %bb.0: 354; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 355; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 356; AVX-NEXT: retq 357; 358; AVX512-LABEL: test_v2f32_one: 359; AVX512: # %bb.0: 360; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 361; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 
362; AVX512-NEXT: retq 363 %1 = call float @llvm.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %a0) 364 ret float %1 365} 366 367define float @test_v4f32_one(<4 x float> %a0) { 368; SSE2-LABEL: test_v4f32_one: 369; SSE2: # %bb.0: 370; SSE2-NEXT: movaps %xmm0, %xmm1 371; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] 372; SSE2-NEXT: mulss %xmm0, %xmm1 373; SSE2-NEXT: movaps %xmm0, %xmm2 374; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 375; SSE2-NEXT: mulss %xmm1, %xmm2 376; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 377; SSE2-NEXT: mulss %xmm2, %xmm0 378; SSE2-NEXT: retq 379; 380; SSE41-LABEL: test_v4f32_one: 381; SSE41: # %bb.0: 382; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 383; SSE41-NEXT: mulss %xmm0, %xmm1 384; SSE41-NEXT: movaps %xmm0, %xmm2 385; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 386; SSE41-NEXT: mulss %xmm1, %xmm2 387; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 388; SSE41-NEXT: mulss %xmm2, %xmm0 389; SSE41-NEXT: retq 390; 391; AVX-LABEL: test_v4f32_one: 392; AVX: # %bb.0: 393; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 394; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm1 395; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 396; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 397; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] 398; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0 399; AVX-NEXT: retq 400; 401; AVX512-LABEL: test_v4f32_one: 402; AVX512: # %bb.0: 403; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 404; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm1 405; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 406; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 407; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] 408; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 409; AVX512-NEXT: retq 410 %1 = call float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a0) 411 ret float %1 412} 413 414define float @test_v8f32_one(<8 x float> %a0) { 415; SSE2-LABEL: test_v8f32_one: 416; SSE2: # %bb.0: 417; SSE2-NEXT: movaps %xmm0, %xmm2 418; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] 419; SSE2-NEXT: mulss %xmm0, %xmm2 420; SSE2-NEXT: movaps %xmm0, %xmm3 421; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] 422; SSE2-NEXT: mulss %xmm2, %xmm3 423; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 424; SSE2-NEXT: mulss %xmm3, %xmm0 425; SSE2-NEXT: mulss %xmm1, %xmm0 426; SSE2-NEXT: movaps %xmm1, %xmm2 427; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1] 428; SSE2-NEXT: mulss %xmm2, %xmm0 429; SSE2-NEXT: movaps %xmm1, %xmm2 430; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] 431; SSE2-NEXT: mulss %xmm2, %xmm0 432; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 433; SSE2-NEXT: mulss %xmm1, %xmm0 434; SSE2-NEXT: retq 435; 436; SSE41-LABEL: test_v8f32_one: 437; SSE41: # %bb.0: 438; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 439; SSE41-NEXT: mulss %xmm0, %xmm2 440; SSE41-NEXT: movaps %xmm0, %xmm3 441; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] 442; SSE41-NEXT: mulss %xmm2, %xmm3 443; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 444; SSE41-NEXT: mulss %xmm3, %xmm0 445; SSE41-NEXT: mulss %xmm1, %xmm0 446; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 447; SSE41-NEXT: mulss %xmm2, %xmm0 448; SSE41-NEXT: movaps %xmm1, %xmm2 449; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] 450; SSE41-NEXT: mulss %xmm2, %xmm0 451; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 452; SSE41-NEXT: mulss %xmm1, %xmm0 453; SSE41-NEXT: retq 454; 455; AVX-LABEL: test_v8f32_one: 456; AVX: # %bb.0: 457; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = 
xmm0[1,1,3,3] 458; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm1 459; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 460; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 461; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] 462; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 463; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 464; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm1 465; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 466; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 467; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 468; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 469; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] 470; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0 471; AVX-NEXT: vzeroupper 472; AVX-NEXT: retq 473; 474; AVX512-LABEL: test_v8f32_one: 475; AVX512: # %bb.0: 476; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 477; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm1 478; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 479; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 480; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] 481; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 482; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 483; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm1 484; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 485; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 486; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 487; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 488; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] 489; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 490; AVX512-NEXT: vzeroupper 491; AVX512-NEXT: retq 492 %1 = call float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a0) 493 ret float %1 494} 495 496define float @test_v16f32_one(<16 x float> %a0) { 497; SSE2-LABEL: test_v16f32_one: 498; SSE2: # %bb.0: 499; SSE2-NEXT: movaps %xmm0, %xmm4 500; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[1,1] 501; SSE2-NEXT: mulss %xmm0, %xmm4 502; SSE2-NEXT: movaps %xmm0, %xmm5 503; SSE2-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] 504; SSE2-NEXT: mulss %xmm4, %xmm5 505; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 506; SSE2-NEXT: mulss %xmm5, %xmm0 507; SSE2-NEXT: mulss %xmm1, %xmm0 508; SSE2-NEXT: movaps %xmm1, %xmm4 509; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] 510; SSE2-NEXT: mulss %xmm4, %xmm0 511; SSE2-NEXT: movaps %xmm1, %xmm4 512; SSE2-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] 513; SSE2-NEXT: mulss %xmm4, %xmm0 514; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 515; SSE2-NEXT: mulss %xmm1, %xmm0 516; SSE2-NEXT: mulss %xmm2, %xmm0 517; SSE2-NEXT: movaps %xmm2, %xmm1 518; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] 519; SSE2-NEXT: mulss %xmm1, %xmm0 520; SSE2-NEXT: movaps %xmm2, %xmm1 521; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] 522; SSE2-NEXT: mulss %xmm1, %xmm0 523; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] 524; SSE2-NEXT: mulss %xmm2, %xmm0 525; SSE2-NEXT: mulss %xmm3, %xmm0 526; SSE2-NEXT: movaps %xmm3, %xmm1 527; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1] 528; SSE2-NEXT: mulss %xmm1, %xmm0 529; SSE2-NEXT: movaps %xmm3, %xmm1 530; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] 531; SSE2-NEXT: mulss %xmm1, %xmm0 532; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] 533; SSE2-NEXT: mulss %xmm3, %xmm0 534; SSE2-NEXT: retq 535; 536; SSE41-LABEL: test_v16f32_one: 537; SSE41: # %bb.0: 538; SSE41-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] 539; SSE41-NEXT: mulss %xmm0, %xmm4 540; SSE41-NEXT: movaps %xmm0, %xmm5 541; SSE41-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] 542; SSE41-NEXT: mulss %xmm4, %xmm5 543; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 544; 
SSE41-NEXT: mulss %xmm5, %xmm0 545; SSE41-NEXT: mulss %xmm1, %xmm0 546; SSE41-NEXT: movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] 547; SSE41-NEXT: mulss %xmm4, %xmm0 548; SSE41-NEXT: movaps %xmm1, %xmm4 549; SSE41-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] 550; SSE41-NEXT: mulss %xmm4, %xmm0 551; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 552; SSE41-NEXT: mulss %xmm1, %xmm0 553; SSE41-NEXT: mulss %xmm2, %xmm0 554; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] 555; SSE41-NEXT: mulss %xmm1, %xmm0 556; SSE41-NEXT: movaps %xmm2, %xmm1 557; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] 558; SSE41-NEXT: mulss %xmm1, %xmm0 559; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] 560; SSE41-NEXT: mulss %xmm2, %xmm0 561; SSE41-NEXT: mulss %xmm3, %xmm0 562; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3] 563; SSE41-NEXT: mulss %xmm1, %xmm0 564; SSE41-NEXT: movaps %xmm3, %xmm1 565; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] 566; SSE41-NEXT: mulss %xmm1, %xmm0 567; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] 568; SSE41-NEXT: mulss %xmm3, %xmm0 569; SSE41-NEXT: retq 570; 571; AVX-LABEL: test_v16f32_one: 572; AVX: # %bb.0: 573; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 574; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm2 575; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] 576; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2 577; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3] 578; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2 579; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 580; AVX-NEXT: vmulss %xmm0, %xmm2, %xmm2 581; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] 582; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2 583; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] 584; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2 585; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] 586; AVX-NEXT: vmulss %xmm0, %xmm2, %xmm0 587; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 588; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 589; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 590; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 591; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 592; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] 593; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 594; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 595; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 596; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 597; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 598; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 599; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 600; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] 601; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 602; AVX-NEXT: vzeroupper 603; AVX-NEXT: retq 604; 605; AVX512-LABEL: test_v16f32_one: 606; AVX512: # %bb.0: 607; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 608; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm1 609; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 610; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 611; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] 612; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 613; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2 614; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 615; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] 616; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1 617; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] 618; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1 619; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] 620; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 621; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2 622; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 623; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] 624; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1 625; AVX512-NEXT: 
vpermilpd {{.*#+}} xmm3 = xmm2[1,0] 626; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1 627; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] 628; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 629; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 630; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm1 631; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 632; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 633; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 634; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 635; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] 636; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 637; AVX512-NEXT: vzeroupper 638; AVX512-NEXT: retq 639 %1 = call float @llvm.vector.reduce.fmul.f32.v16f32(float 1.0, <16 x float> %a0) 640 ret float %1 641} 642 643; 644; vXf32 (undef) 645; 646 647define float @test_v2f32_undef(<2 x float> %a0) { 648; SSE2-LABEL: test_v2f32_undef: 649; SSE2: # %bb.0: 650; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] 651; SSE2-NEXT: mulss {{.*}}(%rip), %xmm0 652; SSE2-NEXT: retq 653; 654; SSE41-LABEL: test_v2f32_undef: 655; SSE41: # %bb.0: 656; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] 657; SSE41-NEXT: mulss {{.*}}(%rip), %xmm0 658; SSE41-NEXT: retq 659; 660; AVX-LABEL: test_v2f32_undef: 661; AVX: # %bb.0: 662; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] 663; AVX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 664; AVX-NEXT: retq 665; 666; AVX512-LABEL: test_v2f32_undef: 667; AVX512: # %bb.0: 668; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] 669; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 670; AVX512-NEXT: retq 671 %1 = call float @llvm.vector.reduce.fmul.f32.v2f32(float undef, <2 x float> %a0) 672 ret float %1 673} 674 675define float @test_v4f32_undef(<4 x float> %a0) { 676; SSE2-LABEL: test_v4f32_undef: 677; SSE2: # %bb.0: 678; SSE2-NEXT: movaps %xmm0, %xmm1 679; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] 680; SSE2-NEXT: mulss {{.*}}(%rip), %xmm1 681; SSE2-NEXT: movaps %xmm0, %xmm2 682; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 683; SSE2-NEXT: mulss %xmm1, %xmm2 684; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 685; SSE2-NEXT: mulss %xmm2, %xmm0 686; SSE2-NEXT: retq 687; 688; SSE41-LABEL: test_v4f32_undef: 689; SSE41: # %bb.0: 690; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 691; SSE41-NEXT: mulss {{.*}}(%rip), %xmm1 692; SSE41-NEXT: movaps %xmm0, %xmm2 693; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 694; SSE41-NEXT: mulss %xmm1, %xmm2 695; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 696; SSE41-NEXT: mulss %xmm2, %xmm0 697; SSE41-NEXT: retq 698; 699; AVX-LABEL: test_v4f32_undef: 700; AVX: # %bb.0: 701; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 702; AVX-NEXT: vmulss {{.*}}(%rip), %xmm1, %xmm1 703; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 704; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 705; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] 706; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0 707; AVX-NEXT: retq 708; 709; AVX512-LABEL: test_v4f32_undef: 710; AVX512: # %bb.0: 711; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 712; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm1, %xmm1 713; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 714; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 715; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] 716; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 717; AVX512-NEXT: retq 718 %1 = call float @llvm.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %a0) 719 ret float %1 720} 721 722define float @test_v8f32_undef(<8 x float> %a0) { 723; SSE2-LABEL: test_v8f32_undef: 724; SSE2: # %bb.0: 
725; SSE2-NEXT: movaps %xmm0, %xmm2 726; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] 727; SSE2-NEXT: mulss {{.*}}(%rip), %xmm2 728; SSE2-NEXT: movaps %xmm0, %xmm3 729; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] 730; SSE2-NEXT: mulss %xmm2, %xmm3 731; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 732; SSE2-NEXT: mulss %xmm3, %xmm0 733; SSE2-NEXT: mulss %xmm1, %xmm0 734; SSE2-NEXT: movaps %xmm1, %xmm2 735; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1] 736; SSE2-NEXT: mulss %xmm2, %xmm0 737; SSE2-NEXT: movaps %xmm1, %xmm2 738; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] 739; SSE2-NEXT: mulss %xmm2, %xmm0 740; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 741; SSE2-NEXT: mulss %xmm1, %xmm0 742; SSE2-NEXT: retq 743; 744; SSE41-LABEL: test_v8f32_undef: 745; SSE41: # %bb.0: 746; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 747; SSE41-NEXT: mulss {{.*}}(%rip), %xmm2 748; SSE41-NEXT: movaps %xmm0, %xmm3 749; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] 750; SSE41-NEXT: mulss %xmm2, %xmm3 751; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 752; SSE41-NEXT: mulss %xmm3, %xmm0 753; SSE41-NEXT: mulss %xmm1, %xmm0 754; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 755; SSE41-NEXT: mulss %xmm2, %xmm0 756; SSE41-NEXT: movaps %xmm1, %xmm2 757; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] 758; SSE41-NEXT: mulss %xmm2, %xmm0 759; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 760; SSE41-NEXT: mulss %xmm1, %xmm0 761; SSE41-NEXT: retq 762; 763; AVX-LABEL: test_v8f32_undef: 764; AVX: # %bb.0: 765; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 766; AVX-NEXT: vmulss {{.*}}(%rip), %xmm1, %xmm1 767; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 768; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 769; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] 770; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 771; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 772; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm1 773; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 774; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 775; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 776; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 777; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] 778; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0 779; AVX-NEXT: vzeroupper 780; AVX-NEXT: retq 781; 782; AVX512-LABEL: test_v8f32_undef: 783; AVX512: # %bb.0: 784; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 785; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm1, %xmm1 786; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 787; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 788; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] 789; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 790; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 791; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm1 792; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 793; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 794; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 795; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 796; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] 797; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 798; AVX512-NEXT: vzeroupper 799; AVX512-NEXT: retq 800 %1 = call float @llvm.vector.reduce.fmul.f32.v8f32(float undef, <8 x float> %a0) 801 ret float %1 802} 803 804define float @test_v16f32_undef(<16 x float> %a0) { 805; SSE2-LABEL: test_v16f32_undef: 806; SSE2: # %bb.0: 807; SSE2-NEXT: movaps %xmm0, %xmm4 808; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[1,1] 809; SSE2-NEXT: mulss {{.*}}(%rip), %xmm4 810; SSE2-NEXT: movaps %xmm0, %xmm5 811; SSE2-NEXT: unpckhpd {{.*#+}} xmm5 = 
xmm5[1],xmm0[1] 812; SSE2-NEXT: mulss %xmm4, %xmm5 813; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 814; SSE2-NEXT: mulss %xmm5, %xmm0 815; SSE2-NEXT: mulss %xmm1, %xmm0 816; SSE2-NEXT: movaps %xmm1, %xmm4 817; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] 818; SSE2-NEXT: mulss %xmm4, %xmm0 819; SSE2-NEXT: movaps %xmm1, %xmm4 820; SSE2-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] 821; SSE2-NEXT: mulss %xmm4, %xmm0 822; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 823; SSE2-NEXT: mulss %xmm1, %xmm0 824; SSE2-NEXT: mulss %xmm2, %xmm0 825; SSE2-NEXT: movaps %xmm2, %xmm1 826; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] 827; SSE2-NEXT: mulss %xmm1, %xmm0 828; SSE2-NEXT: movaps %xmm2, %xmm1 829; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] 830; SSE2-NEXT: mulss %xmm1, %xmm0 831; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] 832; SSE2-NEXT: mulss %xmm2, %xmm0 833; SSE2-NEXT: mulss %xmm3, %xmm0 834; SSE2-NEXT: movaps %xmm3, %xmm1 835; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1] 836; SSE2-NEXT: mulss %xmm1, %xmm0 837; SSE2-NEXT: movaps %xmm3, %xmm1 838; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] 839; SSE2-NEXT: mulss %xmm1, %xmm0 840; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] 841; SSE2-NEXT: mulss %xmm3, %xmm0 842; SSE2-NEXT: retq 843; 844; SSE41-LABEL: test_v16f32_undef: 845; SSE41: # %bb.0: 846; SSE41-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] 847; SSE41-NEXT: mulss {{.*}}(%rip), %xmm4 848; SSE41-NEXT: movaps %xmm0, %xmm5 849; SSE41-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] 850; SSE41-NEXT: mulss %xmm4, %xmm5 851; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 852; SSE41-NEXT: mulss %xmm5, %xmm0 853; SSE41-NEXT: mulss %xmm1, %xmm0 854; SSE41-NEXT: movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] 855; SSE41-NEXT: mulss %xmm4, %xmm0 856; SSE41-NEXT: movaps %xmm1, %xmm4 857; SSE41-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] 858; SSE41-NEXT: mulss %xmm4, %xmm0 859; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 860; SSE41-NEXT: mulss %xmm1, %xmm0 861; SSE41-NEXT: mulss %xmm2, %xmm0 862; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] 863; SSE41-NEXT: mulss %xmm1, %xmm0 864; SSE41-NEXT: movaps %xmm2, %xmm1 865; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] 866; SSE41-NEXT: mulss %xmm1, %xmm0 867; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] 868; SSE41-NEXT: mulss %xmm2, %xmm0 869; SSE41-NEXT: mulss %xmm3, %xmm0 870; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3] 871; SSE41-NEXT: mulss %xmm1, %xmm0 872; SSE41-NEXT: movaps %xmm3, %xmm1 873; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] 874; SSE41-NEXT: mulss %xmm1, %xmm0 875; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] 876; SSE41-NEXT: mulss %xmm3, %xmm0 877; SSE41-NEXT: retq 878; 879; AVX-LABEL: test_v16f32_undef: 880; AVX: # %bb.0: 881; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 882; AVX-NEXT: vmulss {{.*}}(%rip), %xmm2, %xmm2 883; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] 884; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2 885; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3] 886; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2 887; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 888; AVX-NEXT: vmulss %xmm0, %xmm2, %xmm2 889; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] 890; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2 891; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] 892; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2 893; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] 894; AVX-NEXT: vmulss %xmm0, %xmm2, %xmm0 895; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 896; AVX-NEXT: vmovshdup 
{{.*#+}} xmm2 = xmm1[1,1,3,3] 897; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 898; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 899; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 900; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] 901; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 902; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 903; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 904; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 905; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 906; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 907; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 908; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] 909; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 910; AVX-NEXT: vzeroupper 911; AVX-NEXT: retq 912; 913; AVX512-LABEL: test_v16f32_undef: 914; AVX512: # %bb.0: 915; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 916; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm1, %xmm1 917; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 918; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 919; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] 920; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 921; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2 922; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 923; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] 924; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1 925; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] 926; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1 927; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] 928; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 929; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2 930; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 931; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] 932; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1 933; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] 934; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1 935; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] 936; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 937; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 938; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm1 939; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 940; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 941; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 942; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 943; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] 944; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 945; AVX512-NEXT: vzeroupper 946; AVX512-NEXT: retq 947 %1 = call float @llvm.vector.reduce.fmul.f32.v16f32(float undef, <16 x float> %a0) 948 ret float %1 949} 950 951; 952; vXf64 (accum) 953; 954 955define double @test_v2f64(double %a0, <2 x double> %a1) { 956; SSE-LABEL: test_v2f64: 957; SSE: # %bb.0: 958; SSE-NEXT: mulsd %xmm1, %xmm0 959; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] 960; SSE-NEXT: mulsd %xmm1, %xmm0 961; SSE-NEXT: retq 962; 963; AVX-LABEL: test_v2f64: 964; AVX: # %bb.0: 965; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 966; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 967; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 968; AVX-NEXT: retq 969; 970; AVX512-LABEL: test_v2f64: 971; AVX512: # %bb.0: 972; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 973; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 974; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 975; AVX512-NEXT: retq 976 %1 = call double @llvm.vector.reduce.fmul.f64.v2f64(double %a0, <2 x double> %a1) 977 ret double %1 978} 979 980define double @test_v4f64(double %a0, <4 x double> %a1) { 981; SSE-LABEL: test_v4f64: 982; SSE: # %bb.0: 983; SSE-NEXT: mulsd %xmm1, %xmm0 984; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] 985; SSE-NEXT: mulsd %xmm1, %xmm0 986; SSE-NEXT: mulsd %xmm2, %xmm0 987; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] 
988; SSE-NEXT: mulsd %xmm2, %xmm0 989; SSE-NEXT: retq 990; 991; AVX-LABEL: test_v4f64: 992; AVX: # %bb.0: 993; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 994; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 995; AVX-NEXT: vmulsd %xmm2, %xmm0, %xmm0 996; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 997; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 998; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 999; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1000; AVX-NEXT: vzeroupper 1001; AVX-NEXT: retq 1002; 1003; AVX512-LABEL: test_v4f64: 1004; AVX512: # %bb.0: 1005; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1006; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 1007; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0 1008; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm1 1009; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1010; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1011; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1012; AVX512-NEXT: vzeroupper 1013; AVX512-NEXT: retq 1014 %1 = call double @llvm.vector.reduce.fmul.f64.v4f64(double %a0, <4 x double> %a1) 1015 ret double %1 1016} 1017 1018define double @test_v8f64(double %a0, <8 x double> %a1) { 1019; SSE-LABEL: test_v8f64: 1020; SSE: # %bb.0: 1021; SSE-NEXT: mulsd %xmm1, %xmm0 1022; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] 1023; SSE-NEXT: mulsd %xmm1, %xmm0 1024; SSE-NEXT: mulsd %xmm2, %xmm0 1025; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] 1026; SSE-NEXT: mulsd %xmm2, %xmm0 1027; SSE-NEXT: mulsd %xmm3, %xmm0 1028; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] 1029; SSE-NEXT: mulsd %xmm3, %xmm0 1030; SSE-NEXT: mulsd %xmm4, %xmm0 1031; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] 1032; SSE-NEXT: mulsd %xmm4, %xmm0 1033; SSE-NEXT: retq 1034; 1035; AVX-LABEL: test_v8f64: 1036; AVX: # %bb.0: 1037; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1038; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] 1039; AVX-NEXT: vmulsd %xmm3, %xmm0, %xmm0 1040; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 1041; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1042; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1043; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1044; AVX-NEXT: vmulsd %xmm2, %xmm0, %xmm0 1045; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] 1046; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1047; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1 1048; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1049; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1050; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1051; AVX-NEXT: vzeroupper 1052; AVX-NEXT: retq 1053; 1054; AVX512-LABEL: test_v8f64: 1055; AVX512: # %bb.0: 1056; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1057; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 1058; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0 1059; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 1060; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0 1061; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] 1062; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0 1063; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2 1064; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0 1065; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] 1066; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0 1067; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1 1068; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1069; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1070; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1071; AVX512-NEXT: vzeroupper 1072; AVX512-NEXT: retq 1073 %1 = call double @llvm.vector.reduce.fmul.f64.v8f64(double %a0, <8 x double> %a1) 1074 ret double %1 1075} 1076 1077define double @test_v16f64(double %a0, <16 x double> %a1) { 1078; SSE2-LABEL: test_v16f64: 1079; SSE2: # %bb.0: 1080; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm8 1081; 
SSE2-NEXT: mulsd %xmm1, %xmm0 1082; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] 1083; SSE2-NEXT: mulsd %xmm1, %xmm0 1084; SSE2-NEXT: mulsd %xmm2, %xmm0 1085; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] 1086; SSE2-NEXT: mulsd %xmm2, %xmm0 1087; SSE2-NEXT: mulsd %xmm3, %xmm0 1088; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] 1089; SSE2-NEXT: mulsd %xmm3, %xmm0 1090; SSE2-NEXT: mulsd %xmm4, %xmm0 1091; SSE2-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] 1092; SSE2-NEXT: mulsd %xmm4, %xmm0 1093; SSE2-NEXT: mulsd %xmm5, %xmm0 1094; SSE2-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1,1] 1095; SSE2-NEXT: mulsd %xmm5, %xmm0 1096; SSE2-NEXT: mulsd %xmm6, %xmm0 1097; SSE2-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1] 1098; SSE2-NEXT: mulsd %xmm6, %xmm0 1099; SSE2-NEXT: mulsd %xmm7, %xmm0 1100; SSE2-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1] 1101; SSE2-NEXT: mulsd %xmm7, %xmm0 1102; SSE2-NEXT: mulsd %xmm8, %xmm0 1103; SSE2-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1,1] 1104; SSE2-NEXT: mulsd %xmm8, %xmm0 1105; SSE2-NEXT: retq 1106; 1107; SSE41-LABEL: test_v16f64: 1108; SSE41: # %bb.0: 1109; SSE41-NEXT: mulsd %xmm1, %xmm0 1110; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] 1111; SSE41-NEXT: mulsd %xmm1, %xmm0 1112; SSE41-NEXT: mulsd %xmm2, %xmm0 1113; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] 1114; SSE41-NEXT: mulsd %xmm2, %xmm0 1115; SSE41-NEXT: mulsd %xmm3, %xmm0 1116; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] 1117; SSE41-NEXT: mulsd %xmm3, %xmm0 1118; SSE41-NEXT: mulsd %xmm4, %xmm0 1119; SSE41-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] 1120; SSE41-NEXT: mulsd %xmm4, %xmm0 1121; SSE41-NEXT: mulsd %xmm5, %xmm0 1122; SSE41-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1,1] 1123; SSE41-NEXT: mulsd %xmm5, %xmm0 1124; SSE41-NEXT: mulsd %xmm6, %xmm0 1125; SSE41-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1] 1126; SSE41-NEXT: mulsd %xmm6, %xmm0 1127; SSE41-NEXT: mulsd %xmm7, %xmm0 1128; SSE41-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1] 1129; SSE41-NEXT: mulsd %xmm7, %xmm0 1130; SSE41-NEXT: mulsd {{[0-9]+}}(%rsp), %xmm0 1131; SSE41-NEXT: mulsd {{[0-9]+}}(%rsp), %xmm0 1132; SSE41-NEXT: retq 1133; 1134; AVX-LABEL: test_v16f64: 1135; AVX: # %bb.0: 1136; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1137; AVX-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0] 1138; AVX-NEXT: vmulsd %xmm5, %xmm0, %xmm0 1139; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 1140; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1141; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1142; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1143; AVX-NEXT: vmulsd %xmm2, %xmm0, %xmm0 1144; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] 1145; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1146; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1 1147; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1148; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1149; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1150; AVX-NEXT: vmulsd %xmm3, %xmm0, %xmm0 1151; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm3[1,0] 1152; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1153; AVX-NEXT: vextractf128 $1, %ymm3, %xmm1 1154; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1155; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1156; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1157; AVX-NEXT: vmulsd %xmm4, %xmm0, %xmm0 1158; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm4[1,0] 1159; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1160; AVX-NEXT: vextractf128 $1, %ymm4, %xmm1 1161; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1162; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1163; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1164; AVX-NEXT: vzeroupper 1165; AVX-NEXT: retq 1166; 1167; AVX512-LABEL: test_v16f64: 1168; AVX512: # %bb.0: 1169; AVX512-NEXT: 
vmulsd %xmm1, %xmm0, %xmm0 1170; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] 1171; AVX512-NEXT: vmulsd %xmm3, %xmm0, %xmm0 1172; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm3 1173; AVX512-NEXT: vmulsd %xmm3, %xmm0, %xmm0 1174; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] 1175; AVX512-NEXT: vmulsd %xmm3, %xmm0, %xmm0 1176; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm3 1177; AVX512-NEXT: vmulsd %xmm3, %xmm0, %xmm0 1178; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] 1179; AVX512-NEXT: vmulsd %xmm3, %xmm0, %xmm0 1180; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1 1181; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1182; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1183; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1184; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0 1185; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] 1186; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1187; AVX512-NEXT: vextractf128 $1, %ymm2, %xmm1 1188; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1189; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1190; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1191; AVX512-NEXT: vextractf32x4 $2, %zmm2, %xmm1 1192; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1193; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1194; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1195; AVX512-NEXT: vextractf32x4 $3, %zmm2, %xmm1 1196; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1197; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1198; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1199; AVX512-NEXT: vzeroupper 1200; AVX512-NEXT: retq 1201 %1 = call double @llvm.vector.reduce.fmul.f64.v16f64(double %a0, <16 x double> %a1) 1202 ret double %1 1203} 1204 1205; 1206; vXf64 (one) 1207; 1208 1209define double @test_v2f64_one(<2 x double> %a0) { 1210; SSE-LABEL: test_v2f64_one: 1211; SSE: # %bb.0: 1212; SSE-NEXT: movapd %xmm0, %xmm1 1213; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 1214; SSE-NEXT: mulsd %xmm0, %xmm1 1215; SSE-NEXT: movapd %xmm1, %xmm0 1216; SSE-NEXT: retq 1217; 1218; AVX-LABEL: test_v2f64_one: 1219; AVX: # %bb.0: 1220; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1221; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1222; AVX-NEXT: retq 1223; 1224; AVX512-LABEL: test_v2f64_one: 1225; AVX512: # %bb.0: 1226; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1227; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1228; AVX512-NEXT: retq 1229 %1 = call double @llvm.vector.reduce.fmul.f64.v2f64(double 1.0, <2 x double> %a0) 1230 ret double %1 1231} 1232 1233define double @test_v4f64_one(<4 x double> %a0) { 1234; SSE-LABEL: test_v4f64_one: 1235; SSE: # %bb.0: 1236; SSE-NEXT: movapd %xmm0, %xmm2 1237; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 1238; SSE-NEXT: mulsd %xmm0, %xmm2 1239; SSE-NEXT: mulsd %xmm1, %xmm2 1240; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] 1241; SSE-NEXT: mulsd %xmm1, %xmm2 1242; SSE-NEXT: movapd %xmm2, %xmm0 1243; SSE-NEXT: retq 1244; 1245; AVX-LABEL: test_v4f64_one: 1246; AVX: # %bb.0: 1247; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1248; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm1 1249; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 1250; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm1 1251; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 1252; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm0 1253; AVX-NEXT: vzeroupper 1254; AVX-NEXT: retq 1255; 1256; AVX512-LABEL: test_v4f64_one: 1257; AVX512: # %bb.0: 1258; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1259; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm1 1260; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 1261; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm1 1262; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 
1263; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0 1264; AVX512-NEXT: vzeroupper 1265; AVX512-NEXT: retq 1266 %1 = call double @llvm.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %a0) 1267 ret double %1 1268} 1269 1270define double @test_v8f64_one(<8 x double> %a0) { 1271; SSE-LABEL: test_v8f64_one: 1272; SSE: # %bb.0: 1273; SSE-NEXT: movapd %xmm0, %xmm4 1274; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] 1275; SSE-NEXT: mulsd %xmm0, %xmm4 1276; SSE-NEXT: mulsd %xmm1, %xmm4 1277; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] 1278; SSE-NEXT: mulsd %xmm1, %xmm4 1279; SSE-NEXT: mulsd %xmm2, %xmm4 1280; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] 1281; SSE-NEXT: mulsd %xmm2, %xmm4 1282; SSE-NEXT: mulsd %xmm3, %xmm4 1283; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] 1284; SSE-NEXT: mulsd %xmm3, %xmm4 1285; SSE-NEXT: movapd %xmm4, %xmm0 1286; SSE-NEXT: retq 1287; 1288; AVX-LABEL: test_v8f64_one: 1289; AVX: # %bb.0: 1290; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 1291; AVX-NEXT: vmulsd %xmm2, %xmm0, %xmm2 1292; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 1293; AVX-NEXT: vmulsd %xmm0, %xmm2, %xmm2 1294; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 1295; AVX-NEXT: vmulsd %xmm0, %xmm2, %xmm0 1296; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1297; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 1298; AVX-NEXT: vmulsd %xmm2, %xmm0, %xmm0 1299; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 1300; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1301; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1302; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1303; AVX-NEXT: vzeroupper 1304; AVX-NEXT: retq 1305; 1306; AVX512-LABEL: test_v8f64_one: 1307; AVX512: # %bb.0: 1308; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1309; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm1 1310; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2 1311; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1 1312; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] 1313; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1 1314; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2 1315; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1 1316; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] 1317; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1 1318; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 1319; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm1 1320; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 1321; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0 1322; AVX512-NEXT: vzeroupper 1323; AVX512-NEXT: retq 1324 %1 = call double @llvm.vector.reduce.fmul.f64.v8f64(double 1.0, <8 x double> %a0) 1325 ret double %1 1326} 1327 1328define double @test_v16f64_one(<16 x double> %a0) { 1329; SSE-LABEL: test_v16f64_one: 1330; SSE: # %bb.0: 1331; SSE-NEXT: movapd %xmm0, %xmm8 1332; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] 1333; SSE-NEXT: mulsd %xmm8, %xmm0 1334; SSE-NEXT: mulsd %xmm1, %xmm0 1335; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] 1336; SSE-NEXT: mulsd %xmm1, %xmm0 1337; SSE-NEXT: mulsd %xmm2, %xmm0 1338; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] 1339; SSE-NEXT: mulsd %xmm2, %xmm0 1340; SSE-NEXT: mulsd %xmm3, %xmm0 1341; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] 1342; SSE-NEXT: mulsd %xmm3, %xmm0 1343; SSE-NEXT: mulsd %xmm4, %xmm0 1344; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] 1345; SSE-NEXT: mulsd %xmm4, %xmm0 1346; SSE-NEXT: mulsd %xmm5, %xmm0 1347; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1,1] 1348; SSE-NEXT: mulsd %xmm5, %xmm0 1349; SSE-NEXT: mulsd %xmm6, %xmm0 1350; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1] 1351; SSE-NEXT: mulsd %xmm6, %xmm0 1352; SSE-NEXT: mulsd %xmm7, %xmm0 1353; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1] 
; SSE-NEXT:    mulsd %xmm7, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16f64_one:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm4, %xmm0, %xmm4
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vmulsd %xmm0, %xmm4, %xmm4
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm0, %xmm4, %xmm0
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm4, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmulsd %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm3[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm1
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f64_one:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm2
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX512-NEXT:    vmulsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT:    vmulsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
; AVX512-NEXT:    vmulsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT:    vmulsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT:    vmulsd %xmm0, %xmm2, %xmm2
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm2
; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fmul.f64.v16f64(double 1.0, <16 x double> %a0)
  ret double %1
}

;
; vXf64 (undef)
;

define double @test_v2f64_undef(<2 x double> %a0) {
; SSE-LABEL: test_v2f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    mulsd {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vmulsd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vmulsd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fmul.f64.v2f64(double undef, <2 x double> %a0)
  ret double %1
}

define double @test_v4f64_undef(<4 x double> %a0) {
; SSE-LABEL: test_v4f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    mulsd {{.*}}(%rip), %xmm0
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulsd {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vmulsd %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulsd {{.*}}(%rip), %xmm1, %xmm1
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT:    vmulsd %xmm0, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fmul.f64.v4f64(double undef, <4 x double> %a0)
  ret double %1
}

define double @test_v8f64_undef(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    mulsd {{.*}}(%rip), %xmm0
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    mulsd %xmm2, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT:    mulsd %xmm2, %xmm0
; SSE-NEXT:    mulsd %xmm3, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT:    mulsd %xmm3, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vmulsd {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vmulsd %xmm0, %xmm2, %xmm2
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulsd {{.*}}(%rip), %xmm1, %xmm1
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT:    vmulsd %xmm0, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fmul.f64.v8f64(double undef, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_undef(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    mulsd {{.*}}(%rip), %xmm0
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    mulsd %xmm2, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT:    mulsd %xmm2, %xmm0
; SSE-NEXT:    mulsd %xmm3, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT:    mulsd %xmm3, %xmm0
; SSE-NEXT:    mulsd %xmm4, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
; SSE-NEXT:    mulsd %xmm4, %xmm0
; SSE-NEXT:    mulsd %xmm5, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1,1]
; SSE-NEXT:    mulsd %xmm5, %xmm0
; SSE-NEXT:    mulsd %xmm6, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
; SSE-NEXT:    mulsd %xmm6, %xmm0
; SSE-NEXT:    mulsd %xmm7, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
; SSE-NEXT:    mulsd %xmm7, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX-NEXT:    vmulsd {{.*}}(%rip), %xmm4, %xmm4
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vmulsd %xmm0, %xmm4, %xmm4
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm0, %xmm4, %xmm0
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm4, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmulsd %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm3[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm1
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vmulsd {{.*}}(%rip), %xmm2, %xmm2
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX512-NEXT:    vmulsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT:    vmulsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
; AVX512-NEXT:    vmulsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT:    vmulsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT:    vmulsd %xmm0, %xmm2, %xmm2
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm2
; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fmul.f64.v16f64(double undef, <16 x double> %a0)
  ret double %1
}

declare float @llvm.vector.reduce.fmul.f32.v2f32(float, <2 x float>)
declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fmul.f32.v8f32(float, <8 x float>)
declare float @llvm.vector.reduce.fmul.f32.v16f32(float, <16 x float>)

declare double @llvm.vector.reduce.fmul.f64.v2f64(double, <2 x double>)
declare double @llvm.vector.reduce.fmul.f64.v4f64(double, <4 x double>)
declare double @llvm.vector.reduce.fmul.f64.v8f64(double, <8 x double>)
declare double @llvm.vector.reduce.fmul.f64.v16f64(double, <16 x double>)
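
; Note: the calls above carry no fast-math flags, so the reduction must stay
; strictly ordered: the start value (accumulator, 1.0, or undef) is folded where
; possible, and the remaining lanes are multiplied one by one with scalar
; mulsd/vmulsd. As an illustrative, non-RUN-checked sketch only (the function
; name @reassoc_v4f64 is hypothetical), adding the 'reassoc' flag to the
; intrinsic call is what permits the backend to reassociate and use a
; shuffle/multiply tree instead of this sequential lowering:
;
;   define double @reassoc_v4f64(<4 x double> %v) {
;     %r = call reassoc double @llvm.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %v)
;     ret double %r
;   }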