; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-hops | FileCheck %s --check-prefixes=AVX,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512

;
; vXf32 (accum)
;

define float @test_v2f32(float %a0, <2 x float> %a1) {
; SSE2-LABEL: test_v2f32:
; SSE2: # %bb.0:
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2f32:
; SSE41: # %bb.0:
; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2f32:
; AVX: # %bb.0:
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call float @llvm.vector.reduce.fadd.f32.v2f32(float %a0, <2 x float> %a1)
  ret float %1
}

define float @test_v4f32(float %a0, <4 x float> %a1) {
; SSE2-LABEL: test_v4f32:
; SSE2: # %bb.0:
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
; SSE2-NEXT: addss %xmm2, %xmm0
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT: addss %xmm2, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4f32:
; SSE41: # %bb.0:
; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT: addss %xmm2, %xmm0
; SSE41-NEXT: movaps %xmm1, %xmm2
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT: addss %xmm2, %xmm0
; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4f32:
; AVX: # %bb.0:
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %a1)
  ret float %1
}

define float @test_v8f32(float %a0, <8 x float> %a1) {
; SSE2-LABEL: test_v8f32:
; SSE2: # %bb.0:
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm1, %xmm3
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1]
; SSE2-NEXT: addss %xmm3, %xmm0
; SSE2-NEXT: movaps %xmm1, %xmm3
; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
; SSE2-NEXT: addss %xmm3, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: addss %xmm2, %xmm0
; SSE2-NEXT: movaps %xmm2, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm2, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSE2-NEXT: addss %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8f32:
; SSE41: # %bb.0:
; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE41-NEXT: addss %xmm3, %xmm0
; SSE41-NEXT: movaps %xmm1, %xmm3
; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
; SSE41-NEXT: addss %xmm3, %xmm0
; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: addss %xmm2, %xmm0
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm2, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSE41-NEXT: addss %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v8f32:
; AVX: # %bb.0:
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call float @llvm.vector.reduce.fadd.f32.v8f32(float %a0, <8 x float> %a1)
  ret float %1
}
182 183define float @test_v16f32(float %a0, <16 x float> %a1) { 184; SSE2-LABEL: test_v16f32: 185; SSE2: # %bb.0: 186; SSE2-NEXT: addss %xmm1, %xmm0 187; SSE2-NEXT: movaps %xmm1, %xmm5 188; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] 189; SSE2-NEXT: addss %xmm5, %xmm0 190; SSE2-NEXT: movaps %xmm1, %xmm5 191; SSE2-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] 192; SSE2-NEXT: addss %xmm5, %xmm0 193; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 194; SSE2-NEXT: addss %xmm1, %xmm0 195; SSE2-NEXT: addss %xmm2, %xmm0 196; SSE2-NEXT: movaps %xmm2, %xmm1 197; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] 198; SSE2-NEXT: addss %xmm1, %xmm0 199; SSE2-NEXT: movaps %xmm2, %xmm1 200; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] 201; SSE2-NEXT: addss %xmm1, %xmm0 202; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] 203; SSE2-NEXT: addss %xmm2, %xmm0 204; SSE2-NEXT: addss %xmm3, %xmm0 205; SSE2-NEXT: movaps %xmm3, %xmm1 206; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1] 207; SSE2-NEXT: addss %xmm1, %xmm0 208; SSE2-NEXT: movaps %xmm3, %xmm1 209; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] 210; SSE2-NEXT: addss %xmm1, %xmm0 211; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] 212; SSE2-NEXT: addss %xmm3, %xmm0 213; SSE2-NEXT: addss %xmm4, %xmm0 214; SSE2-NEXT: movaps %xmm4, %xmm1 215; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[1,1] 216; SSE2-NEXT: addss %xmm1, %xmm0 217; SSE2-NEXT: movaps %xmm4, %xmm1 218; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] 219; SSE2-NEXT: addss %xmm1, %xmm0 220; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3,3,3] 221; SSE2-NEXT: addss %xmm4, %xmm0 222; SSE2-NEXT: retq 223; 224; SSE41-LABEL: test_v16f32: 225; SSE41: # %bb.0: 226; SSE41-NEXT: addss %xmm1, %xmm0 227; SSE41-NEXT: movshdup {{.*#+}} xmm5 = xmm1[1,1,3,3] 228; SSE41-NEXT: addss %xmm5, %xmm0 229; SSE41-NEXT: movaps %xmm1, %xmm5 230; SSE41-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] 231; SSE41-NEXT: addss %xmm5, %xmm0 232; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 233; SSE41-NEXT: addss %xmm1, %xmm0 234; SSE41-NEXT: addss %xmm2, %xmm0 235; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] 236; SSE41-NEXT: addss %xmm1, %xmm0 237; SSE41-NEXT: movaps %xmm2, %xmm1 238; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] 239; SSE41-NEXT: addss %xmm1, %xmm0 240; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] 241; SSE41-NEXT: addss %xmm2, %xmm0 242; SSE41-NEXT: addss %xmm3, %xmm0 243; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3] 244; SSE41-NEXT: addss %xmm1, %xmm0 245; SSE41-NEXT: movaps %xmm3, %xmm1 246; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] 247; SSE41-NEXT: addss %xmm1, %xmm0 248; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] 249; SSE41-NEXT: addss %xmm3, %xmm0 250; SSE41-NEXT: addss %xmm4, %xmm0 251; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm4[1,1,3,3] 252; SSE41-NEXT: addss %xmm1, %xmm0 253; SSE41-NEXT: movaps %xmm4, %xmm1 254; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] 255; SSE41-NEXT: addss %xmm1, %xmm0 256; SSE41-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3,3,3] 257; SSE41-NEXT: addss %xmm4, %xmm0 258; SSE41-NEXT: retq 259; 260; AVX-LABEL: test_v16f32: 261; AVX: # %bb.0: 262; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 263; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] 264; AVX-NEXT: vaddss %xmm3, %xmm0, %xmm0 265; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] 266; AVX-NEXT: vaddss %xmm3, %xmm0, %xmm0 267; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,3,3,3] 268; AVX-NEXT: vaddss %xmm3, %xmm0, %xmm0 269; AVX-NEXT: vextractf128 $1, %ymm1, 
%xmm1 270; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 271; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] 272; AVX-NEXT: vaddss %xmm3, %xmm0, %xmm0 273; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] 274; AVX-NEXT: vaddss %xmm3, %xmm0, %xmm0 275; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] 276; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 277; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 278; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] 279; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 280; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] 281; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 282; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[3,3,3,3] 283; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 284; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1 285; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 286; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 287; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 288; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 289; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 290; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] 291; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 292; AVX-NEXT: vzeroupper 293; AVX-NEXT: retq 294; 295; AVX512-LABEL: test_v16f32: 296; AVX512: # %bb.0: 297; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 298; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 299; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 300; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 301; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 302; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] 303; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 304; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 305; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 306; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] 307; AVX512-NEXT: vaddss %xmm3, %xmm0, %xmm0 308; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] 309; AVX512-NEXT: vaddss %xmm3, %xmm0, %xmm0 310; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] 311; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 312; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2 313; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 314; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] 315; AVX512-NEXT: vaddss %xmm3, %xmm0, %xmm0 316; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] 317; AVX512-NEXT: vaddss %xmm3, %xmm0, %xmm0 318; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] 319; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 320; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1 321; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 322; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 323; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 324; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 325; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 326; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] 327; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 328; AVX512-NEXT: vzeroupper 329; AVX512-NEXT: retq 330 %1 = call float @llvm.vector.reduce.fadd.f32.v16f32(float %a0, <16 x float> %a1) 331 ret float %1 332} 333 334; 335; vXf32 (zero) 336; 337 338define float @test_v2f32_zero(<2 x float> %a0) { 339; SSE2-LABEL: test_v2f32_zero: 340; SSE2: # %bb.0: 341; SSE2-NEXT: movaps %xmm0, %xmm1 342; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] 343; SSE2-NEXT: addss %xmm0, %xmm1 344; SSE2-NEXT: movaps %xmm1, %xmm0 345; SSE2-NEXT: retq 346; 347; SSE41-LABEL: test_v2f32_zero: 348; SSE41: # %bb.0: 349; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 350; SSE41-NEXT: addss %xmm1, %xmm0 351; SSE41-NEXT: retq 352; 353; AVX1-SLOW-LABEL: test_v2f32_zero: 354; AVX1-SLOW: # %bb.0: 355; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 356; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 357; AVX1-SLOW-NEXT: 
retq 358; 359; AVX1-FAST-LABEL: test_v2f32_zero: 360; AVX1-FAST: # %bb.0: 361; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 362; AVX1-FAST-NEXT: retq 363; 364; AVX2-LABEL: test_v2f32_zero: 365; AVX2: # %bb.0: 366; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 367; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0 368; AVX2-NEXT: retq 369; 370; AVX512-LABEL: test_v2f32_zero: 371; AVX512: # %bb.0: 372; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 373; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 374; AVX512-NEXT: retq 375 %1 = call float @llvm.vector.reduce.fadd.f32.v2f32(float -0.0, <2 x float> %a0) 376 ret float %1 377} 378 379define float @test_v4f32_zero(<4 x float> %a0) { 380; SSE2-LABEL: test_v4f32_zero: 381; SSE2: # %bb.0: 382; SSE2-NEXT: movaps %xmm0, %xmm1 383; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] 384; SSE2-NEXT: addss %xmm0, %xmm1 385; SSE2-NEXT: movaps %xmm0, %xmm2 386; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 387; SSE2-NEXT: addss %xmm1, %xmm2 388; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 389; SSE2-NEXT: addss %xmm2, %xmm0 390; SSE2-NEXT: retq 391; 392; SSE41-LABEL: test_v4f32_zero: 393; SSE41: # %bb.0: 394; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 395; SSE41-NEXT: addss %xmm0, %xmm1 396; SSE41-NEXT: movaps %xmm0, %xmm2 397; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 398; SSE41-NEXT: addss %xmm1, %xmm2 399; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 400; SSE41-NEXT: addss %xmm2, %xmm0 401; SSE41-NEXT: retq 402; 403; AVX1-SLOW-LABEL: test_v4f32_zero: 404; AVX1-SLOW: # %bb.0: 405; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 406; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm1 407; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 408; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 409; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] 410; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0 411; AVX1-SLOW-NEXT: retq 412; 413; AVX1-FAST-LABEL: test_v4f32_zero: 414; AVX1-FAST: # %bb.0: 415; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm1 416; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 417; AVX1-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1 418; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] 419; AVX1-FAST-NEXT: vaddss %xmm0, %xmm1, %xmm0 420; AVX1-FAST-NEXT: retq 421; 422; AVX2-LABEL: test_v4f32_zero: 423; AVX2: # %bb.0: 424; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 425; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm1 426; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 427; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1 428; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] 429; AVX2-NEXT: vaddss %xmm0, %xmm1, %xmm0 430; AVX2-NEXT: retq 431; 432; AVX512-LABEL: test_v4f32_zero: 433; AVX512: # %bb.0: 434; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 435; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm1 436; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 437; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 438; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] 439; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 440; AVX512-NEXT: retq 441 %1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a0) 442 ret float %1 443} 444 445define float @test_v8f32_zero(<8 x float> %a0) { 446; SSE2-LABEL: test_v8f32_zero: 447; SSE2: # %bb.0: 448; SSE2-NEXT: movaps %xmm0, %xmm2 449; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] 450; SSE2-NEXT: addss %xmm0, %xmm2 451; SSE2-NEXT: movaps %xmm0, %xmm3 452; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] 453; SSE2-NEXT: addss %xmm2, %xmm3 454; SSE2-NEXT: shufps {{.*#+}} xmm0 = 
xmm0[3,3,3,3] 455; SSE2-NEXT: addss %xmm3, %xmm0 456; SSE2-NEXT: addss %xmm1, %xmm0 457; SSE2-NEXT: movaps %xmm1, %xmm2 458; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1] 459; SSE2-NEXT: addss %xmm2, %xmm0 460; SSE2-NEXT: movaps %xmm1, %xmm2 461; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] 462; SSE2-NEXT: addss %xmm2, %xmm0 463; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 464; SSE2-NEXT: addss %xmm1, %xmm0 465; SSE2-NEXT: retq 466; 467; SSE41-LABEL: test_v8f32_zero: 468; SSE41: # %bb.0: 469; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 470; SSE41-NEXT: addss %xmm0, %xmm2 471; SSE41-NEXT: movaps %xmm0, %xmm3 472; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] 473; SSE41-NEXT: addss %xmm2, %xmm3 474; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 475; SSE41-NEXT: addss %xmm3, %xmm0 476; SSE41-NEXT: addss %xmm1, %xmm0 477; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 478; SSE41-NEXT: addss %xmm2, %xmm0 479; SSE41-NEXT: movaps %xmm1, %xmm2 480; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] 481; SSE41-NEXT: addss %xmm2, %xmm0 482; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 483; SSE41-NEXT: addss %xmm1, %xmm0 484; SSE41-NEXT: retq 485; 486; AVX1-SLOW-LABEL: test_v8f32_zero: 487; AVX1-SLOW: # %bb.0: 488; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 489; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm1 490; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 491; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 492; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] 493; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 494; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 495; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm1 496; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 497; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 498; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 499; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 500; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] 501; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0 502; AVX1-SLOW-NEXT: vzeroupper 503; AVX1-SLOW-NEXT: retq 504; 505; AVX1-FAST-LABEL: test_v8f32_zero: 506; AVX1-FAST: # %bb.0: 507; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm1 508; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 509; AVX1-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1 510; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] 511; AVX1-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1 512; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 513; AVX1-FAST-NEXT: vaddss %xmm0, %xmm1, %xmm1 514; AVX1-FAST-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 515; AVX1-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1 516; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 517; AVX1-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1 518; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] 519; AVX1-FAST-NEXT: vaddss %xmm0, %xmm1, %xmm0 520; AVX1-FAST-NEXT: vzeroupper 521; AVX1-FAST-NEXT: retq 522; 523; AVX2-LABEL: test_v8f32_zero: 524; AVX2: # %bb.0: 525; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 526; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm1 527; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 528; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1 529; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] 530; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1 531; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 532; AVX2-NEXT: vaddss %xmm0, %xmm1, %xmm1 533; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 534; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1 535; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 536; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1 537; AVX2-NEXT: vpermilps {{.*#+}} xmm0 
= xmm0[3,3,3,3] 538; AVX2-NEXT: vaddss %xmm0, %xmm1, %xmm0 539; AVX2-NEXT: vzeroupper 540; AVX2-NEXT: retq 541; 542; AVX512-LABEL: test_v8f32_zero: 543; AVX512: # %bb.0: 544; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 545; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm1 546; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 547; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 548; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] 549; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 550; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 551; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm1 552; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 553; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 554; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 555; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 556; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] 557; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 558; AVX512-NEXT: vzeroupper 559; AVX512-NEXT: retq 560 %1 = call float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %a0) 561 ret float %1 562} 563 564define float @test_v16f32_zero(<16 x float> %a0) { 565; SSE2-LABEL: test_v16f32_zero: 566; SSE2: # %bb.0: 567; SSE2-NEXT: movaps %xmm0, %xmm4 568; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[1,1] 569; SSE2-NEXT: addss %xmm0, %xmm4 570; SSE2-NEXT: movaps %xmm0, %xmm5 571; SSE2-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] 572; SSE2-NEXT: addss %xmm4, %xmm5 573; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 574; SSE2-NEXT: addss %xmm5, %xmm0 575; SSE2-NEXT: addss %xmm1, %xmm0 576; SSE2-NEXT: movaps %xmm1, %xmm4 577; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] 578; SSE2-NEXT: addss %xmm4, %xmm0 579; SSE2-NEXT: movaps %xmm1, %xmm4 580; SSE2-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] 581; SSE2-NEXT: addss %xmm4, %xmm0 582; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 583; SSE2-NEXT: addss %xmm1, %xmm0 584; SSE2-NEXT: addss %xmm2, %xmm0 585; SSE2-NEXT: movaps %xmm2, %xmm1 586; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] 587; SSE2-NEXT: addss %xmm1, %xmm0 588; SSE2-NEXT: movaps %xmm2, %xmm1 589; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] 590; SSE2-NEXT: addss %xmm1, %xmm0 591; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] 592; SSE2-NEXT: addss %xmm2, %xmm0 593; SSE2-NEXT: addss %xmm3, %xmm0 594; SSE2-NEXT: movaps %xmm3, %xmm1 595; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1] 596; SSE2-NEXT: addss %xmm1, %xmm0 597; SSE2-NEXT: movaps %xmm3, %xmm1 598; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] 599; SSE2-NEXT: addss %xmm1, %xmm0 600; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] 601; SSE2-NEXT: addss %xmm3, %xmm0 602; SSE2-NEXT: retq 603; 604; SSE41-LABEL: test_v16f32_zero: 605; SSE41: # %bb.0: 606; SSE41-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] 607; SSE41-NEXT: addss %xmm0, %xmm4 608; SSE41-NEXT: movaps %xmm0, %xmm5 609; SSE41-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] 610; SSE41-NEXT: addss %xmm4, %xmm5 611; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 612; SSE41-NEXT: addss %xmm5, %xmm0 613; SSE41-NEXT: addss %xmm1, %xmm0 614; SSE41-NEXT: movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] 615; SSE41-NEXT: addss %xmm4, %xmm0 616; SSE41-NEXT: movaps %xmm1, %xmm4 617; SSE41-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] 618; SSE41-NEXT: addss %xmm4, %xmm0 619; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 620; SSE41-NEXT: addss %xmm1, %xmm0 621; SSE41-NEXT: addss %xmm2, %xmm0 622; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] 623; SSE41-NEXT: addss %xmm1, %xmm0 624; SSE41-NEXT: movaps %xmm2, %xmm1 625; 
SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] 626; SSE41-NEXT: addss %xmm1, %xmm0 627; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] 628; SSE41-NEXT: addss %xmm2, %xmm0 629; SSE41-NEXT: addss %xmm3, %xmm0 630; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3] 631; SSE41-NEXT: addss %xmm1, %xmm0 632; SSE41-NEXT: movaps %xmm3, %xmm1 633; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] 634; SSE41-NEXT: addss %xmm1, %xmm0 635; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] 636; SSE41-NEXT: addss %xmm3, %xmm0 637; SSE41-NEXT: retq 638; 639; AVX1-SLOW-LABEL: test_v16f32_zero: 640; AVX1-SLOW: # %bb.0: 641; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 642; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm2 643; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] 644; AVX1-SLOW-NEXT: vaddss %xmm3, %xmm2, %xmm2 645; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3] 646; AVX1-SLOW-NEXT: vaddss %xmm3, %xmm2, %xmm2 647; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 648; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm2, %xmm2 649; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] 650; AVX1-SLOW-NEXT: vaddss %xmm3, %xmm2, %xmm2 651; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] 652; AVX1-SLOW-NEXT: vaddss %xmm3, %xmm2, %xmm2 653; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] 654; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm2, %xmm0 655; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 656; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 657; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0 658; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 659; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0 660; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] 661; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0 662; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 663; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 664; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 665; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0 666; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 667; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0 668; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] 669; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 670; AVX1-SLOW-NEXT: vzeroupper 671; AVX1-SLOW-NEXT: retq 672; 673; AVX1-FAST-LABEL: test_v16f32_zero: 674; AVX1-FAST: # %bb.0: 675; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm2 676; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] 677; AVX1-FAST-NEXT: vaddss %xmm3, %xmm2, %xmm2 678; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3] 679; AVX1-FAST-NEXT: vaddss %xmm3, %xmm2, %xmm2 680; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 681; AVX1-FAST-NEXT: vaddss %xmm0, %xmm2, %xmm2 682; AVX1-FAST-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] 683; AVX1-FAST-NEXT: vaddss %xmm3, %xmm2, %xmm2 684; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] 685; AVX1-FAST-NEXT: vaddss %xmm3, %xmm2, %xmm2 686; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] 687; AVX1-FAST-NEXT: vaddss %xmm0, %xmm2, %xmm0 688; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0 689; AVX1-FAST-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 690; AVX1-FAST-NEXT: vaddss %xmm2, %xmm0, %xmm0 691; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 692; AVX1-FAST-NEXT: vaddss %xmm2, %xmm0, %xmm0 693; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] 694; AVX1-FAST-NEXT: vaddss %xmm2, %xmm0, %xmm0 695; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 696; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0 697; AVX1-FAST-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 698; AVX1-FAST-NEXT: vaddss %xmm2, %xmm0, %xmm0 699; 
AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 700; AVX1-FAST-NEXT: vaddss %xmm2, %xmm0, %xmm0 701; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] 702; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0 703; AVX1-FAST-NEXT: vzeroupper 704; AVX1-FAST-NEXT: retq 705; 706; AVX2-LABEL: test_v16f32_zero: 707; AVX2: # %bb.0: 708; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 709; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm2 710; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] 711; AVX2-NEXT: vaddss %xmm3, %xmm2, %xmm2 712; AVX2-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3] 713; AVX2-NEXT: vaddss %xmm3, %xmm2, %xmm2 714; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 715; AVX2-NEXT: vaddss %xmm0, %xmm2, %xmm2 716; AVX2-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] 717; AVX2-NEXT: vaddss %xmm3, %xmm2, %xmm2 718; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] 719; AVX2-NEXT: vaddss %xmm3, %xmm2, %xmm2 720; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] 721; AVX2-NEXT: vaddss %xmm0, %xmm2, %xmm0 722; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0 723; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 724; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm0 725; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 726; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm0 727; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] 728; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm0 729; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1 730; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0 731; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 732; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm0 733; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 734; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm0 735; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] 736; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0 737; AVX2-NEXT: vzeroupper 738; AVX2-NEXT: retq 739; 740; AVX512-LABEL: test_v16f32_zero: 741; AVX512: # %bb.0: 742; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 743; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm1 744; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 745; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 746; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] 747; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 748; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2 749; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 750; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] 751; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 752; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] 753; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 754; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] 755; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 756; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2 757; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 758; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] 759; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 760; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] 761; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 762; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] 763; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 764; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 765; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm1 766; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 767; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 768; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 769; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 770; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] 771; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 772; AVX512-NEXT: vzeroupper 773; AVX512-NEXT: retq 774 %1 = call float @llvm.vector.reduce.fadd.f32.v16f32(float -0.0, <16 x float> %a0) 775 ret float %1 776} 777 778; 779; vXf32 (undef) 780; 781 782define float 
@test_v2f32_undef(<2 x float> %a0) { 783; SSE2-LABEL: test_v2f32_undef: 784; SSE2: # %bb.0: 785; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] 786; SSE2-NEXT: addss {{.*}}(%rip), %xmm0 787; SSE2-NEXT: retq 788; 789; SSE41-LABEL: test_v2f32_undef: 790; SSE41: # %bb.0: 791; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] 792; SSE41-NEXT: addss {{.*}}(%rip), %xmm0 793; SSE41-NEXT: retq 794; 795; AVX-LABEL: test_v2f32_undef: 796; AVX: # %bb.0: 797; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] 798; AVX-NEXT: vaddss {{.*}}(%rip), %xmm0, %xmm0 799; AVX-NEXT: retq 800; 801; AVX512-LABEL: test_v2f32_undef: 802; AVX512: # %bb.0: 803; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] 804; AVX512-NEXT: vaddss {{.*}}(%rip), %xmm0, %xmm0 805; AVX512-NEXT: retq 806 %1 = call float @llvm.vector.reduce.fadd.f32.v2f32(float undef, <2 x float> %a0) 807 ret float %1 808} 809 810define float @test_v4f32_undef(<4 x float> %a0) { 811; SSE2-LABEL: test_v4f32_undef: 812; SSE2: # %bb.0: 813; SSE2-NEXT: movaps %xmm0, %xmm1 814; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] 815; SSE2-NEXT: addss {{.*}}(%rip), %xmm1 816; SSE2-NEXT: movaps %xmm0, %xmm2 817; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 818; SSE2-NEXT: addss %xmm1, %xmm2 819; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 820; SSE2-NEXT: addss %xmm2, %xmm0 821; SSE2-NEXT: retq 822; 823; SSE41-LABEL: test_v4f32_undef: 824; SSE41: # %bb.0: 825; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 826; SSE41-NEXT: addss {{.*}}(%rip), %xmm1 827; SSE41-NEXT: movaps %xmm0, %xmm2 828; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 829; SSE41-NEXT: addss %xmm1, %xmm2 830; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 831; SSE41-NEXT: addss %xmm2, %xmm0 832; SSE41-NEXT: retq 833; 834; AVX-LABEL: test_v4f32_undef: 835; AVX: # %bb.0: 836; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 837; AVX-NEXT: vaddss {{.*}}(%rip), %xmm1, %xmm1 838; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 839; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 840; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] 841; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 842; AVX-NEXT: retq 843; 844; AVX512-LABEL: test_v4f32_undef: 845; AVX512: # %bb.0: 846; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 847; AVX512-NEXT: vaddss {{.*}}(%rip), %xmm1, %xmm1 848; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 849; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 850; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] 851; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 852; AVX512-NEXT: retq 853 %1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %a0) 854 ret float %1 855} 856 857define float @test_v8f32_undef(<8 x float> %a0) { 858; SSE2-LABEL: test_v8f32_undef: 859; SSE2: # %bb.0: 860; SSE2-NEXT: movaps %xmm0, %xmm2 861; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] 862; SSE2-NEXT: addss {{.*}}(%rip), %xmm2 863; SSE2-NEXT: movaps %xmm0, %xmm3 864; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] 865; SSE2-NEXT: addss %xmm2, %xmm3 866; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 867; SSE2-NEXT: addss %xmm3, %xmm0 868; SSE2-NEXT: addss %xmm1, %xmm0 869; SSE2-NEXT: movaps %xmm1, %xmm2 870; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1] 871; SSE2-NEXT: addss %xmm2, %xmm0 872; SSE2-NEXT: movaps %xmm1, %xmm2 873; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] 874; SSE2-NEXT: addss %xmm2, %xmm0 875; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 876; SSE2-NEXT: addss %xmm1, %xmm0 877; SSE2-NEXT: retq 878; 879; SSE41-LABEL: test_v8f32_undef: 
880; SSE41: # %bb.0: 881; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 882; SSE41-NEXT: addss {{.*}}(%rip), %xmm2 883; SSE41-NEXT: movaps %xmm0, %xmm3 884; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] 885; SSE41-NEXT: addss %xmm2, %xmm3 886; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 887; SSE41-NEXT: addss %xmm3, %xmm0 888; SSE41-NEXT: addss %xmm1, %xmm0 889; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 890; SSE41-NEXT: addss %xmm2, %xmm0 891; SSE41-NEXT: movaps %xmm1, %xmm2 892; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] 893; SSE41-NEXT: addss %xmm2, %xmm0 894; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 895; SSE41-NEXT: addss %xmm1, %xmm0 896; SSE41-NEXT: retq 897; 898; AVX-LABEL: test_v8f32_undef: 899; AVX: # %bb.0: 900; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 901; AVX-NEXT: vaddss {{.*}}(%rip), %xmm1, %xmm1 902; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 903; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 904; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] 905; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 906; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 907; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm1 908; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 909; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 910; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 911; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 912; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] 913; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 914; AVX-NEXT: vzeroupper 915; AVX-NEXT: retq 916; 917; AVX512-LABEL: test_v8f32_undef: 918; AVX512: # %bb.0: 919; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 920; AVX512-NEXT: vaddss {{.*}}(%rip), %xmm1, %xmm1 921; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 922; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 923; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] 924; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 925; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 926; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm1 927; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 928; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 929; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 930; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 931; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] 932; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 933; AVX512-NEXT: vzeroupper 934; AVX512-NEXT: retq 935 %1 = call float @llvm.vector.reduce.fadd.f32.v8f32(float undef, <8 x float> %a0) 936 ret float %1 937} 938 939define float @test_v16f32_undef(<16 x float> %a0) { 940; SSE2-LABEL: test_v16f32_undef: 941; SSE2: # %bb.0: 942; SSE2-NEXT: movaps %xmm0, %xmm4 943; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[1,1] 944; SSE2-NEXT: addss {{.*}}(%rip), %xmm4 945; SSE2-NEXT: movaps %xmm0, %xmm5 946; SSE2-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] 947; SSE2-NEXT: addss %xmm4, %xmm5 948; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 949; SSE2-NEXT: addss %xmm5, %xmm0 950; SSE2-NEXT: addss %xmm1, %xmm0 951; SSE2-NEXT: movaps %xmm1, %xmm4 952; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] 953; SSE2-NEXT: addss %xmm4, %xmm0 954; SSE2-NEXT: movaps %xmm1, %xmm4 955; SSE2-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] 956; SSE2-NEXT: addss %xmm4, %xmm0 957; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 958; SSE2-NEXT: addss %xmm1, %xmm0 959; SSE2-NEXT: addss %xmm2, %xmm0 960; SSE2-NEXT: movaps %xmm2, %xmm1 961; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] 962; SSE2-NEXT: addss %xmm1, %xmm0 963; SSE2-NEXT: movaps %xmm2, %xmm1 964; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] 965; SSE2-NEXT: addss %xmm1, 
%xmm0 966; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] 967; SSE2-NEXT: addss %xmm2, %xmm0 968; SSE2-NEXT: addss %xmm3, %xmm0 969; SSE2-NEXT: movaps %xmm3, %xmm1 970; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1] 971; SSE2-NEXT: addss %xmm1, %xmm0 972; SSE2-NEXT: movaps %xmm3, %xmm1 973; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] 974; SSE2-NEXT: addss %xmm1, %xmm0 975; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] 976; SSE2-NEXT: addss %xmm3, %xmm0 977; SSE2-NEXT: retq 978; 979; SSE41-LABEL: test_v16f32_undef: 980; SSE41: # %bb.0: 981; SSE41-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] 982; SSE41-NEXT: addss {{.*}}(%rip), %xmm4 983; SSE41-NEXT: movaps %xmm0, %xmm5 984; SSE41-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] 985; SSE41-NEXT: addss %xmm4, %xmm5 986; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 987; SSE41-NEXT: addss %xmm5, %xmm0 988; SSE41-NEXT: addss %xmm1, %xmm0 989; SSE41-NEXT: movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] 990; SSE41-NEXT: addss %xmm4, %xmm0 991; SSE41-NEXT: movaps %xmm1, %xmm4 992; SSE41-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] 993; SSE41-NEXT: addss %xmm4, %xmm0 994; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 995; SSE41-NEXT: addss %xmm1, %xmm0 996; SSE41-NEXT: addss %xmm2, %xmm0 997; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] 998; SSE41-NEXT: addss %xmm1, %xmm0 999; SSE41-NEXT: movaps %xmm2, %xmm1 1000; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] 1001; SSE41-NEXT: addss %xmm1, %xmm0 1002; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] 1003; SSE41-NEXT: addss %xmm2, %xmm0 1004; SSE41-NEXT: addss %xmm3, %xmm0 1005; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3] 1006; SSE41-NEXT: addss %xmm1, %xmm0 1007; SSE41-NEXT: movaps %xmm3, %xmm1 1008; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] 1009; SSE41-NEXT: addss %xmm1, %xmm0 1010; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] 1011; SSE41-NEXT: addss %xmm3, %xmm0 1012; SSE41-NEXT: retq 1013; 1014; AVX-LABEL: test_v16f32_undef: 1015; AVX: # %bb.0: 1016; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 1017; AVX-NEXT: vaddss {{.*}}(%rip), %xmm2, %xmm2 1018; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] 1019; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2 1020; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3] 1021; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2 1022; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 1023; AVX-NEXT: vaddss %xmm0, %xmm2, %xmm2 1024; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] 1025; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2 1026; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] 1027; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2 1028; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] 1029; AVX-NEXT: vaddss %xmm0, %xmm2, %xmm0 1030; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 1031; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 1032; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 1033; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 1034; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 1035; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] 1036; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 1037; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 1038; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 1039; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 1040; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 1041; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 1042; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 1043; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] 1044; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 1045; AVX-NEXT: vzeroupper 1046; AVX-NEXT: retq 1047; 1048; AVX512-LABEL: test_v16f32_undef: 1049; AVX512: # %bb.0: 1050; 
AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1051; AVX512-NEXT: vaddss {{.*}}(%rip), %xmm1, %xmm1 1052; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 1053; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 1054; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] 1055; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 1056; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2 1057; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 1058; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] 1059; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 1060; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] 1061; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 1062; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] 1063; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 1064; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2 1065; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 1066; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] 1067; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 1068; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] 1069; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 1070; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] 1071; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 1072; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 1073; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm1 1074; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 1075; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 1076; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 1077; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 1078; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] 1079; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 1080; AVX512-NEXT: vzeroupper 1081; AVX512-NEXT: retq 1082 %1 = call float @llvm.vector.reduce.fadd.f32.v16f32(float undef, <16 x float> %a0) 1083 ret float %1 1084} 1085 1086; 1087; vXf64 (accum) 1088; 1089 1090define double @test_v2f64(double %a0, <2 x double> %a1) { 1091; SSE-LABEL: test_v2f64: 1092; SSE: # %bb.0: 1093; SSE-NEXT: addsd %xmm1, %xmm0 1094; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] 1095; SSE-NEXT: addsd %xmm1, %xmm0 1096; SSE-NEXT: retq 1097; 1098; AVX-LABEL: test_v2f64: 1099; AVX: # %bb.0: 1100; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1101; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1102; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1103; AVX-NEXT: retq 1104; 1105; AVX512-LABEL: test_v2f64: 1106; AVX512: # %bb.0: 1107; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1108; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1109; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1110; AVX512-NEXT: retq 1111 %1 = call double @llvm.vector.reduce.fadd.f64.v2f64(double %a0, <2 x double> %a1) 1112 ret double %1 1113} 1114 1115define double @test_v4f64(double %a0, <4 x double> %a1) { 1116; SSE-LABEL: test_v4f64: 1117; SSE: # %bb.0: 1118; SSE-NEXT: addsd %xmm1, %xmm0 1119; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] 1120; SSE-NEXT: addsd %xmm1, %xmm0 1121; SSE-NEXT: addsd %xmm2, %xmm0 1122; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] 1123; SSE-NEXT: addsd %xmm2, %xmm0 1124; SSE-NEXT: retq 1125; 1126; AVX-LABEL: test_v4f64: 1127; AVX: # %bb.0: 1128; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1129; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 1130; AVX-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1131; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 1132; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1133; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1134; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1135; AVX-NEXT: vzeroupper 1136; AVX-NEXT: retq 1137; 1138; AVX512-LABEL: test_v4f64: 1139; AVX512: # %bb.0: 1140; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1141; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 1142; AVX512-NEXT: vaddsd %xmm2, 
%xmm0, %xmm0 1143; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm1 1144; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1145; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1146; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1147; AVX512-NEXT: vzeroupper 1148; AVX512-NEXT: retq 1149 %1 = call double @llvm.vector.reduce.fadd.f64.v4f64(double %a0, <4 x double> %a1) 1150 ret double %1 1151} 1152 1153define double @test_v8f64(double %a0, <8 x double> %a1) { 1154; SSE-LABEL: test_v8f64: 1155; SSE: # %bb.0: 1156; SSE-NEXT: addsd %xmm1, %xmm0 1157; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] 1158; SSE-NEXT: addsd %xmm1, %xmm0 1159; SSE-NEXT: addsd %xmm2, %xmm0 1160; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] 1161; SSE-NEXT: addsd %xmm2, %xmm0 1162; SSE-NEXT: addsd %xmm3, %xmm0 1163; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] 1164; SSE-NEXT: addsd %xmm3, %xmm0 1165; SSE-NEXT: addsd %xmm4, %xmm0 1166; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] 1167; SSE-NEXT: addsd %xmm4, %xmm0 1168; SSE-NEXT: retq 1169; 1170; AVX-LABEL: test_v8f64: 1171; AVX: # %bb.0: 1172; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1173; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] 1174; AVX-NEXT: vaddsd %xmm3, %xmm0, %xmm0 1175; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 1176; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1177; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1178; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1179; AVX-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1180; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] 1181; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1182; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1 1183; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1184; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1185; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1186; AVX-NEXT: vzeroupper 1187; AVX-NEXT: retq 1188; 1189; AVX512-LABEL: test_v8f64: 1190; AVX512: # %bb.0: 1191; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1192; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 1193; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1194; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 1195; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1196; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] 1197; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1198; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2 1199; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1200; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] 1201; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1202; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1 1203; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1204; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1205; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1206; AVX512-NEXT: vzeroupper 1207; AVX512-NEXT: retq 1208 %1 = call double @llvm.vector.reduce.fadd.f64.v8f64(double %a0, <8 x double> %a1) 1209 ret double %1 1210} 1211 1212define double @test_v16f64(double %a0, <16 x double> %a1) { 1213; SSE2-LABEL: test_v16f64: 1214; SSE2: # %bb.0: 1215; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm8 1216; SSE2-NEXT: addsd %xmm1, %xmm0 1217; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] 1218; SSE2-NEXT: addsd %xmm1, %xmm0 1219; SSE2-NEXT: addsd %xmm2, %xmm0 1220; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] 1221; SSE2-NEXT: addsd %xmm2, %xmm0 1222; SSE2-NEXT: addsd %xmm3, %xmm0 1223; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] 1224; SSE2-NEXT: addsd %xmm3, %xmm0 1225; SSE2-NEXT: addsd %xmm4, %xmm0 1226; SSE2-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] 1227; SSE2-NEXT: addsd %xmm4, %xmm0 1228; SSE2-NEXT: addsd %xmm5, %xmm0 1229; SSE2-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1,1] 1230; SSE2-NEXT: addsd %xmm5, %xmm0 1231; SSE2-NEXT: addsd %xmm6, %xmm0 1232; SSE2-NEXT: 
unpckhpd {{.*#+}} xmm6 = xmm6[1,1] 1233; SSE2-NEXT: addsd %xmm6, %xmm0 1234; SSE2-NEXT: addsd %xmm7, %xmm0 1235; SSE2-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1] 1236; SSE2-NEXT: addsd %xmm7, %xmm0 1237; SSE2-NEXT: addsd %xmm8, %xmm0 1238; SSE2-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1,1] 1239; SSE2-NEXT: addsd %xmm8, %xmm0 1240; SSE2-NEXT: retq 1241; 1242; SSE41-LABEL: test_v16f64: 1243; SSE41: # %bb.0: 1244; SSE41-NEXT: addsd %xmm1, %xmm0 1245; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] 1246; SSE41-NEXT: addsd %xmm1, %xmm0 1247; SSE41-NEXT: addsd %xmm2, %xmm0 1248; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] 1249; SSE41-NEXT: addsd %xmm2, %xmm0 1250; SSE41-NEXT: addsd %xmm3, %xmm0 1251; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] 1252; SSE41-NEXT: addsd %xmm3, %xmm0 1253; SSE41-NEXT: addsd %xmm4, %xmm0 1254; SSE41-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] 1255; SSE41-NEXT: addsd %xmm4, %xmm0 1256; SSE41-NEXT: addsd %xmm5, %xmm0 1257; SSE41-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1,1] 1258; SSE41-NEXT: addsd %xmm5, %xmm0 1259; SSE41-NEXT: addsd %xmm6, %xmm0 1260; SSE41-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1] 1261; SSE41-NEXT: addsd %xmm6, %xmm0 1262; SSE41-NEXT: addsd %xmm7, %xmm0 1263; SSE41-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1] 1264; SSE41-NEXT: addsd %xmm7, %xmm0 1265; SSE41-NEXT: addsd {{[0-9]+}}(%rsp), %xmm0 1266; SSE41-NEXT: addsd {{[0-9]+}}(%rsp), %xmm0 1267; SSE41-NEXT: retq 1268; 1269; AVX-LABEL: test_v16f64: 1270; AVX: # %bb.0: 1271; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1272; AVX-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0] 1273; AVX-NEXT: vaddsd %xmm5, %xmm0, %xmm0 1274; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 1275; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1276; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1277; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1278; AVX-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1279; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] 1280; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1281; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1 1282; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1283; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1284; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1285; AVX-NEXT: vaddsd %xmm3, %xmm0, %xmm0 1286; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm3[1,0] 1287; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1288; AVX-NEXT: vextractf128 $1, %ymm3, %xmm1 1289; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1290; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1291; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1292; AVX-NEXT: vaddsd %xmm4, %xmm0, %xmm0 1293; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm4[1,0] 1294; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1295; AVX-NEXT: vextractf128 $1, %ymm4, %xmm1 1296; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1297; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1298; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1299; AVX-NEXT: vzeroupper 1300; AVX-NEXT: retq 1301; 1302; AVX512-LABEL: test_v16f64: 1303; AVX512: # %bb.0: 1304; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1305; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] 1306; AVX512-NEXT: vaddsd %xmm3, %xmm0, %xmm0 1307; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm3 1308; AVX512-NEXT: vaddsd %xmm3, %xmm0, %xmm0 1309; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] 1310; AVX512-NEXT: vaddsd %xmm3, %xmm0, %xmm0 1311; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm3 1312; AVX512-NEXT: vaddsd %xmm3, %xmm0, %xmm0 1313; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] 1314; AVX512-NEXT: vaddsd %xmm3, %xmm0, %xmm0 1315; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1 1316; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1317; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 
1318; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1319; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1320; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] 1321; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1322; AVX512-NEXT: vextractf128 $1, %ymm2, %xmm1 1323; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1324; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1325; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1326; AVX512-NEXT: vextractf32x4 $2, %zmm2, %xmm1 1327; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1328; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1329; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1330; AVX512-NEXT: vextractf32x4 $3, %zmm2, %xmm1 1331; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1332; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1333; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1334; AVX512-NEXT: vzeroupper 1335; AVX512-NEXT: retq 1336 %1 = call double @llvm.vector.reduce.fadd.f64.v16f64(double %a0, <16 x double> %a1) 1337 ret double %1 1338} 1339 1340; 1341; vXf64 (zero) 1342; 1343 1344define double @test_v2f64_zero(<2 x double> %a0) { 1345; SSE-LABEL: test_v2f64_zero: 1346; SSE: # %bb.0: 1347; SSE-NEXT: movapd %xmm0, %xmm1 1348; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 1349; SSE-NEXT: addsd %xmm0, %xmm1 1350; SSE-NEXT: movapd %xmm1, %xmm0 1351; SSE-NEXT: retq 1352; 1353; AVX1-SLOW-LABEL: test_v2f64_zero: 1354; AVX1-SLOW: # %bb.0: 1355; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1356; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1357; AVX1-SLOW-NEXT: retq 1358; 1359; AVX1-FAST-LABEL: test_v2f64_zero: 1360; AVX1-FAST: # %bb.0: 1361; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 1362; AVX1-FAST-NEXT: retq 1363; 1364; AVX2-LABEL: test_v2f64_zero: 1365; AVX2: # %bb.0: 1366; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1367; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1368; AVX2-NEXT: retq 1369; 1370; AVX512-LABEL: test_v2f64_zero: 1371; AVX512: # %bb.0: 1372; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1373; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1374; AVX512-NEXT: retq 1375 %1 = call double @llvm.vector.reduce.fadd.f64.v2f64(double -0.0, <2 x double> %a0) 1376 ret double %1 1377} 1378 1379define double @test_v4f64_zero(<4 x double> %a0) { 1380; SSE-LABEL: test_v4f64_zero: 1381; SSE: # %bb.0: 1382; SSE-NEXT: movapd %xmm0, %xmm2 1383; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 1384; SSE-NEXT: addsd %xmm0, %xmm2 1385; SSE-NEXT: addsd %xmm1, %xmm2 1386; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] 1387; SSE-NEXT: addsd %xmm1, %xmm2 1388; SSE-NEXT: movapd %xmm2, %xmm0 1389; SSE-NEXT: retq 1390; 1391; AVX1-SLOW-LABEL: test_v4f64_zero: 1392; AVX1-SLOW: # %bb.0: 1393; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1394; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm1 1395; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 1396; AVX1-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm1 1397; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 1398; AVX1-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0 1399; AVX1-SLOW-NEXT: vzeroupper 1400; AVX1-SLOW-NEXT: retq 1401; 1402; AVX1-FAST-LABEL: test_v4f64_zero: 1403; AVX1-FAST: # %bb.0: 1404; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm1 1405; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 1406; AVX1-FAST-NEXT: vaddsd %xmm0, %xmm1, %xmm1 1407; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 1408; AVX1-FAST-NEXT: vaddsd %xmm0, %xmm1, %xmm0 1409; AVX1-FAST-NEXT: vzeroupper 1410; AVX1-FAST-NEXT: retq 1411; 1412; AVX2-LABEL: test_v4f64_zero: 1413; AVX2: # %bb.0: 1414; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1415; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm1 1416; 
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vaddsd %xmm0, %xmm1, %xmm1
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v4f64_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call double @llvm.vector.reduce.fadd.f64.v4f64(double -0.0, <4 x double> %a0)
  ret double %1
}

define double @test_v8f64_zero(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_zero:
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm0, %xmm4
; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
; SSE-NEXT: addsd %xmm0, %xmm4
; SSE-NEXT: addsd %xmm1, %xmm4
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: addsd %xmm1, %xmm4
; SSE-NEXT: addsd %xmm2, %xmm4
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT: addsd %xmm2, %xmm4
; SSE-NEXT: addsd %xmm3, %xmm4
; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT: addsd %xmm3, %xmm4
; SSE-NEXT: movapd %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v8f64_zero:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm0, %xmm2
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-SLOW-NEXT: vaddsd %xmm0, %xmm2, %xmm2
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm0, %xmm2, %xmm0
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v8f64_zero:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm2
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-FAST-NEXT: vaddsd %xmm0, %xmm2, %xmm2
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-FAST-NEXT: vaddsd %xmm0, %xmm2, %xmm0
; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-FAST-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v8f64_zero:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX2-NEXT: vaddsd %xmm2, %xmm0, %xmm2
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vaddsd %xmm0, %xmm2, %xmm2
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vaddsd %xmm0, %xmm2, %xmm0
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8f64_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call double @llvm.vector.reduce.fadd.f64.v8f64(double -0.0, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_zero(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_zero:
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm0, %xmm8
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: addsd %xmm8, %xmm0
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: addsd %xmm2, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT: addsd %xmm2, %xmm0
; SSE-NEXT: addsd %xmm3, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT: addsd %xmm3, %xmm0
; SSE-NEXT: addsd %xmm4, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
; SSE-NEXT: addsd %xmm4, %xmm0
; SSE-NEXT: addsd %xmm5, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1,1]
; SSE-NEXT: addsd %xmm5, %xmm0
; SSE-NEXT: addsd %xmm6, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
; SSE-NEXT: addsd %xmm6, %xmm0
; SSE-NEXT: addsd %xmm7, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
; SSE-NEXT: addsd %xmm7, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v16f64_zero:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm4, %xmm0, %xmm4
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-SLOW-NEXT: vaddsd %xmm0, %xmm4, %xmm4
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm0, %xmm4, %xmm0
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm4, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm1
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vaddsd %xmm3, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm3[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm1
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v16f64_zero:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm4
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-FAST-NEXT: vaddsd %xmm0, %xmm4, %xmm4
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-FAST-NEXT: vaddsd %xmm0, %xmm4, %xmm0
; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX1-FAST-NEXT: vaddsd %xmm4, %xmm0, %xmm0
; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vextractf128 $1, %ymm2, %xmm1
; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vaddsd %xmm3, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm3[1,0]
; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vextractf128 $1, %ymm3, %xmm1
; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v16f64_zero:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX2-NEXT: vaddsd %xmm4, %xmm0, %xmm4
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vaddsd %xmm0, %xmm4, %xmm4
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vaddsd %xmm0, %xmm4, %xmm0
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX2-NEXT: vaddsd %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm1
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vaddsd %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm3[1,0]
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm1
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16f64_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm2
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm3
; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT: vaddsd %xmm0, %xmm2, %xmm2
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm0, %xmm2, %xmm0
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call double @llvm.vector.reduce.fadd.f64.v16f64(double -0.0, <16 x double> %a0)
  ret double %1
}
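; Note: -0.0 is the identity value for fadd, so the -0.0 accumulator used by
; the _zero tests above can legally be dropped; the checks start the
; sequential reduction directly with the element-wise adds.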

;
; vXf64 (undef)
;

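; With an undef start value the initial accumulation folds away to a constant
; (presumably from simplifying an fadd with an undef operand), so the checks
; below match a constant-pool operand ({{.*}}(%rip)) in place of the first add.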
define double @test_v2f64_undef(<2 x double> %a0) {
; SSE-LABEL: test_v2f64_undef:
; SSE: # %bb.0:
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: addsd {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2f64_undef:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT: vaddsd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vaddsd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call double @llvm.vector.reduce.fadd.f64.v2f64(double undef, <2 x double> %a0)
  ret double %1
}

define double @test_v4f64_undef(<4 x double> %a0) {
; SSE-LABEL: test_v4f64_undef:
; SSE: # %bb.0:
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: addsd {{.*}}(%rip), %xmm0
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4f64_undef:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddsd {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm1
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddsd {{.*}}(%rip), %xmm1, %xmm1
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call double @llvm.vector.reduce.fadd.f64.v4f64(double undef, <4 x double> %a0)
  ret double %1
}

define double @test_v8f64_undef(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_undef:
; SSE: # %bb.0:
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: addsd {{.*}}(%rip), %xmm0
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: addsd %xmm2, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT: addsd %xmm2, %xmm0
; SSE-NEXT: addsd %xmm3, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT: addsd %xmm3, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v8f64_undef:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT: vaddsd {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vaddsd %xmm0, %xmm2, %xmm2
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT: vaddsd %xmm0, %xmm2, %xmm0
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddsd {{.*}}(%rip), %xmm1, %xmm1
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call double @llvm.vector.reduce.fadd.f64.v8f64(double undef, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_undef(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_undef:
; SSE: # %bb.0:
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: addsd {{.*}}(%rip), %xmm0
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: addsd %xmm2, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT: addsd %xmm2, %xmm0
; SSE-NEXT: addsd %xmm3, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT: addsd %xmm3, %xmm0
; SSE-NEXT: addsd %xmm4, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
; SSE-NEXT: addsd %xmm4, %xmm0
; SSE-NEXT: addsd %xmm5, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1,1]
; SSE-NEXT: addsd %xmm5, %xmm0
; SSE-NEXT: addsd %xmm6, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
; SSE-NEXT: addsd %xmm6, %xmm0
; SSE-NEXT: addsd %xmm7, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
; SSE-NEXT: addsd %xmm7, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v16f64_undef:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX-NEXT: vaddsd {{.*}}(%rip), %xmm4, %xmm4
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vaddsd %xmm0, %xmm4, %xmm4
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT: vaddsd %xmm0, %xmm4, %xmm0
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX-NEXT: vaddsd %xmm4, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddsd %xmm3, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm3[1,0]
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm3, %xmm1
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v16f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT: vaddsd {{.*}}(%rip), %xmm2, %xmm2
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm3
; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT: vaddsd %xmm0, %xmm2, %xmm2
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm0, %xmm2, %xmm0
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call double @llvm.vector.reduce.fadd.f64.v16f64(double undef, <16 x double> %a0)
  ret double %1
}

declare float @llvm.vector.reduce.fadd.f32.v2f32(float, <2 x float>)
declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
declare float @llvm.vector.reduce.fadd.f32.v16f32(float, <16 x float>)

declare double @llvm.vector.reduce.fadd.f64.v2f64(double, <2 x double>)
declare double @llvm.vector.reduce.fadd.f64.v4f64(double, <4 x double>)
declare double @llvm.vector.reduce.fadd.f64.v8f64(double, <8 x double>)
declare double @llvm.vector.reduce.fadd.f64.v16f64(double, <16 x double>)
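; The declarations above are the ordered form of the fadd reduction: the scalar
; operand is the start value and, without the 'reassoc' flag, the elements are
; accumulated strictly in order, which is why the lowerings in this file are
; long scalar add chains rather than tree reductions. As a rough sketch
; (hypothetical IR, not part of the test), the v2f64 form is equivalent to the
; following sequential expansion:
;
;   %e0 = extractelement <2 x double> %v, i64 0
;   %a0 = fadd double %start, %e0
;   %e1 = extractelement <2 x double> %v, i64 1
;   %r  = fadd double %a0, %e1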