1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s
3
4; rdar://11314175: SD Scheduler, BuildSchedUnits assert:
5;                  N->getNodeId() == -1 && "Node already inserted!
6
7define void @func() nounwind ssp {
8; CHECK-LABEL: func:
9; CHECK:       ## BB#0:
10; CHECK-NEXT:    vmovups 0, %xmm0
11; CHECK-NEXT:    vxorps %ymm1, %ymm1, %ymm1
12; CHECK-NEXT:    vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,2,3,3]
14; CHECK-NEXT:    vbroadcastss 32, %xmm3
15; CHECK-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
16; CHECK-NEXT:    vmulps %ymm0, %ymm2, %ymm2
17; CHECK-NEXT:    vmulps %ymm0, %ymm0, %ymm0
18; CHECK-NEXT:    vaddps %ymm0, %ymm2, %ymm0
19; CHECK-NEXT:    vaddps %ymm0, %ymm0, %ymm0
20; CHECK-NEXT:    vmulps %xmm0, %xmm0, %xmm0
21; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
22; CHECK-NEXT:    vaddps %ymm0, %ymm0, %ymm0
23; CHECK-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
24; CHECK-NEXT:    vsubps %ymm0, %ymm0, %ymm0
25; CHECK-NEXT:    vhaddps %ymm0, %ymm1, %ymm0
26; CHECK-NEXT:    vmovaps %ymm0, (%rax)
27; CHECK-NEXT:    vzeroupper
28; CHECK-NEXT:    retq
29  %tmp = load <4 x float>, <4 x float>* null, align 1
30  %tmp14 = getelementptr <4 x float>, <4 x float>* null, i32 2
31  %tmp15 = load <4 x float>, <4 x float>* %tmp14, align 1
32  %tmp16 = shufflevector <4 x float> %tmp, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
33  %tmp17 = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %tmp16, <4 x float> undef, i8 1)
34  %tmp18 = bitcast <4 x float> %tmp to <16 x i8>
35  %tmp19 = shufflevector <16 x i8> %tmp18, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
36  %tmp20 = bitcast <16 x i8> %tmp19 to <4 x float>
37  %tmp21 = bitcast <4 x float> %tmp15 to <16 x i8>
38  %tmp22 = shufflevector <16 x i8> undef, <16 x i8> %tmp21, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
39  %tmp23 = bitcast <16 x i8> %tmp22 to <4 x float>
40  %tmp24 = shufflevector <4 x float> %tmp20, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
41  %tmp25 = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %tmp24, <4 x float> %tmp23, i8 1)
42  %tmp26 = fmul <8 x float> %tmp17, undef
43  %tmp27 = fmul <8 x float> %tmp25, undef
44  %tmp28 = fadd <8 x float> %tmp26, %tmp27
45  %tmp29 = fadd <8 x float> %tmp28, undef
46  %tmp30 = shufflevector <8 x float> %tmp29, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
47  %tmp31 = fmul <4 x float> undef, %tmp30
48  %tmp32 = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> zeroinitializer, <4 x float> %tmp31, i8 1)
49  %tmp33 = fadd <8 x float> undef, %tmp32
50  %tmp34 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %tmp33, <8 x float> undef) nounwind
51  %tmp35 = fsub <8 x float> %tmp34, undef
52  %tmp36 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> zeroinitializer, <8 x float> %tmp35) nounwind
53  store <8 x float> %tmp36, <8 x float>* undef, align 32
54  ret void
55}
56
57declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8) nounwind readnone
58
59declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone
60