1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s
3
4; rdar://11314175: SD Scheduler, BuildSchedUnits assert:
5;                  N->getNodeId() == -1 && "Node already inserted!"
6
; @func is the only function in this test; presumably it is the reduced
; reproducer for the scheduler assertion cited at the top of the file.
; The CHECK lines are autogenerated (see the NOTE on the first line); regenerate
; them with utils/update_llc_test_checks.py instead of editing them by hand.
;
; Operands as used below:
;   %a  - 128-bit value passed to the first vinsertf128 call (%tmp17).
;   %b  - second operand of the byte shuffle producing %tmp19.
;   %c  - first operand of the byte shuffle producing %tmp22.
;   %d  - second operand of the shuffle producing %tmp30; no mask index selects it.
;   %e  - second operand of the first hadd call (%tmp34).
;   %f  - destination pointer of the final 32-byte-aligned store.
7define void @func(<4 x float> %a, <16 x i8> %b, <16 x i8> %c, <8 x float> %d, <8 x float> %e, <8 x float>* %f) nounwind ssp {
8; CHECK-LABEL: func:
9; CHECK:       ## %bb.0:
10; CHECK-NEXT:    vmovdqu 0, %xmm0
11; CHECK-NEXT:    vpalignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
12; CHECK-NEXT:    vmulps %xmm1, %xmm1, %xmm1
13; CHECK-NEXT:    vmulps %xmm0, %xmm0, %xmm0
14; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm0
15; CHECK-NEXT:    vaddps %xmm0, %xmm0, %xmm0
16; CHECK-NEXT:    vmulps %xmm0, %xmm0, %xmm0
17; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
18; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
19; CHECK-NEXT:    vaddps %ymm0, %ymm0, %ymm0
20; CHECK-NEXT:    vhaddps %ymm4, %ymm0, %ymm0
21; CHECK-NEXT:    vsubps %ymm0, %ymm0, %ymm0
22; CHECK-NEXT:    vhaddps %ymm0, %ymm1, %ymm0
23; CHECK-NEXT:    vmovaps %ymm0, (%rdi)
24; CHECK-NEXT:    vzeroupper
25; CHECK-NEXT:    retq
  ; Two unaligned (align 1) loads from constant addresses: null itself, and
  ; null advanced by two <4 x float> elements.
26  %tmp = load <4 x float>, <4 x float>* null, align 1
27  %tmp14 = getelementptr <4 x float>, <4 x float>* null, i32 2
28  %tmp15 = load <4 x float>, <4 x float>* %tmp14, align 1
  ; Widen %tmp to eight lanes. Mask index 4 is lane 0 of the second operand
  ; (0.0), so all four upper lanes receive 0.0.
29  %tmp16 = shufflevector <4 x float> %tmp, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
30  %tmp17 = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %tmp16, <4 x float> %a, i8 1)
  ; Byte-level shuffle: bytes 4..15 of %tmp followed by bytes 0..3 of %b
  ; (mask indices 16..19 address the second operand).
31  %tmp18 = bitcast <4 x float> %tmp to <16 x i8>
32  %tmp19 = shufflevector <16 x i8> %tmp18, <16 x i8> %b, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
33  %tmp20 = bitcast <16 x i8> %tmp19 to <4 x float>
  ; Same mask again: bytes 4..15 of %c followed by bytes 0..3 of the second load.
34  %tmp21 = bitcast <4 x float> %tmp15 to <16 x i8>
35  %tmp22 = shufflevector <16 x i8> %c, <16 x i8> %tmp21, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
36  %tmp23 = bitcast <16 x i8> %tmp22 to <4 x float>
  ; Widen %tmp20 the same way as %tmp16, then pass %tmp23 to vinsertf128.
37  %tmp24 = shufflevector <4 x float> %tmp20, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
38  %tmp25 = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %tmp24, <4 x float> %tmp23, i8 1)
  ; Square both 8-lane values, add the squares, then double the sum.
39  %tmp26 = fmul <8 x float> %tmp17, %tmp17
40  %tmp27 = fmul <8 x float> %tmp25, %tmp25
41  %tmp28 = fadd <8 x float> %tmp26, %tmp27
42  %tmp29 = fadd <8 x float> %tmp28, %tmp28
  ; Keep only lanes 0..3 of %tmp29; no mask index refers to %d.
43  %tmp30 = shufflevector <8 x float> %tmp29, <8 x float> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
44  %tmp31 = fmul <4 x float> %tmp30, %tmp30
  ; Pass the squared 4-lane value to vinsertf128 with an all-zero 256-bit base.
45  %tmp32 = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> zeroinitializer, <4 x float> %tmp31, i8 1)
46  %tmp33 = fadd <8 x float> %tmp32, %tmp32
47  %tmp34 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %tmp33, <8 x float> %e) nounwind
  ; Subtract the hadd result from itself, then feed it as the second operand
  ; of another hadd whose first operand is all zeros.
48  %tmp35 = fsub <8 x float> %tmp34, %tmp34
49  %tmp36 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> zeroinitializer, <8 x float> %tmp35) nounwind
  ; The only store in the function: write the result through %f (align 32).
50  store <8 x float> %tmp36, <8 x float>* %f, align 32
51  ret void
52}
53
; X86 AVX intrinsic called three times by @func, always with an i8 1 immediate.
54declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8) nounwind readnone
55
; X86 AVX intrinsic called twice by @func.
56declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone
57