; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7-avx | FileCheck %s

; We don't check any vinsertf128 variant with immediate 0 because that's just a blend.

define <4 x double> @test_x86_avx_vinsertf128_pd_256_1(<4 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL:       test_x86_avx_vinsertf128_pd_256_1:
; CHECK:             vinsertf128 $1, %xmm1, %ymm0, %ymm0
  %res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %a0, <2 x double> %a1, i8 1)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone

define <8 x float> @test_x86_avx_vinsertf128_ps_256_1(<8 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL:      test_x86_avx_vinsertf128_ps_256_1:
; CHECK:            vinsertf128 $1, %xmm1, %ymm0, %ymm0
  %res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %a0, <4 x float> %a1, i8 1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8) nounwind readnone

define <8 x i32> @test_x86_avx_vinsertf128_si_256_1(<8 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL:    test_x86_avx_vinsertf128_si_256_1:
; CHECK:          vinsertf128 $1, %xmm1, %ymm0, %ymm0
  %res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 1)
  ret <8 x i32> %res
}

; Verify that high bits of the immediate are masked off. This should be the equivalent
; of a vinsertf128 $0 which should be optimized into a blend, so just check that it's
; not a vinsertf128 $1.
define <8 x i32> @test_x86_avx_vinsertf128_si_256_2(<8 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL:    test_x86_avx_vinsertf128_si_256_2:
; CHECK-NOT:      vinsertf128 $1, %xmm1, %ymm0, %ymm0
  %res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 2)
  ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32>, <4 x i32>, i8) nounwind readnone

; We don't check any vextractf128 variant with immediate 0 because that's just a move.

define <2 x double> @test_x86_avx_vextractf128_pd_256_1(<4 x double> %a0) {
; CHECK-LABEL:       test_x86_avx_vextractf128_pd_256_1:
; CHECK:             vextractf128 $1, %ymm0, %xmm0
  %res = call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a0, i8 1)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double>, i8) nounwind readnone

define <4 x float> @test_x86_avx_vextractf128_ps_256_1(<8 x float> %a0) {
; CHECK-LABEL:       test_x86_avx_vextractf128_ps_256_1:
; CHECK:             vextractf128 $1, %ymm0, %xmm0
  %res = call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a0, i8 1)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone

define <4 x i32> @test_x86_avx_vextractf128_si_256_1(<8 x i32> %a0) {
; CHECK-LABEL:    test_x86_avx_vextractf128_si_256_1:
; CHECK:          vextractf128 $1, %ymm0, %xmm0
  %res = call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %a0, i8 1)
  ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32>, i8) nounwind readnone

; Verify that high bits of the immediate are masked off. This should be the equivalent
; of a vextractf128 $0 which should be optimized away, so just check that it's
; not a vextractf128 of any kind.
define <2 x double> @test_x86_avx_extractf128_pd_256_2(<4 x double> %a0) {
; CHECK-LABEL:       test_x86_avx_extractf128_pd_256_2:
; CHECK-NOT:         vextractf128
  %res = call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a0, i8 2)
  ret <2 x double> %res
}


define <4 x double> @test_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL:       test_x86_avx_blend_pd_256:
; CHECK:             vblendpd
  %res = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i32 7) ; <<4 x double>> [#uses=1]
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double>, <4 x double>, i32) nounwind readnone


define <8 x float> @test_x86_avx_blend_ps_256(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL:      test_x86_avx_blend_ps_256:
; CHECK:            vblendps
  %res = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %a0, <8 x float> %a1, i32 7) ; <<8 x float>> [#uses=1]
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float>, <8 x float>, i32) nounwind readnone


define <8 x float> @test_x86_avx_dp_ps_256(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL:      test_x86_avx_dp_ps_256:
; CHECK:            vdpps
  %res = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i32 7) ; <<8 x float>> [#uses=1]
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i32) nounwind readnone


define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) {
; CHECK-LABEL:    test_x86_sse2_psll_dq:
; CHECK:          vpslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
  %res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 8) ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone


define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) {
; CHECK-LABEL:    test_x86_sse2_psrl_dq:
; CHECK:          vpsrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
  %res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 8) ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone


define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL:       test_x86_sse41_blendpd:
; CHECK:             vblendpd
  %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i8 2) ; <<2 x double>> [#uses=1]
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nounwind readnone


define <4 x float> @test_x86_sse41_blendps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL:      test_x86_sse41_blendps:
; CHECK:            vblendps
  %res = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwind readnone


define <8 x i16> @test_x86_sse41_pblendw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL:    test_x86_sse41_pblendw:
; CHECK:          vpblendw
  %res = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i8 7) ; <<8 x i16>> [#uses=1]
  ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone


define <4 x i32> @test_x86_sse41_pmovsxbd(<16 x i8> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxbd:
; CHECK:       # BB#0:
; CHECK-NEXT:    vpmovsxbd %xmm0, %xmm0
; CHECK-NEXT:    retl
  %res = call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %a0) ; <<4 x i32>> [#uses=1]
  ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone


define <2 x i64> @test_x86_sse41_pmovsxbq(<16 x i8> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxbq:
; CHECK:       # BB#0:
; CHECK-NEXT:    vpmovsxbq %xmm0, %xmm0
; CHECK-NEXT:    retl
  %res = call <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8> %a0) ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8>) nounwind readnone


define <8 x i16> @test_x86_sse41_pmovsxbw(<16 x i8> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxbw:
; CHECK:       # BB#0:
; CHECK-NEXT:    vpmovsxbw %xmm0, %xmm0
; CHECK-NEXT:    retl
  %res = call <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1]
  ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8>) nounwind readnone


define <2 x i64> @test_x86_sse41_pmovsxdq(<4 x i32> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxdq:
; CHECK:       # BB#0:
; CHECK-NEXT:    vpmovsxdq %xmm0, %xmm0
; CHECK-NEXT:    retl
  %res = call <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32> %a0) ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32>) nounwind readnone


define <4 x i32> @test_x86_sse41_pmovsxwd(<8 x i16> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxwd:
; CHECK:       # BB#0:
; CHECK-NEXT:    vpmovsxwd %xmm0, %xmm0
; CHECK-NEXT:    retl
  %res = call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %a0) ; <<4 x i32>> [#uses=1]
  ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone


define <2 x i64> @test_x86_sse41_pmovsxwq(<8 x i16> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxwq:
; CHECK:       # BB#0:
; CHECK-NEXT:    vpmovsxwq %xmm0, %xmm0
; CHECK-NEXT:    retl
  %res = call <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16> %a0) ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16>) nounwind readnone
