; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+sse4.1 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE41
; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=KNL

; Codegen test for the @llvm.x86.sse41.blendvpd intrinsic.
; The SSE4.1 run expects xmm2 to be moved into xmm0 ahead of a two-operand
; blendvpd; the KNL run expects a single four-operand vblendvpd.
define <2 x double> @test_x86_sse41_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
; SSE41-LABEL: test_x86_sse41_blendvpd:
; SSE41:       ## BB#0:
; SSE41-NEXT:    movapd %xmm0, %xmm3
; SSE41-NEXT:    movaps %xmm2, %xmm0
; SSE41-NEXT:    blendvpd %xmm1, %xmm3
; SSE41-NEXT:    movapd %xmm3, %xmm0
; SSE41-NEXT:    retl
;
; KNL-LABEL: test_x86_sse41_blendvpd:
; KNL:       ## BB#0:
; KNL-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
; KNL-NEXT:    retl
  %res = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone


; Codegen test for the @llvm.x86.sse41.blendvps intrinsic.
; The SSE4.1 run expects xmm2 to be moved into xmm0 ahead of a two-operand
; blendvps; the KNL run expects a single four-operand vblendvps.
define <4 x float> @test_x86_sse41_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
; SSE41-LABEL: test_x86_sse41_blendvps:
; SSE41:       ## BB#0:
; SSE41-NEXT:    movaps %xmm0, %xmm3
; SSE41-NEXT:    movaps %xmm2, %xmm0
; SSE41-NEXT:    blendvps %xmm1, %xmm3
; SSE41-NEXT:    movaps %xmm3, %xmm0
; SSE41-NEXT:    retl
;
; KNL-LABEL: test_x86_sse41_blendvps:
; KNL:       ## BB#0:
; KNL-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; KNL-NEXT:    retl
  %res = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone


; Codegen test for the @llvm.x86.sse41.dppd intrinsic, called with immediate 7.
; Expects one dppd (SSE4.1 run) or vdppd (KNL run) carrying that immediate.
define <2 x double> @test_x86_sse41_dppd(<2 x double> %a0, <2 x double> %a1) {
; SSE41-LABEL: test_x86_sse41_dppd:
; SSE41:       ## BB#0:
; SSE41-NEXT:    dppd $7, %xmm1, %xmm0
; SSE41-NEXT:    retl
;
; KNL-LABEL: test_x86_sse41_dppd:
; KNL:       ## BB#0:
; KNL-NEXT:    vdppd $7, %xmm1, %xmm0, %xmm0
; KNL-NEXT:    retl
  %res = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwind readnone


; Codegen test for the @llvm.x86.sse41.dpps intrinsic, called with immediate 7.
; Expects one dpps (SSE4.1 run) or vdpps (KNL run) carrying that immediate.
define <4 x float> @test_x86_sse41_dpps(<4 x float> %a0, <4 x float> %a1) {
; SSE41-LABEL: test_x86_sse41_dpps:
; SSE41:       ## BB#0:
; SSE41-NEXT:    dpps $7, %xmm1, %xmm0
; SSE41-NEXT:    retl
;
; KNL-LABEL: test_x86_sse41_dpps:
; KNL:       ## BB#0:
; KNL-NEXT:    vdpps $7, %xmm1, %xmm0, %xmm0
; KNL-NEXT:    retl
  %res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind readnone


; Codegen test for the @llvm.x86.sse41.insertps intrinsic, called with
; immediate 17. The expected shuffle comment shows the result as
; zero, xmm1[0], xmm0[2,3] for both the SSE4.1 and KNL runs.
define <4 x float> @test_x86_sse41_insertps(<4 x float> %a0, <4 x float> %a1) {
; SSE41-LABEL: test_x86_sse41_insertps:
; SSE41:       ## BB#0:
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3]
; SSE41-NEXT:    retl
;
; KNL-LABEL: test_x86_sse41_insertps:
; KNL:       ## BB#0:
; KNL-NEXT:    vinsertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3]
; KNL-NEXT:    retl
  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 17)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone



; Codegen test for the @llvm.x86.sse41.mpsadbw intrinsic, called with
; immediate 7. Expects one mpsadbw (SSE4.1 run) or vmpsadbw (KNL run).
define <8 x i16> @test_x86_sse41_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) {
; SSE41-LABEL: test_x86_sse41_mpsadbw:
; SSE41:       ## BB#0:
; SSE41-NEXT:    mpsadbw $7, %xmm1, %xmm0
; SSE41-NEXT:    retl
;
; KNL-LABEL: test_x86_sse41_mpsadbw:
; KNL:       ## BB#0:
; KNL-NEXT:    vmpsadbw $7, %xmm1, %xmm0, %xmm0
; KNL-NEXT:    retl
  %res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i8 7)
  ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind readnone


; Codegen test for the @llvm.x86.sse41.packusdw intrinsic.
; Expects one packusdw (SSE4.1 run) or vpackusdw (KNL run).
define <8 x i16> @test_x86_sse41_packusdw(<4 x i32> %a0, <4 x i32> %a1) {
; SSE41-LABEL: test_x86_sse41_packusdw:
; SSE41:       ## BB#0:
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    retl
;
; KNL-LABEL: test_x86_sse41_packusdw:
; KNL:       ## BB#0:
; KNL-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; KNL-NEXT:    retl
  %res = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
  ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone


; Codegen test for the @llvm.x86.sse41.pblendvb intrinsic.
; The SSE4.1 run expects xmm2 to be moved into xmm0 ahead of a two-operand
; pblendvb; the KNL run expects a single four-operand vpblendvb.
define <16 x i8> @test_x86_sse41_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) {
; SSE41-LABEL: test_x86_sse41_pblendvb:
; SSE41:       ## BB#0:
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    movaps %xmm2, %xmm0
; SSE41-NEXT:    pblendvb %xmm1, %xmm3
; SSE41-NEXT:    movdqa %xmm3, %xmm0
; SSE41-NEXT:    retl
;
; KNL-LABEL: test_x86_sse41_pblendvb:
; KNL:       ## BB#0:
; KNL-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; KNL-NEXT:    retl
  %res = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2)
  ret <16 x i8> %res
}
declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone


; Codegen test for the @llvm.x86.sse41.phminposuw intrinsic.
; Expects one phminposuw (SSE4.1 run) or vphminposuw (KNL run) on xmm0.
define <8 x i16> @test_x86_sse41_phminposuw(<8 x i16> %a0) {
; SSE41-LABEL: test_x86_sse41_phminposuw:
; SSE41:       ## BB#0:
; SSE41-NEXT:    phminposuw %xmm0, %xmm0
; SSE41-NEXT:    retl
;
; KNL-LABEL: test_x86_sse41_phminposuw:
; KNL:       ## BB#0:
; KNL-NEXT:    vphminposuw %xmm0, %xmm0
; KNL-NEXT:    retl
  %res = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %a0)
  ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16>) nounwind readnone


; Codegen test for the @llvm.x86.sse41.pmaxsb intrinsic.
; Expects one pmaxsb (SSE4.1 run) or vpmaxsb (KNL run).
define <16 x i8> @test_x86_sse41_pmaxsb(<16 x i8> %a0, <16 x i8> %a1) {
; SSE41-LABEL: test_x86_sse41_pmaxsb:
; SSE41:       ## BB#0:
; SSE41-NEXT:    pmaxsb %xmm1, %xmm0
; SSE41-NEXT:    retl
;
; KNL-LABEL: test_x86_sse41_pmaxsb:
; KNL:       ## BB#0:
; KNL-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
; KNL-NEXT:    retl
  %res = call <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8> %a0, <16 x i8> %a1)
  ret <16 x i8> %res
}
declare <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8>, <16 x i8>) nounwind readnone


; Codegen test for the @llvm.x86.sse41.pmaxsd intrinsic.
; Expects one pmaxsd (SSE4.1 run) or vpmaxsd (KNL run).
define <4 x i32> @test_x86_sse41_pmaxsd(<4 x i32> %a0, <4 x i32> %a1) {
; SSE41-LABEL: test_x86_sse41_pmaxsd:
; SSE41:       ## BB#0:
; SSE41-NEXT:    pmaxsd %xmm1, %xmm0
; SSE41-NEXT:    retl
;
; KNL-LABEL: test_x86_sse41_pmaxsd:
; KNL:       ## BB#0:
; KNL-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; KNL-NEXT:    retl
  %res = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %a0, <4 x i32> %a1)
  ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone


; Codegen test for the @llvm.x86.sse41.pmaxud intrinsic.
; Expects one pmaxud (SSE4.1 run) or vpmaxud (KNL run).
define <4 x i32> @test_x86_sse41_pmaxud(<4 x i32> %a0, <4 x i32> %a1) {
; SSE41-LABEL: test_x86_sse41_pmaxud:
; SSE41:       ## BB#0:
; SSE41-NEXT:    pmaxud %xmm1, %xmm0
; SSE41-NEXT:    retl
;
; KNL-LABEL: test_x86_sse41_pmaxud:
; KNL:       ## BB#0:
; KNL-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
; KNL-NEXT:    retl
  %res = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %a0, <4 x i32> %a1)
  ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone


; Codegen test for the @llvm.x86.sse41.pmaxuw intrinsic.
; Expects one pmaxuw (SSE4.1 run) or vpmaxuw (KNL run).
define <8 x i16> @test_x86_sse41_pmaxuw(<8 x i16> %a0, <8 x i16> %a1) {
; SSE41-LABEL: test_x86_sse41_pmaxuw:
; SSE41:       ## BB#0:
; SSE41-NEXT:    pmaxuw %xmm1, %xmm0
; SSE41-NEXT:    retl
;
; KNL-LABEL: test_x86_sse41_pmaxuw:
; KNL:       ## BB#0:
; KNL-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
; KNL-NEXT:    retl
  %res = call <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16> %a0, <8 x i16> %a1)
  ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16>, <8 x i16>) nounwind readnone


; Codegen test for the @llvm.x86.sse41.pminsb intrinsic.
; Expects one pminsb (SSE4.1 run) or vpminsb (KNL run).
define <16 x i8> @test_x86_sse41_pminsb(<16 x i8> %a0, <16 x i8> %a1) {
; SSE41-LABEL: test_x86_sse41_pminsb:
; SSE41:       ## BB#0:
; SSE41-NEXT:    pminsb %xmm1, %xmm0
; SSE41-NEXT:    retl
;
; KNL-LABEL: test_x86_sse41_pminsb:
; KNL:       ## BB#0:
; KNL-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
; KNL-NEXT:    retl
  %res = call <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8> %a0, <16 x i8> %a1)
  ret <16 x i8> %res
}
declare <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8>, <16 x i8>) nounwind readnone


; Codegen test for the @llvm.x86.sse41.pminsd intrinsic.
; Expects one pminsd (SSE4.1 run) or vpminsd (KNL run).
define <4 x i32> @test_x86_sse41_pminsd(<4 x i32> %a0, <4 x i32> %a1) {
; SSE41-LABEL: test_x86_sse41_pminsd:
; SSE41:       ## BB#0:
; SSE41-NEXT:    pminsd %xmm1, %xmm0
; SSE41-NEXT:    retl
;
; KNL-LABEL: test_x86_sse41_pminsd:
; KNL:       ## BB#0:
; KNL-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
; KNL-NEXT:    retl
  %res = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %a0, <4 x i32> %a1)
  ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone


; Codegen test for the @llvm.x86.sse41.pminud intrinsic.
; Expects one pminud (SSE4.1 run) or vpminud (KNL run).
define <4 x i32> @test_x86_sse41_pminud(<4 x i32> %a0, <4 x i32> %a1) {
; SSE41-LABEL: test_x86_sse41_pminud:
; SSE41:       ## BB#0:
; SSE41-NEXT:    pminud %xmm1, %xmm0
; SSE41-NEXT:    retl
;
; KNL-LABEL: test_x86_sse41_pminud:
; KNL:       ## BB#0:
; KNL-NEXT:    vpminud %xmm1, %xmm0, %xmm0
; KNL-NEXT:    retl
  %res = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %a0, <4 x i32> %a1)
  ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone


; Codegen test for the @llvm.x86.sse41.pminuw intrinsic.
; Expects one pminuw (SSE4.1 run) or vpminuw (KNL run).
define <8 x i16> @test_x86_sse41_pminuw(<8 x i16> %a0, <8 x i16> %a1) {
; SSE41-LABEL: test_x86_sse41_pminuw:
; SSE41:       ## BB#0:
; SSE41-NEXT:    pminuw %xmm1, %xmm0
; SSE41-NEXT:    retl
;
; KNL-LABEL: test_x86_sse41_pminuw:
; KNL:       ## BB#0:
; KNL-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
; KNL-NEXT:    retl
  %res = call <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16> %a0, <8 x i16> %a1)
  ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16>, <8 x i16>) nounwind readnone


; Codegen test for the @llvm.x86.sse41.pmuldq intrinsic
; (<4 x i32> operands, <2 x i64> result).
; Expects one pmuldq (SSE4.1 run) or vpmuldq (KNL run).
define <2 x i64> @test_x86_sse41_pmuldq(<4 x i32> %a0, <4 x i32> %a1) {
; SSE41-LABEL: test_x86_sse41_pmuldq:
; SSE41:       ## BB#0:
; SSE41-NEXT:    pmuldq %xmm1, %xmm0
; SSE41-NEXT:    retl
;
; KNL-LABEL: test_x86_sse41_pmuldq:
; KNL:       ## BB#0:
; KNL-NEXT:    vpmuldq %xmm1, %xmm0, %xmm0
; KNL-NEXT:    retl
  %res = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %a0, <4 x i32> %a1)
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32>, <4 x i32>) nounwind readnone


; Codegen test for the @llvm.x86.sse41.ptestc intrinsic (i32 result).
; Expects ptest/vptest followed by an sbbl %eax, %eax and andl $1, %eax pair
; that produces the return value in eax.
define i32 @test_x86_sse41_ptestc(<2 x i64> %a0, <2 x i64> %a1) {
; SSE41-LABEL: test_x86_sse41_ptestc:
; SSE41:       ## BB#0:
; SSE41-NEXT:    ptest %xmm1, %xmm0
; SSE41-NEXT:    sbbl %eax, %eax
; SSE41-NEXT:    andl $1, %eax
; SSE41-NEXT:    retl
;
; KNL-LABEL: test_x86_sse41_ptestc:
; KNL:       ## BB#0:
; KNL-NEXT:    vptest %xmm1, %xmm0
; KNL-NEXT:    sbbl %eax, %eax
; KNL-NEXT:    andl $1, %eax
; KNL-NEXT:    retl
  %res = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone


; Codegen test for the @llvm.x86.sse41.ptestnzc intrinsic (i32 result).
; Expects eax to be zeroed with xorl before ptest/vptest, then seta %al.
define i32 @test_x86_sse41_ptestnzc(<2 x i64> %a0, <2 x i64> %a1) {
; SSE41-LABEL: test_x86_sse41_ptestnzc:
; SSE41:       ## BB#0:
; SSE41-NEXT:    xorl %eax, %eax
; SSE41-NEXT:    ptest %xmm1, %xmm0
; SSE41-NEXT:    seta %al
; SSE41-NEXT:    retl
;
; KNL-LABEL: test_x86_sse41_ptestnzc:
; KNL:       ## BB#0:
; KNL-NEXT:    xorl %eax, %eax
; KNL-NEXT:    vptest %xmm1, %xmm0
; KNL-NEXT:    seta %al
; KNL-NEXT:    retl
  %res = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %a0, <2 x i64> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone


; Codegen test for the @llvm.x86.sse41.ptestz intrinsic (i32 result).
; Expects eax to be zeroed with xorl before ptest/vptest, then sete %al.
define i32 @test_x86_sse41_ptestz(<2 x i64> %a0, <2 x i64> %a1) {
; SSE41-LABEL: test_x86_sse41_ptestz:
; SSE41:       ## BB#0:
; SSE41-NEXT:    xorl %eax, %eax
; SSE41-NEXT:    ptest %xmm1, %xmm0
; SSE41-NEXT:    sete %al
; SSE41-NEXT:    retl
;
; KNL-LABEL: test_x86_sse41_ptestz:
; KNL:       ## BB#0:
; KNL-NEXT:    xorl %eax, %eax
; KNL-NEXT:    vptest %xmm1, %xmm0
; KNL-NEXT:    sete %al
; KNL-NEXT:    retl
  %res = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %a0, <2 x i64> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone


; Codegen test for the @llvm.x86.sse41.round.pd intrinsic, called with
; immediate 7. Expects one roundpd (SSE4.1 run) or vroundpd (KNL run).
define <2 x double> @test_x86_sse41_round_pd(<2 x double> %a0) {
; SSE41-LABEL: test_x86_sse41_round_pd:
; SSE41:       ## BB#0:
; SSE41-NEXT:    roundpd $7, %xmm0, %xmm0
; SSE41-NEXT:    retl
;
; KNL-LABEL: test_x86_sse41_round_pd:
; KNL:       ## BB#0:
; KNL-NEXT:    vroundpd $7, %xmm0, %xmm0
; KNL-NEXT:    retl
  %res = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 7)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone


; Codegen test for the @llvm.x86.sse41.round.ps intrinsic, called with
; immediate 7. Expects one roundps (SSE4.1 run) or vroundps (KNL run).
define <4 x float> @test_x86_sse41_round_ps(<4 x float> %a0) {
; SSE41-LABEL: test_x86_sse41_round_ps:
; SSE41:       ## BB#0:
; SSE41-NEXT:    roundps $7, %xmm0, %xmm0
; SSE41-NEXT:    retl
;
; KNL-LABEL: test_x86_sse41_round_ps:
; KNL:       ## BB#0:
; KNL-NEXT:    vroundps $7, %xmm0, %xmm0
; KNL-NEXT:    retl
  %res = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 7)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone


; Codegen test for the @llvm.x86.sse41.round.sd intrinsic, called with
; immediate 7. Expects one roundsd (SSE4.1 run) or vroundsd (KNL run).
define <2 x double> @test_x86_sse41_round_sd(<2 x double> %a0, <2 x double> %a1) {
; SSE41-LABEL: test_x86_sse41_round_sd:
; SSE41:       ## BB#0:
; SSE41-NEXT:    roundsd $7, %xmm1, %xmm0
; SSE41-NEXT:    retl
;
; KNL-LABEL: test_x86_sse41_round_sd:
; KNL:       ## BB#0:
; KNL-NEXT:    vroundsd $7, %xmm1, %xmm0, %xmm0
; KNL-NEXT:    retl
  %res = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 7)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone


; Codegen test for the @llvm.x86.sse41.round.ss intrinsic, called with
; immediate 7. Expects one roundss (SSE4.1 run) or vroundss (KNL run).
define <4 x float> @test_x86_sse41_round_ss(<4 x float> %a0, <4 x float> %a1) {
; SSE41-LABEL: test_x86_sse41_round_ss:
; SSE41:       ## BB#0:
; SSE41-NEXT:    roundss $7, %xmm1, %xmm0
; SSE41-NEXT:    retl
;
; KNL-LABEL: test_x86_sse41_round_ss:
; KNL:       ## BB#0:
; KNL-NEXT:    vroundss $7, %xmm1, %xmm0, %xmm0
; KNL-NEXT:    retl
  %res = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 7)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
