; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512F
;
; Combine tests involving SSE3/SSSE3 target shuffles (MOVDDUP, MOVSHDUP, MOVSLDUP, PSHUFB)

declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)

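; A PSHUFB mask byte with its MSB set zeroes that lane, so a chain of shuffles
; that eventually zeroes every lane should fold to a single zero vector.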
define <16 x i8> @combine_vpshufb_zero(<16 x i8> %a0) {
; SSE-LABEL: combine_vpshufb_zero:
; SSE:       # BB#0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vpshufb_zero:
; AVX:       # BB#0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 128, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
  %res1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res0, <16 x i8> <i8 0, i8 128, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
  %res2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res1, <16 x i8> <i8 0, i8 1, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
  ret <16 x i8> %res2
}

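; Interleaving the low 8 bytes with zero and then gathering the even lanes back
; together is equivalent to MOVQ (keep the low 64 bits, zero the upper half).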
define <16 x i8> @combine_vpshufb_movq(<16 x i8> %a0) {
; SSE-LABEL: combine_vpshufb_movq:
; SSE:       # BB#0:
; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vpshufb_movq:
; AVX:       # BB#0:
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT:    retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 128, i8 1, i8 128, i8 2, i8 128, i8 3, i8 128, i8 4, i8 128, i8 5, i8 128, i8 6, i8 128, i8 7, i8 128>)
  %res1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res0, <16 x i8> <i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 1, i8 3, i8 5, i8 7, i8 9, i8 11, i8 13, i8 15>)
  ret <16 x i8> %res1
}

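; MOVDDUP of a PSHUFB result should combine into a single PSHUFB that repeats
; the low 8 mask bytes.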
define <4 x float> @combine_pshufb_movddup(<4 x float> %a0) {
; SSE-LABEL: combine_pshufb_movddup:
; SSE:       # BB#0:
; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[5,5,5,5,7,7,7,7,5,5,5,5,7,7,7,7]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_movddup:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,5,5,5,7,7,7,7,5,5,5,5,7,7,7,7]
; AVX-NEXT:    retq
  %1 = bitcast <4 x float> %a0 to <16 x i8>
  %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 5, i8 5, i8 5, i8 5, i8 7, i8 7, i8 7, i8 7, i8 1, i8 1, i8 1, i8 1, i8 3, i8 3, i8 3, i8 3>)
  %3 = bitcast <16 x i8> %2 to <4 x float>
  %4 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x float> %4
}

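; MOVSHDUP duplicates the odd dwords, so the combined mask repeats shuffle-mask
; bytes 4-7 and 12-15.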
define <4 x float> @combine_pshufb_movshdup(<4 x float> %a0) {
; SSE-LABEL: combine_pshufb_movshdup:
; SSE:       # BB#0:
; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[7,7,7,7,7,7,7,7,3,3,3,3,3,3,3,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_movshdup:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,7,7,7,7,7,7,7,3,3,3,3,3,3,3,3]
; AVX-NEXT:    retq
  %1 = bitcast <4 x float> %a0 to <16 x i8>
  %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 5, i8 5, i8 5, i8 5, i8 7, i8 7, i8 7, i8 7, i8 1, i8 1, i8 1, i8 1, i8 3, i8 3, i8 3, i8 3>)
  %3 = bitcast <16 x i8> %2 to <4 x float>
  %4 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  ret <4 x float> %4
}

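; MOVSLDUP duplicates the even dwords, so the combined mask repeats shuffle-mask
; bytes 0-3 and 8-11.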
define <4 x float> @combine_pshufb_movsldup(<4 x float> %a0) {
; SSE-LABEL: combine_pshufb_movsldup:
; SSE:       # BB#0:
; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[5,5,5,5,5,5,5,5,1,1,1,1,1,1,1,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_movsldup:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,5,5,5,5,5,5,5,1,1,1,1,1,1,1,1]
; AVX-NEXT:    retq
  %1 = bitcast <4 x float> %a0 to <16 x i8>
  %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 5, i8 5, i8 5, i8 5, i8 7, i8 7, i8 7, i8 7, i8 1, i8 1, i8 1, i8 1, i8 3, i8 3, i8 3, i8 3>)
  %3 = bitcast <16 x i8> %2 to <4 x float>
  %4 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x float> %4
}

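; The PALIGNR-style concatenation feeds a PSHUFB that only reads its low half
; (%a0's upper 8 bytes), so the pair reduces to one dword shuffle of %a0.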
define <16 x i8> @combine_pshufb_palignr(<16 x i8> %a0, <16 x i8> %a1) {
; SSE-LABEL: combine_pshufb_palignr:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_palignr:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
  ret <16 x i8> %2
}

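; The PSHUFB already zeroed the lanes that the byte-shift shuffle keeps, so the
; whole sequence is known to be zero.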
define <16 x i8> @combine_pshufb_pslldq(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_pslldq:
; SSE:       # BB#0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_pslldq:
; AVX:       # BB#0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
  %2 = shufflevector <16 x i8> %1, <16 x i8> zeroinitializer, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i8> %2
}

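; Same for the right-shift case: every lane that survives the shift is already
; zero.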
define <16 x i8> @combine_pshufb_psrldq(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_psrldq:
; SSE:       # BB#0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_psrldq:
; AVX:       # BB#0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
  %2 = shufflevector <16 x i8> %1, <16 x i8> zeroinitializer, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  ret <16 x i8> %2
}

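; Ten zeroing lanes followed by bytes 0-5 is a PSLLDQ-by-10 pattern, emitted
; here as a single PSHUFB.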
define <16 x i8> @combine_pshufb_as_pslldq(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_pslldq:
; SSE:       # BB#0:
; SSE-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_pslldq:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
; AVX-NEXT:    retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5>)
  ret <16 x i8> %res0
}

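; Byte 15 moved down to lane 0 with everything else zeroed is a PSRLDQ-by-15
; pattern, emitted here as a single PSHUFB.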
define <16 x i8> @combine_pshufb_as_psrldq(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_psrldq:
; SSE:       # BB#0:
; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_psrldq:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT:    retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
  ret <16 x i8> %res0
}

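; A byte shuffle that only permutes words inside the low 64 bits matches
; PSHUFLW.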
define <16 x i8> @combine_pshufb_as_pshuflw(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_pshuflw:
; SSE:       # BB#0:
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_pshuflw:
; AVX:       # BB#0:
; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; AVX-NEXT:    retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
  ret <16 x i8> %res0
}

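; A byte shuffle that only permutes words inside the high 64 bits matches
; PSHUFHW.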
define <16 x i8> @combine_pshufb_as_pshufhw(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_pshufhw:
; SSE:       # BB#0:
; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_pshufhw:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; AVX-NEXT:    retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
  ret <16 x i8> %res0
}

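; Here words are permuted in both halves, which no single PSHUFLW or PSHUFHW
; can do, so the chain must stay a PSHUFB.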
define <16 x i8> @combine_pshufb_not_as_pshufw(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_not_as_pshufw:
; SSE:       # BB#0:
; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_not_as_pshufw:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; AVX-NEXT:    retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
  %res1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res0, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
  ret <16 x i8> %res1
}

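; Duplicating each of the low 8 bytes (undef mask elements included) matches a
; unary PUNPCKLBW.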
define <16 x i8> @combine_pshufb_as_unary_unpcklbw(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_unary_unpcklbw:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_unary_unpcklbw:
; AVX:       # BB#0:
; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT:    retq
  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 undef, i8 undef, i8 1, i8 2, i8 2, i8 3, i8 3, i8 4, i8 4, i8 5, i8 5, i8 6, i8 6, i8 7, i8 7>)
  ret <16 x i8> %1
}

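; Duplicating each of the high 4 words matches a unary PUNPCKHWD.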
define <16 x i8> @combine_pshufb_as_unary_unpckhwd(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_unary_unpckhwd:
; SSE:       # BB#0:
; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_unary_unpckhwd:
; AVX:       # BB#0:
; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX-NEXT:    retq
  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 8, i8 9, i8 8, i8 9, i8 10, i8 11, i8 10, i8 11, i8 12, i8 13, i8 12, i8 13, i8 14, i8 15, i8 undef, i8 undef>)
  ret <16 x i8> %1
}

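; The PSHUFB only reads interleaved lanes that came from %a0, so the PUNPCKLBW
; can be dropped and the shuffle applied to %a0 directly.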
define <16 x i8> @combine_unpckl_arg0_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
; SSE-LABEL: combine_unpckl_arg0_pshufb:
; SSE:       # BB#0:
; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_unpckl_arg0_pshufb:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
; AVX-NEXT:    retq
  %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
  %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1>)
  ret <16 x i8> %2
}

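; As above, but only %a1's lanes are read; on SSE a MOVDQA is still needed to
; put the result in xmm0.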
define <16 x i8> @combine_unpckl_arg1_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
; SSE-LABEL: combine_unpckl_arg1_pshufb:
; SSE:       # BB#0:
; SSE-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_unpckl_arg1_pshufb:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
; AVX-NEXT:    retq
  %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
  %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 1, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1>)
  ret <16 x i8> %2
}