; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2    | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.2  | FileCheck %s --check-prefixes=SSE,SSE42
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx     | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2    | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512F

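; Convert <4 x float> to <4 x i8> through fptosi and a chain of trunc/shufflevector
; steps, then overwrite lane 3 with -1 before the 4-byte store.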
define void @foo(<4 x float> %in, <4 x i8>* %out) {
; SSE2-LABEL: foo:
; SSE2:       # %bb.0:
; SSE2-NEXT:    cvttps2dq %xmm0, %xmm0
; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx
; SSE2-NEXT:    shll $8, %ecx
; SSE2-NEXT:    orl %eax, %ecx
; SSE2-NEXT:    movd %ecx, %xmm0
; SSE2-NEXT:    movl $65280, %eax # imm = 0xFF00
; SSE2-NEXT:    orl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    pinsrw $1, %eax, %xmm0
; SSE2-NEXT:    movd %xmm0, (%rdi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: foo:
; SSE42:       # %bb.0:
; SSE42-NEXT:    cvttps2dq %xmm0, %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movl $255, %eax
; SSE42-NEXT:    pinsrb $3, %eax, %xmm0
; SSE42-NEXT:    movd %xmm0, (%rdi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: foo:
; AVX:       # %bb.0:
; AVX-NEXT:    vcvttps2dq %xmm0, %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    movl $255, %eax
; AVX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, (%rdi)
; AVX-NEXT:    retq
  %t0 = fptosi <4 x float> %in to <4 x i32>
  %t1 = trunc <4 x i32> %t0 to <4 x i16>
  %t2 = shufflevector <4 x i16> %t1, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %t3 = trunc <8 x i16> %t2 to <8 x i8>
  %t4 = shufflevector <8 x i8> %t3, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %t5 = insertelement <4 x i8> %t4, i8 -1, i32 3
  store <4 x i8> %t5, <4 x i8>* %out
  ret void
}

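; Concatenate %x with itself twice and interleave the result so that each i64
; element of %x is repeated four times in the <16 x i64> return value.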
define <16 x i64> @catcat(<4 x i64> %x) {
; SSE-LABEL: catcat:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,1,0,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE-NEXT:    movdqa %xmm1, 112(%rdi)
; SSE-NEXT:    movdqa %xmm1, 96(%rdi)
; SSE-NEXT:    movdqa %xmm3, 80(%rdi)
; SSE-NEXT:    movdqa %xmm3, 64(%rdi)
; SSE-NEXT:    movdqa %xmm0, 48(%rdi)
; SSE-NEXT:    movdqa %xmm0, 32(%rdi)
; SSE-NEXT:    movdqa %xmm2, 16(%rdi)
; SSE-NEXT:    movdqa %xmm2, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: catcat:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovddup {{.*#+}} ymm1 = ymm0[0,0,2,2]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,2,3]
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[1,1,3,3]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm1[2,3,2,3]
; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,1]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm4
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm1
; AVX1-NEXT:    vmovaps %ymm4, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: catcat:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[1,1,1,1]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm0[2,2,2,2]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm0[3,3,3,3]
; AVX2-NEXT:    vbroadcastsd %xmm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: catcat:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vmovaps {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1]
; AVX512F-NEXT:    vpermpd %zmm0, %zmm1, %zmm2
; AVX512F-NEXT:    vmovaps {{.*#+}} zmm1 = [2,2,2,2,3,3,3,3]
; AVX512F-NEXT:    vpermpd %zmm0, %zmm1, %zmm1
; AVX512F-NEXT:    vmovaps %zmm2, %zmm0
; AVX512F-NEXT:    retq
  %cat1 = shufflevector <4 x i64> %x, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  %cat2 = shufflevector <8 x i64> %cat1, <8 x i64> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %r = shufflevector <16 x i64> %cat2, <16 x i64> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
  ret  <16 x i64> %r
}

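; Same repeat-each-element pattern as @catcat, but with the source vector loaded
; from memory, which lets the AVX1/AVX2 lowering use broadcast loads.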
define <16 x i64> @load_catcat(<4 x i64>* %p) {
; SSE-LABEL: load_catcat:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    movdqa (%rsi), %xmm0
; SSE-NEXT:    movdqa 16(%rsi), %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,1,0,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE-NEXT:    movdqa %xmm1, 112(%rdi)
; SSE-NEXT:    movdqa %xmm1, 96(%rdi)
; SSE-NEXT:    movdqa %xmm3, 80(%rdi)
; SSE-NEXT:    movdqa %xmm3, 64(%rdi)
; SSE-NEXT:    movdqa %xmm0, 48(%rdi)
; SSE-NEXT:    movdqa %xmm0, 32(%rdi)
; SSE-NEXT:    movdqa %xmm2, 16(%rdi)
; SSE-NEXT:    movdqa %xmm2, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: load_catcat:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vbroadcastsd (%rdi), %ymm0
; AVX1-NEXT:    vbroadcastsd 8(%rdi), %ymm1
; AVX1-NEXT:    vbroadcastsd 16(%rdi), %ymm2
; AVX1-NEXT:    vbroadcastsd 24(%rdi), %ymm3
; AVX1-NEXT:    retq
;
; AVX2-LABEL: load_catcat:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm0
; AVX2-NEXT:    vbroadcastsd 8(%rdi), %ymm1
; AVX2-NEXT:    vbroadcastsd 16(%rdi), %ymm2
; AVX2-NEXT:    vbroadcastsd 24(%rdi), %ymm3
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: load_catcat:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,4,0,4,1,5,1,5]
; AVX512F-NEXT:    vpermq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [2,6,2,6,3,7,3,7]
; AVX512F-NEXT:    vpermq %zmm1, %zmm2, %zmm1
; AVX512F-NEXT:    retq
  %x = load <4 x i64>, <4 x i64>* %p
  %cat1 = shufflevector <4 x i64> %x, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  %cat2 = shufflevector <8 x i64> %cat1, <8 x i64> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %r = shufflevector <16 x i64> %cat2, <16 x i64> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
  ret  <16 x i64> %r
}

; Use weird types to make sure we do not miscompile a case where
; the source ops are not an even multiple of the result size.

define <4 x i32> @cat_ext_straddle(<6 x i32>* %px, <6 x i32>* %py) {
; SSE-LABEL: cat_ext_straddle:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps 16(%rdi), %xmm0
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: cat_ext_straddle:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps 16(%rdi), %xmm0
; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; AVX-NEXT:    retq
  %x = load <6 x i32>, <6 x i32>* %px
  %y = load <6 x i32>, <6 x i32>* %py
  %cat = shufflevector <6 x i32> %x, <6 x i32> %y, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %ext = shufflevector <12 x i32> %cat, <12 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  ret <4 x i32> %ext
}