; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2

;
; Variable Rotates
;

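; Each test below expresses a variable rotate-left as shl/lshr/or on the wide
; vector type. As an illustrative sketch only (this scalar helper is not one of
; the checked functions), the same identity for a single 32-bit lane would be:
;
;   define i32 @rol32_sketch(i32 %x, i32 %n) {
;     %inv = sub i32 32, %n
;     %lo = shl i32 %x, %n
;     %hi = lshr i32 %x, %inv
;     %r = or i32 %lo, %hi
;     ret i32 %r
;   }
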
define <4 x i64> @var_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: var_rotate_v4i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [64,64]
; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT:    vpsubq %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT:    vpsllq %xmm4, %xmm5, %xmm6
; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
; AVX1-NEXT:    vpsllq %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm6
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT:    vpsrlq %xmm2, %xmm5, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; AVX1-NEXT:    vpsrlq %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpsrlq %xmm3, %xmm0, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; AVX1-NEXT:    vpsrlq %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_rotate_v4i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vpsubq %ymm1, %ymm2, %ymm2
; AVX2-NEXT:    vpsllvq %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpsrlvq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: var_rotate_v4i64:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; XOPAVX1-NEXT:    vprotq %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT:    vprotq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_rotate_v4i64:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
; XOPAVX2-NEXT:    vprotq %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT:    vprotq %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %b64 = sub <4 x i64> <i64 64, i64 64, i64 64, i64 64>, %b
  %shl = shl <4 x i64> %a, %b
  %lshr = lshr <4 x i64> %a, %b64
  %or = or <4 x i64> %shl, %lshr
  ret <4 x i64> %or
}

define <8 x i32> @var_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: var_rotate_v8i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [32,32,32,32]
; AVX1-NEXT:    vpsubd %xmm1, %xmm3, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT:    vpsubd %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpslld $23, %xmm4, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT:    vpaddd %xmm5, %xmm4, %xmm4
; AVX1-NEXT:    vcvttps2dq %xmm4, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
; AVX1-NEXT:    vpmulld %xmm6, %xmm4, %xmm4
; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd %xmm5, %xmm1, %xmm1
; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
; AVX1-NEXT:    vpmulld %xmm0, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vpsrld %xmm4, %xmm6, %xmm4
; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm5
; AVX1-NEXT:    vpsrld %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm7 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
; AVX1-NEXT:    vpsrld %xmm7, %xmm6, %xmm7
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
; AVX1-NEXT:    vpsrld %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm7[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vpsrld %xmm4, %xmm0, %xmm4
; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm6
; AVX1-NEXT:    vpsrld %xmm6, %xmm0, %xmm6
; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm5[2],xmm2[3],xmm5[3]
; AVX1-NEXT:    vpsrld %xmm5, %xmm0, %xmm5
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
; AVX1-NEXT:    vpsrld %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_rotate_v8i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vpsubd %ymm1, %ymm2, %ymm2
; AVX2-NEXT:    vpsllvd %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpsrlvd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: var_rotate_v8i32:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; XOPAVX1-NEXT:    vprotd %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT:    vprotd %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_rotate_v8i32:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
; XOPAVX2-NEXT:    vprotd %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT:    vprotd %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %b32 = sub <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %b
  %shl = shl <8 x i32> %a, %b
  %lshr = lshr <8 x i32> %a, %b32
  %or = or <8 x i32> %shl, %lshr
  ret <8 x i32> %or
}

define <16 x i16> @var_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: var_rotate_v16i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
; AVX1-NEXT:    vpsubw %xmm1, %xmm3, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT:    vpsubw %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpsllw $12, %xmm4, %xmm5
; AVX1-NEXT:    vpsllw $4, %xmm4, %xmm4
; AVX1-NEXT:    vpor %xmm5, %xmm4, %xmm5
; AVX1-NEXT:    vpaddw %xmm5, %xmm5, %xmm6
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vpsllw $8, %xmm4, %xmm7
; AVX1-NEXT:    vpblendvb %xmm5, %xmm7, %xmm4, %xmm5
; AVX1-NEXT:    vpsllw $4, %xmm5, %xmm7
; AVX1-NEXT:    vpblendvb %xmm6, %xmm7, %xmm5, %xmm5
; AVX1-NEXT:    vpsllw $2, %xmm5, %xmm7
; AVX1-NEXT:    vpaddw %xmm6, %xmm6, %xmm6
; AVX1-NEXT:    vpblendvb %xmm6, %xmm7, %xmm5, %xmm5
; AVX1-NEXT:    vpsllw $1, %xmm5, %xmm7
; AVX1-NEXT:    vpaddw %xmm6, %xmm6, %xmm6
; AVX1-NEXT:    vpblendvb %xmm6, %xmm7, %xmm5, %xmm5
; AVX1-NEXT:    vpsllw $12, %xmm1, %xmm6
; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpor %xmm6, %xmm1, %xmm1
; AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm6
; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm7
; AVX1-NEXT:    vpblendvb %xmm1, %xmm7, %xmm0, %xmm1
; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm7
; AVX1-NEXT:    vpblendvb %xmm6, %xmm7, %xmm1, %xmm1
; AVX1-NEXT:    vpsllw $2, %xmm1, %xmm7
; AVX1-NEXT:    vpaddw %xmm6, %xmm6, %xmm6
; AVX1-NEXT:    vpblendvb %xmm6, %xmm7, %xmm1, %xmm1
; AVX1-NEXT:    vpsllw $1, %xmm1, %xmm7
; AVX1-NEXT:    vpaddw %xmm6, %xmm6, %xmm6
; AVX1-NEXT:    vpblendvb %xmm6, %xmm7, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm1, %ymm1
; AVX1-NEXT:    vpsllw $12, %xmm3, %xmm5
; AVX1-NEXT:    vpsllw $4, %xmm3, %xmm3
; AVX1-NEXT:    vpor %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm5
; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm6
; AVX1-NEXT:    vpblendvb %xmm3, %xmm6, %xmm4, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm3, %xmm4
; AVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpsrlw $2, %xmm3, %xmm4
; AVX1-NEXT:    vpaddw %xmm5, %xmm5, %xmm5
; AVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpsrlw $1, %xmm3, %xmm4
; AVX1-NEXT:    vpaddw %xmm5, %xmm5, %xmm5
; AVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpsllw $12, %xmm2, %xmm4
; AVX1-NEXT:    vpsllw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpor %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm4
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm5
; AVX1-NEXT:    vpblendvb %xmm2, %xmm5, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm2
; AVX1-NEXT:    vpblendvb %xmm4, %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm2
; AVX1-NEXT:    vpaddw %xmm4, %xmm4, %xmm4
; AVX1-NEXT:    vpblendvb %xmm4, %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm2
; AVX1-NEXT:    vpaddw %xmm4, %xmm4, %xmm4
; AVX1-NEXT:    vpblendvb %xmm4, %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_rotate_v16i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT:    vpsubw %ymm1, %ymm2, %ymm2
; AVX2-NEXT:    vpxor %ymm3, %ymm3, %ymm3
; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15]
; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15]
; AVX2-NEXT:    vpsllvd %ymm4, %ymm5, %ymm4
; AVX2-NEXT:    vpsrld $16, %ymm4, %ymm4
; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11]
; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11]
; AVX2-NEXT:    vpsllvd %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
; AVX2-NEXT:    vpackusdw %ymm4, %ymm1, %ymm1
; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15]
; AVX2-NEXT:    vpsrlvd %ymm4, %ymm5, %ymm4
; AVX2-NEXT:    vpsrld $16, %ymm4, %ymm4
; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11]
; AVX2-NEXT:    vpsrlvd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: var_rotate_v16i16:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; XOPAVX1-NEXT:    vprotw %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT:    vprotw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_rotate_v16i16:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
; XOPAVX2-NEXT:    vprotw %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT:    vprotw %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %b16 = sub <16 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %b
  %shl = shl <16 x i16> %a, %b
  %lshr = lshr <16 x i16> %a, %b16
  %or = or <16 x i16> %shl, %lshr
  ret <16 x i16> %or
}

define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-LABEL: var_rotate_v32i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX1-NEXT:    vpsubb %xmm1, %xmm3, %xmm8
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT:    vpsubb %xmm4, %xmm3, %xmm9
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT:    vpsllw $4, %xmm5, %xmm6
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT:    vpand %xmm7, %xmm6, %xmm6
; AVX1-NEXT:    vpsllw $5, %xmm4, %xmm4
; AVX1-NEXT:    vpblendvb %xmm4, %xmm6, %xmm5, %xmm6
; AVX1-NEXT:    vpsllw $2, %xmm6, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpaddb %xmm4, %xmm4, %xmm4
; AVX1-NEXT:    vpblendvb %xmm4, %xmm2, %xmm6, %xmm2
; AVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm6
; AVX1-NEXT:    vpaddb %xmm4, %xmm4, %xmm4
; AVX1-NEXT:    vpblendvb %xmm4, %xmm6, %xmm2, %xmm2
; AVX1-NEXT:    vpsllw $4, %xmm0, %xmm4
; AVX1-NEXT:    vpand %xmm7, %xmm4, %xmm4
; AVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
; AVX1-NEXT:    vpblendvb %xmm1, %xmm4, %xmm0, %xmm4
; AVX1-NEXT:    vpsllw $2, %xmm4, %xmm6
; AVX1-NEXT:    vpand %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpaddb %xmm3, %xmm3, %xmm4
; AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpblendvb %xmm1, %xmm4, %xmm3, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT:    vpsrlw $4, %xmm5, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsllw $5, %xmm9, %xmm4
; AVX1-NEXT:    vpblendvb %xmm4, %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpsrlw $2, %xmm2, %xmm5
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX1-NEXT:    vpand %xmm6, %xmm5, %xmm5
; AVX1-NEXT:    vpaddb %xmm4, %xmm4, %xmm4
; AVX1-NEXT:    vpblendvb %xmm4, %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm5
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT:    vpand %xmm7, %xmm5, %xmm5
; AVX1-NEXT:    vpaddb %xmm4, %xmm4, %xmm4
; AVX1-NEXT:    vpblendvb %xmm4, %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm4
; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpsllw $5, %xmm8, %xmm4
; AVX1-NEXT:    vpblendvb %xmm4, %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm3
; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT:    vpaddb %xmm4, %xmm4, %xmm4
; AVX1-NEXT:    vpblendvb %xmm4, %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm3
; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT:    vpaddb %xmm4, %xmm4, %xmm4
; AVX1-NEXT:    vpblendvb %xmm4, %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_rotate_v32i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX2-NEXT:    vpsubb %ymm1, %ymm2, %ymm2
; AVX2-NEXT:    vpsllw $5, %ymm1, %ymm1
; AVX2-NEXT:    vpsllw $4, %ymm0, %ymm3
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX2-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm3
; AVX2-NEXT:    vpsllw $2, %ymm3, %ymm4
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm4, %ymm4
; AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm1, %ymm4, %ymm3, %ymm3
; AVX2-NEXT:    vpaddb %ymm3, %ymm3, %ymm4
; AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm1, %ymm4, %ymm3, %ymm1
; AVX2-NEXT:    vpsllw $5, %ymm2, %ymm2
; AVX2-NEXT:    vpaddb %ymm2, %ymm2, %ymm3
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm4, %ymm4
; AVX2-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $2, %ymm0, %ymm2
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm2
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
; AVX2-NEXT:    vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: var_rotate_v32i8:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; XOPAVX1-NEXT:    vprotb %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT:    vprotb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_rotate_v32i8:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
; XOPAVX2-NEXT:    vprotb %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT:    vprotb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %b8 = sub <32 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b
  %shl = shl <32 x i8> %a, %b
  %lshr = lshr <32 x i8> %a, %b8
  %or = or <32 x i8> %shl, %lshr
  ret <32 x i8> %or
}

;
; Constant Rotates
;

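; Each lane pairs a constant left shift with a constant right shift; e.g. in
; lane 0 of the v4i64 test below the combined pattern is (x << 4) | (x >> 60).
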
define <4 x i64> @constant_rotate_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: constant_rotate_v4i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsllq $60, %xmm1, %xmm2
; AVX1-NEXT:    vpsllq $50, %xmm1, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpsllq $14, %xmm0, %xmm3
; AVX1-NEXT:    vpsllq $4, %xmm0, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT:    vpsrlq $2, %xmm1, %xmm3
; AVX1-NEXT:    vpsrlq $14, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vpsrlq $50, %xmm0, %xmm3
; AVX1-NEXT:    vpsrlq $60, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vorps %ymm0, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_rotate_v4i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsllvq {{.*}}(%rip), %ymm0, %ymm1
; AVX2-NEXT:    vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: constant_rotate_v4i64:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vpshlq {{.*}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vpshlq {{.*}}(%rip), %xmm2, %xmm3
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm3, %xmm4
; XOPAVX1-NEXT:    vpshlq %xmm4, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm3, %xmm3
; XOPAVX1-NEXT:    vpshlq %xmm3, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: constant_rotate_v4i64:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpsllvq {{.*}}(%rip), %ymm0, %ymm1
; XOPAVX2-NEXT:    vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
; XOPAVX2-NEXT:    retq
  %shl = shl <4 x i64> %a, <i64 4, i64 14, i64 50, i64 60>
  %lshr = lshr <4 x i64> %a, <i64 60, i64 50, i64 14, i64 2>
  %or = or <4 x i64> %shl, %lshr
  ret <4 x i64> %or
}

define <8 x i32> @constant_rotate_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: constant_rotate_v8i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm2, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT:    vpsrld $21, %xmm2, %xmm3
; AVX1-NEXT:    vpsrld $23, %xmm2, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vpsrld $22, %xmm2, %xmm4
; AVX1-NEXT:    vpsrld $24, %xmm2, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; AVX1-NEXT:    vpsrld $25, %xmm0, %xmm3
; AVX1-NEXT:    vpsrld $27, %xmm0, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vpsrld $26, %xmm0, %xmm4
; AVX1-NEXT:    vpsrld $28, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_rotate_v8i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsllvd {{.*}}(%rip), %ymm0, %ymm1
; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: constant_rotate_v8i32:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vpshld {{.*}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vpshld {{.*}}(%rip), %xmm2, %xmm3
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; XOPAVX1-NEXT:    vpshld {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT:    vpshld {{.*}}(%rip), %xmm2, %xmm2
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: constant_rotate_v8i32:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpsllvd {{.*}}(%rip), %ymm0, %ymm1
; XOPAVX2-NEXT:    vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
; XOPAVX2-NEXT:    retq
  %shl = shl <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %lshr = lshr <8 x i32> %a, <i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21>
  %or = or <8 x i32> %shl, %lshr
  ret <8 x i32> %or
}

define <16 x i16> @constant_rotate_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: constant_rotate_v16i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm2, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6,7]
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3,4],xmm2[5,6,7]
; AVX1-NEXT:    vpsrlw $2, %xmm2, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3,4],xmm3[5,6],xmm2[7]
; AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4],xmm3[5],xmm2[6],xmm3[7]
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3,4,5,6,7]
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3,4],xmm0[5,6,7]
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3,4],xmm3[5,6],xmm0[7]
; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_rotate_v16i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm1
; AVX2-NEXT:    vpxor %ymm2, %ymm2, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1]
; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; AVX2-NEXT:    vpsrlvd %ymm4, %ymm5, %ymm4
; AVX2-NEXT:    vpsrld $16, %ymm4, %ymm4
; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; AVX2-NEXT:    vpsrlvd %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: constant_rotate_v16i16:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vpshlw {{.*}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vpshlw {{.*}}(%rip), %xmm2, %xmm3
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT:    vpsubw {{.*}}(%rip), %xmm3, %xmm4
; XOPAVX1-NEXT:    vpshlw %xmm4, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpsubw {{.*}}(%rip), %xmm3, %xmm3
; XOPAVX1-NEXT:    vpshlw %xmm3, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: constant_rotate_v16i16:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm1
; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT:    vpsubw {{.*}}(%rip), %xmm2, %xmm3
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
; XOPAVX2-NEXT:    vpshlw %xmm3, %xmm4, %xmm3
; XOPAVX2-NEXT:    vpsubw {{.*}}(%rip), %xmm2, %xmm2
; XOPAVX2-NEXT:    vpshlw %xmm2, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
; XOPAVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
; XOPAVX2-NEXT:    retq
  %shl = shl <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  %lshr = lshr <16 x i16> %a, <i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1>
  %or = or <16 x i16> %shl, %lshr
  ret <16 x i16> %or
}

define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: constant_rotate_v32i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT:    vpand %xmm8, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
; AVX1-NEXT:    vpsllw $5, %xmm4, %xmm4
; AVX1-NEXT:    vpblendvb %xmm4, %xmm2, %xmm1, %xmm2
; AVX1-NEXT:    vpsllw $2, %xmm2, %xmm5
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX1-NEXT:    vpand %xmm6, %xmm5, %xmm5
; AVX1-NEXT:    vpaddb %xmm4, %xmm4, %xmm7
; AVX1-NEXT:    vpblendvb %xmm7, %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm5
; AVX1-NEXT:    vpaddb %xmm7, %xmm7, %xmm3
; AVX1-NEXT:    vpblendvb %xmm3, %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpsllw $4, %xmm0, %xmm5
; AVX1-NEXT:    vpand %xmm8, %xmm5, %xmm5
; AVX1-NEXT:    vpblendvb %xmm4, %xmm5, %xmm0, %xmm4
; AVX1-NEXT:    vpsllw $2, %xmm4, %xmm5
; AVX1-NEXT:    vpand %xmm6, %xmm5, %xmm5
; AVX1-NEXT:    vpblendvb %xmm7, %xmm5, %xmm4, %xmm4
; AVX1-NEXT:    vpaddb %xmm4, %xmm4, %xmm5
; AVX1-NEXT:    vpblendvb %xmm3, %xmm5, %xmm4, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm9
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm8, %xmm3, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7]
; AVX1-NEXT:    vpsllw $5, %xmm5, %xmm5
; AVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $2, %xmm1, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm7
; AVX1-NEXT:    vpblendvb %xmm7, %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpaddb %xmm7, %xmm7, %xmm2
; AVX1-NEXT:    vpblendvb %xmm2, %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm3
; AVX1-NEXT:    vpand %xmm8, %xmm3, %xmm3
; AVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm3
; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT:    vpblendvb %xmm7, %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm3
; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpblendvb %xmm2, %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vorps %ymm0, %ymm9, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_rotate_v32i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
; AVX2-NEXT:    vpsllw $5, %ymm1, %ymm1
; AVX2-NEXT:    vpsllw $4, %ymm0, %ymm2
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm2
; AVX2-NEXT:    vpsllw $2, %ymm2, %ymm3
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm1, %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vpaddb %ymm2, %ymm2, %ymm3
; AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm1, %ymm3, %ymm2, %ymm1
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7]
; AVX2-NEXT:    vpsllw $5, %ymm2, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm3
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX2-NEXT:    vpblendvb %ymm2, %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $2, %ymm0, %ymm3
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX2-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
; AVX2-NEXT:    vpblendvb %ymm2, %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm3
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX2-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
; AVX2-NEXT:    vpblendvb %ymm2, %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: constant_rotate_v32i8:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm2, %xmm3
; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT:    vpsubb {{.*}}(%rip), %xmm3, %xmm3
; XOPAVX1-NEXT:    vpshlb %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpshlb %xmm3, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: constant_rotate_v32i8:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm2, %xmm3
; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT:    vpsubb {{.*}}(%rip), %xmm3, %xmm3
; XOPAVX2-NEXT:    vpshlb %xmm3, %xmm2, %xmm2
; XOPAVX2-NEXT:    vpshlb %xmm3, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
; XOPAVX2-NEXT:    retq
  %shl = shl <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>
  %lshr = lshr <32 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
  %or = or <32 x i8> %shl, %lshr
  ret <32 x i8> %or
}

;
; Uniform Constant Rotates
;

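; The rotate amount is a splat constant, so per lane the pattern reduces to
; e.g. (x << 14) | (x >> 50) for the v4i64 test, which the XOP runs check as
; a single vprotq $14 on each 128-bit half.
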
define <4 x i64> @splatconstant_rotate_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_v4i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsllq $14, %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsllq $14, %xmm2, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT:    vpsrlq $50, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlq $50, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_rotate_v4i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsllq $14, %ymm0, %ymm1
; AVX2-NEXT:    vpsrlq $50, %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_rotate_v4i64:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vprotq $14, %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vprotq $14, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v4i64:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vprotq $14, %xmm0, %xmm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT:    vprotq $14, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT:    retq
  %shl = shl <4 x i64> %a, <i64 14, i64 14, i64 14, i64 14>
  %lshr = lshr <4 x i64> %a, <i64 50, i64 50, i64 50, i64 50>
  %or = or <4 x i64> %shl, %lshr
  ret <4 x i64> %or
}

define <8 x i32> @splatconstant_rotate_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_v8i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpslld $4, %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpslld $4, %xmm2, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT:    vpsrld $28, %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $28, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_rotate_v8i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpslld $4, %ymm0, %ymm1
; AVX2-NEXT:    vpsrld $28, %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_rotate_v8i32:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vprotd $4, %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vprotd $4, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v8i32:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vprotd $4, %xmm0, %xmm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT:    vprotd $4, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT:    retq
  %shl = shl <8 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  %lshr = lshr <8 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
  %or = or <8 x i32> %shl, %lshr
  ret <8 x i32> %or
}

define <16 x i16> @splatconstant_rotate_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_v16i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsllw $7, %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsllw $7, %xmm2, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT:    vpsrlw $9, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $9, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_rotate_v16i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsllw $7, %ymm0, %ymm1
; AVX2-NEXT:    vpsrlw $9, %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_rotate_v16i16:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vprotw $7, %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vprotw $7, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v16i16:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vprotw $7, %xmm0, %xmm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT:    vprotw $7, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT:    retq
  %shl = shl <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  %lshr = lshr <16 x i16> %a, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
  %or = or <16 x i16> %shl, %lshr
  ret <16 x i16> %or
}

define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_v32i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsllw $4, %xmm0, %xmm4
; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vorps %ymm0, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_rotate_v32i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsllw $4, %ymm0, %ymm1
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_rotate_v32i8:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vprotb $4, %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vprotb $4, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v32i8:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vprotb $4, %xmm0, %xmm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT:    vprotb $4, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT:    retq
  %shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %or = or <32 x i8> %shl, %lshr
  ret <32 x i8> %or
}

;
; Masked Uniform Constant Rotates
;

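; Same splat-constant rotates as above, but each half of the expansion is
; additionally masked with a per-lane and before the or, e.g.
; ((x << 15) & lmask) | ((x >> 49) & rmask) for the v4i64 test.
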
define <4 x i64> @splatconstant_rotate_mask_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_mask_v4i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsllq $15, %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsllq $15, %xmm2, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT:    vpsrlq $49, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlq $49, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm1
; AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_rotate_mask_v4i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsllq $15, %ymm0, %ymm1
; AVX2-NEXT:    vpsrlq $49, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v4i64:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vprotq $15, %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vprotq $15, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v4i64:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vprotq $15, %xmm0, %xmm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT:    vprotq $15, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %shl = shl <4 x i64> %a, <i64 15, i64 15, i64 15, i64 15>
  %lshr = lshr <4 x i64> %a, <i64 49, i64 49, i64 49, i64 49>
  %rmask = and <4 x i64> %lshr, <i64 255, i64 127, i64 127, i64 255>
  %lmask = and <4 x i64> %shl, <i64 33, i64 65, i64 129, i64 257>
  %or = or <4 x i64> %lmask, %rmask
  ret <4 x i64> %or
}

define <8 x i32> @splatconstant_rotate_mask_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_mask_v8i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpslld $4, %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpslld $4, %xmm2, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT:    vpsrld $28, %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $28, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm1
; AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_rotate_mask_v8i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpslld $4, %ymm0, %ymm1
; AVX2-NEXT:    vpsrld $28, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v8i32:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vprotd $4, %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vprotd $4, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v8i32:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vprotd $4, %xmm0, %xmm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT:    vprotd $4, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %shl = shl <8 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  %lshr = lshr <8 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
  %rmask = and <8 x i32> %lshr, <i32 3, i32 7, i32 15, i32 31, i32 63, i32 127, i32 255, i32 511>
  %lmask = and <8 x i32> %shl, <i32 511, i32 255, i32 127, i32 63, i32 31, i32 15, i32 7, i32 3>
  %or = or <8 x i32> %lmask, %rmask
  ret <8 x i32> %or
}

define <16 x i16> @splatconstant_rotate_mask_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_mask_v16i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsllw $5, %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsllw $5, %xmm2, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT:    vpsrlw $11, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $11, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm1
; AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_rotate_mask_v16i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsllw $5, %ymm0, %ymm1
; AVX2-NEXT:    vpsrlw $11, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v16i16:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vprotw $5, %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vprotw $5, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v16i16:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vprotw $5, %xmm0, %xmm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT:    vprotw $5, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %shl = shl <16 x i16> %a, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
  %lshr = lshr <16 x i16> %a, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
  %rmask = and <16 x i16> %lshr, <i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55>
  %lmask = and <16 x i16> %shl, <i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33>
  %or = or <16 x i16> %lmask, %rmask
  ret <16 x i16> %or
}

define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_mask_v32i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsllw $4, %xmm0, %xmm4
; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm1
; AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_rotate_mask_v32i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsllw $4, %ymm0, %ymm1
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v32i8:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vprotb $4, %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vprotb $4, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v32i8:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vprotb $4, %xmm0, %xmm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT:    vprotb $4, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %rmask = and <32 x i8> %lshr, <i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55>
  %lmask = and <32 x i8> %shl, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>
  %or = or <32 x i8> %lmask, %rmask
  ret <32 x i8> %or
}