1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=X32
3; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=X64
4
5; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx2-builtins.c
6
; Byte-wise absolute value: the llvm.x86.avx2.pabs.b call on the <32 x i8> view
; of %a0 is expected to select a single vpabsb on both the i386 and x86_64 runs.
7define <4 x i64> @test_mm256_abs_epi8(<4 x i64> %a0) {
8; X32-LABEL: test_mm256_abs_epi8:
9; X32:       # BB#0:
10; X32-NEXT:    vpabsb %ymm0, %ymm0
11; X32-NEXT:    retl
12;
13; X64-LABEL: test_mm256_abs_epi8:
14; X64:       # BB#0:
15; X64-NEXT:    vpabsb %ymm0, %ymm0
16; X64-NEXT:    retq
17  %arg = bitcast <4 x i64> %a0 to <32 x i8>
18  %call = call <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8> %arg)
19  %res = bitcast <32 x i8> %call to <4 x i64>
20  ret <4 x i64> %res
21}
22declare <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8>) nounwind readnone
23
; 16-bit absolute value: llvm.x86.avx2.pabs.w on the <16 x i16> view of %a0 is
; expected to select a single vpabsw on both the i386 and x86_64 runs.
24define <4 x i64> @test_mm256_abs_epi16(<4 x i64> %a0) {
25; X32-LABEL: test_mm256_abs_epi16:
26; X32:       # BB#0:
27; X32-NEXT:    vpabsw %ymm0, %ymm0
28; X32-NEXT:    retl
29;
30; X64-LABEL: test_mm256_abs_epi16:
31; X64:       # BB#0:
32; X64-NEXT:    vpabsw %ymm0, %ymm0
33; X64-NEXT:    retq
34  %arg = bitcast <4 x i64> %a0 to <16 x i16>
35  %call = call <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16> %arg)
36  %res = bitcast <16 x i16> %call to <4 x i64>
37  ret <4 x i64> %res
38}
39declare <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16>) nounwind readnone
40
; 32-bit absolute value: llvm.x86.avx2.pabs.d on the <8 x i32> view of %a0 is
; expected to select a single vpabsd on both the i386 and x86_64 runs.
41define <4 x i64> @test_mm256_abs_epi32(<4 x i64> %a0) {
42; X32-LABEL: test_mm256_abs_epi32:
43; X32:       # BB#0:
44; X32-NEXT:    vpabsd %ymm0, %ymm0
45; X32-NEXT:    retl
46;
47; X64-LABEL: test_mm256_abs_epi32:
48; X64:       # BB#0:
49; X64-NEXT:    vpabsd %ymm0, %ymm0
50; X64-NEXT:    retq
51  %arg = bitcast <4 x i64> %a0 to <8 x i32>
52  %call = call <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32> %arg)
53  %res = bitcast <8 x i32> %call to <4 x i64>
54  ret <4 x i64> %res
55}
56declare <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32>) nounwind readnone
57
; Byte-wise add expressed as a plain IR `add` on <32 x i8> (no intrinsic call);
; expected to select vpaddb on both the i386 and x86_64 runs.
58define <4 x i64> @test_mm256_add_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
59; X32-LABEL: test_mm256_add_epi8:
60; X32:       # BB#0:
61; X32-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
62; X32-NEXT:    retl
63;
64; X64-LABEL: test_mm256_add_epi8:
65; X64:       # BB#0:
66; X64-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
67; X64-NEXT:    retq
68  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
69  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
70  %res = add <32 x i8> %arg0, %arg1
71  %bc = bitcast <32 x i8> %res to <4 x i64>
72  ret <4 x i64> %bc
73}
74
; 16-bit add expressed as a plain IR `add` on <16 x i16>; expected to select
; vpaddw on both the i386 and x86_64 runs.
75define <4 x i64> @test_mm256_add_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
76; X32-LABEL: test_mm256_add_epi16:
77; X32:       # BB#0:
78; X32-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
79; X32-NEXT:    retl
80;
81; X64-LABEL: test_mm256_add_epi16:
82; X64:       # BB#0:
83; X64-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
84; X64-NEXT:    retq
85  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
86  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
87  %res = add <16 x i16> %arg0, %arg1
88  %bc = bitcast <16 x i16> %res to <4 x i64>
89  ret <4 x i64> %bc
90}
91
; 32-bit add expressed as a plain IR `add` on <8 x i32>; expected to select
; vpaddd on both the i386 and x86_64 runs.
92define <4 x i64> @test_mm256_add_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
93; X32-LABEL: test_mm256_add_epi32:
94; X32:       # BB#0:
95; X32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
96; X32-NEXT:    retl
97;
98; X64-LABEL: test_mm256_add_epi32:
99; X64:       # BB#0:
100; X64-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
101; X64-NEXT:    retq
102  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
103  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
104  %res = add <8 x i32> %arg0, %arg1
105  %bc = bitcast <8 x i32> %res to <4 x i64>
106  ret <4 x i64> %bc
107}
108
; 64-bit add performed directly on the <4 x i64> arguments (no bitcasts needed);
; expected to select vpaddq on both the i386 and x86_64 runs.
109define <4 x i64> @test_mm256_add_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
110; X32-LABEL: test_mm256_add_epi64:
111; X32:       # BB#0:
112; X32-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
113; X32-NEXT:    retl
114;
115; X64-LABEL: test_mm256_add_epi64:
116; X64:       # BB#0:
117; X64-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
118; X64-NEXT:    retq
119  %res = add <4 x i64> %a0, %a1
120  ret <4 x i64> %res
121}
122
; Calls llvm.x86.avx2.padds.b on the <32 x i8> views of both arguments; expected
; to select vpaddsb on both the i386 and x86_64 runs.
123define <4 x i64> @test_mm256_adds_epi8(<4 x i64> %a0, <4 x i64> %a1) {
124; X32-LABEL: test_mm256_adds_epi8:
125; X32:       # BB#0:
126; X32-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0
127; X32-NEXT:    retl
128;
129; X64-LABEL: test_mm256_adds_epi8:
130; X64:       # BB#0:
131; X64-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0
132; X64-NEXT:    retq
133  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
134  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
135  %res = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %arg0, <32 x i8> %arg1)
136  %bc = bitcast <32 x i8> %res to <4 x i64>
137  ret <4 x i64> %bc
138}
139declare <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8>, <32 x i8>) nounwind readnone
140
; Calls llvm.x86.avx2.padds.w on the <16 x i16> views of both arguments;
; expected to select vpaddsw on both the i386 and x86_64 runs.
141define <4 x i64> @test_mm256_adds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
142; X32-LABEL: test_mm256_adds_epi16:
143; X32:       # BB#0:
144; X32-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0
145; X32-NEXT:    retl
146;
147; X64-LABEL: test_mm256_adds_epi16:
148; X64:       # BB#0:
149; X64-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0
150; X64-NEXT:    retq
151  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
152  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
153  %res = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %arg0, <16 x i16> %arg1)
154  %bc = bitcast <16 x i16> %res to <4 x i64>
155  ret <4 x i64> %bc
156}
157declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>) nounwind readnone
158
; Calls llvm.x86.avx2.paddus.b on the <32 x i8> views of both arguments;
; expected to select vpaddusb on both the i386 and x86_64 runs.
159define <4 x i64> @test_mm256_adds_epu8(<4 x i64> %a0, <4 x i64> %a1) {
160; X32-LABEL: test_mm256_adds_epu8:
161; X32:       # BB#0:
162; X32-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0
163; X32-NEXT:    retl
164;
165; X64-LABEL: test_mm256_adds_epu8:
166; X64:       # BB#0:
167; X64-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0
168; X64-NEXT:    retq
169  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
170  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
171  %res = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %arg0, <32 x i8> %arg1)
172  %bc = bitcast <32 x i8> %res to <4 x i64>
173  ret <4 x i64> %bc
174}
175declare <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8>, <32 x i8>) nounwind readnone
176
; Calls llvm.x86.avx2.paddus.w on the <16 x i16> views of both arguments;
; expected to select vpaddusw on both the i386 and x86_64 runs.
177define <4 x i64> @test_mm256_adds_epu16(<4 x i64> %a0, <4 x i64> %a1) {
178; X32-LABEL: test_mm256_adds_epu16:
179; X32:       # BB#0:
180; X32-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0
181; X32-NEXT:    retl
182;
183; X64-LABEL: test_mm256_adds_epu16:
184; X64:       # BB#0:
185; X64-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0
186; X64-NEXT:    retq
187  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
188  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
189  %res = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %arg0, <16 x i16> %arg1)
190  %bc = bitcast <16 x i16> %res to <4 x i64>
191  ret <4 x i64> %bc
192}
193declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>) nounwind readnone
194
; Byte shuffle that must be matched as vpalignr. Mask indices >= 32 select from
; %arg1, so each 16-byte half takes bytes 2..15 of %arg0's half followed by
; bytes 0..1 of %arg1's half (indices 32,33 and 48,49).
195define <4 x i64> @test_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
196; X32-LABEL: test_mm256_alignr_epi8:
197; X32:       # BB#0:
198; X32-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1],ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17]
199; X32-NEXT:    retl
200;
201; X64-LABEL: test_mm256_alignr_epi8:
202; X64:       # BB#0:
203; X64-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1],ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17]
204; X64-NEXT:    retq
205  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
206  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
207  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49>
208  %res = bitcast <32 x i8> %shuf to <4 x i64>
209  ret <4 x i64> %res
210}
211
; Second vpalignr case with a one-byte offset: each 16-byte half takes bytes
; 1..15 of %arg0's half followed by byte 0 of %arg1's half (indices 32 and 48).
212define <4 x i64> @test2_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
213; X32-LABEL: test2_mm256_alignr_epi8:
214; X32:       # BB#0:
215; X32-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16]
216; X32-NEXT:    retl
217;
218; X64-LABEL: test2_mm256_alignr_epi8:
219; X64:       # BB#0:
220; X64-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16]
221; X64-NEXT:    retq
222  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
223  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
224  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
225  %res = bitcast <32 x i8> %shuf to <4 x i64>
226  ret <4 x i64> %res
227}
228
; Bitwise AND written as a plain IR `and` on <4 x i64>. The recorded output for
; both runs is vandps (not vpand), so the checks pin that mnemonic.
229define <4 x i64> @test_mm256_and_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
230; X32-LABEL: test_mm256_and_si256:
231; X32:       # BB#0:
232; X32-NEXT:    vandps %ymm1, %ymm0, %ymm0
233; X32-NEXT:    retl
234;
235; X64-LABEL: test_mm256_and_si256:
236; X64:       # BB#0:
237; X64-NEXT:    vandps %ymm1, %ymm0, %ymm0
238; X64-NEXT:    retq
239  %res = and <4 x i64> %a0, %a1
240  ret <4 x i64> %res
241}
242
; AND-NOT written as `xor %a0, -1` followed by `and` with %a1. The recorded
; output is three instructions on both runs: vpcmpeqd of ymm2 with itself
; (which yields all-ones), vpxor to invert %a0, then vpand -- i.e. the pair is
; not folded into a single and-not instruction here.
243define <4 x i64> @test_mm256_andnot_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
244; X32-LABEL: test_mm256_andnot_si256:
245; X32:       # BB#0:
246; X32-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
247; X32-NEXT:    vpxor %ymm2, %ymm0, %ymm0
248; X32-NEXT:    vpand %ymm1, %ymm0, %ymm0
249; X32-NEXT:    retl
250;
251; X64-LABEL: test_mm256_andnot_si256:
252; X64:       # BB#0:
253; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
254; X64-NEXT:    vpxor %ymm2, %ymm0, %ymm0
255; X64-NEXT:    vpand %ymm1, %ymm0, %ymm0
256; X64-NEXT:    retq
257  %not = xor <4 x i64> %a0, <i64 -1, i64 -1, i64 -1, i64 -1>
258  %res = and <4 x i64> %not, %a1
259  ret <4 x i64> %res
260}
261
; Calls llvm.x86.avx2.pavg.b on the <32 x i8> views of both arguments; expected
; to select vpavgb on both the i386 and x86_64 runs.
262define <4 x i64> @test_mm256_avg_epu8(<4 x i64> %a0, <4 x i64> %a1) {
263; X32-LABEL: test_mm256_avg_epu8:
264; X32:       # BB#0:
265; X32-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
266; X32-NEXT:    retl
267;
268; X64-LABEL: test_mm256_avg_epu8:
269; X64:       # BB#0:
270; X64-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
271; X64-NEXT:    retq
272  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
273  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
274  %res = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %arg0, <32 x i8> %arg1)
275  %bc = bitcast <32 x i8> %res to <4 x i64>
276  ret <4 x i64> %bc
277}
278declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone
279
; Calls llvm.x86.avx2.pavg.w on the <16 x i16> views of both arguments; expected
; to select vpavgw on both the i386 and x86_64 runs.
280define <4 x i64> @test_mm256_avg_epu16(<4 x i64> %a0, <4 x i64> %a1) {
281; X32-LABEL: test_mm256_avg_epu16:
282; X32:       # BB#0:
283; X32-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
284; X32-NEXT:    retl
285;
286; X64-LABEL: test_mm256_avg_epu16:
287; X64:       # BB#0:
288; X64-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
289; X64-NEXT:    retq
290  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
291  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
292  %res = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %arg0, <16 x i16> %arg1)
293  %bc = bitcast <16 x i16> %res to <4 x i64>
294  ret <4 x i64> %bc
295}
296declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone
297
; 16-bit blend written as a shufflevector: mask indices 17 and 25 take elements
; 1 and 9 from %arg1, every other element stays from %arg0. Expected to select
; vpblendw on both runs.
298define <4 x i64> @test_mm256_blend_epi16(<4 x i64> %a0, <4 x i64> %a1) {
299; X32-LABEL: test_mm256_blend_epi16:
300; X32:       # BB#0:
301; X32-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15]
302; X32-NEXT:    retl
303;
304; X64-LABEL: test_mm256_blend_epi16:
305; X64:       # BB#0:
306; X64-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15]
307; X64-NEXT:    retq
308  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
309  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
310  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 25, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
311  %res = bitcast <16 x i16> %shuf to <4 x i64>
312  ret <4 x i64> %res
313}
314
; 128-bit 32-bit blend as a shufflevector: mask indices 4 and 6 take elements 0
; and 2 from %arg1, elements 1 and 3 stay from %arg0. Expected to select
; vpblendd on xmm registers on both runs.
315define <2 x i64> @test_mm_blend_epi32(<2 x i64> %a0, <2 x i64> %a1) {
316; X32-LABEL: test_mm_blend_epi32:
317; X32:       # BB#0:
318; X32-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
319; X32-NEXT:    retl
320;
321; X64-LABEL: test_mm_blend_epi32:
322; X64:       # BB#0:
323; X64-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
324; X64-NEXT:    retq
325  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
326  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
327  %shuf = shufflevector <4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
328  %res = bitcast <4 x i32> %shuf to <2 x i64>
329  ret <2 x i64> %res
330}
331
; 256-bit 32-bit blend as a shufflevector: mask indices 8, 10, 12 and 13 take
; elements 0, 2, 4 and 5 from %arg1; elements 1, 3, 6 and 7 stay from %arg0.
; Expected to select vpblendd on ymm registers on both runs.
332define <4 x i64> @test_mm256_blend_epi32(<4 x i64> %a0, <4 x i64> %a1) {
333; X32-LABEL: test_mm256_blend_epi32:
334; X32:       # BB#0:
335; X32-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7]
336; X32-NEXT:    retl
337;
338; X64-LABEL: test_mm256_blend_epi32:
339; X64:       # BB#0:
340; X64-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7]
341; X64-NEXT:    retq
342  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
343  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
344  %shuf = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7>
345  %res = bitcast <8 x i32> %shuf to <4 x i64>
346  ret <4 x i64> %res
347}
348
; Variable byte blend: calls llvm.x86.avx2.pblendvb with three <32 x i8>
; operands (%arg0, %arg1, %arg2). Expected to select a single vpblendvb with
; ymm2 as the first printed operand on both runs.
349define <4 x i64> @test_mm256_blendv_epi8(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
350; X32-LABEL: test_mm256_blendv_epi8:
351; X32:       # BB#0:
352; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
353; X32-NEXT:    retl
354;
355; X64-LABEL: test_mm256_blendv_epi8:
356; X64:       # BB#0:
357; X64-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
358; X64-NEXT:    retq
359  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
360  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
361  %arg2 = bitcast <4 x i64> %a2 to <32 x i8>
362  %call = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %arg0, <32 x i8> %arg1, <32 x i8> %arg2)
363  %res = bitcast <32 x i8> %call to <4 x i64>
364  ret <4 x i64> %res
365}
366declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounwind readnone
367
; Splat of byte 0 across a 128-bit vector: the zeroinitializer shuffle mask
; selects element 0 for every lane. Expected to select vpbroadcastb xmm->xmm.
368define <2 x i64> @test_mm_broadcastb_epi8(<2 x i64> %a0) {
369; X32-LABEL: test_mm_broadcastb_epi8:
370; X32:       # BB#0:
371; X32-NEXT:    vpbroadcastb %xmm0, %xmm0
372; X32-NEXT:    retl
373;
374; X64-LABEL: test_mm_broadcastb_epi8:
375; X64:       # BB#0:
376; X64-NEXT:    vpbroadcastb %xmm0, %xmm0
377; X64-NEXT:    retq
378  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
379  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <16 x i32> zeroinitializer
380  %res = bitcast <16 x i8> %shuf to <2 x i64>
381  ret <2 x i64> %res
382}
383
; Splat of byte 0 across a 256-bit vector via an all-zero shuffle mask.
; Expected to select vpbroadcastb reading xmm0 and writing ymm0.
384define <4 x i64> @test_mm256_broadcastb_epi8(<4 x i64> %a0) {
385; X32-LABEL: test_mm256_broadcastb_epi8:
386; X32:       # BB#0:
387; X32-NEXT:    vpbroadcastb %xmm0, %ymm0
388; X32-NEXT:    retl
389;
390; X64-LABEL: test_mm256_broadcastb_epi8:
391; X64:       # BB#0:
392; X64-NEXT:    vpbroadcastb %xmm0, %ymm0
393; X64-NEXT:    retq
394  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
395  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> undef, <32 x i32> zeroinitializer
396  %res = bitcast <32 x i8> %shuf to <4 x i64>
397  ret <4 x i64> %res
398}
399
; Splat of 32-bit element 0 across a 128-bit integer vector via an all-zero
; shuffle mask. The recorded output on both runs is vbroadcastss (not
; vpbroadcastd), so the checks pin that mnemonic.
400define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) {
401; X32-LABEL: test_mm_broadcastd_epi32:
402; X32:       # BB#0:
403; X32-NEXT:    vbroadcastss %xmm0, %xmm0
404; X32-NEXT:    retl
405;
406; X64-LABEL: test_mm_broadcastd_epi32:
407; X64:       # BB#0:
408; X64-NEXT:    vbroadcastss %xmm0, %xmm0
409; X64-NEXT:    retq
410  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
411  %shuf = shufflevector <4 x i32> %arg0, <4 x i32> undef, <4 x i32> zeroinitializer
412  %res = bitcast <4 x i32> %shuf to <2 x i64>
413  ret <2 x i64> %res
414}
415
; Splat of 32-bit element 0 across a 256-bit integer vector via an all-zero
; shuffle mask. The recorded output on both runs is vbroadcastss xmm0 -> ymm0.
416define <4 x i64> @test_mm256_broadcastd_epi32(<4 x i64> %a0) {
417; X32-LABEL: test_mm256_broadcastd_epi32:
418; X32:       # BB#0:
419; X32-NEXT:    vbroadcastss %xmm0, %ymm0
420; X32-NEXT:    retl
421;
422; X64-LABEL: test_mm256_broadcastd_epi32:
423; X64:       # BB#0:
424; X64-NEXT:    vbroadcastss %xmm0, %ymm0
425; X64-NEXT:    retq
426  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
427  %shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> zeroinitializer
428  %res = bitcast <8 x i32> %shuf to <4 x i64>
429  ret <4 x i64> %res
430}
431
; Splat of 64-bit element 0 across <2 x i64> via an all-zero shuffle mask (no
; bitcasts needed). Expected to select vpbroadcastq xmm->xmm on both runs.
432define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) {
433; X32-LABEL: test_mm_broadcastq_epi64:
434; X32:       # BB#0:
435; X32-NEXT:    vpbroadcastq %xmm0, %xmm0
436; X32-NEXT:    retl
437;
438; X64-LABEL: test_mm_broadcastq_epi64:
439; X64:       # BB#0:
440; X64-NEXT:    vpbroadcastq %xmm0, %xmm0
441; X64-NEXT:    retq
442  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer
443  ret <2 x i64> %res
444}
445
; Splat of 64-bit element 0 across <4 x i64> via an all-zero shuffle mask. The
; recorded output on both runs is vbroadcastsd xmm0 -> ymm0 (not vpbroadcastq).
446define <4 x i64> @test_mm256_broadcastq_epi64(<4 x i64> %a0) {
447; X32-LABEL: test_mm256_broadcastq_epi64:
448; X32:       # BB#0:
449; X32-NEXT:    vbroadcastsd %xmm0, %ymm0
450; X32-NEXT:    retl
451;
452; X64-LABEL: test_mm256_broadcastq_epi64:
453; X64:       # BB#0:
454; X64-NEXT:    vbroadcastsd %xmm0, %ymm0
455; X64-NEXT:    retq
456  %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> zeroinitializer
457  ret <4 x i64> %res
458}
459
; Splat of double element 0 across <2 x double> via an all-zero shuffle mask.
; The recorded output on both runs is vmovddup producing xmm0[0,0].
460define <2 x double> @test_mm_broadcastsd_pd(<2 x double> %a0) {
461; X32-LABEL: test_mm_broadcastsd_pd:
462; X32:       # BB#0:
463; X32-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
464; X32-NEXT:    retl
465;
466; X64-LABEL: test_mm_broadcastsd_pd:
467; X64:       # BB#0:
468; X64-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
469; X64-NEXT:    retq
470  %res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
471  ret <2 x double> %res
472}
473
; Splat of double element 0 across <4 x double> via an all-zero shuffle mask.
; Expected to select vbroadcastsd xmm0 -> ymm0 on both runs.
474define <4 x double> @test_mm256_broadcastsd_pd(<4 x double> %a0) {
475; X32-LABEL: test_mm256_broadcastsd_pd:
476; X32:       # BB#0:
477; X32-NEXT:    vbroadcastsd %xmm0, %ymm0
478; X32-NEXT:    retl
479;
480; X64-LABEL: test_mm256_broadcastsd_pd:
481; X64:       # BB#0:
482; X64-NEXT:    vbroadcastsd %xmm0, %ymm0
483; X64-NEXT:    retq
484  %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> zeroinitializer
485  ret <4 x double> %res
486}
487
; Duplicates the low two i64 elements of %a0 into both halves of the result
; (shuffle mask <0, 1, 0, 1>). The recorded output on both runs is vpermq
; producing ymm0[0,1,0,1].
488define <4 x i64> @test_mm256_broadcastsi128_si256(<4 x i64> %a0) {
489; X32-LABEL: test_mm256_broadcastsi128_si256:
490; X32:       # BB#0:
491; X32-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
492; X32-NEXT:    retl
493;
494; X64-LABEL: test_mm256_broadcastsi128_si256:
495; X64:       # BB#0:
496; X64-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
497; X64-NEXT:    retq
498  %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
499  ret <4 x i64> %res
500}
501
; Splat of float element 0 across <4 x float> via an all-zero shuffle mask.
; Expected to select vbroadcastss xmm->xmm on both runs.
502define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) {
503; X32-LABEL: test_mm_broadcastss_ps:
504; X32:       # BB#0:
505; X32-NEXT:    vbroadcastss %xmm0, %xmm0
506; X32-NEXT:    retl
507;
508; X64-LABEL: test_mm_broadcastss_ps:
509; X64:       # BB#0:
510; X64-NEXT:    vbroadcastss %xmm0, %xmm0
511; X64-NEXT:    retq
512  %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
513  ret <4 x float> %res
514}
515
; Splat of float element 0 across <8 x float> via an all-zero shuffle mask.
; Expected to select vbroadcastss xmm0 -> ymm0 on both runs.
516define <8 x float> @test_mm256_broadcastss_ps(<8 x float> %a0) {
517; X32-LABEL: test_mm256_broadcastss_ps:
518; X32:       # BB#0:
519; X32-NEXT:    vbroadcastss %xmm0, %ymm0
520; X32-NEXT:    retl
521;
522; X64-LABEL: test_mm256_broadcastss_ps:
523; X64:       # BB#0:
524; X64-NEXT:    vbroadcastss %xmm0, %ymm0
525; X64-NEXT:    retq
526  %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> zeroinitializer
527  ret <8 x float> %res
528}
529
; Splat of 16-bit element 0 across a 128-bit vector via an all-zero shuffle
; mask. Expected to select vpbroadcastw xmm->xmm on both runs.
530define <2 x i64> @test_mm_broadcastw_epi16(<2 x i64> %a0) {
531; X32-LABEL: test_mm_broadcastw_epi16:
532; X32:       # BB#0:
533; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
534; X32-NEXT:    retl
535;
536; X64-LABEL: test_mm_broadcastw_epi16:
537; X64:       # BB#0:
538; X64-NEXT:    vpbroadcastw %xmm0, %xmm0
539; X64-NEXT:    retq
540  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
541  %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <8 x i32> zeroinitializer
542  %res = bitcast <8 x i16> %shuf to <2 x i64>
543  ret <2 x i64> %res
544}
545
; Splat of 16-bit element 0 across a 256-bit vector via an all-zero shuffle
; mask. Expected to select vpbroadcastw xmm0 -> ymm0 on both runs.
546define <4 x i64> @test_mm256_broadcastw_epi16(<4 x i64> %a0) {
547; X32-LABEL: test_mm256_broadcastw_epi16:
548; X32:       # BB#0:
549; X32-NEXT:    vpbroadcastw %xmm0, %ymm0
550; X32-NEXT:    retl
551;
552; X64-LABEL: test_mm256_broadcastw_epi16:
553; X64:       # BB#0:
554; X64-NEXT:    vpbroadcastw %xmm0, %ymm0
555; X64-NEXT:    retq
556  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
557  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> zeroinitializer
558  %res = bitcast <16 x i16> %shuf to <4 x i64>
559  ret <4 x i64> %res
560}
561
; Per-128-bit-half byte shift left by 3, written as a shuffle whose FIRST
; operand is the zero vector: indices 13..15 and 29..31 pick zero bytes, while
; indices 32..44 and 48..60 pick bytes 0..12 and 16..28 of %arg0. Expected to
; select vpslldq on both runs.
562define <4 x i64> @test_mm256_bslli_epi128(<4 x i64> %a0) {
563; X32-LABEL: test_mm256_bslli_epi128:
564; X32:       # BB#0:
565; X32-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
566; X32-NEXT:    retl
567;
568; X64-LABEL: test_mm256_bslli_epi128:
569; X64:       # BB#0:
570; X64-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
571; X64-NEXT:    retq
572  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
573  %shuf = shufflevector <32 x i8> zeroinitializer, <32 x i8> %arg0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
574  %res = bitcast <32 x i8> %shuf to <4 x i64>
575  ret <4 x i64> %res
576}
577
; Per-128-bit-half byte shift right by 3, written as a shuffle whose SECOND
; operand is the zero vector: indices 3..15 and 19..31 pick bytes of %arg0,
; while indices 32..34 and 48..50 pick zero bytes. Expected to select vpsrldq
; on both runs.
578define <4 x i64> @test_mm256_bsrli_epi128(<4 x i64> %a0) {
579; X32-LABEL: test_mm256_bsrli_epi128:
580; X32:       # BB#0:
581; X32-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
582; X32-NEXT:    retl
583;
584; X64-LABEL: test_mm256_bsrli_epi128:
585; X64:       # BB#0:
586; X64-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
587; X64-NEXT:    retq
588  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
589  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
590  %res = bitcast <32 x i8> %shuf to <4 x i64>
591  ret <4 x i64> %res
592}
593
; Byte-wise equality: `icmp eq` on <32 x i8> followed by sign-extending the
; <32 x i1> result back to i8 lanes. Expected to select a single vpcmpeqb on
; both runs.
594define <4 x i64> @test_mm256_cmpeq_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
595; X32-LABEL: test_mm256_cmpeq_epi8:
596; X32:       # BB#0:
597; X32-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
598; X32-NEXT:    retl
599;
600; X64-LABEL: test_mm256_cmpeq_epi8:
601; X64:       # BB#0:
602; X64-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
603; X64-NEXT:    retq
604  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
605  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
606  %cmp = icmp eq <32 x i8> %arg0, %arg1
607  %res = sext <32 x i1> %cmp to <32 x i8>
608  %bc = bitcast <32 x i8> %res to <4 x i64>
609  ret <4 x i64> %bc
610}
611
; 16-bit equality: `icmp eq` on <16 x i16> followed by sign-extending the
; <16 x i1> result back to i16 lanes. Expected to select a single vpcmpeqw on
; both runs.
612define <4 x i64> @test_mm256_cmpeq_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
613; X32-LABEL: test_mm256_cmpeq_epi16:
614; X32:       # BB#0:
615; X32-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
616; X32-NEXT:    retl
617;
618; X64-LABEL: test_mm256_cmpeq_epi16:
619; X64:       # BB#0:
620; X64-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
621; X64-NEXT:    retq
622  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
623  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
624  %cmp = icmp eq <16 x i16> %arg0, %arg1
625  %res = sext <16 x i1> %cmp to <16 x i16>
626  %bc = bitcast <16 x i16> %res to <4 x i64>
627  ret <4 x i64> %bc
628}
629
; 32-bit equality: `icmp eq` on <8 x i32> followed by sign-extending the
; <8 x i1> result back to i32 lanes. Expected to select a single vpcmpeqd on
; both runs.
630define <4 x i64> @test_mm256_cmpeq_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
631; X32-LABEL: test_mm256_cmpeq_epi32:
632; X32:       # BB#0:
633; X32-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0
634; X32-NEXT:    retl
635;
636; X64-LABEL: test_mm256_cmpeq_epi32:
637; X64:       # BB#0:
638; X64-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0
639; X64-NEXT:    retq
640  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
641  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
642  %cmp = icmp eq <8 x i32> %arg0, %arg1
643  %res = sext <8 x i1> %cmp to <8 x i32>
644  %bc = bitcast <8 x i32> %res to <4 x i64>
645  ret <4 x i64> %bc
646}
647
; 64-bit equality: `icmp eq` directly on the <4 x i64> arguments, then sext of
; the <4 x i1> result to i64 lanes. Expected to select a single vpcmpeqq on
; both runs.
648define <4 x i64> @test_mm256_cmpeq_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
649; X32-LABEL: test_mm256_cmpeq_epi64:
650; X32:       # BB#0:
651; X32-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
652; X32-NEXT:    retl
653;
654; X64-LABEL: test_mm256_cmpeq_epi64:
655; X64:       # BB#0:
656; X64-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
657; X64-NEXT:    retq
658  %cmp = icmp eq <4 x i64> %a0, %a1
659  %res = sext <4 x i1> %cmp to <4 x i64>
660  ret <4 x i64> %res
661}
662
; Signed byte-wise greater-than: `icmp sgt` on <32 x i8> followed by sext of
; the <32 x i1> result to i8 lanes. Expected to select a single vpcmpgtb on
; both runs.
663define <4 x i64> @test_mm256_cmpgt_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
664; X32-LABEL: test_mm256_cmpgt_epi8:
665; X32:       # BB#0:
666; X32-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
667; X32-NEXT:    retl
668;
669; X64-LABEL: test_mm256_cmpgt_epi8:
670; X64:       # BB#0:
671; X64-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
672; X64-NEXT:    retq
673  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
674  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
675  %cmp = icmp sgt <32 x i8> %arg0, %arg1
676  %res = sext <32 x i1> %cmp to <32 x i8>
677  %bc = bitcast <32 x i8> %res to <4 x i64>
678  ret <4 x i64> %bc
679}
680
; Signed 16-bit greater-than: `icmp sgt` on <16 x i16> followed by sext of the
; <16 x i1> result to i16 lanes. Expected to select a single vpcmpgtw on both
; runs.
681define <4 x i64> @test_mm256_cmpgt_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
682; X32-LABEL: test_mm256_cmpgt_epi16:
683; X32:       # BB#0:
684; X32-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
685; X32-NEXT:    retl
686;
687; X64-LABEL: test_mm256_cmpgt_epi16:
688; X64:       # BB#0:
689; X64-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
690; X64-NEXT:    retq
691  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
692  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
693  %cmp = icmp sgt <16 x i16> %arg0, %arg1
694  %res = sext <16 x i1> %cmp to <16 x i16>
695  %bc = bitcast <16 x i16> %res to <4 x i64>
696  ret <4 x i64> %bc
697}
698
; Signed 32-bit greater-than: `icmp sgt` on <8 x i32> followed by sext of the
; <8 x i1> result to i32 lanes. Expected to select a single vpcmpgtd on both
; runs.
699define <4 x i64> @test_mm256_cmpgt_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
700; X32-LABEL: test_mm256_cmpgt_epi32:
701; X32:       # BB#0:
702; X32-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
703; X32-NEXT:    retl
704;
705; X64-LABEL: test_mm256_cmpgt_epi32:
706; X64:       # BB#0:
707; X64-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
708; X64-NEXT:    retq
709  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
710  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
711  %cmp = icmp sgt <8 x i32> %arg0, %arg1
712  %res = sext <8 x i1> %cmp to <8 x i32>
713  %bc = bitcast <8 x i32> %res to <4 x i64>
714  ret <4 x i64> %bc
715}
716
; Signed 64-bit greater-than: `icmp sgt` directly on the <4 x i64> arguments,
; then sext of the <4 x i1> result to i64 lanes. Expected to select a single
; vpcmpgtq on both runs.
717define <4 x i64> @test_mm256_cmpgt_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
718; X32-LABEL: test_mm256_cmpgt_epi64:
719; X32:       # BB#0:
720; X32-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
721; X32-NEXT:    retl
722;
723; X64-LABEL: test_mm256_cmpgt_epi64:
724; X64:       # BB#0:
725; X64-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
726; X64-NEXT:    retq
727  %cmp = icmp sgt <4 x i64> %a0, %a1
728  %res = sext <4 x i1> %cmp to <4 x i64>
729  ret <4 x i64> %res
730}
731
; Sign-extends all 16 bytes of the 128-bit input to 16-bit lanes (sext
; <16 x i8> to <16 x i16>). Expected to select vpmovsxbw xmm0 -> ymm0.
732define <4 x i64> @test_mm256_cvtepi8_epi16(<2 x i64> %a0) {
733; X32-LABEL: test_mm256_cvtepi8_epi16:
734; X32:       # BB#0:
735; X32-NEXT:    vpmovsxbw %xmm0, %ymm0
736; X32-NEXT:    retl
737;
738; X64-LABEL: test_mm256_cvtepi8_epi16:
739; X64:       # BB#0:
740; X64-NEXT:    vpmovsxbw %xmm0, %ymm0
741; X64-NEXT:    retq
742  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
743  %ext = sext <16 x i8> %arg0 to <16 x i16>
744  %res = bitcast <16 x i16> %ext to <4 x i64>
745  ret <4 x i64> %res
746}
747
; Sign-extends the low 8 bytes of the input to 32-bit lanes: the shufflevector
; extracts elements 0..7 as <8 x i8>, which is then sext'ed to <8 x i32>.
; Expected to select vpmovsxbd xmm0 -> ymm0.
748define <4 x i64> @test_mm256_cvtepi8_epi32(<2 x i64> %a0) {
749; X32-LABEL: test_mm256_cvtepi8_epi32:
750; X32:       # BB#0:
751; X32-NEXT:    vpmovsxbd %xmm0, %ymm0
752; X32-NEXT:    retl
753;
754; X64-LABEL: test_mm256_cvtepi8_epi32:
755; X64:       # BB#0:
756; X64-NEXT:    vpmovsxbd %xmm0, %ymm0
757; X64-NEXT:    retq
758  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
759  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
760  %ext = sext <8 x i8> %shuf to <8 x i32>
761  %res = bitcast <8 x i32> %ext to <4 x i64>
762  ret <4 x i64> %res
763}
764
; Sign-extends the low 4 bytes of the input to 64-bit lanes: the shufflevector
; extracts elements 0..3 as <4 x i8>, which is then sext'ed to <4 x i64>.
; Expected to select vpmovsxbq xmm0 -> ymm0.
765define <4 x i64> @test_mm256_cvtepi8_epi64(<2 x i64> %a0) {
766; X32-LABEL: test_mm256_cvtepi8_epi64:
767; X32:       # BB#0:
768; X32-NEXT:    vpmovsxbq %xmm0, %ymm0
769; X32-NEXT:    retl
770;
771; X64-LABEL: test_mm256_cvtepi8_epi64:
772; X64:       # BB#0:
773; X64-NEXT:    vpmovsxbq %xmm0, %ymm0
774; X64-NEXT:    retq
775  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
776  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
777  %ext = sext <4 x i8> %shuf to <4 x i64>
778  ret <4 x i64> %ext
779}
780
; Sign-extends all 8 16-bit elements of the 128-bit input to 32-bit lanes (sext
; <8 x i16> to <8 x i32>). Expected to select vpmovsxwd xmm0 -> ymm0.
781define <4 x i64> @test_mm256_cvtepi16_epi32(<2 x i64> %a0) {
782; X32-LABEL: test_mm256_cvtepi16_epi32:
783; X32:       # BB#0:
784; X32-NEXT:    vpmovsxwd %xmm0, %ymm0
785; X32-NEXT:    retl
786;
787; X64-LABEL: test_mm256_cvtepi16_epi32:
788; X64:       # BB#0:
789; X64-NEXT:    vpmovsxwd %xmm0, %ymm0
790; X64-NEXT:    retq
791  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
792  %ext = sext <8 x i16> %arg0 to <8 x i32>
793  %res = bitcast <8 x i32> %ext to <4 x i64>
794  ret <4 x i64> %res
795}
796
; Sign-extends the low 4 16-bit elements of the input to 64-bit lanes: the
; shufflevector extracts elements 0..3 as <4 x i16>, which is then sext'ed to
; <4 x i64>. Expected to select vpmovsxwq xmm0 -> ymm0.
797define <4 x i64> @test_mm256_cvtepi16_epi64(<2 x i64> %a0) {
798; X32-LABEL: test_mm256_cvtepi16_epi64:
799; X32:       # BB#0:
800; X32-NEXT:    vpmovsxwq %xmm0, %ymm0
801; X32-NEXT:    retl
802;
803; X64-LABEL: test_mm256_cvtepi16_epi64:
804; X64:       # BB#0:
805; X64-NEXT:    vpmovsxwq %xmm0, %ymm0
806; X64-NEXT:    retq
807  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
808  %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
809  %ext = sext <4 x i16> %shuf to <4 x i64>
810  ret <4 x i64> %ext
811}
812
; Sign-extends all 4 32-bit elements of the 128-bit input to 64-bit lanes (sext
; <4 x i32> to <4 x i64>). Expected to select vpmovsxdq xmm0 -> ymm0.
813define <4 x i64> @test_mm256_cvtepi32_epi64(<2 x i64> %a0) {
814; X32-LABEL: test_mm256_cvtepi32_epi64:
815; X32:       # BB#0:
816; X32-NEXT:    vpmovsxdq %xmm0, %ymm0
817; X32-NEXT:    retl
818;
819; X64-LABEL: test_mm256_cvtepi32_epi64:
820; X64:       # BB#0:
821; X64-NEXT:    vpmovsxdq %xmm0, %ymm0
822; X64-NEXT:    retq
823  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
824  %ext = sext <4 x i32> %arg0 to <4 x i64>
825  ret <4 x i64> %ext
826}
827
; Zero-extends all 16 bytes of the 128-bit input to 16-bit lanes (zext
; <16 x i8> to <16 x i16>). Expected to select vpmovzxbw; the check spells out
; the decoded shuffle comment with one zero byte after each source byte.
828define <4 x i64> @test_mm256_cvtepu8_epi16(<2 x i64> %a0) {
829; X32-LABEL: test_mm256_cvtepu8_epi16:
830; X32:       # BB#0:
831; X32-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
832; X32-NEXT:    retl
833;
834; X64-LABEL: test_mm256_cvtepu8_epi16:
835; X64:       # BB#0:
836; X64-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
837; X64-NEXT:    retq
838  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
839  %ext = zext <16 x i8> %arg0 to <16 x i16>
840  %res = bitcast <16 x i16> %ext to <4 x i64>
841  ret <4 x i64> %res
842}
843
; Fast-isel test: selects the low eight i8 lanes of %a0 (shufflevector indices
; 0-7), zero-extends them to i32 and returns the result bitcast to <4 x i64>.
; Expects one vpmovzxbd (each source byte followed by three zero bytes).
844define <4 x i64> @test_mm256_cvtepu8_epi32(<2 x i64> %a0) {
845; X32-LABEL: test_mm256_cvtepu8_epi32:
846; X32:       # BB#0:
847; X32-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
848; X32-NEXT:    retl
849;
850; X64-LABEL: test_mm256_cvtepu8_epi32:
851; X64:       # BB#0:
852; X64-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
853; X64-NEXT:    retq
854  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
855  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
856  %ext = zext <8 x i8> %shuf to <8 x i32>
857  %res = bitcast <8 x i32> %ext to <4 x i64>
858  ret <4 x i64> %res
859}
860
; Fast-isel test: selects the low four i8 lanes of %a0 (shufflevector indices
; 0-3) and zero-extends them to <4 x i64>. Expects one vpmovzxbq (each source
; byte followed by seven zero bytes).
861define <4 x i64> @test_mm256_cvtepu8_epi64(<2 x i64> %a0) {
862; X32-LABEL: test_mm256_cvtepu8_epi64:
863; X32:       # BB#0:
864; X32-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
865; X32-NEXT:    retl
866;
867; X64-LABEL: test_mm256_cvtepu8_epi64:
868; X64:       # BB#0:
869; X64-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
870; X64-NEXT:    retq
871  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
872  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
873  %ext = zext <4 x i8> %shuf to <4 x i64>
874  ret <4 x i64> %ext
875}
876
; Fast-isel test: zero-extends all eight i16 lanes of %a0 to i32 and returns the
; result bitcast to <4 x i64>. Expects one vpmovzxwd.
877define <4 x i64> @test_mm256_cvtepu16_epi32(<2 x i64> %a0) {
878; X32-LABEL: test_mm256_cvtepu16_epi32:
879; X32:       # BB#0:
880; X32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
881; X32-NEXT:    retl
882;
883; X64-LABEL: test_mm256_cvtepu16_epi32:
884; X64:       # BB#0:
885; X64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
886; X64-NEXT:    retq
887  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
888  %ext = zext <8 x i16> %arg0 to <8 x i32>
889  %res = bitcast <8 x i32> %ext to <4 x i64>
890  ret <4 x i64> %res
891}
892
; Fast-isel test: selects the low four i16 lanes of %a0 (shufflevector indices
; 0-3) and zero-extends them to <4 x i64>. Expects one vpmovzxwq.
893define <4 x i64> @test_mm256_cvtepu16_epi64(<2 x i64> %a0) {
894; X32-LABEL: test_mm256_cvtepu16_epi64:
895; X32:       # BB#0:
896; X32-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
897; X32-NEXT:    retl
898;
899; X64-LABEL: test_mm256_cvtepu16_epi64:
900; X64:       # BB#0:
901; X64-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
902; X64-NEXT:    retq
903  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
904  %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
905  %ext = zext <4 x i16> %shuf to <4 x i64>
906  ret <4 x i64> %ext
907}
908
; Fast-isel test: reinterprets %a0 as <4 x i32> and zero-extends every lane to
; i64. Expects one vpmovzxdq.
909define <4 x i64> @test_mm256_cvtepu32_epi64(<2 x i64> %a0) {
910; X32-LABEL: test_mm256_cvtepu32_epi64:
911; X32:       # BB#0:
912; X32-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
913; X32-NEXT:    retl
914;
915; X64-LABEL: test_mm256_cvtepu32_epi64:
916; X64:       # BB#0:
917; X64-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
918; X64-NEXT:    retq
919  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
920  %ext = zext <4 x i32> %arg0 to <4 x i64>
921  ret <4 x i64> %ext
922}
923
; Fast-isel test: extracts i64 elements 2 and 3 of %a0 (the upper two of four)
; via shufflevector. The expected code is vextractf128 with immediate 1, then
; vzeroupper before returning the 128-bit result.
924define <2 x i64> @test_mm256_extracti128_si256(<4 x i64> %a0) nounwind {
925; X32-LABEL: test_mm256_extracti128_si256:
926; X32:       # BB#0:
927; X32-NEXT:    vextractf128 $1, %ymm0, %xmm0
928; X32-NEXT:    vzeroupper
929; X32-NEXT:    retl
930;
931; X64-LABEL: test_mm256_extracti128_si256:
932; X64:       # BB#0:
933; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
934; X64-NEXT:    vzeroupper
935; X64-NEXT:    retq
936  %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 2, i32 3>
937  ret <2 x i64> %res
938}
939
; Fast-isel test: bitcasts both operands to <16 x i16>, calls the
; llvm.x86.avx2.phadd.w intrinsic and bitcasts the result back to <4 x i64>.
; Expects one vphaddw with %a1 as the first (source) operand.
940define <4 x i64> @test_mm256_hadd_epi16(<4 x i64> %a0, <4 x i64> %a1) {
941; X32-LABEL: test_mm256_hadd_epi16:
942; X32:       # BB#0:
943; X32-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
944; X32-NEXT:    retl
945;
946; X64-LABEL: test_mm256_hadd_epi16:
947; X64:       # BB#0:
948; X64-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
949; X64-NEXT:    retq
950  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
951  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
952  %res = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %arg0, <16 x i16> %arg1)
953  %bc = bitcast <16 x i16> %res to <4 x i64>
954  ret <4 x i64> %bc
955}
956declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readnone
957
; Fast-isel test: bitcasts both operands to <8 x i32>, calls the
; llvm.x86.avx2.phadd.d intrinsic and bitcasts the result back to <4 x i64>.
; Expects one vphaddd.
958define <4 x i64> @test_mm256_hadd_epi32(<4 x i64> %a0, <4 x i64> %a1) {
959; X32-LABEL: test_mm256_hadd_epi32:
960; X32:       # BB#0:
961; X32-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
962; X32-NEXT:    retl
963;
964; X64-LABEL: test_mm256_hadd_epi32:
965; X64:       # BB#0:
966; X64-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
967; X64-NEXT:    retq
968  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
969  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
970  %res = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %arg0, <8 x i32> %arg1)
971  %bc = bitcast <8 x i32> %res to <4 x i64>
972  ret <4 x i64> %bc
973}
974declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone
975
; Fast-isel test: bitcasts both operands to <16 x i16>, calls the
; llvm.x86.avx2.phadd.sw intrinsic and bitcasts the result back to <4 x i64>.
; Expects one vphaddsw.
976define <4 x i64> @test_mm256_hadds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
977; X32-LABEL: test_mm256_hadds_epi16:
978; X32:       # BB#0:
979; X32-NEXT:    vphaddsw %ymm1, %ymm0, %ymm0
980; X32-NEXT:    retl
981;
982; X64-LABEL: test_mm256_hadds_epi16:
983; X64:       # BB#0:
984; X64-NEXT:    vphaddsw %ymm1, %ymm0, %ymm0
985; X64-NEXT:    retq
986  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
987  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
988  %res = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %arg0, <16 x i16> %arg1)
989  %bc = bitcast <16 x i16> %res to <4 x i64>
990  ret <4 x i64> %bc
991}
992declare <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16>, <16 x i16>) nounwind readnone
993
; Fast-isel test: bitcasts both operands to <16 x i16>, calls the
; llvm.x86.avx2.phsub.w intrinsic and bitcasts the result back to <4 x i64>.
; Expects one vphsubw.
994define <4 x i64> @test_mm256_hsub_epi16(<4 x i64> %a0, <4 x i64> %a1) {
995; X32-LABEL: test_mm256_hsub_epi16:
996; X32:       # BB#0:
997; X32-NEXT:    vphsubw %ymm1, %ymm0, %ymm0
998; X32-NEXT:    retl
999;
1000; X64-LABEL: test_mm256_hsub_epi16:
1001; X64:       # BB#0:
1002; X64-NEXT:    vphsubw %ymm1, %ymm0, %ymm0
1003; X64-NEXT:    retq
1004  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1005  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1006  %res = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %arg0, <16 x i16> %arg1)
1007  %bc = bitcast <16 x i16> %res to <4 x i64>
1008  ret <4 x i64> %bc
1009}
1010declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>) nounwind readnone
1011
; Fast-isel test: bitcasts both operands to <8 x i32>, calls the
; llvm.x86.avx2.phsub.d intrinsic and bitcasts the result back to <4 x i64>.
; Expects one vphsubd.
1012define <4 x i64> @test_mm256_hsub_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1013; X32-LABEL: test_mm256_hsub_epi32:
1014; X32:       # BB#0:
1015; X32-NEXT:    vphsubd %ymm1, %ymm0, %ymm0
1016; X32-NEXT:    retl
1017;
1018; X64-LABEL: test_mm256_hsub_epi32:
1019; X64:       # BB#0:
1020; X64-NEXT:    vphsubd %ymm1, %ymm0, %ymm0
1021; X64-NEXT:    retq
1022  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1023  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1024  %res = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %arg0, <8 x i32> %arg1)
1025  %bc = bitcast <8 x i32> %res to <4 x i64>
1026  ret <4 x i64> %bc
1027}
1028declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>) nounwind readnone
1029
; Fast-isel test: bitcasts both operands to <16 x i16>, calls the
; llvm.x86.avx2.phsub.sw intrinsic and bitcasts the result back to <4 x i64>.
; Expects one vphsubsw.
1030define <4 x i64> @test_mm256_hsubs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1031; X32-LABEL: test_mm256_hsubs_epi16:
1032; X32:       # BB#0:
1033; X32-NEXT:    vphsubsw %ymm1, %ymm0, %ymm0
1034; X32-NEXT:    retl
1035;
1036; X64-LABEL: test_mm256_hsubs_epi16:
1037; X64:       # BB#0:
1038; X64-NEXT:    vphsubsw %ymm1, %ymm0, %ymm0
1039; X64-NEXT:    retq
1040  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1041  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1042  %res = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %arg0, <16 x i16> %arg1)
1043  %bc = bitcast <16 x i16> %res to <4 x i64>
1044  ret <4 x i64> %bc
1045}
1046declare <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16>, <16 x i16>) nounwind readnone
1047
; Fast-isel test: unmasked 128-bit gather of i32 elements with i32 indices.
; The intrinsic gets an undef pass-through, an all-ones constant mask and a
; scale immediate of 2. Expects vpcmpeqd to materialize the mask, vpgatherdd,
; then a vmovdqa copying the result into xmm0.
1048define <2 x i64> @test_mm_i32gather_epi32(i32 *%a0, <2 x i64> %a1) {
1049; X32-LABEL: test_mm_i32gather_epi32:
1050; X32:       # BB#0:
1051; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1052; X32-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1053; X32-NEXT:    vpgatherdd %xmm2, (%eax,%xmm0,2), %xmm1
1054; X32-NEXT:    vmovdqa %xmm1, %xmm0
1055; X32-NEXT:    retl
1056;
1057; X64-LABEL: test_mm_i32gather_epi32:
1058; X64:       # BB#0:
1059; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1060; X64-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm0,2), %xmm1
1061; X64-NEXT:    vmovdqa %xmm1, %xmm0
1062; X64-NEXT:    retq
1063  %arg0 = bitcast i32 *%a0 to i8*
1064  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
1065  %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
1066  %call = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> undef, i8* %arg0, <4 x i32> %arg1, <4 x i32> %mask, i8 2)
1067  %bc = bitcast <4 x i32> %call to <2 x i64>
1068  ret <2 x i64> %bc
1069}
1070declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32>, i8*, <4 x i32>, <4 x i32>, i8) nounwind readonly
1071
; Fast-isel test: masked 128-bit gather of i32 elements with i32 indices.
; Pass-through (%a0), base pointer (%a1), indices (%a2) and mask (%a3) all come
; from the caller; scale immediate is 2. Expects a lone vpgatherdd that writes
; xmm0 in place, with no mask materialization or result copy.
1072define <2 x i64> @test_mm_mask_i32gather_epi32(<2 x i64> %a0, i32 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
1073; X32-LABEL: test_mm_mask_i32gather_epi32:
1074; X32:       # BB#0:
1075; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1076; X32-NEXT:    vpgatherdd %xmm2, (%eax,%xmm1,2), %xmm0
1077; X32-NEXT:    retl
1078;
1079; X64-LABEL: test_mm_mask_i32gather_epi32:
1080; X64:       # BB#0:
1081; X64-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0
1082; X64-NEXT:    retq
1083  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
1084  %arg1 = bitcast i32 *%a1 to i8*
1085  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
1086  %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
1087  %call = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %arg0, i8* %arg1, <4 x i32> %arg2, <4 x i32> %arg3, i8 2)
1088  %bc = bitcast <4 x i32> %call to <2 x i64>
1089  ret <2 x i64> %bc
1090}
1091
; Fast-isel test: unmasked 256-bit gather of i32 elements with i32 indices.
; The intrinsic gets an undef pass-through, an all-ones constant mask and a
; scale immediate of 2. Expects vpcmpeqd (ymm) to materialize the mask,
; vpgatherdd, then a vmovdqa copying the result into ymm0.
1092define <4 x i64> @test_mm256_i32gather_epi32(i32 *%a0, <4 x i64> %a1) {
1093; X32-LABEL: test_mm256_i32gather_epi32:
1094; X32:       # BB#0:
1095; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1096; X32-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
1097; X32-NEXT:    vpgatherdd %ymm2, (%eax,%ymm0,2), %ymm1
1098; X32-NEXT:    vmovdqa %ymm1, %ymm0
1099; X32-NEXT:    retl
1100;
1101; X64-LABEL: test_mm256_i32gather_epi32:
1102; X64:       # BB#0:
1103; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
1104; X64-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm0,2), %ymm1
1105; X64-NEXT:    vmovdqa %ymm1, %ymm0
1106; X64-NEXT:    retq
1107  %arg0 = bitcast i32 *%a0 to i8*
1108  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1109  %mask = bitcast <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1> to <8 x i32>
1110  %call = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8* %arg0, <8 x i32> %arg1, <8 x i32> %mask, i8 2)
1111  %bc = bitcast <8 x i32> %call to <4 x i64>
1112  ret <4 x i64> %bc
1113}
1114declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32>, i8*, <8 x i32>, <8 x i32>, i8) nounwind readonly
1115
; Fast-isel test: masked 256-bit gather of i32 elements with i32 indices.
; Pass-through (%a0), base pointer (%a1), indices (%a2) and mask (%a3) all come
; from the caller; scale immediate is 2. Expects a lone vpgatherdd on ymm
; registers that writes ymm0 in place.
1116define <4 x i64> @test_mm256_mask_i32gather_epi32(<4 x i64> %a0, i32 *%a1, <4 x i64> %a2, <4 x i64> %a3) {
1117; X32-LABEL: test_mm256_mask_i32gather_epi32:
1118; X32:       # BB#0:
1119; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1120; X32-NEXT:    vpgatherdd %ymm2, (%eax,%ymm1,2), %ymm0
1121; X32-NEXT:    retl
1122;
1123; X64-LABEL: test_mm256_mask_i32gather_epi32:
1124; X64:       # BB#0:
1125; X64-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0
1126; X64-NEXT:    retq
1127  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1128  %arg1 = bitcast i32 *%a1 to i8*
1129  %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
1130  %arg3 = bitcast <4 x i64> %a3 to <8 x i32>
1131  %call = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %arg0, i8* %arg1, <8 x i32> %arg2, <8 x i32> %arg3, i8 2)
1132  %bc = bitcast <8 x i32> %call to <4 x i64>
1133  ret <4 x i64> %bc
1134}
1135
; Fast-isel test: unmasked 128-bit gather of i64 elements with i32 indices.
; The intrinsic gets an undef pass-through, an inline all-ones <2 x i64> mask
; and a scale immediate of 2. Expects vpcmpeqd to materialize the mask,
; vpgatherdq, then a vmovdqa copying the result into xmm0.
1136define <2 x i64> @test_mm_i32gather_epi64(i64 *%a0, <2 x i64> %a1) {
1137; X32-LABEL: test_mm_i32gather_epi64:
1138; X32:       # BB#0:
1139; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1140; X32-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1141; X32-NEXT:    vpgatherdq %xmm2, (%eax,%xmm0,2), %xmm1
1142; X32-NEXT:    vmovdqa %xmm1, %xmm0
1143; X32-NEXT:    retl
1144;
1145; X64-LABEL: test_mm_i32gather_epi64:
1146; X64:       # BB#0:
1147; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1148; X64-NEXT:    vpgatherdq %xmm2, (%rdi,%xmm0,2), %xmm1
1149; X64-NEXT:    vmovdqa %xmm1, %xmm0
1150; X64-NEXT:    retq
1151  %arg0 = bitcast i64 *%a0 to i8*
1152  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
1153  %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> undef, i8* %arg0, <4 x i32> %arg1, <2 x i64> <i64 -1, i64 -1>, i8 2)
1154  ret <2 x i64> %res
1155}
1156declare <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64>, i8*, <4 x i32>, <2 x i64>, i8) nounwind readonly
1157
; Fast-isel test: masked 128-bit gather of i64 elements with i32 indices.
; Pass-through (%a0) and mask (%a3) are passed straight through as <2 x i64>;
; only the pointer and the index vector need bitcasts. Scale immediate is 2.
; Expects a lone vpgatherdq that writes xmm0 in place.
1158define <2 x i64> @test_mm_mask_i32gather_epi64(<2 x i64> %a0, i64 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
1159; X32-LABEL: test_mm_mask_i32gather_epi64:
1160; X32:       # BB#0:
1161; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1162; X32-NEXT:    vpgatherdq %xmm2, (%eax,%xmm1,2), %xmm0
1163; X32-NEXT:    retl
1164;
1165; X64-LABEL: test_mm_mask_i32gather_epi64:
1166; X64:       # BB#0:
1167; X64-NEXT:    vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0
1168; X64-NEXT:    retq
1169  %arg1 = bitcast i64 *%a1 to i8*
1170  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
1171  %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> %a0, i8* %arg1, <4 x i32> %arg2, <2 x i64> %a3, i8 2)
1172  ret <2 x i64> %res
1173}
1174
; Fast-isel test: unmasked 256-bit gather of i64 elements with i32 indices.
; The index vector is 128-bit (<4 x i32>) while the result is <4 x i64>, so the
; expected vpgatherdq mixes an xmm index register with ymm mask/destination.
; Undef pass-through, all-ones mask (via vpcmpeqd), scale immediate 2.
1175define <4 x i64> @test_mm256_i32gather_epi64(i64 *%a0, <2 x i64> %a1) {
1176; X32-LABEL: test_mm256_i32gather_epi64:
1177; X32:       # BB#0:
1178; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1179; X32-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
1180; X32-NEXT:    vpgatherdq %ymm2, (%eax,%xmm0,2), %ymm1
1181; X32-NEXT:    vmovdqa %ymm1, %ymm0
1182; X32-NEXT:    retl
1183;
1184; X64-LABEL: test_mm256_i32gather_epi64:
1185; X64:       # BB#0:
1186; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
1187; X64-NEXT:    vpgatherdq %ymm2, (%rdi,%xmm0,2), %ymm1
1188; X64-NEXT:    vmovdqa %ymm1, %ymm0
1189; X64-NEXT:    retq
1190  %arg0 = bitcast i64 *%a0 to i8*
1191  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
1192  %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8* %arg0, <4 x i32> %arg1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2)
1193  ret <4 x i64> %res
1194}
1195declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64>, i8*, <4 x i32>, <4 x i64>, i8) nounwind readonly
1196
; Fast-isel test: masked 256-bit gather of i64 elements with i32 indices.
; Pass-through (%a0) and mask (%a3) are <4 x i64>; the indices (%a2) are a
; 128-bit vector bitcast to <4 x i32>. Scale immediate is 2. Expects a lone
; vpgatherdq with an xmm index register and ymm mask/destination.
1197define <4 x i64> @test_mm256_mask_i32gather_epi64(<4 x i64> %a0, i64 *%a1, <2 x i64> %a2, <4 x i64> %a3) {
1198; X32-LABEL: test_mm256_mask_i32gather_epi64:
1199; X32:       # BB#0:
1200; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1201; X32-NEXT:    vpgatherdq %ymm2, (%eax,%xmm1,2), %ymm0
1202; X32-NEXT:    retl
1203;
1204; X64-LABEL: test_mm256_mask_i32gather_epi64:
1205; X64:       # BB#0:
1206; X64-NEXT:    vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0
1207; X64-NEXT:    retq
1208  %arg1 = bitcast i64 *%a1 to i8*
1209  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
1210  %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %a0, i8* %arg1, <4 x i32> %arg2, <4 x i64> %a3, i8 2)
1211  ret <4 x i64> %res
1212}
1213
; Fast-isel test: unmasked 128-bit gather of doubles with i32 indices.
; The mask is built in IR as fcmp oeq of zero against zero, sign-extended to
; <2 x i64> and bitcast to <2 x double>; the expected code materializes it with
; a single vpcmpeqd. Undef pass-through, scale immediate 2, then vgatherdpd and
; a vmovapd copying the result into xmm0.
1214define <2 x double> @test_mm_i32gather_pd(double *%a0, <2 x i64> %a1) {
1215; X32-LABEL: test_mm_i32gather_pd:
1216; X32:       # BB#0:
1217; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1218; X32-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1219; X32-NEXT:    vgatherdpd %xmm2, (%eax,%xmm0,2), %xmm1
1220; X32-NEXT:    vmovapd %xmm1, %xmm0
1221; X32-NEXT:    retl
1222;
1223; X64-LABEL: test_mm_i32gather_pd:
1224; X64:       # BB#0:
1225; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1226; X64-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm0,2), %xmm1
1227; X64-NEXT:    vmovapd %xmm1, %xmm0
1228; X64-NEXT:    retq
1229  %arg0 = bitcast double *%a0 to i8*
1230  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
1231  %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer
1232  %sext = sext <2 x i1> %cmp to <2 x i64>
1233  %mask = bitcast <2 x i64> %sext to <2 x double>
1234  %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> undef, i8* %arg0, <4 x i32> %arg1, <2 x double> %mask, i8 2)
1235  ret <2 x double> %res
1236}
1237declare <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double>, i8*, <4 x i32>, <2 x double>, i8) nounwind readonly
1238
; Fast-isel test: masked 128-bit gather of doubles with i32 indices.
; Pass-through (%a0) and mask (%a3) are passed as <2 x double>; indices (%a2)
; are bitcast to <4 x i32>. Scale immediate is 2. Expects a lone vgatherdpd
; that writes xmm0 in place.
1239define <2 x double> @test_mm_mask_i32gather_pd(<2 x double> %a0, double *%a1, <2 x i64> %a2, <2 x double> %a3) {
1240; X32-LABEL: test_mm_mask_i32gather_pd:
1241; X32:       # BB#0:
1242; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1243; X32-NEXT:    vgatherdpd %xmm2, (%eax,%xmm1,2), %xmm0
1244; X32-NEXT:    retl
1245;
1246; X64-LABEL: test_mm_mask_i32gather_pd:
1247; X64:       # BB#0:
1248; X64-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm1,2), %xmm0
1249; X64-NEXT:    retq
1250  %arg1 = bitcast double *%a1 to i8*
1251  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
1252  %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> %a0, i8* %arg1, <4 x i32> %arg2, <2 x double> %a3, i8 2)
1253  ret <2 x double> %res
1254}
1255
; Fast-isel test: unmasked 256-bit gather of doubles with i32 indices.
; Unlike the 128-bit variant, the mask comes from the llvm.x86.avx.cmp.pd.256
; intrinsic (predicate immediate 0, zero compared with zero), so the expected
; code is vxorpd + vcmpeqpd instead of vpcmpeqd. Undef pass-through, scale 2,
; xmm index register with ymm mask/destination.
; NOTE(review): @llvm.x86.avx.cmp.pd.256 is not declared next to this test;
; confirm a declaration exists elsewhere in the file.
1256define <4 x double> @test_mm256_i32gather_pd(double *%a0, <2 x i64> %a1) {
1257; X32-LABEL: test_mm256_i32gather_pd:
1258; X32:       # BB#0:
1259; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1260; X32-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
1261; X32-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
1262; X32-NEXT:    vgatherdpd %ymm2, (%eax,%xmm0,2), %ymm1
1263; X32-NEXT:    vmovapd %ymm1, %ymm0
1264; X32-NEXT:    retl
1265;
1266; X64-LABEL: test_mm256_i32gather_pd:
1267; X64:       # BB#0:
1268; X64-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
1269; X64-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
1270; X64-NEXT:    vgatherdpd %ymm2, (%rdi,%xmm0,2), %ymm1
1271; X64-NEXT:    vmovapd %ymm1, %ymm0
1272; X64-NEXT:    retq
1273  %arg0 = bitcast double *%a0 to i8*
1274  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
1275  %mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0)
1276  %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8* %arg0, <4 x i32> %arg1, <4 x double> %mask, i8 2)
1277  ret <4 x double> %res
1278}
1279declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double>, i8*, <4 x i32>, <4 x double>, i8) nounwind readonly
1280
; Fast-isel test: masked 256-bit gather of doubles with i32 indices.
; Pass-through (%a0) and mask (%a3) are <4 x double>; indices (%a2) are a
; 128-bit vector bitcast to <4 x i32>. Scale immediate is 2. Expects a lone
; vgatherdpd with an xmm index register and ymm mask/destination.
1281define <4 x double> @test_mm256_mask_i32gather_pd(<4 x double> %a0, double *%a1, <2 x i64> %a2, <4 x double> %a3) {
1282; X32-LABEL: test_mm256_mask_i32gather_pd:
1283; X32:       # BB#0:
1284; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1285; X32-NEXT:    vgatherdpd %ymm2, (%eax,%xmm1,2), %ymm0
1286; X32-NEXT:    retl
1287;
1288; X64-LABEL: test_mm256_mask_i32gather_pd:
1289; X64:       # BB#0:
1290; X64-NEXT:    vgatherdpd %ymm2, (%rdi,%xmm1,2), %ymm0
1291; X64-NEXT:    retq
1292  %arg1 = bitcast double *%a1 to i8*
1293  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
1294  %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %a0, i8* %arg1, <4 x i32> %arg2, <4 x double> %a3, i8 2)
1295  ret <4 x double> %res
1296}
1297
; Fast-isel test: unmasked 128-bit gather of floats with i32 indices.
; The mask is built in IR as fcmp oeq of zero against zero, sign-extended to
; <4 x i32> and bitcast to <4 x float>; the expected code materializes it with
; a single vpcmpeqd. Undef pass-through, scale immediate 2, then vgatherdps and
; a vmovaps copying the result into xmm0.
1298define <4 x float> @test_mm_i32gather_ps(float *%a0, <2 x i64> %a1) {
1299; X32-LABEL: test_mm_i32gather_ps:
1300; X32:       # BB#0:
1301; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1302; X32-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1303; X32-NEXT:    vgatherdps %xmm2, (%eax,%xmm0,2), %xmm1
1304; X32-NEXT:    vmovaps %xmm1, %xmm0
1305; X32-NEXT:    retl
1306;
1307; X64-LABEL: test_mm_i32gather_ps:
1308; X64:       # BB#0:
1309; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1310; X64-NEXT:    vgatherdps %xmm2, (%rdi,%xmm0,2), %xmm1
1311; X64-NEXT:    vmovaps %xmm1, %xmm0
1312; X64-NEXT:    retq
1313  %arg0 = bitcast float *%a0 to i8*
1314  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
1315  %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
1316  %sext = sext <4 x i1> %cmp to <4 x i32>
1317  %mask = bitcast <4 x i32> %sext to <4 x float>
1318  %call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> undef, i8* %arg0, <4 x i32> %arg1, <4 x float> %mask, i8 2)
1319  ret <4 x float> %call
1320}
1321declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float>, i8*, <4 x i32>, <4 x float>, i8) nounwind readonly
1322
; Fast-isel test: masked 128-bit gather of floats with i32 indices.
; Pass-through (%a0) and mask (%a3) are passed as <4 x float>; indices (%a2)
; are bitcast to <4 x i32>. Scale immediate is 2. Expects a lone vgatherdps
; that writes xmm0 in place.
1323define <4 x float> @test_mm_mask_i32gather_ps(<4 x float> %a0, float *%a1, <2 x i64> %a2, <4 x float> %a3) {
1324; X32-LABEL: test_mm_mask_i32gather_ps:
1325; X32:       # BB#0:
1326; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1327; X32-NEXT:    vgatherdps %xmm2, (%eax,%xmm1,2), %xmm0
1328; X32-NEXT:    retl
1329;
1330; X64-LABEL: test_mm_mask_i32gather_ps:
1331; X64:       # BB#0:
1332; X64-NEXT:    vgatherdps %xmm2, (%rdi,%xmm1,2), %xmm0
1333; X64-NEXT:    retq
1334  %arg1 = bitcast float *%a1 to i8*
1335  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
1336  %call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %a0, i8* %arg1, <4 x i32> %arg2, <4 x float> %a3, i8 2)
1337  ret <4 x float> %call
1338}
1339
; Fast-isel test: unmasked 256-bit gather of floats with i32 indices.
; The mask comes from the llvm.x86.avx.cmp.ps.256 intrinsic (predicate
; immediate 0, zero compared with zero), so the expected code is vxorps +
; vcmpeqps instead of vpcmpeqd. Undef pass-through, scale immediate 2, then
; vgatherdps on ymm registers and a vmovaps copying the result into ymm0.
; NOTE(review): @llvm.x86.avx.cmp.ps.256 is not declared next to this test;
; confirm a declaration exists elsewhere in the file.
1340define <8 x float> @test_mm256_i32gather_ps(float *%a0, <4 x i64> %a1) {
1341; X32-LABEL: test_mm256_i32gather_ps:
1342; X32:       # BB#0:
1343; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1344; X32-NEXT:    vxorps %ymm1, %ymm1, %ymm1
1345; X32-NEXT:    vcmpeqps %ymm1, %ymm1, %ymm2
1346; X32-NEXT:    vgatherdps %ymm2, (%eax,%ymm0,2), %ymm1
1347; X32-NEXT:    vmovaps %ymm1, %ymm0
1348; X32-NEXT:    retl
1349;
1350; X64-LABEL: test_mm256_i32gather_ps:
1351; X64:       # BB#0:
1352; X64-NEXT:    vxorps %ymm1, %ymm1, %ymm1
1353; X64-NEXT:    vcmpeqps %ymm1, %ymm1, %ymm2
1354; X64-NEXT:    vgatherdps %ymm2, (%rdi,%ymm0,2), %ymm1
1355; X64-NEXT:    vmovaps %ymm1, %ymm0
1356; X64-NEXT:    retq
1357  %arg0 = bitcast float *%a0 to i8*
1358  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1359  %mask = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> zeroinitializer, <8 x float> zeroinitializer, i8 0)
1360  %call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8* %arg0, <8 x i32> %arg1, <8 x float> %mask, i8 2)
1361  ret <8 x float> %call
1362}
1363declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float>, i8*, <8 x i32>, <8 x float>, i8) nounwind readonly
1364
; Fast-isel test: masked 256-bit gather of floats with i32 indices.
; Pass-through (%a0) and mask (%a3) are <8 x float>; indices (%a2) are bitcast
; to <8 x i32>. Scale immediate is 2. Expects a lone vgatherdps on ymm
; registers that writes ymm0 in place.
1365define <8 x float> @test_mm256_mask_i32gather_ps(<8 x float> %a0, float *%a1, <4 x i64> %a2, <8 x float> %a3) {
1366; X32-LABEL: test_mm256_mask_i32gather_ps:
1367; X32:       # BB#0:
1368; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1369; X32-NEXT:    vgatherdps %ymm2, (%eax,%ymm1,2), %ymm0
1370; X32-NEXT:    retl
1371;
1372; X64-LABEL: test_mm256_mask_i32gather_ps:
1373; X64:       # BB#0:
1374; X64-NEXT:    vgatherdps %ymm2, (%rdi,%ymm1,2), %ymm0
1375; X64-NEXT:    retq
1376  %arg1 = bitcast float *%a1 to i8*
1377  %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
1378  %call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0, i8* %arg1, <8 x i32> %arg2, <8 x float> %a3, i8 2)
1379  ret <8 x float> %call
1380}
1381
; Fast-isel test: unmasked 128-bit gather of i32 elements with i64 indices.
; %a1 is used directly as the <2 x i64> index vector. Undef pass-through,
; all-ones constant mask (via vpcmpeqd), scale immediate 2. Expects
; vpgatherqd followed by a vmovdqa copying the result into xmm0.
1382define <2 x i64> @test_mm_i64gather_epi32(i32 *%a0, <2 x i64> %a1) {
1383; X32-LABEL: test_mm_i64gather_epi32:
1384; X32:       # BB#0:
1385; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1386; X32-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1387; X32-NEXT:    vpgatherqd %xmm2, (%eax,%xmm0,2), %xmm1
1388; X32-NEXT:    vmovdqa %xmm1, %xmm0
1389; X32-NEXT:    retl
1390;
1391; X64-LABEL: test_mm_i64gather_epi32:
1392; X64:       # BB#0:
1393; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1394; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%xmm0,2), %xmm1
1395; X64-NEXT:    vmovdqa %xmm1, %xmm0
1396; X64-NEXT:    retq
1397  %arg0 = bitcast i32 *%a0 to i8*
1398  %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
1399  %call = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> undef, i8* %arg0, <2 x i64> %a1, <4 x i32> %mask, i8 2)
1400  %bc = bitcast <4 x i32> %call to <2 x i64>
1401  ret <2 x i64> %bc
1402}
1403declare <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32>, i8*, <2 x i64>, <4 x i32>, i8) nounwind readonly
1404
; Fast-isel test: masked 128-bit gather of i32 elements with i64 indices.
; Pass-through (%a0) and mask (%a3) are bitcast to <4 x i32>; the indices (%a2)
; are used directly as <2 x i64>. Scale immediate is 2. Expects a lone
; vpgatherqd that writes xmm0 in place.
1405define <2 x i64> @test_mm_mask_i64gather_epi32(<2 x i64> %a0, i32 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
1406; X32-LABEL: test_mm_mask_i64gather_epi32:
1407; X32:       # BB#0:
1408; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1409; X32-NEXT:    vpgatherqd %xmm2, (%eax,%xmm1,2), %xmm0
1410; X32-NEXT:    retl
1411;
1412; X64-LABEL: test_mm_mask_i64gather_epi32:
1413; X64:       # BB#0:
1414; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0
1415; X64-NEXT:    retq
1416  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
1417  %arg1 = bitcast i32 *%a1 to i8*
1418  %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
1419  %call = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> %arg0, i8* %arg1, <2 x i64> %a2, <4 x i32> %arg3, i8 2)
1420  %bc = bitcast <4 x i32> %call to <2 x i64>
1421  ret <2 x i64> %bc
1422}
1423
; Fast-isel test: unmasked gather of i32 elements with a 256-bit <4 x i64>
; index vector; the result is only 128 bits wide (<4 x i32>). Undef
; pass-through, all-ones mask (via vpcmpeqd on xmm), scale immediate 2.
; Expects vpgatherqd with a ymm index and xmm mask/destination, and a
; vzeroupper before returning.
1424define <2 x i64> @test_mm256_i64gather_epi32(i32 *%a0, <4 x i64> %a1) {
1425; X32-LABEL: test_mm256_i64gather_epi32:
1426; X32:       # BB#0:
1427; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1428; X32-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1429; X32-NEXT:    vpgatherqd %xmm2, (%eax,%ymm0,2), %xmm1
1430; X32-NEXT:    vmovdqa %xmm1, %xmm0
1431; X32-NEXT:    vzeroupper
1432; X32-NEXT:    retl
1433;
1434; X64-LABEL: test_mm256_i64gather_epi32:
1435; X64:       # BB#0:
1436; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1437; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm0,2), %xmm1
1438; X64-NEXT:    vmovdqa %xmm1, %xmm0
1439; X64-NEXT:    vzeroupper
1440; X64-NEXT:    retq
1441  %arg0 = bitcast i32 *%a0 to i8*
1442  %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
1443  %call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8* %arg0, <4 x i64> %a1, <4 x i32> %mask, i8 2)
1444  %bc = bitcast <4 x i32> %call to <2 x i64>
1445  ret <2 x i64> %bc
1446}
1447declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32>, i8*, <4 x i64>, <4 x i32>, i8) nounwind readonly
1448
; Fast-isel test: masked gather of i32 elements with a 256-bit <4 x i64> index
; vector and a 128-bit result. Pass-through (%a0) and mask (%a3) are bitcast to
; <4 x i32>; indices (%a2) are used directly. Scale immediate is 2. Expects
; vpgatherqd with a ymm index and xmm mask/destination, then vzeroupper.
1449define <2 x i64> @test_mm256_mask_i64gather_epi32(<2 x i64> %a0, i32 *%a1, <4 x i64> %a2, <2 x i64> %a3) {
1450; X32-LABEL: test_mm256_mask_i64gather_epi32:
1451; X32:       # BB#0:
1452; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1453; X32-NEXT:    vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0
1454; X32-NEXT:    vzeroupper
1455; X32-NEXT:    retl
1456;
1457; X64-LABEL: test_mm256_mask_i64gather_epi32:
1458; X64:       # BB#0:
1459; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0
1460; X64-NEXT:    vzeroupper
1461; X64-NEXT:    retq
1462  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
1463  %arg1 = bitcast i32 *%a1 to i8*
1464  %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
1465  %call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %arg0, i8* %arg1, <4 x i64> %a2, <4 x i32> %arg3, i8 2)
1466  %bc = bitcast <4 x i32> %call to <2 x i64>
1467  ret <2 x i64> %bc
1468}
1469
; Fast-isel test: unmasked 128-bit gather of i64 elements with i64 indices.
; Only the base pointer needs a bitcast; the indices are %a1 as-is. Undef
; pass-through, inline all-ones mask (via vpcmpeqd), scale immediate 2.
; Expects vpgatherqq followed by a vmovdqa copying the result into xmm0.
1470define <2 x i64> @test_mm_i64gather_epi64(i64 *%a0, <2 x i64> %a1) {
1471; X32-LABEL: test_mm_i64gather_epi64:
1472; X32:       # BB#0:
1473; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1474; X32-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1475; X32-NEXT:    vpgatherqq %xmm2, (%eax,%xmm0,2), %xmm1
1476; X32-NEXT:    vmovdqa %xmm1, %xmm0
1477; X32-NEXT:    retl
1478;
1479; X64-LABEL: test_mm_i64gather_epi64:
1480; X64:       # BB#0:
1481; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1482; X64-NEXT:    vpgatherqq %xmm2, (%rdi,%xmm0,2), %xmm1
1483; X64-NEXT:    vmovdqa %xmm1, %xmm0
1484; X64-NEXT:    retq
1485  %arg0 = bitcast i64 *%a0 to i8*
1486  %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> undef, i8* %arg0, <2 x i64> %a1, <2 x i64> <i64 -1, i64 -1>, i8 2)
1487  ret <2 x i64> %call
1488}
1489declare <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64>, i8*, <2 x i64>, <2 x i64>, i8) nounwind readonly
1490
; Fast-isel test: masked 128-bit gather of i64 elements with i64 indices.
; Pass-through (%a0), indices (%a2) and mask (%a3) are passed through
; unchanged; only the pointer is bitcast. Scale immediate is 2. Expects a lone
; vpgatherqq that writes xmm0 in place.
1491define <2 x i64> @test_mm_mask_i64gather_epi64(<2 x i64> %a0, i64 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
1492; X32-LABEL: test_mm_mask_i64gather_epi64:
1493; X32:       # BB#0:
1494; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1495; X32-NEXT:    vpgatherqq %xmm2, (%eax,%xmm1,2), %xmm0
1496; X32-NEXT:    retl
1497;
1498; X64-LABEL: test_mm_mask_i64gather_epi64:
1499; X64:       # BB#0:
1500; X64-NEXT:    vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0
1501; X64-NEXT:    retq
1502  %arg1 = bitcast i64 *%a1 to i8*
1503  %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> %a0, i8* %arg1, <2 x i64> %a2, <2 x i64> %a3, i8 2)
1504  ret <2 x i64> %call
1505}
1506
; Fast-isel test: unmasked 256-bit gather of i64 elements with i64 indices.
; Undef pass-through, inline all-ones <4 x i64> mask (via vpcmpeqd on ymm),
; scale immediate 2. Expects vpgatherqq on ymm registers followed by a vmovdqa
; copying the result into ymm0.
1507define <4 x i64> @test_mm256_i64gather_epi64(i64 *%a0, <4 x i64> %a1) {
1508; X32-LABEL: test_mm256_i64gather_epi64:
1509; X32:       # BB#0:
1510; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1511; X32-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
1512; X32-NEXT:    vpgatherqq %ymm2, (%eax,%ymm0,2), %ymm1
1513; X32-NEXT:    vmovdqa %ymm1, %ymm0
1514; X32-NEXT:    retl
1515;
1516; X64-LABEL: test_mm256_i64gather_epi64:
1517; X64:       # BB#0:
1518; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
1519; X64-NEXT:    vpgatherqq %ymm2, (%rdi,%ymm0,2), %ymm1
1520; X64-NEXT:    vmovdqa %ymm1, %ymm0
1521; X64-NEXT:    retq
1522  %arg0 = bitcast i64 *%a0 to i8*
1523  %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8* %arg0, <4 x i64> %a1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2)
1524  ret <4 x i64> %call
1525}
1526declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64>, i8*, <4 x i64>, <4 x i64>, i8) nounwind readonly
1527
; Fast-isel test: masked 256-bit gather of i64 elements with i64 indices.
; Pass-through (%a0), indices (%a2) and mask (%a3) are passed through
; unchanged; only the pointer is bitcast. Scale immediate is 2. Expects a lone
; vpgatherqq on ymm registers that writes ymm0 in place.
1528define <4 x i64> @test_mm256_mask_i64gather_epi64(<4 x i64> %a0, i64 *%a1, <4 x i64> %a2, <4 x i64> %a3) {
1529; X32-LABEL: test_mm256_mask_i64gather_epi64:
1530; X32:       # BB#0:
1531; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1532; X32-NEXT:    vpgatherqq %ymm2, (%eax,%ymm1,2), %ymm0
1533; X32-NEXT:    retl
1534;
1535; X64-LABEL: test_mm256_mask_i64gather_epi64:
1536; X64:       # BB#0:
1537; X64-NEXT:    vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0
1538; X64-NEXT:    retq
1539  %arg1 = bitcast i64 *%a1 to i8*
1540  %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %a0, i8* %arg1, <4 x i64> %a2, <4 x i64> %a3, i8 2)
1541  ret <4 x i64> %call
1542}
1543
; Fast-isel test: unmasked 128-bit gather of doubles with i64 indices.
; The mask is built in IR as fcmp oeq of zero against zero, sign-extended to
; <2 x i64> and bitcast to <2 x double>; the expected code materializes it with
; a single vpcmpeqd. Undef pass-through, scale immediate 2, then vgatherqpd and
; a vmovapd copying the result into xmm0.
1544define <2 x double> @test_mm_i64gather_pd(double *%a0, <2 x i64> %a1) {
1545; X32-LABEL: test_mm_i64gather_pd:
1546; X32:       # BB#0:
1547; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1548; X32-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1549; X32-NEXT:    vgatherqpd %xmm2, (%eax,%xmm0,2), %xmm1
1550; X32-NEXT:    vmovapd %xmm1, %xmm0
1551; X32-NEXT:    retl
1552;
1553; X64-LABEL: test_mm_i64gather_pd:
1554; X64:       # BB#0:
1555; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1556; X64-NEXT:    vgatherqpd %xmm2, (%rdi,%xmm0,2), %xmm1
1557; X64-NEXT:    vmovapd %xmm1, %xmm0
1558; X64-NEXT:    retq
1559  %arg0 = bitcast double *%a0 to i8*
1560  %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer
1561  %sext = sext <2 x i1> %cmp to <2 x i64>
1562  %mask = bitcast <2 x i64> %sext to <2 x double>
1563  %call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> undef, i8* %arg0, <2 x i64> %a1, <2 x double> %mask, i8 2)
1564  ret <2 x double> %call
1565}
1566declare <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double>, i8*, <2 x i64>, <2 x double>, i8) nounwind readonly
1567
; Fast-isel test: masked 128-bit gather of doubles with i64 indices.
; Pass-through (%a0), indices (%a2) and mask (%a3) are passed through
; unchanged; only the pointer is bitcast. Scale immediate is 2. Expects a lone
; vgatherqpd that writes xmm0 in place.
1568define <2 x double> @test_mm_mask_i64gather_pd(<2 x double> %a0, double *%a1, <2 x i64> %a2, <2 x double> %a3) {
1569; X32-LABEL: test_mm_mask_i64gather_pd:
1570; X32:       # BB#0:
1571; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1572; X32-NEXT:    vgatherqpd %xmm2, (%eax,%xmm1,2), %xmm0
1573; X32-NEXT:    retl
1574;
1575; X64-LABEL: test_mm_mask_i64gather_pd:
1576; X64:       # BB#0:
1577; X64-NEXT:    vgatherqpd %xmm2, (%rdi,%xmm1,2), %xmm0
1578; X64-NEXT:    retq
1579  %arg1 = bitcast double *%a1 to i8*
1580  %call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> %a0, i8* %arg1, <2 x i64> %a2, <2 x double> %a3, i8 2)
1581  ret <2 x double> %call
1582}
1583
; Unmasked 64-bit-index gather of <4 x double> with scale 2. The mask comes
; from llvm.x86.avx.cmp.pd.256 comparing zero against zero with predicate 0,
; and the first gather operand is undef.
define <4 x double> @test_mm256_i64gather_pd(double *%a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_i64gather_pd:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
; X32-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
; X32-NEXT:    vgatherqpd %ymm2, (%eax,%ymm0,2), %ymm1
; X32-NEXT:    vmovapd %ymm1, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_i64gather_pd:
; X64:       # BB#0:
; X64-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
; X64-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
; X64-NEXT:    vgatherqpd %ymm2, (%rdi,%ymm0,2), %ymm1
; X64-NEXT:    vmovapd %ymm1, %ymm0
; X64-NEXT:    retq
  %base = bitcast double *%a0 to i8*
  %allset = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0)
  %gather = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8* %base, <4 x i64> %a1, <4 x double> %allset, i8 2)
  ret <4 x double> %gather
}
declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double>, i8*, <4 x i64>, <4 x double>, i8) nounwind readonly
1607
; Masked 64-bit-index gather of <4 x double> with scale 2. %a0, %a2 and %a3
; go to the intrinsic as-is; only the base pointer is cast to i8*.
; The base pointer is typed double* to match the gathered element type and the
; other *_i64gather_pd tests in this file; it is cast to i8* immediately, so
; the expected assembly is unaffected.
define <4 x double> @test_mm256_mask_i64gather_pd(<4 x double> %a0, double *%a1, <4 x i64> %a2, <4 x double> %a3) {
; X32-LABEL: test_mm256_mask_i64gather_pd:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vgatherqpd %ymm2, (%eax,%ymm1,2), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i64gather_pd:
; X64:       # BB#0:
; X64-NEXT:    vgatherqpd %ymm2, (%rdi,%ymm1,2), %ymm0
; X64-NEXT:    retq
  %arg1 = bitcast double *%a1 to i8*
  %call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %a0, i8* %arg1, <4 x i64> %a2, <4 x double> %a3, i8 2)
  ret <4 x double> %call
}
1623
; Unmasked gather of <4 x float> through <2 x i64> indices with scale 2. The
; mask is an always-true compare (0.0 oeq 0.0) sign-extended to i32 lanes, and
; the first intrinsic operand is undef.
define <4 x float> @test_mm_i64gather_ps(float *%a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_i64gather_ps:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X32-NEXT:    vgatherqps %xmm2, (%eax,%xmm0,2), %xmm1
; X32-NEXT:    vmovaps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_i64gather_ps:
; X64:       # BB#0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vgatherqps %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovaps %xmm1, %xmm0
; X64-NEXT:    retq
  %base = bitcast float *%a0 to i8*
  %eq = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
  %ones = sext <4 x i1> %eq to <4 x i32>
  %allset = bitcast <4 x i32> %ones to <4 x float>
  %gather = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> undef, i8* %base, <2 x i64> %a1, <4 x float> %allset, i8 2)
  ret <4 x float> %gather
}
declare <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float>, i8*, <2 x i64>, <4 x float>, i8) nounwind readonly
1647
; Masked gather of <4 x float> through <2 x i64> indices with scale 2. %a0,
; %a2 and %a3 go to the intrinsic as-is; only the base pointer is cast.
define <4 x float> @test_mm_mask_i64gather_ps(<4 x float> %a0, float *%a1, <2 x i64> %a2, <4 x float> %a3) {
; X32-LABEL: test_mm_mask_i64gather_ps:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vgatherqps %xmm2, (%eax,%xmm1,2), %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i64gather_ps:
; X64:       # BB#0:
; X64-NEXT:    vgatherqps %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %base = bitcast float *%a1 to i8*
  %gather = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> %a0, i8* %base, <2 x i64> %a2, <4 x float> %a3, i8 2)
  ret <4 x float> %gather
}
1663
; Unmasked gather of <4 x float> through <4 x i64> indices with scale 2. The
; result is only 128 bits wide while the index vector is 256 bits, so the
; expected code ends with vzeroupper. The mask is an always-true compare
; sign-extended to i32 lanes.
define <4 x float> @test_mm256_i64gather_ps(float *%a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_i64gather_ps:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X32-NEXT:    vgatherqps %xmm2, (%eax,%ymm0,2), %xmm1
; X32-NEXT:    vmovaps %xmm1, %xmm0
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_i64gather_ps:
; X64:       # BB#0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vgatherqps %xmm2, (%rdi,%ymm0,2), %xmm1
; X64-NEXT:    vmovaps %xmm1, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %base = bitcast float *%a0 to i8*
  %eq = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
  %ones = sext <4 x i1> %eq to <4 x i32>
  %allset = bitcast <4 x i32> %ones to <4 x float>
  %gather = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8* %base, <4 x i64> %a1, <4 x float> %allset, i8 2)
  ret <4 x float> %gather
}
declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float>, i8*, <4 x i64>, <4 x float>, i8) nounwind readonly
1689
; Masked gather of <4 x float> through <4 x i64> indices with scale 2. %a0,
; %a2 and %a3 go to the intrinsic as-is; only the base pointer is cast.
define <4 x float> @test_mm256_mask_i64gather_ps(<4 x float> %a0, float *%a1, <4 x i64> %a2, <4 x float> %a3) {
; X32-LABEL: test_mm256_mask_i64gather_ps:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i64gather_ps:
; X64:       # BB#0:
; X64-NEXT:    vgatherqps %xmm2, (%rdi,%ymm1,2), %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %base = bitcast float *%a1 to i8*
  %gather = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %a0, i8* %base, <4 x i64> %a2, <4 x float> %a3, i8 2)
  ret <4 x float> %gather
}
1707
; Inserts %a1 into the low 128 bits of %a0: %a1 is first widened to four lanes
; (upper two undef), then lanes 4,5 of the pair (the widened %a1) are combined
; with lanes 2,3 of %a0. Expected to lower to a single vpblendd.
define <4 x i64> @test0_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test0_mm256_inserti128_si256:
; X32:       # BB#0:
; X32-NEXT:    # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
; X32-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; X32-NEXT:    retl
;
; X64-LABEL: test0_mm256_inserti128_si256:
; X64:       # BB#0:
; X64-NEXT:    # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
; X64-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; X64-NEXT:    retq
  %wide = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %ins = shufflevector <4 x i64> %a0, <4 x i64> %wide, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x i64> %ins
}
1724
; Inserts %a1 into the high 128 bits of %a0: %a1 is widened to four lanes,
; then lanes 0,1 of %a0 are combined with lanes 4,5 (the widened %a1).
; Expected to lower to vinsertf128 with immediate 1.
define <4 x i64> @test1_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test1_mm256_inserti128_si256:
; X32:       # BB#0:
; X32-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test1_mm256_inserti128_si256:
; X64:       # BB#0:
; X64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %wide = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %ins = shufflevector <4 x i64> %a0, <4 x i64> %wide, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x i64> %ins
}
1739
; Calls llvm.x86.avx2.pmadd.wd on both operands viewed as <16 x i16>; the
; <8 x i32> result is cast back to <4 x i64>. Expected to select vpmaddwd.
define <4 x i64> @test_mm256_madd_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_madd_epi16:
; X32:       # BB#0:
; X32-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_madd_epi16:
; X64:       # BB#0:
; X64-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %lhs = bitcast <4 x i64> %a0 to <16 x i16>
  %rhs = bitcast <4 x i64> %a1 to <16 x i16>
  %madd = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %lhs, <16 x i16> %rhs)
  %res = bitcast <8 x i32> %madd to <4 x i64>
  ret <4 x i64> %res
}
declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone
1757
; Calls llvm.x86.avx2.pmadd.ub.sw on both operands viewed as <32 x i8>; the
; <16 x i16> result is cast back to <4 x i64>. Expected to select vpmaddubsw.
define <4 x i64> @test_mm256_maddubs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_maddubs_epi16:
; X32:       # BB#0:
; X32-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maddubs_epi16:
; X64:       # BB#0:
; X64-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %lhs = bitcast <4 x i64> %a0 to <32 x i8>
  %rhs = bitcast <4 x i64> %a1 to <32 x i8>
  %madd = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %lhs, <32 x i8> %rhs)
  %res = bitcast <16 x i16> %madd to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone
1775
; Masked load of <4 x i32> from %a0 under the mask in %a1 (viewed as
; <4 x i32>); the result is returned as <2 x i64>. Expected: vpmaskmovd load.
define <2 x i64> @test_mm_maskload_epi32(i32* %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_maskload_epi32:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vpmaskmovd (%eax), %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskload_epi32:
; X64:       # BB#0:
; X64-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %ptr = bitcast i32* %a0 to i8*
  %mask = bitcast <2 x i64> %a1 to <4 x i32>
  %load = call <4 x i32> @llvm.x86.avx2.maskload.d(i8* %ptr, <4 x i32> %mask)
  %res = bitcast <4 x i32> %load to <2 x i64>
  ret <2 x i64> %res
}
declare <4 x i32> @llvm.x86.avx2.maskload.d(i8*, <4 x i32>) nounwind readonly
1794
; Masked load of <8 x i32> from %a0 under the mask in %a1 (viewed as
; <8 x i32>); the result is returned as <4 x i64>. Expected: vpmaskmovd load.
define <4 x i64> @test_mm256_maskload_epi32(i32* %a0, <4 x i64> %a1) nounwind {
; X32-LABEL: test_mm256_maskload_epi32:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vpmaskmovd (%eax), %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskload_epi32:
; X64:       # BB#0:
; X64-NEXT:    vpmaskmovd (%rdi), %ymm0, %ymm0
; X64-NEXT:    retq
  %ptr = bitcast i32* %a0 to i8*
  %mask = bitcast <4 x i64> %a1 to <8 x i32>
  %load = call <8 x i32> @llvm.x86.avx2.maskload.d.256(i8* %ptr, <8 x i32> %mask)
  %res = bitcast <8 x i32> %load to <4 x i64>
  ret <4 x i64> %res
}
declare <8 x i32> @llvm.x86.avx2.maskload.d.256(i8*, <8 x i32>) nounwind readonly
1813
; Masked load of <2 x i64> from %a0 under the mask %a1, which is passed to the
; intrinsic without any cast. Expected: vpmaskmovq load.
define <2 x i64> @test_mm_maskload_epi64(i64* %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_maskload_epi64:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vpmaskmovq (%eax), %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskload_epi64:
; X64:       # BB#0:
; X64-NEXT:    vpmaskmovq (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %ptr = bitcast i64* %a0 to i8*
  %load = call <2 x i64> @llvm.x86.avx2.maskload.q(i8* %ptr, <2 x i64> %a1)
  ret <2 x i64> %load
}
declare <2 x i64> @llvm.x86.avx2.maskload.q(i8*, <2 x i64>) nounwind readonly
1830
; Masked load of <4 x i64> from %a0 under the mask %a1, which is passed to the
; intrinsic without any cast. Expected: vpmaskmovq load.
define <4 x i64> @test_mm256_maskload_epi64(i64* %a0, <4 x i64> %a1) nounwind {
; X32-LABEL: test_mm256_maskload_epi64:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vpmaskmovq (%eax), %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskload_epi64:
; X64:       # BB#0:
; X64-NEXT:    vpmaskmovq (%rdi), %ymm0, %ymm0
; X64-NEXT:    retq
  %ptr = bitcast i64* %a0 to i8*
  %load = call <4 x i64> @llvm.x86.avx2.maskload.q.256(i8* %ptr, <4 x i64> %a1)
  ret <4 x i64> %load
}
declare <4 x i64> @llvm.x86.avx2.maskload.q.256(i8*, <4 x i64>) nounwind readonly
1847
; Masked store of <4 x i32> (%a2) to %a0 under the mask %a1; both vectors are
; viewed as <4 x i32>. Expected: vpmaskmovd with a memory destination.
; The pointer is typed i32* to match the stored element type and the
; maskload_epi32 tests; it is cast to i8* immediately, so codegen is unchanged.
define void @test_mm_maskstore_epi32(i32* %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
; X32-LABEL: test_mm_maskstore_epi32:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vpmaskmovd %xmm1, %xmm0, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskstore_epi32:
; X64:       # BB#0:
; X64-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi)
; X64-NEXT:    retq
  %arg0 = bitcast i32* %a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  call void @llvm.x86.avx2.maskstore.d(i8* %arg0, <4 x i32> %arg1, <4 x i32> %arg2)
  ret void
}
; Not readnone: this intrinsic writes through its pointer operand.
declare void @llvm.x86.avx2.maskstore.d(i8*, <4 x i32>, <4 x i32>) nounwind
1866
; Masked store of <8 x i32> (%a2) to %a0 under the mask %a1; both vectors are
; viewed as <8 x i32>. Expected: vpmaskmovd with a memory destination,
; followed by vzeroupper.
; The pointer is typed i32* to match the stored element type and the
; maskload_epi32 tests; it is cast to i8* immediately, so codegen is unchanged.
define void @test_mm256_maskstore_epi32(i32* %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
; X32-LABEL: test_mm256_maskstore_epi32:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vpmaskmovd %ymm1, %ymm0, (%eax)
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskstore_epi32:
; X64:       # BB#0:
; X64-NEXT:    vpmaskmovd %ymm1, %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast i32* %a0 to i8*
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
  call void @llvm.x86.avx2.maskstore.d.256(i8* %arg0, <8 x i32> %arg1, <8 x i32> %arg2)
  ret void
}
; Not readnone: this intrinsic writes through its pointer operand.
declare void @llvm.x86.avx2.maskstore.d.256(i8*, <8 x i32>, <8 x i32>) nounwind
1887
; Masked store of <2 x i64> (%a2) to %a0 under the mask %a1; both vectors are
; passed to the intrinsic without casts. Expected: vpmaskmovq with a memory
; destination.
define void @test_mm_maskstore_epi64(i64* %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
; X32-LABEL: test_mm_maskstore_epi64:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vpmaskmovq %xmm1, %xmm0, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskstore_epi64:
; X64:       # BB#0:
; X64-NEXT:    vpmaskmovq %xmm1, %xmm0, (%rdi)
; X64-NEXT:    retq
  %arg0 = bitcast i64* %a0 to i8*
  call void @llvm.x86.avx2.maskstore.q(i8* %arg0, <2 x i64> %a1, <2 x i64> %a2)
  ret void
}
; Not readnone: this intrinsic writes through its pointer operand.
declare void @llvm.x86.avx2.maskstore.q(i8*, <2 x i64>, <2 x i64>) nounwind
1904
; Masked store of <4 x i64> (%a2) to %a0 under the mask %a1; both vectors are
; passed to the intrinsic without casts. Expected: vpmaskmovq with a memory
; destination, followed by vzeroupper.
define void @test_mm256_maskstore_epi64(i64* %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
; X32-LABEL: test_mm256_maskstore_epi64:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vpmaskmovq %ymm1, %ymm0, (%eax)
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskstore_epi64:
; X64:       # BB#0:
; X64-NEXT:    vpmaskmovq %ymm1, %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast i64* %a0 to i8*
  call void @llvm.x86.avx2.maskstore.q.256(i8* %arg0, <4 x i64> %a1, <4 x i64> %a2)
  ret void
}
; Not readnone: this intrinsic writes through its pointer operand.
declare void @llvm.x86.avx2.maskstore.q.256(i8*, <4 x i64>, <4 x i64>) nounwind
1923
; Signed per-byte maximum written as icmp sgt + select on <32 x i8>;
; expected to fold to a single vpmaxsb.
define <4 x i64> @test_mm256_max_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_max_epi8:
; X32:       # BB#0:
; X32-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_max_epi8:
; X64:       # BB#0:
; X64-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %lhs = bitcast <4 x i64> %a0 to <32 x i8>
  %rhs = bitcast <4 x i64> %a1 to <32 x i8>
  %gt = icmp sgt <32 x i8> %lhs, %rhs
  %max = select <32 x i1> %gt, <32 x i8> %lhs, <32 x i8> %rhs
  %res = bitcast <32 x i8> %max to <4 x i64>
  ret <4 x i64> %res
}
1941
; Signed per-word maximum written as icmp sgt + select on <16 x i16>;
; expected to fold to a single vpmaxsw.
define <4 x i64> @test_mm256_max_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_max_epi16:
; X32:       # BB#0:
; X32-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_max_epi16:
; X64:       # BB#0:
; X64-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %lhs = bitcast <4 x i64> %a0 to <16 x i16>
  %rhs = bitcast <4 x i64> %a1 to <16 x i16>
  %gt = icmp sgt <16 x i16> %lhs, %rhs
  %max = select <16 x i1> %gt, <16 x i16> %lhs, <16 x i16> %rhs
  %res = bitcast <16 x i16> %max to <4 x i64>
  ret <4 x i64> %res
}
1959
; Signed per-dword maximum written as icmp sgt + select on <8 x i32>;
; expected to fold to a single vpmaxsd.
define <4 x i64> @test_mm256_max_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_max_epi32:
; X32:       # BB#0:
; X32-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_max_epi32:
; X64:       # BB#0:
; X64-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %lhs = bitcast <4 x i64> %a0 to <8 x i32>
  %rhs = bitcast <4 x i64> %a1 to <8 x i32>
  %gt = icmp sgt <8 x i32> %lhs, %rhs
  %max = select <8 x i1> %gt, <8 x i32> %lhs, <8 x i32> %rhs
  %res = bitcast <8 x i32> %max to <4 x i64>
  ret <4 x i64> %res
}
1977
; Unsigned per-byte maximum written as icmp ugt + select on <32 x i8>;
; expected to fold to a single vpmaxub.
define <4 x i64> @test_mm256_max_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_max_epu8:
; X32:       # BB#0:
; X32-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_max_epu8:
; X64:       # BB#0:
; X64-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %lhs = bitcast <4 x i64> %a0 to <32 x i8>
  %rhs = bitcast <4 x i64> %a1 to <32 x i8>
  %gt = icmp ugt <32 x i8> %lhs, %rhs
  %max = select <32 x i1> %gt, <32 x i8> %lhs, <32 x i8> %rhs
  %res = bitcast <32 x i8> %max to <4 x i64>
  ret <4 x i64> %res
}
1995
; Unsigned per-word maximum written as icmp ugt + select on <16 x i16>;
; expected to fold to a single vpmaxuw.
define <4 x i64> @test_mm256_max_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_max_epu16:
; X32:       # BB#0:
; X32-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_max_epu16:
; X64:       # BB#0:
; X64-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %lhs = bitcast <4 x i64> %a0 to <16 x i16>
  %rhs = bitcast <4 x i64> %a1 to <16 x i16>
  %gt = icmp ugt <16 x i16> %lhs, %rhs
  %max = select <16 x i1> %gt, <16 x i16> %lhs, <16 x i16> %rhs
  %res = bitcast <16 x i16> %max to <4 x i64>
  ret <4 x i64> %res
}
2013
; Unsigned per-dword maximum written as icmp ugt + select on <8 x i32>;
; expected to fold to a single vpmaxud.
define <4 x i64> @test_mm256_max_epu32(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_max_epu32:
; X32:       # BB#0:
; X32-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_max_epu32:
; X64:       # BB#0:
; X64-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %lhs = bitcast <4 x i64> %a0 to <8 x i32>
  %rhs = bitcast <4 x i64> %a1 to <8 x i32>
  %gt = icmp ugt <8 x i32> %lhs, %rhs
  %max = select <8 x i1> %gt, <8 x i32> %lhs, <8 x i32> %rhs
  %res = bitcast <8 x i32> %max to <4 x i64>
  ret <4 x i64> %res
}
2031
; Signed per-byte minimum written as icmp slt + select on <32 x i8>;
; expected to fold to a single vpminsb.
define <4 x i64> @test_mm256_min_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_min_epi8:
; X32:       # BB#0:
; X32-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_min_epi8:
; X64:       # BB#0:
; X64-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %lhs = bitcast <4 x i64> %a0 to <32 x i8>
  %rhs = bitcast <4 x i64> %a1 to <32 x i8>
  %lt = icmp slt <32 x i8> %lhs, %rhs
  %min = select <32 x i1> %lt, <32 x i8> %lhs, <32 x i8> %rhs
  %res = bitcast <32 x i8> %min to <4 x i64>
  ret <4 x i64> %res
}
2049
; Signed per-word minimum written as icmp slt + select on <16 x i16>;
; expected to fold to a single vpminsw.
define <4 x i64> @test_mm256_min_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_min_epi16:
; X32:       # BB#0:
; X32-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_min_epi16:
; X64:       # BB#0:
; X64-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %lhs = bitcast <4 x i64> %a0 to <16 x i16>
  %rhs = bitcast <4 x i64> %a1 to <16 x i16>
  %lt = icmp slt <16 x i16> %lhs, %rhs
  %min = select <16 x i1> %lt, <16 x i16> %lhs, <16 x i16> %rhs
  %res = bitcast <16 x i16> %min to <4 x i64>
  ret <4 x i64> %res
}
2067
; Signed per-dword minimum written as icmp slt + select on <8 x i32>;
; expected to fold to a single vpminsd.
define <4 x i64> @test_mm256_min_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_min_epi32:
; X32:       # BB#0:
; X32-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_min_epi32:
; X64:       # BB#0:
; X64-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %lhs = bitcast <4 x i64> %a0 to <8 x i32>
  %rhs = bitcast <4 x i64> %a1 to <8 x i32>
  %lt = icmp slt <8 x i32> %lhs, %rhs
  %min = select <8 x i1> %lt, <8 x i32> %lhs, <8 x i32> %rhs
  %res = bitcast <8 x i32> %min to <4 x i64>
  ret <4 x i64> %res
}
2085
; Unsigned per-byte minimum written as icmp ult + select on <32 x i8>;
; expected to fold to a single vpminub.
define <4 x i64> @test_mm256_min_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_min_epu8:
; X32:       # BB#0:
; X32-NEXT:    vpminub %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_min_epu8:
; X64:       # BB#0:
; X64-NEXT:    vpminub %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %lhs = bitcast <4 x i64> %a0 to <32 x i8>
  %rhs = bitcast <4 x i64> %a1 to <32 x i8>
  %lt = icmp ult <32 x i8> %lhs, %rhs
  %min = select <32 x i1> %lt, <32 x i8> %lhs, <32 x i8> %rhs
  %res = bitcast <32 x i8> %min to <4 x i64>
  ret <4 x i64> %res
}
2103
; Unsigned per-word minimum written as icmp ult + select on <16 x i16>;
; expected to fold to a single vpminuw.
define <4 x i64> @test_mm256_min_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_min_epu16:
; X32:       # BB#0:
; X32-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_min_epu16:
; X64:       # BB#0:
; X64-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %lhs = bitcast <4 x i64> %a0 to <16 x i16>
  %rhs = bitcast <4 x i64> %a1 to <16 x i16>
  %lt = icmp ult <16 x i16> %lhs, %rhs
  %min = select <16 x i1> %lt, <16 x i16> %lhs, <16 x i16> %rhs
  %res = bitcast <16 x i16> %min to <4 x i64>
  ret <4 x i64> %res
}
2121
; Unsigned per-dword minimum written as icmp ult + select on <8 x i32>;
; expected to fold to a single vpminud.
define <4 x i64> @test_mm256_min_epu32(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_min_epu32:
; X32:       # BB#0:
; X32-NEXT:    vpminud %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_min_epu32:
; X64:       # BB#0:
; X64-NEXT:    vpminud %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %lhs = bitcast <4 x i64> %a0 to <8 x i32>
  %rhs = bitcast <4 x i64> %a1 to <8 x i32>
  %lt = icmp ult <8 x i32> %lhs, %rhs
  %min = select <8 x i1> %lt, <8 x i32> %lhs, <8 x i32> %rhs
  %res = bitcast <8 x i32> %min to <4 x i64>
  ret <4 x i64> %res
}
2139
; Calls llvm.x86.avx2.pmovmskb on the input viewed as <32 x i8> and returns
; its i32 result. Expected: vpmovmskb into eax, then vzeroupper.
define i32 @test_mm256_movemask_epi8(<4 x i64> %a0) nounwind {
; X32-LABEL: test_mm256_movemask_epi8:
; X32:       # BB#0:
; X32-NEXT:    vpmovmskb %ymm0, %eax
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_movemask_epi8:
; X64:       # BB#0:
; X64-NEXT:    vpmovmskb %ymm0, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %bytes = bitcast <4 x i64> %a0 to <32 x i8>
  %bits = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %bytes)
  ret i32 %bits
}
declare i32 @llvm.x86.avx2.pmovmskb(<32 x i8>) nounwind readnone
2157
; Calls llvm.x86.avx2.mpsadbw with immediate 3 on both operands viewed as
; <32 x i8>; the <16 x i16> result is cast back to <4 x i64>. Expected:
; vmpsadbw with the same immediate.
define <4 x i64> @test_mm256_mpsadbw_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_mpsadbw_epu8:
; X32:       # BB#0:
; X32-NEXT:    vmpsadbw $3, %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mpsadbw_epu8:
; X64:       # BB#0:
; X64-NEXT:    vmpsadbw $3, %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %lhs = bitcast <4 x i64> %a0 to <32 x i8>
  %rhs = bitcast <4 x i64> %a1 to <32 x i8>
  %sad = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %lhs, <32 x i8> %rhs, i8 3)
  %res = bitcast <16 x i16> %sad to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind readnone
2175
; Calls llvm.x86.avx2.pmul.dq on both operands viewed as <8 x i32>; the
; intrinsic already returns <4 x i64>. Expected to select vpmuldq.
define <4 x i64> @test_mm256_mul_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_mul_epi32:
; X32:       # BB#0:
; X32-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mul_epi32:
; X64:       # BB#0:
; X64-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %lhs = bitcast <4 x i64> %a0 to <8 x i32>
  %rhs = bitcast <4 x i64> %a1 to <8 x i32>
  %prod = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> %lhs, <8 x i32> %rhs)
  ret <4 x i64> %prod
}
declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone
2192
; Calls llvm.x86.avx2.pmulu.dq on both operands viewed as <8 x i32>; the
; intrinsic already returns <4 x i64>. Expected to select vpmuludq.
define <4 x i64> @test_mm256_mul_epu32(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_mul_epu32:
; X32:       # BB#0:
; X32-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mul_epu32:
; X64:       # BB#0:
; X64-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %lhs = bitcast <4 x i64> %a0 to <8 x i32>
  %rhs = bitcast <4 x i64> %a1 to <8 x i32>
  %prod = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> %lhs, <8 x i32> %rhs)
  ret <4 x i64> %prod
}
declare <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32>, <8 x i32>) nounwind readnone
2209
; Calls llvm.x86.avx2.pmulh.w on both operands viewed as <16 x i16> and casts
; the result back to <4 x i64>. Expected to select vpmulhw.
define <4 x i64> @test_mm256_mulhi_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_mulhi_epi16:
; X32:       # BB#0:
; X32-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mulhi_epi16:
; X64:       # BB#0:
; X64-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %lhs = bitcast <4 x i64> %a0 to <16 x i16>
  %rhs = bitcast <4 x i64> %a1 to <16 x i16>
  %hi = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %lhs, <16 x i16> %rhs)
  %res = bitcast <16 x i16> %hi to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16>, <16 x i16>) nounwind readnone
2227
; Calls llvm.x86.avx2.pmulhu.w on both operands viewed as <16 x i16> and casts
; the result back to <4 x i64>. Expected to select vpmulhuw.
define <4 x i64> @test_mm256_mulhi_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_mulhi_epu16:
; X32:       # BB#0:
; X32-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mulhi_epu16:
; X64:       # BB#0:
; X64-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %lhs = bitcast <4 x i64> %a0 to <16 x i16>
  %rhs = bitcast <4 x i64> %a1 to <16 x i16>
  %hi = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %lhs, <16 x i16> %rhs)
  %res = bitcast <16 x i16> %hi to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>) nounwind readnone
2245
; Calls llvm.x86.avx2.pmul.hr.sw on both operands viewed as <16 x i16> and
; casts the result back to <4 x i64>. Expected to select vpmulhrsw.
define <4 x i64> @test_mm256_mulhrs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_mulhrs_epi16:
; X32:       # BB#0:
; X32-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mulhrs_epi16:
; X64:       # BB#0:
; X64-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %lhs = bitcast <4 x i64> %a0 to <16 x i16>
  %rhs = bitcast <4 x i64> %a1 to <16 x i16>
  %hrs = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %lhs, <16 x i16> %rhs)
  %res = bitcast <16 x i16> %hrs to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind readnone
2263
; Plain IR mul on <16 x i16> views of the operands, cast back to <4 x i64>.
; Expected to select vpmullw.
define <4 x i64> @test_mm256_mullo_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_mullo_epi16:
; X32:       # BB#0:
; X32-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mullo_epi16:
; X64:       # BB#0:
; X64-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %lhs = bitcast <4 x i64> %a0 to <16 x i16>
  %rhs = bitcast <4 x i64> %a1 to <16 x i16>
  %prod = mul <16 x i16> %lhs, %rhs
  %res = bitcast <16 x i16> %prod to <4 x i64>
  ret <4 x i64> %res
}
2280
; Plain IR mul on <8 x i32> views of the operands, cast back to <4 x i64>.
; Expected to select vpmulld.
define <4 x i64> @test_mm256_mullo_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_mullo_epi32:
; X32:       # BB#0:
; X32-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mullo_epi32:
; X64:       # BB#0:
; X64-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %lhs = bitcast <4 x i64> %a0 to <8 x i32>
  %rhs = bitcast <4 x i64> %a1 to <8 x i32>
  %prod = mul <8 x i32> %lhs, %rhs
  %res = bitcast <8 x i32> %prod to <4 x i64>
  ret <4 x i64> %res
}
2297
; Bitwise OR of two <4 x i64> values using the plain IR `or` instruction.
; The expected instruction is vorps.
define <4 x i64> @test_mm256_or_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; X32-LABEL: test_mm256_or_si256:
; X32:       # BB#0:
; X32-NEXT:    vorps %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_or_si256:
; X64:       # BB#0:
; X64-NEXT:    vorps %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %or = or <4 x i64> %a0, %a1
  ret <4 x i64> %or
}
2311
; Calls llvm.x86.avx2.packsswb on both operands viewed as <16 x i16>; the
; <32 x i8> result is cast back to <4 x i64>. Expected to select vpacksswb.
define <4 x i64> @test_mm256_packs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_packs_epi16:
; X32:       # BB#0:
; X32-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_packs_epi16:
; X64:       # BB#0:
; X64-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %lhs = bitcast <4 x i64> %a0 to <16 x i16>
  %rhs = bitcast <4 x i64> %a1 to <16 x i16>
  %packed = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %lhs, <16 x i16> %rhs)
  %out = bitcast <32 x i8> %packed to <4 x i64>
  ret <4 x i64> %out
}
declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone
2329
; Calls llvm.x86.avx2.packssdw on both operands viewed as <8 x i32>; the
; <16 x i16> result is cast back to <4 x i64>. Expected to select vpackssdw.
define <4 x i64> @test_mm256_packs_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_packs_epi32:
; X32:       # BB#0:
; X32-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_packs_epi32:
; X64:       # BB#0:
; X64-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %lhs = bitcast <4 x i64> %a0 to <8 x i32>
  %rhs = bitcast <4 x i64> %a1 to <8 x i32>
  %packed = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %lhs, <8 x i32> %rhs)
  %out = bitcast <16 x i16> %packed to <4 x i64>
  ret <4 x i64> %out
}
declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone
2347
; Calls llvm.x86.avx2.packuswb on both operands viewed as <16 x i16>; the
; <32 x i8> result is cast back to <4 x i64>. Expected to select vpackuswb.
define <4 x i64> @test_mm256_packus_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_packus_epi16:
; X32:       # BB#0:
; X32-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_packus_epi16:
; X64:       # BB#0:
; X64-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %lhs = bitcast <4 x i64> %a0 to <16 x i16>
  %rhs = bitcast <4 x i64> %a1 to <16 x i16>
  %packed = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %lhs, <16 x i16> %rhs)
  %out = bitcast <32 x i8> %packed to <4 x i64>
  ret <4 x i64> %out
}
declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone
2365
; Calls llvm.x86.avx2.packusdw on both operands viewed as <8 x i32>; the
; <16 x i16> result is cast back to <4 x i64>. Expected to select vpackusdw.
define <4 x i64> @test_mm256_packus_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_packus_epi32:
; X32:       # BB#0:
; X32-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_packus_epi32:
; X64:       # BB#0:
; X64-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %lhs = bitcast <4 x i64> %a0 to <8 x i32>
  %rhs = bitcast <4 x i64> %a1 to <8 x i32>
  %packed = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %lhs, <8 x i32> %rhs)
  %out = bitcast <16 x i16> %packed to <4 x i64>
  ret <4 x i64> %out
}
declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone
2383
; Calls llvm.x86.avx2.vperm2i128 with immediate 49 (0x31). Per the expected
; shuffle comment, the result is the upper half of %a0 followed by the upper
; half of %a1; the expected instruction is vperm2f128.
define <4 x i64> @test_mm256_permute2x128_si256(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_permute2x128_si256:
; X32:       # BB#0:
; X32-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_permute2x128_si256:
; X64:       # BB#0:
; X64-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; X64-NEXT:    retq
  %res = call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %a0, <4 x i64> %a1, i8 49)
  ret <4 x i64> %res
}
; Declared readnone like the other register-only intrinsics in this file: the
; intrinsic has no pointer operand to read through.
declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8) nounwind readnone
2398
; Written as a plain shufflevector of %a0 with mask <3, 0, 2, 0> (second operand
; undef) rather than an intrinsic call. Both check groups expect a vpermq whose
; printed index list matches that mask.
2399define <4 x i64> @test_mm256_permute4x64_epi64(<4 x i64> %a0) {
2400; X32-LABEL: test_mm256_permute4x64_epi64:
2401; X32:       # BB#0:
2402; X32-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,0,2,0]
2403; X32-NEXT:    retl
2404;
2405; X64-LABEL: test_mm256_permute4x64_epi64:
2406; X64:       # BB#0:
2407; X64-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,0,2,0]
2408; X64-NEXT:    retq
2409  %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 0>
2410  ret <4 x i64> %res
2411}
2412
; Written as a plain shufflevector of the <4 x double> argument with mask
; <1, 2, 1, 0> (second operand undef). Both check groups expect a vpermpd whose
; printed index list matches that mask.
2413define <4 x double> @test_mm256_permute4x64_pd(<4 x double> %a0) {
2414; X32-LABEL: test_mm256_permute4x64_pd:
2415; X32:       # BB#0:
2416; X32-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,2,1,0]
2417; X32-NEXT:    retl
2418;
2419; X64-LABEL: test_mm256_permute4x64_pd:
2420; X64:       # BB#0:
2421; X64-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,2,1,0]
2422; X64-NEXT:    retq
2423  %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
2424  ret <4 x double> %res
2425}
2426
; Reinterprets both <4 x i64> arguments as <8 x i32>, calls
; @llvm.x86.avx2.permd, and casts the result back to <4 x i64>. Both check
; groups expect a single vpermd with %ymm0 printed first and %ymm1 second.
2427define <4 x i64> @test_mm256_permutevar8x32_epi32(<4 x i64> %a0, <4 x i64> %a1) {
2428; X32-LABEL: test_mm256_permutevar8x32_epi32:
2429; X32:       # BB#0:
2430; X32-NEXT:    vpermd %ymm0, %ymm1, %ymm0
2431; X32-NEXT:    retl
2432;
2433; X64-LABEL: test_mm256_permutevar8x32_epi32:
2434; X64:       # BB#0:
2435; X64-NEXT:    vpermd %ymm0, %ymm1, %ymm0
2436; X64-NEXT:    retq
2437  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2438  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2439  %call = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %arg0, <8 x i32> %arg1)
2440  %res = bitcast <8 x i32> %call to <4 x i64>
2441  ret <4 x i64> %res
2442}
; NOTE(review): this declare is marked readonly, whereas most other intrinsic
; declares in this file are marked readnone -- confirm the difference is intended.
2443declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly
2444
; Passes the <8 x float> argument through unchanged, reinterprets the
; <4 x i64> index argument as <8 x i32>, and calls @llvm.x86.avx2.permps.
; Both check groups expect a single vpermps with %ymm0 printed first and
; %ymm1 second.
2445define <8 x float> @test_mm256_permutevar8x32_ps(<8 x float> %a0, <4 x i64> %a1) {
2446; X32-LABEL: test_mm256_permutevar8x32_ps:
2447; X32:       # BB#0:
2448; X32-NEXT:    vpermps %ymm0, %ymm1, %ymm0
2449; X32-NEXT:    retl
2450;
2451; X64-LABEL: test_mm256_permutevar8x32_ps:
2452; X64:       # BB#0:
2453; X64-NEXT:    vpermps %ymm0, %ymm1, %ymm0
2454; X64-NEXT:    retq
2455  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2456  %res = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %arg1)
2457  ret <8 x float> %res
2458}
; NOTE(review): this declare is marked readonly, whereas most other intrinsic
; declares in this file are marked readnone -- confirm the difference is intended.
2459declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly
2460
; Reinterprets both <4 x i64> arguments as <32 x i8> and calls
; @llvm.x86.avx2.psad.bw, which already returns <4 x i64>, so no result cast
; is needed. Both check groups expect a single vpsadbw before the return.
2461define <4 x i64> @test_mm256_sad_epu8(<4 x i64> %a0, <4 x i64> %a1) {
2462; X32-LABEL: test_mm256_sad_epu8:
2463; X32:       # BB#0:
2464; X32-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
2465; X32-NEXT:    retl
2466;
2467; X64-LABEL: test_mm256_sad_epu8:
2468; X64:       # BB#0:
2469; X64-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
2470; X64-NEXT:    retq
2471  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2472  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2473  %res = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %arg0, <32 x i8> %arg1)
2474  ret <4 x i64> %res
2475}
2476declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone
2477
; Reinterprets %a0 as <8 x i32> and applies a shufflevector with mask
; <3, 3, 0, 0, 7, 7, 4, 4> (the same pattern in elements 0-3 and 4-7), then
; casts back to <4 x i64>. Both check groups expect a vpshufd whose printed
; index list matches that mask.
2478define <4 x i64> @test_mm256_shuffle_epi32(<4 x i64> %a0) {
2479; X32-LABEL: test_mm256_shuffle_epi32:
2480; X32:       # BB#0:
2481; X32-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,3,0,0,7,7,4,4]
2482; X32-NEXT:    retl
2483;
2484; X64-LABEL: test_mm256_shuffle_epi32:
2485; X64:       # BB#0:
2486; X64-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,3,0,0,7,7,4,4]
2487; X64-NEXT:    retq
2488  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2489  %shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 0, i32 0, i32 7, i32 7, i32 4, i32 4>
2490  %res = bitcast <8 x i32> %shuf to <4 x i64>
2491  ret <4 x i64> %res
2492}
2493
; Reinterprets both <4 x i64> arguments as <32 x i8>, calls
; @llvm.x86.avx2.pshuf.b, and casts the result back to <4 x i64>.
; Both check groups expect a single vpshufb before the return.
2494define <4 x i64> @test_mm256_shuffle_epi8(<4 x i64> %a0, <4 x i64> %a1) {
2495; X32-LABEL: test_mm256_shuffle_epi8:
2496; X32:       # BB#0:
2497; X32-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
2498; X32-NEXT:    retl
2499;
2500; X64-LABEL: test_mm256_shuffle_epi8:
2501; X64:       # BB#0:
2502; X64-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
2503; X64-NEXT:    retq
2504  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2505  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2506  %shuf = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %arg0, <32 x i8> %arg1)
2507  %res = bitcast <32 x i8> %shuf to <4 x i64>
2508  ret <4 x i64> %res
2509}
2510declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone
2511
; Reinterprets %a0 as <16 x i16> and applies a shufflevector that keeps
; elements 0-3 and 8-11 in place while reordering 4-7 to <7, 6, 6, 5> and
; 12-15 to <15, 14, 14, 13>, then casts back to <4 x i64>. Both check groups
; expect a vpshufhw whose printed index list matches that mask.
2512define <4 x i64> @test_mm256_shufflehi_epi16(<4 x i64> %a0) {
2513; X32-LABEL: test_mm256_shufflehi_epi16:
2514; X32:       # BB#0:
2515; X32-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,6,5,8,9,10,11,15,14,14,13]
2516; X32-NEXT:    retl
2517;
2518; X64-LABEL: test_mm256_shufflehi_epi16:
2519; X64:       # BB#0:
2520; X64-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,6,5,8,9,10,11,15,14,14,13]
2521; X64-NEXT:    retq
2522  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2523  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 14, i32 13>
2524  %res = bitcast <16 x i16> %shuf to <4 x i64>
2525  ret <4 x i64> %res
2526}
2527
; Reinterprets %a0 as <16 x i16> and applies a shufflevector that reorders
; elements 0-3 to <3, 0, 1, 1> and 8-11 to <11, 8, 9, 9> while keeping 4-7 and
; 12-15 in place, then casts back to <4 x i64>. Both check groups expect a
; vpshuflw whose printed index list matches that mask.
2528define <4 x i64> @test_mm256_shufflelo_epi16(<4 x i64> %a0) {
2529; X32-LABEL: test_mm256_shufflelo_epi16:
2530; X32:       # BB#0:
2531; X32-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,0,1,1,4,5,6,7,11,8,9,9,12,13,14,15]
2532; X32-NEXT:    retl
2533;
2534; X64-LABEL: test_mm256_shufflelo_epi16:
2535; X64:       # BB#0:
2536; X64-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,0,1,1,4,5,6,7,11,8,9,9,12,13,14,15]
2537; X64-NEXT:    retq
2538  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2539  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 3, i32 0, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 9, i32 9, i32 12, i32 13, i32 14, i32 15>
2540  %res = bitcast <16 x i16> %shuf to <4 x i64>
2541  ret <4 x i64> %res
2542}
2543
; Reinterprets both <4 x i64> arguments as <32 x i8>, calls
; @llvm.x86.avx2.psign.b, and casts the result back to <4 x i64>.
; Both check groups expect a single vpsignb before the return.
2544define <4 x i64> @test_mm256_sign_epi8(<4 x i64> %a0, <4 x i64> %a1) {
2545; X32-LABEL: test_mm256_sign_epi8:
2546; X32:       # BB#0:
2547; X32-NEXT:    vpsignb %ymm1, %ymm0, %ymm0
2548; X32-NEXT:    retl
2549;
2550; X64-LABEL: test_mm256_sign_epi8:
2551; X64:       # BB#0:
2552; X64-NEXT:    vpsignb %ymm1, %ymm0, %ymm0
2553; X64-NEXT:    retq
2554  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2555  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2556  %call = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %arg0, <32 x i8> %arg1)
2557  %res = bitcast <32 x i8> %call to <4 x i64>
2558  ret <4 x i64> %res
2559}
2560declare <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8>, <32 x i8>) nounwind readnone
2561
; Reinterprets both <4 x i64> arguments as <16 x i16>, calls
; @llvm.x86.avx2.psign.w, and casts the result back to <4 x i64>.
; Both check groups expect a single vpsignw before the return.
2562define <4 x i64> @test_mm256_sign_epi16(<4 x i64> %a0, <4 x i64> %a1) {
2563; X32-LABEL: test_mm256_sign_epi16:
2564; X32:       # BB#0:
2565; X32-NEXT:    vpsignw %ymm1, %ymm0, %ymm0
2566; X32-NEXT:    retl
2567;
2568; X64-LABEL: test_mm256_sign_epi16:
2569; X64:       # BB#0:
2570; X64-NEXT:    vpsignw %ymm1, %ymm0, %ymm0
2571; X64-NEXT:    retq
2572  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2573  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
2574  %call = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %arg0, <16 x i16> %arg1)
2575  %res = bitcast <16 x i16> %call to <4 x i64>
2576  ret <4 x i64> %res
2577}
2578declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readnone
2579
; Reinterprets both <4 x i64> arguments as <8 x i32>, calls
; @llvm.x86.avx2.psign.d, and casts the result back to <4 x i64>.
; Both check groups expect a single vpsignd before the return.
2580define <4 x i64> @test_mm256_sign_epi32(<4 x i64> %a0, <4 x i64> %a1) {
2581; X32-LABEL: test_mm256_sign_epi32:
2582; X32:       # BB#0:
2583; X32-NEXT:    vpsignd %ymm1, %ymm0, %ymm0
2584; X32-NEXT:    retl
2585;
2586; X64-LABEL: test_mm256_sign_epi32:
2587; X64:       # BB#0:
2588; X64-NEXT:    vpsignd %ymm1, %ymm0, %ymm0
2589; X64-NEXT:    retq
2590  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2591  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2592  %call = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %arg0, <8 x i32> %arg1)
2593  %res = bitcast <8 x i32> %call to <4 x i64>
2594  ret <4 x i64> %res
2595}
2596declare <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32>, <8 x i32>) nounwind readnone
2597
; Reinterprets the <4 x i64> value as <16 x i16> and the <2 x i64> count as
; <8 x i16>, calls @llvm.x86.avx2.psll.w, and casts the result back to
; <4 x i64>. Both check groups expect a vpsllw taking the count from %xmm1.
2598define <4 x i64> @test_mm256_sll_epi16(<4 x i64> %a0, <2 x i64> %a1) {
2599; X32-LABEL: test_mm256_sll_epi16:
2600; X32:       # BB#0:
2601; X32-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
2602; X32-NEXT:    retl
2603;
2604; X64-LABEL: test_mm256_sll_epi16:
2605; X64:       # BB#0:
2606; X64-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
2607; X64-NEXT:    retq
2608  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2609  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
2610  %res = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %arg0, <8 x i16> %arg1)
2611  %bc = bitcast <16 x i16> %res to <4 x i64>
2612  ret <4 x i64> %bc
2613}
2614declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnone
2615
; Reinterprets the <4 x i64> value as <8 x i32> and the <2 x i64> count as
; <4 x i32>, calls @llvm.x86.avx2.psll.d, and casts the result back to
; <4 x i64>. Both check groups expect a vpslld taking the count from %xmm1.
2616define <4 x i64> @test_mm256_sll_epi32(<4 x i64> %a0, <2 x i64> %a1) {
2617; X32-LABEL: test_mm256_sll_epi32:
2618; X32:       # BB#0:
2619; X32-NEXT:    vpslld %xmm1, %ymm0, %ymm0
2620; X32-NEXT:    retl
2621;
2622; X64-LABEL: test_mm256_sll_epi32:
2623; X64:       # BB#0:
2624; X64-NEXT:    vpslld %xmm1, %ymm0, %ymm0
2625; X64-NEXT:    retq
2626  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2627  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
2628  %res = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %arg0, <4 x i32> %arg1)
2629  %bc = bitcast <8 x i32> %res to <4 x i64>
2630  ret <4 x i64> %bc
2631}
2632declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone
2633
; Calls @llvm.x86.avx2.psll.q directly on the <4 x i64> value and <2 x i64>
; count; no bitcasts are needed because the argument types already match.
; Both check groups expect a vpsllq taking the count from %xmm1.
2634define <4 x i64> @test_mm256_sll_epi64(<4 x i64> %a0, <2 x i64> %a1) {
2635; X32-LABEL: test_mm256_sll_epi64:
2636; X32:       # BB#0:
2637; X32-NEXT:    vpsllq %xmm1, %ymm0, %ymm0
2638; X32-NEXT:    retl
2639;
2640; X64-LABEL: test_mm256_sll_epi64:
2641; X64:       # BB#0:
2642; X64-NEXT:    vpsllq %xmm1, %ymm0, %ymm0
2643; X64-NEXT:    retq
2644  %res = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1)
2645  ret <4 x i64> %res
2646}
2647declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone
2648
; Reinterprets %a0 as <16 x i16>, calls @llvm.x86.avx2.pslli.w with the
; constant count i32 3, and casts the result back to <4 x i64>. Both check
; groups expect a vpsllw with the immediate $3.
2649define <4 x i64> @test_mm256_slli_epi16(<4 x i64> %a0) {
2650; X32-LABEL: test_mm256_slli_epi16:
2651; X32:       # BB#0:
2652; X32-NEXT:    vpsllw $3, %ymm0, %ymm0
2653; X32-NEXT:    retl
2654;
2655; X64-LABEL: test_mm256_slli_epi16:
2656; X64:       # BB#0:
2657; X64-NEXT:    vpsllw $3, %ymm0, %ymm0
2658; X64-NEXT:    retq
2659  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2660  %res = call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %arg0, i32 3)
2661  %bc = bitcast <16 x i16> %res to <4 x i64>
2662  ret <4 x i64> %bc
2663}
2664declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) nounwind readnone
2665
; Reinterprets %a0 as <8 x i32>, calls @llvm.x86.avx2.pslli.d with the
; constant count i32 3, and casts the result back to <4 x i64>. Both check
; groups expect a vpslld with the immediate $3.
2666define <4 x i64> @test_mm256_slli_epi32(<4 x i64> %a0) {
2667; X32-LABEL: test_mm256_slli_epi32:
2668; X32:       # BB#0:
2669; X32-NEXT:    vpslld $3, %ymm0, %ymm0
2670; X32-NEXT:    retl
2671;
2672; X64-LABEL: test_mm256_slli_epi32:
2673; X64:       # BB#0:
2674; X64-NEXT:    vpslld $3, %ymm0, %ymm0
2675; X64-NEXT:    retq
2676  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2677  %res = call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %arg0, i32 3)
2678  %bc = bitcast <8 x i32> %res to <4 x i64>
2679  ret <4 x i64> %bc
2680}
2681declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) nounwind readnone
2682
; Calls @llvm.x86.avx2.pslli.q directly on %a0 with the constant count i32 3;
; no bitcasts are needed. Both check groups expect a vpsllq with the
; immediate $3.
2683define <4 x i64> @test_mm256_slli_epi64(<4 x i64> %a0) {
2684; X32-LABEL: test_mm256_slli_epi64:
2685; X32:       # BB#0:
2686; X32-NEXT:    vpsllq $3, %ymm0, %ymm0
2687; X32-NEXT:    retl
2688;
2689; X64-LABEL: test_mm256_slli_epi64:
2690; X64:       # BB#0:
2691; X64-NEXT:    vpsllq $3, %ymm0, %ymm0
2692; X64-NEXT:    retq
2693  %res = call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %a0, i32 3)
2694  ret <4 x i64> %res
2695}
2696declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) nounwind readnone
2697
; Reinterprets %a0 as <32 x i8> and shuffles it against zeroinitializer, with
; the zero vector as the FIRST operand. In each 16-element half the mask takes
; three elements from the zero vector (13-15 / 29-31) followed by thirteen
; elements of %arg0 (32-44 / 48-60, i.e. %arg0 bytes 0-12 / 16-28). Both check
; groups expect a vpslldq printed as three zeros then ymm0 bytes per half.
2698define <4 x i64> @test_mm256_slli_si256(<4 x i64> %a0) {
2699; X32-LABEL: test_mm256_slli_si256:
2700; X32:       # BB#0:
2701; X32-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
2702; X32-NEXT:    retl
2703;
2704; X64-LABEL: test_mm256_slli_si256:
2705; X64:       # BB#0:
2706; X64-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
2707; X64-NEXT:    retq
2708  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2709  %shuf = shufflevector <32 x i8> zeroinitializer, <32 x i8> %arg0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
2710  %res = bitcast <32 x i8> %shuf to <4 x i64>
2711  ret <4 x i64> %res
2712}
2713
; 128-bit variant: reinterprets both <2 x i64> arguments as <4 x i32>, calls
; @llvm.x86.avx2.psllv.d, and casts the result back to <2 x i64>. Both check
; groups expect a vpsllvd on xmm registers.
2714define <2 x i64> @test_mm_sllv_epi32(<2 x i64> %a0, <2 x i64> %a1) {
2715; X32-LABEL: test_mm_sllv_epi32:
2716; X32:       # BB#0:
2717; X32-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
2718; X32-NEXT:    retl
2719;
2720; X64-LABEL: test_mm_sllv_epi32:
2721; X64:       # BB#0:
2722; X64-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
2723; X64-NEXT:    retq
2724  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
2725  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
2726  %res = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %arg0, <4 x i32> %arg1)
2727  %bc = bitcast <4 x i32> %res to <2 x i64>
2728  ret <2 x i64> %bc
2729}
2730declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone
2731
; 256-bit variant: reinterprets both <4 x i64> arguments as <8 x i32>, calls
; @llvm.x86.avx2.psllv.d.256, and casts the result back to <4 x i64>. Both
; check groups expect a vpsllvd on ymm registers.
2732define <4 x i64> @test_mm256_sllv_epi32(<4 x i64> %a0, <4 x i64> %a1) {
2733; X32-LABEL: test_mm256_sllv_epi32:
2734; X32:       # BB#0:
2735; X32-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
2736; X32-NEXT:    retl
2737;
2738; X64-LABEL: test_mm256_sllv_epi32:
2739; X64:       # BB#0:
2740; X64-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
2741; X64-NEXT:    retq
2742  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2743  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2744  %res = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
2745  %bc = bitcast <8 x i32> %res to <4 x i64>
2746  ret <4 x i64> %bc
2747}
2748declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind readnone
2749
; 128-bit variant: calls @llvm.x86.avx2.psllv.q directly on the <2 x i64>
; arguments; no bitcasts are needed. Both check groups expect a vpsllvq on
; xmm registers.
2750define <2 x i64> @test_mm_sllv_epi64(<2 x i64> %a0, <2 x i64> %a1) {
2751; X32-LABEL: test_mm_sllv_epi64:
2752; X32:       # BB#0:
2753; X32-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
2754; X32-NEXT:    retl
2755;
2756; X64-LABEL: test_mm_sllv_epi64:
2757; X64:       # BB#0:
2758; X64-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
2759; X64-NEXT:    retq
2760  %res = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1)
2761  ret <2 x i64> %res
2762}
2763declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone
2764
; 256-bit variant: calls @llvm.x86.avx2.psllv.q.256 directly on the
; <4 x i64> arguments; no bitcasts are needed. Both check groups expect a
; vpsllvq on ymm registers.
2765define <4 x i64> @test_mm256_sllv_epi64(<4 x i64> %a0, <4 x i64> %a1) {
2766; X32-LABEL: test_mm256_sllv_epi64:
2767; X32:       # BB#0:
2768; X32-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0
2769; X32-NEXT:    retl
2770;
2771; X64-LABEL: test_mm256_sllv_epi64:
2772; X64:       # BB#0:
2773; X64-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0
2774; X64-NEXT:    retq
2775  %res = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1)
2776  ret <4 x i64> %res
2777}
2778declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
2779
; Reinterprets the <4 x i64> value as <16 x i16> and the <2 x i64> count as
; <8 x i16>, calls @llvm.x86.avx2.psra.w, and casts the result back to
; <4 x i64>. Both check groups expect a vpsraw taking the count from %xmm1.
2780define <4 x i64> @test_mm256_sra_epi16(<4 x i64> %a0, <2 x i64> %a1) {
2781; X32-LABEL: test_mm256_sra_epi16:
2782; X32:       # BB#0:
2783; X32-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
2784; X32-NEXT:    retl
2785;
2786; X64-LABEL: test_mm256_sra_epi16:
2787; X64:       # BB#0:
2788; X64-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
2789; X64-NEXT:    retq
2790  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2791  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
2792  %res = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %arg0, <8 x i16> %arg1)
2793  %bc = bitcast <16 x i16> %res to <4 x i64>
2794  ret <4 x i64> %bc
2795}
2796declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnone
2797
; Reinterprets the <4 x i64> value as <8 x i32> and the <2 x i64> count as
; <4 x i32>, calls @llvm.x86.avx2.psra.d, and casts the result back to
; <4 x i64>. Both check groups expect a vpsrad taking the count from %xmm1.
2798define <4 x i64> @test_mm256_sra_epi32(<4 x i64> %a0, <2 x i64> %a1) {
2799; X32-LABEL: test_mm256_sra_epi32:
2800; X32:       # BB#0:
2801; X32-NEXT:    vpsrad %xmm1, %ymm0, %ymm0
2802; X32-NEXT:    retl
2803;
2804; X64-LABEL: test_mm256_sra_epi32:
2805; X64:       # BB#0:
2806; X64-NEXT:    vpsrad %xmm1, %ymm0, %ymm0
2807; X64-NEXT:    retq
2808  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2809  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
2810  %res = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %arg0, <4 x i32> %arg1)
2811  %bc = bitcast <8 x i32> %res to <4 x i64>
2812  ret <4 x i64> %bc
2813}
2814declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone
2815
; Reinterprets %a0 as <16 x i16>, calls @llvm.x86.avx2.psrai.w with the
; constant count i32 3, and casts the result back to <4 x i64>. Both check
; groups expect a vpsraw with the immediate $3.
2816define <4 x i64> @test_mm256_srai_epi16(<4 x i64> %a0) {
2817; X32-LABEL: test_mm256_srai_epi16:
2818; X32:       # BB#0:
2819; X32-NEXT:    vpsraw $3, %ymm0, %ymm0
2820; X32-NEXT:    retl
2821;
2822; X64-LABEL: test_mm256_srai_epi16:
2823; X64:       # BB#0:
2824; X64-NEXT:    vpsraw $3, %ymm0, %ymm0
2825; X64-NEXT:    retq
2826  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2827  %res = call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %arg0, i32 3)
2828  %bc = bitcast <16 x i16> %res to <4 x i64>
2829  ret <4 x i64> %bc
2830}
2831declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32) nounwind readnone
2832
; Reinterprets %a0 as <8 x i32>, calls @llvm.x86.avx2.psrai.d with the
; constant count i32 3, and casts the result back to <4 x i64>. Both check
; groups expect a vpsrad with the immediate $3.
2833define <4 x i64> @test_mm256_srai_epi32(<4 x i64> %a0) {
2834; X32-LABEL: test_mm256_srai_epi32:
2835; X32:       # BB#0:
2836; X32-NEXT:    vpsrad $3, %ymm0, %ymm0
2837; X32-NEXT:    retl
2838;
2839; X64-LABEL: test_mm256_srai_epi32:
2840; X64:       # BB#0:
2841; X64-NEXT:    vpsrad $3, %ymm0, %ymm0
2842; X64-NEXT:    retq
2843  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2844  %res = call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %arg0, i32 3)
2845  %bc = bitcast <8 x i32> %res to <4 x i64>
2846  ret <4 x i64> %bc
2847}
2848declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32) nounwind readnone
2849
; 128-bit variant: reinterprets both <2 x i64> arguments as <4 x i32>, calls
; @llvm.x86.avx2.psrav.d, and casts the result back to <2 x i64>. Both check
; groups expect a vpsravd on xmm registers.
2850define <2 x i64> @test_mm_srav_epi32(<2 x i64> %a0, <2 x i64> %a1) {
2851; X32-LABEL: test_mm_srav_epi32:
2852; X32:       # BB#0:
2853; X32-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
2854; X32-NEXT:    retl
2855;
2856; X64-LABEL: test_mm_srav_epi32:
2857; X64:       # BB#0:
2858; X64-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
2859; X64-NEXT:    retq
2860  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
2861  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
2862  %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %arg0, <4 x i32> %arg1)
2863  %bc = bitcast <4 x i32> %res to <2 x i64>
2864  ret <2 x i64> %bc
2865}
2866declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone
2867
; 256-bit variant: reinterprets both <4 x i64> arguments as <8 x i32>, calls
; @llvm.x86.avx2.psrav.d.256, and casts the result back to <4 x i64>. Both
; check groups expect a vpsravd on ymm registers.
2868define <4 x i64> @test_mm256_srav_epi32(<4 x i64> %a0, <4 x i64> %a1) {
2869; X32-LABEL: test_mm256_srav_epi32:
2870; X32:       # BB#0:
2871; X32-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
2872; X32-NEXT:    retl
2873;
2874; X64-LABEL: test_mm256_srav_epi32:
2875; X64:       # BB#0:
2876; X64-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
2877; X64-NEXT:    retq
2878  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2879  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2880  %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
2881  %bc = bitcast <8 x i32> %res to <4 x i64>
2882  ret <4 x i64> %bc
2883}
2884declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone
2885
; Reinterprets the <4 x i64> value as <16 x i16> and the <2 x i64> count as
; <8 x i16>, calls @llvm.x86.avx2.psrl.w, and casts the result back to
; <4 x i64>. Both check groups expect a vpsrlw taking the count from %xmm1.
2886define <4 x i64> @test_mm256_srl_epi16(<4 x i64> %a0, <2 x i64> %a1) {
2887; X32-LABEL: test_mm256_srl_epi16:
2888; X32:       # BB#0:
2889; X32-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
2890; X32-NEXT:    retl
2891;
2892; X64-LABEL: test_mm256_srl_epi16:
2893; X64:       # BB#0:
2894; X64-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
2895; X64-NEXT:    retq
2896  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2897  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
2898  %res = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %arg0, <8 x i16> %arg1)
2899  %bc = bitcast <16 x i16> %res to <4 x i64>
2900  ret <4 x i64> %bc
2901}
2902declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnone
2903
; Reinterprets the <4 x i64> value as <8 x i32> and the <2 x i64> count as
; <4 x i32>, calls @llvm.x86.avx2.psrl.d, and casts the result back to
; <4 x i64>. Both check groups expect a vpsrld taking the count from %xmm1.
2904define <4 x i64> @test_mm256_srl_epi32(<4 x i64> %a0, <2 x i64> %a1) {
2905; X32-LABEL: test_mm256_srl_epi32:
2906; X32:       # BB#0:
2907; X32-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
2908; X32-NEXT:    retl
2909;
2910; X64-LABEL: test_mm256_srl_epi32:
2911; X64:       # BB#0:
2912; X64-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
2913; X64-NEXT:    retq
2914  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2915  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
2916  %res = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %arg0, <4 x i32> %arg1)
2917  %bc = bitcast <8 x i32> %res to <4 x i64>
2918  ret <4 x i64> %bc
2919}
2920declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone
2921
; Calls @llvm.x86.avx2.psrl.q directly on the <4 x i64> value and <2 x i64>
; count; no bitcasts are needed because the argument types already match.
; Both check groups expect a vpsrlq taking the count from %xmm1.
2922define <4 x i64> @test_mm256_srl_epi64(<4 x i64> %a0, <2 x i64> %a1) {
2923; X32-LABEL: test_mm256_srl_epi64:
2924; X32:       # BB#0:
2925; X32-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
2926; X32-NEXT:    retl
2927;
2928; X64-LABEL: test_mm256_srl_epi64:
2929; X64:       # BB#0:
2930; X64-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
2931; X64-NEXT:    retq
2932  %res = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1)
2933  ret <4 x i64> %res
2934}
2935declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone
2936
; Reinterprets %a0 as <16 x i16>, calls @llvm.x86.avx2.psrli.w with the
; constant count i32 3, and casts the result back to <4 x i64>. Both check
; groups expect a vpsrlw with the immediate $3.
2937define <4 x i64> @test_mm256_srli_epi16(<4 x i64> %a0) {
2938; X32-LABEL: test_mm256_srli_epi16:
2939; X32:       # BB#0:
2940; X32-NEXT:    vpsrlw $3, %ymm0, %ymm0
2941; X32-NEXT:    retl
2942;
2943; X64-LABEL: test_mm256_srli_epi16:
2944; X64:       # BB#0:
2945; X64-NEXT:    vpsrlw $3, %ymm0, %ymm0
2946; X64-NEXT:    retq
2947  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2948  %res = call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %arg0, i32 3)
2949  %bc = bitcast <16 x i16> %res to <4 x i64>
2950  ret <4 x i64> %bc
2951}
2952declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) nounwind readnone
2953
; Reinterprets %a0 as <8 x i32>, calls @llvm.x86.avx2.psrli.d with the
; constant count i32 3, and casts the result back to <4 x i64>. Both check
; groups expect a vpsrld with the immediate $3.
2954define <4 x i64> @test_mm256_srli_epi32(<4 x i64> %a0) {
2955; X32-LABEL: test_mm256_srli_epi32:
2956; X32:       # BB#0:
2957; X32-NEXT:    vpsrld $3, %ymm0, %ymm0
2958; X32-NEXT:    retl
2959;
2960; X64-LABEL: test_mm256_srli_epi32:
2961; X64:       # BB#0:
2962; X64-NEXT:    vpsrld $3, %ymm0, %ymm0
2963; X64-NEXT:    retq
2964  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2965  %res = call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %arg0, i32 3)
2966  %bc = bitcast <8 x i32> %res to <4 x i64>
2967  ret <4 x i64> %bc
2968}
2969declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) nounwind readnone
2970
; Calls @llvm.x86.avx2.psrli.q directly on %a0 with the constant count i32 3;
; no bitcasts are needed. Both check groups expect a vpsrlq with the
; immediate $3.
2971define <4 x i64> @test_mm256_srli_epi64(<4 x i64> %a0) {
2972; X32-LABEL: test_mm256_srli_epi64:
2973; X32:       # BB#0:
2974; X32-NEXT:    vpsrlq $3, %ymm0, %ymm0
2975; X32-NEXT:    retl
2976;
2977; X64-LABEL: test_mm256_srli_epi64:
2978; X64:       # BB#0:
2979; X64-NEXT:    vpsrlq $3, %ymm0, %ymm0
2980; X64-NEXT:    retq
2981  %res = call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %a0, i32 3)
2982  ret <4 x i64> %res
2983}
2984declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) nounwind readnone
2985
; Reinterprets %a0 as <32 x i8> and shuffles it against zeroinitializer, with
; the zero vector as the SECOND operand. In each 16-element half the mask takes
; thirteen elements of %arg0 (3-15 / 19-31) followed by three elements from
; the zero vector (32-34 / 48-50). Both check groups expect a vpsrldq printed
; as ymm0 bytes then three zeros per half.
2986define <4 x i64> @test_mm256_srli_si256(<4 x i64> %a0) {
2987; X32-LABEL: test_mm256_srli_si256:
2988; X32:       # BB#0:
2989; X32-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
2990; X32-NEXT:    retl
2991;
2992; X64-LABEL: test_mm256_srli_si256:
2993; X64:       # BB#0:
2994; X64-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
2995; X64-NEXT:    retq
2996  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2997  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
2998  %res = bitcast <32 x i8> %shuf to <4 x i64>
2999  ret <4 x i64> %res
3000}
3001
; 128-bit variant: reinterprets both <2 x i64> arguments as <4 x i32>, calls
; @llvm.x86.avx2.psrlv.d, and casts the result back to <2 x i64>. Both check
; groups expect a vpsrlvd on xmm registers.
3002define <2 x i64> @test_mm_srlv_epi32(<2 x i64> %a0, <2 x i64> %a1) {
3003; X32-LABEL: test_mm_srlv_epi32:
3004; X32:       # BB#0:
3005; X32-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
3006; X32-NEXT:    retl
3007;
3008; X64-LABEL: test_mm_srlv_epi32:
3009; X64:       # BB#0:
3010; X64-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
3011; X64-NEXT:    retq
3012  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
3013  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
3014  %res = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %arg0, <4 x i32> %arg1)
3015  %bc = bitcast <4 x i32> %res to <2 x i64>
3016  ret <2 x i64> %bc
3017}
3018declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone
3019
; 256-bit variant: reinterprets both <4 x i64> arguments as <8 x i32>, calls
; @llvm.x86.avx2.psrlv.d.256, and casts the result back to <4 x i64>. Both
; check groups expect a vpsrlvd on ymm registers.
3020define <4 x i64> @test_mm256_srlv_epi32(<4 x i64> %a0, <4 x i64> %a1) {
3021; X32-LABEL: test_mm256_srlv_epi32:
3022; X32:       # BB#0:
3023; X32-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
3024; X32-NEXT:    retl
3025;
3026; X64-LABEL: test_mm256_srlv_epi32:
3027; X64:       # BB#0:
3028; X64-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
3029; X64-NEXT:    retq
3030  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
3031  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
3032  %res = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
3033  %bc = bitcast <8 x i32> %res to <4 x i64>
3034  ret <4 x i64> %bc
3035}
3036declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind readnone
3037
; 128-bit variant: calls @llvm.x86.avx2.psrlv.q directly on the <2 x i64>
; arguments; no bitcasts are needed. Both check groups expect a vpsrlvq on
; xmm registers.
3038define <2 x i64> @test_mm_srlv_epi64(<2 x i64> %a0, <2 x i64> %a1) {
3039; X32-LABEL: test_mm_srlv_epi64:
3040; X32:       # BB#0:
3041; X32-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
3042; X32-NEXT:    retl
3043;
3044; X64-LABEL: test_mm_srlv_epi64:
3045; X64:       # BB#0:
3046; X64-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
3047; X64-NEXT:    retq
3048  %res = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1)
3049  ret <2 x i64> %res
3050}
3051declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone
3052
; 256-bit variant: calls @llvm.x86.avx2.psrlv.q.256 directly on the
; <4 x i64> arguments; no bitcasts are needed. Both check groups expect a
; vpsrlvq on ymm registers.
3053define <4 x i64> @test_mm256_srlv_epi64(<4 x i64> %a0, <4 x i64> %a1) {
3054; X32-LABEL: test_mm256_srlv_epi64:
3055; X32:       # BB#0:
3056; X32-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
3057; X32-NEXT:    retl
3058;
3059; X64-LABEL: test_mm256_srlv_epi64:
3060; X64:       # BB#0:
3061; X64-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
3062; X64-NEXT:    retq
3063  %res = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1)
3064  ret <4 x i64> %res
3065}
3066declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
3067
; Casts the <4 x i64>* argument to i8* and calls @llvm.x86.avx2.movntdqa on
; it. The X32 checks first load the pointer argument from the stack into %eax
; and then expect vmovntdqa through it; the X64 checks expect vmovntdqa
; directly through %rdi. Unlike most intrinsics in this file, this declare is
; readonly rather than readnone.
3068define <4 x i64> @test_mm256_stream_load_si256(<4 x i64> *%a0) {
3069; X32-LABEL: test_mm256_stream_load_si256:
3070; X32:       # BB#0:
3071; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
3072; X32-NEXT:    vmovntdqa (%eax), %ymm0
3073; X32-NEXT:    retl
3074;
3075; X64-LABEL: test_mm256_stream_load_si256:
3076; X64:       # BB#0:
3077; X64-NEXT:    vmovntdqa (%rdi), %ymm0
3078; X64-NEXT:    retq
3079  %arg0 = bitcast <4 x i64> *%a0 to i8*
3080  %res = call <4 x i64> @llvm.x86.avx2.movntdqa(i8* %arg0)
3081  ret <4 x i64> %res
3082}
3083declare <4 x i64> @llvm.x86.avx2.movntdqa(i8*) nounwind readonly
3084
; Reinterprets both <4 x i64> arguments as <32 x i8>, subtracts them with a
; plain IR `sub` (no intrinsic call), and casts the result back to <4 x i64>.
; Both check groups expect a single vpsubb before the return.
3085define <4 x i64> @test_mm256_sub_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
3086; X32-LABEL: test_mm256_sub_epi8:
3087; X32:       # BB#0:
3088; X32-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
3089; X32-NEXT:    retl
3090;
3091; X64-LABEL: test_mm256_sub_epi8:
3092; X64:       # BB#0:
3093; X64-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
3094; X64-NEXT:    retq
3095  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
3096  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
3097  %res = sub <32 x i8> %arg0, %arg1
3098  %bc = bitcast <32 x i8> %res to <4 x i64>
3099  ret <4 x i64> %bc
3100}
3101
; Reinterprets both <4 x i64> arguments as <16 x i16>, subtracts them with a
; plain IR `sub` (no intrinsic call), and casts the result back to <4 x i64>.
; Both check groups expect a single vpsubw before the return.
3102define <4 x i64> @test_mm256_sub_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
3103; X32-LABEL: test_mm256_sub_epi16:
3104; X32:       # BB#0:
3105; X32-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
3106; X32-NEXT:    retl
3107;
3108; X64-LABEL: test_mm256_sub_epi16:
3109; X64:       # BB#0:
3110; X64-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
3111; X64-NEXT:    retq
3112  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
3113  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
3114  %res = sub <16 x i16> %arg0, %arg1
3115  %bc = bitcast <16 x i16> %res to <4 x i64>
3116  ret <4 x i64> %bc
3117}
3118
; Reinterprets both <4 x i64> arguments as <8 x i32>, subtracts them with a
; plain IR `sub` (no intrinsic call), and casts the result back to <4 x i64>.
; Both check groups expect a single vpsubd before the return.
3119define <4 x i64> @test_mm256_sub_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
3120; X32-LABEL: test_mm256_sub_epi32:
3121; X32:       # BB#0:
3122; X32-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
3123; X32-NEXT:    retl
3124;
3125; X64-LABEL: test_mm256_sub_epi32:
3126; X64:       # BB#0:
3127; X64-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
3128; X64-NEXT:    retq
3129  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
3130  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
3131  %res = sub <8 x i32> %arg0, %arg1
3132  %bc = bitcast <8 x i32> %res to <4 x i64>
3133  ret <4 x i64> %bc
3134}
3135
; Subtracts the two <4 x i64> arguments directly with a plain IR `sub`; no
; bitcasts or intrinsic calls are involved. Both check groups expect a single
; vpsubq before the return.
3136define <4 x i64> @test_mm256_sub_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
3137; X32-LABEL: test_mm256_sub_epi64:
3138; X32:       # BB#0:
3139; X32-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
3140; X32-NEXT:    retl
3141;
3142; X64-LABEL: test_mm256_sub_epi64:
3143; X64:       # BB#0:
3144; X64-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
3145; X64-NEXT:    retq
3146  %res = sub <4 x i64> %a0, %a1
3147  ret <4 x i64> %res
3148}
3149
; Reinterprets both <4 x i64> arguments as <32 x i8>, calls
; @llvm.x86.avx2.psubs.b, and casts the result back to <4 x i64>.
; Both check groups expect a single vpsubsb before the return.
3150define <4 x i64> @test_mm256_subs_epi8(<4 x i64> %a0, <4 x i64> %a1) {
3151; X32-LABEL: test_mm256_subs_epi8:
3152; X32:       # BB#0:
3153; X32-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0
3154; X32-NEXT:    retl
3155;
3156; X64-LABEL: test_mm256_subs_epi8:
3157; X64:       # BB#0:
3158; X64-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0
3159; X64-NEXT:    retq
3160  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
3161  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
3162  %res = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %arg0, <32 x i8> %arg1)
3163  %bc = bitcast <32 x i8> %res to <4 x i64>
3164  ret <4 x i64> %bc
3165}
3166declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) nounwind readnone
3167
; Reinterprets both <4 x i64> arguments as <16 x i16>, calls
; @llvm.x86.avx2.psubs.w, and casts the result back to <4 x i64>.
; Both check groups expect a single vpsubsw before the return.
3168define <4 x i64> @test_mm256_subs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
3169; X32-LABEL: test_mm256_subs_epi16:
3170; X32:       # BB#0:
3171; X32-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0
3172; X32-NEXT:    retl
3173;
3174; X64-LABEL: test_mm256_subs_epi16:
3175; X64:       # BB#0:
3176; X64-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0
3177; X64-NEXT:    retq
3178  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
3179  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
3180  %res = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %arg0, <16 x i16> %arg1)
3181  %bc = bitcast <16 x i16> %res to <4 x i64>
3182  ret <4 x i64> %bc
3183}
3184declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) nounwind readnone
3185
; Reinterprets both <4 x i64> arguments as <32 x i8>, calls
; @llvm.x86.avx2.psubus.b, and casts the result back to <4 x i64>.
; Both check groups expect a single vpsubusb before the return.
3186define <4 x i64> @test_mm256_subs_epu8(<4 x i64> %a0, <4 x i64> %a1) {
3187; X32-LABEL: test_mm256_subs_epu8:
3188; X32:       # BB#0:
3189; X32-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0
3190; X32-NEXT:    retl
3191;
3192; X64-LABEL: test_mm256_subs_epu8:
3193; X64:       # BB#0:
3194; X64-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0
3195; X64-NEXT:    retq
3196  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
3197  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
3198  %res = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %arg0, <32 x i8> %arg1)
3199  %bc = bitcast <32 x i8> %res to <4 x i64>
3200  ret <4 x i64> %bc
3201}
3202declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind readnone
3203
; Reinterprets both <4 x i64> arguments as <16 x i16>, calls
; @llvm.x86.avx2.psubus.w, and casts the result back to <4 x i64>.
; Both check groups expect a single vpsubusw before the return.
3204define <4 x i64> @test_mm256_subs_epu16(<4 x i64> %a0, <4 x i64> %a1) {
3205; X32-LABEL: test_mm256_subs_epu16:
3206; X32:       # BB#0:
3207; X32-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0
3208; X32-NEXT:    retl
3209;
3210; X64-LABEL: test_mm256_subs_epu16:
3211; X64:       # BB#0:
3212; X64-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0
3213; X64-NEXT:    retq
3214  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
3215  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
3216  %res = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %arg0, <16 x i16> %arg1)
3217  %bc = bitcast <16 x i16> %res to <4 x i64>
3218  ret <4 x i64> %bc
3219}
3220declare <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16>, <16 x i16>) nounwind readnone
3221
; test_mm256_unpackhi_epi8: expressed as a generic shufflevector on <32 x i8>
; rather than an intrinsic call. Mask indices 0-31 select from %arg0 and 32-63
; from %arg1, so the mask pairs element i of %arg0 with element i of %arg1 for
; i = 8..15 and then i = 24..31 (the upper eight bytes of each 16-byte half).
; Both targets are expected to emit a single vpunpckhbw; `{{.*#+}}` is a
; FileCheck regex that skips everything up to the '#' so only the shuffle
; description that follows is compared.
3222define <4 x i64> @test_mm256_unpackhi_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
3223; X32-LABEL: test_mm256_unpackhi_epi8:
3224; X32:       # BB#0:
3225; X32-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
3226; X32-NEXT:    retl
3227;
3228; X64-LABEL: test_mm256_unpackhi_epi8:
3229; X64:       # BB#0:
3230; X64-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
3231; X64-NEXT:    retq
3232  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
3233  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
3234  %res = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
3235  %bc = bitcast <32 x i8> %res to <4 x i64>
3236  ret <4 x i64> %bc
3237}
3238
; test_mm256_unpackhi_epi16: generic shufflevector on <16 x i16>. Mask indices
; 0-15 select from %arg0 and 16-31 from %arg1, so the mask pairs element i of
; %arg0 with element i of %arg1 for i = 4..7 and then i = 12..15 (the upper
; four words of each 8-word half). Both targets are expected to emit a single
; vpunpckhwd whose decoded shuffle comment matches that interleaving.
3239define <4 x i64> @test_mm256_unpackhi_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
3240; X32-LABEL: test_mm256_unpackhi_epi16:
3241; X32:       # BB#0:
3242; X32-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
3243; X32-NEXT:    retl
3244;
3245; X64-LABEL: test_mm256_unpackhi_epi16:
3246; X64:       # BB#0:
3247; X64-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
3248; X64-NEXT:    retq
3249  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
3250  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
3251  %res = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
3252  %bc = bitcast <16 x i16> %res to <4 x i64>
3253  ret <4 x i64> %bc
3254}
3255
; test_mm256_unpackhi_epi32: generic shufflevector on <8 x i32>. Mask indices
; 0-7 select from %arg0 and 8-15 from %arg1, so the mask pairs element i of
; %arg0 with element i of %arg1 for i = 2, 3 and then i = 6, 7 (the upper two
; dwords of each 4-dword half). Both targets are expected to emit a single
; vpunpckhdq whose decoded shuffle comment matches that interleaving.
3256define <4 x i64> @test_mm256_unpackhi_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
3257; X32-LABEL: test_mm256_unpackhi_epi32:
3258; X32:       # BB#0:
3259; X32-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
3260; X32-NEXT:    retl
3261;
3262; X64-LABEL: test_mm256_unpackhi_epi32:
3263; X64:       # BB#0:
3264; X64-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
3265; X64-NEXT:    retq
3266  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
3267  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
3268  %res = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
3269  %bc = bitcast <8 x i32> %res to <4 x i64>
3270  ret <4 x i64> %bc
3271}
3272
; test_mm256_unpackhi_epi64: shufflevector applied directly to the <4 x i64>
; arguments (no bitcasts needed). Mask <1, 5, 3, 7> selects %a0[1], %a1[1],
; %a0[3], %a1[3], since indices 4-7 address the second operand. Both targets
; are expected to emit a single vpunpckhqdq with the matching shuffle comment.
3273define <4 x i64> @test_mm256_unpackhi_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
3274; X32-LABEL: test_mm256_unpackhi_epi64:
3275; X32:       # BB#0:
3276; X32-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
3277; X32-NEXT:    retl
3278;
3279; X64-LABEL: test_mm256_unpackhi_epi64:
3280; X64:       # BB#0:
3281; X64-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
3282; X64-NEXT:    retq
3283  %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
3284  ret <4 x i64> %res
3285}
3286
; test_mm256_unpacklo_epi8: generic shufflevector on <32 x i8>. Mask indices
; 0-31 select from %arg0 and 32-63 from %arg1, so the mask pairs element i of
; %arg0 with element i of %arg1 for i = 0..7 and then i = 16..23 (the lower
; eight bytes of each 16-byte half). Both targets are expected to emit a single
; vpunpcklbw; `{{.*#+}}` is a FileCheck regex that skips everything up to the
; '#' so only the shuffle description that follows is compared.
3287define <4 x i64> @test_mm256_unpacklo_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
3288; X32-LABEL: test_mm256_unpacklo_epi8:
3289; X32:       # BB#0:
3290; X32-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
3291; X32-NEXT:    retl
3292;
3293; X64-LABEL: test_mm256_unpacklo_epi8:
3294; X64:       # BB#0:
3295; X64-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
3296; X64-NEXT:    retq
3297  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
3298  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
3299  %res = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
3300  %bc = bitcast <32 x i8> %res to <4 x i64>
3301  ret <4 x i64> %bc
3302}
3303
; test_mm256_unpacklo_epi16: generic shufflevector on <16 x i16>. Mask indices
; 0-15 select from %arg0 and 16-31 from %arg1, so the mask pairs element i of
; %arg0 with element i of %arg1 for i = 0..3 and then i = 8..11 (the lower
; four words of each 8-word half). Both targets are expected to emit a single
; vpunpcklwd whose decoded shuffle comment matches that interleaving.
3304define <4 x i64> @test_mm256_unpacklo_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
3305; X32-LABEL: test_mm256_unpacklo_epi16:
3306; X32:       # BB#0:
3307; X32-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
3308; X32-NEXT:    retl
3309;
3310; X64-LABEL: test_mm256_unpacklo_epi16:
3311; X64:       # BB#0:
3312; X64-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
3313; X64-NEXT:    retq
3314  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
3315  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
3316  %res = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
3317  %bc = bitcast <16 x i16> %res to <4 x i64>
3318  ret <4 x i64> %bc
3319}
3320
; test_mm256_unpacklo_epi32: generic shufflevector on <8 x i32>. Mask indices
; 0-7 select from %arg0 and 8-15 from %arg1, so the mask pairs element i of
; %arg0 with element i of %arg1 for i = 0, 1 and then i = 4, 5 (the lower two
; dwords of each 4-dword half). Both targets are expected to emit a single
; vpunpckldq whose decoded shuffle comment matches that interleaving.
3321define <4 x i64> @test_mm256_unpacklo_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
3322; X32-LABEL: test_mm256_unpacklo_epi32:
3323; X32:       # BB#0:
3324; X32-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
3325; X32-NEXT:    retl
3326;
3327; X64-LABEL: test_mm256_unpacklo_epi32:
3328; X64:       # BB#0:
3329; X64-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
3330; X64-NEXT:    retq
3331  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
3332  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
3333  %res = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
3334  %bc = bitcast <8 x i32> %res to <4 x i64>
3335  ret <4 x i64> %bc
3336}
3337
; test_mm256_unpacklo_epi64: shufflevector applied directly to the <4 x i64>
; arguments (no bitcasts needed). Mask <0, 4, 2, 6> selects %a0[0], %a1[0],
; %a0[2], %a1[2], since indices 4-7 address the second operand. Both targets
; are expected to emit a single vpunpcklqdq with the matching shuffle comment.
3338define <4 x i64> @test_mm256_unpacklo_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
3339; X32-LABEL: test_mm256_unpacklo_epi64:
3340; X32:       # BB#0:
3341; X32-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
3342; X32-NEXT:    retl
3343;
3344; X64-LABEL: test_mm256_unpacklo_epi64:
3345; X64:       # BB#0:
3346; X64-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
3347; X64-NEXT:    retq
3348  %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
3349  ret <4 x i64> %res
3350}
3351
; test_mm256_xor_si256: a plain IR `xor` of the two <4 x i64> arguments, with
; no intrinsic call or bitcast. Note that the generated checks on both targets
; expect `vxorps %ymm1, %ymm0, %ymm0` for this integer-typed xor, immediately
; followed by the return.
3352define <4 x i64> @test_mm256_xor_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
3353; X32-LABEL: test_mm256_xor_si256:
3354; X32:       # BB#0:
3355; X32-NEXT:    vxorps %ymm1, %ymm0, %ymm0
3356; X32-NEXT:    retl
3357;
3358; X64-LABEL: test_mm256_xor_si256:
3359; X64:       # BB#0:
3360; X64-NEXT:    vxorps %ymm1, %ymm0, %ymm0
3361; X64-NEXT:    retq
3362  %res = xor <4 x i64> %a0, %a1
3363  ret <4 x i64> %res
3364}
3365
; Declarations of the 256-bit AVX compare intrinsics (float and double forms),
; each taking two vectors plus an i8 operand.
; NOTE(review): no call to either intrinsic is visible in this part of the
; file; presumably they are referenced by tests earlier on - confirm before
; removing.
3366declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
3367
3368declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
3369