1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck %s
3
4declare <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float>, i8*, <16 x i32>, i16, i32)
5declare void @llvm.x86.avx512.scatter.dps.512 (i8*, i16, <16 x i32>, <16 x float>, i32)
6declare <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double>, i8*, <8 x i32>, i8, i32)
7declare void @llvm.x86.avx512.scatter.dpd.512 (i8*, i8, <8 x i32>, <8 x double>, i32)
8
9declare <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float>, i8*, <8 x i64>, i8, i32)
10declare void @llvm.x86.avx512.scatter.qps.512 (i8*, i8, <8 x i64>, <8 x float>, i32)
11declare <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double>, i8*, <8 x i64>, i8, i32)
12declare void @llvm.x86.avx512.scatter.qpd.512 (i8*, i8, <8 x i64>, <8 x double>, i32)
13
; Masked gather of <16 x float> through <16 x i32> indices (scale 4) from %base,
; followed by a masked scatter of the gathered value to %stbuf using the same
; %mask and the indices offset by a repeating <0,1,2,3> constant vector.
; The expected asm copies the mask (%k1 -> %k2) so that the gather and the
; scatter each use their own mask register.
14define void @gather_mask_dps(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base, i8* %stbuf)  {
15; CHECK-LABEL: gather_mask_dps:
16; CHECK:       ## %bb.0:
17; CHECK-NEXT:    kmovd %edi, %k1
18; CHECK-NEXT:    kmovq %k1, %k2
19; CHECK-NEXT:    vgatherdps (%rsi,%zmm0,4), %zmm1 {%k2}
20; CHECK-NEXT:    vpaddd {{.*}}(%rip), %zmm0, %zmm0
21; CHECK-NEXT:    vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1}
22; CHECK-NEXT:    vzeroupper
23; CHECK-NEXT:    retq
24  %x = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4)
25  %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
26  call void @llvm.x86.avx512.scatter.dps.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x float> %x, i32 4)
27  ret void
28}
29
; Masked gather of <8 x double> through <8 x i32> indices (scale 4) from %base,
; followed by a masked scatter to %stbuf with the same %mask and the indices
; offset by a repeating <0,1,2,3> constant vector.
; The expected asm uses a ymm index register with a zmm data register.
30define void @gather_mask_dpd(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf)  {
31; CHECK-LABEL: gather_mask_dpd:
32; CHECK:       ## %bb.0:
33; CHECK-NEXT:    kmovd %edi, %k1
34; CHECK-NEXT:    kmovq %k1, %k2
35; CHECK-NEXT:    vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k2}
36; CHECK-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
37; CHECK-NEXT:    vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1}
38; CHECK-NEXT:    vzeroupper
39; CHECK-NEXT:    retq
40  %x = call <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
41  %ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
42  call void @llvm.x86.avx512.scatter.dpd.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x double> %x, i32 4)
43  ret void
44}
45
; Masked gather of <8 x float> through <8 x i64> indices (scale 4) from %base,
; followed by a masked scatter to %stbuf with the same %mask and the indices
; offset by a repeating <0,1,2,3> constant vector.
; The expected asm uses a zmm index register with a ymm data register.
46define void @gather_mask_qps(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base, i8* %stbuf)  {
47; CHECK-LABEL: gather_mask_qps:
48; CHECK:       ## %bb.0:
49; CHECK-NEXT:    kmovd %edi, %k1
50; CHECK-NEXT:    kmovq %k1, %k2
51; CHECK-NEXT:    vgatherqps (%rsi,%zmm0,4), %ymm1 {%k2}
52; CHECK-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
53; CHECK-NEXT:    vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1}
54; CHECK-NEXT:    vzeroupper
55; CHECK-NEXT:    retq
56  %x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
57  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
58  call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x float> %x, i32 4)
59  ret void
60}
61
; Masked gather of <8 x double> through <8 x i64> indices (scale 4) from %base,
; followed by a masked scatter to %stbuf with the same %mask and the indices
; offset by a repeating <0,1,2,3> constant vector.
62define void @gather_mask_qpd(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf)  {
63; CHECK-LABEL: gather_mask_qpd:
64; CHECK:       ## %bb.0:
65; CHECK-NEXT:    kmovd %edi, %k1
66; CHECK-NEXT:    kmovq %k1, %k2
67; CHECK-NEXT:    vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k2}
68; CHECK-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
69; CHECK-NEXT:    vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1}
70; CHECK-NEXT:    vzeroupper
71; CHECK-NEXT:    retq
72  %x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
73  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
74  call void @llvm.x86.avx512.scatter.qpd.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x double> %x, i32 4)
75  ret void
76}
77;;
78;; Integer Gather/Scatter
79;;
80declare <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32>, i8*, <16 x i32>, i16, i32)
81declare void @llvm.x86.avx512.scatter.dpi.512 (i8*, i16, <16 x i32>, <16 x i32>, i32)
82declare <8 x i64> @llvm.x86.avx512.gather.dpq.512 (<8 x i64>, i8*, <8 x i32>, i8, i32)
83declare void @llvm.x86.avx512.scatter.dpq.512 (i8*, i8, <8 x i32>, <8 x i64>, i32)
84
85declare <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i32>, i8*, <8 x i64>, i8, i32)
86declare void @llvm.x86.avx512.scatter.qpi.512 (i8*, i8, <8 x i64>, <8 x i32>, i32)
87declare <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64>, i8*, <8 x i64>, i8, i32)
88declare void @llvm.x86.avx512.scatter.qpq.512 (i8*, i8, <8 x i64>, <8 x i64>, i32)
89
; Integer variant: masked gather of <16 x i32> through <16 x i32> indices
; (scale 4) from %base, then a masked scatter to %stbuf with the same %mask
; and the indices offset by a repeating <0,1,2,3> constant vector.
; The expected asm uses vpgatherdd / vpscatterdd.
90define void @gather_mask_dd(<16 x i32> %ind, <16 x i32> %src, i16 %mask, i8* %base, i8* %stbuf)  {
91; CHECK-LABEL: gather_mask_dd:
92; CHECK:       ## %bb.0:
93; CHECK-NEXT:    kmovd %edi, %k1
94; CHECK-NEXT:    kmovq %k1, %k2
95; CHECK-NEXT:    vpgatherdd (%rsi,%zmm0,4), %zmm1 {%k2}
96; CHECK-NEXT:    vpaddd {{.*}}(%rip), %zmm0, %zmm0
97; CHECK-NEXT:    vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1}
98; CHECK-NEXT:    vzeroupper
99; CHECK-NEXT:    retq
100  %x = call <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4)
101  %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
102  call void @llvm.x86.avx512.scatter.dpi.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x i32> %x, i32 4)
103  ret void
104}
105
; Integer variant: masked gather of <8 x i32> through <8 x i64> indices
; (scale 4) from %base, then a masked scatter to %stbuf with the same %mask
; and the indices offset by a repeating <0,1,2,3> constant vector.
; The expected asm uses vpgatherqd / vpscatterqd with a ymm data register.
106define void @gather_mask_qd(<8 x i64> %ind, <8 x i32> %src, i8 %mask, i8* %base, i8* %stbuf)  {
107; CHECK-LABEL: gather_mask_qd:
108; CHECK:       ## %bb.0:
109; CHECK-NEXT:    kmovd %edi, %k1
110; CHECK-NEXT:    kmovq %k1, %k2
111; CHECK-NEXT:    vpgatherqd (%rsi,%zmm0,4), %ymm1 {%k2}
112; CHECK-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
113; CHECK-NEXT:    vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1}
114; CHECK-NEXT:    vzeroupper
115; CHECK-NEXT:    retq
116  %x = call <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i32> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
117  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
118  call void @llvm.x86.avx512.scatter.qpi.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i32> %x, i32 4)
119  ret void
120}
121
; Integer variant: masked gather of <8 x i64> through <8 x i64> indices
; (scale 4) from %base, then a masked scatter to %stbuf with the same %mask
; and the indices offset by a repeating <0,1,2,3> constant vector.
; The expected asm uses vpgatherqq / vpscatterqq.
122define void @gather_mask_qq(<8 x i64> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf)  {
123; CHECK-LABEL: gather_mask_qq:
124; CHECK:       ## %bb.0:
125; CHECK-NEXT:    kmovd %edi, %k1
126; CHECK-NEXT:    kmovq %k1, %k2
127; CHECK-NEXT:    vpgatherqq (%rsi,%zmm0,4), %zmm1 {%k2}
128; CHECK-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
129; CHECK-NEXT:    vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1}
130; CHECK-NEXT:    vzeroupper
131; CHECK-NEXT:    retq
132  %x = call <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
133  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
134  call void @llvm.x86.avx512.scatter.qpq.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i64> %x, i32 4)
135  ret void
136}
137
; Integer variant: masked gather of <8 x i64> through <8 x i32> indices
; (scale 4) from %base, then a masked scatter to %stbuf with the same %mask
; and the indices offset by a repeating <0,1,2,3> constant vector.
; The expected asm uses vpgatherdq / vpscatterdq with a ymm index register.
138define void @gather_mask_dq(<8 x i32> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf)  {
139; CHECK-LABEL: gather_mask_dq:
140; CHECK:       ## %bb.0:
141; CHECK-NEXT:    kmovd %edi, %k1
142; CHECK-NEXT:    kmovq %k1, %k2
143; CHECK-NEXT:    vpgatherdq (%rsi,%ymm0,4), %zmm1 {%k2}
144; CHECK-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
145; CHECK-NEXT:    vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1}
146; CHECK-NEXT:    vzeroupper
147; CHECK-NEXT:    retq
148  %x = call <8 x i64> @llvm.x86.avx512.gather.dpq.512 (<8 x i64> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
149  %ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
150  call void @llvm.x86.avx512.scatter.dpq.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x i64> %x, i32 4)
151  ret void
152}
153
; Masked gather of <8 x double> (i32 indices, scale 4) whose result is written
; to %stbuf with an ordinary vector store. The expected asm performs that store
; with vmovapd.
; NOTE(review): the "execdomain" name suggests this pins the store to the
; packed-double execution domain of the gather -- confirm against the backend.
154define void @gather_mask_dpd_execdomain(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf)  {
155; CHECK-LABEL: gather_mask_dpd_execdomain:
156; CHECK:       ## %bb.0:
157; CHECK-NEXT:    kmovd %edi, %k1
158; CHECK-NEXT:    vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k1}
159; CHECK-NEXT:    vmovapd %zmm1, (%rdx)
160; CHECK-NEXT:    vzeroupper
161; CHECK-NEXT:    retq
162  %x = call <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
163  store <8 x double> %x, <8 x double>* %stbuf
164  ret void
165}
166
; Masked gather of <8 x double> (i64 indices, scale 4) whose result is written
; to %stbuf with an ordinary vector store. The expected asm performs that store
; with vmovapd.
; NOTE(review): the "execdomain" name suggests this pins the store to the
; packed-double execution domain of the gather -- confirm against the backend.
167define void @gather_mask_qpd_execdomain(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf)  {
168; CHECK-LABEL: gather_mask_qpd_execdomain:
169; CHECK:       ## %bb.0:
170; CHECK-NEXT:    kmovd %edi, %k1
171; CHECK-NEXT:    vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k1}
172; CHECK-NEXT:    vmovapd %zmm1, (%rdx)
173; CHECK-NEXT:    vzeroupper
174; CHECK-NEXT:    retq
175  %x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
176  store <8 x double> %x, <8 x double>* %stbuf
177  ret void
178}
179
; Masked gather of <16 x float> (i32 indices, scale 4) returned directly.
; The expected asm moves the gathered value into the return register with
; vmovaps. The ';' after %res on the ret line starts an (empty) IR comment.
180define <16 x float> @gather_mask_dps_execdomain(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base)  {
181; CHECK-LABEL: gather_mask_dps_execdomain:
182; CHECK:       ## %bb.0:
183; CHECK-NEXT:    kmovd %edi, %k1
184; CHECK-NEXT:    vgatherdps (%rsi,%zmm0,4), %zmm1 {%k1}
185; CHECK-NEXT:    vmovaps %zmm1, %zmm0
186; CHECK-NEXT:    retq
187  %res = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4)
188  ret <16 x float> %res;
189}
190
; Masked gather of <8 x float> (i64 indices, scale 4) returned directly.
; The expected asm moves the gathered value into the return register with
; vmovaps. The ';' after %res on the ret line starts an (empty) IR comment.
191define <8 x float> @gather_mask_qps_execdomain(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base)  {
192; CHECK-LABEL: gather_mask_qps_execdomain:
193; CHECK:       ## %bb.0:
194; CHECK-NEXT:    kmovd %edi, %k1
195; CHECK-NEXT:    vgatherqps (%rsi,%zmm0,4), %ymm1 {%k1}
196; CHECK-NEXT:    vmovaps %ymm1, %ymm0
197; CHECK-NEXT:    retq
198  %res = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
199  ret <8 x float> %res;
200}
201
; Loads <8 x double> from %src (align 64) and scatters it to %stbuf through
; i32 indices (scale 4) under %mask. The expected asm performs the load with
; vmovapd. %base is not referenced by the body.
202define void @scatter_mask_dpd_execdomain(<8 x i32> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf)  {
203; CHECK-LABEL: scatter_mask_dpd_execdomain:
204; CHECK:       ## %bb.0:
205; CHECK-NEXT:    kmovd %esi, %k1
206; CHECK-NEXT:    vmovapd (%rdi), %zmm1
207; CHECK-NEXT:    vscatterdpd %zmm1, (%rcx,%ymm0,4) {%k1}
208; CHECK-NEXT:    vzeroupper
209; CHECK-NEXT:    retq
210  %x = load <8 x double>, <8 x double>* %src, align 64
211  call void @llvm.x86.avx512.scatter.dpd.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind, <8 x double> %x, i32 4)
212  ret void
213}
214
; Loads <8 x double> from %src (align 64) and scatters it to %stbuf through
; i64 indices (scale 4) under %mask. The expected asm performs the load with
; vmovapd. %base is not referenced by the body.
215define void @scatter_mask_qpd_execdomain(<8 x i64> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf)  {
216; CHECK-LABEL: scatter_mask_qpd_execdomain:
217; CHECK:       ## %bb.0:
218; CHECK-NEXT:    kmovd %esi, %k1
219; CHECK-NEXT:    vmovapd (%rdi), %zmm1
220; CHECK-NEXT:    vscatterqpd %zmm1, (%rcx,%zmm0,4) {%k1}
221; CHECK-NEXT:    vzeroupper
222; CHECK-NEXT:    retq
223  %x = load <8 x double>, <8 x double>* %src, align 64
224  call void @llvm.x86.avx512.scatter.qpd.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x double> %x, i32 4)
225  ret void
226}
227
; Loads <16 x float> from %src (align 64) and scatters it to %stbuf through
; i32 indices (scale 4) under %mask. The expected asm performs the load with
; vmovaps. %base is not referenced by the body.
228define void @scatter_mask_dps_execdomain(<16 x i32> %ind, <16 x float>* %src, i16 %mask, i8* %base, i8* %stbuf)  {
229; CHECK-LABEL: scatter_mask_dps_execdomain:
230; CHECK:       ## %bb.0:
231; CHECK-NEXT:    kmovd %esi, %k1
232; CHECK-NEXT:    vmovaps (%rdi), %zmm1
233; CHECK-NEXT:    vscatterdps %zmm1, (%rcx,%zmm0,4) {%k1}
234; CHECK-NEXT:    vzeroupper
235; CHECK-NEXT:    retq
236  %x = load <16 x float>, <16 x float>* %src, align 64
237  call void @llvm.x86.avx512.scatter.dps.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind, <16 x float> %x, i32 4)
238  ret void
239}
240
; Loads <8 x float> from %src (align 32) and scatters it to %stbuf through
; i64 indices (scale 4) under %mask. The expected asm performs the load with
; vmovaps into a ymm register. %base is not referenced by the body.
241define void @scatter_mask_qps_execdomain(<8 x i64> %ind, <8 x float>* %src, i8 %mask, i8* %base, i8* %stbuf)  {
242; CHECK-LABEL: scatter_mask_qps_execdomain:
243; CHECK:       ## %bb.0:
244; CHECK-NEXT:    kmovd %esi, %k1
245; CHECK-NEXT:    vmovaps (%rdi), %ymm1
246; CHECK-NEXT:    vscatterqps %ymm1, (%rcx,%zmm0,4) {%k1}
247; CHECK-NEXT:    vzeroupper
248; CHECK-NEXT:    retq
249  %x = load <8 x float>, <8 x float>* %src, align 32
250  call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x float> %x, i32 4)
251  ret void
252}
253
; Gather/scatter of <8 x float> through i64 indices with a constant all-ones
; mask (i8 -1) on both calls. The expected asm materializes each mask with
; kxnorw and zeroes the gather destination with vxorps instead of using the
; %src pass-through value.
254define void @gather_qps(<8 x i64> %ind, <8 x float> %src, i8* %base, i8* %stbuf)  {
255; CHECK-LABEL: gather_qps:
256; CHECK:       ## %bb.0:
257; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
258; CHECK-NEXT:    kxnorw %k0, %k0, %k1
259; CHECK-NEXT:    kxnorw %k0, %k0, %k2
260; CHECK-NEXT:    vgatherqps (%rdi,%zmm0,4), %ymm1 {%k2}
261; CHECK-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
262; CHECK-NEXT:    vscatterqps %ymm1, (%rsi,%zmm0,4) {%k1}
263; CHECK-NEXT:    vzeroupper
264; CHECK-NEXT:    retq
265  %x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 -1, i32 4)
266  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
267  call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 -1, <8 x i64>%ind2, <8 x float> %x, i32 4)
268  ret void
269}
270
271declare  void @llvm.x86.avx512.gatherpf.qps.512(i8, <8 x i64>, i8* , i32, i32);
272declare  void @llvm.x86.avx512.scatterpf.qps.512(i8, <8 x i64>, i8* , i32, i32);
; Gather/scatter prefetch intrinsics with four constant masks (-1, 0, 1, 120).
; Per the expected asm: the fourth operand becomes the addressing scale (4 or
; 2), and the last operand selects the instruction form (3 -> pf0, 2 -> pf1).
; Masks -1 and 0 are materialized with kxnorw / kxorw; 1 and 120 go through a
; movb + kmovd pair.
273define void @prefetch(<8 x i64> %ind, i8* %base) {
274; CHECK-LABEL: prefetch:
275; CHECK:       ## %bb.0:
276; CHECK-NEXT:    kxnorw %k0, %k0, %k1
277; CHECK-NEXT:    vgatherpf0qps (%rdi,%zmm0,4) {%k1}
278; CHECK-NEXT:    kxorw %k0, %k0, %k1
279; CHECK-NEXT:    vgatherpf1qps (%rdi,%zmm0,4) {%k1}
280; CHECK-NEXT:    movb $1, %al
281; CHECK-NEXT:    kmovd %eax, %k1
282; CHECK-NEXT:    vscatterpf0qps (%rdi,%zmm0,2) {%k1}
283; CHECK-NEXT:    movb $120, %al
284; CHECK-NEXT:    kmovd %eax, %k1
285; CHECK-NEXT:    vscatterpf1qps (%rdi,%zmm0,2) {%k1}
286; CHECK-NEXT:    vzeroupper
287; CHECK-NEXT:    retq
288  call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 4, i32 3)
289  call void @llvm.x86.avx512.gatherpf.qps.512(i8 0, <8 x i64> %ind, i8* %base, i32 4, i32 2)
290  call void @llvm.x86.avx512.scatterpf.qps.512(i8 1, <8 x i64> %ind, i8* %base, i32 2, i32 3)
291  call void @llvm.x86.avx512.scatterpf.qps.512(i8 120, <8 x i64> %ind, i8* %base, i32 2, i32 2)
292  ret void
293}
294
295declare <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double>, i8*, <2 x i64>, i8, i32)
296
; 128-bit gather of <2 x double> through <2 x i64> indices, issued twice:
; once with the variable mask %x3 (scale 4) and once with an all-ones mask
; (scale 2); the two results are added. In the expected asm the all-ones
; gather writes into a zeroed register (vxorpd) with a kxnorw-built mask.
297define <2 x double>@test_int_x86_avx512_gather3div2_df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
298; CHECK-LABEL: test_int_x86_avx512_gather3div2_df:
299; CHECK:       ## %bb.0:
300; CHECK-NEXT:    kmovd %esi, %k1
301; CHECK-NEXT:    vgatherqpd (%rdi,%xmm1,4), %xmm0 {%k1}
302; CHECK-NEXT:    kxnorw %k0, %k0, %k1
303; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
304; CHECK-NEXT:    vgatherqpd (%rdi,%xmm1,2), %xmm2 {%k1}
305; CHECK-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
306; CHECK-NEXT:    retq
307  %res = call <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 4)
308  %res1 = call <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 -1, i32 2)
309  %res2 = fadd <2 x double> %res, %res1
310  ret <2 x double> %res2
311}
312
313declare <2 x i64> @llvm.x86.avx512.gather3div2.di(<2 x i64>, i8*, <2 x i64>, i8, i32)
314
; 128-bit gather of <2 x i64> through <2 x i64> indices. Both calls use
; identical operands (mask %x3, scale 8); the expected asm contains a single
; vpgatherqq whose result is added to itself.
315define <2 x i64>@test_int_x86_avx512_gather3div2_di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
316; CHECK-LABEL: test_int_x86_avx512_gather3div2_di:
317; CHECK:       ## %bb.0:
318; CHECK-NEXT:    kmovd %esi, %k1
319; CHECK-NEXT:    vpgatherqq (%rdi,%xmm1,8), %xmm0 {%k1}
320; CHECK-NEXT:    vpaddq %xmm0, %xmm0, %xmm0
321; CHECK-NEXT:    retq
322  %res = call <2 x i64> @llvm.x86.avx512.gather3div2.di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 8)
323  %res1 = call <2 x i64> @llvm.x86.avx512.gather3div2.di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 8)
324  %res2 = add <2 x i64> %res, %res1
325  ret <2 x i64> %res2
326}
327
328declare <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double>, i8*, <4 x i64>, i8, i32)
329
; 256-bit gather of <4 x double> through <4 x i64> indices, issued twice:
; once with the variable mask %x3 (scale 4) and once with an all-ones mask
; (scale 2); the two results are added. In the expected asm the all-ones
; gather writes into a zeroed register with a kxnorw-built mask.
330define <4 x double>@test_int_x86_avx512_gather3div4_df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
331; CHECK-LABEL: test_int_x86_avx512_gather3div4_df:
332; CHECK:       ## %bb.0:
333; CHECK-NEXT:    kmovd %esi, %k1
334; CHECK-NEXT:    vgatherqpd (%rdi,%ymm1,4), %ymm0 {%k1}
335; CHECK-NEXT:    kxnorw %k0, %k0, %k1
336; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
337; CHECK-NEXT:    vgatherqpd (%rdi,%ymm1,2), %ymm2 {%k1}
338; CHECK-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
339; CHECK-NEXT:    retq
340  %res = call <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4)
341  %res1 = call <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 2)
342  %res2 = fadd <4 x double> %res, %res1
343  ret <4 x double> %res2
344}
345
346declare <4 x i64> @llvm.x86.avx512.gather3div4.di(<4 x i64>, i8*, <4 x i64>, i8, i32)
347
; 256-bit gather of <4 x i64> through <4 x i64> indices, issued twice with
; scale 8: once with the variable mask %x3 and once with an all-ones mask;
; the two results are added. The expected asm keeps both gathers, the second
; one writing into a zeroed register (vpxor) with a kxnorw-built mask.
348define <4 x i64>@test_int_x86_avx512_gather3div4_di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
349; CHECK-LABEL: test_int_x86_avx512_gather3div4_di:
350; CHECK:       ## %bb.0:
351; CHECK-NEXT:    kmovd %esi, %k1
352; CHECK-NEXT:    vpgatherqq (%rdi,%ymm1,8), %ymm0 {%k1}
353; CHECK-NEXT:    kxnorw %k0, %k0, %k1
354; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
355; CHECK-NEXT:    vpgatherqq (%rdi,%ymm1,8), %ymm2 {%k1}
356; CHECK-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
357; CHECK-NEXT:    retq
358  %res = call <4 x i64> @llvm.x86.avx512.gather3div4.di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 8)
359  %res1 = call <4 x i64> @llvm.x86.avx512.gather3div4.di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 8)
360  %res2 = add <4 x i64> %res, %res1
361  ret <4 x i64> %res2
362}
363
364declare <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float>, i8*, <2 x i64>, i8, i32)
365
; 128-bit gather of <4 x float> through <2 x i64> indices, issued twice:
; once with the variable mask %x3 (scale 4) and once with an all-ones mask
; (scale 2); the two results are added. In the expected asm the all-ones
; gather writes into a zeroed register (vxorps) with a kxnorw-built mask.
366define <4 x float>@test_int_x86_avx512_gather3div4_sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
367; CHECK-LABEL: test_int_x86_avx512_gather3div4_sf:
368; CHECK:       ## %bb.0:
369; CHECK-NEXT:    kmovd %esi, %k1
370; CHECK-NEXT:    vgatherqps (%rdi,%xmm1,4), %xmm0 {%k1}
371; CHECK-NEXT:    kxnorw %k0, %k0, %k1
372; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
373; CHECK-NEXT:    vgatherqps (%rdi,%xmm1,2), %xmm2 {%k1}
374; CHECK-NEXT:    vaddps %xmm2, %xmm0, %xmm0
375; CHECK-NEXT:    retq
376  %res = call <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 4)
377  %res1 = call <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 -1, i32 2)
378  %res2 = fadd <4 x float> %res, %res1
379  ret <4 x float> %res2
380}
381
382declare <4 x i32> @llvm.x86.avx512.gather3div4.si(<4 x i32>, i8*, <2 x i64>, i8, i32)
383
; 128-bit gather of <4 x i32> through <2 x i64> indices, issued twice with
; scale 4: first with an all-ones mask, then with the variable mask %x3; the
; two results are added. The expected asm emits the all-ones gather first,
; into a zeroed register (vpxor) with a kxnorw-built mask in %k2.
384define <4 x i32>@test_int_x86_avx512_gather3div4_si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
385; CHECK-LABEL: test_int_x86_avx512_gather3div4_si:
386; CHECK:       ## %bb.0:
387; CHECK-NEXT:    kmovd %esi, %k1
388; CHECK-NEXT:    kxnorw %k0, %k0, %k2
389; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
390; CHECK-NEXT:    vpgatherqd (%rdi,%xmm1,4), %xmm2 {%k2}
391; CHECK-NEXT:    vpgatherqd (%rdi,%xmm1,4), %xmm0 {%k1}
392; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
393; CHECK-NEXT:    retq
394  %res = call <4 x i32> @llvm.x86.avx512.gather3div4.si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, i8 -1, i32 4)
395  %res1 = call <4 x i32> @llvm.x86.avx512.gather3div4.si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 4)
396  %res2 = add <4 x i32> %res, %res1
397  ret <4 x i32> %res2
398}
399
400declare <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float>, i8*, <4 x i64>, i8, i32)
401
; Gather of <4 x float> through <4 x i64> indices (ymm index, xmm data in the
; expected asm), issued twice: once with the variable mask %x3 (scale 4) and
; once with an all-ones mask (scale 2); the two results are added.
402define <4 x float>@test_int_x86_avx512_gather3div8_sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
403; CHECK-LABEL: test_int_x86_avx512_gather3div8_sf:
404; CHECK:       ## %bb.0:
405; CHECK-NEXT:    kmovd %esi, %k1
406; CHECK-NEXT:    vgatherqps (%rdi,%ymm1,4), %xmm0 {%k1}
407; CHECK-NEXT:    kxnorw %k0, %k0, %k1
408; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
409; CHECK-NEXT:    vgatherqps (%rdi,%ymm1,2), %xmm2 {%k1}
410; CHECK-NEXT:    vaddps %xmm2, %xmm0, %xmm0
411; CHECK-NEXT:    vzeroupper
412; CHECK-NEXT:    retq
413  %res = call <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4)
414  %res1 = call <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 2)
415  %res2 = fadd <4 x float> %res, %res1
416  ret <4 x float> %res2
417}
418
419declare <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32>, i8*, <4 x i64>, i8, i32)
420
; Gather of <4 x i32> through <4 x i64> indices, issued twice with the same
; variable mask %x3 but different scales (4 and 2); the two results are added.
; The expected asm duplicates both the pass-through value (vmovdqa) and the
; mask (kmovq) so each gather has its own destination and mask register.
421define <4 x i32>@test_int_x86_avx512_gather3div8_si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
422; CHECK-LABEL: test_int_x86_avx512_gather3div8_si:
423; CHECK:       ## %bb.0:
424; CHECK-NEXT:    kmovd %esi, %k1
425; CHECK-NEXT:    vmovdqa %xmm0, %xmm2
426; CHECK-NEXT:    kmovq %k1, %k2
427; CHECK-NEXT:    vpgatherqd (%rdi,%ymm1,4), %xmm2 {%k2}
428; CHECK-NEXT:    vpgatherqd (%rdi,%ymm1,2), %xmm0 {%k1}
429; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
430; CHECK-NEXT:    vzeroupper
431; CHECK-NEXT:    retq
432  %res = call <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4)
433  %res1 = call <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 2)
434  %res2 = add <4 x i32> %res, %res1
435  ret <4 x i32> %res2
436}
437
438declare <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double>, i8*, <4 x i32>, i8, i32)
439
; 128-bit gather of <2 x double> through <4 x i32> indices, issued twice:
; once with the variable mask %x3 (scale 4) and once with an all-ones mask
; (scale 2); the two results are added. In the expected asm the all-ones
; gather writes into a zeroed register (vxorpd) with a kxnorw-built mask.
440define <2 x double>@test_int_x86_avx512_gather3siv2_df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
441; CHECK-LABEL: test_int_x86_avx512_gather3siv2_df:
442; CHECK:       ## %bb.0:
443; CHECK-NEXT:    kmovd %esi, %k1
444; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,4), %xmm0 {%k1}
445; CHECK-NEXT:    kxnorw %k0, %k0, %k1
446; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
447; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,2), %xmm2 {%k1}
448; CHECK-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
449; CHECK-NEXT:    retq
450  %res = call <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 4)
451  %res1 = call <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 2)
452  %res2 = fadd <2 x double> %res, %res1
453  ret <2 x double> %res2
454}
455
456declare <2 x i64> @llvm.x86.avx512.gather3siv2.di(<2 x i64>, i8*, <4 x i32>, i8, i32)
457
; 128-bit gather of <2 x i64> through <4 x i32> indices. Both calls use
; identical operands (mask %x3, scale 8); the expected asm contains a single
; vpgatherdq whose result is added to itself.
458define <2 x i64>@test_int_x86_avx512_gather3siv2_di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
459; CHECK-LABEL: test_int_x86_avx512_gather3siv2_di:
460; CHECK:       ## %bb.0:
461; CHECK-NEXT:    kmovd %esi, %k1
462; CHECK-NEXT:    vpgatherdq (%rdi,%xmm1,8), %xmm0 {%k1}
463; CHECK-NEXT:    vpaddq %xmm0, %xmm0, %xmm0
464; CHECK-NEXT:    retq
465  %res = call <2 x i64> @llvm.x86.avx512.gather3siv2.di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
466  %res1 = call <2 x i64> @llvm.x86.avx512.gather3siv2.di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
467  %res2 = add <2 x i64> %res, %res1
468  ret <2 x i64> %res2
469}
470
471declare <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double>, i8*, <4 x i32>, i8, i32)
472
; 256-bit gather of <4 x double> through <4 x i32> indices (xmm index, ymm
; data in the expected asm), issued twice: once with the variable mask %x3
; (scale 4) and once with an all-ones mask (scale 2); the results are added.
473define <4 x double>@test_int_x86_avx512_gather3siv4_df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
474; CHECK-LABEL: test_int_x86_avx512_gather3siv4_df:
475; CHECK:       ## %bb.0:
476; CHECK-NEXT:    kmovd %esi, %k1
477; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,4), %ymm0 {%k1}
478; CHECK-NEXT:    kxnorw %k0, %k0, %k1
479; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
480; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,2), %ymm2 {%k1}
481; CHECK-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
482; CHECK-NEXT:    retq
483  %res = call <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 4)
484  %res1 = call <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 2)
485  %res2 = fadd <4 x double> %res, %res1
486  ret <4 x double> %res2
487}
488
489declare <4 x i64> @llvm.x86.avx512.gather3siv4.di(<4 x i64>, i8*, <4 x i32>, i8, i32)
490
; 256-bit gather of <4 x i64> through <4 x i32> indices. Both calls use
; identical operands (mask %x3, scale 8); the expected asm contains a single
; vpgatherdq whose result is added to itself.
491define <4 x i64>@test_int_x86_avx512_gather3siv4_di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
492; CHECK-LABEL: test_int_x86_avx512_gather3siv4_di:
493; CHECK:       ## %bb.0:
494; CHECK-NEXT:    kmovd %esi, %k1
495; CHECK-NEXT:    vpgatherdq (%rdi,%xmm1,8), %ymm0 {%k1}
496; CHECK-NEXT:    vpaddq %ymm0, %ymm0, %ymm0
497; CHECK-NEXT:    retq
498  %res = call <4 x i64> @llvm.x86.avx512.gather3siv4.di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
499  %res1 = call <4 x i64> @llvm.x86.avx512.gather3siv4.di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
500  %res2 = add <4 x i64> %res, %res1
501  ret <4 x i64> %res2
502}
503
504declare <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float>, i8*, <4 x i32>, i8, i32)
505
; 128-bit gather of <4 x float> through <4 x i32> indices, issued twice:
; once with the variable mask %x3 (scale 4) and once with an all-ones mask
; (scale 2); the two results are added. In the expected asm the all-ones
; gather writes into a zeroed register (vxorps) with a kxnorw-built mask.
506define <4 x float>@test_int_x86_avx512_gather3siv4_sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
507; CHECK-LABEL: test_int_x86_avx512_gather3siv4_sf:
508; CHECK:       ## %bb.0:
509; CHECK-NEXT:    kmovd %esi, %k1
510; CHECK-NEXT:    vgatherdps (%rdi,%xmm1,4), %xmm0 {%k1}
511; CHECK-NEXT:    kxnorw %k0, %k0, %k1
512; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
513; CHECK-NEXT:    vgatherdps (%rdi,%xmm1,2), %xmm2 {%k1}
514; CHECK-NEXT:    vaddps %xmm2, %xmm0, %xmm0
515; CHECK-NEXT:    retq
516  %res = call <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 4)
517  %res1 = call <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 2)
518  %res2 = fadd <4 x float> %res, %res1
519  ret <4 x float> %res2
520}
521
522declare <4 x i32> @llvm.x86.avx512.gather3siv4.si(<4 x i32>, i8*, <4 x i32>, i8, i32)
523
; 128-bit gather of <4 x i32> through <4 x i32> indices, issued twice: first
; with an all-ones mask (scale 4), then with the variable mask %x3 (scale 2);
; the two results are added. The expected asm emits the all-ones gather first,
; into a zeroed register (vpxor) with a kxnorw-built mask in %k2.
524define <4 x i32>@test_int_x86_avx512_gather3siv4_si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
525; CHECK-LABEL: test_int_x86_avx512_gather3siv4_si:
526; CHECK:       ## %bb.0:
527; CHECK-NEXT:    kmovd %esi, %k1
528; CHECK-NEXT:    kxnorw %k0, %k0, %k2
529; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
530; CHECK-NEXT:    vpgatherdd (%rdi,%xmm1,4), %xmm2 {%k2}
531; CHECK-NEXT:    vpgatherdd (%rdi,%xmm1,2), %xmm0 {%k1}
532; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
533; CHECK-NEXT:    retq
534  %res = call <4 x i32> @llvm.x86.avx512.gather3siv4.si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 4)
535  %res1 = call <4 x i32> @llvm.x86.avx512.gather3siv4.si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 2)
536  %res2 = add <4 x i32> %res, %res1
537  ret <4 x i32> %res2
538}
539
540declare <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float>, i8*, <8 x i32>, i8, i32)
541
; 256-bit gather of <8 x float> through <8 x i32> indices, issued twice:
; once with the variable mask %x3 (scale 4) and once with an all-ones mask
; (scale 2); the two results are added. In the expected asm the all-ones
; gather writes into a zeroed register (vxorps) with a kxnorw-built mask.
542define <8 x float>@test_int_x86_avx512_gather3siv8_sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 %x3) {
543; CHECK-LABEL: test_int_x86_avx512_gather3siv8_sf:
544; CHECK:       ## %bb.0:
545; CHECK-NEXT:    kmovd %esi, %k1
546; CHECK-NEXT:    vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
547; CHECK-NEXT:    kxnorw %k0, %k0, %k1
548; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
549; CHECK-NEXT:    vgatherdps (%rdi,%ymm1,2), %ymm2 {%k1}
550; CHECK-NEXT:    vaddps %ymm2, %ymm0, %ymm0
551; CHECK-NEXT:    retq
552  %res = call <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 %x3, i32 4)
553  %res1 = call <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 -1, i32 2)
554  %res2 = fadd <8 x float> %res, %res1
555  ret <8 x float> %res2
556}
557
558declare <8 x i32> @llvm.x86.avx512.gather3siv8.si(<8 x i32>, i8*, <8 x i32>, i8, i32)
559
; 256-bit gather of <8 x i32> through <8 x i32> indices, issued twice with the
; same variable mask %x3 but different scales (4 and 2); results are added.
; The expected asm duplicates both the pass-through value (vmovdqa) and the
; mask (kmovq) so each gather has its own destination and mask register.
560define <8 x i32>@test_int_x86_avx512_gather3siv8_si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, i8 %x3) {
561; CHECK-LABEL: test_int_x86_avx512_gather3siv8_si:
562; CHECK:       ## %bb.0:
563; CHECK-NEXT:    kmovd %esi, %k1
564; CHECK-NEXT:    vmovdqa %ymm0, %ymm2
565; CHECK-NEXT:    kmovq %k1, %k2
566; CHECK-NEXT:    vpgatherdd (%rdi,%ymm1,4), %ymm2 {%k2}
567; CHECK-NEXT:    vpgatherdd (%rdi,%ymm1,2), %ymm0 {%k1}
568; CHECK-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
569; CHECK-NEXT:    retq
570  %res = call <8 x i32> @llvm.x86.avx512.gather3siv8.si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, i8 %x3, i32 4)
571  %res1 = call <8 x i32> @llvm.x86.avx512.gather3siv8.si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, i8 %x3, i32 2)
572  %res2 = add <8 x i32> %res, %res1
573  ret <8 x i32> %res2
574}
575
576declare void @llvm.x86.avx512.scatterdiv2.df(i8*, i8, <2 x i64>, <2 x double>, i32)
577
; 128-bit scatter of <2 x double> through <2 x i64> indices, issued twice:
; first with an all-ones mask (scale 2), then with the variable mask %x1
; (scale 4). The expected asm builds the all-ones mask with kxnorw in %k2.
578define void@test_int_x86_avx512_scatterdiv2_df(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x double> %x3) {
579; CHECK-LABEL: test_int_x86_avx512_scatterdiv2_df:
580; CHECK:       ## %bb.0:
581; CHECK-NEXT:    kmovd %esi, %k1
582; CHECK-NEXT:    kxnorw %k0, %k0, %k2
583; CHECK-NEXT:    vscatterqpd %xmm1, (%rdi,%xmm0,2) {%k2}
584; CHECK-NEXT:    vscatterqpd %xmm1, (%rdi,%xmm0,4) {%k1}
585; CHECK-NEXT:    retq
586  call void @llvm.x86.avx512.scatterdiv2.df(i8* %x0, i8 -1, <2 x i64> %x2, <2 x double> %x3, i32 2)
587  call void @llvm.x86.avx512.scatterdiv2.df(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x double> %x3, i32 4)
588  ret void
589}
590
591declare void @llvm.x86.avx512.scatterdiv2.di(i8*, i8, <2 x i64>, <2 x i64>, i32)
592
; 128-bit scatter of <2 x i64> through <2 x i64> indices, issued twice:
; first with the variable mask %x1 (scale 2), then with an all-ones mask
; (scale 4). The expected asm reuses %k1, rebuilding it with kxnorw for the
; second scatter.
593define void@test_int_x86_avx512_scatterdiv2_di(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x i64> %x3) {
594; CHECK-LABEL: test_int_x86_avx512_scatterdiv2_di:
595; CHECK:       ## %bb.0:
596; CHECK-NEXT:    kmovd %esi, %k1
597; CHECK-NEXT:    vpscatterqq %xmm1, (%rdi,%xmm0,2) {%k1}
598; CHECK-NEXT:    kxnorw %k0, %k0, %k1
599; CHECK-NEXT:    vpscatterqq %xmm1, (%rdi,%xmm0,4) {%k1}
600; CHECK-NEXT:    retq
601  call void @llvm.x86.avx512.scatterdiv2.di(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x i64> %x3, i32 2)
602  call void @llvm.x86.avx512.scatterdiv2.di(i8* %x0, i8 -1, <2 x i64> %x2, <2 x i64> %x3, i32 4)
603  ret void
604}
605
606declare void @llvm.x86.avx512.scatterdiv4.df(i8*, i8, <4 x i64>, <4 x double>, i32)
607
; 256-bit scatter of <4 x double> through <4 x i64> indices, issued twice:
; first with the variable mask %x1 (scale 2), then with an all-ones mask
; (scale 4). The expected asm reuses %k1, rebuilding it with kxnorw for the
; second scatter.
608define void@test_int_x86_avx512_scatterdiv4_df(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x double> %x3) {
609; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_df:
610; CHECK:       ## %bb.0:
611; CHECK-NEXT:    kmovd %esi, %k1
612; CHECK-NEXT:    vscatterqpd %ymm1, (%rdi,%ymm0,2) {%k1}
613; CHECK-NEXT:    kxnorw %k0, %k0, %k1
614; CHECK-NEXT:    vscatterqpd %ymm1, (%rdi,%ymm0,4) {%k1}
615; CHECK-NEXT:    vzeroupper
616; CHECK-NEXT:    retq
617  call void @llvm.x86.avx512.scatterdiv4.df(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x double> %x3, i32 2)
618  call void @llvm.x86.avx512.scatterdiv4.df(i8* %x0, i8 -1, <4 x i64> %x2, <4 x double> %x3, i32 4)
619  ret void
620}
621
622declare void @llvm.x86.avx512.scatterdiv4.di(i8*, i8, <4 x i64>, <4 x i64>, i32)
623
; 256-bit scatter of <4 x i64> through <4 x i64> indices, issued twice:
; first with the variable mask %x1 (scale 2), then with an all-ones mask
; (scale 4). The expected asm reuses %k1, rebuilding it with kxnorw for the
; second scatter.
624define void@test_int_x86_avx512_scatterdiv4_di(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i64> %x3) {
625; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_di:
626; CHECK:       ## %bb.0:
627; CHECK-NEXT:    kmovd %esi, %k1
628; CHECK-NEXT:    vpscatterqq %ymm1, (%rdi,%ymm0,2) {%k1}
629; CHECK-NEXT:    kxnorw %k0, %k0, %k1
630; CHECK-NEXT:    vpscatterqq %ymm1, (%rdi,%ymm0,4) {%k1}
631; CHECK-NEXT:    vzeroupper
632; CHECK-NEXT:    retq
633  call void @llvm.x86.avx512.scatterdiv4.di(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i64> %x3, i32 2)
634  call void @llvm.x86.avx512.scatterdiv4.di(i8* %x0, i8 -1, <4 x i64> %x2, <4 x i64> %x3, i32 4)
635  ret void
636}
637
638declare void @llvm.x86.avx512.scatterdiv4.sf(i8*, i8, <2 x i64>, <4 x float>, i32)
639
; 128-bit scatter of <4 x float> through <2 x i64> indices, issued twice:
; first with the variable mask %x1 (scale 2), then with an all-ones mask
; (scale 4). The expected asm reuses %k1, rebuilding it with kxnorw for the
; second scatter.
640define void@test_int_x86_avx512_scatterdiv4_sf(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x float> %x3) {
641; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_sf:
642; CHECK:       ## %bb.0:
643; CHECK-NEXT:    kmovd %esi, %k1
644; CHECK-NEXT:    vscatterqps %xmm1, (%rdi,%xmm0,2) {%k1}
645; CHECK-NEXT:    kxnorw %k0, %k0, %k1
646; CHECK-NEXT:    vscatterqps %xmm1, (%rdi,%xmm0,4) {%k1}
647; CHECK-NEXT:    retq
648  call void @llvm.x86.avx512.scatterdiv4.sf(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x float> %x3, i32 2)
649  call void @llvm.x86.avx512.scatterdiv4.sf(i8* %x0, i8 -1, <2 x i64> %x2, <4 x float> %x3, i32 4)
650  ret void
651}
652
653declare void @llvm.x86.avx512.scatterdiv4.si(i8*, i8, <2 x i64>, <4 x i32>, i32)
654
; Calls @llvm.x86.avx512.scatterdiv4.si twice on the same base pointer %x0,
; <2 x i64> index vector %x2 and <4 x i32> data vector %x3. Unlike most
; sibling tests, the constant -1 mask is used by the FIRST call here, so the
; expected assembly sets up both masks up front (%k1 from the argument, %k2
; via kxnorw) before the two vpscatterqd instructions.
655define void@test_int_x86_avx512_scatterdiv4_si(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x i32> %x3) {
656; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_si:
657; CHECK:       ## %bb.0:
658; CHECK-NEXT:    kmovd %esi, %k1
659; CHECK-NEXT:    kxnorw %k0, %k0, %k2
660; CHECK-NEXT:    vpscatterqd %xmm1, (%rdi,%xmm0,2) {%k2}
661; CHECK-NEXT:    vpscatterqd %xmm1, (%rdi,%xmm0,4) {%k1}
662; CHECK-NEXT:    retq
  ; Constant mask -1, scale 2: expected to use the kxnorw-built mask in %k2.
663  call void @llvm.x86.avx512.scatterdiv4.si(i8* %x0, i8 -1, <2 x i64> %x2, <4 x i32> %x3, i32 2)
  ; Variable mask %x1, scale 4: expected to use the kmovd-loaded mask in %k1.
664  call void @llvm.x86.avx512.scatterdiv4.si(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x i32> %x3, i32 4)
665  ret void
666}
667
668declare void @llvm.x86.avx512.scatterdiv8.sf(i8*, i8, <4 x i64>, <4 x float>, i32)
669
; Calls @llvm.x86.avx512.scatterdiv8.sf twice on the same base pointer %x0,
; <4 x i64> index vector %x2 and <4 x float> data vector %x3. The expected
; assembly selects vscatterqps with an xmm data register but a ymm index
; register, and uses the trailing i32 operand as the addressing-mode scale.
670define void@test_int_x86_avx512_scatterdiv8_sf(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x float> %x3) {
671; CHECK-LABEL: test_int_x86_avx512_scatterdiv8_sf:
672; CHECK:       ## %bb.0:
673; CHECK-NEXT:    kmovd %esi, %k1
674; CHECK-NEXT:    vscatterqps %xmm1, (%rdi,%ymm0,2) {%k1}
675; CHECK-NEXT:    kxnorw %k0, %k0, %k1
676; CHECK-NEXT:    vscatterqps %xmm1, (%rdi,%ymm0,4) {%k1}
677; CHECK-NEXT:    vzeroupper
678; CHECK-NEXT:    retq
  ; Variable mask %x1, scale 2: the mask is expected to reach %k1 via kmovd.
679  call void @llvm.x86.avx512.scatterdiv8.sf(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x float> %x3, i32 2)
  ; Constant mask -1, scale 4: the mask is expected to be built with kxnorw.
680  call void @llvm.x86.avx512.scatterdiv8.sf(i8* %x0, i8 -1, <4 x i64> %x2, <4 x float> %x3, i32 4)
681  ret void
682}
683
684declare void @llvm.x86.avx512.scatterdiv8.si(i8*, i8, <4 x i64>, <4 x i32>, i32)
685
; Calls @llvm.x86.avx512.scatterdiv8.si twice on the same base pointer %x0,
; <4 x i64> index vector %x2 and <4 x i32> data vector %x3. The expected
; assembly selects vpscatterqd with an xmm data register but a ymm index
; register, and uses the trailing i32 operand as the addressing-mode scale.
686define void@test_int_x86_avx512_scatterdiv8_si(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i32> %x3) {
687; CHECK-LABEL: test_int_x86_avx512_scatterdiv8_si:
688; CHECK:       ## %bb.0:
689; CHECK-NEXT:    kmovd %esi, %k1
690; CHECK-NEXT:    vpscatterqd %xmm1, (%rdi,%ymm0,2) {%k1}
691; CHECK-NEXT:    kxnorw %k0, %k0, %k1
692; CHECK-NEXT:    vpscatterqd %xmm1, (%rdi,%ymm0,4) {%k1}
693; CHECK-NEXT:    vzeroupper
694; CHECK-NEXT:    retq
  ; Variable mask %x1, scale 2: the mask is expected to reach %k1 via kmovd.
695  call void @llvm.x86.avx512.scatterdiv8.si(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i32> %x3, i32 2)
  ; Constant mask -1, scale 4: the mask is expected to be built with kxnorw.
696  call void @llvm.x86.avx512.scatterdiv8.si(i8* %x0, i8 -1, <4 x i64> %x2, <4 x i32> %x3, i32 4)
697  ret void
698}
699
700declare void @llvm.x86.avx512.scattersiv2.df(i8*, i8, <4 x i32>, <2 x double>, i32)
701
; Calls @llvm.x86.avx512.scattersiv2.df twice on the same base pointer %x0,
; <4 x i32> index vector %x2 and <2 x double> data vector %x3. The constant -1
; mask is used by the FIRST call here, so the expected assembly sets up both
; masks up front (%k1 from the argument, %k2 via kxnorw) before the two
; vscatterdpd instructions.
702define void@test_int_x86_avx512_scattersiv2_df(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x double> %x3) {
703; CHECK-LABEL: test_int_x86_avx512_scattersiv2_df:
704; CHECK:       ## %bb.0:
705; CHECK-NEXT:    kmovd %esi, %k1
706; CHECK-NEXT:    kxnorw %k0, %k0, %k2
707; CHECK-NEXT:    vscatterdpd %xmm1, (%rdi,%xmm0,2) {%k2}
708; CHECK-NEXT:    vscatterdpd %xmm1, (%rdi,%xmm0,4) {%k1}
709; CHECK-NEXT:    retq
  ; Constant mask -1, scale 2: expected to use the kxnorw-built mask in %k2.
710  call void @llvm.x86.avx512.scattersiv2.df(i8* %x0, i8 -1, <4 x i32> %x2, <2 x double> %x3, i32 2)
  ; Variable mask %x1, scale 4: expected to use the kmovd-loaded mask in %k1.
711  call void @llvm.x86.avx512.scattersiv2.df(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x double> %x3, i32 4)
712  ret void
713}
714
715declare void @llvm.x86.avx512.scattersiv2.di(i8*, i8, <4 x i32>, <2 x i64>, i32)
716
; Calls @llvm.x86.avx512.scattersiv2.di twice on the same base pointer %x0,
; <4 x i32> index vector %x2 and <2 x i64> data vector %x3. The constant -1
; mask is used by the FIRST call here, so the expected assembly sets up both
; masks up front (%k1 from the argument, %k2 via kxnorw) before the two
; vpscatterdq instructions.
717define void@test_int_x86_avx512_scattersiv2_di(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x i64> %x3) {
718; CHECK-LABEL: test_int_x86_avx512_scattersiv2_di:
719; CHECK:       ## %bb.0:
720; CHECK-NEXT:    kmovd %esi, %k1
721; CHECK-NEXT:    kxnorw %k0, %k0, %k2
722; CHECK-NEXT:    vpscatterdq %xmm1, (%rdi,%xmm0,2) {%k2}
723; CHECK-NEXT:    vpscatterdq %xmm1, (%rdi,%xmm0,4) {%k1}
724; CHECK-NEXT:    retq
  ; Constant mask -1, scale 2: expected to use the kxnorw-built mask in %k2.
725  call void @llvm.x86.avx512.scattersiv2.di(i8* %x0, i8 -1, <4 x i32> %x2, <2 x i64> %x3, i32 2)
  ; Variable mask %x1, scale 4: expected to use the kmovd-loaded mask in %k1.
726  call void @llvm.x86.avx512.scattersiv2.di(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x i64> %x3, i32 4)
727  ret void
728}
729
730declare void @llvm.x86.avx512.scattersiv4.df(i8*, i8, <4 x i32>, <4 x double>, i32)
731
; Calls @llvm.x86.avx512.scattersiv4.df twice on the same base pointer %x0,
; <4 x i32> index vector %x2 and <4 x double> data vector %x3. The expected
; assembly selects vscatterdpd with a ymm data register but an xmm index
; register, and uses the trailing i32 operand as the addressing-mode scale.
732define void@test_int_x86_avx512_scattersiv4_df(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x double> %x3) {
733; CHECK-LABEL: test_int_x86_avx512_scattersiv4_df:
734; CHECK:       ## %bb.0:
735; CHECK-NEXT:    kmovd %esi, %k1
736; CHECK-NEXT:    vscatterdpd %ymm1, (%rdi,%xmm0,2) {%k1}
737; CHECK-NEXT:    kxnorw %k0, %k0, %k1
738; CHECK-NEXT:    vscatterdpd %ymm1, (%rdi,%xmm0,4) {%k1}
739; CHECK-NEXT:    vzeroupper
740; CHECK-NEXT:    retq
  ; Variable mask %x1, scale 2: the mask is expected to reach %k1 via kmovd.
741  call void @llvm.x86.avx512.scattersiv4.df(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x double> %x3, i32 2)
  ; Constant mask -1, scale 4: the mask is expected to be built with kxnorw.
742  call void @llvm.x86.avx512.scattersiv4.df(i8* %x0, i8 -1, <4 x i32> %x2, <4 x double> %x3, i32 4)
743  ret void
744}
745
746declare void @llvm.x86.avx512.scattersiv4.di(i8*, i8, <4 x i32>, <4 x i64>, i32)
747
; Calls @llvm.x86.avx512.scattersiv4.di twice on the same base pointer %x0,
; <4 x i32> index vector %x2 and <4 x i64> data vector %x3. The constant -1
; mask is used by the FIRST call here, so the expected assembly sets up both
; masks up front (%k1 from the argument, %k2 via kxnorw) before the two
; vpscatterdq instructions (ymm data register, xmm index register).
748define void@test_int_x86_avx512_scattersiv4_di(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i64> %x3) {
749; CHECK-LABEL: test_int_x86_avx512_scattersiv4_di:
750; CHECK:       ## %bb.0:
751; CHECK-NEXT:    kmovd %esi, %k1
752; CHECK-NEXT:    kxnorw %k0, %k0, %k2
753; CHECK-NEXT:    vpscatterdq %ymm1, (%rdi,%xmm0,2) {%k2}
754; CHECK-NEXT:    vpscatterdq %ymm1, (%rdi,%xmm0,4) {%k1}
755; CHECK-NEXT:    vzeroupper
756; CHECK-NEXT:    retq
  ; Constant mask -1, scale 2: expected to use the kxnorw-built mask in %k2.
757  call void @llvm.x86.avx512.scattersiv4.di(i8* %x0, i8 -1, <4 x i32> %x2, <4 x i64> %x3, i32 2)
  ; Variable mask %x1, scale 4: expected to use the kmovd-loaded mask in %k1.
758  call void @llvm.x86.avx512.scattersiv4.di(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i64> %x3, i32 4)
759  ret void
760}
761
762declare void @llvm.x86.avx512.scattersiv4.sf(i8*, i8, <4 x i32>, <4 x float>, i32)
763
; Calls @llvm.x86.avx512.scattersiv4.sf twice on the same base pointer %x0,
; <4 x i32> index vector %x2 and <4 x float> data vector %x3. The expected
; assembly selects vscatterdps with xmm data and xmm index registers; no
; vzeroupper is expected before the return in this test.
764define void@test_int_x86_avx512_scattersiv4_sf(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x float> %x3) {
765; CHECK-LABEL: test_int_x86_avx512_scattersiv4_sf:
766; CHECK:       ## %bb.0:
767; CHECK-NEXT:    kmovd %esi, %k1
768; CHECK-NEXT:    vscatterdps %xmm1, (%rdi,%xmm0,2) {%k1}
769; CHECK-NEXT:    kxnorw %k0, %k0, %k1
770; CHECK-NEXT:    vscatterdps %xmm1, (%rdi,%xmm0,4) {%k1}
771; CHECK-NEXT:    retq
  ; Variable mask %x1, scale 2: the mask is expected to reach %k1 via kmovd.
772  call void @llvm.x86.avx512.scattersiv4.sf(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x float> %x3, i32 2)
  ; Constant mask -1, scale 4: the mask is expected to be built with kxnorw.
773  call void @llvm.x86.avx512.scattersiv4.sf(i8* %x0, i8 -1, <4 x i32> %x2, <4 x float> %x3, i32 4)
774  ret void
775}
776
777declare void @llvm.x86.avx512.scattersiv4.si(i8*, i8, <4 x i32>, <4 x i32>, i32)
778
; Calls @llvm.x86.avx512.scattersiv4.si twice on the same base pointer %x0,
; <4 x i32> index vector %x2 and <4 x i32> data vector %x3. The expected
; assembly selects vpscatterdd with xmm data and xmm index registers; no
; vzeroupper is expected before the return in this test.
779define void@test_int_x86_avx512_scattersiv4_si(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i32> %x3) {
780; CHECK-LABEL: test_int_x86_avx512_scattersiv4_si:
781; CHECK:       ## %bb.0:
782; CHECK-NEXT:    kmovd %esi, %k1
783; CHECK-NEXT:    vpscatterdd %xmm1, (%rdi,%xmm0,2) {%k1}
784; CHECK-NEXT:    kxnorw %k0, %k0, %k1
785; CHECK-NEXT:    vpscatterdd %xmm1, (%rdi,%xmm0,4) {%k1}
786; CHECK-NEXT:    retq
  ; Variable mask %x1, scale 2: the mask is expected to reach %k1 via kmovd.
787  call void @llvm.x86.avx512.scattersiv4.si(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i32> %x3, i32 2)
  ; Constant mask -1, scale 4: the mask is expected to be built with kxnorw.
788  call void @llvm.x86.avx512.scattersiv4.si(i8* %x0, i8 -1, <4 x i32> %x2, <4 x i32> %x3, i32 4)
789  ret void
790}
791
792declare void @llvm.x86.avx512.scattersiv8.sf(i8*, i8, <8 x i32>, <8 x float>, i32)
793
; Calls @llvm.x86.avx512.scattersiv8.sf twice on the same base pointer %x0,
; <8 x i32> index vector %x2 and <8 x float> data vector %x3. The expected
; assembly selects vscatterdps with ymm data and ymm index registers, and uses
; the trailing i32 operand as the addressing-mode scale.
794define void@test_int_x86_avx512_scattersiv8_sf(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x float> %x3) {
795; CHECK-LABEL: test_int_x86_avx512_scattersiv8_sf:
796; CHECK:       ## %bb.0:
797; CHECK-NEXT:    kmovd %esi, %k1
798; CHECK-NEXT:    vscatterdps %ymm1, (%rdi,%ymm0,2) {%k1}
799; CHECK-NEXT:    kxnorw %k0, %k0, %k1
800; CHECK-NEXT:    vscatterdps %ymm1, (%rdi,%ymm0,4) {%k1}
801; CHECK-NEXT:    vzeroupper
802; CHECK-NEXT:    retq
  ; Variable mask %x1, scale 2: the mask is expected to reach %k1 via kmovd.
803  call void @llvm.x86.avx512.scattersiv8.sf(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x float> %x3, i32 2)
  ; Constant mask -1, scale 4: the mask is expected to be built with kxnorw.
804  call void @llvm.x86.avx512.scattersiv8.sf(i8* %x0, i8 -1, <8 x i32> %x2, <8 x float> %x3, i32 4)
805  ret void
806}
807
808declare void @llvm.x86.avx512.scattersiv8.si(i8*, i8, <8 x i32>, <8 x i32>, i32)
809
; Calls @llvm.x86.avx512.scattersiv8.si twice on the same base pointer %x0,
; <8 x i32> index vector %x2 and <8 x i32> data vector %x3. The expected
; assembly selects vpscatterdd with ymm data and ymm index registers, and uses
; the trailing i32 operand as the addressing-mode scale.
810define void@test_int_x86_avx512_scattersiv8_si(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x i32> %x3) {
811; CHECK-LABEL: test_int_x86_avx512_scattersiv8_si:
812; CHECK:       ## %bb.0:
813; CHECK-NEXT:    kmovd %esi, %k1
814; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
815; CHECK-NEXT:    kxnorw %k0, %k0, %k1
816; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
817; CHECK-NEXT:    vzeroupper
818; CHECK-NEXT:    retq
  ; Variable mask %x1, scale 2: the mask is expected to reach %k1 via kmovd.
819  call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x i32> %x3, i32 2)
  ; Constant mask -1, scale 4: the mask is expected to be built with kxnorw.
820  call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 -1, <8 x i32> %x2, <8 x i32> %x3, i32 4)
821  ret void
822}
823
; Calls @llvm.x86.avx512.scattersiv8.si four times with compile-time constant
; masks only (-1, 0, 1 and 96); the function takes no mask argument. The
; expected assembly pins how each constant is materialized into %k1:
;   -1 -> kxnorw, 0 -> kxorw, 1 and 96 -> movb into %al followed by kmovd.
; The scatter with mask 0 is still expected to be emitted.
824define void @scatter_mask_test(i8* %x0, <8 x i32> %x2, <8 x i32> %x3) {
825; CHECK-LABEL: scatter_mask_test:
826; CHECK:       ## %bb.0:
827; CHECK-NEXT:    kxnorw %k0, %k0, %k1
828; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
829; CHECK-NEXT:    kxorw %k0, %k0, %k1
830; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
831; CHECK-NEXT:    movb $1, %al
832; CHECK-NEXT:    kmovd %eax, %k1
833; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
834; CHECK-NEXT:    movb $96, %al
835; CHECK-NEXT:    kmovd %eax, %k1
836; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
837; CHECK-NEXT:    vzeroupper
838; CHECK-NEXT:    retq
  ; Mask -1, scale 2.
839  call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 -1, <8 x i32> %x2, <8 x i32> %x3, i32 2)
  ; Mask 0, scale 4.
840  call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 0, <8 x i32> %x2, <8 x i32> %x3, i32 4)
  ; Mask 1, scale 2.
841  call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 1, <8 x i32> %x2, <8 x i32> %x3, i32 2)
  ; Mask 96, scale 4.
842  call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 96, <8 x i32> %x2, <8 x i32> %x3, i32 4)
843  ret void
844}
845
; Calls @llvm.x86.avx512.gather.dps.512 four times with the same pass-through
; vector %src, base pointer %base, index vector %ind and trailing i32 operand
; 4, varying only the compile-time constant i16 mask (-1, 0, 1 and 220). The
; four results are summed with fadd and returned.
;
; The expected assembly pins how each mask is materialized into %k1
; (-1 -> kxnorw, 0 -> kxorw, 1 and 220 -> movw into %ax followed by kmovd)
; and how the gather destination is seeded: for mask -1 the destination is
; zeroed with vxorps instead of being copied from %src, while the other
; gathers either copy %src with vmovaps or gather directly into its register.
; NOTE(review): zeroing is presumably valid because an all-ones mask
; overwrites every element - confirm against the backend lowering.
846define <16 x float> @gather_mask_test(<16 x i32> %ind, <16 x float> %src, i8* %base)  {
847; CHECK-LABEL: gather_mask_test:
848; CHECK:       ## %bb.0:
849; CHECK-NEXT:    kxnorw %k0, %k0, %k1
850; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
851; CHECK-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm2 {%k1}
852; CHECK-NEXT:    kxorw %k0, %k0, %k1
853; CHECK-NEXT:    vmovaps %zmm1, %zmm3
854; CHECK-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1}
855; CHECK-NEXT:    vaddps %zmm3, %zmm2, %zmm2
856; CHECK-NEXT:    movw $1, %ax
857; CHECK-NEXT:    kmovd %eax, %k1
858; CHECK-NEXT:    vmovaps %zmm1, %zmm3
859; CHECK-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1}
860; CHECK-NEXT:    movw $220, %ax
861; CHECK-NEXT:    kmovd %eax, %k1
862; CHECK-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
863; CHECK-NEXT:    vaddps %zmm3, %zmm1, %zmm0
864; CHECK-NEXT:    vaddps %zmm2, %zmm0, %zmm0
865; CHECK-NEXT:    retq
  ; Mask -1.
866  %res = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 -1, i32 4)
  ; Mask 0.
867  %res1 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 0, i32 4)
  ; Mask 1.
868  %res2 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 1, i32 4)
  ; Mask 220.
869  %res3 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 220, i32 4)
870
  ; Combine all four gather results so that none of the calls is dead.
871  %res4 = fadd <16 x float> %res, %res1
872  %res5 = fadd <16 x float> %res3, %res2
873  %res6 = fadd <16 x float> %res5, %res4
874  ret <16 x float> %res6
875}
876