; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck %s

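;;
;; Masked FP gather + scatter round trips: each test gathers into %src under
;; %mask, offsets the indices, and scatters the result under the same mask.
;;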
define void @gather_mask_dps(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_dps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vgatherdps (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddd {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i16 %mask to <16 x i1>
  %x = call <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float> %src, i8* %base, <16 x i32> %ind, <16 x i1> %1, i32 4)
  %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scatter.dps.512(i8* %stbuf, <16 x i1> %1, <16 x i32> %ind2, <16 x float> %x, i32 4)
  ret void
}

define void @gather_mask_dpd(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_dpd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %x = call <8 x double> @llvm.x86.avx512.mask.gather.dpd.512(<8 x double> %src, i8* %base, <8 x i32> %ind, <8 x i1> %1, i32 4)
  %ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scatter.dpd.512(i8* %stbuf, <8 x i1> %1, <8 x i32> %ind2, <8 x double> %x, i32 4)
  ret void
}

define void @gather_mask_qps(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_qps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vgatherqps (%rsi,%zmm0,4), %ymm1 {%k2}
; CHECK-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %x = call <8 x float> @llvm.x86.avx512.mask.gather.qps.512(<8 x float> %src, i8* %base, <8 x i64> %ind, <8 x i1> %1, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.mask.scatter.qps.512(i8* %stbuf, <8 x i1> %1, <8 x i64> %ind2, <8 x float> %x, i32 4)
  ret void
}

define void @gather_mask_qpd(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_qpd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %x = call <8 x double> @llvm.x86.avx512.mask.gather.qpd.512(<8 x double> %src, i8* %base, <8 x i64> %ind, <8 x i1> %1, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.mask.scatter.qpd.512(i8* %stbuf, <8 x i1> %1, <8 x i64> %ind2, <8 x double> %x, i32 4)
  ret void
}
;;
;; Integer Gather/Scatter
;;

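;; Same gather-then-scatter pattern as above, using the integer
;; (vpgather*/vpscatter*) instruction forms.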
define void @gather_mask_dd(<16 x i32> %ind, <16 x i32> %src, i16 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_dd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherdd (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddd {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i16 %mask to <16 x i1>
  %x = call <16 x i32> @llvm.x86.avx512.mask.gather.dpi.512(<16 x i32> %src, i8* %base, <16 x i32> %ind, <16 x i1> %1, i32 4)
  %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scatter.dpi.512(i8* %stbuf, <16 x i1> %1, <16 x i32> %ind2, <16 x i32> %x, i32 4)
  ret void
}

define void @gather_mask_qd(<8 x i64> %ind, <8 x i32> %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_qd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherqd (%rsi,%zmm0,4), %ymm1 {%k2}
; CHECK-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %x = call <8 x i32> @llvm.x86.avx512.mask.gather.qpi.512(<8 x i32> %src, i8* %base, <8 x i64> %ind, <8 x i1> %1, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.mask.scatter.qpi.512(i8* %stbuf, <8 x i1> %1, <8 x i64> %ind2, <8 x i32> %x, i32 4)
  ret void
}

define void @gather_mask_qq(<8 x i64> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_qq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherqq (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %x = call <8 x i64> @llvm.x86.avx512.mask.gather.qpq.512(<8 x i64> %src, i8* %base, <8 x i64> %ind, <8 x i1> %1, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.mask.scatter.qpq.512(i8* %stbuf, <8 x i1> %1, <8 x i64> %ind2, <8 x i64> %x, i32 4)
  ret void
}

define void @gather_mask_dq(<8 x i32> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_dq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherdq (%rsi,%ymm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %x = call <8 x i64> @llvm.x86.avx512.mask.gather.dpq.512(<8 x i64> %src, i8* %base, <8 x i32> %ind, <8 x i1> %1, i32 4)
  %ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scatter.dpq.512(i8* %stbuf, <8 x i1> %1, <8 x i32> %ind2, <8 x i64> %x, i32 4)
  ret void
}

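;;
;; Execution-domain tests: the gathered FP values should stay in the FP
;; domain, so the checks expect vmovaps/vmovapd rather than integer moves.
;;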
define void @gather_mask_dpd_execdomain(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf) {
; CHECK-LABEL: gather_mask_dpd_execdomain:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k1}
; CHECK-NEXT:    vmovapd %zmm1, (%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %x = call <8 x double> @llvm.x86.avx512.mask.gather.dpd.512(<8 x double> %src, i8* %base, <8 x i32> %ind, <8 x i1> %1, i32 4)
  store <8 x double> %x, <8 x double>* %stbuf
  ret void
}

define void @gather_mask_qpd_execdomain(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf) {
; CHECK-LABEL: gather_mask_qpd_execdomain:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k1}
; CHECK-NEXT:    vmovapd %zmm1, (%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %x = call <8 x double> @llvm.x86.avx512.mask.gather.qpd.512(<8 x double> %src, i8* %base, <8 x i64> %ind, <8 x i1> %1, i32 4)
  store <8 x double> %x, <8 x double>* %stbuf
  ret void
}

define <16 x float> @gather_mask_dps_execdomain(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base) {
; CHECK-LABEL: gather_mask_dps_execdomain:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vgatherdps (%rsi,%zmm0,4), %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %1 = bitcast i16 %mask to <16 x i1>
  %res = call <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float> %src, i8* %base, <16 x i32> %ind, <16 x i1> %1, i32 4)
  ret <16 x float> %res
}

define <8 x float> @gather_mask_qps_execdomain(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base) {
; CHECK-LABEL: gather_mask_qps_execdomain:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vgatherqps (%rsi,%zmm0,4), %ymm1 {%k1}
; CHECK-NEXT:    vmovaps %ymm1, %ymm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %res = call <8 x float> @llvm.x86.avx512.mask.gather.qps.512(<8 x float> %src, i8* %base, <8 x i64> %ind, <8 x i1> %1, i32 4)
  ret <8 x float> %res
}

define void @scatter_mask_dpd_execdomain(<8 x i32> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: scatter_mask_dpd_execdomain:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovapd (%rdi), %zmm1
; CHECK-NEXT:    vscatterdpd %zmm1, (%rcx,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %x = load <8 x double>, <8 x double>* %src, align 64
  call void @llvm.x86.avx512.mask.scatter.dpd.512(i8* %stbuf, <8 x i1> %1, <8 x i32> %ind, <8 x double> %x, i32 4)
  ret void
}

define void @scatter_mask_qpd_execdomain(<8 x i64> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: scatter_mask_qpd_execdomain:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovapd (%rdi), %zmm1
; CHECK-NEXT:    vscatterqpd %zmm1, (%rcx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %x = load <8 x double>, <8 x double>* %src, align 64
  call void @llvm.x86.avx512.mask.scatter.qpd.512(i8* %stbuf, <8 x i1> %1, <8 x i64> %ind, <8 x double> %x, i32 4)
  ret void
}

define void @scatter_mask_dps_execdomain(<16 x i32> %ind, <16 x float>* %src, i16 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: scatter_mask_dps_execdomain:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovaps (%rdi), %zmm1
; CHECK-NEXT:    vscatterdps %zmm1, (%rcx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i16 %mask to <16 x i1>
  %x = load <16 x float>, <16 x float>* %src, align 64
  call void @llvm.x86.avx512.mask.scatter.dps.512(i8* %stbuf, <16 x i1> %1, <16 x i32> %ind, <16 x float> %x, i32 4)
  ret void
}

define void @scatter_mask_qps_execdomain(<8 x i64> %ind, <8 x float>* %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: scatter_mask_qps_execdomain:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovaps (%rdi), %ymm1
; CHECK-NEXT:    vscatterqps %ymm1, (%rcx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %x = load <8 x float>, <8 x float>* %src, align 32
  call void @llvm.x86.avx512.mask.scatter.qps.512(i8* %stbuf, <8 x i1> %1, <8 x i64> %ind, <8 x float> %x, i32 4)
  ret void
}

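;; All-ones mask: lowered with kxnorw rather than a kmov from a GPR.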
define void @gather_qps(<8 x i64> %ind, <8 x float> %src, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_qps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vgatherqps (%rdi,%zmm0,4), %ymm1 {%k2}
; CHECK-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vscatterqps %ymm1, (%rsi,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = call <8 x float> @llvm.x86.avx512.mask.gather.qps.512(<8 x float> %src, i8* %base, <8 x i64> %ind, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.mask.scatter.qps.512(i8* %stbuf, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i64> %ind2, <8 x float> %x, i32 4)
  ret void
}

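;; Prefetch forms. Per the checks below, a trailing hint of 3 selects the
;; pf0 variants and 2 the pf1 variants; mask -1 becomes kxnorw and mask 0
;; kxorw.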
declare void @llvm.x86.avx512.gatherpf.qps.512(i8, <8 x i64>, i8*, i32, i32)
declare void @llvm.x86.avx512.scatterpf.qps.512(i8, <8 x i64>, i8*, i32, i32)
define void @prefetch(<8 x i64> %ind, i8* %base) {
; CHECK-LABEL: prefetch:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vgatherpf0qps (%rdi,%zmm0,4) {%k1}
; CHECK-NEXT:    kxorw %k0, %k0, %k1
; CHECK-NEXT:    vgatherpf1qps (%rdi,%zmm0,4) {%k1}
; CHECK-NEXT:    movb $1, %al
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vscatterpf0qps (%rdi,%zmm0,2) {%k1}
; CHECK-NEXT:    movb $120, %al
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vscatterpf1qps (%rdi,%zmm0,2) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 4, i32 3)
  call void @llvm.x86.avx512.gatherpf.qps.512(i8 0, <8 x i64> %ind, i8* %base, i32 4, i32 2)
  call void @llvm.x86.avx512.scatterpf.qps.512(i8 1, <8 x i64> %ind, i8* %base, i32 2, i32 3)
  call void @llvm.x86.avx512.scatterpf.qps.512(i8 120, <8 x i64> %ind, i8* %base, i32 2, i32 2)
  ret void
}

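;;
;; 128/256-bit (AVX512VL) gathers: gather3div* take qword indices and
;; gather3siv* dword indices. Most tests pair a variable mask with a second
;; call, usually under an all-ones mask.
;;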
define <2 x double> @test_int_x86_avx512_mask_gather3div2_df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3div2_df:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherqpd (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherqpd (%rdi,%xmm1,2), %xmm2 {%k1}
; CHECK-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
  %res = call <2 x double> @llvm.x86.avx512.mask.gather3div2.df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, <2 x i1> %extract, i32 4)
  %res1 = call <2 x double> @llvm.x86.avx512.mask.gather3div2.df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, <2 x i1> <i1 true, i1 true>, i32 2)
  %res2 = fadd <2 x double> %res, %res1
  ret <2 x double> %res2
}

define <2 x i64> @test_int_x86_avx512_mask_gather3div2_di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3div2_di:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpgatherqq (%rdi,%xmm1,8), %xmm0 {%k1}
; CHECK-NEXT:    vpaddq %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract1 = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
  %res = call <2 x i64> @llvm.x86.avx512.mask.gather3div2.di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, <2 x i1> %extract1, i32 8)
  %2 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
  %res1 = call <2 x i64> @llvm.x86.avx512.mask.gather3div2.di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, <2 x i1> %extract, i32 8)
  %res2 = add <2 x i64> %res, %res1
  ret <2 x i64> %res2
}

define <4 x double> @test_int_x86_avx512_mask_gather3div4_df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3div4_df:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherqpd (%rdi,%ymm1,4), %ymm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherqpd (%rdi,%ymm1,2), %ymm2 {%k1}
; CHECK-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res = call <4 x double> @llvm.x86.avx512.mask.gather3div4.df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, <4 x i1> %extract, i32 4)
  %res1 = call <4 x double> @llvm.x86.avx512.mask.gather3div4.df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 2)
  %res2 = fadd <4 x double> %res, %res1
  ret <4 x double> %res2
}

define <4 x i64> @test_int_x86_avx512_mask_gather3div4_di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3div4_di:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpgatherqq (%rdi,%ymm1,8), %ymm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vpgatherqq (%rdi,%ymm1,8), %ymm2 {%k1}
; CHECK-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res = call <4 x i64> @llvm.x86.avx512.mask.gather3div4.di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, <4 x i1> %extract, i32 8)
  %res1 = call <4 x i64> @llvm.x86.avx512.mask.gather3div4.di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 8)
  %res2 = add <4 x i64> %res, %res1
  ret <4 x i64> %res2
}

define <4 x float> @test_int_x86_avx512_mask_gather3div4_sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3div4_sf:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherqps (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherqps (%rdi,%xmm1,2), %xmm2 {%k1}
; CHECK-NEXT:    vaddps %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
  %res = call <4 x float> @llvm.x86.avx512.mask.gather3div4.sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, <2 x i1> %extract, i32 4)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.gather3div4.sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, <2 x i1> <i1 true, i1 true>, i32 2)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

define <4 x i32> @test_int_x86_avx512_mask_gather3div4_si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3div4_si:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vpgatherqd (%rdi,%xmm1,4), %xmm2 {%k1}
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpgatherqd (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x i32> @llvm.x86.avx512.mask.gather3div4.si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, <2 x i1> <i1 true, i1 true>, i32 4)
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
  %res1 = call <4 x i32> @llvm.x86.avx512.mask.gather3div4.si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, <2 x i1> %extract, i32 4)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}

define <4 x float> @test_int_x86_avx512_mask_gather3div8_sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3div8_sf:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherqps (%rdi,%ymm1,4), %xmm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherqps (%rdi,%ymm1,2), %xmm2 {%k1}
; CHECK-NEXT:    vaddps %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res = call <4 x float> @llvm.x86.avx512.mask.gather3div8.sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, <4 x i1> %extract, i32 4)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.gather3div8.sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 2)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

define <4 x i32> @test_int_x86_avx512_mask_gather3div8_si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3div8_si:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovdqa %xmm0, %xmm2
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherqd (%rdi,%ymm1,4), %xmm2 {%k2}
; CHECK-NEXT:    vpgatherqd (%rdi,%ymm1,2), %xmm0 {%k1}
; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract1 = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res = call <4 x i32> @llvm.x86.avx512.mask.gather3div8.si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, <4 x i1> %extract1, i32 4)
  %2 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res1 = call <4 x i32> @llvm.x86.avx512.mask.gather3div8.si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, <4 x i1> %extract, i32 2)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}

define <2 x double> @test_int_x86_avx512_mask_gather3siv2_df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv2_df:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,2), %xmm2 {%k1}
; CHECK-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
  %res = call <2 x double> @llvm.x86.avx512.mask.gather3siv2.df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, <2 x i1> %extract, i32 4)
  %res1 = call <2 x double> @llvm.x86.avx512.mask.gather3siv2.df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, <2 x i1> <i1 true, i1 true>, i32 2)
  %res2 = fadd <2 x double> %res, %res1
  ret <2 x double> %res2
}

define <2 x i64> @test_int_x86_avx512_mask_gather3siv2_di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv2_di:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpgatherdq (%rdi,%xmm1,8), %xmm0 {%k1}
; CHECK-NEXT:    vpaddq %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract1 = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
  %res = call <2 x i64> @llvm.x86.avx512.mask.gather3siv2.di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, <2 x i1> %extract1, i32 8)
  %2 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
  %res1 = call <2 x i64> @llvm.x86.avx512.mask.gather3siv2.di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, <2 x i1> %extract, i32 8)
  %res2 = add <2 x i64> %res, %res1
  ret <2 x i64> %res2
}

define <4 x double> @test_int_x86_avx512_mask_gather3siv4_df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv4_df:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,4), %ymm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,2), %ymm2 {%k1}
; CHECK-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res = call <4 x double> @llvm.x86.avx512.mask.gather3siv4.df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, <4 x i1> %extract, i32 4)
  %res1 = call <4 x double> @llvm.x86.avx512.mask.gather3siv4.df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 2)
  %res2 = fadd <4 x double> %res, %res1
  ret <4 x double> %res2
}

define <4 x i64> @test_int_x86_avx512_mask_gather3siv4_di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv4_di:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpgatherdq (%rdi,%xmm1,8), %ymm0 {%k1}
; CHECK-NEXT:    vpaddq %ymm0, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract1 = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res = call <4 x i64> @llvm.x86.avx512.mask.gather3siv4.di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, <4 x i1> %extract1, i32 8)
  %2 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res1 = call <4 x i64> @llvm.x86.avx512.mask.gather3siv4.di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, <4 x i1> %extract, i32 8)
  %res2 = add <4 x i64> %res, %res1
  ret <4 x i64> %res2
}

define <4 x float> @test_int_x86_avx512_mask_gather3siv4_sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv4_sf:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherdps (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherdps (%rdi,%xmm1,2), %xmm2 {%k1}
; CHECK-NEXT:    vaddps %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res = call <4 x float> @llvm.x86.avx512.mask.gather3siv4.sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, <4 x i1> %extract, i32 4)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.gather3siv4.sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 2)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

define <4 x i32> @test_int_x86_avx512_mask_gather3siv4_si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv4_si:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vpgatherdd (%rdi,%xmm1,4), %xmm2 {%k1}
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpgatherdd (%rdi,%xmm1,2), %xmm0 {%k1}
; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x i32> @llvm.x86.avx512.mask.gather3siv4.si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 4)
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res1 = call <4 x i32> @llvm.x86.avx512.mask.gather3siv4.si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, <4 x i1> %extract, i32 2)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}

define <8 x float> @test_int_x86_avx512_mask_gather3siv8_sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv8_sf:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherdps (%rdi,%ymm1,2), %ymm2 {%k1}
; CHECK-NEXT:    vaddps %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %res = call <8 x float> @llvm.x86.avx512.mask.gather3siv8.sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, <8 x i1> %1, i32 4)
  %res1 = call <8 x float> @llvm.x86.avx512.mask.gather3siv8.sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 2)
  %res2 = fadd <8 x float> %res, %res1
  ret <8 x float> %res2
}

define <8 x i32> @test_int_x86_avx512_mask_gather3siv8_si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv8_si:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovdqa %ymm0, %ymm2
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherdd (%rdi,%ymm1,4), %ymm2 {%k2}
; CHECK-NEXT:    vpgatherdd (%rdi,%ymm1,2), %ymm0 {%k1}
; CHECK-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %res = call <8 x i32> @llvm.x86.avx512.mask.gather3siv8.si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, <8 x i1> %1, i32 4)
  %2 = bitcast i8 %x3 to <8 x i1>
  %res1 = call <8 x i32> @llvm.x86.avx512.mask.gather3siv8.si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, <8 x i1> %2, i32 2)
  %res2 = add <8 x i32> %res, %res1
  ret <8 x i32> %res2
}

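;;
;; 128/256-bit (AVX512VL) scatters, mirroring the gathers above:
;; scatterdiv* take qword indices and scattersiv* dword indices.
;;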
define void @test_int_x86_avx512_scatterdiv2_df(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x double> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv2_df:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vscatterqpd %xmm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT:    vscatterqpd %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  call void @llvm.x86.avx512.mask.scatterdiv2.df(i8* %x0, <2 x i1> <i1 true, i1 true>, <2 x i64> %x2, <2 x double> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scatterdiv2.df(i8* %x0, <2 x i1> %2, <2 x i64> %x2, <2 x double> %x3, i32 4)
  ret void
}

define void @test_int_x86_avx512_scatterdiv2_di(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x i64> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv2_di:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpscatterqq %xmm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterqq %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  call void @llvm.x86.avx512.mask.scatterdiv2.di(i8* %x0, <2 x i1> %2, <2 x i64> %x2, <2 x i64> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scatterdiv2.di(i8* %x0, <2 x i1> <i1 true, i1 true>, <2 x i64> %x2, <2 x i64> %x3, i32 4)
  ret void
}

define void @test_int_x86_avx512_scatterdiv4_df(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x double> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_df:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vscatterqpd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterqpd %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scatterdiv4.df(i8* %x0, <4 x i1> %2, <4 x i64> %x2, <4 x double> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scatterdiv4.df(i8* %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> %x2, <4 x double> %x3, i32 4)
  ret void
}

define void @test_int_x86_avx512_scatterdiv4_di(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i64> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_di:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpscatterqq %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterqq %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scatterdiv4.di(i8* %x0, <4 x i1> %2, <4 x i64> %x2, <4 x i64> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scatterdiv4.di(i8* %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> %x2, <4 x i64> %x3, i32 4)
  ret void
}

define void @test_int_x86_avx512_scatterdiv4_sf(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x float> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_sf:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vscatterqps %xmm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterqps %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  call void @llvm.x86.avx512.mask.scatterdiv4.sf(i8* %x0, <2 x i1> %2, <2 x i64> %x2, <4 x float> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scatterdiv4.sf(i8* %x0, <2 x i1> <i1 true, i1 true>, <2 x i64> %x2, <4 x float> %x3, i32 4)
  ret void
}

define void @test_int_x86_avx512_scatterdiv4_si(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x i32> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_si:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vpscatterqd %xmm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT:    vpscatterqd %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  call void @llvm.x86.avx512.mask.scatterdiv4.si(i8* %x0, <2 x i1> <i1 true, i1 true>, <2 x i64> %x2, <4 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scatterdiv4.si(i8* %x0, <2 x i1> %2, <2 x i64> %x2, <4 x i32> %x3, i32 4)
  ret void
}

define void @test_int_x86_avx512_scatterdiv8_sf(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x float> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv8_sf:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vscatterqps %xmm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterqps %xmm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scatterdiv8.sf(i8* %x0, <4 x i1> %2, <4 x i64> %x2, <4 x float> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scatterdiv8.sf(i8* %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> %x2, <4 x float> %x3, i32 4)
  ret void
}

define void @test_int_x86_avx512_scatterdiv8_si(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i32> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv8_si:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpscatterqd %xmm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterqd %xmm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scatterdiv8.si(i8* %x0, <4 x i1> %2, <4 x i64> %x2, <4 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scatterdiv8.si(i8* %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> %x2, <4 x i32> %x3, i32 4)
  ret void
}

define void @test_int_x86_avx512_scattersiv2_df(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x double> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv2_df:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vscatterdpd %xmm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT:    vscatterdpd %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  call void @llvm.x86.avx512.mask.scattersiv2.df(i8* %x0, <2 x i1> <i1 true, i1 true>, <4 x i32> %x2, <2 x double> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scattersiv2.df(i8* %x0, <2 x i1> %2, <4 x i32> %x2, <2 x double> %x3, i32 4)
  ret void
}

define void @test_int_x86_avx512_scattersiv2_di(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x i64> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv2_di:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vpscatterdq %xmm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT:    vpscatterdq %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  call void @llvm.x86.avx512.mask.scattersiv2.di(i8* %x0, <2 x i1> <i1 true, i1 true>, <4 x i32> %x2, <2 x i64> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scattersiv2.di(i8* %x0, <2 x i1> %2, <4 x i32> %x2, <2 x i64> %x3, i32 4)
  ret void
}

define void @test_int_x86_avx512_scattersiv4_df(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x double> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv4_df:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vscatterdpd %ymm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterdpd %ymm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scattersiv4.df(i8* %x0, <4 x i1> %2, <4 x i32> %x2, <4 x double> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scattersiv4.df(i8* %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %x2, <4 x double> %x3, i32 4)
  ret void
}

define void @test_int_x86_avx512_scattersiv4_di(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i64> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv4_di:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vpscatterdq %ymm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT:    vpscatterdq %ymm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scattersiv4.di(i8* %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %x2, <4 x i64> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scattersiv4.di(i8* %x0, <4 x i1> %2, <4 x i32> %x2, <4 x i64> %x3, i32 4)
  ret void
}

define void @test_int_x86_avx512_scattersiv4_sf(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x float> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv4_sf:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vscatterdps %xmm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterdps %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scattersiv4.sf(i8* %x0, <4 x i1> %2, <4 x i32> %x2, <4 x float> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scattersiv4.sf(i8* %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %x2, <4 x float> %x3, i32 4)
  ret void
}

define void @test_int_x86_avx512_scattersiv4_si(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i32> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv4_si:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpscatterdd %xmm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterdd %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scattersiv4.si(i8* %x0, <4 x i1> %2, <4 x i32> %x2, <4 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scattersiv4.si(i8* %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %x2, <4 x i32> %x3, i32 4)
  ret void
}

define void @test_int_x86_avx512_scattersiv8_sf(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x float> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv8_sf:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vscatterdps %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterdps %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  call void @llvm.x86.avx512.mask.scattersiv8.sf(i8* %x0, <8 x i1> %1, <8 x i32> %x2, <8 x float> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scattersiv8.sf(i8* %x0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %x2, <8 x float> %x3, i32 4)
  ret void
}

define void @test_int_x86_avx512_scattersiv8_si(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x i32> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv8_si:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  call void @llvm.x86.avx512.mask.scattersiv8.si(i8* %x0, <8 x i1> %1, <8 x i32> %x2, <8 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scattersiv8.si(i8* %x0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %x2, <8 x i32> %x3, i32 4)
  ret void
}

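;; Constant scatter masks: all-ones lowers to kxnorw, all-zeros to kxorw
;; (the scatter is still emitted), and other immediates go via movb + kmovd.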
define void @scatter_mask_test(i8* %x0, <8 x i32> %x2, <8 x i32> %x3) {
; CHECK-LABEL: scatter_mask_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    movb $1, %al
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    movb $96, %al
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.mask.scattersiv8.si(i8* %x0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %x2, <8 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scattersiv8.si(i8* %x0, <8 x i1> zeroinitializer, <8 x i32> %x2, <8 x i32> %x3, i32 4)
  call void @llvm.x86.avx512.mask.scattersiv8.si(i8* %x0, <8 x i1> bitcast (<1 x i8> <i8 1> to <8 x i1>), <8 x i32> %x2, <8 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scattersiv8.si(i8* %x0, <8 x i1> bitcast (<1 x i8> <i8 96> to <8 x i1>), <8 x i32> %x2, <8 x i32> %x3, i32 4)
  ret void
}

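;; The same constant-mask coverage for a 512-bit gather: ones, zero, and two
;; immediate bit patterns, with the partial results summed.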
define <16 x float> @gather_mask_test(<16 x i32> %ind, <16 x float> %src, i8* %base) {
; CHECK-LABEL: gather_mask_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm2 {%k1}
; CHECK-NEXT:    kxorw %k0, %k0, %k1
; CHECK-NEXT:    vmovaps %zmm1, %zmm3
; CHECK-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1}
; CHECK-NEXT:    vaddps %zmm3, %zmm2, %zmm2
; CHECK-NEXT:    movw $1, %ax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vmovaps %zmm1, %zmm3
; CHECK-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1}
; CHECK-NEXT:    movw $220, %ax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; CHECK-NEXT:    vaddps %zmm3, %zmm1, %zmm0
; CHECK-NEXT:    vaddps %zmm2, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float> %src, i8* %base, <16 x i32> %ind, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float> %src, i8* %base, <16 x i32> %ind, <16 x i1> zeroinitializer, i32 4)
  %res2 = call <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float> %src, i8* %base, <16 x i32> %ind, <16 x i1> bitcast (<1 x i16> <i16 1> to <16 x i1>), i32 4)
  %res3 = call <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float> %src, i8* %base, <16 x i32> %ind, <16 x i1> bitcast (<1 x i16> <i16 220> to <16 x i1>), i32 4)
  %res4 = fadd <16 x float> %res, %res1
  %res5 = fadd <16 x float> %res3, %res2
  %res6 = fadd <16 x float> %res5, %res4
  ret <16 x float> %res6
}

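;; Gather from a global: the base address folds into the memory operand,
;; giving x(,%zmm0,4).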
@x = global [1024 x float] zeroinitializer, align 16

define <8 x float> @gather_global(<8 x i64>, i32* nocapture readnone) {
; CHECK-LABEL: gather_global:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vgatherqps x(,%zmm0,4), %ymm1 {%k1}
; CHECK-NEXT:    vmovaps %ymm1, %ymm0
; CHECK-NEXT:    retq
  %3 = tail call <8 x float> @llvm.x86.avx512.mask.gather.qps.512(<8 x float> zeroinitializer, i8* bitcast ([1024 x float]* @x to i8*), <8 x i64> %0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
  ret <8 x float> %3
}

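;; Intrinsic declarations.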
declare <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float>, i8*, <16 x i32>, <16 x i1>, i32)
declare <8 x double> @llvm.x86.avx512.mask.gather.dpd.512(<8 x double>, i8*, <8 x i32>, <8 x i1>, i32)
declare <8 x float> @llvm.x86.avx512.mask.gather.qps.512(<8 x float>, i8*, <8 x i64>, <8 x i1>, i32)
declare <8 x double> @llvm.x86.avx512.mask.gather.qpd.512(<8 x double>, i8*, <8 x i64>, <8 x i1>, i32)
declare <16 x i32> @llvm.x86.avx512.mask.gather.dpi.512(<16 x i32>, i8*, <16 x i32>, <16 x i1>, i32)
declare <8 x i64> @llvm.x86.avx512.mask.gather.dpq.512(<8 x i64>, i8*, <8 x i32>, <8 x i1>, i32)
declare <8 x i32> @llvm.x86.avx512.mask.gather.qpi.512(<8 x i32>, i8*, <8 x i64>, <8 x i1>, i32)
declare <8 x i64> @llvm.x86.avx512.mask.gather.qpq.512(<8 x i64>, i8*, <8 x i64>, <8 x i1>, i32)
declare <2 x double> @llvm.x86.avx512.mask.gather3div2.df(<2 x double>, i8*, <2 x i64>, <2 x i1>, i32)
declare <2 x i64> @llvm.x86.avx512.mask.gather3div2.di(<2 x i64>, i8*, <2 x i64>, <2 x i1>, i32)
declare <4 x double> @llvm.x86.avx512.mask.gather3div4.df(<4 x double>, i8*, <4 x i64>, <4 x i1>, i32)
declare <4 x i64> @llvm.x86.avx512.mask.gather3div4.di(<4 x i64>, i8*, <4 x i64>, <4 x i1>, i32)
declare <4 x float> @llvm.x86.avx512.mask.gather3div4.sf(<4 x float>, i8*, <2 x i64>, <2 x i1>, i32)
declare <4 x i32> @llvm.x86.avx512.mask.gather3div4.si(<4 x i32>, i8*, <2 x i64>, <2 x i1>, i32)
declare <4 x float> @llvm.x86.avx512.mask.gather3div8.sf(<4 x float>, i8*, <4 x i64>, <4 x i1>, i32)
declare <4 x i32> @llvm.x86.avx512.mask.gather3div8.si(<4 x i32>, i8*, <4 x i64>, <4 x i1>, i32)
declare <2 x double> @llvm.x86.avx512.mask.gather3siv2.df(<2 x double>, i8*, <4 x i32>, <2 x i1>, i32)
declare <2 x i64> @llvm.x86.avx512.mask.gather3siv2.di(<2 x i64>, i8*, <4 x i32>, <2 x i1>, i32)
declare <4 x double> @llvm.x86.avx512.mask.gather3siv4.df(<4 x double>, i8*, <4 x i32>, <4 x i1>, i32)
declare <4 x i64> @llvm.x86.avx512.mask.gather3siv4.di(<4 x i64>, i8*, <4 x i32>, <4 x i1>, i32)
declare <4 x float> @llvm.x86.avx512.mask.gather3siv4.sf(<4 x float>, i8*, <4 x i32>, <4 x i1>, i32)
declare <4 x i32> @llvm.x86.avx512.mask.gather3siv4.si(<4 x i32>, i8*, <4 x i32>, <4 x i1>, i32)
declare <8 x float> @llvm.x86.avx512.mask.gather3siv8.sf(<8 x float>, i8*, <8 x i32>, <8 x i1>, i32)
declare <8 x i32> @llvm.x86.avx512.mask.gather3siv8.si(<8 x i32>, i8*, <8 x i32>, <8 x i1>, i32)
declare void @llvm.x86.avx512.mask.scatter.dps.512(i8*, <16 x i1>, <16 x i32>, <16 x float>, i32)
declare void @llvm.x86.avx512.mask.scatter.dpd.512(i8*, <8 x i1>, <8 x i32>, <8 x double>, i32)
declare void @llvm.x86.avx512.mask.scatter.qps.512(i8*, <8 x i1>, <8 x i64>, <8 x float>, i32)
declare void @llvm.x86.avx512.mask.scatter.qpd.512(i8*, <8 x i1>, <8 x i64>, <8 x double>, i32)
declare void @llvm.x86.avx512.mask.scatter.dpi.512(i8*, <16 x i1>, <16 x i32>, <16 x i32>, i32)
declare void @llvm.x86.avx512.mask.scatter.dpq.512(i8*, <8 x i1>, <8 x i32>, <8 x i64>, i32)
declare void @llvm.x86.avx512.mask.scatter.qpi.512(i8*, <8 x i1>, <8 x i64>, <8 x i32>, i32)
declare void @llvm.x86.avx512.mask.scatter.qpq.512(i8*, <8 x i1>, <8 x i64>, <8 x i64>, i32)
declare void @llvm.x86.avx512.mask.scatterdiv2.df(i8*, <2 x i1>, <2 x i64>, <2 x double>, i32)
declare void @llvm.x86.avx512.mask.scatterdiv2.di(i8*, <2 x i1>, <2 x i64>, <2 x i64>, i32)
declare void @llvm.x86.avx512.mask.scatterdiv4.df(i8*, <4 x i1>, <4 x i64>, <4 x double>, i32)
declare void @llvm.x86.avx512.mask.scatterdiv4.di(i8*, <4 x i1>, <4 x i64>, <4 x i64>, i32)
declare void @llvm.x86.avx512.mask.scatterdiv4.sf(i8*, <2 x i1>, <2 x i64>, <4 x float>, i32)
declare void @llvm.x86.avx512.mask.scatterdiv4.si(i8*, <2 x i1>, <2 x i64>, <4 x i32>, i32)
declare void @llvm.x86.avx512.mask.scatterdiv8.sf(i8*, <4 x i1>, <4 x i64>, <4 x float>, i32)
declare void @llvm.x86.avx512.mask.scatterdiv8.si(i8*, <4 x i1>, <4 x i64>, <4 x i32>, i32)
declare void @llvm.x86.avx512.mask.scattersiv2.df(i8*, <2 x i1>, <4 x i32>, <2 x double>, i32)
declare void @llvm.x86.avx512.mask.scattersiv2.di(i8*, <2 x i1>, <4 x i32>, <2 x i64>, i32)
declare void @llvm.x86.avx512.mask.scattersiv4.df(i8*, <4 x i1>, <4 x i32>, <4 x double>, i32)
declare void @llvm.x86.avx512.mask.scattersiv4.di(i8*, <4 x i1>, <4 x i32>, <4 x i64>, i32)
declare void @llvm.x86.avx512.mask.scattersiv4.sf(i8*, <4 x i1>, <4 x i32>, <4 x float>, i32)
declare void @llvm.x86.avx512.mask.scattersiv4.si(i8*, <4 x i1>, <4 x i32>, <4 x i32>, i32)
declare void @llvm.x86.avx512.mask.scattersiv8.sf(i8*, <8 x i1>, <8 x i32>, <8 x float>, i32)
declare void @llvm.x86.avx512.mask.scattersiv8.si(i8*, <8 x i1>, <8 x i32>, <8 x i32>, i32)