; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck %s

define void @gather_mask_dps(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_dps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vgatherdps (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddd {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i16 %mask to <16 x i1>
  %x = call <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float> %src, i8* %base, <16 x i32> %ind, <16 x i1> %1, i32 4)
  %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scatter.dps.512(i8* %stbuf, <16 x i1> %1, <16 x i32> %ind2, <16 x float> %x, i32 4)
  ret void
}

define void @gather_mask_dpd(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_dpd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %x = call <8 x double> @llvm.x86.avx512.mask.gather.dpd.512(<8 x double> %src, i8* %base, <8 x i32> %ind, <8 x i1> %1, i32 4)
  %ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scatter.dpd.512(i8* %stbuf, <8 x i1> %1, <8 x i32> %ind2, <8 x double> %x, i32 4)
  ret void
}

define void @gather_mask_qps(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_qps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vgatherqps (%rsi,%zmm0,4), %ymm1 {%k2}
; CHECK-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %x = call <8 x float> @llvm.x86.avx512.mask.gather.qps.512(<8 x float> %src, i8* %base, <8 x i64> %ind, <8 x i1> %1, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.mask.scatter.qps.512(i8* %stbuf, <8 x i1> %1, <8 x i64> %ind2, <8 x float> %x, i32 4)
  ret void
}

define void @gather_mask_qpd(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_qpd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %x = call <8 x double> @llvm.x86.avx512.mask.gather.qpd.512(<8 x double> %src, i8* %base, <8 x i64> %ind, <8 x i1> %1, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.mask.scatter.qpd.512(i8* %stbuf, <8 x i1> %1, <8 x i64> %ind2, <8 x double> %x, i32 4)
  ret void
}
;;
;; Integer Gather/Scatter
;;

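; The integer tests below mirror the floating-point ones above, selecting the
; vpgather*/vpscatter* instruction forms instead. Note that in all of the
; gather-then-scatter tests the gather clobbers its mask register (mask bits
; are cleared as elements complete), so the mask is first copied from %k1 to
; %k2 to keep a live copy for the following scatter.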
define void @gather_mask_dd(<16 x i32> %ind, <16 x i32> %src, i16 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_dd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherdd (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddd {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i16 %mask to <16 x i1>
  %x = call <16 x i32> @llvm.x86.avx512.mask.gather.dpi.512(<16 x i32> %src, i8* %base, <16 x i32> %ind, <16 x i1> %1, i32 4)
  %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scatter.dpi.512(i8* %stbuf, <16 x i1> %1, <16 x i32> %ind2, <16 x i32> %x, i32 4)
  ret void
}

define void @gather_mask_qd(<8 x i64> %ind, <8 x i32> %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_qd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherqd (%rsi,%zmm0,4), %ymm1 {%k2}
; CHECK-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %x = call <8 x i32> @llvm.x86.avx512.mask.gather.qpi.512(<8 x i32> %src, i8* %base, <8 x i64> %ind, <8 x i1> %1, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.mask.scatter.qpi.512(i8* %stbuf, <8 x i1> %1, <8 x i64> %ind2, <8 x i32> %x, i32 4)
  ret void
}

define void @gather_mask_qq(<8 x i64> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_qq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherqq (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %x = call <8 x i64> @llvm.x86.avx512.mask.gather.qpq.512(<8 x i64> %src, i8* %base, <8 x i64> %ind, <8 x i1> %1, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.mask.scatter.qpq.512(i8* %stbuf, <8 x i1> %1, <8 x i64> %ind2, <8 x i64> %x, i32 4)
  ret void
}

define void @gather_mask_dq(<8 x i32> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_dq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherdq (%rsi,%ymm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %x = call <8 x i64> @llvm.x86.avx512.mask.gather.dpq.512(<8 x i64> %src, i8* %base, <8 x i32> %ind, <8 x i1> %1, i32 4)
  %ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scatter.dpq.512(i8* %stbuf, <8 x i1> %1, <8 x i32> %ind2, <8 x i64> %x, i32 4)
  ret void
}

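; The *_execdomain tests check that a floating-point gather result stays in
; the FP execution domain: the follow-up register or memory move must come
; out as vmovaps/vmovapd rather than an integer-domain move.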
define void @gather_mask_dpd_execdomain(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf) {
; CHECK-LABEL: gather_mask_dpd_execdomain:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k1}
; CHECK-NEXT:    vmovapd %zmm1, (%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %x = call <8 x double> @llvm.x86.avx512.mask.gather.dpd.512(<8 x double> %src, i8* %base, <8 x i32> %ind, <8 x i1> %1, i32 4)
  store <8 x double> %x, <8 x double>* %stbuf
  ret void
}

define void @gather_mask_qpd_execdomain(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf) {
; CHECK-LABEL: gather_mask_qpd_execdomain:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k1}
; CHECK-NEXT:    vmovapd %zmm1, (%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %x = call <8 x double> @llvm.x86.avx512.mask.gather.qpd.512(<8 x double> %src, i8* %base, <8 x i64> %ind, <8 x i1> %1, i32 4)
  store <8 x double> %x, <8 x double>* %stbuf
  ret void
}

define <16 x float> @gather_mask_dps_execdomain(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base) {
; CHECK-LABEL: gather_mask_dps_execdomain:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vgatherdps (%rsi,%zmm0,4), %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %1 = bitcast i16 %mask to <16 x i1>
  %res = call <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float> %src, i8* %base, <16 x i32> %ind, <16 x i1> %1, i32 4)
  ret <16 x float> %res
}

define <8 x float> @gather_mask_qps_execdomain(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base) {
; CHECK-LABEL: gather_mask_qps_execdomain:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vgatherqps (%rsi,%zmm0,4), %ymm1 {%k1}
; CHECK-NEXT:    vmovaps %ymm1, %ymm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %res = call <8 x float> @llvm.x86.avx512.mask.gather.qps.512(<8 x float> %src, i8* %base, <8 x i64> %ind, <8 x i1> %1, i32 4)
  ret <8 x float> %res
}

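; Same execution-domain check on the scatter side: the source vector must be
; loaded with vmovaps/vmovapd before feeding the FP scatter.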
define void @scatter_mask_dpd_execdomain(<8 x i32> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: scatter_mask_dpd_execdomain:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovapd (%rdi), %zmm1
; CHECK-NEXT:    vscatterdpd %zmm1, (%rcx,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %x = load <8 x double>, <8 x double>* %src, align 64
  call void @llvm.x86.avx512.mask.scatter.dpd.512(i8* %stbuf, <8 x i1> %1, <8 x i32> %ind, <8 x double> %x, i32 4)
  ret void
}

define void @scatter_mask_qpd_execdomain(<8 x i64> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: scatter_mask_qpd_execdomain:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovapd (%rdi), %zmm1
; CHECK-NEXT:    vscatterqpd %zmm1, (%rcx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %x = load <8 x double>, <8 x double>* %src, align 64
  call void @llvm.x86.avx512.mask.scatter.qpd.512(i8* %stbuf, <8 x i1> %1, <8 x i64> %ind, <8 x double> %x, i32 4)
  ret void
}

define void @scatter_mask_dps_execdomain(<16 x i32> %ind, <16 x float>* %src, i16 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: scatter_mask_dps_execdomain:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovaps (%rdi), %zmm1
; CHECK-NEXT:    vscatterdps %zmm1, (%rcx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i16 %mask to <16 x i1>
  %x = load <16 x float>, <16 x float>* %src, align 64
  call void @llvm.x86.avx512.mask.scatter.dps.512(i8* %stbuf, <16 x i1> %1, <16 x i32> %ind, <16 x float> %x, i32 4)
  ret void
}

define void @scatter_mask_qps_execdomain(<8 x i64> %ind, <8 x float>* %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: scatter_mask_qps_execdomain:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovaps (%rdi), %ymm1
; CHECK-NEXT:    vscatterqps %ymm1, (%rcx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %x = load <8 x float>, <8 x float>* %src, align 32
  call void @llvm.x86.avx512.mask.scatter.qps.512(i8* %stbuf, <8 x i1> %1, <8 x i64> %ind, <8 x float> %x, i32 4)
  ret void
}

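; An all-ones mask constant is materialized with kxnorw %k0, %k0, %k1 (XNOR
; of a k-register with itself yields all ones). In the prefetch test below,
; hint value 3 (_MM_HINT_T0) selects the *pf0* instruction forms and hint
; value 2 (_MM_HINT_T1) the *pf1* forms.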
define void @gather_qps(<8 x i64> %ind, <8 x float> %src, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_qps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vgatherqps (%rdi,%zmm0,4), %ymm1 {%k2}
; CHECK-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vscatterqps %ymm1, (%rsi,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = call <8 x float> @llvm.x86.avx512.mask.gather.qps.512(<8 x float> %src, i8* %base, <8 x i64> %ind, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.mask.scatter.qps.512(i8* %stbuf, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i64> %ind2, <8 x float> %x, i32 4)
  ret void
}

declare void @llvm.x86.avx512.gatherpf.qps.512(i8, <8 x i64>, i8*, i32, i32)
declare void @llvm.x86.avx512.scatterpf.qps.512(i8, <8 x i64>, i8*, i32, i32)
define void @prefetch(<8 x i64> %ind, i8* %base) {
; CHECK-LABEL: prefetch:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vgatherpf0qps (%rdi,%zmm0,4) {%k1}
; CHECK-NEXT:    kxorw %k0, %k0, %k1
; CHECK-NEXT:    vgatherpf1qps (%rdi,%zmm0,4) {%k1}
; CHECK-NEXT:    movb $1, %al
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vscatterpf0qps (%rdi,%zmm0,2) {%k1}
; CHECK-NEXT:    movb $120, %al
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vscatterpf1qps (%rdi,%zmm0,2) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 4, i32 3)
  call void @llvm.x86.avx512.gatherpf.qps.512(i8 0, <8 x i64> %ind, i8* %base, i32 4, i32 2)
  call void @llvm.x86.avx512.scatterpf.qps.512(i8 1, <8 x i64> %ind, i8* %base, i32 2, i32 3)
  call void @llvm.x86.avx512.scatterpf.qps.512(i8 120, <8 x i64> %ind, i8* %base, i32 2, i32 2)
  ret void
}

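; The gather3* and scatter{div,siv}* tests below exercise the 128- and
; 256-bit (AVX512VL) intrinsics. Their i8 mask operand is bitcast to
; <8 x i1> and the low two or four lanes are extracted with a shufflevector
; before the call. "div" variants take qword (i64) indices, "siv" variants
; dword (i32) indices, and the final i32 operand becomes the SIB scale of
; the addressing mode, e.g. i32 2 yields (%rdi,%xmm1,2).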
define <2 x double> @test_int_x86_avx512_mask_gather3div2_df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3div2_df:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherqpd (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherqpd (%rdi,%xmm1,2), %xmm2 {%k1}
; CHECK-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
  %res = call <2 x double> @llvm.x86.avx512.mask.gather3div2.df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, <2 x i1> %extract, i32 4)
  %res1 = call <2 x double> @llvm.x86.avx512.mask.gather3div2.df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, <2 x i1> <i1 true, i1 true>, i32 2)
  %res2 = fadd <2 x double> %res, %res1
  ret <2 x double> %res2
}

define <2 x i64> @test_int_x86_avx512_mask_gather3div2_di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3div2_di:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpgatherqq (%rdi,%xmm1,8), %xmm0 {%k1}
; CHECK-NEXT:    vpaddq %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract1 = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
  %res = call <2 x i64> @llvm.x86.avx512.mask.gather3div2.di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, <2 x i1> %extract1, i32 8)
  %2 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
  %res1 = call <2 x i64> @llvm.x86.avx512.mask.gather3div2.di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, <2 x i1> %extract, i32 8)
  %res2 = add <2 x i64> %res, %res1
  ret <2 x i64> %res2
}

define <4 x double> @test_int_x86_avx512_mask_gather3div4_df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3div4_df:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherqpd (%rdi,%ymm1,4), %ymm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherqpd (%rdi,%ymm1,2), %ymm2 {%k1}
; CHECK-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res = call <4 x double> @llvm.x86.avx512.mask.gather3div4.df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, <4 x i1> %extract, i32 4)
  %res1 = call <4 x double> @llvm.x86.avx512.mask.gather3div4.df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 2)
  %res2 = fadd <4 x double> %res, %res1
  ret <4 x double> %res2
}

define <4 x i64> @test_int_x86_avx512_mask_gather3div4_di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3div4_di:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpgatherqq (%rdi,%ymm1,8), %ymm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vpgatherqq (%rdi,%ymm1,8), %ymm2 {%k1}
; CHECK-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res = call <4 x i64> @llvm.x86.avx512.mask.gather3div4.di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, <4 x i1> %extract, i32 8)
  %res1 = call <4 x i64> @llvm.x86.avx512.mask.gather3div4.di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 8)
  %res2 = add <4 x i64> %res, %res1
  ret <4 x i64> %res2
}

define <4 x float> @test_int_x86_avx512_mask_gather3div4_sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3div4_sf:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherqps (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherqps (%rdi,%xmm1,2), %xmm2 {%k1}
; CHECK-NEXT:    vaddps %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
  %res = call <4 x float> @llvm.x86.avx512.mask.gather3div4.sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, <2 x i1> %extract, i32 4)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.gather3div4.sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, <2 x i1> <i1 true, i1 true>, i32 2)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

define <4 x i32> @test_int_x86_avx512_mask_gather3div4_si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3div4_si:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vpgatherqd (%rdi,%xmm1,4), %xmm2 {%k1}
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpgatherqd (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x i32> @llvm.x86.avx512.mask.gather3div4.si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, <2 x i1> <i1 true, i1 true>, i32 4)
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
  %res1 = call <4 x i32> @llvm.x86.avx512.mask.gather3div4.si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, <2 x i1> %extract, i32 4)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}

define <4 x float> @test_int_x86_avx512_mask_gather3div8_sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3div8_sf:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherqps (%rdi,%ymm1,4), %xmm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherqps (%rdi,%ymm1,2), %xmm2 {%k1}
; CHECK-NEXT:    vaddps %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res = call <4 x float> @llvm.x86.avx512.mask.gather3div8.sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, <4 x i1> %extract, i32 4)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.gather3div8.sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 2)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

define <4 x i32> @test_int_x86_avx512_mask_gather3div8_si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3div8_si:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovdqa %xmm0, %xmm2
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherqd (%rdi,%ymm1,4), %xmm2 {%k2}
; CHECK-NEXT:    vpgatherqd (%rdi,%ymm1,2), %xmm0 {%k1}
; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract1 = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res = call <4 x i32> @llvm.x86.avx512.mask.gather3div8.si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, <4 x i1> %extract1, i32 4)
  %2 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res1 = call <4 x i32> @llvm.x86.avx512.mask.gather3div8.si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, <4 x i1> %extract, i32 2)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}

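; Dword-indexed (siv) gathers follow, selecting the d-index instruction
; forms (vgatherdpd/vpgatherdq/vgatherdps/vpgatherdd).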
define <2 x double> @test_int_x86_avx512_mask_gather3siv2_df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv2_df:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,2), %xmm2 {%k1}
; CHECK-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
  %res = call <2 x double> @llvm.x86.avx512.mask.gather3siv2.df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, <2 x i1> %extract, i32 4)
  %res1 = call <2 x double> @llvm.x86.avx512.mask.gather3siv2.df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, <2 x i1> <i1 true, i1 true>, i32 2)
  %res2 = fadd <2 x double> %res, %res1
  ret <2 x double> %res2
}

define <2 x i64> @test_int_x86_avx512_mask_gather3siv2_di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv2_di:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpgatherdq (%rdi,%xmm1,8), %xmm0 {%k1}
; CHECK-NEXT:    vpaddq %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract1 = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
  %res = call <2 x i64> @llvm.x86.avx512.mask.gather3siv2.di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, <2 x i1> %extract1, i32 8)
  %2 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
  %res1 = call <2 x i64> @llvm.x86.avx512.mask.gather3siv2.di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, <2 x i1> %extract, i32 8)
  %res2 = add <2 x i64> %res, %res1
  ret <2 x i64> %res2
}

define <4 x double> @test_int_x86_avx512_mask_gather3siv4_df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv4_df:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,4), %ymm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,2), %ymm2 {%k1}
; CHECK-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res = call <4 x double> @llvm.x86.avx512.mask.gather3siv4.df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, <4 x i1> %extract, i32 4)
  %res1 = call <4 x double> @llvm.x86.avx512.mask.gather3siv4.df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 2)
  %res2 = fadd <4 x double> %res, %res1
  ret <4 x double> %res2
}

define <4 x i64> @test_int_x86_avx512_mask_gather3siv4_di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv4_di:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpgatherdq (%rdi,%xmm1,8), %ymm0 {%k1}
; CHECK-NEXT:    vpaddq %ymm0, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract1 = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res = call <4 x i64> @llvm.x86.avx512.mask.gather3siv4.di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, <4 x i1> %extract1, i32 8)
  %2 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res1 = call <4 x i64> @llvm.x86.avx512.mask.gather3siv4.di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, <4 x i1> %extract, i32 8)
  %res2 = add <4 x i64> %res, %res1
  ret <4 x i64> %res2
}

define <4 x float> @test_int_x86_avx512_mask_gather3siv4_sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv4_sf:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherdps (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherdps (%rdi,%xmm1,2), %xmm2 {%k1}
; CHECK-NEXT:    vaddps %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res = call <4 x float> @llvm.x86.avx512.mask.gather3siv4.sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, <4 x i1> %extract, i32 4)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.gather3siv4.sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 2)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

define <4 x i32> @test_int_x86_avx512_mask_gather3siv4_si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv4_si:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vpgatherdd (%rdi,%xmm1,4), %xmm2 {%k1}
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpgatherdd (%rdi,%xmm1,2), %xmm0 {%k1}
; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x i32> @llvm.x86.avx512.mask.gather3siv4.si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 4)
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res1 = call <4 x i32> @llvm.x86.avx512.mask.gather3siv4.si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, <4 x i1> %extract, i32 2)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}

define <8 x float> @test_int_x86_avx512_mask_gather3siv8_sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv8_sf:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherdps (%rdi,%ymm1,2), %ymm2 {%k1}
; CHECK-NEXT:    vaddps %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %res = call <8 x float> @llvm.x86.avx512.mask.gather3siv8.sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, <8 x i1> %1, i32 4)
  %res1 = call <8 x float> @llvm.x86.avx512.mask.gather3siv8.sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 2)
  %res2 = fadd <8 x float> %res, %res1
  ret <8 x float> %res2
}

define <8 x i32> @test_int_x86_avx512_mask_gather3siv8_si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv8_si:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovdqa %ymm0, %ymm2
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherdd (%rdi,%ymm1,4), %ymm2 {%k2}
; CHECK-NEXT:    vpgatherdd (%rdi,%ymm1,2), %ymm0 {%k1}
; CHECK-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %res = call <8 x i32> @llvm.x86.avx512.mask.gather3siv8.si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, <8 x i1> %1, i32 4)
  %2 = bitcast i8 %x3 to <8 x i1>
  %res1 = call <8 x i32> @llvm.x86.avx512.mask.gather3siv8.si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, <8 x i1> %2, i32 2)
  %res2 = add <8 x i32> %res, %res1
  ret <8 x i32> %res2
}

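; 128- and 256-bit scatters with qword indices. Like gathers, scatters
; update their mask register as elements are stored, so a fresh mask is set
; up (kxnorw or a second kmov) before each store that needs one.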
define void @test_int_x86_avx512_scatterdiv2_df(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x double> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv2_df:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vscatterqpd %xmm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT:    vscatterqpd %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  call void @llvm.x86.avx512.mask.scatterdiv2.df(i8* %x0, <2 x i1> <i1 true, i1 true>, <2 x i64> %x2, <2 x double> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scatterdiv2.df(i8* %x0, <2 x i1> %2, <2 x i64> %x2, <2 x double> %x3, i32 4)
  ret void
}

define void @test_int_x86_avx512_scatterdiv2_di(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x i64> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv2_di:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpscatterqq %xmm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterqq %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  call void @llvm.x86.avx512.mask.scatterdiv2.di(i8* %x0, <2 x i1> %2, <2 x i64> %x2, <2 x i64> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scatterdiv2.di(i8* %x0, <2 x i1> <i1 true, i1 true>, <2 x i64> %x2, <2 x i64> %x3, i32 4)
  ret void
}

define void @test_int_x86_avx512_scatterdiv4_df(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x double> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_df:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vscatterqpd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterqpd %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scatterdiv4.df(i8* %x0, <4 x i1> %2, <4 x i64> %x2, <4 x double> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scatterdiv4.df(i8* %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> %x2, <4 x double> %x3, i32 4)
  ret void
}

define void @test_int_x86_avx512_scatterdiv4_di(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i64> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_di:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpscatterqq %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterqq %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scatterdiv4.di(i8* %x0, <4 x i1> %2, <4 x i64> %x2, <4 x i64> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scatterdiv4.di(i8* %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> %x2, <4 x i64> %x3, i32 4)
  ret void
}

define void @test_int_x86_avx512_scatterdiv4_sf(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x float> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_sf:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vscatterqps %xmm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterqps %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  call void @llvm.x86.avx512.mask.scatterdiv4.sf(i8* %x0, <2 x i1> %2, <2 x i64> %x2, <4 x float> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scatterdiv4.sf(i8* %x0, <2 x i1> <i1 true, i1 true>, <2 x i64> %x2, <4 x float> %x3, i32 4)
  ret void
}

define void @test_int_x86_avx512_scatterdiv4_si(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x i32> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_si:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vpscatterqd %xmm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT:    vpscatterqd %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  call void @llvm.x86.avx512.mask.scatterdiv4.si(i8* %x0, <2 x i1> <i1 true, i1 true>, <2 x i64> %x2, <4 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scatterdiv4.si(i8* %x0, <2 x i1> %2, <2 x i64> %x2, <4 x i32> %x3, i32 4)
  ret void
}

define void @test_int_x86_avx512_scatterdiv8_sf(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x float> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv8_sf:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vscatterqps %xmm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterqps %xmm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scatterdiv8.sf(i8* %x0, <4 x i1> %2, <4 x i64> %x2, <4 x float> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scatterdiv8.sf(i8* %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> %x2, <4 x float> %x3, i32 4)
  ret void
}

define void @test_int_x86_avx512_scatterdiv8_si(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i32> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv8_si:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpscatterqd %xmm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterqd %xmm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scatterdiv8.si(i8* %x0, <4 x i1> %2, <4 x i64> %x2, <4 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scatterdiv8.si(i8* %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> %x2, <4 x i32> %x3, i32 4)
  ret void
}

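; The scattersiv* intrinsics are the dword-indexed counterparts, selecting
; vscatterdpd/vpscatterdq/vscatterdps/vpscatterdd.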
define void @test_int_x86_avx512_scattersiv2_df(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x double> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv2_df:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vscatterdpd %xmm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT:    vscatterdpd %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  call void @llvm.x86.avx512.mask.scattersiv2.df(i8* %x0, <2 x i1> <i1 true, i1 true>, <4 x i32> %x2, <2 x double> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scattersiv2.df(i8* %x0, <2 x i1> %2, <4 x i32> %x2, <2 x double> %x3, i32 4)
  ret void
}

define void @test_int_x86_avx512_scattersiv2_di(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x i64> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv2_di:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vpscatterdq %xmm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT:    vpscatterdq %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  call void @llvm.x86.avx512.mask.scattersiv2.di(i8* %x0, <2 x i1> <i1 true, i1 true>, <4 x i32> %x2, <2 x i64> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scattersiv2.di(i8* %x0, <2 x i1> %2, <4 x i32> %x2, <2 x i64> %x3, i32 4)
  ret void
}

define void @test_int_x86_avx512_scattersiv4_df(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x double> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv4_df:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vscatterdpd %ymm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterdpd %ymm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scattersiv4.df(i8* %x0, <4 x i1> %2, <4 x i32> %x2, <4 x double> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scattersiv4.df(i8* %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %x2, <4 x double> %x3, i32 4)
  ret void
}

define void @test_int_x86_avx512_scattersiv4_di(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i64> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv4_di:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vpscatterdq %ymm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT:    vpscatterdq %ymm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scattersiv4.di(i8* %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %x2, <4 x i64> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scattersiv4.di(i8* %x0, <4 x i1> %2, <4 x i32> %x2, <4 x i64> %x3, i32 4)
  ret void
}

define void @test_int_x86_avx512_scattersiv4_sf(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x float> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv4_sf:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vscatterdps %xmm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterdps %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scattersiv4.sf(i8* %x0, <4 x i1> %2, <4 x i32> %x2, <4 x float> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scattersiv4.sf(i8* %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %x2, <4 x float> %x3, i32 4)
  ret void
}

define void @test_int_x86_avx512_scattersiv4_si(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i32> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv4_si:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpscatterdd %xmm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterdd %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scattersiv4.si(i8* %x0, <4 x i1> %2, <4 x i32> %x2, <4 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scattersiv4.si(i8* %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %x2, <4 x i32> %x3, i32 4)
  ret void
}

define void @test_int_x86_avx512_scattersiv8_sf(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x float> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv8_sf:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vscatterdps %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterdps %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  call void @llvm.x86.avx512.mask.scattersiv8.sf(i8* %x0, <8 x i1> %1, <8 x i32> %x2, <8 x float> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scattersiv8.sf(i8* %x0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %x2, <8 x float> %x3, i32 4)
  ret void
}

define void @test_int_x86_avx512_scattersiv8_si(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x i32> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv8_si:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  call void @llvm.x86.avx512.mask.scattersiv8.si(i8* %x0, <8 x i1> %1, <8 x i32> %x2, <8 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scattersiv8.si(i8* %x0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %x2, <8 x i32> %x3, i32 4)
  ret void
}

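; The next two tests use constant masks: kxnorw yields all ones, kxorw all
; zeros, and other immediate masks (1, 96, 220) are moved through a GPR with
; movb/movw followed by kmovd.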
define void @scatter_mask_test(i8* %x0, <8 x i32> %x2, <8 x i32> %x3) {
; CHECK-LABEL: scatter_mask_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    movb $1, %al
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    movb $96, %al
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.mask.scattersiv8.si(i8* %x0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %x2, <8 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scattersiv8.si(i8* %x0, <8 x i1> zeroinitializer, <8 x i32> %x2, <8 x i32> %x3, i32 4)
  call void @llvm.x86.avx512.mask.scattersiv8.si(i8* %x0, <8 x i1> bitcast (<1 x i8> <i8 1> to <8 x i1>), <8 x i32> %x2, <8 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scattersiv8.si(i8* %x0, <8 x i1> bitcast (<1 x i8> <i8 96> to <8 x i1>), <8 x i32> %x2, <8 x i32> %x3, i32 4)
  ret void
}

define <16 x float> @gather_mask_test(<16 x i32> %ind, <16 x float> %src, i8* %base) {
; CHECK-LABEL: gather_mask_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm2 {%k1}
; CHECK-NEXT:    kxorw %k0, %k0, %k1
; CHECK-NEXT:    vmovaps %zmm1, %zmm3
; CHECK-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1}
; CHECK-NEXT:    vaddps %zmm3, %zmm2, %zmm2
; CHECK-NEXT:    movw $1, %ax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vmovaps %zmm1, %zmm3
; CHECK-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1}
; CHECK-NEXT:    movw $220, %ax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; CHECK-NEXT:    vaddps %zmm3, %zmm1, %zmm0
; CHECK-NEXT:    vaddps %zmm2, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float> %src, i8* %base, <16 x i32> %ind, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float> %src, i8* %base, <16 x i32> %ind, <16 x i1> zeroinitializer, i32 4)
  %res2 = call <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float> %src, i8* %base, <16 x i32> %ind, <16 x i1> bitcast (<1 x i16> <i16 1> to <16 x i1>), i32 4)
  %res3 = call <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float> %src, i8* %base, <16 x i32> %ind, <16 x i1> bitcast (<1 x i16> <i16 220> to <16 x i1>), i32 4)
  %res4 = fadd <16 x float> %res, %res1
  %res5 = fadd <16 x float> %res3, %res2
  %res6 = fadd <16 x float> %res5, %res4
  ret <16 x float> %res6
}

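; Gathering from a global: the symbol is folded into the displacement of the
; addressing mode, with no base register, as in x(,%zmm0,4).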
@x = global [1024 x float] zeroinitializer, align 16

define <8 x float> @gather_global(<8 x i64>, i32* nocapture readnone) {
; CHECK-LABEL: gather_global:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vgatherqps x(,%zmm0,4), %ymm1 {%k1}
; CHECK-NEXT:    vmovaps %ymm1, %ymm0
; CHECK-NEXT:    retq
  %3 = tail call <8 x float> @llvm.x86.avx512.mask.gather.qps.512(<8 x float> zeroinitializer, i8* bitcast ([1024 x float]* @x to i8*), <8 x i64> %0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
  ret <8 x float> %3
}

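;;
;; Intrinsic declarations
;;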
declare <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float>, i8*, <16 x i32>, <16 x i1>, i32)
declare <8 x double> @llvm.x86.avx512.mask.gather.dpd.512(<8 x double>, i8*, <8 x i32>, <8 x i1>, i32)
declare <8 x float> @llvm.x86.avx512.mask.gather.qps.512(<8 x float>, i8*, <8 x i64>, <8 x i1>, i32)
declare <8 x double> @llvm.x86.avx512.mask.gather.qpd.512(<8 x double>, i8*, <8 x i64>, <8 x i1>, i32)
declare <16 x i32> @llvm.x86.avx512.mask.gather.dpi.512(<16 x i32>, i8*, <16 x i32>, <16 x i1>, i32)
declare <8 x i64> @llvm.x86.avx512.mask.gather.dpq.512(<8 x i64>, i8*, <8 x i32>, <8 x i1>, i32)
declare <8 x i32> @llvm.x86.avx512.mask.gather.qpi.512(<8 x i32>, i8*, <8 x i64>, <8 x i1>, i32)
declare <8 x i64> @llvm.x86.avx512.mask.gather.qpq.512(<8 x i64>, i8*, <8 x i64>, <8 x i1>, i32)
declare <2 x double> @llvm.x86.avx512.mask.gather3div2.df(<2 x double>, i8*, <2 x i64>, <2 x i1>, i32)
declare <2 x i64> @llvm.x86.avx512.mask.gather3div2.di(<2 x i64>, i8*, <2 x i64>, <2 x i1>, i32)
declare <4 x double> @llvm.x86.avx512.mask.gather3div4.df(<4 x double>, i8*, <4 x i64>, <4 x i1>, i32)
declare <4 x i64> @llvm.x86.avx512.mask.gather3div4.di(<4 x i64>, i8*, <4 x i64>, <4 x i1>, i32)
declare <4 x float> @llvm.x86.avx512.mask.gather3div4.sf(<4 x float>, i8*, <2 x i64>, <2 x i1>, i32)
declare <4 x i32> @llvm.x86.avx512.mask.gather3div4.si(<4 x i32>, i8*, <2 x i64>, <2 x i1>, i32)
declare <4 x float> @llvm.x86.avx512.mask.gather3div8.sf(<4 x float>, i8*, <4 x i64>, <4 x i1>, i32)
declare <4 x i32> @llvm.x86.avx512.mask.gather3div8.si(<4 x i32>, i8*, <4 x i64>, <4 x i1>, i32)
declare <2 x double> @llvm.x86.avx512.mask.gather3siv2.df(<2 x double>, i8*, <4 x i32>, <2 x i1>, i32)
declare <2 x i64> @llvm.x86.avx512.mask.gather3siv2.di(<2 x i64>, i8*, <4 x i32>, <2 x i1>, i32)
declare <4 x double> @llvm.x86.avx512.mask.gather3siv4.df(<4 x double>, i8*, <4 x i32>, <4 x i1>, i32)
declare <4 x i64> @llvm.x86.avx512.mask.gather3siv4.di(<4 x i64>, i8*, <4 x i32>, <4 x i1>, i32)
declare <4 x float> @llvm.x86.avx512.mask.gather3siv4.sf(<4 x float>, i8*, <4 x i32>, <4 x i1>, i32)
declare <4 x i32> @llvm.x86.avx512.mask.gather3siv4.si(<4 x i32>, i8*, <4 x i32>, <4 x i1>, i32)
declare <8 x float> @llvm.x86.avx512.mask.gather3siv8.sf(<8 x float>, i8*, <8 x i32>, <8 x i1>, i32)
declare <8 x i32> @llvm.x86.avx512.mask.gather3siv8.si(<8 x i32>, i8*, <8 x i32>, <8 x i1>, i32)
declare void @llvm.x86.avx512.mask.scatter.dps.512(i8*, <16 x i1>, <16 x i32>, <16 x float>, i32)
declare void @llvm.x86.avx512.mask.scatter.dpd.512(i8*, <8 x i1>, <8 x i32>, <8 x double>, i32)
declare void @llvm.x86.avx512.mask.scatter.qps.512(i8*, <8 x i1>, <8 x i64>, <8 x float>, i32)
declare void @llvm.x86.avx512.mask.scatter.qpd.512(i8*, <8 x i1>, <8 x i64>, <8 x double>, i32)
declare void @llvm.x86.avx512.mask.scatter.dpi.512(i8*, <16 x i1>, <16 x i32>, <16 x i32>, i32)
declare void @llvm.x86.avx512.mask.scatter.dpq.512(i8*, <8 x i1>, <8 x i32>, <8 x i64>, i32)
declare void @llvm.x86.avx512.mask.scatter.qpi.512(i8*, <8 x i1>, <8 x i64>, <8 x i32>, i32)
declare void @llvm.x86.avx512.mask.scatter.qpq.512(i8*, <8 x i1>, <8 x i64>, <8 x i64>, i32)
declare void @llvm.x86.avx512.mask.scatterdiv2.df(i8*, <2 x i1>, <2 x i64>, <2 x double>, i32)
declare void @llvm.x86.avx512.mask.scatterdiv2.di(i8*, <2 x i1>, <2 x i64>, <2 x i64>, i32)
declare void @llvm.x86.avx512.mask.scatterdiv4.df(i8*, <4 x i1>, <4 x i64>, <4 x double>, i32)
declare void @llvm.x86.avx512.mask.scatterdiv4.di(i8*, <4 x i1>, <4 x i64>, <4 x i64>, i32)
declare void @llvm.x86.avx512.mask.scatterdiv4.sf(i8*, <2 x i1>, <2 x i64>, <4 x float>, i32)
declare void @llvm.x86.avx512.mask.scatterdiv4.si(i8*, <2 x i1>, <2 x i64>, <4 x i32>, i32)
declare void @llvm.x86.avx512.mask.scatterdiv8.sf(i8*, <4 x i1>, <4 x i64>, <4 x float>, i32)
declare void @llvm.x86.avx512.mask.scatterdiv8.si(i8*, <4 x i1>, <4 x i64>, <4 x i32>, i32)
declare void @llvm.x86.avx512.mask.scattersiv2.df(i8*, <2 x i1>, <4 x i32>, <2 x double>, i32)
declare void @llvm.x86.avx512.mask.scattersiv2.di(i8*, <2 x i1>, <4 x i32>, <2 x i64>, i32)
declare void @llvm.x86.avx512.mask.scattersiv4.df(i8*, <4 x i1>, <4 x i32>, <4 x double>, i32)
declare void @llvm.x86.avx512.mask.scattersiv4.di(i8*, <4 x i1>, <4 x i32>, <4 x i64>, i32)
declare void @llvm.x86.avx512.mask.scattersiv4.sf(i8*, <4 x i1>, <4 x i32>, <4 x float>, i32)
declare void @llvm.x86.avx512.mask.scattersiv4.si(i8*, <4 x i1>, <4 x i32>, <4 x i32>, i32)
declare void @llvm.x86.avx512.mask.scattersiv8.sf(i8*, <8 x i1>, <8 x i32>, <8 x float>, i32)
declare void @llvm.x86.avx512.mask.scattersiv8.si(i8*, <8 x i1>, <8 x i32>, <8 x i32>, i32)