; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vp2intersect,+avx512vl --show-mc-encoding | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vp2intersect,+avx512vl --show-mc-encoding | FileCheck %s --check-prefix=X64

define void @test_mm256_2intersect_epi32(<4 x i64> %a, <4 x i64> %b, i8* nocapture %m0, i8* nocapture %m1) {
; X86-LABEL: test_mm256_2intersect_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: vp2intersectd %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x28,0x68,0xc1]
; X86-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
; X86-NEXT: kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0]
; X86-NEXT: movb %dl, (%eax) # encoding: [0x88,0x10]
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X86-NEXT: movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_2intersect_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: vp2intersectd %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x28,0x68,0xc1]
; X64-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
; X64-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X64-NEXT: movb %cl, (%rdi) # encoding: [0x88,0x0f]
; X64-NEXT: movb %al, (%rsi) # encoding: [0x88,0x06]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %0 = bitcast <4 x i64> %a to <8 x i32>
  %1 = bitcast <4 x i64> %b to <8 x i32>
  %2 = tail call { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.d.256(<8 x i32> %0, <8 x i32> %1)
  %3 = extractvalue { <8 x i1>, <8 x i1> } %2, 0
  %4 = bitcast i8* %m0 to <8 x i1>*
  store <8 x i1> %3, <8 x i1>* %4, align 8
  %5 = extractvalue { <8 x i1>, <8 x i1> } %2, 1
  %6 = bitcast i8* %m1 to <8 x i1>*
  store <8 x i1> %5, <8 x i1>* %6, align 8
  ret void
}

define void @test_mm256_2intersect_epi64(<4 x i64> %a, <4 x i64> %b, i8* nocapture %m0, i8* nocapture %m1) {
; X86-LABEL: test_mm256_2intersect_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
; X86-NEXT: vp2intersectq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x28,0x68,0xc1]
; X86-NEXT: kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X86-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X86-NEXT: kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
; X86-NEXT: movb %dl, (%ecx) # encoding: [0x88,0x11]
; X86-NEXT: kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X86-NEXT: kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X86-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT: movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_2intersect_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: vp2intersectq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x28,0x68,0xc1]
; X64-NEXT: kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X64-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
; X64-NEXT: movb %al, (%rdi) # encoding: [0x88,0x07]
; X64-NEXT: kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X64-NEXT: kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT: movb %al, (%rsi) # encoding: [0x88,0x06]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %0 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.q.256(<4 x i64> %a, <4 x i64> %b)
  %1 = extractvalue { <4 x i1>, <4 x i1> } %0, 0
  %2 = shufflevector <4 x i1> %1, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %3 = bitcast <8 x i1> %2 to i8
  store i8 %3, i8* %m0, align 1
  %4 = extractvalue { <4 x i1>, <4 x i1> } %0, 1
  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %6 = bitcast <8 x i1> %5 to i8
  store i8 %6, i8* %m1, align 1
  ret void
}

define void @test_mm256_2intersect_epi32_p(<4 x i64>* nocapture readonly %a, <4 x i64>* nocapture readonly %b, i8* nocapture %m0, i8* nocapture %m1) {
; X86-LABEL: test_mm256_2intersect_epi32_p:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x0c]
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04]
; X86-NEXT: vmovaps (%edx), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x02]
; X86-NEXT: vp2intersectd (%ecx), %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x28,0x68,0x01]
; X86-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
; X86-NEXT: kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0]
; X86-NEXT: movb %dl, (%eax) # encoding: [0x88,0x10]
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x10]
; X86-NEXT: movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_2intersect_epi32_p:
; X64: # %bb.0: # %entry
; X64-NEXT: vmovaps (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x07]
; X64-NEXT: vp2intersectd (%rsi), %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x28,0x68,0x06]
; X64-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
; X64-NEXT: kmovw %k0, %esi # encoding: [0xc5,0xf8,0x93,0xf0]
; X64-NEXT: movb %sil, (%rdx) # encoding: [0x40,0x88,0x32]
; X64-NEXT: movb %al, (%rcx) # encoding: [0x88,0x01]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %0 = bitcast <4 x i64>* %a to <8 x i32>*
  %1 = load <8 x i32>, <8 x i32>* %0, align 32
  %2 = bitcast <4 x i64>* %b to <8 x i32>*
  %3 = load <8 x i32>, <8 x i32>* %2, align 32
  %4 = tail call { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.d.256(<8 x i32> %1, <8 x i32> %3)
  %5 = extractvalue { <8 x i1>, <8 x i1> } %4, 0
  %6 = bitcast i8* %m0 to <8 x i1>*
  store <8 x i1> %5, <8 x i1>* %6, align 8
  %7 = extractvalue { <8 x i1>, <8 x i1> } %4, 1
  %8 = bitcast i8* %m1 to <8 x i1>*
  store <8 x i1> %7, <8 x i1>* %8, align 8
  ret void
}

define void @test_mm256_2intersect_epi64_p(<4 x i64>* nocapture readonly %a, <4 x i64>* nocapture readonly %b, i8* nocapture %m0, i8* nocapture %m1) {
; X86-LABEL: test_mm256_2intersect_epi64_p:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %esi # encoding: [0x56]
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %esi, -8
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
; X86-NEXT: vmovaps (%esi), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x06]
; X86-NEXT: vp2intersectq (%edx), %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x28,0x68,0x02]
; X86-NEXT: kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X86-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X86-NEXT: kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
; X86-NEXT: movb %dl, (%ecx) # encoding: [0x88,0x11]
; X86-NEXT: kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X86-NEXT: kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X86-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT: movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT: popl %esi # encoding: [0x5e]
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_2intersect_epi64_p:
; X64: # %bb.0: # %entry
; X64-NEXT: vmovaps (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x07]
; X64-NEXT: vp2intersectq (%rsi), %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x28,0x68,0x06]
; X64-NEXT: kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X64-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
; X64-NEXT: movb %al, (%rdx) # encoding: [0x88,0x02]
; X64-NEXT: kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X64-NEXT: kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT: movb %al, (%rcx) # encoding: [0x88,0x01]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %0 = load <4 x i64>, <4 x i64>* %a, align 32
  %1 = load <4 x i64>, <4 x i64>* %b, align 32
  %2 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.q.256(<4 x i64> %0, <4 x i64> %1)
  %3 = extractvalue { <4 x i1>, <4 x i1> } %2, 0
  %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %5 = bitcast <8 x i1> %4 to i8
  store i8 %5, i8* %m0, align 1
  %6 = extractvalue { <4 x i1>, <4 x i1> } %2, 1
  %7 = shufflevector <4 x i1> %6, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %8 = bitcast <8 x i1> %7 to i8
  store i8 %8, i8* %m1, align 1
  ret void
}

define void @test_mm256_2intersect_epi32_b(i32* nocapture readonly %a, i32* nocapture readonly %b, i8* nocapture %m0, i8* nocapture %m1) {
; X86-LABEL: test_mm256_2intersect_epi32_b:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x0c]
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04]
; X86-NEXT: vbroadcastss (%edx), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x18,0x02]
; X86-NEXT: vp2intersectd (%ecx){1to8}, %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x38,0x68,0x01]
; X86-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
; X86-NEXT: kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0]
; X86-NEXT: movb %dl, (%eax) # encoding: [0x88,0x10]
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x10]
; X86-NEXT: movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_2intersect_epi32_b:
; X64: # %bb.0: # %entry
; X64-NEXT: vbroadcastss (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x18,0x07]
; X64-NEXT: vp2intersectd (%rsi){1to8}, %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x38,0x68,0x06]
; X64-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
; X64-NEXT: kmovw %k0, %esi # encoding: [0xc5,0xf8,0x93,0xf0]
; X64-NEXT: movb %sil, (%rdx) # encoding: [0x40,0x88,0x32]
; X64-NEXT: movb %al, (%rcx) # encoding: [0x88,0x01]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %0 = load i32, i32* %a, align 4
  %vecinit.i.i = insertelement <8 x i32> undef, i32 %0, i32 0
  %vecinit7.i.i = shufflevector <8 x i32> %vecinit.i.i, <8 x i32> undef, <8 x i32> zeroinitializer
  %1 = load i32, i32* %b, align 4
  %vecinit.i.i2 = insertelement <8 x i32> undef, i32 %1, i32 0
  %vecinit7.i.i3 = shufflevector <8 x i32> %vecinit.i.i2, <8 x i32> undef, <8 x i32> zeroinitializer
  %2 = tail call { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.d.256(<8 x i32> %vecinit7.i.i, <8 x i32> %vecinit7.i.i3)
  %3 = extractvalue { <8 x i1>, <8 x i1> } %2, 0
  %4 = bitcast i8* %m0 to <8 x i1>*
  store <8 x i1> %3, <8 x i1>* %4, align 8
  %5 = extractvalue { <8 x i1>, <8 x i1> } %2, 1
  %6 = bitcast i8* %m1 to <8 x i1>*
  store <8 x i1> %5, <8 x i1>* %6, align 8
  ret void
}

define void @test_mm256_2intersect_epi64_b(i64* nocapture readonly %a, i64* nocapture readonly %b, i8* nocapture %m0, i8* nocapture %m1) {
; X86-LABEL: test_mm256_2intersect_epi64_b:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %esi # encoding: [0x56]
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %esi, -8
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
; X86-NEXT: vbroadcastsd (%esi), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0x06]
; X86-NEXT: vp2intersectq (%edx){1to4}, %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x38,0x68,0x02]
; X86-NEXT: kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X86-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X86-NEXT: kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
; X86-NEXT: movb %dl, (%ecx) # encoding: [0x88,0x11]
; X86-NEXT: kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X86-NEXT: kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X86-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT: movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT: popl %esi # encoding: [0x5e]
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_2intersect_epi64_b:
; X64: # %bb.0: # %entry
; X64-NEXT: vbroadcastsd (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0x07]
; X64-NEXT: vp2intersectq (%rsi){1to4}, %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x38,0x68,0x06]
; X64-NEXT: kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X64-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
; X64-NEXT: movb %al, (%rdx) # encoding: [0x88,0x02]
; X64-NEXT: kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X64-NEXT: kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT: movb %al, (%rcx) # encoding: [0x88,0x01]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %0 = load i64, i64* %a, align 8
  %vecinit.i.i = insertelement <4 x i64> undef, i64 %0, i32 0
  %vecinit3.i.i = shufflevector <4 x i64> %vecinit.i.i, <4 x i64> undef, <4 x i32> zeroinitializer
  %1 = load i64, i64* %b, align 8
  %vecinit.i.i2 = insertelement <4 x i64> undef, i64 %1, i32 0
  %vecinit3.i.i3 = shufflevector <4 x i64> %vecinit.i.i2, <4 x i64> undef, <4 x i32> zeroinitializer
  %2 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.q.256(<4 x i64> %vecinit3.i.i, <4 x i64> %vecinit3.i.i3)
  %3 = extractvalue { <4 x i1>, <4 x i1> } %2, 0
  %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %5 = bitcast <8 x i1> %4 to i8
  store i8 %5, i8* %m0, align 1
  %6 = extractvalue { <4 x i1>, <4 x i1> } %2, 1
  %7 = shufflevector <4 x i1> %6, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %8 = bitcast <8 x i1> %7 to i8
  store i8 %8, i8* %m1, align 1
  ret void
}

define void @test_mm_2intersect_epi32(<2 x i64> %a, <2 x i64> %b, i8* nocapture %m0, i8* nocapture %m1) {
; X86-LABEL: test_mm_2intersect_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
; X86-NEXT: vp2intersectd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x08,0x68,0xc1]
; X86-NEXT: kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X86-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X86-NEXT: kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
; X86-NEXT: movb %dl, (%ecx) # encoding: [0x88,0x11]
; X86-NEXT: kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X86-NEXT: kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X86-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT: movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_2intersect_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: vp2intersectd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x08,0x68,0xc1]
; X64-NEXT: kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X64-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
; X64-NEXT: movb %al, (%rdi) # encoding: [0x88,0x07]
; X64-NEXT: kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X64-NEXT: kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT: movb %al, (%rsi) # encoding: [0x88,0x06]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %0 = bitcast <2 x i64> %a to <4 x i32>
  %1 = bitcast <2 x i64> %b to <4 x i32>
  %2 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.d.128(<4 x i32> %0, <4 x i32> %1)
  %3 = extractvalue { <4 x i1>, <4 x i1> } %2, 0
  %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %5 = bitcast <8 x i1> %4 to i8
  store i8 %5, i8* %m0, align 1
  %6 = extractvalue { <4 x i1>, <4 x i1> } %2, 1
  %7 = shufflevector <4 x i1> %6, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %8 = bitcast <8 x i1> %7 to i8
  store i8 %8, i8* %m1, align 1
  ret void
}

define void @test_mm_2intersect_epi64(<2 x i64> %a, <2 x i64> %b, i8* nocapture %m0, i8* nocapture %m1) {
; X86-LABEL: test_mm_2intersect_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
; X86-NEXT: vp2intersectq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x08,0x68,0xc1]
; X86-NEXT: kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e]
; X86-NEXT: kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
; X86-NEXT: kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
; X86-NEXT: movb %dl, (%ecx) # encoding: [0x88,0x11]
; X86-NEXT: kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
; X86-NEXT: kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
; X86-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT: movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_2intersect_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: vp2intersectq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x08,0x68,0xc1]
; X64-NEXT: kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e]
; X64-NEXT: kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
; X64-NEXT: movb %al, (%rdi) # encoding: [0x88,0x07]
; X64-NEXT: kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
; X64-NEXT: kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT: movb %al, (%rsi) # encoding: [0x88,0x06]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %0 = tail call { <2 x i1>, <2 x i1> } @llvm.x86.avx512.vp2intersect.q.128(<2 x i64> %a, <2 x i64> %b)
  %1 = extractvalue { <2 x i1>, <2 x i1> } %0, 0
  %2 = shufflevector <2 x i1> %1, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %3 = bitcast <8 x i1> %2 to i8
  store i8 %3, i8* %m0, align 1
  %4 = extractvalue { <2 x i1>, <2 x i1> } %0, 1
  %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %6 = bitcast <8 x i1> %5 to i8
  store i8 %6, i8* %m1, align 1
  ret void
}

define void @test_mm_2intersect_epi32_p(<2 x i64>* nocapture readonly %a, <2 x i64>* nocapture readonly %b, i8* nocapture %m0, i8* nocapture %m1) {
; X86-LABEL: test_mm_2intersect_epi32_p:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %esi # encoding: [0x56]
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %esi, -8
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
; X86-NEXT: vmovaps (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x06]
; X86-NEXT: vp2intersectd (%edx), %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x08,0x68,0x02]
; X86-NEXT: kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X86-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X86-NEXT: kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
; X86-NEXT: movb %dl, (%ecx) # encoding: [0x88,0x11]
; X86-NEXT: kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X86-NEXT: kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X86-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT: movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT: popl %esi # encoding: [0x5e]
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_2intersect_epi32_p:
; X64: # %bb.0: # %entry
; X64-NEXT: vmovaps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07]
; X64-NEXT: vp2intersectd (%rsi), %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x08,0x68,0x06]
; X64-NEXT: kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X64-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
; X64-NEXT: movb %al, (%rdx) # encoding: [0x88,0x02]
; X64-NEXT: kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X64-NEXT: kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT: movb %al, (%rcx) # encoding: [0x88,0x01]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %0 = bitcast <2 x i64>* %a to <4 x i32>*
  %1 = load <4 x i32>, <4 x i32>* %0, align 16
  %2 = bitcast <2 x i64>* %b to <4 x i32>*
  %3 = load <4 x i32>, <4 x i32>* %2, align 16
  %4 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.d.128(<4 x i32> %1, <4 x i32> %3)
  %5 = extractvalue { <4 x i1>, <4 x i1> } %4, 0
  %6 = shufflevector <4 x i1> %5, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %7 = bitcast <8 x i1> %6 to i8
  store i8 %7, i8* %m0, align 1
  %8 = extractvalue { <4 x i1>, <4 x i1> } %4, 1
  %9 = shufflevector <4 x i1> %8, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %10 = bitcast <8 x i1> %9 to i8
  store i8 %10, i8* %m1, align 1
  ret void
}

define void @test_mm_2intersect_epi64_p(<2 x i64>* nocapture readonly %a, <2 x i64>* nocapture readonly %b, i8* nocapture %m0, i8* nocapture %m1) {
; X86-LABEL: test_mm_2intersect_epi64_p:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %esi # encoding: [0x56]
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %esi, -8
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
; X86-NEXT: vmovaps (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x06]
; X86-NEXT: vp2intersectq (%edx), %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x08,0x68,0x02]
; X86-NEXT: kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e]
; X86-NEXT: kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
; X86-NEXT: kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
; X86-NEXT: movb %dl, (%ecx) # encoding: [0x88,0x11]
; X86-NEXT: kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
; X86-NEXT: kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
; X86-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT: movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT: popl %esi # encoding: [0x5e]
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_2intersect_epi64_p:
; X64: # %bb.0: # %entry
; X64-NEXT: vmovaps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07]
; X64-NEXT: vp2intersectq (%rsi), %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x08,0x68,0x06]
; X64-NEXT: kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e]
; X64-NEXT: kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
; X64-NEXT: movb %al, (%rdx) # encoding: [0x88,0x02]
; X64-NEXT: kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
; X64-NEXT: kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT: movb %al, (%rcx) # encoding: [0x88,0x01]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %0 = load <2 x i64>, <2 x i64>* %a, align 16
  %1 = load <2 x i64>, <2 x i64>* %b, align 16
  %2 = tail call { <2 x i1>, <2 x i1> } @llvm.x86.avx512.vp2intersect.q.128(<2 x i64> %0, <2 x i64> %1)
  %3 = extractvalue { <2 x i1>, <2 x i1> } %2, 0
  %4 = shufflevector <2 x i1> %3, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %5 = bitcast <8 x i1> %4 to i8
  store i8 %5, i8* %m0, align 1
  %6 = extractvalue { <2 x i1>, <2 x i1> } %2, 1
  %7 = shufflevector <2 x i1> %6, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %8 = bitcast <8 x i1> %7 to i8
  store i8 %8, i8* %m1, align 1
  ret void
}

define void @test_mm_2intersect_epi32_b(i32* nocapture readonly %a, i32* nocapture readonly %b, i8* nocapture %m0, i8* nocapture %m1) {
; X86-LABEL: test_mm_2intersect_epi32_b:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %esi # encoding: [0x56]
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %esi, -8
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
; X86-NEXT: vbroadcastss (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x06]
; X86-NEXT: vp2intersectd (%edx){1to4}, %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x18,0x68,0x02]
; X86-NEXT: kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X86-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X86-NEXT: kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
; X86-NEXT: movb %dl, (%ecx) # encoding: [0x88,0x11]
; X86-NEXT: kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X86-NEXT: kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X86-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT: movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT: popl %esi # encoding: [0x5e]
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_2intersect_epi32_b:
; X64: # %bb.0: # %entry
; X64-NEXT: vbroadcastss (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x07]
; X64-NEXT: vp2intersectd (%rsi){1to4}, %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x18,0x68,0x06]
; X64-NEXT: kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X64-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
; X64-NEXT: movb %al, (%rdx) # encoding: [0x88,0x02]
; X64-NEXT: kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X64-NEXT: kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT: movb %al, (%rcx) # encoding: [0x88,0x01]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %0 = load i32, i32* %a, align 4
  %vecinit.i.i = insertelement <4 x i32> undef, i32 %0, i32 0
  %vecinit3.i.i = shufflevector <4 x i32> %vecinit.i.i, <4 x i32> undef, <4 x i32> zeroinitializer
  %1 = load i32, i32* %b, align 4
  %vecinit.i.i2 = insertelement <4 x i32> undef, i32 %1, i32 0
  %vecinit3.i.i3 = shufflevector <4 x i32> %vecinit.i.i2, <4 x i32> undef, <4 x i32> zeroinitializer
  %2 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.d.128(<4 x i32> %vecinit3.i.i, <4 x i32> %vecinit3.i.i3)
  %3 = extractvalue { <4 x i1>, <4 x i1> } %2, 0
  %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %5 = bitcast <8 x i1> %4 to i8
  store i8 %5, i8* %m0, align 1
  %6 = extractvalue { <4 x i1>, <4 x i1> } %2, 1
  %7 = shufflevector <4 x i1> %6, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %8 = bitcast <8 x i1> %7 to i8
  store i8 %8, i8* %m1, align 1
  ret void
}

define void @test_mm_2intersect_epi64_b(i64* nocapture readonly %a, i64* nocapture readonly %b, i8* nocapture %m0, i8* nocapture %m1) {
; X86-LABEL: test_mm_2intersect_epi64_b:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %esi # encoding: [0x56]
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %esi, -8
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
; X86-NEXT: vmovddup (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0x06]
; X86-NEXT: # xmm0 = mem[0,0]
; X86-NEXT: vp2intersectq (%edx){1to2}, %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x18,0x68,0x02]
; X86-NEXT: kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e]
; X86-NEXT: kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
; X86-NEXT: kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
; X86-NEXT: movb %dl, (%ecx) # encoding: [0x88,0x11]
; X86-NEXT: kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
; X86-NEXT: kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
; X86-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT: movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT: popl %esi # encoding: [0x5e]
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_2intersect_epi64_b:
; X64: # %bb.0: # %entry
; X64-NEXT: vmovddup (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0x07]
; X64-NEXT: # xmm0 = mem[0,0]
; X64-NEXT: vp2intersectq (%rsi){1to2}, %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x18,0x68,0x06]
; X64-NEXT: kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e]
; X64-NEXT: kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
; X64-NEXT: movb %al, (%rdx) # encoding: [0x88,0x02]
; X64-NEXT: kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
; X64-NEXT: kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT: movb %al, (%rcx) # encoding: [0x88,0x01]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %0 = load i64, i64* %a, align 8
  %vecinit.i.i = insertelement <2 x i64> undef, i64 %0, i32 0
  %vecinit1.i.i = shufflevector <2 x i64> %vecinit.i.i, <2 x i64> undef, <2 x i32> zeroinitializer
  %1 = load i64, i64* %b, align 8
  %vecinit.i.i2 = insertelement <2 x i64> undef, i64 %1, i32 0
  %vecinit1.i.i3 = shufflevector <2 x i64> %vecinit.i.i2, <2 x i64> undef, <2 x i32> zeroinitializer
  %2 = tail call { <2 x i1>, <2 x i1> } @llvm.x86.avx512.vp2intersect.q.128(<2 x i64> %vecinit1.i.i, <2 x i64> %vecinit1.i.i3)
  %3 = extractvalue { <2 x i1>, <2 x i1> } %2, 0
  %4 = shufflevector <2 x i1> %3, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %5 = bitcast <8 x i1> %4 to i8
  store i8 %5, i8* %m0, align 1
  %6 = extractvalue { <2 x i1>, <2 x i1> } %2, 1
  %7 = shufflevector <2 x i1> %6, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %8 = bitcast <8 x i1> %7 to i8
  store i8 %8, i8* %m1, align 1
  ret void
}

declare { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.d.256(<8 x i32>, <8 x i32>)
declare { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.q.256(<4 x i64>, <4 x i64>)
declare { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.d.128(<4 x i32>, <4 x i32>)
declare { <2 x i1>, <2 x i1> } @llvm.x86.avx512.vp2intersect.q.128(<2 x i64>, <2 x i64>)