; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i686-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512bw-builtins.c

define i64 @test_mm512_kunpackd(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) {
; X86-LABEL: test_mm512_kunpackd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-64, %esp
; X86-NEXT:    subl $64, %esp
; X86-NEXT:    vmovdqa64 136(%ebp), %zmm3
; X86-NEXT:    vpcmpneqb %zmm0, %zmm1, %k0
; X86-NEXT:    vpcmpneqb 8(%ebp), %zmm2, %k1
; X86-NEXT:    vpcmpneqb 72(%ebp), %zmm3, %k2
; X86-NEXT:    kandd %k0, %k2, %k0
; X86-NEXT:    kmovd %k0, %eax
; X86-NEXT:    kshiftrq $32, %k2, %k0
; X86-NEXT:    kandd %k1, %k0, %k0
; X86-NEXT:    kmovd %k0, %edx
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_kunpackd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpcmpneqb %zmm0, %zmm1, %k0
; X64-NEXT:    vpcmpneqb %zmm3, %zmm2, %k1
; X64-NEXT:    kunpckdq %k0, %k1, %k1
; X64-NEXT:    vpcmpneqb %zmm5, %zmm4, %k0 {%k1}
; X64-NEXT:    kmovq %k0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__E to <64 x i8>
  %1 = bitcast <8 x i64> %__F to <64 x i8>
  %2 = bitcast <8 x i64> %__B to <64 x i8>
  %3 = bitcast <8 x i64> %__A to <64 x i8>
  %4 = icmp ne <64 x i8> %2, %3
  %5 = bitcast <8 x i64> %__C to <64 x i8>
  %6 = bitcast <8 x i64> %__D to <64 x i8>
  %7 = icmp ne <64 x i8> %5, %6
  %8 = shufflevector <64 x i1> %4, <64 x i1> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %9 = shufflevector <64 x i1> %7, <64 x i1> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %10 = shufflevector <32 x i1> %8, <32 x i1> %9, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  %11 = icmp ne <64 x i8> %0, %1
  %12 = and <64 x i1> %11, %10
  %13 = bitcast <64 x i1> %12 to i64
  ret i64 %13
}
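
; A hedged C-level sketch of the source this test mirrors, per the NOTE above
; (intrinsics from <immintrin.h>; the exact argument order is inferred from
; the IR, so treat it as illustrative rather than authoritative):
;
;   __mmask64 test_mm512_kunpackd(__m512i A, __m512i B, __m512i C, __m512i D,
;                                 __m512i E, __m512i F) {
;     return _mm512_mask_cmpneq_epi8_mask(
;         _mm512_kunpackd(_mm512_cmpneq_epi8_mask(B, A),
;                         _mm512_cmpneq_epi8_mask(C, D)),
;         E, F);
;   }
;
; test_mm512_kunpackw below is the 32 x i16 analogue built on _mm512_kunpackw.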

define i32 @test_mm512_kunpackw(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) {
; X86-LABEL: test_mm512_kunpackw:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-64, %esp
; X86-NEXT:    subl $64, %esp
; X86-NEXT:    vmovdqa64 136(%ebp), %zmm3
; X86-NEXT:    vpcmpneqw %zmm0, %zmm1, %k0
; X86-NEXT:    vpcmpneqw 8(%ebp), %zmm2, %k1
; X86-NEXT:    kunpckwd %k0, %k1, %k1
; X86-NEXT:    vpcmpneqw 72(%ebp), %zmm3, %k0 {%k1}
; X86-NEXT:    kmovd %k0, %eax
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_kunpackw:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpcmpneqw %zmm0, %zmm1, %k0
; X64-NEXT:    vpcmpneqw %zmm3, %zmm2, %k1
; X64-NEXT:    kunpckwd %k0, %k1, %k1
; X64-NEXT:    vpcmpneqw %zmm5, %zmm4, %k0 {%k1}
; X64-NEXT:    kmovd %k0, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__E to <32 x i16>
  %1 = bitcast <8 x i64> %__F to <32 x i16>
  %2 = bitcast <8 x i64> %__B to <32 x i16>
  %3 = bitcast <8 x i64> %__A to <32 x i16>
  %4 = icmp ne <32 x i16> %2, %3
  %5 = bitcast <8 x i64> %__C to <32 x i16>
  %6 = bitcast <8 x i64> %__D to <32 x i16>
  %7 = icmp ne <32 x i16> %5, %6
  %8 = shufflevector <32 x i1> %4, <32 x i1> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %9 = shufflevector <32 x i1> %7, <32 x i1> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %10 = shufflevector <16 x i1> %8, <16 x i1> %9, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %11 = icmp ne <32 x i16> %0, %1
  %12 = and <32 x i1> %11, %10
  %13 = bitcast <32 x i1> %12 to i32
  ret i32 %13
}


define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext %__A) {
; X86-LABEL: test_mm512_mask_set1_epi8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kunpckdq %k1, %k0, %k1
; X86-NEXT:    vpbroadcastb %eax, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_set1_epi8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovq %rdi, %k1
; X64-NEXT:    vpbroadcastb %esi, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %vecinit.i.i = insertelement <64 x i8> undef, i8 %__A, i32 0
  %vecinit63.i.i = shufflevector <64 x i8> %vecinit.i.i, <64 x i8> undef, <64 x i32> zeroinitializer
  %0 = bitcast <8 x i64> %__O to <64 x i8>
  %1 = bitcast i64 %__M to <64 x i1>
  %2 = select <64 x i1> %1, <64 x i8> %vecinit63.i.i, <64 x i8> %0
  %3 = bitcast <64 x i8> %2 to <8 x i64>
  ret <8 x i64> %3
}
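
; A hedged C analogue of the set1_epi8 pair (standard <immintrin.h>
; intrinsics; the splat itself corresponds to the insertelement plus
; all-zero-mask shufflevector in the IR):
;
;   __m512i mask_form(__m512i O, __mmask64 M, char A) {
;     return _mm512_mask_set1_epi8(O, M, A);   // blend with passthru O
;   }
;   __m512i maskz_form(__mmask64 M, char A) {
;     return _mm512_maskz_set1_epi8(M, A);     // zeroing form, tested below
;   }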

define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X86-LABEL: test_mm512_maskz_set1_epi8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kunpckdq %k1, %k0, %k1
; X86-NEXT:    vpbroadcastb %eax, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_set1_epi8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovq %rdi, %k1
; X64-NEXT:    vpbroadcastb %esi, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %vecinit.i.i = insertelement <64 x i8> undef, i8 %__A, i32 0
  %vecinit63.i.i = shufflevector <64 x i8> %vecinit.i.i, <64 x i8> undef, <64 x i32> zeroinitializer
  %0 = bitcast i64 %__M to <64 x i1>
  %1 = select <64 x i1> %0, <64 x i8> %vecinit63.i.i, <64 x i8> zeroinitializer
  %2 = bitcast <64 x i8> %1 to <8 x i64>
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_mask_set1_epi16(<8 x i64> %__O, i32 %__M, i16 signext %__A) {
; X86-LABEL: test_mm512_mask_set1_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpbroadcastw %eax, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_set1_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpbroadcastw %esi, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %vecinit.i.i = insertelement <32 x i16> undef, i16 %__A, i32 0
  %vecinit31.i.i = shufflevector <32 x i16> %vecinit.i.i, <32 x i16> undef, <32 x i32> zeroinitializer
  %0 = bitcast <8 x i64> %__O to <32 x i16>
  %1 = bitcast i32 %__M to <32 x i1>
  %2 = select <32 x i1> %1, <32 x i16> %vecinit31.i.i, <32 x i16> %0
  %3 = bitcast <32 x i16> %2 to <8 x i64>
  ret <8 x i64> %3
}

define <8 x i64> @test_mm512_maskz_set1_epi16(i32 %__M, i16 signext %__A) {
; X86-LABEL: test_mm512_maskz_set1_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpbroadcastw %eax, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_set1_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpbroadcastw %esi, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %vecinit.i.i = insertelement <32 x i16> undef, i16 %__A, i32 0
  %vecinit31.i.i = shufflevector <32 x i16> %vecinit.i.i, <32 x i16> undef, <32 x i32> zeroinitializer
  %0 = bitcast i32 %__M to <32 x i1>
  %1 = select <32 x i1> %0, <32 x i16> %vecinit31.i.i, <32 x i16> zeroinitializer
  %2 = bitcast <32 x i16> %1 to <8 x i64>
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_broadcastb_epi8(<2 x i64> %a0) {
; CHECK-LABEL: test_mm512_broadcastb_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastb %xmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %res0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <64 x i32> zeroinitializer
  %res1 = bitcast <64 x i8> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_mask_broadcastb_epi8(<8 x i64> %a0, i64* %a1, <2 x i64> %a2) {
; X86-LABEL: test_mm512_mask_broadcastb_epi8:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovq (%eax), %k1
; X86-NEXT:    vpbroadcastb %xmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_broadcastb_epi8:
; X64:       # %bb.0:
; X64-NEXT:    kmovq (%rdi), %k1
; X64-NEXT:    vpbroadcastb %xmm1, %zmm0 {%k1}
; X64-NEXT:    retq
  %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
  %bc1 = bitcast i64* %a1 to <64 x i1>*
  %arg1 = load <64 x i1>, <64 x i1>* %bc1
  %arg2 = bitcast <2 x i64> %a2 to <16 x i8>
  %res0 = shufflevector <16 x i8> %arg2, <16 x i8> undef, <64 x i32> zeroinitializer
  %res1 = select <64 x i1> %arg1, <64 x i8> %res0, <64 x i8> %arg0
  %res2 = bitcast <64 x i8> %res1 to <8 x i64>
  ret <8 x i64> %res2
}
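
; Unlike the set1 tests above, the masked broadcastb tests take their 64-bit
; mask through memory (an i64* reinterpreted as <64 x i1>*), so even the
; 32-bit target can materialize it with a single kmovq load.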

define <8 x i64> @test_mm512_maskz_broadcastb_epi8(i64* %a0, <2 x i64> %a1) {
; X86-LABEL: test_mm512_maskz_broadcastb_epi8:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovq (%eax), %k1
; X86-NEXT:    vpbroadcastb %xmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_broadcastb_epi8:
; X64:       # %bb.0:
; X64-NEXT:    kmovq (%rdi), %k1
; X64-NEXT:    vpbroadcastb %xmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
  %bc0 = bitcast i64* %a0 to <64 x i1>*
  %arg0 = load <64 x i1>, <64 x i1>* %bc0
  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
  %res0 = shufflevector <16 x i8> %arg1, <16 x i8> undef, <64 x i32> zeroinitializer
  %res1 = select <64 x i1> %arg0, <64 x i8> %res0, <64 x i8> zeroinitializer
  %res2 = bitcast <64 x i8> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_broadcastw_epi16(<2 x i64> %a0) {
; CHECK-LABEL: test_mm512_broadcastw_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastw %xmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %res0 = shufflevector <8 x i16> %arg0, <8 x i16> undef, <32 x i32> zeroinitializer
  %res1 = bitcast <32 x i16> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_mask_broadcastw_epi16(<8 x i64> %a0, i32 %a1, <2 x i64> %a2) {
; X86-LABEL: test_mm512_mask_broadcastw_epi16:
; X86:       # %bb.0:
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpbroadcastw %xmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_broadcastw_epi16:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpbroadcastw %xmm1, %zmm0 {%k1}
; X64-NEXT:    retq
  %arg0 = bitcast <8 x i64> %a0 to <32 x i16>
  %arg1 = bitcast i32 %a1 to <32 x i1>
  %arg2 = bitcast <2 x i64> %a2 to <8 x i16>
  %res0 = shufflevector <8 x i16> %arg2, <8 x i16> undef, <32 x i32> zeroinitializer
  %res1 = select <32 x i1> %arg1, <32 x i16> %res0, <32 x i16> %arg0
  %res2 = bitcast <32 x i16> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_maskz_broadcastw_epi16(i32 %a0, <2 x i64> %a1) {
; X86-LABEL: test_mm512_maskz_broadcastw_epi16:
; X86:       # %bb.0:
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpbroadcastw %xmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_broadcastw_epi16:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpbroadcastw %xmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
  %arg0 = bitcast i32 %a0 to <32 x i1>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res0 = shufflevector <8 x i16> %arg1, <8 x i16> undef, <32 x i32> zeroinitializer
  %res1 = select <32 x i1> %arg0, <32 x i16> %res0, <32 x i16> zeroinitializer
  %res2 = bitcast <32 x i16> %res1 to <8 x i64>
  ret <8 x i64> %res2
}
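
; The bslli/bsrli tests below express the per-128-bit-lane byte shifts as a
; single shufflevector against an all-zero vector: the in-range bytes are
; picked from %arg0 and the shifted-in bytes come from the zero operand.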

define <8 x i64> @test_mm512_bslli_epi128(<8 x i64> %a0) {
; CHECK-LABEL: test_mm512_bslli_epi128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrldq {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[43,44,45,46,47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[59,60,61,62,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
  %res0 = shufflevector <64 x i8> %arg0, <64 x i8> zeroinitializer, <64 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122>
  %res1 = bitcast <64 x i8> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_bsrli_epi128(<8 x i64> %a0) {
; CHECK-LABEL: test_mm512_bsrli_epi128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrldq {{.*#+}} zmm0 = zmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zmm0[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zmm0[37,38,39,40,41,42,43,44,45,46,47],zero,zero,zero,zero,zero,zmm0[53,54,55,56,57,58,59,60,61,62,63],zero,zero,zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
  %res0 = shufflevector <64 x i8> %arg0, <64 x i8> zeroinitializer, <64 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 82, i32 83, i32 84, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 98, i32 99, i32 100, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113, i32 114, i32 115, i32 116>
  %res1 = bitcast <64 x i8> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_unpackhi_epi8(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_mm512_unpackhi_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpunpckhbw {{.*#+}} zmm0 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
  %arg1 = bitcast <8 x i64> %a1 to <64 x i8>
  %res0 = shufflevector <64 x i8> %arg0, <64 x i8> %arg1, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
  %res1 = bitcast <64 x i8> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

; TODO - improve support for i64 -> mmask64 on 32-bit targets
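; Until that improves, 64-bit masks on the 32-bit target either arrive from
; memory via a single kmovq (as in the masked unpack tests below) or are
; reassembled from two 32-bit halves with kmovd + kunpckdq (as in
; test_mm512_mask_set1_epi8 above).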
define <8 x i64> @test_mm512_mask_unpackhi_epi8(<8 x i64> %a0, i64* %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X86-LABEL: test_mm512_mask_unpackhi_epi8:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovq (%eax), %k1
; X86-NEXT:    vpunpckhbw {{.*#+}} zmm0 {%k1} = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_unpackhi_epi8:
; X64:       # %bb.0:
; X64-NEXT:    kmovq (%rdi), %k1
; X64-NEXT:    vpunpckhbw {{.*#+}} zmm0 {%k1} = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
; X64-NEXT:    retq
  %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
  %arg1 = bitcast i64* %a1 to <64 x i1>*
  %sel1 = load <64 x i1>, <64 x i1>* %arg1
  %arg2 = bitcast <8 x i64> %a2 to <64 x i8>
  %arg3 = bitcast <8 x i64> %a3 to <64 x i8>
  %res0 = shufflevector <64 x i8> %arg2, <64 x i8> %arg3, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
  %res1 = select <64 x i1> %sel1, <64 x i8> %res0, <64 x i8> %arg0
  %res2 = bitcast <64 x i8> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_maskz_unpackhi_epi8(i64* %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_maskz_unpackhi_epi8:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovq (%eax), %k1
; X86-NEXT:    vpunpckhbw {{.*#+}} zmm0 {%k1} {z} = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_unpackhi_epi8:
; X64:       # %bb.0:
; X64-NEXT:    kmovq (%rdi), %k1
; X64-NEXT:    vpunpckhbw {{.*#+}} zmm0 {%k1} {z} = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
; X64-NEXT:    retq
  %arg0 = bitcast i64* %a0 to <64 x i1>*
  %sel0 = load <64 x i1>, <64 x i1>* %arg0
  %arg1 = bitcast <8 x i64> %a1 to <64 x i8>
  %arg2 = bitcast <8 x i64> %a2 to <64 x i8>
  %res0 = shufflevector <64 x i8> %arg1, <64 x i8> %arg2, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
  %res1 = select <64 x i1> %sel0, <64 x i8> %res0, <64 x i8> zeroinitializer
  %res2 = bitcast <64 x i8> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_unpackhi_epi16(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_mm512_unpackhi_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <8 x i64> %a0 to <32 x i16>
  %arg1 = bitcast <8 x i64> %a1 to <32 x i16>
  %res0 = shufflevector <32 x i16> %arg0, <32 x i16> %arg1, <32 x i32> <i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
  %res1 = bitcast <32 x i16> %res0 to <8 x i64>
  ret <8 x i64> %res1
}
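
; The epi16 unpack variants only need a 32-bit mask, so both targets can use
; a plain kmovd (from the stack on X86, from %edi on X64) with no splitting.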

define <8 x i64> @test_mm512_mask_unpackhi_epi16(<8 x i64> %a0, i32 %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X86-LABEL: test_mm512_mask_unpackhi_epi16:
; X86:       # %bb.0:
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpunpckhwd {{.*#+}} zmm0 {%k1} = zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_unpackhi_epi16:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpunpckhwd {{.*#+}} zmm0 {%k1} = zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31]
; X64-NEXT:    retq
  %arg0 = bitcast <8 x i64> %a0 to <32 x i16>
  %arg1 = bitcast i32 %a1 to <32 x i1>
  %arg2 = bitcast <8 x i64> %a2 to <32 x i16>
  %arg3 = bitcast <8 x i64> %a3 to <32 x i16>
  %res0 = shufflevector <32 x i16> %arg2, <32 x i16> %arg3, <32 x i32> <i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
  %res1 = select <32 x i1> %arg1, <32 x i16> %res0, <32 x i16> %arg0
  %res2 = bitcast <32 x i16> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_maskz_unpackhi_epi16(i32 %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_maskz_unpackhi_epi16:
; X86:       # %bb.0:
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpunpckhwd {{.*#+}} zmm0 {%k1} {z} = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_unpackhi_epi16:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpunpckhwd {{.*#+}} zmm0 {%k1} {z} = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
; X64-NEXT:    retq
  %arg0 = bitcast i32 %a0 to <32 x i1>
  %arg1 = bitcast <8 x i64> %a1 to <32 x i16>
  %arg2 = bitcast <8 x i64> %a2 to <32 x i16>
  %res0 = shufflevector <32 x i16> %arg1, <32 x i16> %arg2, <32 x i32> <i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
  %res1 = select <32 x i1> %arg0, <32 x i16> %res0, <32 x i16> zeroinitializer
  %res2 = bitcast <32 x i16> %res1 to <8 x i64>
  ret <8 x i64> %res2
}
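
; The unpacklo tests below mirror the unpackhi ones but interleave the low
; half of each 128-bit lane: the shuffle mask pairs element i of the first
; source with element 64+i (epi8) or 32+i (epi16) of the second source.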

define <8 x i64> @test_mm512_unpacklo_epi8(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_mm512_unpacklo_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
  %arg1 = bitcast <8 x i64> %a1 to <64 x i8>
  %res0 = shufflevector <64 x i8> %arg0, <64 x i8> %arg1, <64 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119>
  %res1 = bitcast <64 x i8> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_mask_unpacklo_epi8(<8 x i64> %a0, i64* %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X86-LABEL: test_mm512_mask_unpacklo_epi8:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovq (%eax), %k1
; X86-NEXT:    vpunpcklbw {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_unpacklo_epi8:
; X64:       # %bb.0:
; X64-NEXT:    kmovq (%rdi), %k1
; X64-NEXT:    vpunpcklbw {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
; X64-NEXT:    retq
  %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
  %arg1 = bitcast i64* %a1 to <64 x i1>*
  %sel1 = load <64 x i1>, <64 x i1>* %arg1
  %arg2 = bitcast <8 x i64> %a2 to <64 x i8>
  %arg3 = bitcast <8 x i64> %a3 to <64 x i8>
  %res0 = shufflevector <64 x i8> %arg2, <64 x i8> %arg3, <64 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119>
  %res1 = select <64 x i1> %sel1, <64 x i8> %res0, <64 x i8> %arg0
  %res2 = bitcast <64 x i8> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_maskz_unpacklo_epi8(i64* %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_maskz_unpacklo_epi8:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovq (%eax), %k1
; X86-NEXT:    vpunpcklbw {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_unpacklo_epi8:
; X64:       # %bb.0:
; X64-NEXT:    kmovq (%rdi), %k1
; X64-NEXT:    vpunpcklbw {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
; X64-NEXT:    retq
  %arg0 = bitcast i64* %a0 to <64 x i1>*
  %sel0 = load <64 x i1>, <64 x i1>* %arg0
  %arg1 = bitcast <8 x i64> %a1 to <64 x i8>
  %arg2 = bitcast <8 x i64> %a2 to <64 x i8>
  %res0 = shufflevector <64 x i8> %arg1, <64 x i8> %arg2, <64 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119>
  %res1 = select <64 x i1> %sel0, <64 x i8> %res0, <64 x i8> zeroinitializer
  %res2 = bitcast <64 x i8> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_unpacklo_epi16(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_mm512_unpacklo_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpunpcklwd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <8 x i64> %a0 to <32 x i16>
  %arg1 = bitcast <8 x i64> %a1 to <32 x i16>
  %res0 = shufflevector <32 x i16> %arg0, <32 x i16> %arg1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59>
  %res1 = bitcast <32 x i16> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_mask_unpacklo_epi16(<8 x i64> %a0, i32 %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X86-LABEL: test_mm512_mask_unpacklo_epi16:
; X86:       # %bb.0:
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpunpcklwd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_unpacklo_epi16:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpunpcklwd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27]
; X64-NEXT:    retq
  %arg0 = bitcast <8 x i64> %a0 to <32 x i16>
  %arg1 = bitcast i32 %a1 to <32 x i1>
  %arg2 = bitcast <8 x i64> %a2 to <32 x i16>
  %arg3 = bitcast <8 x i64> %a3 to <32 x i16>
  %res0 = shufflevector <32 x i16> %arg2, <32 x i16> %arg3, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59>
  %res1 = select <32 x i1> %arg1, <32 x i16> %res0, <32 x i16> %arg0
  %res2 = bitcast <32 x i16> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_maskz_unpacklo_epi16(i32 %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_maskz_unpacklo_epi16:
; X86:       # %bb.0:
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpunpcklwd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_unpacklo_epi16:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpunpcklwd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
; X64-NEXT:    retq
  %arg0 = bitcast i32 %a0 to <32 x i1>
  %arg1 = bitcast <8 x i64> %a1 to <32 x i16>
  %arg2 = bitcast <8 x i64> %a2 to <32 x i16>
  %res0 = shufflevector <32 x i16> %arg1, <32 x i16> %arg2, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59>
  %res1 = select <32 x i1> %arg0, <32 x i16> %res0, <32 x i16> zeroinitializer
  %res2 = bitcast <32 x i16> %res1 to <8 x i64>
  ret <8 x i64> %res2
}
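
; A hedged C-level sketch of the test/testn pattern exercised next (intrinsics
; from <immintrin.h>): vptestmb sets a mask bit where (a & b) != 0, and
; vptestnmb where (a & b) == 0, matching the icmp ne/eq against zero in the IR.
;
;   __mmask64 m  = _mm512_test_epi8_mask(a, b);    // icmp ne (a & b), 0
;   __mmask64 mn = _mm512_testn_epi8_mask(a, b);   // icmp eq (a & b), 0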

define i64 @test_mm512_test_epi8_mask(<8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_test_epi8_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vptestmb %zmm0, %zmm1, %k0
; X86-NEXT:    kshiftrq $32, %k0, %k1
; X86-NEXT:    kmovd %k0, %eax
; X86-NEXT:    kmovd %k1, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_test_epi8_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vptestmb %zmm0, %zmm1, %k0
; X64-NEXT:    kmovq %k0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <64 x i8>
  %1 = icmp ne <64 x i8> %0, zeroinitializer
  %2 = bitcast <64 x i1> %1 to i64
  ret i64 %2
}

define i64 @test_mm512_mask_test_epi8_mask(i64 %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_test_epi8_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vptestmb %zmm0, %zmm1, %k0
; X86-NEXT:    kshiftrq $32, %k0, %k1
; X86-NEXT:    kmovd %k1, %edx
; X86-NEXT:    kmovd %k0, %eax
; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_test_epi8_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovq %rdi, %k1
; X64-NEXT:    vptestmb %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT:    kmovq %k0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <64 x i8>
  %1 = icmp ne <64 x i8> %0, zeroinitializer
  %2 = bitcast i64 %__U to <64 x i1>
  %3 = and <64 x i1> %1, %2
  %4 = bitcast <64 x i1> %3 to i64
  ret i64 %4
}

define i32 @test_mm512_test_epi16_mask(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_test_epi16_mask:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vptestmw %zmm0, %zmm1, %k0
; CHECK-NEXT:    kmovd %k0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <32 x i16>
  %1 = icmp ne <32 x i16> %0, zeroinitializer
  %2 = bitcast <32 x i1> %1 to i32
  ret i32 %2
}

define i32 @test_mm512_mask_test_epi16_mask(i32 %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_test_epi16_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vptestmw %zmm0, %zmm1, %k0 {%k1}
; X86-NEXT:    kmovd %k0, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_test_epi16_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vptestmw %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT:    kmovd %k0, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <32 x i16>
  %1 = icmp ne <32 x i16> %0, zeroinitializer
  %2 = bitcast i32 %__U to <32 x i1>
  %3 = and <32 x i1> %1, %2
  %4 = bitcast <32 x i1> %3 to i32
  ret i32 %4
}

define i64 @test_mm512_testn_epi8_mask(<8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_testn_epi8_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vptestnmb %zmm0, %zmm1, %k0
; X86-NEXT:    kshiftrq $32, %k0, %k1
; X86-NEXT:    kmovd %k0, %eax
; X86-NEXT:    kmovd %k1, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_testn_epi8_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vptestnmb %zmm0, %zmm1, %k0
; X64-NEXT:    kmovq %k0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <64 x i8>
  %1 = icmp eq <64 x i8> %0, zeroinitializer
  %2 = bitcast <64 x i1> %1 to i64
  ret i64 %2
}
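
; In the masked i64 variants, the X86 output computes the compare unmasked
; and applies the i64 mask argument with two 32-bit andl instructions on the
; eax/edx result halves rather than masking in a k-register.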

define i64 @test_mm512_mask_testn_epi8_mask(i64 %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_testn_epi8_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vptestnmb %zmm0, %zmm1, %k0
; X86-NEXT:    kshiftrq $32, %k0, %k1
; X86-NEXT:    kmovd %k1, %edx
; X86-NEXT:    kmovd %k0, %eax
; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_testn_epi8_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovq %rdi, %k1
; X64-NEXT:    vptestnmb %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT:    kmovq %k0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <64 x i8>
  %1 = icmp eq <64 x i8> %0, zeroinitializer
  %2 = bitcast i64 %__U to <64 x i1>
  %3 = and <64 x i1> %1, %2
  %4 = bitcast <64 x i1> %3 to i64
  ret i64 %4
}

define i32 @test_mm512_testn_epi16_mask(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_testn_epi16_mask:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vptestnmw %zmm0, %zmm1, %k0
; CHECK-NEXT:    kmovd %k0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <32 x i16>
  %1 = icmp eq <32 x i16> %0, zeroinitializer
  %2 = bitcast <32 x i1> %1 to i32
  ret i32 %2
}

define i32 @test_mm512_mask_testn_epi16_mask(i32 %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_testn_epi16_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vptestnmw %zmm0, %zmm1, %k0 {%k1}
; X86-NEXT:    kmovd %k0, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_testn_epi16_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vptestnmw %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT:    kmovd %k0, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <32 x i16>
  %1 = icmp eq <32 x i16> %0, zeroinitializer
  %2 = bitcast i32 %__U to <32 x i1>
  %3 = and <32 x i1> %1, %2
  %4 = bitcast <32 x i1> %3 to i32
  ret i32 %4
}

define <4 x i64> @test_mm512_cvtepi16_epi8(<8 x i64> %__A) {
; CHECK-LABEL: test_mm512_cvtepi16_epi8:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpmovwb %zmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <8 x i64> %__A to <32 x i16>
  %conv.i = trunc <32 x i16> %0 to <32 x i8>
  %1 = bitcast <32 x i8> %conv.i to <4 x i64>
  ret <4 x i64> %1
}

define <4 x i64> @test_mm512_mask_cvtepi16_epi8(<4 x i64> %__O, i32 %__M, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_cvtepi16_epi8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpmovwb %zmm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_cvtepi16_epi8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpmovwb %zmm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <32 x i16>
  %conv.i.i = trunc <32 x i16> %0 to <32 x i8>
  %1 = bitcast <4 x i64> %__O to <32 x i8>
  %2 = bitcast i32 %__M to <32 x i1>
  %3 = select <32 x i1> %2, <32 x i8> %conv.i.i, <32 x i8> %1
  %4 = bitcast <32 x i8> %3 to <4 x i64>
  ret <4 x i64> %4
}
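
; The cvtepi16_epi8 tests exercise vpmovwb: a plain trunc of <32 x i16> to
; <32 x i8> in the IR, with the masked forms selecting against the passthru
; or, as below, a zero vector.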

define <4 x i64> @test_mm512_maskz_cvtepi16_epi8(i32 %__M, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_cvtepi16_epi8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpmovwb %zmm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_cvtepi16_epi8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpmovwb %zmm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <32 x i16>
  %conv.i.i = trunc <32 x i16> %0 to <32 x i8>
  %1 = bitcast i32 %__M to <32 x i1>
  %2 = select <32 x i1> %1, <32 x i8> %conv.i.i, <32 x i8> zeroinitializer
  %3 = bitcast <32 x i8> %2 to <4 x i64>
  ret <4 x i64> %3
}

define <8 x i64> @test_mm512_mask2_permutex2var_epi16(<8 x i64> %__A, <8 x i64> %__I, i32 %__U, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask2_permutex2var_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1 {%k1}
; X86-NEXT:    vmovdqa64 %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask2_permutex2var_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1 {%k1}
; X64-NEXT:    vmovdqa64 %zmm1, %zmm0
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <32 x i16>
  %1 = bitcast <8 x i64> %__I to <32 x i16>
  %2 = bitcast <8 x i64> %__B to <32 x i16>
  %3 = tail call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %0, <32 x i16> %1, <32 x i16> %2)
  %4 = bitcast i32 %__U to <32 x i1>
  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %1
  %6 = bitcast <32 x i16> %5 to <8 x i64>
  ret <8 x i64> %6
}

define <8 x i64> @test_mm512_permutex2var_epi16(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_permutex2var_epi16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpermt2w %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <8 x i64> %__A to <32 x i16>
  %1 = bitcast <8 x i64> %__I to <32 x i16>
  %2 = bitcast <8 x i64> %__B to <32 x i16>
  %3 = tail call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %0, <32 x i16> %1, <32 x i16> %2)
  %4 = bitcast <32 x i16> %3 to <8 x i64>
  ret <8 x i64> %4
}

define <8 x i64> @test_mm512_mask_permutex2var_epi16(<8 x i64> %__A, i32 %__U, <8 x i64> %__I, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_permutex2var_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpermt2w %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_permutex2var_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpermt2w %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <32 x i16>
  %1 = bitcast <8 x i64> %__I to <32 x i16>
  %2 = bitcast <8 x i64> %__B to <32 x i16>
  %3 = tail call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %0, <32 x i16> %1, <32 x i16> %2)
  %4 = bitcast i32 %__U to <32 x i1>
  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %0
  %6 = bitcast <32 x i16> %5 to <8 x i64>
  ret <8 x i64> %6
}
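
; Which two-source permute gets selected depends on which operand the select
; merges over: keeping the index vector (the mask2 form above) yields
; vpermi2w, while keeping the first source (the mask and maskz forms) yields
; vpermt2w.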

define <8 x i64> @test_mm512_maskz_permutex2var_epi16(i32 %__U, <8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
; X86-LABEL: test_mm512_maskz_permutex2var_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpermt2w %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_permutex2var_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpermt2w %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <32 x i16>
  %1 = bitcast <8 x i64> %__I to <32 x i16>
  %2 = bitcast <8 x i64> %__B to <32 x i16>
  %3 = tail call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %0, <32 x i16> %1, <32 x i16> %2)
  %4 = bitcast i32 %__U to <32 x i1>
  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer
  %6 = bitcast <32 x i16> %5 to <8 x i64>
  ret <8 x i64> %6
}

declare <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>)