; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd -mattr=+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VLCD
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512CD

; Codegen tests for 256-bit vector count-leading-zeros (llvm.ctlz.*) lowering.
; Each element type (i64/i32/i16/i8 x 256-bit) is tested twice: the plain
; variant passes i1 0 (result defined for a zero input), the "*u" variant
; passes i1 -1 (zero input is undef). With i1 0 the scalarized AVX1 lowering
; guards each bsr with a cmov of the all-ones fallback; with i1 -1 that guard
; is dropped. The foldv* functions apply ctlz to constant vectors and check
; the result is fully constant-folded to a single constant-pool load.
; NOTE: the CHECK lines below are machine-generated; regenerate with
; utils/update_llc_test_checks.py rather than editing them by hand.

; ctlz.v4i64, zero-defined: AVX1 scalarizes via bsrq + cmov guard, AVX2 uses
; a nibble LUT (vpshufb) merge, AVX-512CD uses vplzcntq directly.
define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX1-LABEL: testv4i64:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpextrq $1, %xmm1, %rax
; AVX1-NEXT: bsrq %rax, %rax
; AVX1-NEXT: movl $127, %ecx
; AVX1-NEXT: cmoveq %rcx, %rax
; AVX1-NEXT: vmovq %rax, %xmm2
; AVX1-NEXT: vmovq %xmm1, %rax
; AVX1-NEXT: bsrq %rax, %rax
; AVX1-NEXT: cmoveq %rcx, %rax
; AVX1-NEXT: vmovq %rax, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63]
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: bsrq %rax, %rax
; AVX1-NEXT: cmoveq %rcx, %rax
; AVX1-NEXT: vmovq %rax, %xmm3
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: bsrq %rax, %rax
; AVX1-NEXT: cmoveq %rcx, %rax
; AVX1-NEXT: vmovq %rax, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv4i64:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1
; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2
; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm2
; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm2
; AVX2-NEXT: vpsrld $16, %ymm2, %ymm2
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm2
; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpcmpeqd %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1
; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VLCD-LABEL: testv4i64:
; AVX512VLCD: ## BB#0:
; AVX512VLCD-NEXT: vplzcntq %ymm0, %ymm0
; AVX512VLCD-NEXT: retq
;
; AVX512CD-LABEL: testv4i64:
; AVX512CD: ## BB#0:
; AVX512CD-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0
; AVX512CD-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512CD-NEXT: retq

  %out = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %in, i1 0)
  ret <4 x i64> %out
}

; ctlz.v4i64, zero-undef (i1 -1): same lowerings but AVX1 omits the
; movl $127 / cmoveq zero-input guard around each bsrq.
define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
; AVX1-LABEL: testv4i64u:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpextrq $1, %xmm1, %rax
; AVX1-NEXT: bsrq %rax, %rax
; AVX1-NEXT: vmovq %rax, %xmm2
; AVX1-NEXT: vmovq %xmm1, %rax
; AVX1-NEXT: bsrq %rax, %rax
; AVX1-NEXT: vmovq %rax, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63]
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: bsrq %rax, %rax
; AVX1-NEXT: vmovq %rax, %xmm3
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: bsrq %rax, %rax
; AVX1-NEXT: vmovq %rax, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv4i64u:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1
; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2
; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm2
; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm2
; AVX2-NEXT: vpsrld $16, %ymm2, %ymm2
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm2
; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpcmpeqd %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1
; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VLCD-LABEL: testv4i64u:
; AVX512VLCD: ## BB#0:
; AVX512VLCD-NEXT: vplzcntq %ymm0, %ymm0
; AVX512VLCD-NEXT: retq
;
; AVX512CD-LABEL: testv4i64u:
; AVX512CD: ## BB#0:
; AVX512CD-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0
; AVX512CD-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512CD-NEXT: retq

  %out = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %in, i1 -1)
  ret <4 x i64> %out
}

; ctlz.v8i32, zero-defined: AVX1 scalarizes via bsrl + cmovel of $63.
define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX1-LABEL: testv8i32:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpextrd $1, %xmm1, %eax
; AVX1-NEXT: bsrl %eax, %ecx
; AVX1-NEXT: movl $63, %eax
; AVX1-NEXT: cmovel %eax, %ecx
; AVX1-NEXT: vmovd %xmm1, %edx
; AVX1-NEXT: bsrl %edx, %edx
; AVX1-NEXT: cmovel %eax, %edx
; AVX1-NEXT: vmovd %edx, %xmm2
; AVX1-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
; AVX1-NEXT: vpextrd $2, %xmm1, %ecx
; AVX1-NEXT: bsrl %ecx, %ecx
; AVX1-NEXT: cmovel %eax, %ecx
; AVX1-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2
; AVX1-NEXT: vpextrd $3, %xmm1, %ecx
; AVX1-NEXT: bsrl %ecx, %ecx
; AVX1-NEXT: cmovel %eax, %ecx
; AVX1-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [31,31,31,31]
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpextrd $1, %xmm0, %ecx
; AVX1-NEXT: bsrl %ecx, %ecx
; AVX1-NEXT: cmovel %eax, %ecx
; AVX1-NEXT: vmovd %xmm0, %edx
; AVX1-NEXT: bsrl %edx, %edx
; AVX1-NEXT: cmovel %eax, %edx
; AVX1-NEXT: vmovd %edx, %xmm3
; AVX1-NEXT: vpinsrd $1, %ecx, %xmm3, %xmm3
; AVX1-NEXT: vpextrd $2, %xmm0, %ecx
; AVX1-NEXT: bsrl %ecx, %ecx
; AVX1-NEXT: cmovel %eax, %ecx
; AVX1-NEXT: vpinsrd $2, %ecx, %xmm3, %xmm3
; AVX1-NEXT: vpextrd $3, %xmm0, %ecx
; AVX1-NEXT: bsrl %ecx, %ecx
; AVX1-NEXT: cmovel %eax, %ecx
; AVX1-NEXT: vpinsrd $3, %ecx, %xmm3, %xmm0
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv8i32:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1
; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2
; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm2
; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VLCD-LABEL: testv8i32:
; AVX512VLCD: ## BB#0:
; AVX512VLCD-NEXT: vplzcntd %ymm0, %ymm0
; AVX512VLCD-NEXT: retq
;
; AVX512CD-LABEL: testv8i32:
; AVX512CD: ## BB#0:
; AVX512CD-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
; AVX512CD-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512CD-NEXT: retq

  %out = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %in, i1 0)
  ret <8 x i32> %out
}

; ctlz.v8i32, zero-undef: bsrl without the cmovel fallback.
define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
; AVX1-LABEL: testv8i32u:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpextrd $1, %xmm1, %eax
; AVX1-NEXT: bsrl %eax, %eax
; AVX1-NEXT: vmovd %xmm1, %ecx
; AVX1-NEXT: bsrl %ecx, %ecx
; AVX1-NEXT: vmovd %ecx, %xmm2
; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
; AVX1-NEXT: vpextrd $2, %xmm1, %eax
; AVX1-NEXT: bsrl %eax, %eax
; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
; AVX1-NEXT: vpextrd $3, %xmm1, %eax
; AVX1-NEXT: bsrl %eax, %eax
; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [31,31,31,31]
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpextrd $1, %xmm0, %eax
; AVX1-NEXT: bsrl %eax, %eax
; AVX1-NEXT: vmovd %xmm0, %ecx
; AVX1-NEXT: bsrl %ecx, %ecx
; AVX1-NEXT: vmovd %ecx, %xmm3
; AVX1-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
; AVX1-NEXT: vpextrd $2, %xmm0, %eax
; AVX1-NEXT: bsrl %eax, %eax
; AVX1-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
; AVX1-NEXT: vpextrd $3, %xmm0, %eax
; AVX1-NEXT: bsrl %eax, %eax
; AVX1-NEXT: vpinsrd $3, %eax, %xmm3, %xmm0
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv8i32u:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1
; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2
; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm2
; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VLCD-LABEL: testv8i32u:
; AVX512VLCD: ## BB#0:
; AVX512VLCD-NEXT: vplzcntd %ymm0, %ymm0
; AVX512VLCD-NEXT: retq
;
; AVX512CD-LABEL: testv8i32u:
; AVX512CD: ## BB#0:
; AVX512CD-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
; AVX512CD-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512CD-NEXT: retq

  %out = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %in, i1 -1)
  ret <8 x i32> %out
}

; ctlz.v16i16: no scalar bsr form for i16 lanes, so even AVX1 uses the
; per-128-bit-lane nibble-LUT expansion; AVX-512 widens to v16i32, uses
; vplzcntd, then narrows and subtracts the 16-bit width delta.
define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX1-LABEL: testv16i16:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm5
; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm5
; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6
; AVX1-NEXT: vpcmpeqb %xmm6, %xmm5, %xmm7
; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpaddb %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpcmpeqb %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpaddw %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm5
; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpcmpeqb %xmm6, %xmm2, %xmm5
; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpcmpeqb %xmm6, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpaddw %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv16i16:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1
; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT: vpaddw %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: testv16i16:
; AVX512: ## BB#0:
; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512-NEXT: vplzcntd %zmm0, %zmm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
  %out = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %in, i1 0)
  ret <16 x i16> %out
}

; ctlz.v16i16, zero-undef: currently same lowering as the defined case.
define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; AVX1-LABEL: testv16i16u:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm5
; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm5
; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6
; AVX1-NEXT: vpcmpeqb %xmm6, %xmm5, %xmm7
; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpaddb %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpcmpeqb %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpaddw %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm5
; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpcmpeqb %xmm6, %xmm2, %xmm5
; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpcmpeqb %xmm6, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpaddw %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv16i16u:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1
; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT: vpaddw %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: testv16i16u:
; AVX512: ## BB#0:
; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512-NEXT: vplzcntd %zmm0, %zmm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
  %out = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %in, i1 -1)
  ret <16 x i16> %out
}

; ctlz.v32i8: the byte case needs only the two-nibble LUT merge (no wider
; carry steps); AVX-512CD widens each 128-bit half to v16i32 for vplzcntd
; and subtracts 24 (32-bit lzcnt minus 8-bit width).
define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX1-LABEL: testv32i8:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpcmpeqb %xmm5, %xmm1, %xmm6
; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqb %xmm5, %xmm0, %xmm2
; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv32i8:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VLCD-LABEL: testv32i8:
; AVX512VLCD: ## BB#0:
; AVX512VLCD-NEXT: vextracti32x4 $1, %ymm0, %xmm1
; AVX512VLCD-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512VLCD-NEXT: vplzcntd %zmm1, %zmm1
; AVX512VLCD-NEXT: vpmovdb %zmm1, %xmm1
; AVX512VLCD-NEXT: vmovdqa64 {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
; AVX512VLCD-NEXT: vpsubb %xmm2, %xmm1, %xmm1
; AVX512VLCD-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VLCD-NEXT: vplzcntd %zmm0, %zmm0
; AVX512VLCD-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VLCD-NEXT: vpsubb %xmm2, %xmm0, %xmm0
; AVX512VLCD-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
; AVX512VLCD-NEXT: retq
;
; AVX512CD-LABEL: testv32i8:
; AVX512CD: ## BB#0:
; AVX512CD-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1
; AVX512CD-NEXT: vpmovdb %zmm1, %xmm1
; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
; AVX512CD-NEXT: vpsubb %xmm2, %xmm1, %xmm1
; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
; AVX512CD-NEXT: vpmovdb %zmm0, %xmm0
; AVX512CD-NEXT: vpsubb %xmm2, %xmm0, %xmm0
; AVX512CD-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512CD-NEXT: retq
  %out = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %in, i1 0)
  ret <32 x i8> %out
}

; ctlz.v32i8, zero-undef: currently same lowering as the defined case.
define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX1-LABEL: testv32i8u:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpcmpeqb %xmm5, %xmm1, %xmm6
; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqb %xmm5, %xmm0, %xmm2
; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv32i8u:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VLCD-LABEL: testv32i8u:
; AVX512VLCD: ## BB#0:
; AVX512VLCD-NEXT: vextracti32x4 $1, %ymm0, %xmm1
; AVX512VLCD-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512VLCD-NEXT: vplzcntd %zmm1, %zmm1
; AVX512VLCD-NEXT: vpmovdb %zmm1, %xmm1
; AVX512VLCD-NEXT: vmovdqa64 {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
; AVX512VLCD-NEXT: vpsubb %xmm2, %xmm1, %xmm1
; AVX512VLCD-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VLCD-NEXT: vplzcntd %zmm0, %zmm0
; AVX512VLCD-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VLCD-NEXT: vpsubb %xmm2, %xmm0, %xmm0
; AVX512VLCD-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
; AVX512VLCD-NEXT: retq
;
; AVX512CD-LABEL: testv32i8u:
; AVX512CD: ## BB#0:
; AVX512CD-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1
; AVX512CD-NEXT: vpmovdb %zmm1, %xmm1
; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
; AVX512CD-NEXT: vpsubb %xmm2, %xmm1, %xmm1
; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
; AVX512CD-NEXT: vpmovdb %zmm0, %xmm0
; AVX512CD-NEXT: vpsubb %xmm2, %xmm0, %xmm0
; AVX512CD-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512CD-NEXT: retq
  %out = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %in, i1 -1)
  ret <32 x i8> %out
}

; The fold* tests below check that ctlz of constant vectors is folded to a
; single constant load for every element width (defined and undef variants).
define <4 x i64> @foldv4i64() nounwind {
; AVX-LABEL: foldv4i64:
; AVX: # BB#0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56]
; AVX-NEXT: retq
;
; AVX512VLCD-LABEL: foldv4i64:
; AVX512VLCD: ## BB#0:
; AVX512VLCD-NEXT: vmovdqa64 {{.*#+}} ymm0 = [55,0,64,56]
; AVX512VLCD-NEXT: retq
;
; AVX512CD-LABEL: foldv4i64:
; AVX512CD: ## BB#0:
; AVX512CD-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56]
; AVX512CD-NEXT: retq
  %out = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 0)
  ret <4 x i64> %out
}

define <4 x i64> @foldv4i64u() nounwind {
; AVX-LABEL: foldv4i64u:
; AVX: # BB#0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56]
; AVX-NEXT: retq
;
; AVX512VLCD-LABEL: foldv4i64u:
; AVX512VLCD: ## BB#0:
; AVX512VLCD-NEXT: vmovdqa64 {{.*#+}} ymm0 = [55,0,64,56]
; AVX512VLCD-NEXT: retq
;
; AVX512CD-LABEL: foldv4i64u:
; AVX512CD: ## BB#0:
; AVX512CD-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56]
; AVX512CD-NEXT: retq
  %out = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 -1)
  ret <4 x i64> %out
}

define <8 x i32> @foldv8i32() nounwind {
; AVX-LABEL: foldv8i32:
; AVX: # BB#0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25]
; AVX-NEXT: retq
;
; AVX512VLCD-LABEL: foldv8i32:
; AVX512VLCD: ## BB#0:
; AVX512VLCD-NEXT: vmovdqa32 {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25]
; AVX512VLCD-NEXT: retq
;
; AVX512CD-LABEL: foldv8i32:
; AVX512CD: ## BB#0:
; AVX512CD-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25]
; AVX512CD-NEXT: retq
  %out = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>, i1 0)
  ret <8 x i32> %out
}

define <8 x i32> @foldv8i32u() nounwind {
; AVX-LABEL: foldv8i32u:
; AVX: # BB#0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25]
; AVX-NEXT: retq
;
; AVX512VLCD-LABEL: foldv8i32u:
; AVX512VLCD: ## BB#0:
; AVX512VLCD-NEXT: vmovdqa32 {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25]
; AVX512VLCD-NEXT: retq
;
; AVX512CD-LABEL: foldv8i32u:
; AVX512CD: ## BB#0:
; AVX512CD-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25]
; AVX512CD-NEXT: retq
  %out = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>, i1 -1)
  ret <8 x i32> %out
}

define <16 x i16> @foldv16i16() nounwind {
; AVX-LABEL: foldv16i16:
; AVX: # BB#0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10]
; AVX-NEXT: retq
;
; AVX512VLCD-LABEL: foldv16i16:
; AVX512VLCD: ## BB#0:
; AVX512VLCD-NEXT: vmovdqa64 {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10]
; AVX512VLCD-NEXT: retq
;
; AVX512CD-LABEL: foldv16i16:
; AVX512CD: ## BB#0:
; AVX512CD-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10]
; AVX512CD-NEXT: retq
  %out = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>, i1 0)
  ret <16 x i16> %out
}

define <16 x i16> @foldv16i16u() nounwind {
; AVX-LABEL: foldv16i16u:
; AVX: # BB#0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10]
; AVX-NEXT: retq
;
; AVX512VLCD-LABEL: foldv16i16u:
; AVX512VLCD: ## BB#0:
; AVX512VLCD-NEXT: vmovdqa64 {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10]
; AVX512VLCD-NEXT: retq
;
; AVX512CD-LABEL: foldv16i16u:
; AVX512CD: ## BB#0:
; AVX512CD-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10]
; AVX512CD-NEXT: retq
  %out = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>, i1 -1)
  ret <16 x i16> %out
}

define <32 x i8> @foldv32i8() nounwind {
; AVX-LABEL: foldv32i8:
; AVX: # BB#0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1]
; AVX-NEXT: retq
;
; AVX512VLCD-LABEL: foldv32i8:
; AVX512VLCD: ## BB#0:
; AVX512VLCD-NEXT: vmovdqa64 {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1]
; AVX512VLCD-NEXT: retq
;
; AVX512CD-LABEL: foldv32i8:
; AVX512CD: ## BB#0:
; AVX512CD-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1]
; AVX512CD-NEXT: retq
  %out = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>, i1 0)
  ret <32 x i8> %out
}

define <32 x i8> @foldv32i8u() nounwind {
; AVX-LABEL: foldv32i8u:
; AVX: # BB#0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1]
; AVX-NEXT: retq
;
; AVX512VLCD-LABEL: foldv32i8u:
; AVX512VLCD: ## BB#0:
; AVX512VLCD-NEXT: vmovdqa64 {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1]
; AVX512VLCD-NEXT: retq
;
; AVX512CD-LABEL: foldv32i8u:
; AVX512CD: ## BB#0:
; AVX512CD-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1]
; AVX512CD-NEXT: retq
  %out = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>, i1 -1)
  ret <32 x i8> %out
}

declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>, i1)
declare <8 x i32> @llvm.ctlz.v8i32(<8 x i32>, i1)
declare <16 x i16> @llvm.ctlz.v16i16(<16 x i16>, i1)
declare <32 x i8> @llvm.ctlz.v32i8(<32 x i8>, i1)