; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE,SSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=NOBW,AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=NOBW,AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=NOBW,AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+avx512dq | FileCheck %s --check-prefix=AVX512VLBWDQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512cd,+avx512vl | FileCheck %s --check-prefixes=NOBW,AVX512,AVX512VLCD
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512cd | FileCheck %s --check-prefixes=NOBW,AVX512,AVX512CD
;
; Just one 32-bit run to make sure we do reasonable things for i64 lzcnt.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32-SSE

; testv2i64 - ctlz of a <2 x i64> vector with the second intrinsic operand set
; to i1 0 (per the LangRef description of llvm.ctlz, a zero element then has a
; defined result).
; What the checks below pin down for each run configuration:
;  * SSE2 and SSE3 runs - propagate the highest set bit downwards with
;    psrlq/por pairs (shift amounts 1 through 32), invert the value, then apply
;    what looks like the usual per-byte bit-count sequence, finished by psadbw.
;  * SSSE3, SSE4.1, the AVX-prefixed runs, AVX512VLBWDQ and the i686 run - use
;    pshufb lookups into the table [4,3,2,2,1,1,1,1,0,...] for the low and high
;    nibble of each byte, then merge partial counts at 8, 16, 32 and 64 bits.
;  * avx512cd runs - a single vplzcntq; without avx512vl the checks show kill
;    annotations widening xmm0 to zmm0 and a trailing vzeroupper.
define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; SSE2-LABEL: testv2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlq $1, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrlq $2, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlq $4, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrlq $8, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlq $16, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrlq $32, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psubb %xmm0, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: psrlw $2, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: paddb %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psrlw $4, %xmm2
; SSE2-NEXT: paddb %xmm1, %xmm2
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: psadbw %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv2i64:
; SSE3: # %bb.0:
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlq $1, %xmm1
; SSE3-NEXT: por %xmm0, %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: psrlq $2, %xmm0
; SSE3-NEXT: por %xmm1, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlq $4, %xmm1
; SSE3-NEXT: por %xmm0, %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: psrlq $8, %xmm0
; SSE3-NEXT: por %xmm1, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlq $16, %xmm1
; SSE3-NEXT: por %xmm0, %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: psrlq $32, %xmm0
; SSE3-NEXT: por %xmm1, %xmm0
; SSE3-NEXT: pcmpeqd %xmm1, %xmm1
; SSE3-NEXT: pxor %xmm0, %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: psrlw $1, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
; SSE3-NEXT: psubb %xmm0, %xmm1
; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm1, %xmm2
; SSE3-NEXT: pand %xmm0, %xmm2
; SSE3-NEXT: psrlw $2, %xmm1
; SSE3-NEXT: pand %xmm0, %xmm1
; SSE3-NEXT: paddb %xmm2, %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm2
; SSE3-NEXT: psrlw $4, %xmm2
; SSE3-NEXT: paddb %xmm1, %xmm2
; SSE3-NEXT: pand {{.*}}(%rip), %xmm2
; SSE3-NEXT: pxor %xmm0, %xmm0
; SSE3-NEXT: psadbw %xmm2, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv2i64:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSSE3-NEXT: movdqa %xmm2, %xmm3
; SSSE3-NEXT: pshufb %xmm0, %xmm3
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: psrlw $4, %xmm1
; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSSE3-NEXT: pxor %xmm4, %xmm4
; SSSE3-NEXT: pshufb %xmm1, %xmm2
; SSSE3-NEXT: pcmpeqb %xmm4, %xmm1
; SSSE3-NEXT: pand %xmm3, %xmm1
; SSSE3-NEXT: paddb %xmm2, %xmm1
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pcmpeqb %xmm4, %xmm2
; SSSE3-NEXT: psrlw $8, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm2
; SSSE3-NEXT: psrlw $8, %xmm1
; SSSE3-NEXT: paddw %xmm2, %xmm1
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pcmpeqw %xmm4, %xmm2
; SSSE3-NEXT: psrld $16, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm2
; SSSE3-NEXT: psrld $16, %xmm1
; SSSE3-NEXT: paddd %xmm2, %xmm1
; SSSE3-NEXT: pcmpeqd %xmm4, %xmm0
; SSSE3-NEXT: psrlq $32, %xmm0
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: psrlq $32, %xmm1
; SSSE3-NEXT: paddq %xmm0, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv2i64:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: pshufb %xmm0, %xmm3
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrlw $4, %xmm1
; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
; SSE41-NEXT: pxor %xmm4, %xmm4
; SSE41-NEXT: pshufb %xmm1, %xmm2
; SSE41-NEXT: pcmpeqb %xmm4, %xmm1
; SSE41-NEXT: pand %xmm3, %xmm1
; SSE41-NEXT: paddb %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pcmpeqb %xmm4, %xmm2
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: pand %xmm1, %xmm2
; SSE41-NEXT: psrlw $8, %xmm1
; SSE41-NEXT: paddw %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pcmpeqw %xmm4, %xmm2
; SSE41-NEXT: psrld $16, %xmm2
; SSE41-NEXT: pand %xmm1, %xmm2
; SSE41-NEXT: psrld $16, %xmm1
; SSE41-NEXT: paddd %xmm2, %xmm1
; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
; SSE41-NEXT: psrlq $32, %xmm0
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: psrlq $32, %xmm1
; SSE41-NEXT: paddq %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: testv2i64:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5
; AVX-NEXT: vpand %xmm5, %xmm2, %xmm2
; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; AVX-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2
; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX-NEXT: vpand %xmm2, %xmm1, %xmm2
; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm2
; AVX-NEXT: vpsrld $16, %xmm2, %xmm2
; AVX-NEXT: vpand %xmm2, %xmm1, %xmm2
; AVX-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpcmpeqd %xmm4, %xmm0, %xmm0
; AVX-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpsrlq $32, %xmm1, %xmm1
; AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512VLBWDQ-LABEL: testv2i64:
; AVX512VLBWDQ: # %bb.0:
; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX512VLBWDQ-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5
; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2
; AVX512VLBWDQ-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512VLBWDQ-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2
; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX512VLBWDQ-NEXT: vpand %xmm2, %xmm1, %xmm2
; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX512VLBWDQ-NEXT: vpaddw %xmm2, %xmm1, %xmm1
; AVX512VLBWDQ-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm2
; AVX512VLBWDQ-NEXT: vpsrld $16, %xmm2, %xmm2
; AVX512VLBWDQ-NEXT: vpand %xmm2, %xmm1, %xmm2
; AVX512VLBWDQ-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX512VLBWDQ-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX512VLBWDQ-NEXT: vpcmpeqd %xmm4, %xmm0, %xmm0
; AVX512VLBWDQ-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX512VLBWDQ-NEXT: vpand %xmm0, %xmm1, %xmm0
; AVX512VLBWDQ-NEXT: vpsrlq $32, %xmm1, %xmm1
; AVX512VLBWDQ-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX512VLBWDQ-NEXT: retq
;
; AVX512VLCD-LABEL: testv2i64:
; AVX512VLCD: # %bb.0:
; AVX512VLCD-NEXT: vplzcntq %xmm0, %xmm0
; AVX512VLCD-NEXT: retq
;
; AVX512CD-LABEL: testv2i64:
; AVX512CD: # %bb.0:
; AVX512CD-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0
; AVX512CD-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512CD-NEXT: vzeroupper
; AVX512CD-NEXT: retq
;
; X32-SSE-LABEL: testv2i64:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; X32-SSE-NEXT: movdqa %xmm2, %xmm3
; X32-SSE-NEXT: pshufb %xmm0, %xmm3
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrlw $4, %xmm1
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: pxor %xmm4, %xmm4
; X32-SSE-NEXT: pshufb %xmm1, %xmm2
; X32-SSE-NEXT: pcmpeqb %xmm4, %xmm1
; X32-SSE-NEXT: pand %xmm3, %xmm1
; X32-SSE-NEXT: paddb %xmm2, %xmm1
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: pcmpeqb %xmm4, %xmm2
; X32-SSE-NEXT: psrlw $8, %xmm2
; X32-SSE-NEXT: pand %xmm1, %xmm2
; X32-SSE-NEXT: psrlw $8, %xmm1
; X32-SSE-NEXT: paddw %xmm2, %xmm1
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: pcmpeqw %xmm4, %xmm2
; X32-SSE-NEXT: psrld $16, %xmm2
; X32-SSE-NEXT: pand %xmm1, %xmm2
; X32-SSE-NEXT: psrld $16, %xmm1
; X32-SSE-NEXT: paddd %xmm2, %xmm1
; X32-SSE-NEXT: pcmpeqd %xmm4, %xmm0
; X32-SSE-NEXT: psrlq $32, %xmm0
; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: psrlq $32, %xmm1
; X32-SSE-NEXT: paddq %xmm0, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm0
; X32-SSE-NEXT: retl

  %out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %in, i1 0)
  ret <2 x i64> %out
}

; testv2i64u - same operation as testv2i64, but the second intrinsic operand is
; i1 -1 (true), so per the LangRef the result for a zero element is not
; defined. The expected instruction sequences below appear to be the same as
; those checked for testv2i64 in every run configuration.
define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
; SSE2-LABEL: testv2i64u:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlq $1, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrlq $2, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlq $4, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrlq $8, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlq $16, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrlq $32, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psubb %xmm0, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: psrlw $2, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: paddb %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psrlw $4, %xmm2
; SSE2-NEXT: paddb %xmm1, %xmm2
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: psadbw %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv2i64u:
; SSE3: # %bb.0:
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlq $1, %xmm1
; SSE3-NEXT: por %xmm0, %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: psrlq $2, %xmm0
; SSE3-NEXT: por %xmm1, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlq $4, %xmm1
; SSE3-NEXT: por %xmm0, %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: psrlq $8, %xmm0
; SSE3-NEXT: por %xmm1, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlq $16, %xmm1
; SSE3-NEXT: por %xmm0, %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: psrlq $32, %xmm0
; SSE3-NEXT: por %xmm1, %xmm0
; SSE3-NEXT: pcmpeqd %xmm1, %xmm1
; SSE3-NEXT: pxor %xmm0, %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: psrlw $1, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
; SSE3-NEXT: psubb %xmm0, %xmm1
; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm1, %xmm2
; SSE3-NEXT: pand %xmm0, %xmm2
; SSE3-NEXT: psrlw $2, %xmm1
; SSE3-NEXT: pand %xmm0, %xmm1
; SSE3-NEXT: paddb %xmm2, %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm2
; SSE3-NEXT: psrlw $4, %xmm2
; SSE3-NEXT: paddb %xmm1, %xmm2
; SSE3-NEXT: pand {{.*}}(%rip), %xmm2
; SSE3-NEXT: pxor %xmm0, %xmm0
; SSE3-NEXT: psadbw %xmm2, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv2i64u:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSSE3-NEXT: movdqa %xmm2, %xmm3
; SSSE3-NEXT: pshufb %xmm0, %xmm3
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: psrlw $4, %xmm1
; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSSE3-NEXT: pxor %xmm4, %xmm4
; SSSE3-NEXT: pshufb %xmm1, %xmm2
; SSSE3-NEXT: pcmpeqb %xmm4, %xmm1
; SSSE3-NEXT: pand %xmm3, %xmm1
; SSSE3-NEXT: paddb %xmm2, %xmm1
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pcmpeqb %xmm4, %xmm2
; SSSE3-NEXT: psrlw $8, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm2
; SSSE3-NEXT: psrlw $8, %xmm1
; SSSE3-NEXT: paddw %xmm2, %xmm1
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pcmpeqw %xmm4, %xmm2
; SSSE3-NEXT: psrld $16, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm2
; SSSE3-NEXT: psrld $16, %xmm1
; SSSE3-NEXT: paddd %xmm2, %xmm1
; SSSE3-NEXT: pcmpeqd %xmm4, %xmm0
; SSSE3-NEXT: psrlq $32, %xmm0
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: psrlq $32, %xmm1
; SSSE3-NEXT: paddq %xmm0, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv2i64u:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: pshufb %xmm0, %xmm3
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrlw $4, %xmm1
; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
; SSE41-NEXT: pxor %xmm4, %xmm4
; SSE41-NEXT: pshufb %xmm1, %xmm2
; SSE41-NEXT: pcmpeqb %xmm4, %xmm1
; SSE41-NEXT: pand %xmm3, %xmm1
; SSE41-NEXT: paddb %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pcmpeqb %xmm4, %xmm2
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: pand %xmm1, %xmm2
; SSE41-NEXT: psrlw $8, %xmm1
; SSE41-NEXT: paddw %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pcmpeqw %xmm4, %xmm2
; SSE41-NEXT: psrld $16, %xmm2
; SSE41-NEXT: pand %xmm1, %xmm2
; SSE41-NEXT: psrld $16, %xmm1
; SSE41-NEXT: paddd %xmm2, %xmm1
; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
; SSE41-NEXT: psrlq $32, %xmm0
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: psrlq $32, %xmm1
; SSE41-NEXT: paddq %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: testv2i64u:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5
; AVX-NEXT: vpand %xmm5, %xmm2, %xmm2
; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; AVX-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2
; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX-NEXT: vpand %xmm2, %xmm1, %xmm2
; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm2
; AVX-NEXT: vpsrld $16, %xmm2, %xmm2
; AVX-NEXT: vpand %xmm2, %xmm1, %xmm2
; AVX-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpcmpeqd %xmm4, %xmm0, %xmm0
; AVX-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpsrlq $32, %xmm1, %xmm1
; AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512VLBWDQ-LABEL: testv2i64u:
; AVX512VLBWDQ: # %bb.0:
; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX512VLBWDQ-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5
; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2
; AVX512VLBWDQ-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512VLBWDQ-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2
; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX512VLBWDQ-NEXT: vpand %xmm2, %xmm1, %xmm2
; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX512VLBWDQ-NEXT: vpaddw %xmm2, %xmm1, %xmm1
; AVX512VLBWDQ-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm2
; AVX512VLBWDQ-NEXT: vpsrld $16, %xmm2, %xmm2
; AVX512VLBWDQ-NEXT: vpand %xmm2, %xmm1, %xmm2
; AVX512VLBWDQ-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX512VLBWDQ-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX512VLBWDQ-NEXT: vpcmpeqd %xmm4, %xmm0, %xmm0
; AVX512VLBWDQ-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX512VLBWDQ-NEXT: vpand %xmm0, %xmm1, %xmm0
; AVX512VLBWDQ-NEXT: vpsrlq $32, %xmm1, %xmm1
; AVX512VLBWDQ-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX512VLBWDQ-NEXT: retq
;
; AVX512VLCD-LABEL: testv2i64u:
; AVX512VLCD: # %bb.0:
; AVX512VLCD-NEXT: vplzcntq %xmm0, %xmm0
; AVX512VLCD-NEXT: retq
;
; AVX512CD-LABEL: testv2i64u:
; AVX512CD: # %bb.0:
; AVX512CD-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0
; AVX512CD-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512CD-NEXT: vzeroupper
; AVX512CD-NEXT: retq
;
; X32-SSE-LABEL: testv2i64u:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; X32-SSE-NEXT: movdqa %xmm2, %xmm3
; X32-SSE-NEXT: pshufb %xmm0, %xmm3
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrlw $4, %xmm1
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: pxor %xmm4, %xmm4
; X32-SSE-NEXT: pshufb %xmm1, %xmm2
; X32-SSE-NEXT: pcmpeqb %xmm4, %xmm1
; X32-SSE-NEXT: pand %xmm3, %xmm1
; X32-SSE-NEXT: paddb %xmm2, %xmm1
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: pcmpeqb %xmm4, %xmm2
; X32-SSE-NEXT: psrlw $8, %xmm2
; X32-SSE-NEXT: pand %xmm1, %xmm2
; X32-SSE-NEXT: psrlw $8, %xmm1
; X32-SSE-NEXT: paddw %xmm2, %xmm1
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: pcmpeqw %xmm4, %xmm2
; X32-SSE-NEXT: psrld $16, %xmm2
; X32-SSE-NEXT: pand %xmm1, %xmm2
; X32-SSE-NEXT: psrld $16, %xmm1
; X32-SSE-NEXT: paddd %xmm2, %xmm1
; X32-SSE-NEXT: pcmpeqd %xmm4, %xmm0
; X32-SSE-NEXT: psrlq $32, %xmm0
; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: psrlq $32, %xmm1
; X32-SSE-NEXT: paddq %xmm0, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm0
; X32-SSE-NEXT: retl

  %out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %in, i1 -1)
  ret <2 x i64> %out
}

; testv4i32 - ctlz of a <4 x i32> vector with the second intrinsic operand set
; to i1 0. The checked strategies mirror the i64 tests at 32-bit element width:
;  * SSE2 and SSE3 runs - psrld/por pairs with shift amounts 1 through 16, then
;    the bit-count sequence; the per-element sums are formed with
;    punpckhdq/punpckldq against zero, psadbw, and a final packuswb.
;  * pshufb-based runs - the same table lookup, merging stops at 32 bits.
;  * avx512cd runs - a single vplzcntd (on zmm0 when avx512vl is not enabled).
define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; SSE2-LABEL: testv4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $1, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrld $2, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $4, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrld $8, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $16, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psubb %xmm0, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: psrlw $2, %xmm2
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: paddb %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT: psadbw %xmm1, %xmm2
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: psadbw %xmm1, %xmm0
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv4i32:
; SSE3: # %bb.0:
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrld $1, %xmm1
; SSE3-NEXT: por %xmm0, %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: psrld $2, %xmm0
; SSE3-NEXT: por %xmm1, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrld $4, %xmm1
; SSE3-NEXT: por %xmm0, %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: psrld $8, %xmm0
; SSE3-NEXT: por %xmm1, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrld $16, %xmm1
; SSE3-NEXT: por %xmm0, %xmm1
; SSE3-NEXT: pcmpeqd %xmm2, %xmm2
; SSE3-NEXT: pxor %xmm1, %xmm2
; SSE3-NEXT: movdqa %xmm2, %xmm0
; SSE3-NEXT: psrlw $1, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
; SSE3-NEXT: psubb %xmm0, %xmm2
; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm2, %xmm1
; SSE3-NEXT: pand %xmm0, %xmm1
; SSE3-NEXT: psrlw $2, %xmm2
; SSE3-NEXT: pand %xmm0, %xmm2
; SSE3-NEXT: paddb %xmm1, %xmm2
; SSE3-NEXT: movdqa %xmm2, %xmm0
; SSE3-NEXT: psrlw $4, %xmm0
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
; SSE3-NEXT: pxor %xmm1, %xmm1
; SSE3-NEXT: movdqa %xmm0, %xmm2
; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE3-NEXT: psadbw %xmm1, %xmm2
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE3-NEXT: psadbw %xmm1, %xmm0
; SSE3-NEXT: packuswb %xmm2, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv4i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSSE3-NEXT: movdqa %xmm2, %xmm3
; SSSE3-NEXT: pshufb %xmm0, %xmm3
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: psrlw $4, %xmm1
; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSSE3-NEXT: pxor %xmm4, %xmm4
; SSSE3-NEXT: pshufb %xmm1, %xmm2
; SSSE3-NEXT: pcmpeqb %xmm4, %xmm1
; SSSE3-NEXT: pand %xmm3, %xmm1
; SSSE3-NEXT: paddb %xmm2, %xmm1
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pcmpeqb %xmm4, %xmm2
; SSSE3-NEXT: psrlw $8, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm2
; SSSE3-NEXT: psrlw $8, %xmm1
; SSSE3-NEXT: paddw %xmm2, %xmm1
; SSSE3-NEXT: pcmpeqw %xmm4, %xmm0
; SSSE3-NEXT: psrld $16, %xmm0
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: psrld $16, %xmm1
; SSSE3-NEXT: paddd %xmm0, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv4i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: pshufb %xmm0, %xmm3
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrlw $4, %xmm1
; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
; SSE41-NEXT: pxor %xmm4, %xmm4
; SSE41-NEXT: pshufb %xmm1, %xmm2
; SSE41-NEXT: pcmpeqb %xmm4, %xmm1
; SSE41-NEXT: pand %xmm3, %xmm1
; SSE41-NEXT: paddb %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pcmpeqb %xmm4, %xmm2
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: pand %xmm1, %xmm2
; SSE41-NEXT: psrlw $8, %xmm1
; SSE41-NEXT: paddw %xmm2, %xmm1
; SSE41-NEXT: pcmpeqw %xmm4, %xmm0
; SSE41-NEXT: psrld $16, %xmm0
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: psrld $16, %xmm1
; SSE41-NEXT: paddd %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: testv4i32:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5
; AVX-NEXT: vpand %xmm5, %xmm2, %xmm2
; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; AVX-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2
; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX-NEXT: vpand %xmm2, %xmm1, %xmm2
; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512VLBWDQ-LABEL: testv4i32:
; AVX512VLBWDQ: # %bb.0:
; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX512VLBWDQ-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5
; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2
; AVX512VLBWDQ-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512VLBWDQ-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2
; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX512VLBWDQ-NEXT: vpand %xmm2, %xmm1, %xmm2
; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX512VLBWDQ-NEXT: vpaddw %xmm2, %xmm1, %xmm1
; AVX512VLBWDQ-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
; AVX512VLBWDQ-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX512VLBWDQ-NEXT: vpand %xmm0, %xmm1, %xmm0
; AVX512VLBWDQ-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX512VLBWDQ-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX512VLBWDQ-NEXT: retq
;
; AVX512VLCD-LABEL: testv4i32:
; AVX512VLCD: # %bb.0:
; AVX512VLCD-NEXT: vplzcntd %xmm0, %xmm0
; AVX512VLCD-NEXT: retq
;
; AVX512CD-LABEL: testv4i32:
; AVX512CD: # %bb.0:
; AVX512CD-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
; AVX512CD-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512CD-NEXT: vzeroupper
; AVX512CD-NEXT: retq
;
; X32-SSE-LABEL: testv4i32:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; X32-SSE-NEXT: movdqa %xmm2, %xmm3
; X32-SSE-NEXT: pshufb %xmm0, %xmm3
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrlw $4, %xmm1
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: pxor %xmm4, %xmm4
; X32-SSE-NEXT: pshufb %xmm1, %xmm2
; X32-SSE-NEXT: pcmpeqb %xmm4, %xmm1
; X32-SSE-NEXT: pand %xmm3, %xmm1
; X32-SSE-NEXT: paddb %xmm2, %xmm1
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: pcmpeqb %xmm4, %xmm2
; X32-SSE-NEXT: psrlw $8, %xmm2
; X32-SSE-NEXT: pand %xmm1, %xmm2
; X32-SSE-NEXT: psrlw $8, %xmm1
; X32-SSE-NEXT: paddw %xmm2, %xmm1
; X32-SSE-NEXT: pcmpeqw %xmm4, %xmm0
; X32-SSE-NEXT: psrld $16, %xmm0
; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: psrld $16, %xmm1
; X32-SSE-NEXT: paddd %xmm0, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm0
; X32-SSE-NEXT: retl

  %out = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %in, i1 0)
  ret <4 x i32> %out
}

; testv4i32u - same operation as testv4i32, but the second intrinsic operand is
; i1 -1 (true), so per the LangRef the result for a zero element is not
; defined. The expected instruction sequences below appear to be the same as
; those checked for testv4i32 in every run configuration.
define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
; SSE2-LABEL: testv4i32u:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $1, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrld $2, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $4, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrld $8, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $16, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psubb %xmm0, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: psrlw $2, %xmm2
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: paddb %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT: psadbw %xmm1, %xmm2
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: psadbw %xmm1, %xmm0
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv4i32u:
; SSE3: # %bb.0:
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrld $1, %xmm1
; SSE3-NEXT: por %xmm0, %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: psrld $2, %xmm0
; SSE3-NEXT: por %xmm1, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrld $4, %xmm1
; SSE3-NEXT: por %xmm0, %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: psrld $8, %xmm0
; SSE3-NEXT: por %xmm1, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrld $16, %xmm1
; SSE3-NEXT: por %xmm0, %xmm1
; SSE3-NEXT: pcmpeqd %xmm2, %xmm2
; SSE3-NEXT: pxor %xmm1, %xmm2
; SSE3-NEXT: movdqa %xmm2, %xmm0
; SSE3-NEXT: psrlw $1, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
; SSE3-NEXT: psubb %xmm0, %xmm2
; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm2, %xmm1
; SSE3-NEXT: pand %xmm0, %xmm1
; SSE3-NEXT: psrlw $2, %xmm2
; SSE3-NEXT: pand %xmm0, %xmm2
; SSE3-NEXT: paddb %xmm1, %xmm2
; SSE3-NEXT: movdqa %xmm2, %xmm0
; SSE3-NEXT: psrlw $4, %xmm0
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
; SSE3-NEXT: pxor %xmm1, %xmm1
; SSE3-NEXT: movdqa %xmm0, %xmm2
; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE3-NEXT: psadbw %xmm1, %xmm2
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE3-NEXT: psadbw %xmm1, %xmm0
; SSE3-NEXT: packuswb %xmm2, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv4i32u:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSSE3-NEXT: movdqa %xmm2, %xmm3
; SSSE3-NEXT: pshufb %xmm0, %xmm3
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: psrlw $4, %xmm1
; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSSE3-NEXT: pxor %xmm4, %xmm4
; SSSE3-NEXT: pshufb %xmm1, %xmm2
; SSSE3-NEXT: pcmpeqb %xmm4, %xmm1
; SSSE3-NEXT: pand %xmm3, %xmm1
; SSSE3-NEXT: paddb %xmm2, %xmm1
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pcmpeqb %xmm4, %xmm2
; SSSE3-NEXT: psrlw $8, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm2
; SSSE3-NEXT: psrlw $8, %xmm1
; SSSE3-NEXT: paddw %xmm2, %xmm1
; SSSE3-NEXT: pcmpeqw %xmm4, %xmm0
; SSSE3-NEXT: psrld $16, %xmm0
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: psrld $16, %xmm1
; SSSE3-NEXT: paddd %xmm0, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv4i32u:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: pshufb %xmm0, %xmm3
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrlw $4, %xmm1
; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
; SSE41-NEXT: pxor %xmm4, %xmm4
; SSE41-NEXT: pshufb %xmm1, %xmm2
; SSE41-NEXT: pcmpeqb %xmm4, %xmm1
; SSE41-NEXT: pand %xmm3, %xmm1
; SSE41-NEXT: paddb %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pcmpeqb %xmm4, %xmm2
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: pand %xmm1, %xmm2
; SSE41-NEXT: psrlw $8, %xmm1
; SSE41-NEXT: paddw %xmm2, %xmm1
; SSE41-NEXT: pcmpeqw %xmm4, %xmm0
; SSE41-NEXT: psrld $16, %xmm0
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: psrld $16, %xmm1
; SSE41-NEXT: paddd %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: testv4i32u:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5
; AVX-NEXT: vpand %xmm5, %xmm2, %xmm2
; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; AVX-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2
; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX-NEXT: vpand %xmm2, %xmm1, %xmm2
; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512VLBWDQ-LABEL: testv4i32u:
; AVX512VLBWDQ: # %bb.0:
; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX512VLBWDQ-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5
; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2
; AVX512VLBWDQ-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512VLBWDQ-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2
; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX512VLBWDQ-NEXT: vpand %xmm2, %xmm1, %xmm2
; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX512VLBWDQ-NEXT: vpaddw %xmm2, %xmm1, %xmm1
; AVX512VLBWDQ-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
; AVX512VLBWDQ-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX512VLBWDQ-NEXT: vpand %xmm0, %xmm1, %xmm0
; AVX512VLBWDQ-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX512VLBWDQ-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX512VLBWDQ-NEXT: retq
;
; AVX512VLCD-LABEL: testv4i32u:
; AVX512VLCD: # %bb.0:
; AVX512VLCD-NEXT: vplzcntd %xmm0, %xmm0
; AVX512VLCD-NEXT: retq
;
; AVX512CD-LABEL: testv4i32u:
; AVX512CD: # %bb.0:
; AVX512CD-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
; AVX512CD-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512CD-NEXT: vzeroupper
; AVX512CD-NEXT: retq
;
; X32-SSE-LABEL: testv4i32u:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; X32-SSE-NEXT: movdqa %xmm2, %xmm3
; X32-SSE-NEXT: pshufb %xmm0, %xmm3
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrlw $4, %xmm1
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: pxor %xmm4, %xmm4
; X32-SSE-NEXT: pshufb %xmm1, %xmm2
; X32-SSE-NEXT: pcmpeqb %xmm4, %xmm1
; X32-SSE-NEXT: pand %xmm3, %xmm1
; X32-SSE-NEXT: paddb %xmm2, %xmm1
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: pcmpeqb %xmm4, %xmm2
; X32-SSE-NEXT: psrlw $8, %xmm2
; X32-SSE-NEXT: pand %xmm1, %xmm2
; X32-SSE-NEXT: psrlw $8, %xmm1
; X32-SSE-NEXT: paddw %xmm2, %xmm1
; X32-SSE-NEXT: pcmpeqw %xmm4, %xmm0
; X32-SSE-NEXT: psrld $16, %xmm0
; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: psrld $16, %xmm1
; X32-SSE-NEXT: paddd %xmm0, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm0
; X32-SSE-NEXT: retl

  %out = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %in, i1 -1)
  ret <4 x i32> %out
}

define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSE2-LABEL: testv8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
;
SSE2-NEXT: psrlw $2, %xmm0 988; SSE2-NEXT: por %xmm1, %xmm0 989; SSE2-NEXT: movdqa %xmm0, %xmm1 990; SSE2-NEXT: psrlw $4, %xmm1 991; SSE2-NEXT: por %xmm0, %xmm1 992; SSE2-NEXT: movdqa %xmm1, %xmm0 993; SSE2-NEXT: psrlw $8, %xmm0 994; SSE2-NEXT: por %xmm1, %xmm0 995; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 996; SSE2-NEXT: pxor %xmm0, %xmm1 997; SSE2-NEXT: movdqa %xmm1, %xmm0 998; SSE2-NEXT: psrlw $1, %xmm0 999; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 1000; SSE2-NEXT: psubb %xmm0, %xmm1 1001; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1002; SSE2-NEXT: movdqa %xmm1, %xmm2 1003; SSE2-NEXT: pand %xmm0, %xmm2 1004; SSE2-NEXT: psrlw $2, %xmm1 1005; SSE2-NEXT: pand %xmm0, %xmm1 1006; SSE2-NEXT: paddb %xmm2, %xmm1 1007; SSE2-NEXT: movdqa %xmm1, %xmm2 1008; SSE2-NEXT: psrlw $4, %xmm2 1009; SSE2-NEXT: paddb %xmm1, %xmm2 1010; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 1011; SSE2-NEXT: movdqa %xmm2, %xmm0 1012; SSE2-NEXT: psllw $8, %xmm0 1013; SSE2-NEXT: paddb %xmm2, %xmm0 1014; SSE2-NEXT: psrlw $8, %xmm0 1015; SSE2-NEXT: retq 1016; 1017; SSE3-LABEL: testv8i16: 1018; SSE3: # %bb.0: 1019; SSE3-NEXT: movdqa %xmm0, %xmm1 1020; SSE3-NEXT: psrlw $1, %xmm1 1021; SSE3-NEXT: por %xmm0, %xmm1 1022; SSE3-NEXT: movdqa %xmm1, %xmm0 1023; SSE3-NEXT: psrlw $2, %xmm0 1024; SSE3-NEXT: por %xmm1, %xmm0 1025; SSE3-NEXT: movdqa %xmm0, %xmm1 1026; SSE3-NEXT: psrlw $4, %xmm1 1027; SSE3-NEXT: por %xmm0, %xmm1 1028; SSE3-NEXT: movdqa %xmm1, %xmm0 1029; SSE3-NEXT: psrlw $8, %xmm0 1030; SSE3-NEXT: por %xmm1, %xmm0 1031; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 1032; SSE3-NEXT: pxor %xmm0, %xmm1 1033; SSE3-NEXT: movdqa %xmm1, %xmm0 1034; SSE3-NEXT: psrlw $1, %xmm0 1035; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 1036; SSE3-NEXT: psubb %xmm0, %xmm1 1037; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1038; SSE3-NEXT: movdqa %xmm1, %xmm2 1039; SSE3-NEXT: pand %xmm0, %xmm2 1040; SSE3-NEXT: psrlw $2, %xmm1 1041; SSE3-NEXT: pand %xmm0, %xmm1 1042; SSE3-NEXT: paddb 
%xmm2, %xmm1 1043; SSE3-NEXT: movdqa %xmm1, %xmm2 1044; SSE3-NEXT: psrlw $4, %xmm2 1045; SSE3-NEXT: paddb %xmm1, %xmm2 1046; SSE3-NEXT: pand {{.*}}(%rip), %xmm2 1047; SSE3-NEXT: movdqa %xmm2, %xmm0 1048; SSE3-NEXT: psllw $8, %xmm0 1049; SSE3-NEXT: paddb %xmm2, %xmm0 1050; SSE3-NEXT: psrlw $8, %xmm0 1051; SSE3-NEXT: retq 1052; 1053; SSSE3-LABEL: testv8i16: 1054; SSSE3: # %bb.0: 1055; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 1056; SSSE3-NEXT: movdqa %xmm2, %xmm3 1057; SSSE3-NEXT: pshufb %xmm0, %xmm3 1058; SSSE3-NEXT: movdqa %xmm0, %xmm1 1059; SSSE3-NEXT: psrlw $4, %xmm1 1060; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1 1061; SSSE3-NEXT: pxor %xmm4, %xmm4 1062; SSSE3-NEXT: pshufb %xmm1, %xmm2 1063; SSSE3-NEXT: pcmpeqb %xmm4, %xmm1 1064; SSSE3-NEXT: pand %xmm3, %xmm1 1065; SSSE3-NEXT: paddb %xmm2, %xmm1 1066; SSSE3-NEXT: pcmpeqb %xmm4, %xmm0 1067; SSSE3-NEXT: psrlw $8, %xmm0 1068; SSSE3-NEXT: pand %xmm1, %xmm0 1069; SSSE3-NEXT: psrlw $8, %xmm1 1070; SSSE3-NEXT: paddw %xmm0, %xmm1 1071; SSSE3-NEXT: movdqa %xmm1, %xmm0 1072; SSSE3-NEXT: retq 1073; 1074; SSE41-LABEL: testv8i16: 1075; SSE41: # %bb.0: 1076; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 1077; SSE41-NEXT: movdqa %xmm2, %xmm3 1078; SSE41-NEXT: pshufb %xmm0, %xmm3 1079; SSE41-NEXT: movdqa %xmm0, %xmm1 1080; SSE41-NEXT: psrlw $4, %xmm1 1081; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 1082; SSE41-NEXT: pxor %xmm4, %xmm4 1083; SSE41-NEXT: pshufb %xmm1, %xmm2 1084; SSE41-NEXT: pcmpeqb %xmm4, %xmm1 1085; SSE41-NEXT: pand %xmm3, %xmm1 1086; SSE41-NEXT: paddb %xmm2, %xmm1 1087; SSE41-NEXT: pcmpeqb %xmm4, %xmm0 1088; SSE41-NEXT: psrlw $8, %xmm0 1089; SSE41-NEXT: pand %xmm1, %xmm0 1090; SSE41-NEXT: psrlw $8, %xmm1 1091; SSE41-NEXT: paddw %xmm0, %xmm1 1092; SSE41-NEXT: movdqa %xmm1, %xmm0 1093; SSE41-NEXT: retq 1094; 1095; AVX-LABEL: testv8i16: 1096; AVX: # %bb.0: 1097; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 1098; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm2 
1099; AVX-NEXT: vpsrlw $4, %xmm0, %xmm3 1100; AVX-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 1101; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 1102; AVX-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 1103; AVX-NEXT: vpand %xmm5, %xmm2, %xmm2 1104; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 1105; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1 1106; AVX-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0 1107; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 1108; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 1109; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1 1110; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm0 1111; AVX-NEXT: retq 1112; 1113; AVX512VLBWDQ-LABEL: testv8i16: 1114; AVX512VLBWDQ: # %bb.0: 1115; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 1116; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 1117; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3 1118; AVX512VLBWDQ-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 1119; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 1120; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 1121; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2 1122; AVX512VLBWDQ-NEXT: vpshufb %xmm3, %xmm1, %xmm1 1123; AVX512VLBWDQ-NEXT: vpaddb %xmm1, %xmm2, %xmm1 1124; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0 1125; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm0, %xmm0 1126; AVX512VLBWDQ-NEXT: vpand %xmm0, %xmm1, %xmm0 1127; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm1, %xmm1 1128; AVX512VLBWDQ-NEXT: vpaddw %xmm0, %xmm1, %xmm0 1129; AVX512VLBWDQ-NEXT: retq 1130; 1131; AVX512VLCD-LABEL: testv8i16: 1132; AVX512VLCD: # %bb.0: 1133; AVX512VLCD-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1134; AVX512VLCD-NEXT: vplzcntd %ymm0, %ymm0 1135; AVX512VLCD-NEXT: vpmovdw %ymm0, %xmm0 1136; AVX512VLCD-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 1137; AVX512VLCD-NEXT: vzeroupper 1138; AVX512VLCD-NEXT: retq 1139; 1140; AVX512CD-LABEL: testv8i16: 1141; AVX512CD: # %bb.0: 1142; AVX512CD-NEXT: vpmovzxwd {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1143; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 1144; AVX512CD-NEXT: vpmovdw %zmm0, %ymm0 1145; AVX512CD-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 1146; AVX512CD-NEXT: vzeroupper 1147; AVX512CD-NEXT: retq 1148; 1149; X32-SSE-LABEL: testv8i16: 1150; X32-SSE: # %bb.0: 1151; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 1152; X32-SSE-NEXT: movdqa %xmm2, %xmm3 1153; X32-SSE-NEXT: pshufb %xmm0, %xmm3 1154; X32-SSE-NEXT: movdqa %xmm0, %xmm1 1155; X32-SSE-NEXT: psrlw $4, %xmm1 1156; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 1157; X32-SSE-NEXT: pxor %xmm4, %xmm4 1158; X32-SSE-NEXT: pshufb %xmm1, %xmm2 1159; X32-SSE-NEXT: pcmpeqb %xmm4, %xmm1 1160; X32-SSE-NEXT: pand %xmm3, %xmm1 1161; X32-SSE-NEXT: paddb %xmm2, %xmm1 1162; X32-SSE-NEXT: pcmpeqb %xmm4, %xmm0 1163; X32-SSE-NEXT: psrlw $8, %xmm0 1164; X32-SSE-NEXT: pand %xmm1, %xmm0 1165; X32-SSE-NEXT: psrlw $8, %xmm1 1166; X32-SSE-NEXT: paddw %xmm0, %xmm1 1167; X32-SSE-NEXT: movdqa %xmm1, %xmm0 1168; X32-SSE-NEXT: retl 1169 %out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %in, i1 0) 1170 ret <8 x i16> %out 1171} 1172 1173define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { 1174; SSE2-LABEL: testv8i16u: 1175; SSE2: # %bb.0: 1176; SSE2-NEXT: movdqa %xmm0, %xmm1 1177; SSE2-NEXT: psrlw $1, %xmm1 1178; SSE2-NEXT: por %xmm0, %xmm1 1179; SSE2-NEXT: movdqa %xmm1, %xmm0 1180; SSE2-NEXT: psrlw $2, %xmm0 1181; SSE2-NEXT: por %xmm1, %xmm0 1182; SSE2-NEXT: movdqa %xmm0, %xmm1 1183; SSE2-NEXT: psrlw $4, %xmm1 1184; SSE2-NEXT: por %xmm0, %xmm1 1185; SSE2-NEXT: movdqa %xmm1, %xmm0 1186; SSE2-NEXT: psrlw $8, %xmm0 1187; SSE2-NEXT: por %xmm1, %xmm0 1188; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 1189; SSE2-NEXT: pxor %xmm0, %xmm1 1190; SSE2-NEXT: movdqa %xmm1, %xmm0 1191; SSE2-NEXT: psrlw $1, %xmm0 1192; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 1193; SSE2-NEXT: psubb %xmm0, %xmm1 1194; SSE2-NEXT: movdqa {{.*#+}} xmm0 = 
[51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1195; SSE2-NEXT: movdqa %xmm1, %xmm2 1196; SSE2-NEXT: pand %xmm0, %xmm2 1197; SSE2-NEXT: psrlw $2, %xmm1 1198; SSE2-NEXT: pand %xmm0, %xmm1 1199; SSE2-NEXT: paddb %xmm2, %xmm1 1200; SSE2-NEXT: movdqa %xmm1, %xmm2 1201; SSE2-NEXT: psrlw $4, %xmm2 1202; SSE2-NEXT: paddb %xmm1, %xmm2 1203; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 1204; SSE2-NEXT: movdqa %xmm2, %xmm0 1205; SSE2-NEXT: psllw $8, %xmm0 1206; SSE2-NEXT: paddb %xmm2, %xmm0 1207; SSE2-NEXT: psrlw $8, %xmm0 1208; SSE2-NEXT: retq 1209; 1210; SSE3-LABEL: testv8i16u: 1211; SSE3: # %bb.0: 1212; SSE3-NEXT: movdqa %xmm0, %xmm1 1213; SSE3-NEXT: psrlw $1, %xmm1 1214; SSE3-NEXT: por %xmm0, %xmm1 1215; SSE3-NEXT: movdqa %xmm1, %xmm0 1216; SSE3-NEXT: psrlw $2, %xmm0 1217; SSE3-NEXT: por %xmm1, %xmm0 1218; SSE3-NEXT: movdqa %xmm0, %xmm1 1219; SSE3-NEXT: psrlw $4, %xmm1 1220; SSE3-NEXT: por %xmm0, %xmm1 1221; SSE3-NEXT: movdqa %xmm1, %xmm0 1222; SSE3-NEXT: psrlw $8, %xmm0 1223; SSE3-NEXT: por %xmm1, %xmm0 1224; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 1225; SSE3-NEXT: pxor %xmm0, %xmm1 1226; SSE3-NEXT: movdqa %xmm1, %xmm0 1227; SSE3-NEXT: psrlw $1, %xmm0 1228; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 1229; SSE3-NEXT: psubb %xmm0, %xmm1 1230; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1231; SSE3-NEXT: movdqa %xmm1, %xmm2 1232; SSE3-NEXT: pand %xmm0, %xmm2 1233; SSE3-NEXT: psrlw $2, %xmm1 1234; SSE3-NEXT: pand %xmm0, %xmm1 1235; SSE3-NEXT: paddb %xmm2, %xmm1 1236; SSE3-NEXT: movdqa %xmm1, %xmm2 1237; SSE3-NEXT: psrlw $4, %xmm2 1238; SSE3-NEXT: paddb %xmm1, %xmm2 1239; SSE3-NEXT: pand {{.*}}(%rip), %xmm2 1240; SSE3-NEXT: movdqa %xmm2, %xmm0 1241; SSE3-NEXT: psllw $8, %xmm0 1242; SSE3-NEXT: paddb %xmm2, %xmm0 1243; SSE3-NEXT: psrlw $8, %xmm0 1244; SSE3-NEXT: retq 1245; 1246; SSSE3-LABEL: testv8i16u: 1247; SSSE3: # %bb.0: 1248; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 1249; SSSE3-NEXT: movdqa %xmm2, %xmm3 1250; SSSE3-NEXT: 
pshufb %xmm0, %xmm3 1251; SSSE3-NEXT: movdqa %xmm0, %xmm1 1252; SSSE3-NEXT: psrlw $4, %xmm1 1253; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1 1254; SSSE3-NEXT: pxor %xmm4, %xmm4 1255; SSSE3-NEXT: pshufb %xmm1, %xmm2 1256; SSSE3-NEXT: pcmpeqb %xmm4, %xmm1 1257; SSSE3-NEXT: pand %xmm3, %xmm1 1258; SSSE3-NEXT: paddb %xmm2, %xmm1 1259; SSSE3-NEXT: pcmpeqb %xmm4, %xmm0 1260; SSSE3-NEXT: psrlw $8, %xmm0 1261; SSSE3-NEXT: pand %xmm1, %xmm0 1262; SSSE3-NEXT: psrlw $8, %xmm1 1263; SSSE3-NEXT: paddw %xmm0, %xmm1 1264; SSSE3-NEXT: movdqa %xmm1, %xmm0 1265; SSSE3-NEXT: retq 1266; 1267; SSE41-LABEL: testv8i16u: 1268; SSE41: # %bb.0: 1269; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 1270; SSE41-NEXT: movdqa %xmm2, %xmm3 1271; SSE41-NEXT: pshufb %xmm0, %xmm3 1272; SSE41-NEXT: movdqa %xmm0, %xmm1 1273; SSE41-NEXT: psrlw $4, %xmm1 1274; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 1275; SSE41-NEXT: pxor %xmm4, %xmm4 1276; SSE41-NEXT: pshufb %xmm1, %xmm2 1277; SSE41-NEXT: pcmpeqb %xmm4, %xmm1 1278; SSE41-NEXT: pand %xmm3, %xmm1 1279; SSE41-NEXT: paddb %xmm2, %xmm1 1280; SSE41-NEXT: pcmpeqb %xmm4, %xmm0 1281; SSE41-NEXT: psrlw $8, %xmm0 1282; SSE41-NEXT: pand %xmm1, %xmm0 1283; SSE41-NEXT: psrlw $8, %xmm1 1284; SSE41-NEXT: paddw %xmm0, %xmm1 1285; SSE41-NEXT: movdqa %xmm1, %xmm0 1286; SSE41-NEXT: retq 1287; 1288; AVX-LABEL: testv8i16u: 1289; AVX: # %bb.0: 1290; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 1291; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm2 1292; AVX-NEXT: vpsrlw $4, %xmm0, %xmm3 1293; AVX-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 1294; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 1295; AVX-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 1296; AVX-NEXT: vpand %xmm5, %xmm2, %xmm2 1297; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 1298; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1 1299; AVX-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0 1300; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 1301; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 1302; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1 1303; AVX-NEXT: vpaddw %xmm0, %xmm1, 
%xmm0 1304; AVX-NEXT: retq 1305; 1306; AVX512VLBWDQ-LABEL: testv8i16u: 1307; AVX512VLBWDQ: # %bb.0: 1308; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 1309; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 1310; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3 1311; AVX512VLBWDQ-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 1312; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 1313; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 1314; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2 1315; AVX512VLBWDQ-NEXT: vpshufb %xmm3, %xmm1, %xmm1 1316; AVX512VLBWDQ-NEXT: vpaddb %xmm1, %xmm2, %xmm1 1317; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0 1318; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm0, %xmm0 1319; AVX512VLBWDQ-NEXT: vpand %xmm0, %xmm1, %xmm0 1320; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm1, %xmm1 1321; AVX512VLBWDQ-NEXT: vpaddw %xmm0, %xmm1, %xmm0 1322; AVX512VLBWDQ-NEXT: retq 1323; 1324; AVX512VLCD-LABEL: testv8i16u: 1325; AVX512VLCD: # %bb.0: 1326; AVX512VLCD-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1327; AVX512VLCD-NEXT: vplzcntd %ymm0, %ymm0 1328; AVX512VLCD-NEXT: vpmovdw %ymm0, %xmm0 1329; AVX512VLCD-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 1330; AVX512VLCD-NEXT: vzeroupper 1331; AVX512VLCD-NEXT: retq 1332; 1333; AVX512CD-LABEL: testv8i16u: 1334; AVX512CD: # %bb.0: 1335; AVX512CD-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1336; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 1337; AVX512CD-NEXT: vpmovdw %zmm0, %ymm0 1338; AVX512CD-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 1339; AVX512CD-NEXT: vzeroupper 1340; AVX512CD-NEXT: retq 1341; 1342; X32-SSE-LABEL: testv8i16u: 1343; X32-SSE: # %bb.0: 1344; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 1345; X32-SSE-NEXT: movdqa %xmm2, %xmm3 1346; X32-SSE-NEXT: pshufb %xmm0, %xmm3 1347; X32-SSE-NEXT: movdqa %xmm0, %xmm1 1348; 
X32-SSE-NEXT: psrlw $4, %xmm1 1349; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 1350; X32-SSE-NEXT: pxor %xmm4, %xmm4 1351; X32-SSE-NEXT: pshufb %xmm1, %xmm2 1352; X32-SSE-NEXT: pcmpeqb %xmm4, %xmm1 1353; X32-SSE-NEXT: pand %xmm3, %xmm1 1354; X32-SSE-NEXT: paddb %xmm2, %xmm1 1355; X32-SSE-NEXT: pcmpeqb %xmm4, %xmm0 1356; X32-SSE-NEXT: psrlw $8, %xmm0 1357; X32-SSE-NEXT: pand %xmm1, %xmm0 1358; X32-SSE-NEXT: psrlw $8, %xmm1 1359; X32-SSE-NEXT: paddw %xmm0, %xmm1 1360; X32-SSE-NEXT: movdqa %xmm1, %xmm0 1361; X32-SSE-NEXT: retl 1362 %out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %in, i1 -1) 1363 ret <8 x i16> %out 1364} 1365 1366define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { 1367; SSE2-LABEL: testv16i8: 1368; SSE2: # %bb.0: 1369; SSE2-NEXT: movdqa %xmm0, %xmm1 1370; SSE2-NEXT: psrlw $1, %xmm1 1371; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 1372; SSE2-NEXT: por %xmm0, %xmm1 1373; SSE2-NEXT: movdqa %xmm1, %xmm0 1374; SSE2-NEXT: psrlw $2, %xmm0 1375; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 1376; SSE2-NEXT: por %xmm1, %xmm0 1377; SSE2-NEXT: movdqa %xmm0, %xmm1 1378; SSE2-NEXT: psrlw $4, %xmm1 1379; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1380; SSE2-NEXT: pand %xmm2, %xmm1 1381; SSE2-NEXT: por %xmm0, %xmm1 1382; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 1383; SSE2-NEXT: pxor %xmm1, %xmm3 1384; SSE2-NEXT: movdqa %xmm3, %xmm0 1385; SSE2-NEXT: psrlw $1, %xmm0 1386; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 1387; SSE2-NEXT: psubb %xmm0, %xmm3 1388; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1389; SSE2-NEXT: movdqa %xmm3, %xmm1 1390; SSE2-NEXT: pand %xmm0, %xmm1 1391; SSE2-NEXT: psrlw $2, %xmm3 1392; SSE2-NEXT: pand %xmm0, %xmm3 1393; SSE2-NEXT: paddb %xmm1, %xmm3 1394; SSE2-NEXT: movdqa %xmm3, %xmm0 1395; SSE2-NEXT: psrlw $4, %xmm0 1396; SSE2-NEXT: paddb %xmm3, %xmm0 1397; SSE2-NEXT: pand %xmm2, %xmm0 1398; SSE2-NEXT: retq 1399; 1400; SSE3-LABEL: testv16i8: 1401; SSE3: # %bb.0: 1402; SSE3-NEXT: movdqa %xmm0, %xmm1 
1403; SSE3-NEXT: psrlw $1, %xmm1 1404; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 1405; SSE3-NEXT: por %xmm0, %xmm1 1406; SSE3-NEXT: movdqa %xmm1, %xmm0 1407; SSE3-NEXT: psrlw $2, %xmm0 1408; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 1409; SSE3-NEXT: por %xmm1, %xmm0 1410; SSE3-NEXT: movdqa %xmm0, %xmm1 1411; SSE3-NEXT: psrlw $4, %xmm1 1412; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1413; SSE3-NEXT: pand %xmm2, %xmm1 1414; SSE3-NEXT: por %xmm0, %xmm1 1415; SSE3-NEXT: pcmpeqd %xmm3, %xmm3 1416; SSE3-NEXT: pxor %xmm1, %xmm3 1417; SSE3-NEXT: movdqa %xmm3, %xmm0 1418; SSE3-NEXT: psrlw $1, %xmm0 1419; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 1420; SSE3-NEXT: psubb %xmm0, %xmm3 1421; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1422; SSE3-NEXT: movdqa %xmm3, %xmm1 1423; SSE3-NEXT: pand %xmm0, %xmm1 1424; SSE3-NEXT: psrlw $2, %xmm3 1425; SSE3-NEXT: pand %xmm0, %xmm3 1426; SSE3-NEXT: paddb %xmm1, %xmm3 1427; SSE3-NEXT: movdqa %xmm3, %xmm0 1428; SSE3-NEXT: psrlw $4, %xmm0 1429; SSE3-NEXT: paddb %xmm3, %xmm0 1430; SSE3-NEXT: pand %xmm2, %xmm0 1431; SSE3-NEXT: retq 1432; 1433; SSSE3-LABEL: testv16i8: 1434; SSSE3: # %bb.0: 1435; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 1436; SSSE3-NEXT: movdqa %xmm1, %xmm2 1437; SSSE3-NEXT: pshufb %xmm0, %xmm2 1438; SSSE3-NEXT: psrlw $4, %xmm0 1439; SSSE3-NEXT: pand {{.*}}(%rip), %xmm0 1440; SSSE3-NEXT: pxor %xmm3, %xmm3 1441; SSSE3-NEXT: pcmpeqb %xmm0, %xmm3 1442; SSSE3-NEXT: pand %xmm2, %xmm3 1443; SSSE3-NEXT: pshufb %xmm0, %xmm1 1444; SSSE3-NEXT: paddb %xmm3, %xmm1 1445; SSSE3-NEXT: movdqa %xmm1, %xmm0 1446; SSSE3-NEXT: retq 1447; 1448; SSE41-LABEL: testv16i8: 1449; SSE41: # %bb.0: 1450; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 1451; SSE41-NEXT: movdqa %xmm1, %xmm2 1452; SSE41-NEXT: pshufb %xmm0, %xmm2 1453; SSE41-NEXT: psrlw $4, %xmm0 1454; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 1455; SSE41-NEXT: pxor %xmm3, %xmm3 1456; 
SSE41-NEXT: pcmpeqb %xmm0, %xmm3 1457; SSE41-NEXT: pand %xmm2, %xmm3 1458; SSE41-NEXT: pshufb %xmm0, %xmm1 1459; SSE41-NEXT: paddb %xmm3, %xmm1 1460; SSE41-NEXT: movdqa %xmm1, %xmm0 1461; SSE41-NEXT: retq 1462; 1463; AVX-LABEL: testv16i8: 1464; AVX: # %bb.0: 1465; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 1466; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm2 1467; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 1468; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 1469; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 1470; AVX-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm3 1471; AVX-NEXT: vpand %xmm3, %xmm2, %xmm2 1472; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 1473; AVX-NEXT: vpaddb %xmm0, %xmm2, %xmm0 1474; AVX-NEXT: retq 1475; 1476; AVX512VLBWDQ-LABEL: testv16i8: 1477; AVX512VLBWDQ: # %bb.0: 1478; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 1479; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 1480; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm0 1481; AVX512VLBWDQ-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 1482; AVX512VLBWDQ-NEXT: vpxor %xmm3, %xmm3, %xmm3 1483; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm3 1484; AVX512VLBWDQ-NEXT: vpand %xmm3, %xmm2, %xmm2 1485; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm0 1486; AVX512VLBWDQ-NEXT: vpaddb %xmm0, %xmm2, %xmm0 1487; AVX512VLBWDQ-NEXT: retq 1488; 1489; AVX512-LABEL: testv16i8: 1490; AVX512: # %bb.0: 1491; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 1492; AVX512-NEXT: vplzcntd %zmm0, %zmm0 1493; AVX512-NEXT: vpmovdb %zmm0, %xmm0 1494; AVX512-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 1495; AVX512-NEXT: vzeroupper 1496; AVX512-NEXT: retq 1497; 1498; 
X32-SSE-LABEL: testv16i8: 1499; X32-SSE: # %bb.0: 1500; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 1501; X32-SSE-NEXT: movdqa %xmm1, %xmm2 1502; X32-SSE-NEXT: pshufb %xmm0, %xmm2 1503; X32-SSE-NEXT: psrlw $4, %xmm0 1504; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 1505; X32-SSE-NEXT: pxor %xmm3, %xmm3 1506; X32-SSE-NEXT: pcmpeqb %xmm0, %xmm3 1507; X32-SSE-NEXT: pand %xmm2, %xmm3 1508; X32-SSE-NEXT: pshufb %xmm0, %xmm1 1509; X32-SSE-NEXT: paddb %xmm3, %xmm1 1510; X32-SSE-NEXT: movdqa %xmm1, %xmm0 1511; X32-SSE-NEXT: retl 1512 %out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %in, i1 0) 1513 ret <16 x i8> %out 1514} 1515 1516define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { 1517; SSE2-LABEL: testv16i8u: 1518; SSE2: # %bb.0: 1519; SSE2-NEXT: movdqa %xmm0, %xmm1 1520; SSE2-NEXT: psrlw $1, %xmm1 1521; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 1522; SSE2-NEXT: por %xmm0, %xmm1 1523; SSE2-NEXT: movdqa %xmm1, %xmm0 1524; SSE2-NEXT: psrlw $2, %xmm0 1525; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 1526; SSE2-NEXT: por %xmm1, %xmm0 1527; SSE2-NEXT: movdqa %xmm0, %xmm1 1528; SSE2-NEXT: psrlw $4, %xmm1 1529; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1530; SSE2-NEXT: pand %xmm2, %xmm1 1531; SSE2-NEXT: por %xmm0, %xmm1 1532; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 1533; SSE2-NEXT: pxor %xmm1, %xmm3 1534; SSE2-NEXT: movdqa %xmm3, %xmm0 1535; SSE2-NEXT: psrlw $1, %xmm0 1536; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 1537; SSE2-NEXT: psubb %xmm0, %xmm3 1538; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1539; SSE2-NEXT: movdqa %xmm3, %xmm1 1540; SSE2-NEXT: pand %xmm0, %xmm1 1541; SSE2-NEXT: psrlw $2, %xmm3 1542; SSE2-NEXT: pand %xmm0, %xmm3 1543; SSE2-NEXT: paddb %xmm1, %xmm3 1544; SSE2-NEXT: movdqa %xmm3, %xmm0 1545; SSE2-NEXT: psrlw $4, %xmm0 1546; SSE2-NEXT: paddb %xmm3, %xmm0 1547; SSE2-NEXT: pand %xmm2, %xmm0 1548; SSE2-NEXT: retq 1549; 1550; SSE3-LABEL: testv16i8u: 1551; SSE3: # %bb.0: 1552; 
SSE3-NEXT: movdqa %xmm0, %xmm1 1553; SSE3-NEXT: psrlw $1, %xmm1 1554; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 1555; SSE3-NEXT: por %xmm0, %xmm1 1556; SSE3-NEXT: movdqa %xmm1, %xmm0 1557; SSE3-NEXT: psrlw $2, %xmm0 1558; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 1559; SSE3-NEXT: por %xmm1, %xmm0 1560; SSE3-NEXT: movdqa %xmm0, %xmm1 1561; SSE3-NEXT: psrlw $4, %xmm1 1562; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1563; SSE3-NEXT: pand %xmm2, %xmm1 1564; SSE3-NEXT: por %xmm0, %xmm1 1565; SSE3-NEXT: pcmpeqd %xmm3, %xmm3 1566; SSE3-NEXT: pxor %xmm1, %xmm3 1567; SSE3-NEXT: movdqa %xmm3, %xmm0 1568; SSE3-NEXT: psrlw $1, %xmm0 1569; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 1570; SSE3-NEXT: psubb %xmm0, %xmm3 1571; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1572; SSE3-NEXT: movdqa %xmm3, %xmm1 1573; SSE3-NEXT: pand %xmm0, %xmm1 1574; SSE3-NEXT: psrlw $2, %xmm3 1575; SSE3-NEXT: pand %xmm0, %xmm3 1576; SSE3-NEXT: paddb %xmm1, %xmm3 1577; SSE3-NEXT: movdqa %xmm3, %xmm0 1578; SSE3-NEXT: psrlw $4, %xmm0 1579; SSE3-NEXT: paddb %xmm3, %xmm0 1580; SSE3-NEXT: pand %xmm2, %xmm0 1581; SSE3-NEXT: retq 1582; 1583; SSSE3-LABEL: testv16i8u: 1584; SSSE3: # %bb.0: 1585; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 1586; SSSE3-NEXT: movdqa %xmm1, %xmm2 1587; SSSE3-NEXT: pshufb %xmm0, %xmm2 1588; SSSE3-NEXT: psrlw $4, %xmm0 1589; SSSE3-NEXT: pand {{.*}}(%rip), %xmm0 1590; SSSE3-NEXT: pxor %xmm3, %xmm3 1591; SSSE3-NEXT: pcmpeqb %xmm0, %xmm3 1592; SSSE3-NEXT: pand %xmm2, %xmm3 1593; SSSE3-NEXT: pshufb %xmm0, %xmm1 1594; SSSE3-NEXT: paddb %xmm3, %xmm1 1595; SSSE3-NEXT: movdqa %xmm1, %xmm0 1596; SSSE3-NEXT: retq 1597; 1598; SSE41-LABEL: testv16i8u: 1599; SSE41: # %bb.0: 1600; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 1601; SSE41-NEXT: movdqa %xmm1, %xmm2 1602; SSE41-NEXT: pshufb %xmm0, %xmm2 1603; SSE41-NEXT: psrlw $4, %xmm0 1604; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 1605; 
SSE41-NEXT: pxor %xmm3, %xmm3 1606; SSE41-NEXT: pcmpeqb %xmm0, %xmm3 1607; SSE41-NEXT: pand %xmm2, %xmm3 1608; SSE41-NEXT: pshufb %xmm0, %xmm1 1609; SSE41-NEXT: paddb %xmm3, %xmm1 1610; SSE41-NEXT: movdqa %xmm1, %xmm0 1611; SSE41-NEXT: retq 1612; 1613; AVX-LABEL: testv16i8u: 1614; AVX: # %bb.0: 1615; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 1616; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm2 1617; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 1618; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 1619; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 1620; AVX-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm3 1621; AVX-NEXT: vpand %xmm3, %xmm2, %xmm2 1622; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 1623; AVX-NEXT: vpaddb %xmm0, %xmm2, %xmm0 1624; AVX-NEXT: retq 1625; 1626; AVX512VLBWDQ-LABEL: testv16i8u: 1627; AVX512VLBWDQ: # %bb.0: 1628; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 1629; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 1630; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm0 1631; AVX512VLBWDQ-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 1632; AVX512VLBWDQ-NEXT: vpxor %xmm3, %xmm3, %xmm3 1633; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm3 1634; AVX512VLBWDQ-NEXT: vpand %xmm3, %xmm2, %xmm2 1635; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm0 1636; AVX512VLBWDQ-NEXT: vpaddb %xmm0, %xmm2, %xmm0 1637; AVX512VLBWDQ-NEXT: retq 1638; 1639; AVX512-LABEL: testv16i8u: 1640; AVX512: # %bb.0: 1641; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 1642; AVX512-NEXT: vplzcntd %zmm0, %zmm0 1643; AVX512-NEXT: vpmovdb %zmm0, %xmm0 1644; AVX512-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 1645; AVX512-NEXT: vzeroupper 1646; 
AVX512-NEXT: retq 1647; 1648; X32-SSE-LABEL: testv16i8u: 1649; X32-SSE: # %bb.0: 1650; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 1651; X32-SSE-NEXT: movdqa %xmm1, %xmm2 1652; X32-SSE-NEXT: pshufb %xmm0, %xmm2 1653; X32-SSE-NEXT: psrlw $4, %xmm0 1654; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 1655; X32-SSE-NEXT: pxor %xmm3, %xmm3 1656; X32-SSE-NEXT: pcmpeqb %xmm0, %xmm3 1657; X32-SSE-NEXT: pand %xmm2, %xmm3 1658; X32-SSE-NEXT: pshufb %xmm0, %xmm1 1659; X32-SSE-NEXT: paddb %xmm3, %xmm1 1660; X32-SSE-NEXT: movdqa %xmm1, %xmm0 1661; X32-SSE-NEXT: retl 1662 %out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %in, i1 -1) 1663 ret <16 x i8> %out 1664} 1665 1666define <2 x i64> @foldv2i64() nounwind { 1667; SSE-LABEL: foldv2i64: 1668; SSE: # %bb.0: 1669; SSE-NEXT: movaps {{.*#+}} xmm0 = [55,0,0,0] 1670; SSE-NEXT: retq 1671; 1672; NOBW-LABEL: foldv2i64: 1673; NOBW: # %bb.0: 1674; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [55,0,0,0] 1675; NOBW-NEXT: retq 1676; 1677; AVX512VLBWDQ-LABEL: foldv2i64: 1678; AVX512VLBWDQ: # %bb.0: 1679; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [55,0,0,0] 1680; AVX512VLBWDQ-NEXT: retq 1681; 1682; X32-SSE-LABEL: foldv2i64: 1683; X32-SSE: # %bb.0: 1684; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [55,0,0,0] 1685; X32-SSE-NEXT: retl 1686 %out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 0) 1687 ret <2 x i64> %out 1688} 1689 1690define <2 x i64> @foldv2i64u() nounwind { 1691; SSE-LABEL: foldv2i64u: 1692; SSE: # %bb.0: 1693; SSE-NEXT: movaps {{.*#+}} xmm0 = [55,0,0,0] 1694; SSE-NEXT: retq 1695; 1696; NOBW-LABEL: foldv2i64u: 1697; NOBW: # %bb.0: 1698; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [55,0,0,0] 1699; NOBW-NEXT: retq 1700; 1701; AVX512VLBWDQ-LABEL: foldv2i64u: 1702; AVX512VLBWDQ: # %bb.0: 1703; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [55,0,0,0] 1704; AVX512VLBWDQ-NEXT: retq 1705; 1706; X32-SSE-LABEL: foldv2i64u: 1707; X32-SSE: # %bb.0: 1708; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [55,0,0,0] 1709; X32-SSE-NEXT: retl 1710 
%out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 -1)
  ret <2 x i64> %out
}

; ctlz of a constant <4 x i32>. The checks expect the call to be folded away,
; leaving a single constant-vector load.
; Lanes: 256 -> 23, -1 -> 0, 0 -> 32, 255 -> 24.
; The trailing i1 0 selects the form whose result is defined for a zero input
; (see the llvm.ctlz entry in the LangRef).
define <4 x i32> @foldv4i32() nounwind {
; SSE-LABEL: foldv4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [23,0,32,24]
; SSE-NEXT:    retq
;
; NOBW-LABEL: foldv4i32:
; NOBW:       # %bb.0:
; NOBW-NEXT:    vmovaps {{.*#+}} xmm0 = [23,0,32,24]
; NOBW-NEXT:    retq
;
; AVX512VLBWDQ-LABEL: foldv4i32:
; AVX512VLBWDQ:       # %bb.0:
; AVX512VLBWDQ-NEXT:    vmovaps {{.*#+}} xmm0 = [23,0,32,24]
; AVX512VLBWDQ-NEXT:    retq
;
; X32-SSE-LABEL: foldv4i32:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [23,0,32,24]
; X32-SSE-NEXT:    retl
  %out = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>, i1 0)
  ret <4 x i32> %out
}

; Same constant input as foldv4i32, but with the flag set (i1 -1 is i1 true),
; i.e. the form that does not define the result for a zero input. The checks
; show the zero lane still folding to 32, so the expected vector is identical.
define <4 x i32> @foldv4i32u() nounwind {
; SSE-LABEL: foldv4i32u:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [23,0,32,24]
; SSE-NEXT:    retq
;
; NOBW-LABEL: foldv4i32u:
; NOBW:       # %bb.0:
; NOBW-NEXT:    vmovaps {{.*#+}} xmm0 = [23,0,32,24]
; NOBW-NEXT:    retq
;
; AVX512VLBWDQ-LABEL: foldv4i32u:
; AVX512VLBWDQ:       # %bb.0:
; AVX512VLBWDQ-NEXT:    vmovaps {{.*#+}} xmm0 = [23,0,32,24]
; AVX512VLBWDQ-NEXT:    retq
;
; X32-SSE-LABEL: foldv4i32u:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [23,0,32,24]
; X32-SSE-NEXT:    retl
  %out = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>, i1 -1)
  ret <4 x i32> %out
}

; ctlz of a constant <8 x i16>, expected to fold to a constant-vector load.
; Lanes: 256 -> 7, -1 -> 0, 0 -> 16, 255 -> 8, 0 -> 16, 7 -> 13, 24 -> 11,
; 88 -> 9.
; Every literal fits in i16, so the input does not depend on the assembler
; truncating an oversized constant; lanes 2 and 4 are both plain zero.
define <8 x i16> @foldv8i16() nounwind {
; SSE-LABEL: foldv8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
; SSE-NEXT:    retq
;
; NOBW-LABEL: foldv8i16:
; NOBW:       # %bb.0:
; NOBW-NEXT:    vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
; NOBW-NEXT:    retq
;
; AVX512VLBWDQ-LABEL: foldv8i16:
; AVX512VLBWDQ:       # %bb.0:
; AVX512VLBWDQ-NEXT:    vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
; AVX512VLBWDQ-NEXT:    retq
;
; X32-SSE-LABEL: foldv8i16:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
; X32-SSE-NEXT:    retl
  %out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 0, i16 7, i16 24, i16 88>, i1 0)
  ret <8 x i16> %out
}

; Same constant input as foldv8i16 with the flag set (i1 -1 is i1 true). The
; checks expect the same folded vector, including 16 for the two zero lanes.
define <8 x i16> @foldv8i16u() nounwind {
; SSE-LABEL: foldv8i16u:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
; SSE-NEXT:    retq
;
; NOBW-LABEL: foldv8i16u:
; NOBW:       # %bb.0:
; NOBW-NEXT:    vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
; NOBW-NEXT:    retq
;
; AVX512VLBWDQ-LABEL: foldv8i16u:
; AVX512VLBWDQ:       # %bb.0:
; AVX512VLBWDQ-NEXT:    vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
; AVX512VLBWDQ-NEXT:    retq
;
; X32-SSE-LABEL: foldv8i16u:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
; X32-SSE-NEXT:    retl
  %out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 0, i16 7, i16 24, i16 88>, i1 -1)
  ret <8 x i16> %out
}

; ctlz of a constant <16 x i8>, expected to fold to a constant-vector load.
; Lanes: 0 -> 8, -1 -> 0, 0 -> 8, 255 -> 0, 0 -> 8, 7 -> 5, 24 -> 3, 88 -> 1,
; -2 -> 0, 254 -> 0, 1 -> 7, 2 -> 6, 4 -> 5, 8 -> 4, 16 -> 3, 32 -> 2.
; Lanes 0 and 4 are written as plain zero so that no literal exceeds 8 bits.
define <16 x i8> @foldv16i8() nounwind {
; SSE-LABEL: foldv16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
; SSE-NEXT:    retq
;
; NOBW-LABEL: foldv16i8:
; NOBW:       # %bb.0:
; NOBW-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
; NOBW-NEXT:    retq
;
; AVX512VLBWDQ-LABEL: foldv16i8:
; AVX512VLBWDQ:       # %bb.0:
; AVX512VLBWDQ-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
; AVX512VLBWDQ-NEXT:    retq
;
; X32-SSE-LABEL: foldv16i8:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
; X32-SSE-NEXT:    retl
  %out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> <i8 0, i8 -1, i8 0, i8 255, i8 0, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>, i1 0)
  ret <16 x i8> %out
}

; Same constant input as foldv16i8 with the flag set (i1 -1 is i1 true). The
; checks expect the same folded vector, including 8 for the zero lanes.
define <16 x i8> @foldv16i8u() nounwind {
; SSE-LABEL: foldv16i8u:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
; SSE-NEXT:    retq
;
; NOBW-LABEL: foldv16i8u:
; NOBW:       # %bb.0:
; NOBW-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
; NOBW-NEXT:    retq
;
; AVX512VLBWDQ-LABEL: foldv16i8u:
; AVX512VLBWDQ:       # %bb.0:
; AVX512VLBWDQ-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
; AVX512VLBWDQ-NEXT:    retq
;
; X32-SSE-LABEL: foldv16i8u:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
; X32-SSE-NEXT:    retl
  %out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> <i8 0, i8 -1, i8 0, i8 255, i8 0, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>, i1 -1)
  ret <16 x i8> %out
}

; Intrinsic declarations: vector operand plus the i1 zero-input flag.
declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1)
declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1)
declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1)
declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8>, i1)