; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE,SSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512CDVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,-avx512vl | FileCheck %s --check-prefixes=AVX,AVX512CD
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=AVX512VPOPCNTDQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq,+avx512vl | FileCheck %s --check-prefix=AVX512VPOPCNTDQVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bitalg | FileCheck %s --check-prefix=BITALG_NOVLX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bitalg,+avx512vl | FileCheck %s --check-prefix=BITALG
;
; Just one 32-bit run to make sure we do reasonable things for i64 tzcnt.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32-SSE

; Count trailing zeros of each <2 x i64> element; the i1 0 flag requests a
; defined result for a zero element.
define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; SSE2-LABEL: testv2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE2-NEXT:    paddq %xmm0, %xmm1
; SSE2-NEXT:    pandn %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $1, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    psubb %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    paddb %xmm2, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    pxor %xmm0, %xmm0
; SSE2-NEXT:    psadbw %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE3-LABEL: testv2i64:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE3-NEXT:    paddq %xmm0, %xmm1
; SSE3-NEXT:    pandn %xmm1, %xmm0
; SSE3-NEXT:    movdqa %xmm0, %xmm1
; SSE3-NEXT:    psrlw $1, %xmm1
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE3-NEXT:    psubb %xmm1, %xmm0
; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT:    movdqa %xmm0, %xmm2
; SSE3-NEXT:    pand %xmm1, %xmm2
; SSE3-NEXT:    psrlw $2, %xmm0
; SSE3-NEXT:    pand %xmm1, %xmm0
; SSE3-NEXT:    paddb %xmm2, %xmm0
; SSE3-NEXT:    movdqa %xmm0, %xmm1
; SSE3-NEXT:    psrlw $4, %xmm1
; SSE3-NEXT:    paddb %xmm0, %xmm1
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE3-NEXT:    pxor %xmm0, %xmm0
; SSE3-NEXT:    psadbw %xmm0, %xmm1
; SSE3-NEXT:    movdqa %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: testv2i64:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
; SSSE3-NEXT:    paddq %xmm0, %xmm1
; SSSE3-NEXT:    pandn %xmm1, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm1, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT:    movdqa %xmm3, %xmm4
; SSSE3-NEXT:    pshufb %xmm2, %xmm4
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    pshufb %xmm0, %xmm3
; SSSE3-NEXT:    paddb %xmm4, %xmm3
; SSSE3-NEXT:    pxor %xmm0, %xmm0
; SSSE3-NEXT:    psadbw %xmm3, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: testv2i64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE41-NEXT:    paddq %xmm0, %xmm1
; SSE41-NEXT:    pandn %xmm1, %xmm0
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pand %xmm1, %xmm2
; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT:    movdqa %xmm3, %xmm4
; SSE41-NEXT:    pshufb %xmm2, %xmm4
; SSE41-NEXT:    psrlw $4, %xmm0
; SSE41-NEXT:    pand %xmm1, %xmm0
; SSE41-NEXT:    pshufb %xmm0, %xmm3
; SSE41-NEXT:    paddb %xmm4, %xmm3
; SSE41-NEXT:    pxor %xmm0, %xmm0
; SSE41-NEXT:    psadbw %xmm3, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: testv2i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpandn %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv2i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpandn %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: testv2i64:
; AVX512CDVL:       # %bb.0:
; AVX512CDVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX512CDVL-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
; AVX512CDVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
; AVX512CDVL-NEXT:    vplzcntq %xmm0, %xmm0
; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [64,64]
; AVX512CDVL-NEXT:    vpsubq %xmm0, %xmm1, %xmm0
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: testv2i64:
; AVX512CD:       # %bb.0:
; AVX512CD-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX512CD-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
; AVX512CD-NEXT:    vpandn %xmm1, %xmm0, %xmm0
; AVX512CD-NEXT:    vplzcntq %zmm0, %zmm0
; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm1 = [64,64]
; AVX512CD-NEXT:    vpsubq %xmm0, %xmm1, %xmm0
; AVX512CD-NEXT:    vzeroupper
; AVX512CD-NEXT:    retq
;
; AVX512VPOPCNTDQ-LABEL: testv2i64:
; AVX512VPOPCNTDQ:       # %bb.0:
; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
; AVX512VPOPCNTDQ-NEXT:    vpandn %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VPOPCNTDQ-NEXT:    vzeroupper
; AVX512VPOPCNTDQ-NEXT:    retq
;
; AVX512VPOPCNTDQVL-LABEL: testv2i64:
; AVX512VPOPCNTDQVL:       # %bb.0:
; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQVL-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
; AVX512VPOPCNTDQVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT:    vpopcntq %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT:    retq
;
; BITALG_NOVLX-LABEL: testv2i64:
; BITALG_NOVLX:       # %bb.0:
; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
; BITALG_NOVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT:    vzeroupper
; BITALG_NOVLX-NEXT:    retq
;
; BITALG-LABEL: testv2i64:
; BITALG:       # %bb.0:
; BITALG-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; BITALG-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
; BITALG-NEXT:    vpandn %xmm1, %xmm0, %xmm0
; BITALG-NEXT:    vpopcntb %xmm0, %xmm0
; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; BITALG-NEXT:    retq
;
; X32-SSE-LABEL: testv2i64:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    pcmpeqd %xmm1, %xmm1
; X32-SSE-NEXT:    paddq %xmm0, %xmm1
; X32-SSE-NEXT:    pandn %xmm1, %xmm0
; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
; X32-SSE-NEXT:    pand %xmm1, %xmm2
; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
; X32-SSE-NEXT:    pshufb %xmm2, %xmm4
; X32-SSE-NEXT:    psrlw $4, %xmm0
; X32-SSE-NEXT:    pand %xmm1, %xmm0
; X32-SSE-NEXT:    pshufb %xmm0, %xmm3
; X32-SSE-NEXT:    paddb %xmm4, %xmm3
; X32-SSE-NEXT:    pxor %xmm0, %xmm0
; X32-SSE-NEXT:    psadbw %xmm3, %xmm0
; X32-SSE-NEXT:    retl
  %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %in, i1 0)
  ret <2 x i64> %out
}

; Same as testv2i64, but the flag is i1 -1 (true), so the result for a zero
; element is unconstrained.
define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
; SSE2-LABEL: testv2i64u:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE2-NEXT:    paddq %xmm0, %xmm1
; SSE2-NEXT:    pandn %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $1, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    psubb %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    paddb %xmm2, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    pxor %xmm0, %xmm0
; SSE2-NEXT:    psadbw %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE3-LABEL: testv2i64u:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE3-NEXT:    paddq %xmm0, %xmm1
; SSE3-NEXT:    pandn %xmm1, %xmm0
; SSE3-NEXT:    movdqa %xmm0, %xmm1
; SSE3-NEXT:    psrlw $1, %xmm1
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE3-NEXT:    psubb %xmm1, %xmm0
; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT:    movdqa %xmm0, %xmm2
; SSE3-NEXT:    pand %xmm1, %xmm2
; SSE3-NEXT:    psrlw $2, %xmm0
; SSE3-NEXT:    pand %xmm1, %xmm0
; SSE3-NEXT:    paddb %xmm2, %xmm0
; SSE3-NEXT:    movdqa %xmm0, %xmm1
; SSE3-NEXT:    psrlw $4, %xmm1
; SSE3-NEXT:    paddb %xmm0, %xmm1
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE3-NEXT:    pxor %xmm0, %xmm0
; SSE3-NEXT:    psadbw %xmm0, %xmm1
; SSE3-NEXT:    movdqa %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: testv2i64u:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
; SSSE3-NEXT:    paddq %xmm0, %xmm1
; SSSE3-NEXT:    pandn %xmm1, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm1, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT:    movdqa %xmm3, %xmm4
; SSSE3-NEXT:    pshufb %xmm2, %xmm4
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    pshufb %xmm0, %xmm3
; SSSE3-NEXT:    paddb %xmm4, %xmm3
; SSSE3-NEXT:    pxor %xmm0, %xmm0
; SSSE3-NEXT:    psadbw %xmm3, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: testv2i64u:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE41-NEXT:    paddq %xmm0, %xmm1
; SSE41-NEXT:    pandn %xmm1, %xmm0
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pand %xmm1, %xmm2
; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT:    movdqa %xmm3, %xmm4
; SSE41-NEXT:    pshufb %xmm2, %xmm4
; SSE41-NEXT:    psrlw $4, %xmm0
; SSE41-NEXT:    pand %xmm1, %xmm0
; SSE41-NEXT:    pshufb %xmm0, %xmm3
; SSE41-NEXT:    paddb %xmm4, %xmm3
; SSE41-NEXT:    pxor %xmm0, %xmm0
; SSE41-NEXT:    psadbw %xmm3, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: testv2i64u:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpandn %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv2i64u:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpandn %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: testv2i64u:
; AVX512CDVL:       # %bb.0:
; AVX512CDVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX512CDVL-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
; AVX512CDVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
; AVX512CDVL-NEXT:    vplzcntq %xmm0, %xmm0
; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [64,64]
; AVX512CDVL-NEXT:    vpsubq %xmm0, %xmm1, %xmm0
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: testv2i64u:
; AVX512CD:       # %bb.0:
; AVX512CD-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX512CD-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
; AVX512CD-NEXT:    vpandn %xmm1, %xmm0, %xmm0
; AVX512CD-NEXT:    vplzcntq %zmm0, %zmm0
; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm1 = [64,64]
; AVX512CD-NEXT:    vpsubq %xmm0, %xmm1, %xmm0
; AVX512CD-NEXT:    vzeroupper
; AVX512CD-NEXT:    retq
;
; AVX512VPOPCNTDQ-LABEL: testv2i64u:
; AVX512VPOPCNTDQ:       # %bb.0:
; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
; AVX512VPOPCNTDQ-NEXT:    vpandn %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VPOPCNTDQ-NEXT:    vzeroupper
; AVX512VPOPCNTDQ-NEXT:    retq
;
; AVX512VPOPCNTDQVL-LABEL: testv2i64u:
; AVX512VPOPCNTDQVL:       # %bb.0:
; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQVL-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
; AVX512VPOPCNTDQVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT:    vpopcntq %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT:    retq
;
; BITALG_NOVLX-LABEL: testv2i64u:
; BITALG_NOVLX:       # %bb.0:
; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
; BITALG_NOVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT:    vzeroupper
; BITALG_NOVLX-NEXT:    retq
;
; BITALG-LABEL: testv2i64u:
; BITALG:       # %bb.0:
; BITALG-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; BITALG-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
; BITALG-NEXT:    vpandn %xmm1, %xmm0, %xmm0
; BITALG-NEXT:    vpopcntb %xmm0, %xmm0
; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; BITALG-NEXT:    retq
;
; X32-SSE-LABEL: testv2i64u:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    pcmpeqd %xmm1, %xmm1
; X32-SSE-NEXT:    paddq %xmm0, %xmm1
; X32-SSE-NEXT:    pandn %xmm1, %xmm0
; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
; X32-SSE-NEXT:    pand %xmm1, %xmm2
; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
; X32-SSE-NEXT:    pshufb %xmm2, %xmm4
; X32-SSE-NEXT:    psrlw $4, %xmm0
; X32-SSE-NEXT:    pand %xmm1, %xmm0
; X32-SSE-NEXT:    pshufb %xmm0, %xmm3
; X32-SSE-NEXT:    paddb %xmm4, %xmm3
; X32-SSE-NEXT:    pxor %xmm0, %xmm0
; X32-SSE-NEXT:    psadbw %xmm3, %xmm0
; X32-SSE-NEXT:    retl
  %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %in, i1 -1)
  ret <2 x i64> %out
}

; Count trailing zeros of each <4 x i32> element; the i1 0 flag requests a
; defined result for a zero element.
define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; SSE2-LABEL: testv4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    pandn %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $1, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    psubb %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    paddb %xmm2, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    pxor %xmm0, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-NEXT:    psadbw %xmm0, %xmm2
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    psadbw %xmm0, %xmm1
; SSE2-NEXT:    packuswb %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE3-LABEL: testv4i32:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE3-NEXT:    paddd %xmm0, %xmm1
; SSE3-NEXT:    pandn %xmm1, %xmm0
; SSE3-NEXT:    movdqa %xmm0, %xmm1
; SSE3-NEXT:    psrlw $1, %xmm1
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE3-NEXT:    psubb %xmm1, %xmm0
; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT:    movdqa %xmm0, %xmm2
; SSE3-NEXT:    pand %xmm1, %xmm2
; SSE3-NEXT:    psrlw $2, %xmm0
; SSE3-NEXT:    pand %xmm1, %xmm0
; SSE3-NEXT:    paddb %xmm2, %xmm0
; SSE3-NEXT:    movdqa %xmm0, %xmm1
; SSE3-NEXT:    psrlw $4, %xmm1
; SSE3-NEXT:    paddb %xmm0, %xmm1
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE3-NEXT:    pxor %xmm0, %xmm0
; SSE3-NEXT:    movdqa %xmm1, %xmm2
; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE3-NEXT:    psadbw %xmm0, %xmm2
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    psadbw %xmm0, %xmm1
; SSE3-NEXT:    packuswb %xmm2, %xmm1
; SSE3-NEXT:    movdqa %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: testv4i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
; SSSE3-NEXT:    paddd %xmm0, %xmm1
; SSSE3-NEXT:    pandn %xmm1, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm3
; SSSE3-NEXT:    pand %xmm2, %xmm3
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT:    movdqa %xmm1, %xmm4
; SSSE3-NEXT:    pshufb %xmm3, %xmm4
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm2, %xmm0
; SSSE3-NEXT:    pshufb %xmm0, %xmm1
; SSSE3-NEXT:    paddb %xmm4, %xmm1
; SSSE3-NEXT:    pxor %xmm0, %xmm0
; SSSE3-NEXT:    movdqa %xmm1, %xmm2
; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSSE3-NEXT:    psadbw %xmm0, %xmm2
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT:    psadbw %xmm0, %xmm1
; SSSE3-NEXT:    packuswb %xmm2, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: testv4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE41-NEXT:    paddd %xmm0, %xmm1
; SSE41-NEXT:    pandn %xmm1, %xmm0
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pand %xmm1, %xmm2
; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT:    movdqa %xmm3, %xmm4
; SSE41-NEXT:    pshufb %xmm2, %xmm4
; SSE41-NEXT:    psrlw $4, %xmm0
; SSE41-NEXT:    pand %xmm1, %xmm0
; SSE41-NEXT:    pshufb %xmm0, %xmm3
; SSE41-NEXT:    paddb %xmm4, %xmm3
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero
; SSE41-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; SSE41-NEXT:    psadbw %xmm1, %xmm3
; SSE41-NEXT:    psadbw %xmm1, %xmm0
; SSE41-NEXT:    packuswb %xmm3, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: testv4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpandn %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpandn %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: testv4i32:
; AVX512CDVL:       # %bb.0:
; AVX512CDVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX512CDVL-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX512CDVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
; AVX512CDVL-NEXT:    vplzcntd %xmm0, %xmm0
; AVX512CDVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [32,32,32,32]
; AVX512CDVL-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: testv4i32:
; AVX512CD:       # %bb.0:
; AVX512CD-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX512CD-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX512CD-NEXT:    vpandn %xmm1, %xmm0, %xmm0
; AVX512CD-NEXT:    vplzcntd %zmm0, %zmm0
; AVX512CD-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [32,32,32,32]
; AVX512CD-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
; AVX512CD-NEXT:    vzeroupper
; AVX512CD-NEXT:    retq
;
; AVX512VPOPCNTDQ-LABEL: testv4i32:
; AVX512VPOPCNTDQ:       # %bb.0:
; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX512VPOPCNTDQ-NEXT:    vpandn %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VPOPCNTDQ-NEXT:    vzeroupper
; AVX512VPOPCNTDQ-NEXT:    retq
;
; AVX512VPOPCNTDQVL-LABEL: testv4i32:
; AVX512VPOPCNTDQVL:       # %bb.0:
; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQVL-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX512VPOPCNTDQVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT:    retq
;
; BITALG_NOVLX-LABEL: testv4i32:
; BITALG_NOVLX:       # %bb.0:
; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; BITALG_NOVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
; BITALG_NOVLX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; BITALG_NOVLX-NEXT:    vzeroupper
; BITALG_NOVLX-NEXT:    retq
;
; BITALG-LABEL: testv4i32:
; BITALG:       # %bb.0:
; BITALG-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; BITALG-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; BITALG-NEXT:    vpandn %xmm1, %xmm0, %xmm0
; BITALG-NEXT:    vpopcntb %xmm0, %xmm0
; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; BITALG-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
; BITALG-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; BITALG-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; BITALG-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; BITALG-NEXT:    retq
;
; X32-SSE-LABEL: testv4i32:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    pcmpeqd %xmm1, %xmm1
; X32-SSE-NEXT:    paddd %xmm0, %xmm1
; X32-SSE-NEXT:    pandn %xmm1, %xmm0
; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
; X32-SSE-NEXT:    pand %xmm1, %xmm2
; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
; X32-SSE-NEXT:    pshufb %xmm2, %xmm4
; X32-SSE-NEXT:    psrlw $4, %xmm0
; X32-SSE-NEXT:    pand %xmm1, %xmm0
; X32-SSE-NEXT:    pshufb %xmm0, %xmm3
; X32-SSE-NEXT:    paddb %xmm4, %xmm3
; X32-SSE-NEXT:    pxor %xmm1, %xmm1
; X32-SSE-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero
; X32-SSE-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; X32-SSE-NEXT:    psadbw %xmm1, %xmm3
; X32-SSE-NEXT:    psadbw %xmm1, %xmm0
; X32-SSE-NEXT:    packuswb %xmm3, %xmm0
; X32-SSE-NEXT:    retl
  %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %in, i1 0)
  ret <4 x i32> %out
}

define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
; SSE2-LABEL: testv4i32u:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    pandn %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $1, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    psubb %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    paddb %xmm2, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    pxor %xmm0, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-NEXT:    psadbw %xmm0, %xmm2
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    psadbw %xmm0, %xmm1
; SSE2-NEXT:    packuswb %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE3-LABEL: testv4i32u:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE3-NEXT:    paddd %xmm0, %xmm1
; SSE3-NEXT:    pandn %xmm1, %xmm0
; SSE3-NEXT:    movdqa %xmm0, %xmm1
; SSE3-NEXT:    psrlw $1, %xmm1
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE3-NEXT:    psubb %xmm1, %xmm0
; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT:    movdqa %xmm0, %xmm2
; SSE3-NEXT:    pand %xmm1, %xmm2
; SSE3-NEXT:    psrlw $2, %xmm0
; SSE3-NEXT:    pand %xmm1, %xmm0
; SSE3-NEXT:    paddb %xmm2, %xmm0
; SSE3-NEXT:    movdqa %xmm0, %xmm1
; SSE3-NEXT:    psrlw $4, %xmm1
; SSE3-NEXT:    paddb %xmm0, %xmm1
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE3-NEXT:    pxor %xmm0, %xmm0
; SSE3-NEXT:    movdqa %xmm1, %xmm2
; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE3-NEXT:    psadbw %xmm0, %xmm2
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    psadbw %xmm0, %xmm1
; SSE3-NEXT:    packuswb %xmm2, %xmm1
; SSE3-NEXT:    movdqa %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: testv4i32u:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
; SSSE3-NEXT:    paddd %xmm0, %xmm1
; SSSE3-NEXT:    pandn %xmm1, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm3
; SSSE3-NEXT:    pand %xmm2, %xmm3
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT:    movdqa %xmm1, %xmm4
; SSSE3-NEXT:    pshufb %xmm3, %xmm4
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm2, %xmm0
; SSSE3-NEXT:    pshufb %xmm0, %xmm1
; SSSE3-NEXT:    paddb %xmm4, %xmm1
; SSSE3-NEXT:    pxor %xmm0, %xmm0
; SSSE3-NEXT:    movdqa %xmm1, %xmm2
; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSSE3-NEXT:    psadbw %xmm0, %xmm2
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT:    psadbw %xmm0, %xmm1
; SSSE3-NEXT:    packuswb %xmm2, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: testv4i32u:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE41-NEXT:    paddd %xmm0, %xmm1
; SSE41-NEXT:    pandn %xmm1, %xmm0
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pand %xmm1, %xmm2
; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT:    movdqa %xmm3, %xmm4
; SSE41-NEXT:    pshufb %xmm2, %xmm4
; SSE41-NEXT:    psrlw $4, %xmm0
; SSE41-NEXT:    pand %xmm1, %xmm0
; SSE41-NEXT:    pshufb %xmm0, %xmm3
; SSE41-NEXT:    paddb %xmm4, %xmm3
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero
; SSE41-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; SSE41-NEXT:    psadbw %xmm1, %xmm3
; SSE41-NEXT:    psadbw %xmm1, %xmm0
; SSE41-NEXT:    packuswb %xmm3, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: testv4i32u:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpandn %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv4i32u:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpandn %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: testv4i32u:
; AVX512CDVL:       # %bb.0:
; AVX512CDVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX512CDVL-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX512CDVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
; AVX512CDVL-NEXT:    vplzcntd %xmm0, %xmm0
; AVX512CDVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [32,32,32,32]
; AVX512CDVL-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: testv4i32u:
; AVX512CD:       # %bb.0:
; AVX512CD-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX512CD-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX512CD-NEXT:    vpandn %xmm1, %xmm0, %xmm0
; AVX512CD-NEXT:    vplzcntd %zmm0, %zmm0
; AVX512CD-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [32,32,32,32]
; AVX512CD-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
; AVX512CD-NEXT:    vzeroupper
; AVX512CD-NEXT:    retq
;
; AVX512VPOPCNTDQ-LABEL: testv4i32u:
; AVX512VPOPCNTDQ:       # %bb.0:
; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX512VPOPCNTDQ-NEXT:    vpandn %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VPOPCNTDQ-NEXT:    vzeroupper
; AVX512VPOPCNTDQ-NEXT:    retq
;
; AVX512VPOPCNTDQVL-LABEL: testv4i32u:
; AVX512VPOPCNTDQVL:       # %bb.0:
; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQVL-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX512VPOPCNTDQVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT:    retq
;
; BITALG_NOVLX-LABEL: testv4i32u:
; BITALG_NOVLX:       # %bb.0:
; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; BITALG_NOVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
; BITALG_NOVLX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; BITALG_NOVLX-NEXT:    vzeroupper
; BITALG_NOVLX-NEXT:    retq
869; 870; BITALG-LABEL: testv4i32u: 871; BITALG: # %bb.0: 872; BITALG-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 873; BITALG-NEXT: vpaddd %xmm1, %xmm0, %xmm1 874; BITALG-NEXT: vpandn %xmm1, %xmm0, %xmm0 875; BITALG-NEXT: vpopcntb %xmm0, %xmm0 876; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 877; BITALG-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] 878; BITALG-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 879; BITALG-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero 880; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 881; BITALG-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 882; BITALG-NEXT: retq 883; 884; X32-SSE-LABEL: testv4i32u: 885; X32-SSE: # %bb.0: 886; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1 887; X32-SSE-NEXT: paddd %xmm0, %xmm1 888; X32-SSE-NEXT: pandn %xmm1, %xmm0 889; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 890; X32-SSE-NEXT: movdqa %xmm0, %xmm2 891; X32-SSE-NEXT: pand %xmm1, %xmm2 892; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 893; X32-SSE-NEXT: movdqa %xmm3, %xmm4 894; X32-SSE-NEXT: pshufb %xmm2, %xmm4 895; X32-SSE-NEXT: psrlw $4, %xmm0 896; X32-SSE-NEXT: pand %xmm1, %xmm0 897; X32-SSE-NEXT: pshufb %xmm0, %xmm3 898; X32-SSE-NEXT: paddb %xmm4, %xmm3 899; X32-SSE-NEXT: pxor %xmm1, %xmm1 900; X32-SSE-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero 901; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] 902; X32-SSE-NEXT: psadbw %xmm1, %xmm3 903; X32-SSE-NEXT: psadbw %xmm1, %xmm0 904; X32-SSE-NEXT: packuswb %xmm3, %xmm0 905; X32-SSE-NEXT: retl 906 %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %in, i1 -1) 907 ret <4 x i32> %out 908} 909 910define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { 911; SSE2-LABEL: testv8i16: 912; SSE2: # %bb.0: 913; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 914; SSE2-NEXT: paddw %xmm0, %xmm1 915; SSE2-NEXT: pandn %xmm1, %xmm0 916; SSE2-NEXT: movdqa %xmm0, %xmm1 917; SSE2-NEXT: psrlw $1, %xmm1 918; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 919; SSE2-NEXT: 
psubb %xmm1, %xmm0 920; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 921; SSE2-NEXT: movdqa %xmm0, %xmm2 922; SSE2-NEXT: pand %xmm1, %xmm2 923; SSE2-NEXT: psrlw $2, %xmm0 924; SSE2-NEXT: pand %xmm1, %xmm0 925; SSE2-NEXT: paddb %xmm2, %xmm0 926; SSE2-NEXT: movdqa %xmm0, %xmm1 927; SSE2-NEXT: psrlw $4, %xmm1 928; SSE2-NEXT: paddb %xmm0, %xmm1 929; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 930; SSE2-NEXT: movdqa %xmm1, %xmm0 931; SSE2-NEXT: psllw $8, %xmm0 932; SSE2-NEXT: paddb %xmm1, %xmm0 933; SSE2-NEXT: psrlw $8, %xmm0 934; SSE2-NEXT: retq 935; 936; SSE3-LABEL: testv8i16: 937; SSE3: # %bb.0: 938; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 939; SSE3-NEXT: paddw %xmm0, %xmm1 940; SSE3-NEXT: pandn %xmm1, %xmm0 941; SSE3-NEXT: movdqa %xmm0, %xmm1 942; SSE3-NEXT: psrlw $1, %xmm1 943; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 944; SSE3-NEXT: psubb %xmm1, %xmm0 945; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 946; SSE3-NEXT: movdqa %xmm0, %xmm2 947; SSE3-NEXT: pand %xmm1, %xmm2 948; SSE3-NEXT: psrlw $2, %xmm0 949; SSE3-NEXT: pand %xmm1, %xmm0 950; SSE3-NEXT: paddb %xmm2, %xmm0 951; SSE3-NEXT: movdqa %xmm0, %xmm1 952; SSE3-NEXT: psrlw $4, %xmm1 953; SSE3-NEXT: paddb %xmm0, %xmm1 954; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 955; SSE3-NEXT: movdqa %xmm1, %xmm0 956; SSE3-NEXT: psllw $8, %xmm0 957; SSE3-NEXT: paddb %xmm1, %xmm0 958; SSE3-NEXT: psrlw $8, %xmm0 959; SSE3-NEXT: retq 960; 961; SSSE3-LABEL: testv8i16: 962; SSSE3: # %bb.0: 963; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 964; SSSE3-NEXT: paddw %xmm0, %xmm1 965; SSSE3-NEXT: pandn %xmm1, %xmm0 966; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 967; SSSE3-NEXT: movdqa %xmm0, %xmm2 968; SSSE3-NEXT: pand %xmm1, %xmm2 969; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 970; SSSE3-NEXT: movdqa %xmm3, %xmm4 971; SSSE3-NEXT: pshufb %xmm2, %xmm4 972; SSSE3-NEXT: psrlw $4, %xmm0 973; SSSE3-NEXT: pand %xmm1, %xmm0 974; 
SSSE3-NEXT: pshufb %xmm0, %xmm3 975; SSSE3-NEXT: paddb %xmm4, %xmm3 976; SSSE3-NEXT: movdqa %xmm3, %xmm0 977; SSSE3-NEXT: psllw $8, %xmm0 978; SSSE3-NEXT: paddb %xmm3, %xmm0 979; SSSE3-NEXT: psrlw $8, %xmm0 980; SSSE3-NEXT: retq 981; 982; SSE41-LABEL: testv8i16: 983; SSE41: # %bb.0: 984; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 985; SSE41-NEXT: paddw %xmm0, %xmm1 986; SSE41-NEXT: pandn %xmm1, %xmm0 987; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 988; SSE41-NEXT: movdqa %xmm0, %xmm2 989; SSE41-NEXT: pand %xmm1, %xmm2 990; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 991; SSE41-NEXT: movdqa %xmm3, %xmm4 992; SSE41-NEXT: pshufb %xmm2, %xmm4 993; SSE41-NEXT: psrlw $4, %xmm0 994; SSE41-NEXT: pand %xmm1, %xmm0 995; SSE41-NEXT: pshufb %xmm0, %xmm3 996; SSE41-NEXT: paddb %xmm4, %xmm3 997; SSE41-NEXT: movdqa %xmm3, %xmm0 998; SSE41-NEXT: psllw $8, %xmm0 999; SSE41-NEXT: paddb %xmm3, %xmm0 1000; SSE41-NEXT: psrlw $8, %xmm0 1001; SSE41-NEXT: retq 1002; 1003; AVX-LABEL: testv8i16: 1004; AVX: # %bb.0: 1005; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 1006; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm1 1007; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0 1008; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1009; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 1010; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 1011; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 1012; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 1013; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 1014; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 1015; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 1016; AVX-NEXT: vpsllw $8, %xmm0, %xmm1 1017; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0 1018; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 1019; AVX-NEXT: retq 1020; 1021; AVX512VPOPCNTDQ-LABEL: testv8i16: 1022; AVX512VPOPCNTDQ: # %bb.0: 1023; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 1024; AVX512VPOPCNTDQ-NEXT: vpaddw %xmm1, %xmm0, %xmm1 1025; AVX512VPOPCNTDQ-NEXT: vpandn %xmm1, %xmm0, 
%xmm0 1026; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1027; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 1028; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 1029; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1030; AVX512VPOPCNTDQ-NEXT: vzeroupper 1031; AVX512VPOPCNTDQ-NEXT: retq 1032; 1033; AVX512VPOPCNTDQVL-LABEL: testv8i16: 1034; AVX512VPOPCNTDQVL: # %bb.0: 1035; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 1036; AVX512VPOPCNTDQVL-NEXT: vpaddw %xmm1, %xmm0, %xmm1 1037; AVX512VPOPCNTDQVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 1038; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1039; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 1040; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 1041; AVX512VPOPCNTDQVL-NEXT: vzeroupper 1042; AVX512VPOPCNTDQVL-NEXT: retq 1043; 1044; BITALG_NOVLX-LABEL: testv8i16: 1045; BITALG_NOVLX: # %bb.0: 1046; BITALG_NOVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 1047; BITALG_NOVLX-NEXT: vpaddw %xmm1, %xmm0, %xmm1 1048; BITALG_NOVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 1049; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 1050; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1051; BITALG_NOVLX-NEXT: vzeroupper 1052; BITALG_NOVLX-NEXT: retq 1053; 1054; BITALG-LABEL: testv8i16: 1055; BITALG: # %bb.0: 1056; BITALG-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 1057; BITALG-NEXT: vpaddw %xmm1, %xmm0, %xmm1 1058; BITALG-NEXT: vpandn %xmm1, %xmm0, %xmm0 1059; BITALG-NEXT: vpopcntw %xmm0, %xmm0 1060; BITALG-NEXT: retq 1061; 1062; X32-SSE-LABEL: testv8i16: 1063; X32-SSE: # %bb.0: 1064; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1 1065; X32-SSE-NEXT: paddw %xmm0, %xmm1 1066; X32-SSE-NEXT: pandn %xmm1, %xmm0 1067; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1068; X32-SSE-NEXT: movdqa %xmm0, %xmm2 1069; 
X32-SSE-NEXT: pand %xmm1, %xmm2 1070; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 1071; X32-SSE-NEXT: movdqa %xmm3, %xmm4 1072; X32-SSE-NEXT: pshufb %xmm2, %xmm4 1073; X32-SSE-NEXT: psrlw $4, %xmm0 1074; X32-SSE-NEXT: pand %xmm1, %xmm0 1075; X32-SSE-NEXT: pshufb %xmm0, %xmm3 1076; X32-SSE-NEXT: paddb %xmm4, %xmm3 1077; X32-SSE-NEXT: movdqa %xmm3, %xmm0 1078; X32-SSE-NEXT: psllw $8, %xmm0 1079; X32-SSE-NEXT: paddb %xmm3, %xmm0 1080; X32-SSE-NEXT: psrlw $8, %xmm0 1081; X32-SSE-NEXT: retl 1082 %out = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %in, i1 0) 1083 ret <8 x i16> %out 1084} 1085 1086define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { 1087; SSE2-LABEL: testv8i16u: 1088; SSE2: # %bb.0: 1089; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 1090; SSE2-NEXT: paddw %xmm0, %xmm1 1091; SSE2-NEXT: pandn %xmm1, %xmm0 1092; SSE2-NEXT: movdqa %xmm0, %xmm1 1093; SSE2-NEXT: psrlw $1, %xmm1 1094; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 1095; SSE2-NEXT: psubb %xmm1, %xmm0 1096; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1097; SSE2-NEXT: movdqa %xmm0, %xmm2 1098; SSE2-NEXT: pand %xmm1, %xmm2 1099; SSE2-NEXT: psrlw $2, %xmm0 1100; SSE2-NEXT: pand %xmm1, %xmm0 1101; SSE2-NEXT: paddb %xmm2, %xmm0 1102; SSE2-NEXT: movdqa %xmm0, %xmm1 1103; SSE2-NEXT: psrlw $4, %xmm1 1104; SSE2-NEXT: paddb %xmm0, %xmm1 1105; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 1106; SSE2-NEXT: movdqa %xmm1, %xmm0 1107; SSE2-NEXT: psllw $8, %xmm0 1108; SSE2-NEXT: paddb %xmm1, %xmm0 1109; SSE2-NEXT: psrlw $8, %xmm0 1110; SSE2-NEXT: retq 1111; 1112; SSE3-LABEL: testv8i16u: 1113; SSE3: # %bb.0: 1114; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 1115; SSE3-NEXT: paddw %xmm0, %xmm1 1116; SSE3-NEXT: pandn %xmm1, %xmm0 1117; SSE3-NEXT: movdqa %xmm0, %xmm1 1118; SSE3-NEXT: psrlw $1, %xmm1 1119; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 1120; SSE3-NEXT: psubb %xmm1, %xmm0 1121; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1122; SSE3-NEXT: movdqa %xmm0, 
%xmm2 1123; SSE3-NEXT: pand %xmm1, %xmm2 1124; SSE3-NEXT: psrlw $2, %xmm0 1125; SSE3-NEXT: pand %xmm1, %xmm0 1126; SSE3-NEXT: paddb %xmm2, %xmm0 1127; SSE3-NEXT: movdqa %xmm0, %xmm1 1128; SSE3-NEXT: psrlw $4, %xmm1 1129; SSE3-NEXT: paddb %xmm0, %xmm1 1130; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 1131; SSE3-NEXT: movdqa %xmm1, %xmm0 1132; SSE3-NEXT: psllw $8, %xmm0 1133; SSE3-NEXT: paddb %xmm1, %xmm0 1134; SSE3-NEXT: psrlw $8, %xmm0 1135; SSE3-NEXT: retq 1136; 1137; SSSE3-LABEL: testv8i16u: 1138; SSSE3: # %bb.0: 1139; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 1140; SSSE3-NEXT: paddw %xmm0, %xmm1 1141; SSSE3-NEXT: pandn %xmm1, %xmm0 1142; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1143; SSSE3-NEXT: movdqa %xmm0, %xmm2 1144; SSSE3-NEXT: pand %xmm1, %xmm2 1145; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 1146; SSSE3-NEXT: movdqa %xmm3, %xmm4 1147; SSSE3-NEXT: pshufb %xmm2, %xmm4 1148; SSSE3-NEXT: psrlw $4, %xmm0 1149; SSSE3-NEXT: pand %xmm1, %xmm0 1150; SSSE3-NEXT: pshufb %xmm0, %xmm3 1151; SSSE3-NEXT: paddb %xmm4, %xmm3 1152; SSSE3-NEXT: movdqa %xmm3, %xmm0 1153; SSSE3-NEXT: psllw $8, %xmm0 1154; SSSE3-NEXT: paddb %xmm3, %xmm0 1155; SSSE3-NEXT: psrlw $8, %xmm0 1156; SSSE3-NEXT: retq 1157; 1158; SSE41-LABEL: testv8i16u: 1159; SSE41: # %bb.0: 1160; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 1161; SSE41-NEXT: paddw %xmm0, %xmm1 1162; SSE41-NEXT: pandn %xmm1, %xmm0 1163; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1164; SSE41-NEXT: movdqa %xmm0, %xmm2 1165; SSE41-NEXT: pand %xmm1, %xmm2 1166; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 1167; SSE41-NEXT: movdqa %xmm3, %xmm4 1168; SSE41-NEXT: pshufb %xmm2, %xmm4 1169; SSE41-NEXT: psrlw $4, %xmm0 1170; SSE41-NEXT: pand %xmm1, %xmm0 1171; SSE41-NEXT: pshufb %xmm0, %xmm3 1172; SSE41-NEXT: paddb %xmm4, %xmm3 1173; SSE41-NEXT: movdqa %xmm3, %xmm0 1174; SSE41-NEXT: psllw $8, %xmm0 1175; SSE41-NEXT: paddb %xmm3, %xmm0 
1176; SSE41-NEXT: psrlw $8, %xmm0 1177; SSE41-NEXT: retq 1178; 1179; AVX-LABEL: testv8i16u: 1180; AVX: # %bb.0: 1181; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 1182; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm1 1183; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0 1184; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1185; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 1186; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 1187; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 1188; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 1189; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 1190; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 1191; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 1192; AVX-NEXT: vpsllw $8, %xmm0, %xmm1 1193; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0 1194; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 1195; AVX-NEXT: retq 1196; 1197; AVX512VPOPCNTDQ-LABEL: testv8i16u: 1198; AVX512VPOPCNTDQ: # %bb.0: 1199; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 1200; AVX512VPOPCNTDQ-NEXT: vpaddw %xmm1, %xmm0, %xmm1 1201; AVX512VPOPCNTDQ-NEXT: vpandn %xmm1, %xmm0, %xmm0 1202; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1203; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 1204; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 1205; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1206; AVX512VPOPCNTDQ-NEXT: vzeroupper 1207; AVX512VPOPCNTDQ-NEXT: retq 1208; 1209; AVX512VPOPCNTDQVL-LABEL: testv8i16u: 1210; AVX512VPOPCNTDQVL: # %bb.0: 1211; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 1212; AVX512VPOPCNTDQVL-NEXT: vpaddw %xmm1, %xmm0, %xmm1 1213; AVX512VPOPCNTDQVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 1214; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1215; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 1216; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 1217; 
AVX512VPOPCNTDQVL-NEXT: vzeroupper 1218; AVX512VPOPCNTDQVL-NEXT: retq 1219; 1220; BITALG_NOVLX-LABEL: testv8i16u: 1221; BITALG_NOVLX: # %bb.0: 1222; BITALG_NOVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 1223; BITALG_NOVLX-NEXT: vpaddw %xmm1, %xmm0, %xmm1 1224; BITALG_NOVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 1225; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 1226; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1227; BITALG_NOVLX-NEXT: vzeroupper 1228; BITALG_NOVLX-NEXT: retq 1229; 1230; BITALG-LABEL: testv8i16u: 1231; BITALG: # %bb.0: 1232; BITALG-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 1233; BITALG-NEXT: vpaddw %xmm1, %xmm0, %xmm1 1234; BITALG-NEXT: vpandn %xmm1, %xmm0, %xmm0 1235; BITALG-NEXT: vpopcntw %xmm0, %xmm0 1236; BITALG-NEXT: retq 1237; 1238; X32-SSE-LABEL: testv8i16u: 1239; X32-SSE: # %bb.0: 1240; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1 1241; X32-SSE-NEXT: paddw %xmm0, %xmm1 1242; X32-SSE-NEXT: pandn %xmm1, %xmm0 1243; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1244; X32-SSE-NEXT: movdqa %xmm0, %xmm2 1245; X32-SSE-NEXT: pand %xmm1, %xmm2 1246; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 1247; X32-SSE-NEXT: movdqa %xmm3, %xmm4 1248; X32-SSE-NEXT: pshufb %xmm2, %xmm4 1249; X32-SSE-NEXT: psrlw $4, %xmm0 1250; X32-SSE-NEXT: pand %xmm1, %xmm0 1251; X32-SSE-NEXT: pshufb %xmm0, %xmm3 1252; X32-SSE-NEXT: paddb %xmm4, %xmm3 1253; X32-SSE-NEXT: movdqa %xmm3, %xmm0 1254; X32-SSE-NEXT: psllw $8, %xmm0 1255; X32-SSE-NEXT: paddb %xmm3, %xmm0 1256; X32-SSE-NEXT: psrlw $8, %xmm0 1257; X32-SSE-NEXT: retl 1258 %out = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %in, i1 -1) 1259 ret <8 x i16> %out 1260} 1261 1262define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { 1263; SSE2-LABEL: testv16i8: 1264; SSE2: # %bb.0: 1265; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 1266; SSE2-NEXT: paddb %xmm0, %xmm1 1267; SSE2-NEXT: pandn %xmm1, %xmm0 1268; SSE2-NEXT: movdqa %xmm0, %xmm1 1269; SSE2-NEXT: psrlw $1, %xmm1 1270; SSE2-NEXT: pand 
{{.*}}(%rip), %xmm1 1271; SSE2-NEXT: psubb %xmm1, %xmm0 1272; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1273; SSE2-NEXT: movdqa %xmm0, %xmm2 1274; SSE2-NEXT: pand %xmm1, %xmm2 1275; SSE2-NEXT: psrlw $2, %xmm0 1276; SSE2-NEXT: pand %xmm1, %xmm0 1277; SSE2-NEXT: paddb %xmm2, %xmm0 1278; SSE2-NEXT: movdqa %xmm0, %xmm1 1279; SSE2-NEXT: psrlw $4, %xmm1 1280; SSE2-NEXT: paddb %xmm0, %xmm1 1281; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 1282; SSE2-NEXT: movdqa %xmm1, %xmm0 1283; SSE2-NEXT: retq 1284; 1285; SSE3-LABEL: testv16i8: 1286; SSE3: # %bb.0: 1287; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 1288; SSE3-NEXT: paddb %xmm0, %xmm1 1289; SSE3-NEXT: pandn %xmm1, %xmm0 1290; SSE3-NEXT: movdqa %xmm0, %xmm1 1291; SSE3-NEXT: psrlw $1, %xmm1 1292; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 1293; SSE3-NEXT: psubb %xmm1, %xmm0 1294; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1295; SSE3-NEXT: movdqa %xmm0, %xmm2 1296; SSE3-NEXT: pand %xmm1, %xmm2 1297; SSE3-NEXT: psrlw $2, %xmm0 1298; SSE3-NEXT: pand %xmm1, %xmm0 1299; SSE3-NEXT: paddb %xmm2, %xmm0 1300; SSE3-NEXT: movdqa %xmm0, %xmm1 1301; SSE3-NEXT: psrlw $4, %xmm1 1302; SSE3-NEXT: paddb %xmm0, %xmm1 1303; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 1304; SSE3-NEXT: movdqa %xmm1, %xmm0 1305; SSE3-NEXT: retq 1306; 1307; SSSE3-LABEL: testv16i8: 1308; SSSE3: # %bb.0: 1309; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 1310; SSSE3-NEXT: paddb %xmm0, %xmm1 1311; SSSE3-NEXT: pandn %xmm1, %xmm0 1312; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1313; SSSE3-NEXT: movdqa %xmm0, %xmm3 1314; SSSE3-NEXT: pand %xmm2, %xmm3 1315; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 1316; SSSE3-NEXT: movdqa %xmm1, %xmm4 1317; SSSE3-NEXT: pshufb %xmm3, %xmm4 1318; SSSE3-NEXT: psrlw $4, %xmm0 1319; SSSE3-NEXT: pand %xmm2, %xmm0 1320; SSSE3-NEXT: pshufb %xmm0, %xmm1 1321; SSSE3-NEXT: paddb %xmm4, %xmm1 1322; SSSE3-NEXT: movdqa %xmm1, %xmm0 1323; 
SSSE3-NEXT: retq 1324; 1325; SSE41-LABEL: testv16i8: 1326; SSE41: # %bb.0: 1327; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 1328; SSE41-NEXT: paddb %xmm0, %xmm1 1329; SSE41-NEXT: pandn %xmm1, %xmm0 1330; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1331; SSE41-NEXT: movdqa %xmm0, %xmm3 1332; SSE41-NEXT: pand %xmm2, %xmm3 1333; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 1334; SSE41-NEXT: movdqa %xmm1, %xmm4 1335; SSE41-NEXT: pshufb %xmm3, %xmm4 1336; SSE41-NEXT: psrlw $4, %xmm0 1337; SSE41-NEXT: pand %xmm2, %xmm0 1338; SSE41-NEXT: pshufb %xmm0, %xmm1 1339; SSE41-NEXT: paddb %xmm4, %xmm1 1340; SSE41-NEXT: movdqa %xmm1, %xmm0 1341; SSE41-NEXT: retq 1342; 1343; AVX-LABEL: testv16i8: 1344; AVX: # %bb.0: 1345; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 1346; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm1 1347; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0 1348; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1349; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 1350; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 1351; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 1352; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 1353; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 1354; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 1355; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 1356; AVX-NEXT: retq 1357; 1358; AVX512VPOPCNTDQ-LABEL: testv16i8: 1359; AVX512VPOPCNTDQ: # %bb.0: 1360; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 1361; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm1, %xmm0, %xmm1 1362; AVX512VPOPCNTDQ-NEXT: vpandn %xmm1, %xmm0, %xmm0 1363; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 1364; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 1365; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0 1366; AVX512VPOPCNTDQ-NEXT: vzeroupper 1367; AVX512VPOPCNTDQ-NEXT: retq 1368; 1369; AVX512VPOPCNTDQVL-LABEL: testv16i8: 1370; AVX512VPOPCNTDQVL: # %bb.0: 1371; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 1372; AVX512VPOPCNTDQVL-NEXT: vpaddb %xmm1, %xmm0, %xmm1 1373; AVX512VPOPCNTDQVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 1374; AVX512VPOPCNTDQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 1375; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 1376; AVX512VPOPCNTDQVL-NEXT: vpmovdb %zmm0, %xmm0 1377; AVX512VPOPCNTDQVL-NEXT: vzeroupper 1378; AVX512VPOPCNTDQVL-NEXT: retq 1379; 1380; BITALG_NOVLX-LABEL: testv16i8: 1381; BITALG_NOVLX: # %bb.0: 1382; BITALG_NOVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 1383; BITALG_NOVLX-NEXT: vpaddb %xmm1, %xmm0, %xmm1 1384; BITALG_NOVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 1385; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 1386; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1387; BITALG_NOVLX-NEXT: vzeroupper 1388; BITALG_NOVLX-NEXT: retq 1389; 1390; BITALG-LABEL: testv16i8: 1391; BITALG: # %bb.0: 1392; BITALG-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 1393; BITALG-NEXT: vpaddb %xmm1, %xmm0, %xmm1 1394; BITALG-NEXT: 
vpandn %xmm1, %xmm0, %xmm0 1395; BITALG-NEXT: vpopcntb %xmm0, %xmm0 1396; BITALG-NEXT: retq 1397; 1398; X32-SSE-LABEL: testv16i8: 1399; X32-SSE: # %bb.0: 1400; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1 1401; X32-SSE-NEXT: paddb %xmm0, %xmm1 1402; X32-SSE-NEXT: pandn %xmm1, %xmm0 1403; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1404; X32-SSE-NEXT: movdqa %xmm0, %xmm3 1405; X32-SSE-NEXT: pand %xmm2, %xmm3 1406; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 1407; X32-SSE-NEXT: movdqa %xmm1, %xmm4 1408; X32-SSE-NEXT: pshufb %xmm3, %xmm4 1409; X32-SSE-NEXT: psrlw $4, %xmm0 1410; X32-SSE-NEXT: pand %xmm2, %xmm0 1411; X32-SSE-NEXT: pshufb %xmm0, %xmm1 1412; X32-SSE-NEXT: paddb %xmm4, %xmm1 1413; X32-SSE-NEXT: movdqa %xmm1, %xmm0 1414; X32-SSE-NEXT: retl 1415 %out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %in, i1 0) 1416 ret <16 x i8> %out 1417} 1418 1419define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { 1420; SSE2-LABEL: testv16i8u: 1421; SSE2: # %bb.0: 1422; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 1423; SSE2-NEXT: paddb %xmm0, %xmm1 1424; SSE2-NEXT: pandn %xmm1, %xmm0 1425; SSE2-NEXT: movdqa %xmm0, %xmm1 1426; SSE2-NEXT: psrlw $1, %xmm1 1427; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 1428; SSE2-NEXT: psubb %xmm1, %xmm0 1429; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1430; SSE2-NEXT: movdqa %xmm0, %xmm2 1431; SSE2-NEXT: pand %xmm1, %xmm2 1432; SSE2-NEXT: psrlw $2, %xmm0 1433; SSE2-NEXT: pand %xmm1, %xmm0 1434; SSE2-NEXT: paddb %xmm2, %xmm0 1435; SSE2-NEXT: movdqa %xmm0, %xmm1 1436; SSE2-NEXT: psrlw $4, %xmm1 1437; SSE2-NEXT: paddb %xmm0, %xmm1 1438; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 1439; SSE2-NEXT: movdqa %xmm1, %xmm0 1440; SSE2-NEXT: retq 1441; 1442; SSE3-LABEL: testv16i8u: 1443; SSE3: # %bb.0: 1444; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 1445; SSE3-NEXT: paddb %xmm0, %xmm1 1446; SSE3-NEXT: pandn %xmm1, %xmm0 1447; SSE3-NEXT: movdqa %xmm0, %xmm1 1448; SSE3-NEXT: psrlw $1, %xmm1 
1449; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 1450; SSE3-NEXT: psubb %xmm1, %xmm0 1451; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1452; SSE3-NEXT: movdqa %xmm0, %xmm2 1453; SSE3-NEXT: pand %xmm1, %xmm2 1454; SSE3-NEXT: psrlw $2, %xmm0 1455; SSE3-NEXT: pand %xmm1, %xmm0 1456; SSE3-NEXT: paddb %xmm2, %xmm0 1457; SSE3-NEXT: movdqa %xmm0, %xmm1 1458; SSE3-NEXT: psrlw $4, %xmm1 1459; SSE3-NEXT: paddb %xmm0, %xmm1 1460; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 1461; SSE3-NEXT: movdqa %xmm1, %xmm0 1462; SSE3-NEXT: retq 1463; 1464; SSSE3-LABEL: testv16i8u: 1465; SSSE3: # %bb.0: 1466; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 1467; SSSE3-NEXT: paddb %xmm0, %xmm1 1468; SSSE3-NEXT: pandn %xmm1, %xmm0 1469; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1470; SSSE3-NEXT: movdqa %xmm0, %xmm3 1471; SSSE3-NEXT: pand %xmm2, %xmm3 1472; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 1473; SSSE3-NEXT: movdqa %xmm1, %xmm4 1474; SSSE3-NEXT: pshufb %xmm3, %xmm4 1475; SSSE3-NEXT: psrlw $4, %xmm0 1476; SSSE3-NEXT: pand %xmm2, %xmm0 1477; SSSE3-NEXT: pshufb %xmm0, %xmm1 1478; SSSE3-NEXT: paddb %xmm4, %xmm1 1479; SSSE3-NEXT: movdqa %xmm1, %xmm0 1480; SSSE3-NEXT: retq 1481; 1482; SSE41-LABEL: testv16i8u: 1483; SSE41: # %bb.0: 1484; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 1485; SSE41-NEXT: paddb %xmm0, %xmm1 1486; SSE41-NEXT: pandn %xmm1, %xmm0 1487; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1488; SSE41-NEXT: movdqa %xmm0, %xmm3 1489; SSE41-NEXT: pand %xmm2, %xmm3 1490; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 1491; SSE41-NEXT: movdqa %xmm1, %xmm4 1492; SSE41-NEXT: pshufb %xmm3, %xmm4 1493; SSE41-NEXT: psrlw $4, %xmm0 1494; SSE41-NEXT: pand %xmm2, %xmm0 1495; SSE41-NEXT: pshufb %xmm0, %xmm1 1496; SSE41-NEXT: paddb %xmm4, %xmm1 1497; SSE41-NEXT: movdqa %xmm1, %xmm0 1498; SSE41-NEXT: retq 1499; 1500; AVX-LABEL: testv16i8u: 1501; AVX: # %bb.0: 1502; 
AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 1503; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm1 1504; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0 1505; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1506; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 1507; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 1508; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 1509; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 1510; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 1511; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 1512; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 1513; AVX-NEXT: retq 1514; 1515; AVX512VPOPCNTDQ-LABEL: testv16i8u: 1516; AVX512VPOPCNTDQ: # %bb.0: 1517; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 1518; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm1, %xmm0, %xmm1 1519; AVX512VPOPCNTDQ-NEXT: vpandn %xmm1, %xmm0, %xmm0 1520; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 1521; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 1522; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0 1523; AVX512VPOPCNTDQ-NEXT: vzeroupper 1524; AVX512VPOPCNTDQ-NEXT: retq 1525; 1526; AVX512VPOPCNTDQVL-LABEL: testv16i8u: 1527; AVX512VPOPCNTDQVL: # %bb.0: 1528; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 1529; AVX512VPOPCNTDQVL-NEXT: vpaddb %xmm1, %xmm0, %xmm1 1530; AVX512VPOPCNTDQVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 1531; AVX512VPOPCNTDQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 1532; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 1533; AVX512VPOPCNTDQVL-NEXT: vpmovdb %zmm0, %xmm0 1534; AVX512VPOPCNTDQVL-NEXT: vzeroupper 1535; AVX512VPOPCNTDQVL-NEXT: retq 1536; 1537; BITALG_NOVLX-LABEL: testv16i8u: 1538; BITALG_NOVLX: # %bb.0: 1539; BITALG_NOVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 1540; BITALG_NOVLX-NEXT: vpaddb %xmm1, %xmm0, %xmm1 1541; BITALG_NOVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 1542; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 1543; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1544; BITALG_NOVLX-NEXT: vzeroupper 1545; BITALG_NOVLX-NEXT: retq 1546; 1547; BITALG-LABEL: testv16i8u: 1548; BITALG: # %bb.0: 1549; BITALG-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 1550; BITALG-NEXT: vpaddb %xmm1, %xmm0, %xmm1 1551; BITALG-NEXT: vpandn %xmm1, %xmm0, %xmm0 1552; BITALG-NEXT: vpopcntb %xmm0, %xmm0 1553; BITALG-NEXT: retq 1554; 1555; X32-SSE-LABEL: testv16i8u: 1556; X32-SSE: # %bb.0: 1557; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1 1558; X32-SSE-NEXT: paddb %xmm0, %xmm1 1559; X32-SSE-NEXT: pandn %xmm1, %xmm0 1560; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1561; X32-SSE-NEXT: movdqa %xmm0, %xmm3 1562; X32-SSE-NEXT: pand %xmm2, %xmm3 1563; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 1564; X32-SSE-NEXT: movdqa %xmm1, %xmm4 1565; X32-SSE-NEXT: pshufb %xmm3, %xmm4 1566; X32-SSE-NEXT: psrlw $4, %xmm0 1567; X32-SSE-NEXT: pand %xmm2, %xmm0 1568; X32-SSE-NEXT: pshufb %xmm0, %xmm1 1569; X32-SSE-NEXT: paddb %xmm4, %xmm1 1570; X32-SSE-NEXT: movdqa %xmm1, %xmm0 1571; X32-SSE-NEXT: retl 1572 %out = call <16 x i8> 
@llvm.cttz.v16i8(<16 x i8> %in, i1 -1)
  ret <16 x i8> %out
}

; Constant-folding test: cttz of the constant vector <256, -1> with the second
; intrinsic operand (the is-zero-undefined/poison flag) set to i1 0. Each check
; prefix expects nothing but a move of a constant vector and the return.
define <2 x i64> @foldv2i64() nounwind {
; SSE-LABEL: foldv2i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,0,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: foldv2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,0,0]
; AVX-NEXT:    retq
;
; AVX512VPOPCNTDQ-LABEL: foldv2i64:
; AVX512VPOPCNTDQ:       # %bb.0:
; AVX512VPOPCNTDQ-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,0,0]
; AVX512VPOPCNTDQ-NEXT:    retq
;
; AVX512VPOPCNTDQVL-LABEL: foldv2i64:
; AVX512VPOPCNTDQVL:       # %bb.0:
; AVX512VPOPCNTDQVL-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,0,0]
; AVX512VPOPCNTDQVL-NEXT:    retq
;
; BITALG_NOVLX-LABEL: foldv2i64:
; BITALG_NOVLX:       # %bb.0:
; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,0,0]
; BITALG_NOVLX-NEXT:    retq
;
; BITALG-LABEL: foldv2i64:
; BITALG:       # %bb.0:
; BITALG-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,0,0]
; BITALG-NEXT:    retq
;
; X32-SSE-LABEL: foldv2i64:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,0,0]
; X32-SSE-NEXT:    retl
  %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 0)
  ret <2 x i64> %out
}

; Same constant input as foldv2i64, but with the flag operand set to i1 -1.
; The expected folded constant is identical.
define <2 x i64> @foldv2i64u() nounwind {
; SSE-LABEL: foldv2i64u:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,0,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: foldv2i64u:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,0,0]
; AVX-NEXT:    retq
;
; AVX512VPOPCNTDQ-LABEL: foldv2i64u:
; AVX512VPOPCNTDQ:       # %bb.0:
; AVX512VPOPCNTDQ-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,0,0]
; AVX512VPOPCNTDQ-NEXT:    retq
;
; AVX512VPOPCNTDQVL-LABEL: foldv2i64u:
; AVX512VPOPCNTDQVL:       # %bb.0:
; AVX512VPOPCNTDQVL-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,0,0]
; AVX512VPOPCNTDQVL-NEXT:    retq
;
; BITALG_NOVLX-LABEL: foldv2i64u:
; BITALG_NOVLX:       # %bb.0:
; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,0,0]
; BITALG_NOVLX-NEXT:    retq
;
; BITALG-LABEL: foldv2i64u:
; BITALG:       # %bb.0:
; BITALG-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,0,0]
; BITALG-NEXT:    retq
;
; X32-SSE-LABEL: foldv2i64u:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,0,0]
; X32-SSE-NEXT:    retl
  %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 -1)
  ret <2 x i64> %out
}

; Constant-folding test for <4 x i32>: input <256, -1, 0, 255>, flag i1 0.
; The zero lane is expected to fold to 32, the element width.
define <4 x i32> @foldv4i32() nounwind {
; SSE-LABEL: foldv4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,32,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: foldv4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,32,0]
; AVX-NEXT:    retq
;
; AVX512VPOPCNTDQ-LABEL: foldv4i32:
; AVX512VPOPCNTDQ:       # %bb.0:
; AVX512VPOPCNTDQ-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,32,0]
; AVX512VPOPCNTDQ-NEXT:    retq
;
; AVX512VPOPCNTDQVL-LABEL: foldv4i32:
; AVX512VPOPCNTDQVL:       # %bb.0:
; AVX512VPOPCNTDQVL-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,32,0]
; AVX512VPOPCNTDQVL-NEXT:    retq
;
; BITALG_NOVLX-LABEL: foldv4i32:
; BITALG_NOVLX:       # %bb.0:
; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,32,0]
; BITALG_NOVLX-NEXT:    retq
;
; BITALG-LABEL: foldv4i32:
; BITALG:       # %bb.0:
; BITALG-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,32,0]
; BITALG-NEXT:    retq
;
; X32-SSE-LABEL: foldv4i32:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,32,0]
; X32-SSE-NEXT:    retl
  %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>, i1 0)
  ret <4 x i32> %out
}

; Same constant input as foldv4i32, but with the flag operand set to i1 -1.
; The checks still expect 32 for the zero lane.
define <4 x i32> @foldv4i32u() nounwind {
; SSE-LABEL: foldv4i32u:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,32,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: foldv4i32u:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,32,0]
; AVX-NEXT:    retq
;
; AVX512VPOPCNTDQ-LABEL: foldv4i32u:
; AVX512VPOPCNTDQ:       # %bb.0:
; AVX512VPOPCNTDQ-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,32,0]
; AVX512VPOPCNTDQ-NEXT:    retq
;
; AVX512VPOPCNTDQVL-LABEL: foldv4i32u:
; AVX512VPOPCNTDQVL:       # %bb.0:
; AVX512VPOPCNTDQVL-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,32,0]
; AVX512VPOPCNTDQVL-NEXT:    retq
;
; BITALG_NOVLX-LABEL: foldv4i32u:
; BITALG_NOVLX:       # %bb.0:
; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,32,0]
; BITALG_NOVLX-NEXT:    retq
;
; BITALG-LABEL: foldv4i32u:
; BITALG:       # %bb.0:
; BITALG-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,32,0]
; BITALG-NEXT:    retq
;
; X32-SSE-LABEL: foldv4i32u:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,32,0]
; X32-SSE-NEXT:    retl
  %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>, i1 -1)
  ret <4 x i32> %out
}

; Constant-folding test for <8 x i16>, flag i1 0.
; NOTE(review): the literal i16 -65536 does not fit in 16 bits; the expected
; value 16 for that lane matches what the explicit zero lane produces, so it
; appears to be treated as 0 - confirm whether an in-range literal was meant.
define <8 x i16> @foldv8i16() nounwind {
; SSE-LABEL: foldv8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: foldv8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; AVX-NEXT:    retq
;
; AVX512VPOPCNTDQ-LABEL: foldv8i16:
; AVX512VPOPCNTDQ:       # %bb.0:
; AVX512VPOPCNTDQ-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; AVX512VPOPCNTDQ-NEXT:    retq
;
; AVX512VPOPCNTDQVL-LABEL: foldv8i16:
; AVX512VPOPCNTDQVL:       # %bb.0:
; AVX512VPOPCNTDQVL-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; AVX512VPOPCNTDQVL-NEXT:    retq
;
; BITALG_NOVLX-LABEL: foldv8i16:
; BITALG_NOVLX:       # %bb.0:
; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; BITALG_NOVLX-NEXT:    retq
;
; BITALG-LABEL: foldv8i16:
; BITALG:       # %bb.0:
; BITALG-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; BITALG-NEXT:    retq
;
; X32-SSE-LABEL: foldv8i16:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; X32-SSE-NEXT:    retl
  %out = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>, i1 0)
  ret <8 x i16> %out
}

; Same constant input as foldv8i16, but with the flag operand set to i1 -1.
; The expected folded constant is identical.
define <8 x i16> @foldv8i16u() nounwind {
; SSE-LABEL: foldv8i16u:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: foldv8i16u:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; AVX-NEXT:    retq
;
; AVX512VPOPCNTDQ-LABEL: foldv8i16u:
; AVX512VPOPCNTDQ:       # %bb.0:
; AVX512VPOPCNTDQ-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; AVX512VPOPCNTDQ-NEXT:    retq
;
; AVX512VPOPCNTDQVL-LABEL: foldv8i16u:
; AVX512VPOPCNTDQVL:       # %bb.0:
; AVX512VPOPCNTDQVL-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; AVX512VPOPCNTDQVL-NEXT:    retq
;
; BITALG_NOVLX-LABEL: foldv8i16u:
; BITALG_NOVLX:       # %bb.0:
; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; BITALG_NOVLX-NEXT:    retq
;
; BITALG-LABEL: foldv8i16u:
; BITALG:       # %bb.0:
; BITALG-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; BITALG-NEXT:    retq
;
; X32-SSE-LABEL: foldv8i16u:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; X32-SSE-NEXT:    retl
  %out = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>, i1 -1)
  ret <8 x i16> %out
}

; Constant-folding test for <16 x i8>, flag i1 0.
; NOTE(review): the literals i8 256 and i8 -65536 do not fit in 8 bits; the
; expected value 8 for those lanes matches what the explicit zero lane
; produces, so they appear to be treated as 0 - confirm whether in-range
; literals were meant.
define <16 x i8> @foldv16i8() nounwind {
; SSE-LABEL: foldv16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; SSE-NEXT:    retq
;
; AVX-LABEL: foldv16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; AVX-NEXT:    retq
;
; AVX512VPOPCNTDQ-LABEL: foldv16i8:
; AVX512VPOPCNTDQ:       # %bb.0:
; AVX512VPOPCNTDQ-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; AVX512VPOPCNTDQ-NEXT:    retq
;
; AVX512VPOPCNTDQVL-LABEL: foldv16i8:
; AVX512VPOPCNTDQVL:       # %bb.0:
; AVX512VPOPCNTDQVL-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; AVX512VPOPCNTDQVL-NEXT:    retq
;
; BITALG_NOVLX-LABEL: foldv16i8:
; BITALG_NOVLX:       # %bb.0:
; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; BITALG_NOVLX-NEXT:    retq
;
; BITALG-LABEL: foldv16i8:
; BITALG:       # %bb.0:
; BITALG-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; BITALG-NEXT:    retq
;
; X32-SSE-LABEL: foldv16i8:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; X32-SSE-NEXT:    retl
  %out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>, i1 0)
  ret <16 x i8> %out
}

; Same constant input as foldv16i8, but with the flag operand set to i1 -1.
; The expected folded constant is identical.
define <16 x i8> @foldv16i8u() nounwind {
; SSE-LABEL: foldv16i8u:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; SSE-NEXT:    retq
;
; AVX-LABEL: foldv16i8u:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; AVX-NEXT:    retq
;
; AVX512VPOPCNTDQ-LABEL: foldv16i8u:
; AVX512VPOPCNTDQ:       # %bb.0:
; AVX512VPOPCNTDQ-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; AVX512VPOPCNTDQ-NEXT:    retq
;
; AVX512VPOPCNTDQVL-LABEL: foldv16i8u:
; AVX512VPOPCNTDQVL:       # %bb.0:
; AVX512VPOPCNTDQVL-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; AVX512VPOPCNTDQVL-NEXT:    retq
;
; BITALG_NOVLX-LABEL: foldv16i8u:
; BITALG_NOVLX:       # %bb.0:
; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; BITALG_NOVLX-NEXT:    retq
;
; BITALG-LABEL: foldv16i8u:
; BITALG:       # %bb.0:
; BITALG-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; BITALG-NEXT:    retq
;
; X32-SSE-LABEL: foldv16i8u:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; X32-SSE-NEXT:    retl
  %out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>, i1 -1)
  ret <16 x i8> %out
}

; Declarations of the count-trailing-zeros intrinsics called in this file, one
; per 128-bit vector type.
declare <2 x i64> @llvm.cttz.v2i64(<2 x i64>, i1)
declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1)
declare <8 x i16> @llvm.cttz.v8i16(<8 x i16>, i1)
declare <16 x i8> @llvm.cttz.v16i8(<16 x i8>, i1)