; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQ

;
; add
;

define <4 x i32> @trunc_add_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE-LABEL: trunc_add_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: paddq %xmm3, %xmm1
; SSE-NEXT: paddq %xmm2, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_v4i64_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_add_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_add_v4i64_v4i32:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_add_v4i64_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = add <4 x i64> %a0, %a1
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_add_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE-LABEL: trunc_add_v8i64_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: paddq %xmm6, %xmm2
; SSE-NEXT: paddq %xmm7, %xmm3
; SSE-NEXT: paddq %xmm4, %xmm0
; SSE-NEXT: paddq %xmm5, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_v8i64_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_add_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpaddq %ymm3, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_add_v8i64_v8i16:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpaddq %ymm3, %ymm1, %ymm1
; AVX2-FAST-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_add_v8i64_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = add <8 x i64> %a0, %a1
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_add_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-LABEL: trunc_add_v8i32_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: paddd %xmm2, %xmm0
; SSE-NEXT: paddd %xmm3, %xmm1
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_v8i32_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_v8i32_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_v8i32_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = add <8 x i32> %a0, %a1
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
; SSE-LABEL: trunc_add_v16i64_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
; SSE-NEXT: packuswb %xmm7, %xmm6
; SSE-NEXT: pand %xmm8, %xmm5
; SSE-NEXT: pand %xmm8, %xmm4
; SSE-NEXT: packuswb %xmm5, %xmm4
; SSE-NEXT: packuswb %xmm6, %xmm4
; SSE-NEXT: pand %xmm8, %xmm3
; SSE-NEXT: pand %xmm8, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pand %xmm8, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_v16i64_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm8
; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpaddq %xmm6, %xmm2, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vpaddq %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpaddq %xmm7, %xmm3, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vpaddq %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255]
; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3
; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3
; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_add_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpaddq %ymm5, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpaddq %ymm4, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpaddq %ymm7, %ymm3, %ymm3
; AVX2-SLOW-NEXT: vpaddq %ymm6, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm4[0,2],ymm2[4,6],ymm4[4,6]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm5[0,2],ymm0[4,6],ymm5[4,6]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_add_v16i64_v16i8:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpaddq %ymm5, %ymm1, %ymm1
; AVX2-FAST-NEXT: vpaddq %ymm4, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpaddq %ymm7, %ymm3, %ymm3
; AVX2-FAST-NEXT: vpaddq %ymm6, %ymm2, %ymm2
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0
; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_add_v16i64_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpaddq %zmm3, %zmm1, %zmm1
; AVX512-NEXT: vpmovqb %zmm1, %xmm1
; AVX512-NEXT: vpmovqb %zmm0, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = add <16 x i64> %a0, %a1
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_add_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE-LABEL: trunc_add_v16i32_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: paddd %xmm4, %xmm0
; SSE-NEXT: paddd %xmm5, %xmm1
; SSE-NEXT: paddd %xmm6, %xmm2
; SSE-NEXT: paddd %xmm7, %xmm3
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_v16i32_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255]
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2
; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_v16i32_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_v16i32_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = add <16 x i32> %a0, %a1
  %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: trunc_add_v16i16_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: paddw %xmm2, %xmm0
; SSE-NEXT: paddw %xmm3, %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_v16i16_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_v16i16_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_add_v16i16_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_add_v16i16_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_add_v16i16_v16i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
  %1 = add <16 x i16> %a0, %a1
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <8 x i16> @trunc_add_v8i32_v8i16_sext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
; SSE-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
; SSE: # %bb.0:
; SSE-NEXT: pslld $16, %xmm2
; SSE-NEXT: psrad $16, %xmm2
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: packssdw %xmm2, %xmm1
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: psraw $8, %xmm0
; SSE-NEXT: paddw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxbw %xmm0, %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512-NEXT: vpmovdw %zmm1, %ymm1
; AVX512-NEXT: vpmovsxbw %xmm0, %xmm0
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = sext <8 x i8> %1 to <8 x i32>
  %3 = add <8 x i32> %2, %a1
  %4 = trunc <8 x i32> %3 to <8 x i16>
  ret <8 x i16> %4
}

;
; add to constant
;

define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: paddd {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v4i64_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_add_const_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_add_const_v4i64_v4i32:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_add_const_v4i64_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = add <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v8i64_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE-NEXT: paddw {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v8i64_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_add_const_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_add_const_v8i64_v8i16:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-FAST-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_add_const_v8i64_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = add <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_add_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v8i32_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: paddw {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v8i32_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_const_v8i32_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_const_v8i32_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = add <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v16i64_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
; SSE-NEXT: packuswb %xmm7, %xmm6
; SSE-NEXT: pand %xmm8, %xmm5
; SSE-NEXT: pand %xmm8, %xmm4
; SSE-NEXT: packuswb %xmm5, %xmm4
; SSE-NEXT: packuswb %xmm6, %xmm4
; SSE-NEXT: pand %xmm8, %xmm3
; SSE-NEXT: pand %xmm8, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pand %xmm8, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm4, %xmm0
; SSE-NEXT: paddb {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v16i64_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_add_const_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm4[0,2],ymm2[4,6],ymm4[4,6]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm5[0,2],ymm0[4,6],ymm5[4,6]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_add_const_v16i64_v16i8:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0
; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_add_const_v16i64_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovqb %zmm1, %xmm1
; AVX512-NEXT: vpmovqb %zmm0, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = add <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v16i32_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: paddb {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v16i32_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_const_v16i32_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_const_v16i32_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = add <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v16i16_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: paddb {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v16i16_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_const_v16i16_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_add_const_v16i16_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_add_const_v16i16_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_add_const_v16i16_v16i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
  %1 = add <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

;
; sub
;

define <4 x i32> @trunc_sub_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE-LABEL: trunc_sub_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: psubq %xmm3, %xmm1
; SSE-NEXT: psubq %xmm2, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_v4i64_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_sub_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpsubq %ymm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_sub_v4i64_v4i32:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpsubq %ymm1, %ymm0, %ymm0
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_sub_v4i64_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = sub <4 x i64> %a0, %a1
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_sub_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE-LABEL: trunc_sub_v8i64_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: psubq %xmm6, %xmm2
; SSE-NEXT: psubq %xmm7, %xmm3
; SSE-NEXT: psubq %xmm4, %xmm0
; SSE-NEXT: psubq %xmm5, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_v8i64_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_sub_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpsubq %ymm3, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_sub_v8i64_v8i16:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpsubq %ymm3, %ymm1, %ymm1
; AVX2-FAST-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_sub_v8i64_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = sub <8 x i64> %a0, %a1
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_sub_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-LABEL: trunc_sub_v8i32_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: psubd %xmm2, %xmm0
; SSE-NEXT: psubd %xmm3, %xmm1
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_v8i32_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_v8i32_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_sub_v8i32_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = sub <8 x i32> %a0, %a1
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
; SSE-LABEL: trunc_sub_v16i64_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
; SSE-NEXT: packuswb %xmm7, %xmm6
; SSE-NEXT: pand %xmm8, %xmm5
; SSE-NEXT: pand %xmm8, %xmm4
; SSE-NEXT: packuswb %xmm5, %xmm4
; SSE-NEXT: packuswb %xmm6, %xmm4
; SSE-NEXT: pand %xmm8, %xmm3
; SSE-NEXT: pand %xmm8, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pand %xmm8, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_v16i64_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm8
; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpsubq %xmm5, %xmm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpsubq %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpsubq %xmm7, %xmm3, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vpsubq %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255]
; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3
; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3
; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_sub_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpsubq %ymm5, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpsubq %ymm4, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpsubq %ymm7, %ymm3, %ymm3
; AVX2-SLOW-NEXT: vpsubq %ymm6, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm4[0,2],ymm2[4,6],ymm4[4,6]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm5[0,2],ymm0[4,6],ymm5[4,6]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_sub_v16i64_v16i8:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpsubq %ymm5, %ymm1, %ymm1
; AVX2-FAST-NEXT: vpsubq %ymm4, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpsubq %ymm7, %ymm3, %ymm3
; AVX2-FAST-NEXT: vpsubq %ymm6, %ymm2, %ymm2
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0
; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_sub_v16i64_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsubq %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpsubq %zmm3, %zmm1, %zmm1
; AVX512-NEXT: vpmovqb %zmm1, %xmm1
; AVX512-NEXT: vpmovqb %zmm0, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = sub <16 x i64> %a0, %a1
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_sub_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE-LABEL: trunc_sub_v16i32_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: psubd %xmm4, %xmm0
; SSE-NEXT: psubd %xmm5, %xmm1
; SSE-NEXT: psubd %xmm6, %xmm2
; SSE-NEXT: psubd %xmm7, %xmm3
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_v16i32_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255]
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2
; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_v16i32_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_sub_v16i32_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = sub <16 x i32> %a0, %a1
  %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: trunc_sub_v16i16_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: psubw %xmm2, %xmm0
; SSE-NEXT: psubw %xmm3, %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_v16i16_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_v16i16_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_sub_v16i16_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_sub_v16i16_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_sub_v16i16_v16i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
  %1 = sub <16 x i16> %a0, %a1
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_ext_sub_v16i16_v16i8(<16 x i8> %x, <16 x i8> %y) {
; SSE-LABEL: trunc_ext_sub_v16i16_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: psubb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: trunc_ext_sub_v16i16_v16i8:
; AVX: # %bb.0:
; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = zext <16 x i8> %x to <16 x i16>
  %b = zext <16 x i8> %y to <16 x i16>
  %c = sub <16 x i16> %a, %b
  %d = trunc <16 x i16> %c to <16 x i8>
  ret <16 x i8> %d
}

;
; sub to constant
;

define <4 x i32> @trunc_sub_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: psubd {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_const_v4i64_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_sub_const_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_sub_const_v4i64_v4i32:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_sub_const_v4i64_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = sub <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v8i64_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE-NEXT: psubw {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_const_v8i64_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_sub_const_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_sub_const_v8i64_v8i16:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-FAST-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_sub_const_v8i64_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = sub <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_sub_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v8i32_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: psubw {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_const_v8i32_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_const_v8i32_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_sub_const_v8i32_v8i16:
; AVX512: # %bb.0:
%bb.0: 1446; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1447; AVX512-NEXT: vpmovdw %zmm0, %ymm0 1448; AVX512-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 1449; AVX512-NEXT: vzeroupper 1450; AVX512-NEXT: retq 1451 %1 = sub <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1452 %2 = trunc <8 x i32> %1 to <8 x i16> 1453 ret <8 x i16> %2 1454} 1455 1456define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind { 1457; SSE-LABEL: trunc_sub_const_v16i64_v16i8: 1458; SSE: # %bb.0: 1459; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 1460; SSE-NEXT: pand %xmm8, %xmm7 1461; SSE-NEXT: pand %xmm8, %xmm6 1462; SSE-NEXT: packuswb %xmm7, %xmm6 1463; SSE-NEXT: pand %xmm8, %xmm5 1464; SSE-NEXT: pand %xmm8, %xmm4 1465; SSE-NEXT: packuswb %xmm5, %xmm4 1466; SSE-NEXT: packuswb %xmm6, %xmm4 1467; SSE-NEXT: pand %xmm8, %xmm3 1468; SSE-NEXT: pand %xmm8, %xmm2 1469; SSE-NEXT: packuswb %xmm3, %xmm2 1470; SSE-NEXT: pand %xmm8, %xmm1 1471; SSE-NEXT: pand %xmm8, %xmm0 1472; SSE-NEXT: packuswb %xmm1, %xmm0 1473; SSE-NEXT: packuswb %xmm2, %xmm0 1474; SSE-NEXT: packuswb %xmm4, %xmm0 1475; SSE-NEXT: psubb {{.*}}(%rip), %xmm0 1476; SSE-NEXT: retq 1477; 1478; AVX1-LABEL: trunc_sub_const_v16i64_v16i8: 1479; AVX1: # %bb.0: 1480; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] 1481; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 1482; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 1483; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 1484; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 1485; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 1486; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 1487; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 1488; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 1489; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 1490; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 1491; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 1492; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 1493; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 1494; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 1495; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 1496; AVX1-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 1497; AVX1-NEXT: vzeroupper 1498; AVX1-NEXT: retq 1499; 1500; AVX2-SLOW-LABEL: trunc_sub_const_v16i64_v16i8: 1501; AVX2-SLOW: # %bb.0: 1502; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3] 1503; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 1504; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm4[0,2],ymm2[4,6],ymm4[4,6] 1505; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 1506; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 1507; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 1508; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] 1509; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 1510; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3] 1511; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1512; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm5[0,2],ymm0[4,6],ymm5[4,6] 1513; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 1514; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 1515; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 1516; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 1517; AVX2-SLOW-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 1518; AVX2-SLOW-NEXT: vzeroupper 1519; AVX2-SLOW-NEXT: retq 1520; 1521; AVX2-FAST-LABEL: trunc_sub_const_v16i64_v16i8: 1522; AVX2-FAST: # %bb.0: 1523; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] 1524; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 1525; AVX2-FAST-NEXT: 
vpermd %ymm3, %ymm4, %ymm3 1526; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 1527; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 1528; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 1529; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 1530; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] 1531; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 1532; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 1533; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 1534; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1535; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 1536; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 1537; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 1538; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 1539; AVX2-FAST-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 1540; AVX2-FAST-NEXT: vzeroupper 1541; AVX2-FAST-NEXT: retq 1542; 1543; AVX512-LABEL: trunc_sub_const_v16i64_v16i8: 1544; AVX512: # %bb.0: 1545; AVX512-NEXT: vpmovqb %zmm1, %xmm1 1546; AVX512-NEXT: vpmovqb %zmm0, %xmm0 1547; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1548; AVX512-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 1549; AVX512-NEXT: vzeroupper 1550; AVX512-NEXT: retq 1551 %1 = sub <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> 1552 %2 = trunc <16 x i64> %1 to <16 x i8> 1553 ret <16 x i8> %2 1554} 1555 1556define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind { 1557; SSE-LABEL: trunc_sub_const_v16i32_v16i8: 1558; SSE: # %bb.0: 1559; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 1560; SSE-NEXT: pand %xmm4, %xmm3 1561; SSE-NEXT: pand %xmm4, %xmm2 1562; SSE-NEXT: packuswb %xmm3, %xmm2 1563; SSE-NEXT: pand %xmm4, %xmm1 1564; SSE-NEXT: pand %xmm4, %xmm0 1565; SSE-NEXT: packuswb %xmm1, %xmm0 1566; SSE-NEXT: packuswb %xmm2, %xmm0 1567; SSE-NEXT: psubb {{.*}}(%rip), %xmm0 1568; SSE-NEXT: retq 1569; 1570; AVX1-LABEL: trunc_sub_const_v16i32_v16i8: 1571; AVX1: # %bb.0: 1572; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] 1573; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 1574; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 1575; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 1576; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 1577; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1578; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 1579; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1580; AVX1-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 1581; AVX1-NEXT: vzeroupper 1582; AVX1-NEXT: retq 1583; 1584; AVX2-LABEL: trunc_sub_const_v16i32_v16i8: 1585; AVX2: # %bb.0: 1586; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 1587; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 1588; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 1589; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] 1590; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 1591; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 1592; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 1593; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 1594; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1595; AVX2-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 1596; AVX2-NEXT: vzeroupper 1597; AVX2-NEXT: retq 1598; 1599; AVX512-LABEL: trunc_sub_const_v16i32_v16i8: 1600; AVX512: # %bb.0: 1601; AVX512-NEXT: vpmovdb %zmm0, %xmm0 1602; AVX512-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 1603; AVX512-NEXT: vzeroupper 1604; AVX512-NEXT: retq 1605 %1 = 
sub <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1606 %2 = trunc <16 x i32> %1 to <16 x i8> 1607 ret <16 x i8> %2 1608} 1609 1610define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind { 1611; SSE-LABEL: trunc_sub_const_v16i16_v16i8: 1612; SSE: # %bb.0: 1613; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 1614; SSE-NEXT: pand %xmm2, %xmm1 1615; SSE-NEXT: pand %xmm2, %xmm0 1616; SSE-NEXT: packuswb %xmm1, %xmm0 1617; SSE-NEXT: psubb {{.*}}(%rip), %xmm0 1618; SSE-NEXT: retq 1619; 1620; AVX1-LABEL: trunc_sub_const_v16i16_v16i8: 1621; AVX1: # %bb.0: 1622; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 1623; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1624; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1625; AVX1-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 1626; AVX1-NEXT: vzeroupper 1627; AVX1-NEXT: retq 1628; 1629; AVX2-LABEL: trunc_sub_const_v16i16_v16i8: 1630; AVX2: # %bb.0: 1631; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 1632; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1633; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1634; AVX2-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 1635; AVX2-NEXT: vzeroupper 1636; AVX2-NEXT: retq 1637; 1638; AVX512F-LABEL: trunc_sub_const_v16i16_v16i8: 1639; AVX512F: # %bb.0: 1640; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 1641; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 1642; AVX512F-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 1643; AVX512F-NEXT: vzeroupper 1644; AVX512F-NEXT: retq 1645; 1646; AVX512BW-LABEL: trunc_sub_const_v16i16_v16i8: 1647; AVX512BW: # %bb.0: 1648; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1649; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1650; AVX512BW-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 1651; AVX512BW-NEXT: vzeroupper 1652; AVX512BW-NEXT: retq 1653; 1654; AVX512DQ-LABEL: trunc_sub_const_v16i16_v16i8: 1655; AVX512DQ: # %bb.0: 1656; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 1657; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 1658; AVX512DQ-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 1659; AVX512DQ-NEXT: vzeroupper 1660; AVX512DQ-NEXT: retq 1661 %1 = sub <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> 1662 %2 = trunc <16 x i16> %1 to <16 x i8> 1663 ret <16 x i8> %2 1664} 1665 1666define <16 x i8> @trunc_ext_sub_const_rhs_v16i16_v16i8(<16 x i8> %x) { 1667; SSE-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8: 1668; SSE: # %bb.0: 1669; SSE-NEXT: psubb {{.*}}(%rip), %xmm0 1670; SSE-NEXT: retq 1671; 1672; AVX-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8: 1673; AVX: # %bb.0: 1674; AVX-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 1675; AVX-NEXT: retq 1676 %a = zext <16 x i8> %x to <16 x i16> 1677 %b = sub <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> 1678 %c = trunc <16 x i16> %b to <16 x i8> 1679 ret <16 x i8> %c 1680} 1681 1682define <16 x i8> @trunc_ext_sub_const_lhs_v16i16_v16i8(<16 x i8> %x) { 1683; SSE-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8: 1684; SSE: # 
%bb.0: 1685; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] 1686; SSE-NEXT: psubb %xmm0, %xmm1 1687; SSE-NEXT: movdqa %xmm1, %xmm0 1688; SSE-NEXT: retq 1689; 1690; AVX-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8: 1691; AVX: # %bb.0: 1692; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] 1693; AVX-NEXT: vpsubb %xmm0, %xmm1, %xmm0 1694; AVX-NEXT: retq 1695 %a = zext <16 x i8> %x to <16 x i16> 1696 %b = sub <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, %a 1697 %c = trunc <16 x i16> %b to <16 x i8> 1698 ret <16 x i8> %c 1699} 1700 1701; 1702; mul 1703; 1704 1705define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { 1706; SSE-LABEL: trunc_mul_v4i64_v4i32: 1707; SSE: # %bb.0: 1708; SSE-NEXT: pmuludq %xmm3, %xmm1 1709; SSE-NEXT: pmuludq %xmm2, %xmm0 1710; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 1711; SSE-NEXT: retq 1712; 1713; AVX1-LABEL: trunc_mul_v4i64_v4i32: 1714; AVX1: # %bb.0: 1715; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1716; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] 1717; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1718; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] 1719; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 1720; AVX1-NEXT: vzeroupper 1721; AVX1-NEXT: retq 1722; 1723; AVX2-SLOW-LABEL: trunc_mul_v4i64_v4i32: 1724; AVX2-SLOW: # %bb.0: 1725; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 1726; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] 1727; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 1728; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] 1729; AVX2-SLOW-NEXT: vpmulld %xmm1, %xmm0, %xmm0 1730; AVX2-SLOW-NEXT: vzeroupper 1731; AVX2-SLOW-NEXT: retq 1732; 1733; AVX2-FAST-LABEL: trunc_mul_v4i64_v4i32: 1734; AVX2-FAST: # %bb.0: 1735; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] 1736; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 1737; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 1738; AVX2-FAST-NEXT: vpmulld %xmm1, %xmm0, %xmm0 1739; AVX2-FAST-NEXT: vzeroupper 1740; AVX2-FAST-NEXT: retq 1741; 1742; AVX512F-LABEL: trunc_mul_v4i64_v4i32: 1743; AVX512F: # %bb.0: 1744; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 1745; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1746; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 1747; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 1748; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0 1749; AVX512F-NEXT: vzeroupper 1750; AVX512F-NEXT: retq 1751; 1752; AVX512BW-LABEL: trunc_mul_v4i64_v4i32: 1753; AVX512BW: # %bb.0: 1754; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 1755; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1756; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 1757; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 1758; AVX512BW-NEXT: vpmulld %xmm1, %xmm0, %xmm0 1759; AVX512BW-NEXT: vzeroupper 1760; AVX512BW-NEXT: retq 1761; 1762; AVX512DQ-LABEL: trunc_mul_v4i64_v4i32: 1763; AVX512DQ: # %bb.0: 1764; AVX512DQ-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 1765; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1766; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 1767; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 1768; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1769; AVX512DQ-NEXT: vzeroupper 1770; AVX512DQ-NEXT: retq 1771 %1 = mul <4 x i64> %a0, %a1 1772 %2 = trunc <4 x i64> %1 to <4 x i32> 1773 ret <4 x i32> %2 1774} 1775 1776define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { 1777; SSE-LABEL: 
trunc_mul_v8i64_v8i16: 1778; SSE: # %bb.0: 1779; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] 1780; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] 1781; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 1782; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] 1783; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] 1784; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,2,2,3] 1785; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,0,2,4,5,6,7] 1786; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] 1787; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,2,4,5,6,7] 1788; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] 1789; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1] 1790; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1791; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 1792; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1793; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] 1794; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 1795; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 1796; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] 1797; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 1798; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 1799; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1800; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] 1801; SSE-NEXT: pmullw %xmm6, %xmm0 1802; SSE-NEXT: retq 1803; 1804; AVX1-LABEL: trunc_mul_v8i64_v8i16: 1805; AVX1: # %bb.0: 1806; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535] 1807; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 1808; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 1809; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 1810; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 1811; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 1812; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 1813; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 1814; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 1815; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 1816; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 1817; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 1818; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 1819; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 1820; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 1821; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 1822; AVX1-NEXT: vzeroupper 1823; AVX1-NEXT: retq 1824; 1825; AVX2-SLOW-LABEL: trunc_mul_v8i64_v8i16: 1826; AVX2-SLOW: # %bb.0: 1827; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3] 1828; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 1829; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm4[0,2],ymm2[4,6],ymm4[4,6] 1830; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 1831; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 1832; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 1833; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm1[2,3] 1834; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1835; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm4[0,2],ymm0[4,6],ymm4[4,6] 1836; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 1837; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 1838; AVX2-SLOW-NEXT: vpmullw %xmm2, %xmm0, %xmm0 1839; AVX2-SLOW-NEXT: vzeroupper 1840; AVX2-SLOW-NEXT: retq 1841; 1842; AVX2-FAST-LABEL: trunc_mul_v8i64_v8i16: 1843; AVX2-FAST: # %bb.0: 1844; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] 1845; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 1846; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 1847; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, 
%ymm2, %ymm2 1848; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 1849; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 1850; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 1851; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 1852; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 1853; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1854; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 1855; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 1856; AVX2-FAST-NEXT: vpmullw %xmm2, %xmm0, %xmm0 1857; AVX2-FAST-NEXT: vzeroupper 1858; AVX2-FAST-NEXT: retq 1859; 1860; AVX512F-LABEL: trunc_mul_v8i64_v8i16: 1861; AVX512F: # %bb.0: 1862; AVX512F-NEXT: vpmovqw %zmm1, %xmm1 1863; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 1864; AVX512F-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1865; AVX512F-NEXT: vzeroupper 1866; AVX512F-NEXT: retq 1867; 1868; AVX512BW-LABEL: trunc_mul_v8i64_v8i16: 1869; AVX512BW: # %bb.0: 1870; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1 1871; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 1872; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1873; AVX512BW-NEXT: vzeroupper 1874; AVX512BW-NEXT: retq 1875; 1876; AVX512DQ-LABEL: trunc_mul_v8i64_v8i16: 1877; AVX512DQ: # %bb.0: 1878; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 1879; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0 1880; AVX512DQ-NEXT: vzeroupper 1881; AVX512DQ-NEXT: retq 1882 %1 = mul <8 x i64> %a0, %a1 1883 %2 = trunc <8 x i64> %1 to <8 x i16> 1884 ret <8 x i16> %2 1885} 1886 1887define <8 x i16> @trunc_mul_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { 1888; SSE-LABEL: trunc_mul_v8i32_v8i16: 1889; SSE: # %bb.0: 1890; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] 1891; SSE-NEXT: pmuludq %xmm2, %xmm0 1892; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1893; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 1894; SSE-NEXT: pmuludq %xmm4, %xmm2 1895; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 1896; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 1897; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 1898; SSE-NEXT: pmuludq %xmm3, %xmm1 1899; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1900; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 1901; SSE-NEXT: pmuludq %xmm2, %xmm3 1902; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] 1903; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 1904; SSE-NEXT: pslld $16, %xmm1 1905; SSE-NEXT: psrad $16, %xmm1 1906; SSE-NEXT: pslld $16, %xmm0 1907; SSE-NEXT: psrad $16, %xmm0 1908; SSE-NEXT: packssdw %xmm1, %xmm0 1909; SSE-NEXT: retq 1910; 1911; AVX1-LABEL: trunc_mul_v8i32_v8i16: 1912; AVX1: # %bb.0: 1913; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm2 1914; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 1915; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1916; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 1917; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> 1918; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 1919; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 1920; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1921; AVX1-NEXT: vzeroupper 1922; AVX1-NEXT: retq 1923; 1924; AVX2-LABEL: trunc_mul_v8i32_v8i16: 1925; AVX2: # %bb.0: 1926; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 1927; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 1928; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 1929; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1930; AVX2-NEXT: vzeroupper 1931; AVX2-NEXT: retq 1932; 1933; AVX512-LABEL: trunc_mul_v8i32_v8i16: 1934; AVX512: # %bb.0: 
1935; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0 1936; AVX512-NEXT: vpmovdw %zmm0, %ymm0 1937; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1938; AVX512-NEXT: vzeroupper 1939; AVX512-NEXT: retq 1940 %1 = mul <8 x i32> %a0, %a1 1941 %2 = trunc <8 x i32> %1 to <8 x i16> 1942 ret <8 x i16> %2 1943} 1944 1945define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { 1946; SSE-LABEL: trunc_mul_v16i64_v16i8: 1947; SSE: # %bb.0: 1948; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm0 1949; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm1 1950; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm2 1951; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm3 1952; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm4 1953; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm5 1954; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm6 1955; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm7 1956; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 1957; SSE-NEXT: pand %xmm8, %xmm7 1958; SSE-NEXT: pand %xmm8, %xmm6 1959; SSE-NEXT: packuswb %xmm7, %xmm6 1960; SSE-NEXT: pand %xmm8, %xmm5 1961; SSE-NEXT: pand %xmm8, %xmm4 1962; SSE-NEXT: packuswb %xmm5, %xmm4 1963; SSE-NEXT: packuswb %xmm6, %xmm4 1964; SSE-NEXT: pand %xmm8, %xmm3 1965; SSE-NEXT: pand %xmm8, %xmm2 1966; SSE-NEXT: packuswb %xmm3, %xmm2 1967; SSE-NEXT: pand %xmm8, %xmm1 1968; SSE-NEXT: pand %xmm8, %xmm0 1969; SSE-NEXT: packuswb %xmm1, %xmm0 1970; SSE-NEXT: packuswb %xmm2, %xmm0 1971; SSE-NEXT: packuswb %xmm4, %xmm0 1972; SSE-NEXT: retq 1973; 1974; AVX1-LABEL: trunc_mul_v16i64_v16i8: 1975; AVX1: # %bb.0: 1976; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm8 1977; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4 1978; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1979; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm0 1980; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm4 1981; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5 1982; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 1983; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm1 1984; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm5 1985; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6 1986; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 1987; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm2 1988; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm6 1989; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7 1990; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 1991; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm3 1992; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255] 1993; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 1994; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6 1995; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3 1996; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2 1997; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 1998; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2 1999; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 2000; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1 2001; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3 2002; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 2003; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0 2004; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3 2005; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 2006; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 2007; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 2008; AVX1-NEXT: vzeroupper 2009; AVX1-NEXT: retq 2010; 2011; AVX2-SLOW-LABEL: trunc_mul_v16i64_v16i8: 2012; AVX2-SLOW: # %bb.0: 2013; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm8 2014; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm7[0,2],xmm8[0,2] 2015; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm7 2016; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm7[0,2] 2017; AVX2-SLOW-NEXT: vpmulld %xmm8, %xmm3, %xmm3 2018; AVX2-SLOW-NEXT: vextractf128 $1, %ymm6, %xmm7 2019; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[0,2] 
2020; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm7 2021; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm7[0,2] 2022; AVX2-SLOW-NEXT: vpmulld %xmm6, %xmm2, %xmm2 2023; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 2024; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 2025; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 2026; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 2027; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] 2028; AVX2-SLOW-NEXT: vpand %xmm6, %xmm2, %xmm2 2029; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm7 2030; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,2],xmm7[0,2] 2031; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm7 2032; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,2] 2033; AVX2-SLOW-NEXT: vpmulld %xmm5, %xmm1, %xmm1 2034; AVX2-SLOW-NEXT: vextractf128 $1, %ymm4, %xmm5 2035; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[0,2] 2036; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 2037; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] 2038; AVX2-SLOW-NEXT: vpmulld %xmm4, %xmm0, %xmm0 2039; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 2040; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 2041; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 2042; AVX2-SLOW-NEXT: vpand %xmm6, %xmm0, %xmm0 2043; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 2044; AVX2-SLOW-NEXT: vzeroupper 2045; AVX2-SLOW-NEXT: retq 2046; 2047; AVX2-FAST-LABEL: trunc_mul_v16i64_v16i8: 2048; AVX2-FAST: # %bb.0: 2049; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,2,4,6,4,6,6,7] 2050; AVX2-FAST-NEXT: vpermd %ymm7, %ymm8, %ymm7 2051; AVX2-FAST-NEXT: vpermd %ymm3, %ymm8, %ymm3 2052; AVX2-FAST-NEXT: vpmulld %xmm7, %xmm3, %xmm3 2053; AVX2-FAST-NEXT: vpermd %ymm6, %ymm8, %ymm6 2054; AVX2-FAST-NEXT: vpermd %ymm2, %ymm8, %ymm2 2055; AVX2-FAST-NEXT: vpmulld %xmm6, %xmm2, %xmm2 2056; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 2057; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 2058; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 2059; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 2060; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] 2061; AVX2-FAST-NEXT: vpand %xmm6, %xmm2, %xmm2 2062; AVX2-FAST-NEXT: vpermd %ymm5, %ymm8, %ymm5 2063; AVX2-FAST-NEXT: vpermd %ymm1, %ymm8, %ymm1 2064; AVX2-FAST-NEXT: vpmulld %xmm5, %xmm1, %xmm1 2065; AVX2-FAST-NEXT: vpermd %ymm4, %ymm8, %ymm4 2066; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm0 2067; AVX2-FAST-NEXT: vpmulld %xmm4, %xmm0, %xmm0 2068; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 2069; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 2070; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 2071; AVX2-FAST-NEXT: vpand %xmm6, %xmm0, %xmm0 2072; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 2073; AVX2-FAST-NEXT: vzeroupper 2074; AVX2-FAST-NEXT: retq 2075; 2076; AVX512F-LABEL: trunc_mul_v16i64_v16i8: 2077; AVX512F: # %bb.0: 2078; AVX512F-NEXT: vpmuludq %zmm2, %zmm0, %zmm0 2079; AVX512F-NEXT: vpmuludq %zmm3, %zmm1, %zmm1 2080; AVX512F-NEXT: vpmovqb %zmm1, %xmm1 2081; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 2082; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2083; AVX512F-NEXT: vzeroupper 2084; AVX512F-NEXT: retq 2085; 2086; AVX512BW-LABEL: trunc_mul_v16i64_v16i8: 2087; AVX512BW: # %bb.0: 2088; AVX512BW-NEXT: vpmuludq %zmm2, %zmm0, %zmm0 2089; AVX512BW-NEXT: vpmuludq %zmm3, %zmm1, %zmm1 2090; 
AVX512BW-NEXT: vpmovqb %zmm1, %xmm1 2091; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 2092; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2093; AVX512BW-NEXT: vzeroupper 2094; AVX512BW-NEXT: retq 2095; 2096; AVX512DQ-LABEL: trunc_mul_v16i64_v16i8: 2097; AVX512DQ: # %bb.0: 2098; AVX512DQ-NEXT: vpmullq %zmm2, %zmm0, %zmm0 2099; AVX512DQ-NEXT: vpmullq %zmm3, %zmm1, %zmm1 2100; AVX512DQ-NEXT: vpmovqb %zmm1, %xmm1 2101; AVX512DQ-NEXT: vpmovqb %zmm0, %xmm0 2102; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2103; AVX512DQ-NEXT: vzeroupper 2104; AVX512DQ-NEXT: retq 2105 %1 = mul <16 x i64> %a0, %a1 2106 %2 = trunc <16 x i64> %1 to <16 x i8> 2107 ret <16 x i8> %2 2108} 2109 2110define <16 x i8> @trunc_mul_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { 2111; SSE-LABEL: trunc_mul_v16i32_v16i8: 2112; SSE: # %bb.0: 2113; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] 2114; SSE-NEXT: pmuludq %xmm4, %xmm0 2115; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2116; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 2117; SSE-NEXT: pmuludq %xmm8, %xmm4 2118; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 2119; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 2120; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] 2121; SSE-NEXT: pmuludq %xmm5, %xmm1 2122; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 2123; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] 2124; SSE-NEXT: pmuludq %xmm4, %xmm5 2125; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] 2126; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] 2127; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] 2128; SSE-NEXT: pmuludq %xmm6, %xmm2 2129; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 2130; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] 2131; SSE-NEXT: pmuludq %xmm4, %xmm5 2132; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] 2133; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] 2134; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] 2135; SSE-NEXT: pmuludq %xmm7, %xmm3 2136; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 2137; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] 2138; SSE-NEXT: pmuludq %xmm4, %xmm5 2139; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] 2140; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] 2141; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 2142; SSE-NEXT: pand %xmm4, %xmm3 2143; SSE-NEXT: pand %xmm4, %xmm2 2144; SSE-NEXT: packuswb %xmm3, %xmm2 2145; SSE-NEXT: pand %xmm4, %xmm1 2146; SSE-NEXT: pand %xmm4, %xmm0 2147; SSE-NEXT: packuswb %xmm1, %xmm0 2148; SSE-NEXT: packuswb %xmm2, %xmm0 2149; SSE-NEXT: retq 2150; 2151; AVX1-LABEL: trunc_mul_v16i32_v16i8: 2152; AVX1: # %bb.0: 2153; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm4 2154; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 2155; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2156; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 2157; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm2 2158; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 2159; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 2160; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1 2161; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255] 2162; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 2163; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 2164; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 2165; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 2166; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2 2167; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 2168; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2169; AVX1-NEXT: vzeroupper 2170; AVX1-NEXT: retq 2171; 2172; AVX2-LABEL: trunc_mul_v16i32_v16i8: 2173; AVX2: # 
%bb.0: 2174; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0 2175; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm1 2176; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 2177; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2178; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 2179; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] 2180; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 2181; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2182; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 2183; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 2184; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2185; AVX2-NEXT: vzeroupper 2186; AVX2-NEXT: retq 2187; 2188; AVX512-LABEL: trunc_mul_v16i32_v16i8: 2189; AVX512: # %bb.0: 2190; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 2191; AVX512-NEXT: vpmovdb %zmm0, %xmm0 2192; AVX512-NEXT: vzeroupper 2193; AVX512-NEXT: retq 2194 %1 = mul <16 x i32> %a0, %a1 2195 %2 = trunc <16 x i32> %1 to <16 x i8> 2196 ret <16 x i8> %2 2197} 2198 2199define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind { 2200; SSE-LABEL: trunc_mul_v16i16_v16i8: 2201; SSE: # %bb.0: 2202; SSE-NEXT: pmullw %xmm2, %xmm0 2203; SSE-NEXT: pmullw %xmm3, %xmm1 2204; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 2205; SSE-NEXT: pand %xmm2, %xmm1 2206; SSE-NEXT: pand %xmm2, %xmm0 2207; SSE-NEXT: packuswb %xmm1, %xmm0 2208; SSE-NEXT: retq 2209; 2210; AVX1-LABEL: trunc_mul_v16i16_v16i8: 2211; AVX1: # %bb.0: 2212; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm2 2213; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 2214; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2215; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 2216; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] 2217; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 2218; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1 2219; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 2220; AVX1-NEXT: vzeroupper 2221; AVX1-NEXT: retq 2222; 2223; AVX2-LABEL: trunc_mul_v16i16_v16i8: 2224; AVX2: # %bb.0: 2225; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2226; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 2227; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2228; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2229; AVX2-NEXT: vzeroupper 2230; AVX2-NEXT: retq 2231; 2232; AVX512F-LABEL: trunc_mul_v16i16_v16i8: 2233; AVX512F: # %bb.0: 2234; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2235; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 2236; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 2237; AVX512F-NEXT: vzeroupper 2238; AVX512F-NEXT: retq 2239; 2240; AVX512BW-LABEL: trunc_mul_v16i16_v16i8: 2241; AVX512BW: # %bb.0: 2242; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2243; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 2244; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2245; AVX512BW-NEXT: vzeroupper 2246; AVX512BW-NEXT: retq 2247; 2248; AVX512DQ-LABEL: trunc_mul_v16i16_v16i8: 2249; AVX512DQ: # %bb.0: 2250; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2251; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 2252; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 2253; AVX512DQ-NEXT: vzeroupper 2254; AVX512DQ-NEXT: retq 2255 %1 = mul <16 x i16> %a0, 
%a1 2256 %2 = trunc <16 x i16> %1 to <16 x i8> 2257 ret <16 x i8> %2 2258} 2259 2260define <8 x i16> @trunc_mul_v8i32_v8i16_zext_8i8(<16 x i8> %a0, <8 x i32> %a1) { 2261; SSE-LABEL: trunc_mul_v8i32_v8i16_zext_8i8: 2262; SSE: # %bb.0: 2263; SSE-NEXT: pxor %xmm3, %xmm3 2264; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] 2265; SSE-NEXT: pslld $16, %xmm2 2266; SSE-NEXT: psrad $16, %xmm2 2267; SSE-NEXT: pslld $16, %xmm1 2268; SSE-NEXT: psrad $16, %xmm1 2269; SSE-NEXT: packssdw %xmm2, %xmm1 2270; SSE-NEXT: pmullw %xmm1, %xmm0 2271; SSE-NEXT: retq 2272; 2273; AVX1-LABEL: trunc_mul_v8i32_v8i16_zext_8i8: 2274; AVX1: # %bb.0: 2275; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2276; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> 2277; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2278; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 2279; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] 2280; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2281; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 2282; AVX1-NEXT: vzeroupper 2283; AVX1-NEXT: retq 2284; 2285; AVX2-LABEL: trunc_mul_v8i32_v8i16_zext_8i8: 2286; AVX2: # %bb.0: 2287; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2288; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 2289; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 2290; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 2291; AVX2-NEXT: vzeroupper 2292; AVX2-NEXT: retq 2293; 2294; AVX512-LABEL: trunc_mul_v8i32_v8i16_zext_8i8: 2295; AVX512: # %bb.0: 2296; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 2297; AVX512-NEXT: vpmovdw %zmm1, %ymm1 2298; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2299; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 2300; AVX512-NEXT: vzeroupper 2301; AVX512-NEXT: retq 2302 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2303 %2 = zext <8 x i8> %1 to <8 x i32> 2304 %3 = mul <8 x i32> %2, %a1 2305 %4 = trunc <8 x i32> %3 to <8 x i16> 2306 ret <8 x i16> %4 2307} 2308 2309; 2310; mul to constant 2311; 2312 2313define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind { 2314; SSE-LABEL: trunc_mul_const_v4i64_v4i32: 2315; SSE: # %bb.0: 2316; SSE-NEXT: xorps %xmm2, %xmm2 2317; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 2318; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm1 2319; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2] 2320; SSE-NEXT: movaps %xmm2, %xmm0 2321; SSE-NEXT: retq 2322; 2323; AVX1-LABEL: trunc_mul_const_v4i64_v4i32: 2324; AVX1: # %bb.0: 2325; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2326; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2327; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 2328; AVX1-NEXT: vzeroupper 2329; AVX1-NEXT: retq 2330; 2331; AVX2-SLOW-LABEL: trunc_mul_const_v4i64_v4i32: 2332; AVX2-SLOW: # %bb.0: 2333; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 2334; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2335; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 2336; AVX2-SLOW-NEXT: vzeroupper 2337; AVX2-SLOW-NEXT: retq 2338; 2339; AVX2-FAST-LABEL: trunc_mul_const_v4i64_v4i32: 2340; 
AVX2-FAST: # %bb.0: 2341; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> 2342; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 2343; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 2344; AVX2-FAST-NEXT: vzeroupper 2345; AVX2-FAST-NEXT: retq 2346; 2347; AVX512-LABEL: trunc_mul_const_v4i64_v4i32: 2348; AVX512: # %bb.0: 2349; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 2350; AVX512-NEXT: vpmovqd %zmm0, %ymm0 2351; AVX512-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 2352; AVX512-NEXT: vzeroupper 2353; AVX512-NEXT: retq 2354 %1 = mul <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3> 2355 %2 = trunc <4 x i64> %1 to <4 x i32> 2356 ret <4 x i32> %2 2357} 2358 2359define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind { 2360; SSE-LABEL: trunc_mul_const_v8i64_v8i16: 2361; SSE: # %bb.0: 2362; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 2363; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 2364; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2365; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] 2366; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 2367; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 2368; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] 2369; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 2370; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 2371; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2372; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] 2373; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 2374; SSE-NEXT: retq 2375; 2376; AVX1-LABEL: trunc_mul_const_v8i64_v8i16: 2377; AVX1: # %bb.0: 2378; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] 2379; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 2380; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 2381; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 2382; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 2383; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2384; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 2385; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 2386; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 2387; AVX1-NEXT: vzeroupper 2388; AVX1-NEXT: retq 2389; 2390; AVX2-SLOW-LABEL: trunc_mul_const_v8i64_v8i16: 2391; AVX2-SLOW: # %bb.0: 2392; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] 2393; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 2394; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] 2395; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 2396; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 2397; AVX2-SLOW-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 2398; AVX2-SLOW-NEXT: vzeroupper 2399; AVX2-SLOW-NEXT: retq 2400; 2401; AVX2-FAST-LABEL: trunc_mul_const_v8i64_v8i16: 2402; AVX2-FAST: # %bb.0: 2403; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] 2404; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 2405; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 2406; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 2407; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 2408; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 2409; AVX2-FAST-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 2410; AVX2-FAST-NEXT: vzeroupper 2411; AVX2-FAST-NEXT: retq 2412; 2413; AVX512-LABEL: trunc_mul_const_v8i64_v8i16: 2414; AVX512: # %bb.0: 2415; AVX512-NEXT: vpmovqw %zmm0, %xmm0 2416; AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 2417; AVX512-NEXT: vzeroupper 2418; AVX512-NEXT: retq 2419 %1 = mul <8 x 
i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> 2420 %2 = trunc <8 x i64> %1 to <8 x i16> 2421 ret <8 x i16> %2 2422} 2423 2424define <8 x i16> @trunc_mul_const_v8i32_v8i16(<8 x i32> %a0) nounwind { 2425; SSE-LABEL: trunc_mul_const_v8i32_v8i16: 2426; SSE: # %bb.0: 2427; SSE-NEXT: pslld $16, %xmm1 2428; SSE-NEXT: psrad $16, %xmm1 2429; SSE-NEXT: pslld $16, %xmm0 2430; SSE-NEXT: psrad $16, %xmm0 2431; SSE-NEXT: packssdw %xmm1, %xmm0 2432; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 2433; SSE-NEXT: retq 2434; 2435; AVX1-LABEL: trunc_mul_const_v8i32_v8i16: 2436; AVX1: # %bb.0: 2437; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2438; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> 2439; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 2440; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 2441; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2442; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 2443; AVX1-NEXT: vzeroupper 2444; AVX1-NEXT: retq 2445; 2446; AVX2-LABEL: trunc_mul_const_v8i32_v8i16: 2447; AVX2: # %bb.0: 2448; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 2449; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 2450; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 2451; AVX2-NEXT: vzeroupper 2452; AVX2-NEXT: retq 2453; 2454; AVX512-LABEL: trunc_mul_const_v8i32_v8i16: 2455; AVX512: # %bb.0: 2456; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 2457; AVX512-NEXT: vpmovdw %zmm0, %ymm0 2458; AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 2459; AVX512-NEXT: vzeroupper 2460; AVX512-NEXT: retq 2461 %1 = mul <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2462 %2 = trunc <8 x i32> %1 to <8 x i16> 2463 ret <8 x i16> %2 2464} 2465 2466define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind { 2467; SSE-LABEL: trunc_mul_const_v16i64_v16i8: 2468; SSE: # %bb.0: 2469; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm1 2470; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm2 2471; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm3 2472; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm4 2473; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm5 2474; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm6 2475; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm7 2476; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 2477; SSE-NEXT: pand %xmm8, %xmm7 2478; SSE-NEXT: pand %xmm8, %xmm6 2479; SSE-NEXT: packuswb %xmm7, %xmm6 2480; SSE-NEXT: pand %xmm8, %xmm5 2481; SSE-NEXT: pand %xmm8, %xmm4 2482; SSE-NEXT: packuswb %xmm5, %xmm4 2483; SSE-NEXT: packuswb %xmm6, %xmm4 2484; SSE-NEXT: pand %xmm8, %xmm3 2485; SSE-NEXT: pand %xmm8, %xmm2 2486; SSE-NEXT: packuswb %xmm3, %xmm2 2487; SSE-NEXT: pand %xmm8, %xmm1 2488; SSE-NEXT: pand {{.*}}(%rip), %xmm0 2489; SSE-NEXT: packuswb %xmm1, %xmm0 2490; SSE-NEXT: packuswb %xmm2, %xmm0 2491; SSE-NEXT: packuswb %xmm4, %xmm0 2492; SSE-NEXT: retq 2493; 2494; AVX1-LABEL: trunc_mul_const_v16i64_v16i8: 2495; AVX1: # %bb.0: 2496; AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm8 2497; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2498; AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 2499; AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm5 2500; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 2501; AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 2502; AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm6 2503; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 2504; AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2 2505; AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm3, %xmm7 2506; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 2507; AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm3, %xmm3 
2508; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255] 2509; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 2510; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm7 2511; AVX1-NEXT: vpackusdw %xmm3, %xmm7, %xmm3 2512; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2513; AVX1-NEXT: vpand %xmm4, %xmm6, %xmm6 2514; AVX1-NEXT: vpackusdw %xmm2, %xmm6, %xmm2 2515; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 2516; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 2517; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm3 2518; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 2519; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 2520; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm3 2521; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 2522; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 2523; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 2524; AVX1-NEXT: vzeroupper 2525; AVX1-NEXT: retq 2526; 2527; AVX2-SLOW-LABEL: trunc_mul_const_v16i64_v16i8: 2528; AVX2-SLOW: # %bb.0: 2529; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 2530; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] 2531; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2 2532; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 2533; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] 2534; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3 2535; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 2536; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 2537; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 2538; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 2539; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] 2540; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 2541; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 2542; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] 2543; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 2544; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5 2545; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] 2546; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 2547; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 2548; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 2549; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 2550; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 2551; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 2552; AVX2-SLOW-NEXT: vzeroupper 2553; AVX2-SLOW-NEXT: retq 2554; 2555; AVX2-FAST-LABEL: trunc_mul_const_v16i64_v16i8: 2556; AVX2-FAST: # %bb.0: 2557; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] 2558; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 2559; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2 2560; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 2561; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3 2562; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 2563; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 2564; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 2565; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 2566; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] 2567; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 2568; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 2569; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 2570; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 2571; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 2572; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 2573; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 2574; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 2575; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 2576; 
AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 2577; AVX2-FAST-NEXT: vzeroupper 2578; AVX2-FAST-NEXT: retq 2579; 2580; AVX512F-LABEL: trunc_mul_const_v16i64_v16i8: 2581; AVX512F: # %bb.0: 2582; AVX512F-NEXT: vpmuludq {{.*}}(%rip), %zmm0, %zmm0 2583; AVX512F-NEXT: vpmuludq {{.*}}(%rip), %zmm1, %zmm1 2584; AVX512F-NEXT: vpmovqb %zmm1, %xmm1 2585; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 2586; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2587; AVX512F-NEXT: vzeroupper 2588; AVX512F-NEXT: retq 2589; 2590; AVX512BW-LABEL: trunc_mul_const_v16i64_v16i8: 2591; AVX512BW: # %bb.0: 2592; AVX512BW-NEXT: vpmuludq {{.*}}(%rip), %zmm0, %zmm0 2593; AVX512BW-NEXT: vpmuludq {{.*}}(%rip), %zmm1, %zmm1 2594; AVX512BW-NEXT: vpmovqb %zmm1, %xmm1 2595; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 2596; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2597; AVX512BW-NEXT: vzeroupper 2598; AVX512BW-NEXT: retq 2599; 2600; AVX512DQ-LABEL: trunc_mul_const_v16i64_v16i8: 2601; AVX512DQ: # %bb.0: 2602; AVX512DQ-NEXT: vpmullq {{.*}}(%rip), %zmm0, %zmm0 2603; AVX512DQ-NEXT: vpmullq {{.*}}(%rip), %zmm1, %zmm1 2604; AVX512DQ-NEXT: vpmovqb %zmm1, %xmm1 2605; AVX512DQ-NEXT: vpmovqb %zmm0, %xmm0 2606; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2607; AVX512DQ-NEXT: vzeroupper 2608; AVX512DQ-NEXT: retq 2609 %1 = mul <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> 2610 %2 = trunc <16 x i64> %1 to <16 x i8> 2611 ret <16 x i8> %2 2612} 2613 2614define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind { 2615; SSE-LABEL: trunc_mul_const_v16i32_v16i8: 2616; SSE: # %bb.0: 2617; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,1,2,3] 2618; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] 2619; SSE-NEXT: pmuludq %xmm4, %xmm0 2620; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2621; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 2622; SSE-NEXT: pmuludq %xmm5, %xmm4 2623; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 2624; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 2625; SSE-NEXT: movdqa {{.*#+}} xmm4 = [4,5,6,7] 2626; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] 2627; SSE-NEXT: pmuludq %xmm4, %xmm1 2628; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 2629; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 2630; SSE-NEXT: pmuludq %xmm5, %xmm4 2631; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 2632; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] 2633; SSE-NEXT: movdqa {{.*#+}} xmm4 = [8,9,10,11] 2634; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] 2635; SSE-NEXT: pmuludq %xmm4, %xmm2 2636; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 2637; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 2638; SSE-NEXT: pmuludq %xmm5, %xmm4 2639; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 2640; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] 2641; SSE-NEXT: movdqa {{.*#+}} xmm4 = [12,13,14,15] 2642; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] 2643; SSE-NEXT: pmuludq %xmm4, %xmm3 2644; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 2645; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 2646; SSE-NEXT: pmuludq %xmm5, %xmm4 2647; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 2648; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] 2649; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 2650; SSE-NEXT: pand %xmm4, %xmm3 2651; SSE-NEXT: pand %xmm4, %xmm2 2652; SSE-NEXT: packuswb %xmm3, %xmm2 2653; SSE-NEXT: pand %xmm4, %xmm1 2654; 
SSE-NEXT: pand %xmm4, %xmm0 2655; SSE-NEXT: packuswb %xmm1, %xmm0 2656; SSE-NEXT: packuswb %xmm2, %xmm0 2657; SSE-NEXT: retq 2658; 2659; AVX1-LABEL: trunc_mul_const_v16i32_v16i8: 2660; AVX1: # %bb.0: 2661; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 2662; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2663; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 2664; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm3 2665; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 2666; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 2667; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255] 2668; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 2669; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 2670; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 2671; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 2672; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2673; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 2674; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2675; AVX1-NEXT: vzeroupper 2676; AVX1-NEXT: retq 2677; 2678; AVX2-LABEL: trunc_mul_const_v16i32_v16i8: 2679; AVX2: # %bb.0: 2680; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 2681; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2682; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 2683; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 2684; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] 2685; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 2686; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2687; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 2688; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 2689; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 2690; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2691; AVX2-NEXT: vzeroupper 2692; AVX2-NEXT: retq 2693; 2694; AVX512-LABEL: trunc_mul_const_v16i32_v16i8: 2695; AVX512: # %bb.0: 2696; AVX512-NEXT: vpmulld {{.*}}(%rip), %zmm0, %zmm0 2697; AVX512-NEXT: vpmovdb %zmm0, %xmm0 2698; AVX512-NEXT: vzeroupper 2699; AVX512-NEXT: retq 2700 %1 = mul <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 2701 %2 = trunc <16 x i32> %1 to <16 x i8> 2702 ret <16 x i8> %2 2703} 2704 2705define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind { 2706; SSE-LABEL: trunc_mul_const_v16i16_v16i8: 2707; SSE: # %bb.0: 2708; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 2709; SSE-NEXT: pmullw {{.*}}(%rip), %xmm1 2710; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 2711; SSE-NEXT: pand %xmm2, %xmm1 2712; SSE-NEXT: pand %xmm2, %xmm0 2713; SSE-NEXT: packuswb %xmm1, %xmm0 2714; SSE-NEXT: retq 2715; 2716; AVX1-LABEL: trunc_mul_const_v16i16_v16i8: 2717; AVX1: # %bb.0: 2718; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1 2719; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2720; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 2721; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 2722; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 2723; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 2724; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 2725; AVX1-NEXT: vzeroupper 2726; AVX1-NEXT: retq 2727; 2728; AVX2-LABEL: trunc_mul_const_v16i16_v16i8: 2729; AVX2: # %bb.0: 2730; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 2731; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 2732; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2733; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2734; AVX2-NEXT: vzeroupper 2735; AVX2-NEXT: retq 2736; 2737; AVX512F-LABEL: trunc_mul_const_v16i16_v16i8: 2738; AVX512F: # %bb.0: 2739; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 2740; AVX512F-NEXT: 
vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 2741; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 2742; AVX512F-NEXT: vzeroupper 2743; AVX512F-NEXT: retq 2744; 2745; AVX512BW-LABEL: trunc_mul_const_v16i16_v16i8: 2746; AVX512BW: # %bb.0: 2747; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 2748; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 2749; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2750; AVX512BW-NEXT: vzeroupper 2751; AVX512BW-NEXT: retq 2752; 2753; AVX512DQ-LABEL: trunc_mul_const_v16i16_v16i8: 2754; AVX512DQ: # %bb.0: 2755; AVX512DQ-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 2756; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 2757; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 2758; AVX512DQ-NEXT: vzeroupper 2759; AVX512DQ-NEXT: retq 2760 %1 = mul <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> 2761 %2 = trunc <16 x i16> %1 to <16 x i8> 2762 ret <16 x i8> %2 2763} 2764 2765; 2766; and 2767; 2768 2769define <4 x i32> @trunc_and_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { 2770; SSE-LABEL: trunc_and_v4i64_v4i32: 2771; SSE: # %bb.0: 2772; SSE-NEXT: andps %xmm3, %xmm1 2773; SSE-NEXT: andps %xmm2, %xmm0 2774; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2775; SSE-NEXT: retq 2776; 2777; AVX1-LABEL: trunc_and_v4i64_v4i32: 2778; AVX1: # %bb.0: 2779; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 2780; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2781; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2782; AVX1-NEXT: vzeroupper 2783; AVX1-NEXT: retq 2784; 2785; AVX2-SLOW-LABEL: trunc_and_v4i64_v4i32: 2786; AVX2-SLOW: # %bb.0: 2787; AVX2-SLOW-NEXT: vandps %ymm1, %ymm0, %ymm0 2788; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 2789; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2790; AVX2-SLOW-NEXT: vzeroupper 2791; AVX2-SLOW-NEXT: retq 2792; 2793; AVX2-FAST-LABEL: trunc_and_v4i64_v4i32: 2794; AVX2-FAST: # %bb.0: 2795; AVX2-FAST-NEXT: vandps %ymm1, %ymm0, %ymm0 2796; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> 2797; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 2798; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2799; AVX2-FAST-NEXT: vzeroupper 2800; AVX2-FAST-NEXT: retq 2801; 2802; AVX512-LABEL: trunc_and_v4i64_v4i32: 2803; AVX512: # %bb.0: 2804; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 2805; AVX512-NEXT: vpmovqd %zmm0, %ymm0 2806; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2807; AVX512-NEXT: vzeroupper 2808; AVX512-NEXT: retq 2809 %1 = and <4 x i64> %a0, %a1 2810 %2 = trunc <4 x i64> %1 to <4 x i32> 2811 ret <4 x i32> %2 2812} 2813 2814define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { 2815; SSE-LABEL: trunc_and_v8i64_v8i16: 2816; SSE: # %bb.0: 2817; SSE-NEXT: pand %xmm6, %xmm2 2818; SSE-NEXT: pand %xmm7, %xmm3 2819; SSE-NEXT: pand %xmm4, %xmm0 2820; SSE-NEXT: pand %xmm5, %xmm1 2821; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 2822; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 2823; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2824; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] 2825; SSE-NEXT: punpckldq 
{{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 2826; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 2827; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] 2828; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 2829; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 2830; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2831; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] 2832; SSE-NEXT: retq 2833; 2834; AVX1-LABEL: trunc_and_v8i64_v8i16: 2835; AVX1: # %bb.0: 2836; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535] 2837; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 2838; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 2839; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 2840; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 2841; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 2842; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 2843; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2844; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 2845; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 2846; AVX1-NEXT: vzeroupper 2847; AVX1-NEXT: retq 2848; 2849; AVX2-SLOW-LABEL: trunc_and_v8i64_v8i16: 2850; AVX2-SLOW: # %bb.0: 2851; AVX2-SLOW-NEXT: vandps %ymm3, %ymm1, %ymm1 2852; AVX2-SLOW-NEXT: vandps %ymm2, %ymm0, %ymm0 2853; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] 2854; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 2855; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] 2856; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 2857; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 2858; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2859; AVX2-SLOW-NEXT: vzeroupper 2860; AVX2-SLOW-NEXT: retq 2861; 2862; AVX2-FAST-LABEL: trunc_and_v8i64_v8i16: 2863; AVX2-FAST: # %bb.0: 2864; AVX2-FAST-NEXT: vpand %ymm3, %ymm1, %ymm1 2865; AVX2-FAST-NEXT: vpand %ymm2, %ymm0, %ymm0 2866; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] 2867; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 2868; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 2869; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 2870; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 2871; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 2872; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2873; AVX2-FAST-NEXT: vzeroupper 2874; AVX2-FAST-NEXT: retq 2875; 2876; AVX512-LABEL: trunc_and_v8i64_v8i16: 2877; AVX512: # %bb.0: 2878; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 2879; AVX512-NEXT: vpmovqw %zmm0, %xmm0 2880; AVX512-NEXT: vzeroupper 2881; AVX512-NEXT: retq 2882 %1 = and <8 x i64> %a0, %a1 2883 %2 = trunc <8 x i64> %1 to <8 x i16> 2884 ret <8 x i16> %2 2885} 2886 2887define <8 x i16> @trunc_and_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { 2888; SSE-LABEL: trunc_and_v8i32_v8i16: 2889; SSE: # %bb.0: 2890; SSE-NEXT: pand %xmm2, %xmm0 2891; SSE-NEXT: pand %xmm3, %xmm1 2892; SSE-NEXT: pslld $16, %xmm1 2893; SSE-NEXT: psrad $16, %xmm1 2894; SSE-NEXT: pslld $16, %xmm0 2895; SSE-NEXT: psrad $16, %xmm0 2896; SSE-NEXT: packssdw %xmm1, %xmm0 2897; SSE-NEXT: retq 2898; 2899; AVX1-LABEL: trunc_and_v8i32_v8i16: 2900; AVX1: # %bb.0: 2901; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 2902; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2903; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> 2904; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 2905; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 2906; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2907; 
AVX1-NEXT: vzeroupper 2908; AVX1-NEXT: retq 2909; 2910; AVX2-LABEL: trunc_and_v8i32_v8i16: 2911; AVX2: # %bb.0: 2912; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 2913; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 2914; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 2915; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2916; AVX2-NEXT: vzeroupper 2917; AVX2-NEXT: retq 2918; 2919; AVX512-LABEL: trunc_and_v8i32_v8i16: 2920; AVX512: # %bb.0: 2921; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 2922; AVX512-NEXT: vpmovdw %zmm0, %ymm0 2923; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2924; AVX512-NEXT: vzeroupper 2925; AVX512-NEXT: retq 2926 %1 = and <8 x i32> %a0, %a1 2927 %2 = trunc <8 x i32> %1 to <8 x i16> 2928 ret <8 x i16> %2 2929} 2930 2931define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { 2932; SSE-LABEL: trunc_and_v16i64_v16i8: 2933; SSE: # %bb.0: 2934; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm0 2935; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm1 2936; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm2 2937; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm3 2938; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm4 2939; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm5 2940; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm6 2941; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm7 2942; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 2943; SSE-NEXT: pand %xmm8, %xmm7 2944; SSE-NEXT: pand %xmm8, %xmm6 2945; SSE-NEXT: packuswb %xmm7, %xmm6 2946; SSE-NEXT: pand %xmm8, %xmm5 2947; SSE-NEXT: pand %xmm8, %xmm4 2948; SSE-NEXT: packuswb %xmm5, %xmm4 2949; SSE-NEXT: packuswb %xmm6, %xmm4 2950; SSE-NEXT: pand %xmm8, %xmm3 2951; SSE-NEXT: pand %xmm8, %xmm2 2952; SSE-NEXT: packuswb %xmm3, %xmm2 2953; SSE-NEXT: pand %xmm8, %xmm1 2954; SSE-NEXT: pand %xmm8, %xmm0 2955; SSE-NEXT: packuswb %xmm1, %xmm0 2956; SSE-NEXT: packuswb %xmm2, %xmm0 2957; SSE-NEXT: packuswb %xmm4, %xmm0 2958; SSE-NEXT: retq 2959; 2960; AVX1-LABEL: trunc_and_v16i64_v16i8: 2961; AVX1: # %bb.0: 2962; AVX1-NEXT: vmovaps {{.*#+}} ymm8 = [255,255,255,255] 2963; AVX1-NEXT: vandps %ymm7, %ymm8, %ymm7 2964; AVX1-NEXT: vandps %ymm7, %ymm3, %ymm3 2965; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm7 2966; AVX1-NEXT: vpackusdw %xmm7, %xmm3, %xmm3 2967; AVX1-NEXT: vandps %ymm6, %ymm8, %ymm6 2968; AVX1-NEXT: vandps %ymm6, %ymm2, %ymm2 2969; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 2970; AVX1-NEXT: vpackusdw %xmm6, %xmm2, %xmm2 2971; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 2972; AVX1-NEXT: vandps %ymm5, %ymm8, %ymm3 2973; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 2974; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 2975; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 2976; AVX1-NEXT: vandps %ymm4, %ymm8, %ymm3 2977; AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0 2978; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 2979; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 2980; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 2981; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 2982; AVX1-NEXT: vzeroupper 2983; AVX1-NEXT: retq 2984; 2985; AVX2-SLOW-LABEL: trunc_and_v16i64_v16i8: 2986; AVX2-SLOW: # %bb.0: 2987; AVX2-SLOW-NEXT: vandps %ymm5, %ymm1, %ymm1 2988; AVX2-SLOW-NEXT: vandps %ymm4, %ymm0, %ymm0 2989; AVX2-SLOW-NEXT: vandps %ymm7, %ymm3, %ymm3 2990; AVX2-SLOW-NEXT: vandps %ymm6, %ymm2, %ymm2 2991; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3] 2992; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 2993; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm4[0,2],ymm2[4,6],ymm4[4,6] 2994; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 2995; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 2996; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 2997; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] 2998; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 2999; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3] 3000; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 3001; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm5[0,2],ymm0[4,6],ymm5[4,6] 3002; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 3003; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3004; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 3005; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 3006; AVX2-SLOW-NEXT: vzeroupper 3007; AVX2-SLOW-NEXT: retq 3008; 3009; AVX2-FAST-LABEL: trunc_and_v16i64_v16i8: 3010; AVX2-FAST: # %bb.0: 3011; AVX2-FAST-NEXT: vpand %ymm5, %ymm1, %ymm1 3012; AVX2-FAST-NEXT: vpand %ymm4, %ymm0, %ymm0 3013; AVX2-FAST-NEXT: vpand %ymm7, %ymm3, %ymm3 3014; AVX2-FAST-NEXT: vpand %ymm6, %ymm2, %ymm2 3015; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] 3016; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 3017; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 3018; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 3019; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 3020; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 3021; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 3022; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] 3023; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 3024; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 3025; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 3026; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 3027; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 3028; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3029; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 3030; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 3031; AVX2-FAST-NEXT: vzeroupper 3032; AVX2-FAST-NEXT: retq 3033; 3034; AVX512-LABEL: trunc_and_v16i64_v16i8: 3035; AVX512: # %bb.0: 3036; AVX512-NEXT: vpandq %zmm2, %zmm0, %zmm0 3037; AVX512-NEXT: vpandq %zmm3, %zmm1, %zmm1 3038; AVX512-NEXT: vpmovqb %zmm1, %xmm1 3039; AVX512-NEXT: vpmovqb %zmm0, %xmm0 3040; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3041; AVX512-NEXT: vzeroupper 3042; AVX512-NEXT: retq 3043 %1 = and <16 x i64> %a0, %a1 3044 %2 = trunc <16 x i64> %1 to <16 x i8> 3045 ret <16 x i8> %2 3046} 3047 3048define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { 3049; SSE-LABEL: trunc_and_v16i32_v16i8: 3050; SSE: # %bb.0: 3051; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 3052; SSE-NEXT: pand %xmm8, %xmm7 3053; SSE-NEXT: pand %xmm3, %xmm7 3054; SSE-NEXT: pand %xmm8, %xmm6 3055; SSE-NEXT: pand %xmm2, %xmm6 3056; SSE-NEXT: packuswb %xmm7, %xmm6 3057; SSE-NEXT: pand %xmm8, %xmm5 3058; SSE-NEXT: pand %xmm1, %xmm5 3059; SSE-NEXT: pand %xmm8, %xmm4 3060; SSE-NEXT: pand %xmm4, %xmm0 3061; SSE-NEXT: packuswb %xmm5, %xmm0 3062; SSE-NEXT: packuswb %xmm6, %xmm0 3063; SSE-NEXT: retq 3064; 3065; AVX1-LABEL: trunc_and_v16i32_v16i8: 3066; AVX1: # %bb.0: 3067; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255] 3068; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 3069; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 3070; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 3071; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 3072; AVX1-NEXT: 
vandps %ymm4, %ymm2, %ymm2 3073; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 3074; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 3075; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 3076; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3077; AVX1-NEXT: vzeroupper 3078; AVX1-NEXT: retq 3079; 3080; AVX2-LABEL: trunc_and_v16i32_v16i8: 3081; AVX2: # %bb.0: 3082; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 3083; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 3084; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 3085; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 3086; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 3087; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] 3088; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 3089; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 3090; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3091; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 3092; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3093; AVX2-NEXT: vzeroupper 3094; AVX2-NEXT: retq 3095; 3096; AVX512-LABEL: trunc_and_v16i32_v16i8: 3097; AVX512: # %bb.0: 3098; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 3099; AVX512-NEXT: vpmovdb %zmm0, %xmm0 3100; AVX512-NEXT: vzeroupper 3101; AVX512-NEXT: retq 3102 %1 = and <16 x i32> %a0, %a1 3103 %2 = trunc <16 x i32> %1 to <16 x i8> 3104 ret <16 x i8> %2 3105} 3106 3107define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind { 3108; SSE-LABEL: trunc_and_v16i16_v16i8: 3109; SSE: # %bb.0: 3110; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 3111; SSE-NEXT: pand %xmm4, %xmm3 3112; SSE-NEXT: pand %xmm1, %xmm3 3113; SSE-NEXT: pand %xmm4, %xmm2 3114; SSE-NEXT: pand %xmm2, %xmm0 3115; SSE-NEXT: packuswb %xmm3, %xmm0 3116; SSE-NEXT: retq 3117; 3118; AVX1-LABEL: trunc_and_v16i16_v16i8: 3119; AVX1: # %bb.0: 3120; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 3121; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 3122; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3123; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3124; AVX1-NEXT: vzeroupper 3125; AVX1-NEXT: retq 3126; 3127; AVX2-LABEL: trunc_and_v16i16_v16i8: 3128; AVX2: # %bb.0: 3129; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 3130; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 3131; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3132; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3133; AVX2-NEXT: vzeroupper 3134; AVX2-NEXT: retq 3135; 3136; AVX512F-LABEL: trunc_and_v16i16_v16i8: 3137; AVX512F: # %bb.0: 3138; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 3139; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 3140; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 3141; AVX512F-NEXT: vzeroupper 3142; AVX512F-NEXT: retq 3143; 3144; AVX512BW-LABEL: trunc_and_v16i16_v16i8: 3145; AVX512BW: # %bb.0: 3146; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm0 3147; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 3148; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3149; AVX512BW-NEXT: vzeroupper 3150; AVX512BW-NEXT: retq 3151; 3152; AVX512DQ-LABEL: trunc_and_v16i16_v16i8: 3153; AVX512DQ: # %bb.0: 3154; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0 3155; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 3156; AVX512DQ-NEXT: vpmovdb 
%zmm0, %xmm0 3157; AVX512DQ-NEXT: vzeroupper 3158; AVX512DQ-NEXT: retq 3159 %1 = and <16 x i16> %a0, %a1 3160 %2 = trunc <16 x i16> %1 to <16 x i8> 3161 ret <16 x i8> %2 3162} 3163 3164; 3165; and to constant 3166; 3167 3168define <4 x i32> @trunc_and_const_v4i64_v4i32(<4 x i64> %a0) nounwind { 3169; SSE-LABEL: trunc_and_const_v4i64_v4i32: 3170; SSE: # %bb.0: 3171; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3172; SSE-NEXT: andps {{.*}}(%rip), %xmm0 3173; SSE-NEXT: retq 3174; 3175; AVX1-LABEL: trunc_and_const_v4i64_v4i32: 3176; AVX1: # %bb.0: 3177; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3178; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3179; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 3180; AVX1-NEXT: vzeroupper 3181; AVX1-NEXT: retq 3182; 3183; AVX2-SLOW-LABEL: trunc_and_const_v4i64_v4i32: 3184; AVX2-SLOW: # %bb.0: 3185; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 3186; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3187; AVX2-SLOW-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 3188; AVX2-SLOW-NEXT: vzeroupper 3189; AVX2-SLOW-NEXT: retq 3190; 3191; AVX2-FAST-LABEL: trunc_and_const_v4i64_v4i32: 3192; AVX2-FAST: # %bb.0: 3193; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> 3194; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 3195; AVX2-FAST-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 3196; AVX2-FAST-NEXT: vzeroupper 3197; AVX2-FAST-NEXT: retq 3198; 3199; AVX512-LABEL: trunc_and_const_v4i64_v4i32: 3200; AVX512: # %bb.0: 3201; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 3202; AVX512-NEXT: vpmovqd %zmm0, %ymm0 3203; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 3204; AVX512-NEXT: vzeroupper 3205; AVX512-NEXT: retq 3206 %1 = and <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3> 3207 %2 = trunc <4 x i64> %1 to <4 x i32> 3208 ret <4 x i32> %2 3209} 3210 3211define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind { 3212; SSE-LABEL: trunc_and_const_v8i64_v8i16: 3213; SSE: # %bb.0: 3214; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 3215; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 3216; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 3217; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] 3218; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 3219; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 3220; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] 3221; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 3222; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 3223; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3224; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] 3225; SSE-NEXT: andpd {{.*}}(%rip), %xmm0 3226; SSE-NEXT: retq 3227; 3228; AVX1-LABEL: trunc_and_const_v8i64_v8i16: 3229; AVX1: # %bb.0: 3230; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] 3231; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 3232; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 3233; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 3234; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 3235; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 3236; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 3237; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3238; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 3239; AVX1-NEXT: vzeroupper 3240; AVX1-NEXT: retq 3241; 3242; AVX2-SLOW-LABEL: trunc_and_const_v8i64_v8i16: 3243; AVX2-SLOW: # %bb.0: 3244; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] 3245; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 3246; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] 3247; 
AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 3248; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3249; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 3250; AVX2-SLOW-NEXT: vzeroupper 3251; AVX2-SLOW-NEXT: retq 3252; 3253; AVX2-FAST-LABEL: trunc_and_const_v8i64_v8i16: 3254; AVX2-FAST: # %bb.0: 3255; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] 3256; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 3257; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 3258; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 3259; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 3260; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3261; AVX2-FAST-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 3262; AVX2-FAST-NEXT: vzeroupper 3263; AVX2-FAST-NEXT: retq 3264; 3265; AVX512-LABEL: trunc_and_const_v8i64_v8i16: 3266; AVX512: # %bb.0: 3267; AVX512-NEXT: vpmovqw %zmm0, %xmm0 3268; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 3269; AVX512-NEXT: vzeroupper 3270; AVX512-NEXT: retq 3271 %1 = and <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> 3272 %2 = trunc <8 x i64> %1 to <8 x i16> 3273 ret <8 x i16> %2 3274} 3275 3276define <8 x i16> @trunc_and_const_v8i32_v8i16(<8 x i32> %a0) nounwind { 3277; SSE-LABEL: trunc_and_const_v8i32_v8i16: 3278; SSE: # %bb.0: 3279; SSE-NEXT: pslld $16, %xmm1 3280; SSE-NEXT: psrad $16, %xmm1 3281; SSE-NEXT: pslld $16, %xmm0 3282; SSE-NEXT: psrad $16, %xmm0 3283; SSE-NEXT: packssdw %xmm1, %xmm0 3284; SSE-NEXT: pand {{.*}}(%rip), %xmm0 3285; SSE-NEXT: retq 3286; 3287; AVX1-LABEL: trunc_and_const_v8i32_v8i16: 3288; AVX1: # %bb.0: 3289; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3290; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> 3291; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 3292; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 3293; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3294; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 3295; AVX1-NEXT: vzeroupper 3296; AVX1-NEXT: retq 3297; 3298; AVX2-LABEL: trunc_and_const_v8i32_v8i16: 3299; AVX2: # %bb.0: 3300; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 3301; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3302; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 3303; AVX2-NEXT: vzeroupper 3304; AVX2-NEXT: retq 3305; 3306; AVX512-LABEL: trunc_and_const_v8i32_v8i16: 3307; AVX512: # %bb.0: 3308; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 3309; AVX512-NEXT: vpmovdw %zmm0, %ymm0 3310; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 3311; AVX512-NEXT: vzeroupper 3312; AVX512-NEXT: retq 3313 %1 = and <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 3314 %2 = trunc <8 x i32> %1 to <8 x i16> 3315 ret <8 x i16> %2 3316} 3317 3318define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind { 3319; SSE-LABEL: trunc_and_const_v16i64_v16i8: 3320; SSE: # %bb.0: 3321; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 3322; SSE-NEXT: pand %xmm8, %xmm7 3323; SSE-NEXT: pand %xmm8, %xmm6 3324; SSE-NEXT: packuswb %xmm7, %xmm6 3325; SSE-NEXT: pand %xmm8, %xmm5 3326; SSE-NEXT: pand %xmm8, %xmm4 3327; SSE-NEXT: packuswb %xmm5, %xmm4 3328; SSE-NEXT: packuswb %xmm6, %xmm4 3329; SSE-NEXT: pand %xmm8, %xmm3 3330; SSE-NEXT: pand %xmm8, %xmm2 3331; SSE-NEXT: packuswb %xmm3, %xmm2 3332; SSE-NEXT: pand %xmm8, %xmm1 3333; SSE-NEXT: pand %xmm8, %xmm0 3334; 
SSE-NEXT: packuswb %xmm1, %xmm0 3335; SSE-NEXT: packuswb %xmm2, %xmm0 3336; SSE-NEXT: packuswb %xmm4, %xmm0 3337; SSE-NEXT: pand {{.*}}(%rip), %xmm0 3338; SSE-NEXT: retq 3339; 3340; AVX1-LABEL: trunc_and_const_v16i64_v16i8: 3341; AVX1: # %bb.0: 3342; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] 3343; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 3344; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 3345; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 3346; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 3347; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 3348; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 3349; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 3350; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 3351; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 3352; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 3353; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 3354; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 3355; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 3356; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3357; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 3358; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 3359; AVX1-NEXT: vzeroupper 3360; AVX1-NEXT: retq 3361; 3362; AVX2-SLOW-LABEL: trunc_and_const_v16i64_v16i8: 3363; AVX2-SLOW: # %bb.0: 3364; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3] 3365; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 3366; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm4[0,2],ymm2[4,6],ymm4[4,6] 3367; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 3368; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 3369; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 3370; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] 3371; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 3372; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3] 3373; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 3374; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm5[0,2],ymm0[4,6],ymm5[4,6] 3375; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 3376; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3377; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 3378; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 3379; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 3380; AVX2-SLOW-NEXT: vzeroupper 3381; AVX2-SLOW-NEXT: retq 3382; 3383; AVX2-FAST-LABEL: trunc_and_const_v16i64_v16i8: 3384; AVX2-FAST: # %bb.0: 3385; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] 3386; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 3387; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 3388; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 3389; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 3390; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 3391; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 3392; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] 3393; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 3394; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 3395; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 3396; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 3397; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 3398; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3399; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 3400; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 3401; AVX2-FAST-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 3402; AVX2-FAST-NEXT: vzeroupper 3403; AVX2-FAST-NEXT: retq 3404; 3405; AVX512-LABEL: trunc_and_const_v16i64_v16i8: 3406; AVX512: # %bb.0: 3407; 
AVX512-NEXT: vpmovqb %zmm1, %xmm1 3408; AVX512-NEXT: vpmovqb %zmm0, %xmm0 3409; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3410; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 3411; AVX512-NEXT: vzeroupper 3412; AVX512-NEXT: retq 3413 %1 = and <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> 3414 %2 = trunc <16 x i64> %1 to <16 x i8> 3415 ret <16 x i8> %2 3416} 3417 3418define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind { 3419; SSE-LABEL: trunc_and_const_v16i32_v16i8: 3420; SSE: # %bb.0: 3421; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 3422; SSE-NEXT: pand %xmm4, %xmm3 3423; SSE-NEXT: pand %xmm4, %xmm2 3424; SSE-NEXT: packuswb %xmm3, %xmm2 3425; SSE-NEXT: pand %xmm4, %xmm1 3426; SSE-NEXT: pand %xmm4, %xmm0 3427; SSE-NEXT: packuswb %xmm1, %xmm0 3428; SSE-NEXT: packuswb %xmm2, %xmm0 3429; SSE-NEXT: pand {{.*}}(%rip), %xmm0 3430; SSE-NEXT: retq 3431; 3432; AVX1-LABEL: trunc_and_const_v16i32_v16i8: 3433; AVX1: # %bb.0: 3434; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] 3435; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 3436; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 3437; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 3438; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 3439; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 3440; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 3441; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3442; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 3443; AVX1-NEXT: vzeroupper 3444; AVX1-NEXT: retq 3445; 3446; AVX2-LABEL: trunc_and_const_v16i32_v16i8: 3447; AVX2: # %bb.0: 3448; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 3449; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 3450; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 3451; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] 3452; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 3453; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 3454; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3455; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 3456; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3457; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 3458; AVX2-NEXT: vzeroupper 3459; AVX2-NEXT: retq 3460; 3461; AVX512-LABEL: trunc_and_const_v16i32_v16i8: 3462; AVX512: # %bb.0: 3463; AVX512-NEXT: vpmovdb %zmm0, %xmm0 3464; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 3465; AVX512-NEXT: vzeroupper 3466; AVX512-NEXT: retq 3467 %1 = and <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 3468 %2 = trunc <16 x i32> %1 to <16 x i8> 3469 ret <16 x i8> %2 3470} 3471 3472define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind { 3473; SSE-LABEL: trunc_and_const_v16i16_v16i8: 3474; SSE: # %bb.0: 3475; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 3476; SSE-NEXT: pand %xmm2, %xmm1 3477; SSE-NEXT: pand %xmm2, %xmm0 3478; SSE-NEXT: packuswb %xmm1, %xmm0 3479; SSE-NEXT: pand {{.*}}(%rip), %xmm0 3480; SSE-NEXT: retq 3481; 3482; AVX1-LABEL: trunc_and_const_v16i16_v16i8: 3483; AVX1: # %bb.0: 3484; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 3485; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3486; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3487; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 3488; AVX1-NEXT: vzeroupper 3489; AVX1-NEXT: retq 3490; 3491; AVX2-LABEL: trunc_and_const_v16i16_v16i8: 3492; AVX2: # %bb.0: 3493; AVX2-NEXT: 
vpand {{.*}}(%rip), %ymm0, %ymm0 3494; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3495; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3496; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 3497; AVX2-NEXT: vzeroupper 3498; AVX2-NEXT: retq 3499; 3500; AVX512F-LABEL: trunc_and_const_v16i16_v16i8: 3501; AVX512F: # %bb.0: 3502; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 3503; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 3504; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 3505; AVX512F-NEXT: vzeroupper 3506; AVX512F-NEXT: retq 3507; 3508; AVX512BW-LABEL: trunc_and_const_v16i16_v16i8: 3509; AVX512BW: # %bb.0: 3510; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 3511; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 3512; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 3513; AVX512BW-NEXT: vzeroupper 3514; AVX512BW-NEXT: retq 3515; 3516; AVX512DQ-LABEL: trunc_and_const_v16i16_v16i8: 3517; AVX512DQ: # %bb.0: 3518; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 3519; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 3520; AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 3521; AVX512DQ-NEXT: vzeroupper 3522; AVX512DQ-NEXT: retq 3523 %1 = and <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> 3524 %2 = trunc <16 x i16> %1 to <16 x i8> 3525 ret <16 x i8> %2 3526} 3527 3528; 3529; xor 3530; 3531 3532define <4 x i32> @trunc_xor_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { 3533; SSE-LABEL: trunc_xor_v4i64_v4i32: 3534; SSE: # %bb.0: 3535; SSE-NEXT: xorps %xmm3, %xmm1 3536; SSE-NEXT: xorps %xmm2, %xmm0 3537; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3538; SSE-NEXT: retq 3539; 3540; AVX1-LABEL: trunc_xor_v4i64_v4i32: 3541; AVX1: # %bb.0: 3542; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 3543; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3544; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3545; AVX1-NEXT: vzeroupper 3546; AVX1-NEXT: retq 3547; 3548; AVX2-SLOW-LABEL: trunc_xor_v4i64_v4i32: 3549; AVX2-SLOW: # %bb.0: 3550; AVX2-SLOW-NEXT: vxorps %ymm1, %ymm0, %ymm0 3551; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 3552; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3553; AVX2-SLOW-NEXT: vzeroupper 3554; AVX2-SLOW-NEXT: retq 3555; 3556; AVX2-FAST-LABEL: trunc_xor_v4i64_v4i32: 3557; AVX2-FAST: # %bb.0: 3558; AVX2-FAST-NEXT: vxorps %ymm1, %ymm0, %ymm0 3559; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> 3560; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 3561; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3562; AVX2-FAST-NEXT: vzeroupper 3563; AVX2-FAST-NEXT: retq 3564; 3565; AVX512-LABEL: trunc_xor_v4i64_v4i32: 3566; AVX512: # %bb.0: 3567; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 3568; AVX512-NEXT: vpmovqd %zmm0, %ymm0 3569; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3570; AVX512-NEXT: vzeroupper 3571; AVX512-NEXT: retq 3572 %1 = xor <4 x i64> %a0, %a1 3573 %2 = trunc <4 x i64> %1 to <4 x i32> 3574 ret <4 x i32> %2 3575} 3576 3577define <8 x i16> @trunc_xor_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { 3578; SSE-LABEL: trunc_xor_v8i64_v8i16: 3579; SSE: # %bb.0: 3580; SSE-NEXT: pxor 
%xmm6, %xmm2 3581; SSE-NEXT: pxor %xmm7, %xmm3 3582; SSE-NEXT: pxor %xmm4, %xmm0 3583; SSE-NEXT: pxor %xmm5, %xmm1 3584; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 3585; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 3586; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 3587; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] 3588; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 3589; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 3590; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] 3591; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 3592; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 3593; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3594; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] 3595; SSE-NEXT: retq 3596; 3597; AVX1-LABEL: trunc_xor_v8i64_v8i16: 3598; AVX1: # %bb.0: 3599; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 3600; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 3601; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] 3602; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 3603; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 3604; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 3605; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 3606; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 3607; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 3608; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3609; AVX1-NEXT: vzeroupper 3610; AVX1-NEXT: retq 3611; 3612; AVX2-SLOW-LABEL: trunc_xor_v8i64_v8i16: 3613; AVX2-SLOW: # %bb.0: 3614; AVX2-SLOW-NEXT: vxorps %ymm3, %ymm1, %ymm1 3615; AVX2-SLOW-NEXT: vxorps %ymm2, %ymm0, %ymm0 3616; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] 3617; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 3618; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] 3619; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 3620; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3621; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3622; AVX2-SLOW-NEXT: vzeroupper 3623; AVX2-SLOW-NEXT: retq 3624; 3625; AVX2-FAST-LABEL: trunc_xor_v8i64_v8i16: 3626; AVX2-FAST: # %bb.0: 3627; AVX2-FAST-NEXT: vpxor %ymm3, %ymm1, %ymm1 3628; AVX2-FAST-NEXT: vpxor %ymm2, %ymm0, %ymm0 3629; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] 3630; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 3631; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 3632; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 3633; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 3634; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3635; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3636; AVX2-FAST-NEXT: vzeroupper 3637; AVX2-FAST-NEXT: retq 3638; 3639; AVX512-LABEL: trunc_xor_v8i64_v8i16: 3640; AVX512: # %bb.0: 3641; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 3642; AVX512-NEXT: vpmovqw %zmm0, %xmm0 3643; AVX512-NEXT: vzeroupper 3644; AVX512-NEXT: retq 3645 %1 = xor <8 x i64> %a0, %a1 3646 %2 = trunc <8 x i64> %1 to <8 x i16> 3647 ret <8 x i16> %2 3648} 3649 3650define <8 x i16> @trunc_xor_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { 3651; SSE-LABEL: trunc_xor_v8i32_v8i16: 3652; SSE: # %bb.0: 3653; SSE-NEXT: pxor %xmm2, %xmm0 3654; SSE-NEXT: pxor %xmm3, %xmm1 3655; SSE-NEXT: pslld $16, %xmm1 3656; SSE-NEXT: psrad $16, %xmm1 3657; SSE-NEXT: pslld $16, %xmm0 3658; SSE-NEXT: psrad $16, %xmm0 3659; SSE-NEXT: packssdw %xmm1, %xmm0 3660; SSE-NEXT: retq 3661; 3662; AVX1-LABEL: 
trunc_xor_v8i32_v8i16: 3663; AVX1: # %bb.0: 3664; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 3665; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3666; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> 3667; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 3668; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 3669; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3670; AVX1-NEXT: vzeroupper 3671; AVX1-NEXT: retq 3672; 3673; AVX2-LABEL: trunc_xor_v8i32_v8i16: 3674; AVX2: # %bb.0: 3675; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 3676; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 3677; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3678; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3679; AVX2-NEXT: vzeroupper 3680; AVX2-NEXT: retq 3681; 3682; AVX512-LABEL: trunc_xor_v8i32_v8i16: 3683; AVX512: # %bb.0: 3684; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 3685; AVX512-NEXT: vpmovdw %zmm0, %ymm0 3686; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3687; AVX512-NEXT: vzeroupper 3688; AVX512-NEXT: retq 3689 %1 = xor <8 x i32> %a0, %a1 3690 %2 = trunc <8 x i32> %1 to <8 x i16> 3691 ret <8 x i16> %2 3692} 3693 3694define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { 3695; SSE-LABEL: trunc_xor_v16i64_v16i8: 3696; SSE: # %bb.0: 3697; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm0 3698; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm1 3699; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm2 3700; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm3 3701; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm4 3702; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm5 3703; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm6 3704; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm7 3705; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 3706; SSE-NEXT: pand %xmm8, %xmm7 3707; SSE-NEXT: pand %xmm8, %xmm6 3708; SSE-NEXT: packuswb %xmm7, %xmm6 3709; SSE-NEXT: pand %xmm8, %xmm5 3710; SSE-NEXT: pand %xmm8, %xmm4 3711; SSE-NEXT: packuswb %xmm5, %xmm4 3712; SSE-NEXT: packuswb %xmm6, %xmm4 3713; SSE-NEXT: pand %xmm8, %xmm3 3714; SSE-NEXT: pand %xmm8, %xmm2 3715; SSE-NEXT: packuswb %xmm3, %xmm2 3716; SSE-NEXT: pand %xmm8, %xmm1 3717; SSE-NEXT: pand %xmm8, %xmm0 3718; SSE-NEXT: packuswb %xmm1, %xmm0 3719; SSE-NEXT: packuswb %xmm2, %xmm0 3720; SSE-NEXT: packuswb %xmm4, %xmm0 3721; SSE-NEXT: retq 3722; 3723; AVX1-LABEL: trunc_xor_v16i64_v16i8: 3724; AVX1: # %bb.0: 3725; AVX1-NEXT: vxorps %ymm4, %ymm0, %ymm0 3726; AVX1-NEXT: vxorps %ymm5, %ymm1, %ymm1 3727; AVX1-NEXT: vxorps %ymm6, %ymm2, %ymm2 3728; AVX1-NEXT: vxorps %ymm7, %ymm3, %ymm3 3729; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] 3730; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 3731; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 3732; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 3733; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 3734; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 3735; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 3736; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 3737; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 3738; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 3739; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 3740; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 3741; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 3742; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 3743; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3744; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 3745; AVX1-NEXT: vzeroupper 3746; AVX1-NEXT: retq 3747; 3748; AVX2-SLOW-LABEL: trunc_xor_v16i64_v16i8: 3749; AVX2-SLOW: # %bb.0: 3750; AVX2-SLOW-NEXT: vxorps %ymm5, %ymm1, %ymm1 3751; AVX2-SLOW-NEXT: vxorps %ymm4, %ymm0, 
%ymm0 3752; AVX2-SLOW-NEXT: vxorps %ymm7, %ymm3, %ymm3 3753; AVX2-SLOW-NEXT: vxorps %ymm6, %ymm2, %ymm2 3754; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3] 3755; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 3756; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm4[0,2],ymm2[4,6],ymm4[4,6] 3757; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 3758; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 3759; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 3760; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] 3761; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 3762; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3] 3763; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 3764; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm5[0,2],ymm0[4,6],ymm5[4,6] 3765; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 3766; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3767; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 3768; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 3769; AVX2-SLOW-NEXT: vzeroupper 3770; AVX2-SLOW-NEXT: retq 3771; 3772; AVX2-FAST-LABEL: trunc_xor_v16i64_v16i8: 3773; AVX2-FAST: # %bb.0: 3774; AVX2-FAST-NEXT: vpxor %ymm5, %ymm1, %ymm1 3775; AVX2-FAST-NEXT: vpxor %ymm4, %ymm0, %ymm0 3776; AVX2-FAST-NEXT: vpxor %ymm7, %ymm3, %ymm3 3777; AVX2-FAST-NEXT: vpxor %ymm6, %ymm2, %ymm2 3778; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] 3779; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 3780; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 3781; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 3782; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 3783; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 3784; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 3785; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] 3786; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 3787; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 3788; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 3789; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 3790; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 3791; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3792; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 3793; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 3794; AVX2-FAST-NEXT: vzeroupper 3795; AVX2-FAST-NEXT: retq 3796; 3797; AVX512-LABEL: trunc_xor_v16i64_v16i8: 3798; AVX512: # %bb.0: 3799; AVX512-NEXT: vpxorq %zmm2, %zmm0, %zmm0 3800; AVX512-NEXT: vpxorq %zmm3, %zmm1, %zmm1 3801; AVX512-NEXT: vpmovqb %zmm1, %xmm1 3802; AVX512-NEXT: vpmovqb %zmm0, %xmm0 3803; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3804; AVX512-NEXT: vzeroupper 3805; AVX512-NEXT: retq 3806 %1 = xor <16 x i64> %a0, %a1 3807 %2 = trunc <16 x i64> %1 to <16 x i8> 3808 ret <16 x i8> %2 3809} 3810 3811define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { 3812; SSE-LABEL: trunc_xor_v16i32_v16i8: 3813; SSE: # %bb.0: 3814; SSE-NEXT: pxor %xmm4, %xmm0 3815; SSE-NEXT: pxor %xmm5, %xmm1 3816; SSE-NEXT: pxor %xmm6, %xmm2 3817; SSE-NEXT: pxor %xmm7, %xmm3 3818; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 3819; SSE-NEXT: pand %xmm4, %xmm3 3820; SSE-NEXT: pand %xmm4, %xmm2 3821; SSE-NEXT: packuswb %xmm3, %xmm2 3822; SSE-NEXT: pand %xmm4, %xmm1 3823; SSE-NEXT: pand %xmm4, %xmm0 3824; SSE-NEXT: packuswb %xmm1, %xmm0 3825; SSE-NEXT: packuswb %xmm2, %xmm0 3826; 
SSE-NEXT: retq 3827; 3828; AVX1-LABEL: trunc_xor_v16i32_v16i8: 3829; AVX1: # %bb.0: 3830; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 3831; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 3832; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] 3833; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 3834; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 3835; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 3836; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 3837; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 3838; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 3839; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3840; AVX1-NEXT: vzeroupper 3841; AVX1-NEXT: retq 3842; 3843; AVX2-LABEL: trunc_xor_v16i32_v16i8: 3844; AVX2: # %bb.0: 3845; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 3846; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 3847; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 3848; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 3849; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 3850; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] 3851; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 3852; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 3853; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3854; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 3855; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3856; AVX2-NEXT: vzeroupper 3857; AVX2-NEXT: retq 3858; 3859; AVX512-LABEL: trunc_xor_v16i32_v16i8: 3860; AVX512: # %bb.0: 3861; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 3862; AVX512-NEXT: vpmovdb %zmm0, %xmm0 3863; AVX512-NEXT: vzeroupper 3864; AVX512-NEXT: retq 3865 %1 = xor <16 x i32> %a0, %a1 3866 %2 = trunc <16 x i32> %1 to <16 x i8> 3867 ret <16 x i8> %2 3868} 3869 3870define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind { 3871; SSE-LABEL: trunc_xor_v16i16_v16i8: 3872; SSE: # %bb.0: 3873; SSE-NEXT: pxor %xmm2, %xmm0 3874; SSE-NEXT: pxor %xmm3, %xmm1 3875; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 3876; SSE-NEXT: pand %xmm2, %xmm1 3877; SSE-NEXT: pand %xmm2, %xmm0 3878; SSE-NEXT: packuswb %xmm1, %xmm0 3879; SSE-NEXT: retq 3880; 3881; AVX1-LABEL: trunc_xor_v16i16_v16i8: 3882; AVX1: # %bb.0: 3883; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 3884; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 3885; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3886; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3887; AVX1-NEXT: vzeroupper 3888; AVX1-NEXT: retq 3889; 3890; AVX2-LABEL: trunc_xor_v16i16_v16i8: 3891; AVX2: # %bb.0: 3892; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 3893; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 3894; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3895; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3896; AVX2-NEXT: vzeroupper 3897; AVX2-NEXT: retq 3898; 3899; AVX512F-LABEL: trunc_xor_v16i16_v16i8: 3900; AVX512F: # %bb.0: 3901; AVX512F-NEXT: vpxor %ymm1, %ymm0, %ymm0 3902; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 3903; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 3904; AVX512F-NEXT: vzeroupper 3905; AVX512F-NEXT: retq 3906; 3907; AVX512BW-LABEL: trunc_xor_v16i16_v16i8: 3908; AVX512BW: # %bb.0: 3909; AVX512BW-NEXT: vpxor %ymm1, %ymm0, %ymm0 3910; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 3911; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3912; AVX512BW-NEXT: vzeroupper 3913; AVX512BW-NEXT: retq 3914; 3915; AVX512DQ-LABEL: trunc_xor_v16i16_v16i8: 
3916; AVX512DQ: # %bb.0: 3917; AVX512DQ-NEXT: vpxor %ymm1, %ymm0, %ymm0 3918; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 3919; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 3920; AVX512DQ-NEXT: vzeroupper 3921; AVX512DQ-NEXT: retq 3922 %1 = xor <16 x i16> %a0, %a1 3923 %2 = trunc <16 x i16> %1 to <16 x i8> 3924 ret <16 x i8> %2 3925} 3926 3927; 3928; xor to constant 3929; 3930 3931define <4 x i32> @trunc_xor_const_v4i64_v4i32(<4 x i64> %a0) nounwind { 3932; SSE-LABEL: trunc_xor_const_v4i64_v4i32: 3933; SSE: # %bb.0: 3934; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3935; SSE-NEXT: xorps {{.*}}(%rip), %xmm0 3936; SSE-NEXT: retq 3937; 3938; AVX1-LABEL: trunc_xor_const_v4i64_v4i32: 3939; AVX1: # %bb.0: 3940; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3941; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3942; AVX1-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0 3943; AVX1-NEXT: vzeroupper 3944; AVX1-NEXT: retq 3945; 3946; AVX2-SLOW-LABEL: trunc_xor_const_v4i64_v4i32: 3947; AVX2-SLOW: # %bb.0: 3948; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 3949; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3950; AVX2-SLOW-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0 3951; AVX2-SLOW-NEXT: vzeroupper 3952; AVX2-SLOW-NEXT: retq 3953; 3954; AVX2-FAST-LABEL: trunc_xor_const_v4i64_v4i32: 3955; AVX2-FAST: # %bb.0: 3956; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> 3957; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 3958; AVX2-FAST-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0 3959; AVX2-FAST-NEXT: vzeroupper 3960; AVX2-FAST-NEXT: retq 3961; 3962; AVX512-LABEL: trunc_xor_const_v4i64_v4i32: 3963; AVX512: # %bb.0: 3964; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 3965; AVX512-NEXT: vpmovqd %zmm0, %ymm0 3966; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 3967; AVX512-NEXT: vzeroupper 3968; AVX512-NEXT: retq 3969 %1 = xor <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3> 3970 %2 = trunc <4 x i64> %1 to <4 x i32> 3971 ret <4 x i32> %2 3972} 3973 3974define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind { 3975; SSE-LABEL: trunc_xor_const_v8i64_v8i16: 3976; SSE: # %bb.0: 3977; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 3978; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 3979; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 3980; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] 3981; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 3982; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 3983; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] 3984; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 3985; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 3986; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3987; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] 3988; SSE-NEXT: xorpd {{.*}}(%rip), %xmm0 3989; SSE-NEXT: retq 3990; 3991; AVX1-LABEL: trunc_xor_const_v8i64_v8i16: 3992; AVX1: # %bb.0: 3993; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] 3994; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 3995; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 3996; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 3997; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 3998; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 3999; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 4000; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 4001; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, 
%xmm0 4002; AVX1-NEXT: vzeroupper 4003; AVX1-NEXT: retq 4004; 4005; AVX2-SLOW-LABEL: trunc_xor_const_v8i64_v8i16: 4006; AVX2-SLOW: # %bb.0: 4007; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] 4008; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 4009; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] 4010; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 4011; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 4012; AVX2-SLOW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 4013; AVX2-SLOW-NEXT: vzeroupper 4014; AVX2-SLOW-NEXT: retq 4015; 4016; AVX2-FAST-LABEL: trunc_xor_const_v8i64_v8i16: 4017; AVX2-FAST: # %bb.0: 4018; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] 4019; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 4020; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 4021; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 4022; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 4023; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 4024; AVX2-FAST-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 4025; AVX2-FAST-NEXT: vzeroupper 4026; AVX2-FAST-NEXT: retq 4027; 4028; AVX512-LABEL: trunc_xor_const_v8i64_v8i16: 4029; AVX512: # %bb.0: 4030; AVX512-NEXT: vpmovqw %zmm0, %xmm0 4031; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 4032; AVX512-NEXT: vzeroupper 4033; AVX512-NEXT: retq 4034 %1 = xor <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> 4035 %2 = trunc <8 x i64> %1 to <8 x i16> 4036 ret <8 x i16> %2 4037} 4038 4039define <8 x i16> @trunc_xor_const_v8i32_v8i16(<8 x i32> %a0) nounwind { 4040; SSE-LABEL: trunc_xor_const_v8i32_v8i16: 4041; SSE: # %bb.0: 4042; SSE-NEXT: pslld $16, %xmm1 4043; SSE-NEXT: psrad $16, %xmm1 4044; SSE-NEXT: pslld $16, %xmm0 4045; SSE-NEXT: psrad $16, %xmm0 4046; SSE-NEXT: packssdw %xmm1, %xmm0 4047; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 4048; SSE-NEXT: retq 4049; 4050; AVX1-LABEL: trunc_xor_const_v8i32_v8i16: 4051; AVX1: # %bb.0: 4052; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 4053; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> 4054; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 4055; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 4056; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 4057; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 4058; AVX1-NEXT: vzeroupper 4059; AVX1-NEXT: retq 4060; 4061; AVX2-LABEL: trunc_xor_const_v8i32_v8i16: 4062; AVX2: # %bb.0: 4063; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 4064; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 4065; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 4066; AVX2-NEXT: vzeroupper 4067; AVX2-NEXT: retq 4068; 4069; AVX512-LABEL: trunc_xor_const_v8i32_v8i16: 4070; AVX512: # %bb.0: 4071; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 4072; AVX512-NEXT: vpmovdw %zmm0, %ymm0 4073; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 4074; AVX512-NEXT: vzeroupper 4075; AVX512-NEXT: retq 4076 %1 = xor <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 4077 %2 = trunc <8 x i32> %1 to <8 x i16> 4078 ret <8 x i16> %2 4079} 4080 4081define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind { 4082; SSE-LABEL: trunc_xor_const_v16i64_v16i8: 4083; SSE: # %bb.0: 4084; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 4085; SSE-NEXT: pand %xmm8, %xmm7 4086; SSE-NEXT: pand %xmm8, %xmm6 4087; 
; SSE-NEXT: packuswb %xmm7, %xmm6
; SSE-NEXT: pand %xmm8, %xmm5
; SSE-NEXT: pand %xmm8, %xmm4
; SSE-NEXT: packuswb %xmm5, %xmm4
; SSE-NEXT: packuswb %xmm6, %xmm4
; SSE-NEXT: pand %xmm8, %xmm3
; SSE-NEXT: pand %xmm8, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pand %xmm8, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm4, %xmm0
; SSE-NEXT: pxor {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_xor_const_v16i64_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_xor_const_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm4[0,2],ymm2[4,6],ymm4[4,6]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm5[0,2],ymm0[4,6],ymm5[4,6]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_xor_const_v16i64_v16i8:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0
; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_xor_const_v16i64_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovqb %zmm1, %xmm1
; AVX512-NEXT: vpmovqb %zmm0, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = xor <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-LABEL: trunc_xor_const_v16i32_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: pxor {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_xor_const_v16i32_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_xor_const_v16i32_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_xor_const_v16i32_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = xor <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-LABEL: trunc_xor_const_v16i16_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: pxor {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_xor_const_v16i16_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_xor_const_v16i16_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_xor_const_v16i16_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_xor_const_v16i16_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_xor_const_v16i16_v16i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
  %1 = xor <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

;
; or
;

define <4 x i32> @trunc_or_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE-LABEL: trunc_or_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: orps %xmm3, %xmm1
; SSE-NEXT: orps %xmm2, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_v4i64_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_or_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_or_v4i64_v4i32:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_or_v4i64_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = or <4 x i64> %a0, %a1
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_or_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE-LABEL: trunc_or_v8i64_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: por %xmm6, %xmm2
; SSE-NEXT: por %xmm7, %xmm3
; SSE-NEXT: por %xmm4, %xmm0
; SSE-NEXT: por %xmm5, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_v8i64_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_or_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vorps %ymm3, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vorps %ymm2, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_or_v8i64_v8i16:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpor %ymm3, %ymm1, %ymm1
; AVX2-FAST-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_or_v8i64_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = or <8 x i64> %a0, %a1
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_or_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-LABEL: trunc_or_v8i32_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: por %xmm3, %xmm1
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_v8i32_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_v8i32_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_v8i32_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = or <8 x i32> %a0, %a1
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
; SSE-LABEL: trunc_or_v16i64_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
; SSE-NEXT: packuswb %xmm7, %xmm6
; SSE-NEXT: pand %xmm8, %xmm5
; SSE-NEXT: pand %xmm8, %xmm4
; SSE-NEXT: packuswb %xmm5, %xmm4
; SSE-NEXT: packuswb %xmm6, %xmm4
; SSE-NEXT: pand %xmm8, %xmm3
; SSE-NEXT: pand %xmm8, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pand %xmm8, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_v16i64_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vorps %ymm4, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm5, %ymm1, %ymm1
; AVX1-NEXT: vorps %ymm6, %ymm2, %ymm2
; AVX1-NEXT: vorps %ymm7, %ymm3, %ymm3
; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_or_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vorps %ymm5, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vorps %ymm4, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vorps %ymm7, %ymm3, %ymm3
; AVX2-SLOW-NEXT: vorps %ymm6, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm4[0,2],ymm2[4,6],ymm4[4,6]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm5[0,2],ymm0[4,6],ymm5[4,6]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_or_v16i64_v16i8:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpor %ymm5, %ymm1, %ymm1
; AVX2-FAST-NEXT: vpor %ymm4, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpor %ymm7, %ymm3, %ymm3
; AVX2-FAST-NEXT: vpor %ymm6, %ymm2, %ymm2
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0
; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_or_v16i64_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vporq %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vporq %zmm3, %zmm1, %zmm1
; AVX512-NEXT: vpmovqb %zmm1, %xmm1
; AVX512-NEXT: vpmovqb %zmm0, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = or <16 x i64> %a0, %a1
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE-LABEL: trunc_or_v16i32_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: por %xmm4, %xmm0
; SSE-NEXT: por %xmm5, %xmm1
; SSE-NEXT: por %xmm6, %xmm2
; SSE-NEXT: por %xmm7, %xmm3
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_v16i32_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_v16i32_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_v16i32_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = or <16 x i32> %a0, %a1
  %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: trunc_or_v16i16_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: por %xmm3, %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_v16i16_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_v16i16_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_or_v16i16_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_or_v16i16_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_or_v16i16_v16i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
  %1 = or <16 x i16> %a0, %a1
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

;
; or to constant
;

define <4 x i32> @trunc_or_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: orps {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v4i64_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_or_const_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_or_const_v4i64_v4i32:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_or_const_v4i64_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = or <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v8i64_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE-NEXT: orpd {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v8i64_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_or_const_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_or_const_v8i64_v8i16:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-FAST-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_or_const_v8i64_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = or <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_or_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v8i32_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: por {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v8i32_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_const_v8i32_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_const_v8i32_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = or <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v16i64_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
; SSE-NEXT: packuswb %xmm7, %xmm6
; SSE-NEXT: pand %xmm8, %xmm5
; SSE-NEXT: pand %xmm8, %xmm4
; SSE-NEXT: packuswb %xmm5, %xmm4
; SSE-NEXT: packuswb %xmm6, %xmm4
; SSE-NEXT: pand %xmm8, %xmm3
; SSE-NEXT: pand %xmm8, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pand %xmm8, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm4, %xmm0
; SSE-NEXT: por {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v16i64_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_or_const_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm4[0,2],ymm2[4,6],ymm4[4,6]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm5[0,2],ymm0[4,6],ymm5[4,6]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_or_const_v16i64_v16i8:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0
; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_or_const_v16i64_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovqb %zmm1, %xmm1
; AVX512-NEXT: vpmovqb %zmm0, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = or <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v16i32_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: por {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v16i32_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_const_v16i32_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_const_v16i32_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = or <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v16i16_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: por {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v16i16_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_const_v16i16_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_or_const_v16i16_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_or_const_v16i16_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_or_const_v16i16_v16i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
  %1 = or <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

;
; complex patterns - often created by vectorizer
;

define <4 x i32> @mul_add_const_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE-LABEL: mul_add_const_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
; SSE-NEXT: pmuludq %xmm2, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,3,3]
; SSE-NEXT: pmuludq %xmm3, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: paddd {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mul_add_const_v4i64_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = sext <4 x i32> %a0 to <4 x i64>
  %2 = sext <4 x i32> %a1 to <4 x i64>
  %3 = mul <4 x i64> %1, %2
  %4 = add <4 x i64> %3, <i64 -3, i64 -1, i64 1, i64 3>
  %5 = trunc <4 x i64> %4 to <4 x i32>
  ret <4 x i32> %5
}

define <4 x i32> @mul_add_self_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE-LABEL: mul_add_self_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
; SSE-NEXT: pmuludq %xmm2, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,3,3]
; SSE-NEXT: pmuludq %xmm3, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: paddd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mul_add_self_v4i64_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = sext <4 x i32> %a0 to <4 x i64>
  %2 = sext <4 x i32> %a1 to <4 x i64>
  %3 = mul <4 x i64> %1, %2
  %4 = add <4 x i64> %3, %3
  %5 = trunc <4 x i64> %4 to <4 x i32>
  ret <4 x i32> %5
}

define <4 x i32> @mul_add_multiuse_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE-LABEL: mul_add_multiuse_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,1,1,3]
; SSE-NEXT: pmuludq %xmm2, %xmm4
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,3,3]
; SSE-NEXT: pmuludq %xmm3, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[0,2]
; SSE-NEXT: paddd %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mul_add_multiuse_v4i64_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm1
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = sext <4 x i32> %a0 to <4 x i64>
  %2 = sext <4 x i32> %a1 to <4 x i64>
  %3 = mul <4 x i64> %1, %2
  %4 = add <4 x i64> %1, %3
  %5 = trunc <4 x i64> %4 to <4 x i32>
  ret <4 x i32> %5
}
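
; A minimal, hypothetical C sketch (an assumption for illustration, not taken from this
; test or from the vectorizer itself) of the kind of scalar loop that the "complex
; patterns" functions above correspond to: the 32-bit inputs are widened to 64 bits for
; the multiply/add and the result is truncated back to 32 bits on store, matching the
; sext + mul + add + trunc IR in mul_add_const_v4i64_v4i32.
;
;   // hypothetical source shape; names a, b, k, d are illustrative only
;   void mul_add_const(const int *a, const int *b, const long long *k, int *d) {
;     for (int i = 0; i < 4; ++i)
;       d[i] = (int)((long long)a[i] * b[i] + k[i]);
;   }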