; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW

; trunc(concat(x,y)) -> pack

define <16 x i16> @trunc_concat_packssdw_256(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; AVX1-LABEL: trunc_concat_packssdw_256:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrad $17, %xmm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsrad $17, %xmm0, %xmm0
; AVX1-NEXT:    vpsrad $23, %xmm1, %xmm3
; AVX1-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpsrad $23, %xmm1, %xmm1
; AVX1-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_concat_packssdw_256:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrad $17, %ymm0, %ymm0
; AVX2-NEXT:    vpsrad $23, %ymm1, %ymm1
; AVX2-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_concat_packssdw_256:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrad $17, %ymm0, %ymm0
; AVX512-NEXT:    vpsrad $23, %ymm1, %ymm1
; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512-NEXT:    retq
  %1 = ashr <8 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
  %2 = ashr <8 x i32> %a1, <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23>
  %3 = shufflevector <8 x i32> %1, <8 x i32> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %4 = trunc <16 x i32> %3 to <16 x i16>
  ret <16 x i16> %4
}

define <16 x i16> @trunc_concat_packusdw_256(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; AVX1-LABEL: trunc_concat_packusdw_256:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrld $17, %xmm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsrld $17, %xmm0, %xmm0
; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_concat_packusdw_256:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrld $17, %ymm0, %ymm0
; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_concat_packusdw_256:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrld $17, %ymm0, %ymm0
; AVX512-NEXT:    vpandd {{.*}}(%rip){1to8}, %ymm1, %ymm1
; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512-NEXT:    retq
  %1 = lshr <8 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
  %2 = and <8 x i32> %a1, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  %3 = shufflevector <8 x i32> %1, <8 x i32> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %4 = trunc <16 x i32> %3 to <16 x i16>
  ret <16 x i16> %4
}

define <32 x i8> @trunc_concat_packsswb_256(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; AVX1-LABEL: trunc_concat_packsswb_256:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsraw $15, %xmm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsraw $15, %xmm0, %xmm0
; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpacksswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpacksswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_concat_packsswb_256:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsraw $15, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_concat_packsswb_256:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsraw $15, %ymm0, %ymm0
; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: trunc_concat_packsswb_256:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsraw $15, %ymm0, %ymm0
; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    retq
  %1 = ashr <16 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %2 = and <16 x i16> %a1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %3 = shufflevector <16 x i16> %1, <16 x i16> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %4 = trunc <32 x i16> %3 to <32 x i8>
  ret <32 x i8> %4
}

define <32 x i8> @trunc_concat_packuswb_256(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; AVX1-LABEL: trunc_concat_packuswb_256:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrlw $15, %xmm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsrlw $15, %xmm0, %xmm0
; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_concat_packuswb_256:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $15, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_concat_packuswb_256:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsrlw $15, %ymm0, %ymm0
; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: trunc_concat_packuswb_256:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlw $15, %ymm0, %ymm0
; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    retq
  %1 = lshr <16 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %2 = and <16 x i16> %a1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %3 = shufflevector <16 x i16> %1, <16 x i16> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %4 = trunc <32 x i16> %3 to <32 x i8>
  ret <32 x i8> %4
}

; concat(trunc(x),trunc(y)) -> pack


define <16 x i16> @concat_trunc_packssdw_256(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; AVX1-LABEL: concat_trunc_packssdw_256:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrad $17, %xmm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsrad $17, %xmm0, %xmm0
; AVX1-NEXT:    vpsrad $23, %xmm1, %xmm3
; AVX1-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpsrad $23, %xmm1, %xmm1
; AVX1-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: concat_trunc_packssdw_256:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrad $17, %ymm0, %ymm0
; AVX2-NEXT:    vpsrad $23, %ymm1, %ymm1
; AVX2-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: concat_trunc_packssdw_256:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrad $17, %ymm0, %ymm0
; AVX512-NEXT:    vpsrad $23, %ymm1, %ymm1
; AVX512-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512-NEXT:    vpmovdw %ymm1, %xmm1
; AVX512-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1]
; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %1 = ashr <8 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
  %2 = ashr <8 x i32> %a1, <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23>
  %3 = trunc <8 x i32> %1 to <8 x i16>
  %4 = trunc <8 x i32> %2 to <8 x i16>
  %5 = shufflevector <8 x i16> %3, <8 x i16> %4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i16> %5
}

define <16 x i16> @concat_trunc_packusdw_256(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; AVX1-LABEL: concat_trunc_packusdw_256:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsrld $17, %xmm2, %xmm2
; AVX1-NEXT:    vpsrld $17, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1]
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: concat_trunc_packusdw_256:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrld $17, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX2-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1]
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: concat_trunc_packusdw_256:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrld $17, %ymm0, %ymm0
; AVX512-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512-NEXT:    vpmovdw %ymm1, %xmm1
; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1]
; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %1 = lshr <8 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
  %2 = and <8 x i32> %a1, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  %3 = trunc <8 x i32> %1 to <8 x i16>
  %4 = trunc <8 x i32> %2 to <8 x i16>
  %5 = shufflevector <8 x i16> %3, <8 x i16> %4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i16> %5
}

define <32 x i8> @concat_trunc_packsswb_256(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; AVX1-LABEL: concat_trunc_packsswb_256:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsraw $15, %xmm2, %xmm2
; AVX1-NEXT:    vpsraw $15, %xmm0, %xmm0
; AVX1-NEXT:    vpacksswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1]
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: concat_trunc_packsswb_256:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsraw $15, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT:    vpacksswb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX2-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1]
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: concat_trunc_packsswb_256:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsraw $15, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1]
; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: concat_trunc_packsswb_256:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsraw $15, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %ymm0, %xmm0
; AVX512BW-NEXT:    vpmovwb %ymm1, %xmm1
; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1]
; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX512BW-NEXT:    retq
  %1 = ashr <16 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %2 = and <16 x i16> %a1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %3 = trunc <16 x i16> %1 to <16 x i8>
  %4 = trunc <16 x i16> %2 to <16 x i8>
  %5 = shufflevector <16 x i8> %3, <16 x i8> %4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <32 x i8> %5
}

define <32 x i8> @concat_trunc_packuswb_256(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; AVX1-LABEL: concat_trunc_packuswb_256:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsrlw $15, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $15, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1]
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: concat_trunc_packuswb_256:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $15, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX2-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1]
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: concat_trunc_packuswb_256:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsrlw $15, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1]
; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: concat_trunc_packuswb_256:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlw $15, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %ymm0, %xmm0
; AVX512BW-NEXT:    vpmovwb %ymm1, %xmm1
; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1]
; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX512BW-NEXT:    retq
  %1 = lshr <16 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %2 = and <16 x i16> %a1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %3 = trunc <16 x i16> %1 to <16 x i8>
  %4 = trunc <16 x i16> %2 to <16 x i8>
  %5 = shufflevector <16 x i8> %3, <16 x i8> %4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <32 x i8> %5
}