; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL

define void @shuffle_v16i8_to_v8i8_1(<16 x i8>* %L, <8 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v8i8_1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    psrlw $8, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movq %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v8i8_1:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movq %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v8i8_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v16i8_to_v8i8_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vmovq %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @shuffle_v8i16_to_v4i16_1(<8 x i16>* %L, <4 x i16>* %S) nounwind {
; SSE2-LABEL: shuffle_v8i16_to_v4i16_1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = mem[3,1,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    movq %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v8i16_to_v4i16_1:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movq %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v8i16_to_v4i16_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v8i16_to_v4i16_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vmovq %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %L
  %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

define void @shuffle_v4i32_to_v2i32_1(<4 x i32>* %L, <2 x i32>* %S) nounwind {
; SSE-LABEL: shuffle_v4i32_to_v2i32_1:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[1,3,2,3]
; SSE-NEXT:    movq %xmm0, (%rsi)
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4i32_to_v2i32_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[1,3,2,3]
; AVX-NEXT:    vmovlps %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v4i32_to_v2i32_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = mem[1,3,2,3]
; AVX512-NEXT:    vmovlps %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <4 x i32>, <4 x i32>* %L
  %strided.vec = shufflevector <4 x i32> %vec, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
  store <2 x i32> %strided.vec, <2 x i32>* %S
  ret void
}

define void @shuffle_v16i8_to_v4i8_1(<16 x i8>* %L, <4 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v4i8_1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v4i8_1:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movd %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v4i8_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v16i8_to_v4i8_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vmovd %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @shuffle_v16i8_to_v4i8_2(<16 x i8>* %L, <4 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v4i8_2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v4i8_2:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movd %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v4i8_2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v16i8_to_v4i8_2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vmovd %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @shuffle_v16i8_to_v4i8_3(<16 x i8>* %L, <4 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v4i8_3:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v4i8_3:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movd %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v4i8_3:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v16i8_to_v4i8_3:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vmovd %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @shuffle_v8i16_to_v2i16_1(<8 x i16>* %L, <2 x i16>* %S) nounwind {
; SSE-LABEL: shuffle_v8i16_to_v2i16_1:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; SSE-NEXT:    movd %xmm0, (%rsi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT:    vmovd %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %L
  %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 1, i32 5>
  store <2 x i16> %strided.vec, <2 x i16>* %S
  ret void
}

define void @shuffle_v8i16_to_v2i16_2(<8 x i16>* %L, <2 x i16>* %S) nounwind {
; SSE-LABEL: shuffle_v8i16_to_v2i16_2:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; SSE-NEXT:    movd %xmm0, (%rsi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT:    vmovd %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %L
  %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 2, i32 6>
  store <2 x i16> %strided.vec, <2 x i16>* %S
  ret void
}

define void @shuffle_v8i16_to_v2i16_3(<8 x i16>* %L, <2 x i16>* %S) nounwind {
; SSE-LABEL: shuffle_v8i16_to_v2i16_3:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE-NEXT:    movd %xmm0, (%rsi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT:    vmovd %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %L
  %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 3, i32 7>
  store <2 x i16> %strided.vec, <2 x i16>* %S
  ret void
}

define void @shuffle_v16i8_to_v2i8_1(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_1:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v16i8_to_v2i8_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 1, i32 9>
  store <2 x i8> %strided.vec, <2 x i8>* %S
  ret void
}

define void @shuffle_v16i8_to_v2i8_2(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_2:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v16i8_to_v2i8_2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 2, i32 10>
  store <2 x i8> %strided.vec, <2 x i8>* %S
  ret void
}

define void @shuffle_v16i8_to_v2i8_3(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_3:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_3:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_3:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v16i8_to_v2i8_3:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 3, i32 11>
  store <2 x i8> %strided.vec, <2 x i8>* %S
  ret void
}

define void @shuffle_v16i8_to_v2i8_4(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_4:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_4:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_4:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v16i8_to_v2i8_4:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 4, i32 12>
  store <2 x i8> %strided.vec, <2 x i8>* %S
  ret void
}

define void @shuffle_v16i8_to_v2i8_5(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_5:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_5:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_5:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v16i8_to_v2i8_5:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 5, i32 13>
  store <2 x i8> %strided.vec, <2 x i8>* %S
  ret void
}

define void @shuffle_v16i8_to_v2i8_6(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_6:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_6:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_6:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v16i8_to_v2i8_6:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 6, i32 14>
  store <2 x i8> %strided.vec, <2 x i8>* %S
  ret void
}

define void @shuffle_v16i8_to_v2i8_7(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_7:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE2-NEXT:    psrlw $8, %xmm0
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_7:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_7:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v16i8_to_v2i8_7:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 7, i32 15>
  store <2 x i8> %strided.vec, <2 x i8>* %S
  ret void
}
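; Maintenance note: per the NOTE line at the top, the CHECK blocks above are
; autogenerated and should be refreshed with utils/update_llc_test_checks.py
; rather than edited by hand. A typical invocation, run from the llvm source
; directory (the test path below is a placeholder, not this file's actual
; name, and a built llc must be on PATH or passed via the script's llc-binary
; option):
;
;   utils/update_llc_test_checks.py test/CodeGen/X86/<this-test>.ll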