; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL

; Tests for lowering strided extractions (every 2nd, 4th, or 8th element,
; starting at a nonzero offset) from 256-bit vectors.

; Extract the odd bytes (stride 2, offset 1) of a <32 x i8>.
define void @shuffle_v32i8_to_v16i8_1(<32 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}

; Extract the odd words (stride 2, offset 1) of a <16 x i16>.
define void @shuffle_v16i16_to_v8i16_1(<16 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm0 = [1,3,5,7,33,35,37,39]
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm2
; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT:    vmovdqa %xmm1, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,3,5,7,9,11,13,15]
; AVX512BWVL-NEXT:    vpermi2w 16(%rdi), %xmm0, %xmm1
; AVX512BWVL-NEXT:    vmovdqa %xmm1, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}

; Extract the odd dwords (stride 2, offset 1) of a <8 x i32>.
define void @shuffle_v8i32_to_v4i32_1(<8 x i32>* %L, <4 x i32>* %S) nounwind {
; AVX-LABEL: shuffle_v8i32_to_v4i32_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3]
; AVX-NEXT:    vmovaps %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v8i32_to_v4i32_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovaps (%rdi), %xmm0
; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3]
; AVX512-NEXT:    vmovaps %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <8 x i32>, <8 x i32>* %L
  %strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  store <4 x i32> %strided.vec, <4 x i32>* %S
  ret void
}

; Extract every 4th byte of a <32 x i8>, starting at byte 1.
define void @shuffle_v32i8_to_v8i8_1(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512-NEXT:    vmovq %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

; Extract every 4th byte of a <32 x i8>, starting at byte 2.
define void @shuffle_v32i8_to_v8i8_2(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512-NEXT:    vmovq %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

; Extract every 4th byte of a <32 x i8>, starting at byte 3.
define void @shuffle_v32i8_to_v8i8_3(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512-NEXT:    vmovq %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

; Extract every 4th word of a <16 x i16>, starting at word 1.
define void @shuffle_v16i16_to_v4i16_1(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm0 = [1,5,33,37,4,5,36,37]
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm2
; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT:    vmovq %xmm1, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm1 = <1,5,9,13,u,u,u,u>
; AVX512BWVL-NEXT:    vpermi2w 16(%rdi), %xmm0, %xmm1
; AVX512BWVL-NEXT:    vmovq %xmm1, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

; Extract every 4th word of a <16 x i16>, starting at word 2.
define void @shuffle_v16i16_to_v4i16_2(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,6,34,38,2,3,34,35]
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm2
; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT:    vmovq %xmm1, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm1 = <2,6,10,14,u,u,u,u>
; AVX512BWVL-NEXT:    vpermi2w 16(%rdi), %xmm0, %xmm1
; AVX512BWVL-NEXT:    vmovq %xmm1, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

; Extract every 4th word of a <16 x i16>, starting at word 3.
define void @shuffle_v16i16_to_v4i16_3(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm0 = [3,7,35,39,2,3,34,35]
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm2
; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT:    vmovq %xmm1, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm1 = <3,7,11,15,u,u,u,u>
; AVX512BWVL-NEXT:    vpermi2w 16(%rdi), %xmm0, %xmm1
; AVX512BWVL-NEXT:    vmovq %xmm1, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

; Extract every 8th byte of a <32 x i8>, starting at byte 1.
define void @shuffle_v32i8_to_v4i8_1(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT:    vmovd %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 1, i32 9, i32 17, i32 25>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

; Extract every 8th byte of a <32 x i8>, starting at byte 2.
define void @shuffle_v32i8_to_v4i8_2(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT:    vmovd %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 2, i32 10, i32 18, i32 26>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

; Extract every 8th byte of a <32 x i8>, starting at byte 3.
define void @shuffle_v32i8_to_v4i8_3(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT:    vmovd %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 3, i32 11, i32 19, i32 27>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

; Extract every 8th byte of a <32 x i8>, starting at byte 4.
define void @shuffle_v32i8_to_v4i8_4(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT:    vmovd %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 4, i32 12, i32 20, i32 28>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

; Extract every 8th byte of a <32 x i8>, starting at byte 5.
define void @shuffle_v32i8_to_v4i8_5(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT:    vmovd %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 5, i32 13, i32 21, i32 29>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

; Extract every 8th byte of a <32 x i8>, starting at byte 6.
define void @shuffle_v32i8_to_v4i8_6(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT:    vmovd %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 6, i32 14, i32 22, i32 30>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

; Extract every 8th byte of a <32 x i8>, starting at byte 7.
define void @shuffle_v32i8_to_v4i8_7(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT:    vmovd %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 7, i32 15, i32 23, i32 31>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}