; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX2

;
; Unary shuffle indices from registers
;

define <4 x double> @var_shuffle_v4f64_v4f64_xxxx_i64(<4 x double> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
; ALL-LABEL: var_shuffle_v4f64_v4f64_xxxx_i64:
; ALL:       # %bb.0:
; ALL-NEXT:    pushq %rbp
; ALL-NEXT:    movq %rsp, %rbp
; ALL-NEXT:    andq $-32, %rsp
; ALL-NEXT:    subq $64, %rsp
; ALL-NEXT:    andl $3, %esi
; ALL-NEXT:    andl $3, %edi
; ALL-NEXT:    andl $3, %ecx
; ALL-NEXT:    andl $3, %edx
; ALL-NEXT:    vmovaps %ymm0, (%rsp)
; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; ALL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; ALL-NEXT:    vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; ALL-NEXT:    movq %rbp, %rsp
; ALL-NEXT:    popq %rbp
; ALL-NEXT:    retq
  %x0 = extractelement <4 x double> %x, i64 %i0
  %x1 = extractelement <4 x double> %x, i64 %i1
  %x2 = extractelement <4 x double> %x, i64 %i2
  %x3 = extractelement <4 x double> %x, i64 %i3
  %r0 = insertelement <4 x double> undef, double %x0, i32 0
  %r1 = insertelement <4 x double> %r0, double %x1, i32 1
  %r2 = insertelement <4 x double> %r1, double %x2, i32 2
  %r3 = insertelement <4 x double> %r2, double %x3, i32 3
  ret <4 x double> %r3
}

define <4 x double> @var_shuffle_v4f64_v4f64_uxx0_i64(<4 x double> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
; ALL-LABEL: var_shuffle_v4f64_v4f64_uxx0_i64:
; ALL:       # %bb.0:
; ALL-NEXT:    pushq %rbp
; ALL-NEXT:    movq %rsp, %rbp
; ALL-NEXT:    andq $-32, %rsp
; ALL-NEXT:    subq $64, %rsp
; ALL-NEXT:    andl $3, %edx
; ALL-NEXT:    andl $3, %esi
; ALL-NEXT:    vmovaps %ymm0, (%rsp)
; ALL-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; ALL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; ALL-NEXT:    movq %rbp, %rsp
; ALL-NEXT:    popq %rbp
; ALL-NEXT:    retq
  %x0 = extractelement <4 x double> %x, i64 %i0
  %x1 = extractelement <4 x double> %x, i64 %i1
  %x2 = extractelement <4 x double> %x, i64 %i2
  %x3 = extractelement <4 x double> %x, i64 %i3
  %r0 = insertelement <4 x double> undef, double undef, i32 0
  %r1 = insertelement <4 x double> %r0, double %x1, i32 1
  %r2 = insertelement <4 x double> %r1, double %x2, i32 2
  %r3 = insertelement <4 x double> %r2, double 0.0, i32 3
  ret <4 x double> %r3
}

define <4 x double> @var_shuffle_v4f64_v2f64_xxxx_i64(<2 x double> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
; ALL-LABEL: var_shuffle_v4f64_v2f64_xxxx_i64:
; ALL:       # %bb.0:
; ALL-NEXT:    andl $1, %esi
; ALL-NEXT:    andl $1, %edi
; ALL-NEXT:    andl $1, %ecx
; ALL-NEXT:    andl $1, %edx
; ALL-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; ALL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; ALL-NEXT:    vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; ALL-NEXT:    retq
  %x0 = extractelement <2 x double> %x, i64 %i0
  %x1 = extractelement <2 x double> %x, i64 %i1
  %x2 = extractelement <2 x double> %x, i64 %i2
  %x3 = extractelement <2 x double> %x, i64 %i3
  %r0 = insertelement <4 x double> undef, double %x0, i32 0
  %r1 = insertelement <4 x double> %r0, double %x1, i32 1
  %r2 = insertelement <4 x double> %r1, double %x2, i32 2
  %r3 = insertelement <4 x double> %r2, double %x3, i32 3
  ret <4 x double> %r3
}

define <4 x i64> @var_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
; ALL-LABEL: var_shuffle_v4i64_v4i64_xxxx_i64:
; ALL:       # %bb.0:
; ALL-NEXT:    pushq %rbp
; ALL-NEXT:    movq %rsp, %rbp
; ALL-NEXT:    andq $-32, %rsp
; ALL-NEXT:    subq $64, %rsp
; ALL-NEXT:    andl $3, %edi
; ALL-NEXT:    andl $3, %esi
; ALL-NEXT:    andl $3, %edx
; ALL-NEXT:    andl $3, %ecx
; ALL-NEXT:    vmovaps %ymm0, (%rsp)
; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; ALL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; ALL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; ALL-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; ALL-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; ALL-NEXT:    movq %rbp, %rsp
; ALL-NEXT:    popq %rbp
; ALL-NEXT:    retq
  %x0 = extractelement <4 x i64> %x, i64 %i0
  %x1 = extractelement <4 x i64> %x, i64 %i1
  %x2 = extractelement <4 x i64> %x, i64 %i2
  %x3 = extractelement <4 x i64> %x, i64 %i3
  %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
  %r1 = insertelement <4 x i64> %r0, i64 %x1, i32 1
  %r2 = insertelement <4 x i64> %r1, i64 %x2, i32 2
  %r3 = insertelement <4 x i64> %r2, i64 %x3, i32 3
  ret <4 x i64> %r3
}

define <4 x i64> @var_shuffle_v4i64_v4i64_xx00_i64(<4 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
; ALL-LABEL: var_shuffle_v4i64_v4i64_xx00_i64:
; ALL:       # %bb.0:
; ALL-NEXT:    pushq %rbp
; ALL-NEXT:    movq %rsp, %rbp
; ALL-NEXT:    andq $-32, %rsp
; ALL-NEXT:    subq $64, %rsp
; ALL-NEXT:    andl $3, %edi
; ALL-NEXT:    andl $3, %esi
; ALL-NEXT:    vmovaps %ymm0, (%rsp)
; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; ALL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; ALL-NEXT:    movq %rbp, %rsp
; ALL-NEXT:    popq %rbp
; ALL-NEXT:    retq
  %x0 = extractelement <4 x i64> %x, i64 %i0
  %x1 = extractelement <4 x i64> %x, i64 %i1
  %x2 = extractelement <4 x i64> %x, i64 %i2
  %x3 = extractelement <4 x i64> %x, i64 %i3
  %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
  %r1 = insertelement <4 x i64> %r0, i64 %x1, i32 1
  %r2 = insertelement <4 x i64> %r1, i64 0, i32 2
  %r3 = insertelement <4 x i64> %r2, i64 0, i32 3
  ret <4 x i64> %r3
}

define <4 x i64> @var_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
; ALL-LABEL: var_shuffle_v4i64_v2i64_xxxx_i64:
; ALL:       # %bb.0:
; ALL-NEXT:    andl $1, %edi
; ALL-NEXT:    andl $1, %esi
; ALL-NEXT:    andl $1, %edx
; ALL-NEXT:    andl $1, %ecx
; ALL-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; ALL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; ALL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; ALL-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; ALL-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; ALL-NEXT:    retq
  %x0 = extractelement <2 x i64> %x, i64 %i0
  %x1 = extractelement <2 x i64> %x, i64 %i1
  %x2 = extractelement <2 x i64> %x, i64 %i2
  %x3 = extractelement <2 x i64> %x, i64 %i3
  %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
  %r1 = insertelement <4 x i64> %r0, i64 %x1, i32 1
  %r2 = insertelement <4 x i64> %r1, i64 %x2, i32 2
  %r3 = insertelement <4 x i64> %r2, i64 %x3, i32 3
  ret <4 x i64> %r3
}

define <8 x float> @var_shuffle_v8f32_v8f32_xxxxxxxx_i32(<8 x float> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7) nounwind {
; ALL-LABEL: var_shuffle_v8f32_v8f32_xxxxxxxx_i32:
; ALL:       # %bb.0:
; ALL-NEXT:    pushq %rbp
; ALL-NEXT:    movq %rsp, %rbp
; ALL-NEXT:    andq $-32, %rsp
; ALL-NEXT:    subq $64, %rsp
; ALL-NEXT:    # kill: def $r9d killed $r9d def $r9
; ALL-NEXT:    # kill: def $r8d killed $r8d def $r8
; ALL-NEXT:    # kill: def $ecx killed $ecx def $rcx
; ALL-NEXT:    # kill: def $edx killed $edx def $rdx
; ALL-NEXT:    # kill: def $esi killed $esi def $rsi
; ALL-NEXT:    # kill: def $edi killed $edi def $rdi
; ALL-NEXT:    movl 24(%rbp), %r10d
; ALL-NEXT:    andl $7, %r10d
; ALL-NEXT:    movl 16(%rbp), %eax
; ALL-NEXT:    andl $7, %eax
; ALL-NEXT:    andl $7, %edi
; ALL-NEXT:    andl $7, %esi
; ALL-NEXT:    andl $7, %edx
; ALL-NEXT:    andl $7, %ecx
; ALL-NEXT:    andl $7, %r8d
; ALL-NEXT:    vmovaps %ymm0, (%rsp)
; ALL-NEXT:    andl $7, %r9d
; ALL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; ALL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; ALL-NEXT:    movq %rbp, %rsp
; ALL-NEXT:    popq %rbp
; ALL-NEXT:    retq
  %x0 = extractelement <8 x float> %x, i32 %i0
  %x1 = extractelement <8 x float> %x, i32 %i1
  %x2 = extractelement <8 x float> %x, i32 %i2
  %x3 = extractelement <8 x float> %x, i32 %i3
  %x4 = extractelement <8 x float> %x, i32 %i4
  %x5 = extractelement <8 x float> %x, i32 %i5
  %x6 = extractelement <8 x float> %x, i32 %i6
  %x7 = extractelement <8 x float> %x, i32 %i7
  %r0 = insertelement <8 x float> undef, float %x0, i32 0
  %r1 = insertelement <8 x float> %r0, float %x1, i32 1
  %r2 = insertelement <8 x float> %r1, float %x2, i32 2
  %r3 = insertelement <8 x float> %r2, float %x3, i32 3
  %r4 = insertelement <8 x float> %r3, float %x4, i32 4
  %r5 = insertelement <8 x float> %r4, float %x5, i32 5
  %r6 = insertelement <8 x float> %r5, float %x6, i32 6
  %r7 = insertelement <8 x float> %r6, float %x7, i32 7
  ret <8 x float> %r7
}

define <8 x float> @var_shuffle_v8f32_v4f32_xxxxxxxx_i32(<4 x float> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7) nounwind {
; ALL-LABEL: var_shuffle_v8f32_v4f32_xxxxxxxx_i32:
; ALL:       # %bb.0:
; ALL-NEXT:    # kill: def $r9d killed $r9d def $r9
; ALL-NEXT:    # kill: def $r8d killed $r8d def $r8
; ALL-NEXT:    # kill: def $ecx killed $ecx def $rcx
; ALL-NEXT:    # kill: def $edx killed $edx def $rdx
; ALL-NEXT:    # kill: def $esi killed $esi def $rsi
; ALL-NEXT:    # kill: def $edi killed $edi def $rdi
; ALL-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
; ALL-NEXT:    andl $3, %r10d
; ALL-NEXT:    movl {{[0-9]+}}(%rsp), %eax
; ALL-NEXT:    andl $3, %eax
; ALL-NEXT:    andl $3, %edi
; ALL-NEXT:    andl $3, %esi
; ALL-NEXT:    andl $3, %edx
; ALL-NEXT:    andl $3, %ecx
; ALL-NEXT:    andl $3, %r8d
; ALL-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; ALL-NEXT:    andl $3, %r9d
; ALL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; ALL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; ALL-NEXT:    retq
  %x0 = extractelement <4 x float> %x, i32 %i0
  %x1 = extractelement <4 x float> %x, i32 %i1
  %x2 = extractelement <4 x float> %x, i32 %i2
  %x3 = extractelement <4 x float> %x, i32 %i3
  %x4 = extractelement <4 x float> %x, i32 %i4
  %x5 = extractelement <4 x float> %x, i32 %i5
  %x6 = extractelement <4 x float> %x, i32 %i6
  %x7 = extractelement <4 x float> %x, i32 %i7
  %r0 = insertelement <8 x float> undef, float %x0, i32 0
  %r1 = insertelement <8 x float> %r0, float %x1, i32 1
  %r2 = insertelement <8 x float> %r1, float %x2, i32 2
  %r3 = insertelement <8 x float> %r2, float %x3, i32 3
  %r4 = insertelement <8 x float> %r3, float %x4, i32 4
  %r5 = insertelement <8 x float> %r4, float %x5, i32 5
  %r6 = insertelement <8 x float> %r5, float %x6, i32 6
  %r7 = insertelement <8 x float> %r6, float %x7, i32 7
  ret <8 x float> %r7
}

define <16 x i16> @var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16(<16 x i16> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, i32 %i11, i32 %i12, i32 %i13, i32 %i14, i32 %i15) nounwind {
; AVX1-LABEL: var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    pushq %rbp
; AVX1-NEXT:    movq %rsp, %rbp
; AVX1-NEXT:    andq $-32, %rsp
; AVX1-NEXT:    subq $64, %rsp
; AVX1-NEXT:    # kill: def $r9d killed $r9d def $r9
; AVX1-NEXT:    # kill: def $r8d killed $r8d def $r8
; AVX1-NEXT:    # kill: def $ecx killed $ecx def $rcx
; AVX1-NEXT:    # kill: def $edx killed $edx def $rdx
; AVX1-NEXT:    # kill: def $esi killed $esi def $rsi
; AVX1-NEXT:    # kill: def $edi killed $edi def $rdi
; AVX1-NEXT:    andl $15, %edi
; AVX1-NEXT:    vmovaps %ymm0, (%rsp)
; AVX1-NEXT:    movzwl (%rsp,%rdi,2), %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    andl $15, %esi
; AVX1-NEXT:    vpinsrw $1, (%rsp,%rsi,2), %xmm0, %xmm0
; AVX1-NEXT:    andl $15, %edx
; AVX1-NEXT:    vpinsrw $2, (%rsp,%rdx,2), %xmm0, %xmm0
; AVX1-NEXT:    andl $15, %ecx
; AVX1-NEXT:    vpinsrw $3, (%rsp,%rcx,2), %xmm0, %xmm0
; AVX1-NEXT:    andl $15, %r8d
; AVX1-NEXT:    vpinsrw $4, (%rsp,%r8,2), %xmm0, %xmm0
; AVX1-NEXT:    andl $15, %r9d
; AVX1-NEXT:    vpinsrw $5, (%rsp,%r9,2), %xmm0, %xmm0
; AVX1-NEXT:    movl 16(%rbp), %eax
; AVX1-NEXT:    andl $15, %eax
; AVX1-NEXT:    vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0
; AVX1-NEXT:    movl 24(%rbp), %eax
; AVX1-NEXT:    andl $15, %eax
; AVX1-NEXT:    vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0
; AVX1-NEXT:    movl 32(%rbp), %eax
; AVX1-NEXT:    andl $15, %eax
; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT:    vmovd %eax, %xmm1
; AVX1-NEXT:    movl 40(%rbp), %eax
; AVX1-NEXT:    andl $15, %eax
; AVX1-NEXT:    vpinsrw $1, (%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT:    movl 48(%rbp), %eax
; AVX1-NEXT:    andl $15, %eax
; AVX1-NEXT:    vpinsrw $2, (%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT:    movl 56(%rbp), %eax
; AVX1-NEXT:    andl $15, %eax
; AVX1-NEXT:    vpinsrw $3, (%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT:    movl 64(%rbp), %eax
; AVX1-NEXT:    andl $15, %eax
; AVX1-NEXT:    vpinsrw $4, (%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT:    movl 72(%rbp), %eax
; AVX1-NEXT:    andl $15, %eax
; AVX1-NEXT:    vpinsrw $5, (%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT:    movl 80(%rbp), %eax
; AVX1-NEXT:    andl $15, %eax
; AVX1-NEXT:    vpinsrw $6, (%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT:    movl 88(%rbp), %eax
; AVX1-NEXT:    andl $15, %eax
; AVX1-NEXT:    vpinsrw $7, (%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    movq %rbp, %rsp
; AVX1-NEXT:    popq %rbp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    pushq %rbp
; AVX2-NEXT:    movq %rsp, %rbp
; AVX2-NEXT:    andq $-32, %rsp
; AVX2-NEXT:    subq $64, %rsp
; AVX2-NEXT:    # kill: def $r9d killed $r9d def $r9
; AVX2-NEXT:    # kill: def $r8d killed $r8d def $r8
; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
; AVX2-NEXT:    # kill: def $esi killed $esi def $rsi
; AVX2-NEXT:    # kill: def $edi killed $edi def $rdi
; AVX2-NEXT:    andl $15, %edi
; AVX2-NEXT:    vmovaps %ymm0, (%rsp)
; AVX2-NEXT:    movzwl (%rsp,%rdi,2), %eax
; AVX2-NEXT:    vmovd %eax, %xmm0
; AVX2-NEXT:    andl $15, %esi
; AVX2-NEXT:    vpinsrw $1, (%rsp,%rsi,2), %xmm0, %xmm0
; AVX2-NEXT:    andl $15, %edx
; AVX2-NEXT:    vpinsrw $2, (%rsp,%rdx,2), %xmm0, %xmm0
; AVX2-NEXT:    andl $15, %ecx
; AVX2-NEXT:    vpinsrw $3, (%rsp,%rcx,2), %xmm0, %xmm0
; AVX2-NEXT:    andl $15, %r8d
; AVX2-NEXT:    vpinsrw $4, (%rsp,%r8,2), %xmm0, %xmm0
; AVX2-NEXT:    andl $15, %r9d
; AVX2-NEXT:    vpinsrw $5, (%rsp,%r9,2), %xmm0, %xmm0
; AVX2-NEXT:    movl 16(%rbp), %eax
; AVX2-NEXT:    andl $15, %eax
; AVX2-NEXT:    vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0
; AVX2-NEXT:    movl 24(%rbp), %eax
; AVX2-NEXT:    andl $15, %eax
; AVX2-NEXT:    vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0
; AVX2-NEXT:    movl 32(%rbp), %eax
; AVX2-NEXT:    andl $15, %eax
; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT:    vmovd %eax, %xmm1
; AVX2-NEXT:    movl 40(%rbp), %eax
; AVX2-NEXT:    andl $15, %eax
; AVX2-NEXT:    vpinsrw $1, (%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT:    movl 48(%rbp), %eax
; AVX2-NEXT:    andl $15, %eax
; AVX2-NEXT:    vpinsrw $2, (%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT:    movl 56(%rbp), %eax
; AVX2-NEXT:    andl $15, %eax
; AVX2-NEXT:    vpinsrw $3, (%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT:    movl 64(%rbp), %eax
; AVX2-NEXT:    andl $15, %eax
; AVX2-NEXT:    vpinsrw $4, (%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT:    movl 72(%rbp), %eax
; AVX2-NEXT:    andl $15, %eax
; AVX2-NEXT:    vpinsrw $5, (%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT:    movl 80(%rbp), %eax
; AVX2-NEXT:    andl $15, %eax
; AVX2-NEXT:    vpinsrw $6, (%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT:    movl 88(%rbp), %eax
; AVX2-NEXT:    andl $15, %eax
; AVX2-NEXT:    vpinsrw $7, (%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    movq %rbp, %rsp
; AVX2-NEXT:    popq %rbp
; AVX2-NEXT:    retq
  %x0 = extractelement <16 x i16> %x, i32 %i0
  %x1 = extractelement <16 x i16> %x, i32 %i1
  %x2 = extractelement <16 x i16> %x, i32 %i2
  %x3 = extractelement <16 x i16> %x, i32 %i3
  %x4 = extractelement <16 x i16> %x, i32 %i4
  %x5 = extractelement <16 x i16> %x, i32 %i5
  %x6 = extractelement <16 x i16> %x, i32 %i6
  %x7 = extractelement <16 x i16> %x, i32 %i7
  %x8 = extractelement <16 x i16> %x, i32 %i8
  %x9 = extractelement <16 x i16> %x, i32 %i9
  %x10 = extractelement <16 x i16> %x, i32 %i10
  %x11 = extractelement <16 x i16> %x, i32 %i11
  %x12 = extractelement <16 x i16> %x, i32 %i12
  %x13 = extractelement <16 x i16> %x, i32 %i13
  %x14 = extractelement <16 x i16> %x, i32 %i14
  %x15 = extractelement <16 x i16> %x, i32 %i15
  %r0 = insertelement <16 x i16> undef, i16 %x0 , i32 0
  %r1 = insertelement <16 x i16> %r0 , i16 %x1 , i32 1
  %r2 = insertelement <16 x i16> %r1 , i16 %x2 , i32 2
  %r3 = insertelement <16 x i16> %r2 , i16 %x3 , i32 3
  %r4 = insertelement <16 x i16> %r3 , i16 %x4 , i32 4
  %r5 = insertelement <16 x i16> %r4 , i16 %x5 , i32 5
  %r6 = insertelement <16 x i16> %r5 , i16 %x6 , i32 6
  %r7 = insertelement <16 x i16> %r6 , i16 %x7 , i32 7
  %r8 = insertelement <16 x i16> %r7 , i16 %x8 , i32 8
  %r9 = insertelement <16 x i16> %r8 , i16 %x9 , i32 9
  %r10 = insertelement <16 x i16> %r9 , i16 %x10, i32 10
  %r11 = insertelement <16 x i16> %r10, i16 %x11, i32 11
  %r12 = insertelement <16 x i16> %r11, i16 %x12, i32 12
  %r13 = insertelement <16 x i16> %r12, i16 %x13, i32 13
  %r14 = insertelement <16 x i16> %r13, i16 %x14, i32 14
  %r15 = insertelement <16 x i16> %r14, i16 %x15, i32 15
  ret <16 x i16> %r15
}

define <16 x i16> @var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16(<8 x i16> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, i32 %i11, i32 %i12, i32 %i13, i32 %i14, i32 %i15) nounwind {
; AVX1-LABEL: var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    # kill: def $r9d killed $r9d def $r9
; AVX1-NEXT:    # kill: def $r8d killed $r8d def $r8
; AVX1-NEXT:    # kill: def $ecx killed $ecx def $rcx
; AVX1-NEXT:    # kill: def $edx killed $edx def $rdx
; AVX1-NEXT:    # kill: def $esi killed $esi def $rsi
; AVX1-NEXT:    # kill: def $edi killed $edi def $rdi
; AVX1-NEXT:    andl $7, %edi
; AVX1-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX1-NEXT:    movzwl -24(%rsp,%rdi,2), %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    andl $7, %esi
; AVX1-NEXT:    vpinsrw $1, -24(%rsp,%rsi,2), %xmm0, %xmm0
; AVX1-NEXT:    andl $7, %edx
; AVX1-NEXT:    vpinsrw $2, -24(%rsp,%rdx,2), %xmm0, %xmm0
; AVX1-NEXT:    andl $7, %ecx
; AVX1-NEXT:    vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0
; AVX1-NEXT:    andl $7, %r8d
; AVX1-NEXT:    vpinsrw $4, -24(%rsp,%r8,2), %xmm0, %xmm0
; AVX1-NEXT:    andl $7, %r9d
; AVX1-NEXT:    vpinsrw $5, -24(%rsp,%r9,2), %xmm0, %xmm0
; AVX1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
; AVX1-NEXT:    andl $7, %eax
; AVX1-NEXT:    vpinsrw $6, -24(%rsp,%rax,2), %xmm0, %xmm0
; AVX1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
; AVX1-NEXT:    andl $7, %eax
; AVX1-NEXT:    vpinsrw $7, -24(%rsp,%rax,2), %xmm0, %xmm0
; AVX1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
; AVX1-NEXT:    andl $7, %eax
; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT:    vmovd %eax, %xmm1
; AVX1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
; AVX1-NEXT:    andl $7, %eax
; AVX1-NEXT:    vpinsrw $1, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
; AVX1-NEXT:    andl $7, %eax
; AVX1-NEXT:    vpinsrw $2, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
; AVX1-NEXT:    andl $7, %eax
; AVX1-NEXT:    vpinsrw $3, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
; AVX1-NEXT:    andl $7, %eax
; AVX1-NEXT:    vpinsrw $4, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
; AVX1-NEXT:    andl $7, %eax
; AVX1-NEXT:    vpinsrw $5, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
; AVX1-NEXT:    andl $7, %eax
; AVX1-NEXT:    vpinsrw $6, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
; AVX1-NEXT:    andl $7, %eax
; AVX1-NEXT:    vpinsrw $7, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    # kill: def $r9d killed $r9d def $r9
; AVX2-NEXT:    # kill: def $r8d killed $r8d def $r8
; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
; AVX2-NEXT:    # kill: def $esi killed $esi def $rsi
; AVX2-NEXT:    # kill: def $edi killed $edi def $rdi
; AVX2-NEXT:    andl $7, %edi
; AVX2-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    movzwl -24(%rsp,%rdi,2), %eax
; AVX2-NEXT:    vmovd %eax, %xmm0
; AVX2-NEXT:    andl $7, %esi
; AVX2-NEXT:    vpinsrw $1, -24(%rsp,%rsi,2), %xmm0, %xmm0
; AVX2-NEXT:    andl $7, %edx
; AVX2-NEXT:    vpinsrw $2, -24(%rsp,%rdx,2), %xmm0, %xmm0
; AVX2-NEXT:    andl $7, %ecx
; AVX2-NEXT:    vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0
; AVX2-NEXT:    andl $7, %r8d
; AVX2-NEXT:    vpinsrw $4, -24(%rsp,%r8,2), %xmm0, %xmm0
; AVX2-NEXT:    andl $7, %r9d
; AVX2-NEXT:    vpinsrw $5, -24(%rsp,%r9,2), %xmm0, %xmm0
; AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
; AVX2-NEXT:    andl $7, %eax
; AVX2-NEXT:    vpinsrw $6, -24(%rsp,%rax,2), %xmm0, %xmm0
; AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
; AVX2-NEXT:    andl $7, %eax
; AVX2-NEXT:    vpinsrw $7, -24(%rsp,%rax,2), %xmm0, %xmm0
; AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
; AVX2-NEXT:    andl $7, %eax
; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT:    vmovd %eax, %xmm1
; AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
; AVX2-NEXT:    andl $7, %eax
; AVX2-NEXT:    vpinsrw $1, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
; AVX2-NEXT:    andl $7, %eax
; AVX2-NEXT:    vpinsrw $2, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
; AVX2-NEXT:    andl $7, %eax
; AVX2-NEXT:    vpinsrw $3, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
; AVX2-NEXT:    andl $7, %eax
; AVX2-NEXT:    vpinsrw $4, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
; AVX2-NEXT:    andl $7, %eax
; AVX2-NEXT:    vpinsrw $5, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
; AVX2-NEXT:    andl $7, %eax
; AVX2-NEXT:    vpinsrw $6, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
; AVX2-NEXT:    andl $7, %eax
; AVX2-NEXT:    vpinsrw $7, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %x0 = extractelement <8 x i16> %x, i32 %i0
  %x1 = extractelement <8 x i16> %x, i32 %i1
  %x2 = extractelement <8 x i16> %x, i32 %i2
  %x3 = extractelement <8 x i16> %x, i32 %i3
  %x4 = extractelement <8 x i16> %x, i32 %i4
  %x5 = extractelement <8 x i16> %x, i32 %i5
  %x6 = extractelement <8 x i16> %x, i32 %i6
  %x7 = extractelement <8 x i16> %x, i32 %i7
  %x8 = extractelement <8 x i16> %x, i32 %i8
  %x9 = extractelement <8 x i16> %x, i32 %i9
  %x10 = extractelement <8 x i16> %x, i32 %i10
  %x11 = extractelement <8 x i16> %x, i32 %i11
  %x12 = extractelement <8 x i16> %x, i32 %i12
  %x13 = extractelement <8 x i16> %x, i32 %i13
  %x14 = extractelement <8 x i16> %x, i32 %i14
  %x15 = extractelement <8 x i16> %x, i32 %i15
  %r0 = insertelement <16 x i16> undef, i16 %x0 , i32 0
  %r1 = insertelement <16 x i16> %r0 , i16 %x1 , i32 1
  %r2 = insertelement <16 x i16> %r1 , i16 %x2 , i32 2
  %r3 = insertelement <16 x i16> %r2 , i16 %x3 , i32 3
  %r4 = insertelement <16 x i16> %r3 , i16 %x4 , i32 4
  %r5 = insertelement <16 x i16> %r4 , i16 %x5 , i32 5
  %r6 = insertelement <16 x i16> %r5 , i16 %x6 , i32 6
  %r7 = insertelement <16 x i16> %r6 , i16 %x7 , i32 7
  %r8 = insertelement <16 x i16> %r7 , i16 %x8 , i32 8
  %r9 = insertelement <16 x i16> %r8 , i16 %x9 , i32 9
  %r10 = insertelement <16 x i16> %r9 , i16 %x10, i32 10
  %r11 = insertelement <16 x i16> %r10, i16 %x11, i32 11
  %r12 = insertelement <16 x i16> %r11, i16 %x12, i32 12
  %r13 = insertelement <16 x i16> %r12, i16 %x13, i32 13
  %r14 = insertelement <16 x i16> %r13, i16 %x14, i32 14
  %r15 = insertelement <16 x i16> %r14, i16 %x15, i32 15
  ret <16 x i16> %r15
}

;
; Unary shuffle indices from memory
;

define <4 x i64> @mem_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64* %i) nounwind {
; ALL-LABEL: mem_shuffle_v4i64_v4i64_xxxx_i64:
; ALL:       # %bb.0:
; ALL-NEXT:    pushq %rbp
; ALL-NEXT:    movq %rsp, %rbp
; ALL-NEXT:    andq $-32, %rsp
; ALL-NEXT:    subq $64, %rsp
; ALL-NEXT:    movq (%rdi), %rax
; ALL-NEXT:    movq 8(%rdi), %rcx
; ALL-NEXT:    andl $3, %eax
; ALL-NEXT:    andl $3, %ecx
; ALL-NEXT:    movq 16(%rdi), %rdx
; ALL-NEXT:    andl $3, %edx
; ALL-NEXT:    movq 24(%rdi), %rsi
; ALL-NEXT:    andl $3, %esi
; ALL-NEXT:    vmovaps %ymm0, (%rsp)
; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; ALL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; ALL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; ALL-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; ALL-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; ALL-NEXT:    movq %rbp, %rsp
; ALL-NEXT:    popq %rbp
; ALL-NEXT:    retq
  %p0 = getelementptr inbounds i64, i64* %i, i32 0
  %p1 = getelementptr inbounds i64, i64* %i, i32 1
  %p2 = getelementptr inbounds i64, i64* %i, i32 2
  %p3 = getelementptr inbounds i64, i64* %i, i32 3
  %i0 = load i64, i64* %p0, align 4
  %i1 = load i64, i64* %p1, align 4
  %i2 = load i64, i64* %p2, align 4
  %i3 = load i64, i64* %p3, align 4
  %x0 = extractelement <4 x i64> %x, i64 %i0
  %x1 = extractelement <4 x i64> %x, i64 %i1
  %x2 = extractelement <4 x i64> %x, i64 %i2
  %x3 = extractelement <4 x i64> %x, i64 %i3
  %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
  %r1 = insertelement <4 x i64> %r0, i64 %x1, i32 1
  %r2 = insertelement <4 x i64> %r1, i64 %x2, i32 2
  %r3 = insertelement <4 x i64> %r2, i64 %x3, i32 3
  ret <4 x i64> %r3
}

define <4 x i64> @mem_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64* %i) nounwind {
; ALL-LABEL: mem_shuffle_v4i64_v2i64_xxxx_i64:
; ALL:       # %bb.0:
; ALL-NEXT:    movq (%rdi), %rax
; ALL-NEXT:    movq 8(%rdi), %rcx
; ALL-NEXT:    andl $1, %eax
; ALL-NEXT:    andl $1, %ecx
; ALL-NEXT:    movq 16(%rdi), %rdx
; ALL-NEXT:    andl $1, %edx
; ALL-NEXT:    movq 24(%rdi), %rsi
; ALL-NEXT:    andl $1, %esi
; ALL-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; ALL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; ALL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; ALL-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; ALL-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; ALL-NEXT:    retq
  %p0 = getelementptr inbounds i64, i64* %i, i32 0
  %p1 = getelementptr inbounds i64, i64* %i, i32 1
  %p2 = getelementptr inbounds i64, i64* %i, i32 2
  %p3 = getelementptr inbounds i64, i64* %i, i32 3
  %i0 = load i64, i64* %p0, align 4
  %i1 = load i64, i64* %p1, align 4
  %i2 = load i64, i64* %p2, align 4
  %i3 = load i64, i64* %p3, align 4
  %x0 = extractelement <2 x i64> %x, i64 %i0
  %x1 = extractelement <2 x i64> %x, i64 %i1
  %x2 = extractelement <2 x i64> %x, i64 %i2
  %x3 = extractelement <2 x i64> %x, i64 %i3
  %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
  %r1 = insertelement <4 x i64> %r0, i64 %x1, i32 1
  %r2 = insertelement <4 x i64> %r1, i64 %x2, i32 2
  %r3 = insertelement <4 x i64> %r2, i64 %x3, i32 3
  ret <4 x i64> %r3
}