; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-- -mattr=sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42
; RUN: llc < %s -mtriple=x86_64-- -mattr=avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512F

define void @foo(<4 x float> %in, <4 x i8>* %out) {
; SSE2-LABEL: foo:
; SSE2:       # %bb.0:
; SSE2-NEXT:    cvttps2dq %xmm0, %xmm0
; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx
; SSE2-NEXT:    shll $8, %ecx
; SSE2-NEXT:    orl %eax, %ecx
; SSE2-NEXT:    movd %ecx, %xmm0
; SSE2-NEXT:    movl $65280, %eax # imm = 0xFF00
; SSE2-NEXT:    orl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    pinsrw $1, %eax, %xmm0
; SSE2-NEXT:    movd %xmm0, (%rdi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: foo:
; SSE42:       # %bb.0:
; SSE42-NEXT:    cvttps2dq %xmm0, %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movl $255, %eax
; SSE42-NEXT:    pinsrb $3, %eax, %xmm0
; SSE42-NEXT:    movd %xmm0, (%rdi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: foo:
; AVX:       # %bb.0:
; AVX-NEXT:    vcvttps2dq %xmm0, %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    movl $255, %eax
; AVX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, (%rdi)
; AVX-NEXT:    retq
  %t0 = fptosi <4 x float> %in to <4 x i32>
  %t1 = trunc <4 x i32> %t0 to <4 x i16>
  %t2 = shufflevector <4 x i16> %t1, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %t3 = trunc <8 x i16> %t2 to <8 x i8>
  %t4 = shufflevector <8 x i8> %t3, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %t5 = insertelement <4 x i8> %t4, i8 -1, i32 3
  store <4 x i8> %t5, <4 x i8>* %out
  ret void
}

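; Concatenating %x with itself twice and then taking the strided (4x4 transpose)
; shuffle is equivalent to splatting each element into its own 4-wide chunk:
; <x0,x0,x0,x0, x1,x1,x1,x1, x2,x2,x2,x2, x3,x3,x3,x3>.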
define <16 x i64> @catcat(<4 x i64> %x) {
; SSE-LABEL: catcat:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,1,0,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE-NEXT:    movdqa %xmm1, 112(%rdi)
; SSE-NEXT:    movdqa %xmm1, 96(%rdi)
; SSE-NEXT:    movdqa %xmm3, 80(%rdi)
; SSE-NEXT:    movdqa %xmm3, 64(%rdi)
; SSE-NEXT:    movdqa %xmm0, 48(%rdi)
; SSE-NEXT:    movdqa %xmm0, 32(%rdi)
; SSE-NEXT:    movdqa %xmm2, 16(%rdi)
; SSE-NEXT:    movdqa %xmm2, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: catcat:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovddup {{.*#+}} ymm1 = ymm0[0,0,2,2]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,2,3]
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[1,1,3,3]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm1[2,3,2,3]
; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,1]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm4
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm1
; AVX1-NEXT:    vmovaps %ymm4, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: catcat:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[1,1,1,1]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm0[2,2,2,2]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm0[3,3,3,3]
; AVX2-NEXT:    vbroadcastsd %xmm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: catcat:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vmovaps {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1]
; AVX512F-NEXT:    vpermpd %zmm0, %zmm1, %zmm2
; AVX512F-NEXT:    vmovaps {{.*#+}} zmm1 = [2,2,2,2,3,3,3,3]
; AVX512F-NEXT:    vpermpd %zmm0, %zmm1, %zmm1
; AVX512F-NEXT:    vmovaps %zmm2, %zmm0
; AVX512F-NEXT:    retq
  %cat1 = shufflevector <4 x i64> %x, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  %cat2 = shufflevector <8 x i64> %cat1, <8 x i64> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %r = shufflevector <16 x i64> %cat2, <16 x i64> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
  ret <16 x i64> %r
}

define <16 x i64> @load_catcat(<4 x i64>* %p) {
; SSE-LABEL: load_catcat:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    movdqa (%rsi), %xmm0
; SSE-NEXT:    movdqa 16(%rsi), %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,1,0,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE-NEXT:    movdqa %xmm1, 112(%rdi)
; SSE-NEXT:    movdqa %xmm1, 96(%rdi)
; SSE-NEXT:    movdqa %xmm3, 80(%rdi)
; SSE-NEXT:    movdqa %xmm3, 64(%rdi)
; SSE-NEXT:    movdqa %xmm0, 48(%rdi)
; SSE-NEXT:    movdqa %xmm0, 32(%rdi)
; SSE-NEXT:    movdqa %xmm2, 16(%rdi)
; SSE-NEXT:    movdqa %xmm2, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: load_catcat:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vbroadcastsd (%rdi), %ymm0
; AVX1-NEXT:    vbroadcastsd 8(%rdi), %ymm1
; AVX1-NEXT:    vbroadcastsd 16(%rdi), %ymm2
; AVX1-NEXT:    vbroadcastsd 24(%rdi), %ymm3
; AVX1-NEXT:    retq
;
; AVX2-LABEL: load_catcat:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm0
; AVX2-NEXT:    vbroadcastsd 8(%rdi), %ymm1
; AVX2-NEXT:    vbroadcastsd 16(%rdi), %ymm2
; AVX2-NEXT:    vbroadcastsd 24(%rdi), %ymm3
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: load_catcat:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,4,0,4,1,5,1,5]
; AVX512F-NEXT:    vpermq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [2,6,2,6,3,7,3,7]
; AVX512F-NEXT:    vpermq %zmm1, %zmm2, %zmm1
; AVX512F-NEXT:    retq
  %x = load <4 x i64>, <4 x i64>* %p
  %cat1 = shufflevector <4 x i64> %x, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  %cat2 = shufflevector <8 x i64> %cat1, <8 x i64> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %r = shufflevector <16 x i64> %cat2, <16 x i64> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
  ret <16 x i64> %r
}

; Use weird types to make sure we do not miscompile a case where
; the size of the source ops is not an even multiple of the result size.
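; Here the extracted elements <4-7> of the 12-element concat straddle the
; boundary between the two sources: they are %x[4], %x[5], %y[0], %y[1].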

define <4 x i32> @cat_ext_straddle(<6 x i32>* %px, <6 x i32>* %py) {
; SSE-LABEL: cat_ext_straddle:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps 16(%rdi), %xmm0
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: cat_ext_straddle:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps 16(%rdi), %xmm0
; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; AVX-NEXT:    retq
  %x = load <6 x i32>, <6 x i32>* %px
  %y = load <6 x i32>, <6 x i32>* %py
  %cat = shufflevector <6 x i32> %x, <6 x i32> %y, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %ext = shufflevector <12 x i32> %cat, <12 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  ret <4 x i32> %ext
}