; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7-avx | FileCheck %s

; We don't check any vinsertf128 variant with immediate 0 because that's just a blend.

define <4 x double> @test_x86_avx_vinsertf128_pd_256_1(<4 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx_vinsertf128_pd_256_1:
; CHECK: vinsertf128 $1, %xmm1, %ymm0, %ymm0
  %res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %a0, <2 x double> %a1, i8 1)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone

define <8 x float> @test_x86_avx_vinsertf128_ps_256_1(<8 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_x86_avx_vinsertf128_ps_256_1:
; CHECK: vinsertf128 $1, %xmm1, %ymm0, %ymm0
  %res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %a0, <4 x float> %a1, i8 1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8) nounwind readnone

define <8 x i32> @test_x86_avx_vinsertf128_si_256_1(<8 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_x86_avx_vinsertf128_si_256_1:
; CHECK: vinsertf128 $1, %xmm1, %ymm0, %ymm0
  %res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 1)
  ret <8 x i32> %res
}

; Verify that high bits of the immediate are masked off. This should be the
; equivalent of a vinsertf128 $0, which should be optimized into a blend, so
; just check that it's not a vinsertf128 $1.
define <8 x i32> @test_x86_avx_vinsertf128_si_256_2(<8 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_x86_avx_vinsertf128_si_256_2:
; CHECK-NOT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
  %res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 2)
  ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32>, <4 x i32>, i8) nounwind readnone

; We don't check any vextractf128 variant with immediate 0 because that's just a move.

define <2 x double> @test_x86_avx_vextractf128_pd_256_1(<4 x double> %a0) {
; CHECK-LABEL: test_x86_avx_vextractf128_pd_256_1:
; CHECK: vextractf128 $1, %ymm0, %xmm0
  %res = call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a0, i8 1)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double>, i8) nounwind readnone

define <4 x float> @test_x86_avx_vextractf128_ps_256_1(<8 x float> %a0) {
; CHECK-LABEL: test_x86_avx_vextractf128_ps_256_1:
; CHECK: vextractf128 $1, %ymm0, %xmm0
  %res = call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a0, i8 1)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone

define <4 x i32> @test_x86_avx_vextractf128_si_256_1(<8 x i32> %a0) {
; CHECK-LABEL: test_x86_avx_vextractf128_si_256_1:
; CHECK: vextractf128 $1, %ymm0, %xmm0
  %res = call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %a0, i8 1)
  ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32>, i8) nounwind readnone
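; As a side note, purely an illustrative sketch (none of the IR below is part of
; the test or checked by FileCheck, and %wide/%ins/%ext are placeholder names):
; assuming the lane insert/extract intrinsics above are upgraded to generic
; shuffles, vinsertf128 $1 on <4 x double> corresponds to
;   %wide = shufflevector <2 x double> %a1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
;   %ins  = shufflevector <4 x double> %a0, <4 x double> %wide, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; and vextractf128 $1 corresponds to taking the upper 128-bit lane:
;   %ext  = shufflevector <4 x double> %a0, <4 x double> undef, <2 x i32> <i32 2, i32 3>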
; Verify that high bits of the immediate are masked off. This should be the
; equivalent of a vextractf128 $0, which should be optimized away, so just
; check that it's not a vextractf128 of any kind.
define <2 x double> @test_x86_avx_extractf128_pd_256_2(<4 x double> %a0) {
; CHECK-LABEL: test_x86_avx_extractf128_pd_256_2:
; CHECK-NOT: vextractf128
  %res = call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a0, i8 2)
  ret <2 x double> %res
}

define <4 x double> @test_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: test_x86_avx_blend_pd_256:
; CHECK: vblendpd
  %res = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i32 7) ; <<4 x double>> [#uses=1]
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double>, <4 x double>, i32) nounwind readnone

define <8 x float> @test_x86_avx_blend_ps_256(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: test_x86_avx_blend_ps_256:
; CHECK: vblendps
  %res = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %a0, <8 x float> %a1, i32 7) ; <<8 x float>> [#uses=1]
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float>, <8 x float>, i32) nounwind readnone

define <8 x float> @test_x86_avx_dp_ps_256(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: test_x86_avx_dp_ps_256:
; CHECK: vdpps
  %res = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i32 7) ; <<8 x float>> [#uses=1]
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i32) nounwind readnone
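; Note on the two tests below: the shift amount of the legacy
; llvm.x86.sse2.psll.dq/psrl.dq intrinsics is specified in bits, so the "i32 8"
; operand requests an 8-bit (one byte) shift; that is why the CHECK patterns
; expect the byte lanes to move by exactly one position.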
define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) {
; CHECK-LABEL: test_x86_sse2_psll_dq:
; CHECK: vpslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
  %res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 8) ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone

define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) {
; CHECK-LABEL: test_x86_sse2_psrl_dq:
; CHECK: vpsrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
  %res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 8) ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone

define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_sse41_blendpd:
; CHECK: vblendpd
  %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i8 2) ; <<2 x double>> [#uses=1]
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nounwind readnone

define <4 x float> @test_x86_sse41_blendps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_x86_sse41_blendps:
; CHECK: vblendps
  %res = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwind readnone

define <8 x i16> @test_x86_sse41_pblendw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: test_x86_sse41_pblendw:
; CHECK: vpblendw
  %res = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i8 7) ; <<8 x i16>> [#uses=1]
  ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone

define <4 x i32> @test_x86_sse41_pmovsxbd(<16 x i8> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxbd:
; CHECK:       # BB#0:
; CHECK-NEXT:    vpmovsxbd %xmm0, %xmm0
; CHECK-NEXT:    retl
  %res = call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %a0) ; <<4 x i32>> [#uses=1]
  ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone

define <2 x i64> @test_x86_sse41_pmovsxbq(<16 x i8> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxbq:
; CHECK:       # BB#0:
; CHECK-NEXT:    vpmovsxbq %xmm0, %xmm0
; CHECK-NEXT:    retl
  %res = call <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8> %a0) ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8>) nounwind readnone

define <8 x i16> @test_x86_sse41_pmovsxbw(<16 x i8> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxbw:
; CHECK:       # BB#0:
; CHECK-NEXT:    vpmovsxbw %xmm0, %xmm0
; CHECK-NEXT:    retl
  %res = call <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1]
  ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8>) nounwind readnone

define <2 x i64> @test_x86_sse41_pmovsxdq(<4 x i32> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxdq:
; CHECK:       # BB#0:
; CHECK-NEXT:    vpmovsxdq %xmm0, %xmm0
; CHECK-NEXT:    retl
  %res = call <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32> %a0) ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32>) nounwind readnone

define <4 x i32> @test_x86_sse41_pmovsxwd(<8 x i16> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxwd:
; CHECK:       # BB#0:
; CHECK-NEXT:    vpmovsxwd %xmm0, %xmm0
; CHECK-NEXT:    retl
  %res = call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %a0) ; <<4 x i32>> [#uses=1]
  ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone

define <2 x i64> @test_x86_sse41_pmovsxwq(<8 x i16> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxwq:
; CHECK:       # BB#0:
; CHECK-NEXT:    vpmovsxwq %xmm0, %xmm0
; CHECK-NEXT:    retl
  %res = call <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16> %a0) ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16>) nounwind readnone