; RUN: llc -O3 -mtriple=x86_64-unknown -mcpu=core-avx2 -mattr=avx2 < %s | FileCheck %s

; Blend-commutation / load-folding tests.
;
; Every function below loads a vector through %b and passes the loaded value
; as the FIRST source of a blend intrinsic, with the by-value argument %a as
; the SECOND source. The expected output for each function is a blend whose
; shuffle comment names a `mem` source, followed directly by retq.
;
; Each function's expectations are anchored on its own assembly label so that
; a pattern cannot be satisfied by an instruction emitted for a different
; function.

; 128-bit word blend, immediate 17 (0b00010001): elements 0 and 4 are expected
; from xmm0, the remaining elements from memory.
define <8 x i16> @commute_fold_vpblendw_128(<8 x i16> %a, <8 x i16>* %b) #0 {
  %1 = load <8 x i16>, <8 x i16>* %b
  %2 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %1, <8 x i16> %a, i8 17)
  ret <8 x i16> %2

  ; CHECK-LABEL: commute_fold_vpblendw_128:
  ; CHECK:       vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7]
  ; CHECK-NEXT:  retq
}
declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone

; 256-bit word blend, immediate 17 (0b00010001): the expected pattern shows the
; same 8-element selection repeated for elements 8..15.
define <16 x i16> @commute_fold_vpblendw_256(<16 x i16> %a, <16 x i16>* %b) #0 {
  %1 = load <16 x i16>, <16 x i16>* %b
  %2 = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %1, <16 x i16> %a, i8 17)
  ret <16 x i16> %2

  ; CHECK-LABEL: commute_fold_vpblendw_256:
  ; CHECK:       vpblendw {{.*#+}} ymm0 = ymm0[0],mem[1,2,3],ymm0[4],mem[5,6,7],ymm0[8],mem[9,10,11],ymm0[12],mem[13,14,15]
  ; CHECK-NEXT:  retq
}
declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i8) nounwind readnone

; 128-bit dword blend, immediate 1 (0b0001): element 0 is expected from xmm0,
; elements 1..3 from memory.
define <4 x i32> @commute_fold_vpblendd_128(<4 x i32> %a, <4 x i32>* %b) #0 {
  %1 = load <4 x i32>, <4 x i32>* %b
  %2 = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %1, <4 x i32> %a, i8 1)
  ret <4 x i32> %2

  ; CHECK-LABEL: commute_fold_vpblendd_128:
  ; CHECK:       vpblendd {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
  ; CHECK-NEXT:  retq
}
declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i8) nounwind readnone

; 256-bit dword blend, immediate 129 (0b10000001): elements 0 and 7 are
; expected from ymm0, elements 1..6 from memory.
define <8 x i32> @commute_fold_vpblendd_256(<8 x i32> %a, <8 x i32>* %b) #0 {
  %1 = load <8 x i32>, <8 x i32>* %b
  %2 = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %1, <8 x i32> %a, i8 129)
  ret <8 x i32> %2

  ; CHECK-LABEL: commute_fold_vpblendd_256:
  ; CHECK:       vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6],ymm0[7]
  ; CHECK-NEXT:  retq
}
declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i8) nounwind readnone

; 128-bit single-precision blend, immediate 5 (0b0101): elements 0 and 2 are
; expected from xmm0, elements 1 and 3 from memory.
define <4 x float> @commute_fold_vblendps_128(<4 x float> %a, <4 x float>* %b) #0 {
  %1 = load <4 x float>, <4 x float>* %b
  %2 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %1, <4 x float> %a, i8 5)
  ret <4 x float> %2

  ; CHECK-LABEL: commute_fold_vblendps_128:
  ; CHECK:       vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3]
  ; CHECK-NEXT:  retq
}
declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwind readnone

; 256-bit single-precision blend, immediate 7 (0b00000111): elements 0..2 are
; expected from ymm0, elements 3..7 from memory.
define <8 x float> @commute_fold_vblendps_256(<8 x float> %a, <8 x float>* %b) #0 {
  %1 = load <8 x float>, <8 x float>* %b
  %2 = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %1, <8 x float> %a, i8 7)
  ret <8 x float> %2

  ; CHECK-LABEL: commute_fold_vblendps_256:
  ; CHECK:       vblendps {{.*#+}} ymm0 = ymm0[0,1,2],mem[3,4,5,6,7]
  ; CHECK-NEXT:  retq
}
declare <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone

; 128-bit double-precision blend, immediate 1 (0b01): element 0 is expected
; from xmm0, element 1 from memory.
define <2 x double> @commute_fold_vblendpd_128(<2 x double> %a, <2 x double>* %b) #0 {
  %1 = load <2 x double>, <2 x double>* %b
  %2 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %1, <2 x double> %a, i8 1)
  ret <2 x double> %2

  ; CHECK-LABEL: commute_fold_vblendpd_128:
  ; CHECK:       vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1]
  ; CHECK-NEXT:  retq
}
declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nounwind readnone

; 256-bit double-precision blend, immediate 7 (0b0111): elements 0..2 are
; expected from ymm0, element 3 from memory.
define <4 x double> @commute_fold_vblendpd_256(<4 x double> %a, <4 x double>* %b) #0 {
  %1 = load <4 x double>, <4 x double>* %b
  %2 = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %1, <4 x double> %a, i8 7)
  ret <4 x double> %2

  ; CHECK-LABEL: commute_fold_vblendpd_256:
  ; CHECK:       vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],mem[3]
  ; CHECK-NEXT:  retq
}
declare <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone