; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
; RUN: llc < %s -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
; RUN: llc < %s -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST
;
; Verify that the DAG combiner correctly folds bitwise operations across
; shuffles, nested shuffles with undef, pairs of nested shuffles, and other
; basic and always-safe patterns. Also test that the DAG combiner will combine
; target-specific shuffle instructions where reasonable.

target triple = "x86_64-unknown-unknown"

declare <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32>, i8)
declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8)
declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8)

define <4 x i32> @combine_pshufd1(<4 x i32> %a) {
; CHECK-LABEL: combine_pshufd1:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
  %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 27)
  ret <4 x i32> %c
}

define <4 x i32> @combine_pshufd2(<4 x i32> %a) {
; CHECK-LABEL: combine_pshufd2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
  %b.cast = bitcast <4 x i32> %b to <8 x i16>
  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 -28)
  %c.cast = bitcast <8 x i16> %c to <4 x i32>
  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
  ret <4 x i32> %d
}

define <4 x i32> @combine_pshufd3(<4 x i32> %a) {
; CHECK-LABEL: combine_pshufd3:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
  %b.cast = bitcast <4 x i32> %b to <8 x i16>
  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 -28)
  %c.cast = bitcast <8 x i16> %c to <4 x i32>
  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
  ret <4 x i32> %d
}

define <4 x i32> @combine_pshufd4(<4 x i32> %a) {
; SSE-LABEL: combine_pshufd4:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufd4:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; AVX-NEXT:    retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -31)
  %b.cast = bitcast <4 x i32> %b to <8 x i16>
  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 27)
  %c.cast = bitcast <8 x i16> %c to <4 x i32>
  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -31)
  ret <4 x i32> %d
}

define <4 x i32> @combine_pshufd5(<4 x i32> %a) {
; SSE-LABEL: combine_pshufd5:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufd5:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; AVX-NEXT:    retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -76)
  %b.cast = bitcast <4 x i32> %b to <8 x i16>
  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 27)
  %c.cast = bitcast <8 x i16> %c to <4 x i32>
  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -76)
  ret <4 x i32> %d
}

define <4 x i32> @combine_pshufd6(<4 x i32> %a) {
; SSE-LABEL: combine_pshufd6:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_pshufd6:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_pshufd6:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vbroadcastss %xmm0, %xmm0
; AVX2-NEXT:    retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 0)
  %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 8)
  ret <4 x i32> %c
}

define <8 x i16> @combine_pshuflw1(<8 x i16> %a) {
; CHECK-LABEL: combine_pshuflw1:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    retq
entry:
  %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
  ret <8 x i16> %c
}

define <8 x i16> @combine_pshuflw2(<8 x i16> %a) {
; CHECK-LABEL: combine_pshuflw2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    retq
entry:
  %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 -28)
  %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
  ret <8 x i16> %d
}

define <8 x i16> @combine_pshuflw3(<8 x i16> %a) {
; SSE-LABEL: combine_pshuflw3:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshuflw3:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; AVX-NEXT:    retq
entry:
  %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 27)
  %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
  ret <8 x i16> %d
}

define <8 x i16> @combine_pshufhw1(<8 x i16> %a) {
; SSE-LABEL: combine_pshufhw1:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufhw1:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; AVX-NEXT:    retq
entry:
  %b = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %a, i8 27)
  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
  %d = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %c, i8 27)
  ret <8 x i16> %d
}

define <4 x i32> @combine_bitwise_ops_test1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test1:
; SSE:       # %bb.0:
; SSE-NEXT:    pand %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}

define <4 x i32> @combine_bitwise_ops_test2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test2:
; SSE:       # %bb.0:
; SSE-NEXT:    por %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

define <4 x i32> @combine_bitwise_ops_test3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test3:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test3:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}

define <4 x i32> @combine_bitwise_ops_test4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test4:
; SSE:       # %bb.0:
; SSE-NEXT:    pand %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test4:
; AVX:       # %bb.0:
; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}

define <4 x i32> @combine_bitwise_ops_test5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test5:
; SSE:       # %bb.0:
; SSE-NEXT:    por %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test5:
; AVX:       # %bb.0:
; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

define <4 x i32> @combine_bitwise_ops_test6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test6:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test6:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}


; Verify that DAGCombiner moves the shuffle after the xor/and/or even if shuffles
; are not performing a swizzle operation.

define <4 x i32> @combine_bitwise_ops_test1b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test1b:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_bitwise_ops_test1b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_bitwise_ops_test1b:
; SSE41:       # %bb.0:
; SSE41-NEXT:    andps %xmm1, %xmm0
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test1b:
; AVX:       # %bb.0:
; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}

define <4 x i32> @combine_bitwise_ops_test2b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test2b:
; SSE2:       # %bb.0:
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_bitwise_ops_test2b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    por %xmm1, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_bitwise_ops_test2b:
; SSE41:       # %bb.0:
; SSE41-NEXT:    orps %xmm1, %xmm0
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test2b:
; AVX:       # %bb.0:
; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

define <4 x i32> @combine_bitwise_ops_test3b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test3b:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorps %xmm1, %xmm0
; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_bitwise_ops_test3b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    xorps %xmm1, %xmm0
; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_bitwise_ops_test3b:
; SSE41:       # %bb.0:
; SSE41-NEXT:    xorps %xmm1, %xmm0
; SSE41-NEXT:    xorps %xmm1, %xmm1
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test3b:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}

define <4 x i32> @combine_bitwise_ops_test4b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test4b:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_bitwise_ops_test4b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_bitwise_ops_test4b:
; SSE41:       # %bb.0:
; SSE41-NEXT:    andps %xmm1, %xmm0
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test4b:
; AVX:       # %bb.0:
; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}

define <4 x i32> @combine_bitwise_ops_test5b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test5b:
; SSE2:       # %bb.0:
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_bitwise_ops_test5b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    por %xmm1, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_bitwise_ops_test5b:
; SSE41:       # %bb.0:
; SSE41-NEXT:    orps %xmm1, %xmm0
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test5b:
; AVX:       # %bb.0:
; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

define <4 x i32> @combine_bitwise_ops_test6b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test6b:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorps %xmm1, %xmm0
; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_bitwise_ops_test6b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    xorps %xmm1, %xmm0
; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_bitwise_ops_test6b:
; SSE41:       # %bb.0:
; SSE41-NEXT:    xorps %xmm1, %xmm0
; SSE41-NEXT:    xorps %xmm1, %xmm1
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test6b:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}

define <4 x i32> @combine_bitwise_ops_test1c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test1c:
; SSE:       # %bb.0:
; SSE-NEXT:    andps %xmm1, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test1c:
; AVX:       # %bb.0:
; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}

define <4 x i32> @combine_bitwise_ops_test2c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test2c:
; SSE:       # %bb.0:
; SSE-NEXT:    orps %xmm1, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test2c:
; AVX:       # %bb.0:
; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

define <4 x i32> @combine_bitwise_ops_test3c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test3c:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorps %xmm1, %xmm0
; SSE2-NEXT:    xorps %xmm1, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_bitwise_ops_test3c:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    xorps %xmm1, %xmm0
; SSSE3-NEXT:    xorps %xmm1, %xmm1
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_bitwise_ops_test3c:
; SSE41:       # %bb.0:
; SSE41-NEXT:    xorps %xmm1, %xmm0
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test3c:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}

define <4 x i32> @combine_bitwise_ops_test4c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test4c:
; SSE:       # %bb.0:
; SSE-NEXT:    andps %xmm1, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test4c:
; AVX:       # %bb.0:
; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}

define <4 x i32> @combine_bitwise_ops_test5c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test5c:
; SSE:       # %bb.0:
; SSE-NEXT:    orps %xmm1, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test5c:
; AVX:       # %bb.0:
; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

define <4 x i32> @combine_bitwise_ops_test6c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test6c:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorps %xmm1, %xmm0
; SSE2-NEXT:    xorps %xmm1, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_bitwise_ops_test6c:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    xorps %xmm1, %xmm0
; SSSE3-NEXT:    xorps %xmm1, %xmm1
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_bitwise_ops_test6c:
; SSE41:       # %bb.0:
; SSE41-NEXT:    xorps %xmm1, %xmm0
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = zero,zero,xmm0[1,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test6c:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = zero,zero,xmm0[1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}

; Verify that a pair of nested shuffles (the second one with an undef operand)
; is folded into a single shuffle where possible.

define <4 x i32> @combine_nested_undef_test1(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test1:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test2(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test2:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test3(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test3:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test3:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test4(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test4:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_nested_undef_test4:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_nested_undef_test4:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX2-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 7, i32 1>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 4, i32 0, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test5(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test5:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test5:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 5, i32 5, i32 2, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 4, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test6(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test6:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test6:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 4>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test7(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test7:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test7:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test8(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test8:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 3, i32 4>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test9(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test9:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,2]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test9:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,2]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 3, i32 2, i32 5>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test10(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test10:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test10:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 4>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test11(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test11:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test11:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 2, i32 5, i32 4>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 0>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test12(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test12:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_nested_undef_test12:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_nested_undef_test12:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastss %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 0, i32 2, i32 4>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 0, i32 4>
  ret <4 x i32> %2
}

; The following pair of shuffles is folded into vector %A.
define <4 x i32> @combine_nested_undef_test13(<4 x i32> %A, <4 x i32> %B) {
; CHECK-LABEL: combine_nested_undef_test13:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 4, i32 2, i32 6>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 0, i32 2, i32 4>
  ret <4 x i32> %2
}

; The following pair of shuffles is folded into vector %B.
define <4 x i32> @combine_nested_undef_test14(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test14:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test14:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 4, i32 1, i32 4>
  ret <4 x i32> %2
}


; Verify that we don't optimize the following cases. We expect more than one shuffle.
;
; FIXME: Many of these already don't make sense, and the rest should stop
; making sense with the new vector shuffle lowering. Revisit at least testing for
; it.

define <4 x i32> @combine_nested_undef_test15(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test15:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_nested_undef_test15:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_nested_undef_test15:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_nested_undef_test15:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1]
; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_nested_undef_test15:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastss %xmm1, %xmm1
; AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1]
; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test16(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_nested_undef_test16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_nested_undef_test16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test17(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test17:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_nested_undef_test17:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_nested_undef_test17:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test17:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test18(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test18:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,0,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test18:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[1,1,0,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test19(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test19:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_nested_undef_test19:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_nested_undef_test19:
; SSE41:       # %bb.0:
;
SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] 976; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,0,0] 977; SSE41-NEXT: retq 978; 979; AVX-LABEL: combine_nested_undef_test19: 980; AVX: # %bb.0: 981; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 982; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0] 983; AVX-NEXT: retq 984 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 5, i32 6> 985 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 0, i32 0> 986 ret <4 x i32> %2 987} 988 989define <4 x i32> @combine_nested_undef_test20(<4 x i32> %A, <4 x i32> %B) { 990; SSE2-LABEL: combine_nested_undef_test20: 991; SSE2: # %bb.0: 992; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3] 993; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1] 994; SSE2-NEXT: movaps %xmm1, %xmm0 995; SSE2-NEXT: retq 996; 997; SSSE3-LABEL: combine_nested_undef_test20: 998; SSSE3: # %bb.0: 999; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3] 1000; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1] 1001; SSSE3-NEXT: movaps %xmm1, %xmm0 1002; SSSE3-NEXT: retq 1003; 1004; SSE41-LABEL: combine_nested_undef_test20: 1005; SSE41: # %bb.0: 1006; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] 1007; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,3,0] 1008; SSE41-NEXT: retq 1009; 1010; AVX-LABEL: combine_nested_undef_test20: 1011; AVX: # %bb.0: 1012; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 1013; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,3,0] 1014; AVX-NEXT: retq 1015 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 3, i32 2, i32 4, i32 4> 1016 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3> 1017 ret <4 x i32> %2 1018} 1019 1020define <4 x i32> @combine_nested_undef_test21(<4 x i32> %A, <4 x i32> %B) { 1021; SSE2-LABEL: combine_nested_undef_test21: 1022; SSE2: # %bb.0: 1023; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1024; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,0,3] 1025; SSE2-NEXT: retq 1026; 1027; SSSE3-LABEL: combine_nested_undef_test21: 1028; SSSE3: # %bb.0: 1029; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1030; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,0,3] 1031; SSSE3-NEXT: retq 1032; 1033; SSE41-LABEL: combine_nested_undef_test21: 1034; SSE41: # %bb.0: 1035; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] 1036; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1037; SSE41-NEXT: retq 1038; 1039; AVX1-LABEL: combine_nested_undef_test21: 1040; AVX1: # %bb.0: 1041; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1042; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] 1043; AVX1-NEXT: retq 1044; 1045; AVX2-LABEL: combine_nested_undef_test21: 1046; AVX2: # %bb.0: 1047; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1048; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 1049; AVX2-NEXT: retq 1050 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1> 1051 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3> 1052 ret <4 x i32> %2 1053} 1054 1055 1056; Test that we correctly combine shuffles according to rule 1057; shuffle(shuffle(x, y), undef) -> shuffle(y, undef) 1058 1059define <4 x i32> @combine_nested_undef_test22(<4 x i32> %A, <4 x i32> %B) { 1060; SSE-LABEL: combine_nested_undef_test22: 1061; SSE: # %bb.0: 1062; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,3] 1063; SSE-NEXT: retq 1064; 1065; AVX-LABEL: combine_nested_undef_test22: 1066; AVX: # %bb.0: 1067; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[1,1,1,3] 1068; AVX-NEXT: retq 1069 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7> 1070 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 3> 1071 ret <4 x i32> %2 1072} 1073 1074define <4 x i32> @combine_nested_undef_test23(<4 x 
i32> %A, <4 x i32> %B) { 1075; SSE-LABEL: combine_nested_undef_test23: 1076; SSE: # %bb.0: 1077; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] 1078; SSE-NEXT: retq 1079; 1080; AVX-LABEL: combine_nested_undef_test23: 1081; AVX: # %bb.0: 1082; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[0,1,0,3] 1083; AVX-NEXT: retq 1084 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7> 1085 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3> 1086 ret <4 x i32> %2 1087} 1088 1089define <4 x i32> @combine_nested_undef_test24(<4 x i32> %A, <4 x i32> %B) { 1090; SSE-LABEL: combine_nested_undef_test24: 1091; SSE: # %bb.0: 1092; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,2,3] 1093; SSE-NEXT: retq 1094; 1095; AVX-LABEL: combine_nested_undef_test24: 1096; AVX: # %bb.0: 1097; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[0,3,2,3] 1098; AVX-NEXT: retq 1099 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 7> 1100 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 4> 1101 ret <4 x i32> %2 1102} 1103 1104define <4 x i32> @combine_nested_undef_test25(<4 x i32> %A, <4 x i32> %B) { 1105; SSE-LABEL: combine_nested_undef_test25: 1106; SSE: # %bb.0: 1107; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1108; SSE-NEXT: retq 1109; 1110; AVX1-LABEL: combine_nested_undef_test25: 1111; AVX1: # %bb.0: 1112; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] 1113; AVX1-NEXT: retq 1114; 1115; AVX2-LABEL: combine_nested_undef_test25: 1116; AVX2: # %bb.0: 1117; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 1118; AVX2-NEXT: retq 1119 %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 5, i32 2, i32 4> 1120 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 1> 1121 ret <4 x i32> %2 1122} 1123 1124define <4 x i32> @combine_nested_undef_test26(<4 x i32> %A, <4 x i32> %B) { 1125; SSE-LABEL: combine_nested_undef_test26: 1126; SSE: # 
%bb.0: 1127; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 1128; SSE-NEXT: retq 1129; 1130; AVX-LABEL: combine_nested_undef_test26: 1131; AVX: # %bb.0: 1132; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] 1133; AVX-NEXT: retq 1134 %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 6, i32 7> 1135 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3> 1136 ret <4 x i32> %2 1137} 1138 1139define <4 x i32> @combine_nested_undef_test27(<4 x i32> %A, <4 x i32> %B) { 1140; SSE-LABEL: combine_nested_undef_test27: 1141; SSE: # %bb.0: 1142; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1143; SSE-NEXT: retq 1144; 1145; AVX1-LABEL: combine_nested_undef_test27: 1146; AVX1: # %bb.0: 1147; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] 1148; AVX1-NEXT: retq 1149; 1150; AVX2-LABEL: combine_nested_undef_test27: 1151; AVX2: # %bb.0: 1152; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 1153; AVX2-NEXT: retq 1154 %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 2, i32 1, i32 5, i32 4> 1155 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2> 1156 ret <4 x i32> %2 1157} 1158 1159define <4 x i32> @combine_nested_undef_test28(<4 x i32> %A, <4 x i32> %B) { 1160; SSE-LABEL: combine_nested_undef_test28: 1161; SSE: # %bb.0: 1162; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0] 1163; SSE-NEXT: retq 1164; 1165; AVX-LABEL: combine_nested_undef_test28: 1166; AVX: # %bb.0: 1167; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,0] 1168; AVX-NEXT: retq 1169 %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 4, i32 5> 1170 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 2> 1171 ret <4 x i32> %2 1172} 1173 1174define <4 x float> @combine_test1(<4 x float> %a, <4 x float> %b) { 1175; SSE-LABEL: combine_test1: 1176; SSE: # %bb.0: 1177; SSE-NEXT: movaps %xmm1, %xmm0 1178; SSE-NEXT: retq 1179; 1180; AVX-LABEL: combine_test1: 1181; 
AVX: # %bb.0: 1182; AVX-NEXT: vmovaps %xmm1, %xmm0 1183; AVX-NEXT: retq 1184 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1185 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 1186 ret <4 x float> %2 1187} 1188 1189define <4 x float> @combine_test2(<4 x float> %a, <4 x float> %b) { 1190; SSE2-LABEL: combine_test2: 1191; SSE2: # %bb.0: 1192; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1193; SSE2-NEXT: movaps %xmm1, %xmm0 1194; SSE2-NEXT: retq 1195; 1196; SSSE3-LABEL: combine_test2: 1197; SSSE3: # %bb.0: 1198; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1199; SSSE3-NEXT: movaps %xmm1, %xmm0 1200; SSSE3-NEXT: retq 1201; 1202; SSE41-LABEL: combine_test2: 1203; SSE41: # %bb.0: 1204; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1205; SSE41-NEXT: retq 1206; 1207; AVX-LABEL: combine_test2: 1208; AVX: # %bb.0: 1209; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1210; AVX-NEXT: retq 1211 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 1212 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3> 1213 ret <4 x float> %2 1214} 1215 1216define <4 x float> @combine_test3(<4 x float> %a, <4 x float> %b) { 1217; SSE-LABEL: combine_test3: 1218; SSE: # %bb.0: 1219; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1220; SSE-NEXT: retq 1221; 1222; AVX-LABEL: combine_test3: 1223; AVX: # %bb.0: 1224; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1225; AVX-NEXT: retq 1226 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7> 1227 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1> 1228 ret <4 x float> %2 1229} 1230 1231define <4 x float> @combine_test4(<4 x float> %a, <4 x float> %b) { 1232; SSE-LABEL: combine_test4: 1233; SSE: # %bb.0: 1234; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] 1235; SSE-NEXT: retq 
1236; 1237; AVX-LABEL: combine_test4: 1238; AVX: # %bb.0: 1239; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 1240; AVX-NEXT: retq 1241 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5> 1242 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 1243 ret <4 x float> %2 1244} 1245 1246define <4 x float> @combine_test5(<4 x float> %a, <4 x float> %b) { 1247; SSE2-LABEL: combine_test5: 1248; SSE2: # %bb.0: 1249; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1250; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1251; SSE2-NEXT: retq 1252; 1253; SSSE3-LABEL: combine_test5: 1254; SSSE3: # %bb.0: 1255; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1256; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1257; SSSE3-NEXT: retq 1258; 1259; SSE41-LABEL: combine_test5: 1260; SSE41: # %bb.0: 1261; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1262; SSE41-NEXT: retq 1263; 1264; AVX-LABEL: combine_test5: 1265; AVX: # %bb.0: 1266; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1267; AVX-NEXT: retq 1268 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1269 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7> 1270 ret <4 x float> %2 1271} 1272 1273define <4 x i32> @combine_test6(<4 x i32> %a, <4 x i32> %b) { 1274; SSE-LABEL: combine_test6: 1275; SSE: # %bb.0: 1276; SSE-NEXT: movaps %xmm1, %xmm0 1277; SSE-NEXT: retq 1278; 1279; AVX-LABEL: combine_test6: 1280; AVX: # %bb.0: 1281; AVX-NEXT: vmovaps %xmm1, %xmm0 1282; AVX-NEXT: retq 1283 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1284 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 1285 ret <4 x i32> %2 1286} 1287 1288define <4 x i32> @combine_test7(<4 x i32> %a, <4 x i32> %b) { 1289; SSE2-LABEL: combine_test7: 1290; SSE2: # %bb.0: 1291; 
SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1292; SSE2-NEXT: movaps %xmm1, %xmm0 1293; SSE2-NEXT: retq 1294; 1295; SSSE3-LABEL: combine_test7: 1296; SSSE3: # %bb.0: 1297; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1298; SSSE3-NEXT: movaps %xmm1, %xmm0 1299; SSSE3-NEXT: retq 1300; 1301; SSE41-LABEL: combine_test7: 1302; SSE41: # %bb.0: 1303; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1304; SSE41-NEXT: retq 1305; 1306; AVX-LABEL: combine_test7: 1307; AVX: # %bb.0: 1308; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1309; AVX-NEXT: retq 1310 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 1311 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3> 1312 ret <4 x i32> %2 1313} 1314 1315define <4 x i32> @combine_test8(<4 x i32> %a, <4 x i32> %b) { 1316; SSE-LABEL: combine_test8: 1317; SSE: # %bb.0: 1318; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1319; SSE-NEXT: retq 1320; 1321; AVX-LABEL: combine_test8: 1322; AVX: # %bb.0: 1323; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1324; AVX-NEXT: retq 1325 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7> 1326 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1> 1327 ret <4 x i32> %2 1328} 1329 1330define <4 x i32> @combine_test9(<4 x i32> %a, <4 x i32> %b) { 1331; SSE-LABEL: combine_test9: 1332; SSE: # %bb.0: 1333; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 1334; SSE-NEXT: movaps %xmm1, %xmm0 1335; SSE-NEXT: retq 1336; 1337; AVX-LABEL: combine_test9: 1338; AVX: # %bb.0: 1339; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 1340; AVX-NEXT: retq 1341 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5> 1342 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 1343 ret <4 x i32> %2 1344} 1345 1346define <4 x i32> @combine_test10(<4 x i32> %a, <4 x i32> %b) { 1347; 
SSE2-LABEL: combine_test10: 1348; SSE2: # %bb.0: 1349; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1350; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1351; SSE2-NEXT: retq 1352; 1353; SSSE3-LABEL: combine_test10: 1354; SSSE3: # %bb.0: 1355; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1356; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1357; SSSE3-NEXT: retq 1358; 1359; SSE41-LABEL: combine_test10: 1360; SSE41: # %bb.0: 1361; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1362; SSE41-NEXT: retq 1363; 1364; AVX-LABEL: combine_test10: 1365; AVX: # %bb.0: 1366; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1367; AVX-NEXT: retq 1368 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1369 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7> 1370 ret <4 x i32> %2 1371} 1372 1373define <4 x float> @combine_test11(<4 x float> %a, <4 x float> %b) { 1374; CHECK-LABEL: combine_test11: 1375; CHECK: # %bb.0: 1376; CHECK-NEXT: retq 1377 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1378 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1379 ret <4 x float> %2 1380} 1381 1382define <4 x float> @combine_test12(<4 x float> %a, <4 x float> %b) { 1383; SSE2-LABEL: combine_test12: 1384; SSE2: # %bb.0: 1385; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1386; SSE2-NEXT: movaps %xmm1, %xmm0 1387; SSE2-NEXT: retq 1388; 1389; SSSE3-LABEL: combine_test12: 1390; SSSE3: # %bb.0: 1391; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1392; SSSE3-NEXT: movaps %xmm1, %xmm0 1393; SSSE3-NEXT: retq 1394; 1395; SSE41-LABEL: combine_test12: 1396; SSE41: # %bb.0: 1397; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1398; SSE41-NEXT: retq 1399; 1400; AVX-LABEL: combine_test12: 1401; AVX: # %bb.0: 1402; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1403; 
AVX-NEXT: retq 1404 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> 1405 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3> 1406 ret <4 x float> %2 1407} 1408 1409define <4 x float> @combine_test13(<4 x float> %a, <4 x float> %b) { 1410; SSE-LABEL: combine_test13: 1411; SSE: # %bb.0: 1412; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1413; SSE-NEXT: retq 1414; 1415; AVX-LABEL: combine_test13: 1416; AVX: # %bb.0: 1417; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1418; AVX-NEXT: retq 1419 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5> 1420 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3> 1421 ret <4 x float> %2 1422} 1423 1424define <4 x float> @combine_test14(<4 x float> %a, <4 x float> %b) { 1425; SSE-LABEL: combine_test14: 1426; SSE: # %bb.0: 1427; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 1428; SSE-NEXT: retq 1429; 1430; AVX-LABEL: combine_test14: 1431; AVX: # %bb.0: 1432; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 1433; AVX-NEXT: retq 1434 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5> 1435 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 1436 ret <4 x float> %2 1437} 1438 1439define <4 x float> @combine_test15(<4 x float> %a, <4 x float> %b) { 1440; SSE2-LABEL: combine_test15: 1441; SSE2: # %bb.0: 1442; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1443; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1444; SSE2-NEXT: retq 1445; 1446; SSSE3-LABEL: combine_test15: 1447; SSSE3: # %bb.0: 1448; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1449; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1450; SSSE3-NEXT: retq 1451; 1452; SSE41-LABEL: combine_test15: 1453; SSE41: # %bb.0: 1454; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1455; SSE41-NEXT: 
retq 1456; 1457; AVX-LABEL: combine_test15: 1458; AVX: # %bb.0: 1459; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1460; AVX-NEXT: retq 1461 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7> 1462 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3> 1463 ret <4 x float> %2 1464} 1465 1466define <4 x i32> @combine_test16(<4 x i32> %a, <4 x i32> %b) { 1467; CHECK-LABEL: combine_test16: 1468; CHECK: # %bb.0: 1469; CHECK-NEXT: retq 1470 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1471 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1472 ret <4 x i32> %2 1473} 1474 1475define <4 x i32> @combine_test17(<4 x i32> %a, <4 x i32> %b) { 1476; SSE2-LABEL: combine_test17: 1477; SSE2: # %bb.0: 1478; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1479; SSE2-NEXT: movaps %xmm1, %xmm0 1480; SSE2-NEXT: retq 1481; 1482; SSSE3-LABEL: combine_test17: 1483; SSSE3: # %bb.0: 1484; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1485; SSSE3-NEXT: movaps %xmm1, %xmm0 1486; SSSE3-NEXT: retq 1487; 1488; SSE41-LABEL: combine_test17: 1489; SSE41: # %bb.0: 1490; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1491; SSE41-NEXT: retq 1492; 1493; AVX-LABEL: combine_test17: 1494; AVX: # %bb.0: 1495; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1496; AVX-NEXT: retq 1497 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> 1498 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3> 1499 ret <4 x i32> %2 1500} 1501 1502define <4 x i32> @combine_test18(<4 x i32> %a, <4 x i32> %b) { 1503; SSE-LABEL: combine_test18: 1504; SSE: # %bb.0: 1505; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1506; SSE-NEXT: retq 1507; 1508; AVX-LABEL: combine_test18: 1509; AVX: # %bb.0: 1510; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1511; AVX-NEXT: 
retq 1512 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5> 1513 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3> 1514 ret <4 x i32> %2 1515} 1516 1517define <4 x i32> @combine_test19(<4 x i32> %a, <4 x i32> %b) { 1518; SSE-LABEL: combine_test19: 1519; SSE: # %bb.0: 1520; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 1521; SSE-NEXT: retq 1522; 1523; AVX-LABEL: combine_test19: 1524; AVX: # %bb.0: 1525; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 1526; AVX-NEXT: retq 1527 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5> 1528 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 1529 ret <4 x i32> %2 1530} 1531 1532define <4 x i32> @combine_test20(<4 x i32> %a, <4 x i32> %b) { 1533; SSE2-LABEL: combine_test20: 1534; SSE2: # %bb.0: 1535; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1536; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1537; SSE2-NEXT: retq 1538; 1539; SSSE3-LABEL: combine_test20: 1540; SSSE3: # %bb.0: 1541; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1542; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1543; SSSE3-NEXT: retq 1544; 1545; SSE41-LABEL: combine_test20: 1546; SSE41: # %bb.0: 1547; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1548; SSE41-NEXT: retq 1549; 1550; AVX-LABEL: combine_test20: 1551; AVX: # %bb.0: 1552; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1553; AVX-NEXT: retq 1554 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7> 1555 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3> 1556 ret <4 x i32> %2 1557} 1558 1559define <4 x i32> @combine_test21(<8 x i32> %a, <4 x i32>* %ptr) { 1560; SSE-LABEL: combine_test21: 1561; SSE: # %bb.0: 1562; SSE-NEXT: movaps %xmm0, %xmm2 1563; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] 1564; SSE-NEXT: unpckhpd 
{{.*#+}} xmm0 = xmm0[1],xmm1[1] 1565; SSE-NEXT: movaps %xmm2, (%rdi) 1566; SSE-NEXT: retq 1567; 1568; AVX-LABEL: combine_test21: 1569; AVX: # %bb.0: 1570; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 1571; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] 1572; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 1573; AVX-NEXT: vmovaps %xmm2, (%rdi) 1574; AVX-NEXT: vzeroupper 1575; AVX-NEXT: retq 1576 %1 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5> 1577 %2 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 2, i32 3, i32 6, i32 7> 1578 store <4 x i32> %1, <4 x i32>* %ptr, align 16 1579 ret <4 x i32> %2 1580} 1581 1582define <8 x float> @combine_test22(<2 x float>* %a, <2 x float>* %b) { 1583; SSE-LABEL: combine_test22: 1584; SSE: # %bb.0: 1585; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero 1586; SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] 1587; SSE-NEXT: retq 1588; 1589; AVX-LABEL: combine_test22: 1590; AVX: # %bb.0: 1591; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero 1592; AVX-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] 1593; AVX-NEXT: retq 1594; Current AVX2 lowering of this is still awful, not adding a test case. 
1595 %1 = load <2 x float>, <2 x float>* %a, align 8 1596 %2 = load <2 x float>, <2 x float>* %b, align 8 1597 %3 = shufflevector <2 x float> %1, <2 x float> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> 1598 ret <8 x float> %3 1599} 1600 1601; PR22359 1602define void @combine_test23(<8 x float> %v, <2 x float>* %ptr) { 1603; SSE-LABEL: combine_test23: 1604; SSE: # %bb.0: 1605; SSE-NEXT: movups %xmm0, (%rdi) 1606; SSE-NEXT: retq 1607; 1608; AVX-LABEL: combine_test23: 1609; AVX: # %bb.0: 1610; AVX-NEXT: vmovups %xmm0, (%rdi) 1611; AVX-NEXT: vzeroupper 1612; AVX-NEXT: retq 1613 %idx2 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 1 1614 %shuffle0 = shufflevector <8 x float> %v, <8 x float> undef, <2 x i32> <i32 0, i32 1> 1615 %shuffle1 = shufflevector <8 x float> %v, <8 x float> undef, <2 x i32> <i32 2, i32 3> 1616 store <2 x float> %shuffle0, <2 x float>* %ptr, align 8 1617 store <2 x float> %shuffle1, <2 x float>* %idx2, align 8 1618 ret void 1619} 1620 1621; Check some negative cases. 1622; FIXME: Do any of these really make sense? Are they redundant with the above tests? 
1623 1624define <4 x float> @combine_test1b(<4 x float> %a, <4 x float> %b) { 1625; SSE-LABEL: combine_test1b: 1626; SSE: # %bb.0: 1627; SSE-NEXT: movaps %xmm1, %xmm0 1628; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] 1629; SSE-NEXT: retq 1630; 1631; AVX-LABEL: combine_test1b: 1632; AVX: # %bb.0: 1633; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[0,1,2,0] 1634; AVX-NEXT: retq 1635 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1636 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 0> 1637 ret <4 x float> %2 1638} 1639 1640define <4 x float> @combine_test2b(<4 x float> %a, <4 x float> %b) { 1641; SSE2-LABEL: combine_test2b: 1642; SSE2: # %bb.0: 1643; SSE2-NEXT: movaps %xmm1, %xmm0 1644; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1645; SSE2-NEXT: retq 1646; 1647; SSSE3-LABEL: combine_test2b: 1648; SSSE3: # %bb.0: 1649; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0] 1650; SSSE3-NEXT: retq 1651; 1652; SSE41-LABEL: combine_test2b: 1653; SSE41: # %bb.0: 1654; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0] 1655; SSE41-NEXT: retq 1656; 1657; AVX-LABEL: combine_test2b: 1658; AVX: # %bb.0: 1659; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm1[0,0] 1660; AVX-NEXT: retq 1661 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1662 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 0, i32 5> 1663 ret <4 x float> %2 1664} 1665 1666define <4 x float> @combine_test3b(<4 x float> %a, <4 x float> %b) { 1667; SSE2-LABEL: combine_test3b: 1668; SSE2: # %bb.0: 1669; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0] 1670; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] 1671; SSE2-NEXT: retq 1672; 1673; SSSE3-LABEL: combine_test3b: 1674; SSSE3: # %bb.0: 1675; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0] 1676; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] 1677; SSSE3-NEXT: retq 1678; 1679; SSE41-LABEL: 
combine_test3b: 1680; SSE41: # %bb.0: 1681; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 1682; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3,2,3] 1683; SSE41-NEXT: retq 1684; 1685; AVX-LABEL: combine_test3b: 1686; AVX: # %bb.0: 1687; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 1688; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3] 1689; AVX-NEXT: retq 1690 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 6, i32 3> 1691 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 7> 1692 ret <4 x float> %2 1693} 1694 1695define <4 x float> @combine_test4b(<4 x float> %a, <4 x float> %b) { 1696; SSE-LABEL: combine_test4b: 1697; SSE: # %bb.0: 1698; SSE-NEXT: movaps %xmm1, %xmm0 1699; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3] 1700; SSE-NEXT: retq 1701; 1702; AVX-LABEL: combine_test4b: 1703; AVX: # %bb.0: 1704; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[1,1,2,3] 1705; AVX-NEXT: retq 1706 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1707 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 5, i32 5, i32 2, i32 7> 1708 ret <4 x float> %2 1709} 1710 1711 1712; Verify that we correctly fold shuffles even when we use illegal vector types. 
1713 1714define <4 x i8> @combine_test1c(<4 x i8>* %a, <4 x i8>* %b) { 1715; SSE2-LABEL: combine_test1c: 1716; SSE2: # %bb.0: 1717; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero 1718; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero 1719; SSE2-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 1720; SSE2-NEXT: andps %xmm0, %xmm2 1721; SSE2-NEXT: andnps %xmm1, %xmm0 1722; SSE2-NEXT: orps %xmm2, %xmm0 1723; SSE2-NEXT: retq 1724; 1725; SSSE3-LABEL: combine_test1c: 1726; SSSE3: # %bb.0: 1727; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1728; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1729; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1730; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u] 1731; SSSE3-NEXT: retq 1732; 1733; SSE41-LABEL: combine_test1c: 1734; SSE41: # %bb.0: 1735; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1736; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero 1737; SSE41-NEXT: movaps {{.*#+}} xmm0 = <0,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u> 1738; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 1739; SSE41-NEXT: movdqa %xmm1, %xmm0 1740; SSE41-NEXT: retq 1741; 1742; AVX-LABEL: combine_test1c: 1743; AVX: # %bb.0: 1744; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1745; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1746; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u> 1747; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 1748; AVX-NEXT: retq 1749 %A = load <4 x i8>, <4 x i8>* %a 1750 %B = load <4 x i8>, <4 x i8>* %b 1751 %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 1752 %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 6, i32 3> 1753 ret <4 x i8> %2 1754} 1755 1756define <4 x i8> @combine_test2c(<4 x i8>* %a, <4 x i8>* %b) { 
1757; SSE-LABEL: combine_test2c: 1758; SSE: # %bb.0: 1759; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1760; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1761; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1762; SSE-NEXT: retq 1763; 1764; AVX-LABEL: combine_test2c: 1765; AVX: # %bb.0: 1766; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1767; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1768; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1769; AVX-NEXT: retq 1770 %A = load <4 x i8>, <4 x i8>* %a 1771 %B = load <4 x i8>, <4 x i8>* %b 1772 %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 1, i32 5> 1773 %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 2, i32 4, i32 1> 1774 ret <4 x i8> %2 1775} 1776 1777define <4 x i8> @combine_test3c(<4 x i8>* %a, <4 x i8>* %b) { 1778; SSE-LABEL: combine_test3c: 1779; SSE: # %bb.0: 1780; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1781; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1782; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 1783; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] 1784; SSE-NEXT: retq 1785; 1786; AVX-LABEL: combine_test3c: 1787; AVX: # %bb.0: 1788; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1789; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1790; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 1791; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 1792; AVX-NEXT: retq 1793 %A = load <4 x i8>, <4 x i8>* %a 1794 %B = load <4 x i8>, <4 x i8>* %b 1795 %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 2, i32 3, i32 5, i32 5> 1796 %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 1797 ret <4 x i8> %2 1798} 1799 1800define <4 x i8> @combine_test4c(<4 x 
i8>* %a, <4 x i8>* %b) { 1801; SSE2-LABEL: combine_test4c: 1802; SSE2: # %bb.0: 1803; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero 1804; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero 1805; SSE2-NEXT: movaps {{.*#+}} xmm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 1806; SSE2-NEXT: andps %xmm0, %xmm2 1807; SSE2-NEXT: andnps %xmm1, %xmm0 1808; SSE2-NEXT: orps %xmm2, %xmm0 1809; SSE2-NEXT: retq 1810; 1811; SSSE3-LABEL: combine_test4c: 1812; SSSE3: # %bb.0: 1813; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1814; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1815; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1816; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,3,4,6,u,u,u,u,u,u,u,u,u,u,u,u] 1817; SSSE3-NEXT: retq 1818; 1819; SSE41-LABEL: combine_test4c: 1820; SSE41: # %bb.0: 1821; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1822; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero 1823; SSE41-NEXT: movaps {{.*#+}} xmm0 = <255,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u> 1824; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 1825; SSE41-NEXT: movdqa %xmm1, %xmm0 1826; SSE41-NEXT: retq 1827; 1828; AVX-LABEL: combine_test4c: 1829; AVX: # %bb.0: 1830; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1831; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1832; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <255,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u> 1833; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 1834; AVX-NEXT: retq 1835 %A = load <4 x i8>, <4 x i8>* %a 1836 %B = load <4 x i8>, <4 x i8>* %b 1837 %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1838 %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 2, i32 7> 1839 ret <4 x i8> %2 1840} 1841 1842 1843; The following test cases are generated from this C++ code 1844; 1845;__m128 blend_01(__m128 a, 
__m128 b) 1846;{ 1847; __m128 s = a; 1848; s = _mm_blend_ps( s, b, 1<<0 ); 1849; s = _mm_blend_ps( s, b, 1<<1 ); 1850; return s; 1851;} 1852; 1853;__m128 blend_02(__m128 a, __m128 b) 1854;{ 1855; __m128 s = a; 1856; s = _mm_blend_ps( s, b, 1<<0 ); 1857; s = _mm_blend_ps( s, b, 1<<2 ); 1858; return s; 1859;} 1860; 1861;__m128 blend_123(__m128 a, __m128 b) 1862;{ 1863; __m128 s = a; 1864; s = _mm_blend_ps( s, b, 1<<1 ); 1865; s = _mm_blend_ps( s, b, 1<<2 ); 1866; s = _mm_blend_ps( s, b, 1<<3 ); 1867; return s; 1868;} 1869 1870; Ideally, we should collapse the following shuffles into a single one. 1871 1872define <4 x float> @combine_blend_01(<4 x float> %a, <4 x float> %b) { 1873; SSE2-LABEL: combine_blend_01: 1874; SSE2: # %bb.0: 1875; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 1876; SSE2-NEXT: retq 1877; 1878; SSSE3-LABEL: combine_blend_01: 1879; SSSE3: # %bb.0: 1880; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 1881; SSSE3-NEXT: retq 1882; 1883; SSE41-LABEL: combine_blend_01: 1884; SSE41: # %bb.0: 1885; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 1886; SSE41-NEXT: retq 1887; 1888; AVX-LABEL: combine_blend_01: 1889; AVX: # %bb.0: 1890; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 1891; AVX-NEXT: retq 1892 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 undef, i32 2, i32 3> 1893 %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3> 1894 ret <4 x float> %shuffle6 1895} 1896 1897define <4 x float> @combine_blend_02(<4 x float> %a, <4 x float> %b) { 1898; SSE2-LABEL: combine_blend_02: 1899; SSE2: # %bb.0: 1900; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3] 1901; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3] 1902; SSE2-NEXT: movaps %xmm1, %xmm0 1903; SSE2-NEXT: retq 1904; 1905; SSSE3-LABEL: combine_blend_02: 1906; SSSE3: # %bb.0: 1907; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3] 1908; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3] 1909; 
SSSE3-NEXT: movaps %xmm1, %xmm0 1910; SSSE3-NEXT: retq 1911; 1912; SSE41-LABEL: combine_blend_02: 1913; SSE41: # %bb.0: 1914; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] 1915; SSE41-NEXT: retq 1916; 1917; AVX-LABEL: combine_blend_02: 1918; AVX: # %bb.0: 1919; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] 1920; AVX-NEXT: retq 1921 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 undef, i32 3> 1922 %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3> 1923 ret <4 x float> %shuffle6 1924} 1925 1926define <4 x float> @combine_blend_123(<4 x float> %a, <4 x float> %b) { 1927; SSE2-LABEL: combine_blend_123: 1928; SSE2: # %bb.0: 1929; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1930; SSE2-NEXT: movaps %xmm1, %xmm0 1931; SSE2-NEXT: retq 1932; 1933; SSSE3-LABEL: combine_blend_123: 1934; SSSE3: # %bb.0: 1935; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1936; SSSE3-NEXT: movaps %xmm1, %xmm0 1937; SSSE3-NEXT: retq 1938; 1939; SSE41-LABEL: combine_blend_123: 1940; SSE41: # %bb.0: 1941; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1942; SSE41-NEXT: retq 1943; 1944; AVX-LABEL: combine_blend_123: 1945; AVX: # %bb.0: 1946; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1947; AVX-NEXT: retq 1948 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef> 1949 %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 undef> 1950 %shuffle12 = shufflevector <4 x float> %shuffle6, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7> 1951 ret <4 x float> %shuffle12 1952} 1953 1954define <4 x i32> @combine_test_movhl_1(<4 x i32> %a, <4 x i32> %b) { 1955; SSE-LABEL: combine_test_movhl_1: 1956; SSE: # %bb.0: 1957; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 1958; SSE-NEXT: movaps %xmm1, %xmm0 1959; SSE-NEXT: retq 1960; 1961; 
AVX-LABEL: combine_test_movhl_1: 1962; AVX: # %bb.0: 1963; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 1964; AVX-NEXT: retq 1965 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 7, i32 5, i32 3> 1966 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 1, i32 0, i32 3> 1967 ret <4 x i32> %2 1968} 1969 1970define <4 x i32> @combine_test_movhl_2(<4 x i32> %a, <4 x i32> %b) { 1971; SSE-LABEL: combine_test_movhl_2: 1972; SSE: # %bb.0: 1973; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 1974; SSE-NEXT: movaps %xmm1, %xmm0 1975; SSE-NEXT: retq 1976; 1977; AVX-LABEL: combine_test_movhl_2: 1978; AVX: # %bb.0: 1979; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 1980; AVX-NEXT: retq 1981 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 0, i32 3, i32 6> 1982 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 3, i32 7, i32 0, i32 2> 1983 ret <4 x i32> %2 1984} 1985 1986define <4 x i32> @combine_test_movhl_3(<4 x i32> %a, <4 x i32> %b) { 1987; SSE-LABEL: combine_test_movhl_3: 1988; SSE: # %bb.0: 1989; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 1990; SSE-NEXT: movaps %xmm1, %xmm0 1991; SSE-NEXT: retq 1992; 1993; AVX-LABEL: combine_test_movhl_3: 1994; AVX: # %bb.0: 1995; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 1996; AVX-NEXT: retq 1997 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 6, i32 3, i32 2> 1998 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 0, i32 3, i32 2> 1999 ret <4 x i32> %2 2000} 2001 2002 2003; Verify that we fold shuffles according to rule: 2004; (shuffle(shuffle A, Undef, M0), B, M1) -> (shuffle A, B, M2) 2005 2006define <4 x float> @combine_undef_input_test1(<4 x float> %a, <4 x float> %b) { 2007; SSE2-LABEL: combine_undef_input_test1: 2008; SSE2: # %bb.0: 2009; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2010; SSE2-NEXT: retq 2011; 2012; SSSE3-LABEL: combine_undef_input_test1: 2013; SSSE3: # %bb.0: 2014; 
SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2015; SSSE3-NEXT: retq 2016; 2017; SSE41-LABEL: combine_undef_input_test1: 2018; SSE41: # %bb.0: 2019; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 2020; SSE41-NEXT: retq 2021; 2022; AVX-LABEL: combine_undef_input_test1: 2023; AVX: # %bb.0: 2024; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 2025; AVX-NEXT: retq 2026 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1> 2027 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 1, i32 2> 2028 ret <4 x float> %2 2029} 2030 2031define <4 x float> @combine_undef_input_test2(<4 x float> %a, <4 x float> %b) { 2032; SSE-LABEL: combine_undef_input_test2: 2033; SSE: # %bb.0: 2034; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2035; SSE-NEXT: retq 2036; 2037; AVX-LABEL: combine_undef_input_test2: 2038; AVX: # %bb.0: 2039; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2040; AVX-NEXT: retq 2041 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7> 2042 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 1, i32 2, i32 4, i32 5> 2043 ret <4 x float> %2 2044} 2045 2046define <4 x float> @combine_undef_input_test3(<4 x float> %a, <4 x float> %b) { 2047; SSE-LABEL: combine_undef_input_test3: 2048; SSE: # %bb.0: 2049; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2050; SSE-NEXT: retq 2051; 2052; AVX-LABEL: combine_undef_input_test3: 2053; AVX: # %bb.0: 2054; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2055; AVX-NEXT: retq 2056 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7> 2057 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1> 2058 ret <4 x float> %2 2059} 2060 2061define <4 x float> @combine_undef_input_test4(<4 x float> %a, <4 x float> %b) { 2062; SSE-LABEL: combine_undef_input_test4: 2063; SSE: # %bb.0: 2064; SSE-NEXT: movhlps {{.*#+}} xmm0 
= xmm1[1],xmm0[1] 2065; SSE-NEXT: retq 2066; 2067; AVX-LABEL: combine_undef_input_test4: 2068; AVX: # %bb.0: 2069; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 2070; AVX-NEXT: retq 2071 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5> 2072 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 2073 ret <4 x float> %2 2074} 2075 2076define <4 x float> @combine_undef_input_test5(<4 x float> %a, <4 x float> %b) { 2077; SSE2-LABEL: combine_undef_input_test5: 2078; SSE2: # %bb.0: 2079; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 2080; SSE2-NEXT: retq 2081; 2082; SSSE3-LABEL: combine_undef_input_test5: 2083; SSSE3: # %bb.0: 2084; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 2085; SSSE3-NEXT: retq 2086; 2087; SSE41-LABEL: combine_undef_input_test5: 2088; SSE41: # %bb.0: 2089; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 2090; SSE41-NEXT: retq 2091; 2092; AVX-LABEL: combine_undef_input_test5: 2093; AVX: # %bb.0: 2094; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 2095; AVX-NEXT: retq 2096 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3> 2097 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 6, i32 7> 2098 ret <4 x float> %2 2099} 2100 2101 2102; Verify that we fold shuffles according to rule: 2103; (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2) 2104 2105define <4 x float> @combine_undef_input_test6(<4 x float> %a) { 2106; CHECK-LABEL: combine_undef_input_test6: 2107; CHECK: # %bb.0: 2108; CHECK-NEXT: retq 2109 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1> 2110 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 1, i32 2> 2111 ret <4 x float> %2 2112} 2113 2114define <4 x float> @combine_undef_input_test7(<4 x float> %a) { 2115; SSE2-LABEL: combine_undef_input_test7: 2116; SSE2: # 
%bb.0: 2117; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] 2118; SSE2-NEXT: retq 2119; 2120; SSSE3-LABEL: combine_undef_input_test7: 2121; SSSE3: # %bb.0: 2122; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2123; SSSE3-NEXT: retq 2124; 2125; SSE41-LABEL: combine_undef_input_test7: 2126; SSE41: # %bb.0: 2127; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2128; SSE41-NEXT: retq 2129; 2130; AVX-LABEL: combine_undef_input_test7: 2131; AVX: # %bb.0: 2132; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 2133; AVX-NEXT: retq 2134 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7> 2135 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 1, i32 2, i32 4, i32 5> 2136 ret <4 x float> %2 2137} 2138 2139define <4 x float> @combine_undef_input_test8(<4 x float> %a) { 2140; SSE2-LABEL: combine_undef_input_test8: 2141; SSE2: # %bb.0: 2142; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] 2143; SSE2-NEXT: retq 2144; 2145; SSSE3-LABEL: combine_undef_input_test8: 2146; SSSE3: # %bb.0: 2147; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2148; SSSE3-NEXT: retq 2149; 2150; SSE41-LABEL: combine_undef_input_test8: 2151; SSE41: # %bb.0: 2152; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2153; SSE41-NEXT: retq 2154; 2155; AVX-LABEL: combine_undef_input_test8: 2156; AVX: # %bb.0: 2157; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 2158; AVX-NEXT: retq 2159 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7> 2160 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 1> 2161 ret <4 x float> %2 2162} 2163 2164define <4 x float> @combine_undef_input_test9(<4 x float> %a) { 2165; SSE-LABEL: combine_undef_input_test9: 2166; SSE: # %bb.0: 2167; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] 2168; SSE-NEXT: retq 2169; 2170; AVX-LABEL: combine_undef_input_test9: 2171; AVX: # %bb.0: 2172; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1] 2173; AVX-NEXT: retq 2174 %1 = shufflevector <4 x 
float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5> 2175 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 2176 ret <4 x float> %2 2177} 2178 2179define <4 x float> @combine_undef_input_test10(<4 x float> %a) { 2180; CHECK-LABEL: combine_undef_input_test10: 2181; CHECK: # %bb.0: 2182; CHECK-NEXT: retq 2183 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3> 2184 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 6, i32 7> 2185 ret <4 x float> %2 2186} 2187 2188define <4 x float> @combine_undef_input_test11(<4 x float> %a, <4 x float> %b) { 2189; SSE2-LABEL: combine_undef_input_test11: 2190; SSE2: # %bb.0: 2191; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2192; SSE2-NEXT: retq 2193; 2194; SSSE3-LABEL: combine_undef_input_test11: 2195; SSSE3: # %bb.0: 2196; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2197; SSSE3-NEXT: retq 2198; 2199; SSE41-LABEL: combine_undef_input_test11: 2200; SSE41: # %bb.0: 2201; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 2202; SSE41-NEXT: retq 2203; 2204; AVX-LABEL: combine_undef_input_test11: 2205; AVX: # %bb.0: 2206; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 2207; AVX-NEXT: retq 2208 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1> 2209 %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 6> 2210 ret <4 x float> %2 2211} 2212 2213define <4 x float> @combine_undef_input_test12(<4 x float> %a, <4 x float> %b) { 2214; SSE-LABEL: combine_undef_input_test12: 2215; SSE: # %bb.0: 2216; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2217; SSE-NEXT: retq 2218; 2219; AVX-LABEL: combine_undef_input_test12: 2220; AVX: # %bb.0: 2221; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2222; AVX-NEXT: retq 2223 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7> 2224 %2 = 
shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1> 2225 ret <4 x float> %2 2226} 2227 2228define <4 x float> @combine_undef_input_test13(<4 x float> %a, <4 x float> %b) { 2229; SSE-LABEL: combine_undef_input_test13: 2230; SSE: # %bb.0: 2231; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2232; SSE-NEXT: retq 2233; 2234; AVX-LABEL: combine_undef_input_test13: 2235; AVX: # %bb.0: 2236; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2237; AVX-NEXT: retq 2238 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7> 2239 %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 5, i32 0, i32 5> 2240 ret <4 x float> %2 2241} 2242 2243define <4 x float> @combine_undef_input_test14(<4 x float> %a, <4 x float> %b) { 2244; SSE-LABEL: combine_undef_input_test14: 2245; SSE: # %bb.0: 2246; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] 2247; SSE-NEXT: retq 2248; 2249; AVX-LABEL: combine_undef_input_test14: 2250; AVX: # %bb.0: 2251; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 2252; AVX-NEXT: retq 2253 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5> 2254 %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5> 2255 ret <4 x float> %2 2256} 2257 2258define <4 x float> @combine_undef_input_test15(<4 x float> %a, <4 x float> %b) { 2259; SSE2-LABEL: combine_undef_input_test15: 2260; SSE2: # %bb.0: 2261; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 2262; SSE2-NEXT: retq 2263; 2264; SSSE3-LABEL: combine_undef_input_test15: 2265; SSSE3: # %bb.0: 2266; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 2267; SSSE3-NEXT: retq 2268; 2269; SSE41-LABEL: combine_undef_input_test15: 2270; SSE41: # %bb.0: 2271; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 2272; SSE41-NEXT: retq 2273; 2274; AVX-LABEL: combine_undef_input_test15: 2275; AVX: # %bb.0: 2276; AVX-NEXT: vblendps {{.*#+}} xmm0 = 
xmm0[0,1],xmm1[2,3] 2277; AVX-NEXT: retq 2278 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3> 2279 %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3> 2280 ret <4 x float> %2 2281} 2282 2283 2284; Verify that shuffles are canonicalized according to rules: 2285; shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B) 2286; 2287; This allows to trigger the following combine rule: 2288; (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2) 2289; 2290; As a result, all the shuffle pairs in each function below should be 2291; combined into a single legal shuffle operation. 2292 2293define <4 x float> @combine_undef_input_test16(<4 x float> %a) { 2294; CHECK-LABEL: combine_undef_input_test16: 2295; CHECK: # %bb.0: 2296; CHECK-NEXT: retq 2297 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1> 2298 %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 3> 2299 ret <4 x float> %2 2300} 2301 2302define <4 x float> @combine_undef_input_test17(<4 x float> %a) { 2303; SSE2-LABEL: combine_undef_input_test17: 2304; SSE2: # %bb.0: 2305; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] 2306; SSE2-NEXT: retq 2307; 2308; SSSE3-LABEL: combine_undef_input_test17: 2309; SSSE3: # %bb.0: 2310; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2311; SSSE3-NEXT: retq 2312; 2313; SSE41-LABEL: combine_undef_input_test17: 2314; SSE41: # %bb.0: 2315; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2316; SSE41-NEXT: retq 2317; 2318; AVX-LABEL: combine_undef_input_test17: 2319; AVX: # %bb.0: 2320; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 2321; AVX-NEXT: retq 2322 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7> 2323 %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1> 2324 ret <4 x float> %2 2325} 2326 2327define <4 x float> 
@combine_undef_input_test18(<4 x float> %a) { 2328; SSE2-LABEL: combine_undef_input_test18: 2329; SSE2: # %bb.0: 2330; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] 2331; SSE2-NEXT: retq 2332; 2333; SSSE3-LABEL: combine_undef_input_test18: 2334; SSSE3: # %bb.0: 2335; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2336; SSSE3-NEXT: retq 2337; 2338; SSE41-LABEL: combine_undef_input_test18: 2339; SSE41: # %bb.0: 2340; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2341; SSE41-NEXT: retq 2342; 2343; AVX-LABEL: combine_undef_input_test18: 2344; AVX: # %bb.0: 2345; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 2346; AVX-NEXT: retq 2347 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7> 2348 %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 0, i32 5> 2349 ret <4 x float> %2 2350} 2351 2352define <4 x float> @combine_undef_input_test19(<4 x float> %a) { 2353; SSE-LABEL: combine_undef_input_test19: 2354; SSE: # %bb.0: 2355; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] 2356; SSE-NEXT: retq 2357; 2358; AVX-LABEL: combine_undef_input_test19: 2359; AVX: # %bb.0: 2360; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1] 2361; AVX-NEXT: retq 2362 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5> 2363 %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5> 2364 ret <4 x float> %2 2365} 2366 2367define <4 x float> @combine_undef_input_test20(<4 x float> %a) { 2368; CHECK-LABEL: combine_undef_input_test20: 2369; CHECK: # %bb.0: 2370; CHECK-NEXT: retq 2371 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3> 2372 %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3> 2373 ret <4 x float> %2 2374} 2375 2376; These tests are designed to test the ability to combine away unnecessary 2377; operations feeding into a shuffle. 
The AVX cases are the important ones as 2378; they leverage operations which cannot be done naturally on the entire vector 2379; and thus are decomposed into multiple smaller operations. 2380 2381define <8 x i32> @combine_unneeded_subvector1(<8 x i32> %a) { 2382; SSE-LABEL: combine_unneeded_subvector1: 2383; SSE: # %bb.0: 2384; SSE-NEXT: paddd {{.*}}(%rip), %xmm1 2385; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,2,1,0] 2386; SSE-NEXT: movdqa %xmm0, %xmm1 2387; SSE-NEXT: retq 2388; 2389; AVX1-LABEL: combine_unneeded_subvector1: 2390; AVX1: # %bb.0: 2391; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2392; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 2393; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 2394; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] 2395; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] 2396; AVX1-NEXT: retq 2397; 2398; AVX2-SLOW-LABEL: combine_unneeded_subvector1: 2399; AVX2-SLOW: # %bb.0: 2400; AVX2-SLOW-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 2401; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] 2402; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] 2403; AVX2-SLOW-NEXT: retq 2404; 2405; AVX2-FAST-LABEL: combine_unneeded_subvector1: 2406; AVX2-FAST: # %bb.0: 2407; AVX2-FAST-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 2408; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] 2409; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] 2410; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 2411; AVX2-FAST-NEXT: retq 2412 %b = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> 2413 %c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4> 2414 ret <8 x i32> %c 2415} 2416 2417define <8 x i32> @combine_unneeded_subvector2(<8 x i32> %a, <8 x i32> %b) { 2418; SSE-LABEL: combine_unneeded_subvector2: 2419; SSE: # %bb.0: 2420; SSE-NEXT: paddd {{.*}}(%rip), %xmm1 2421; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,2,1,0] 2422; SSE-NEXT: pshufd {{.*#+}} xmm1 = 
xmm1[3,2,1,0] 2423; SSE-NEXT: retq 2424; 2425; AVX1-LABEL: combine_unneeded_subvector2: 2426; AVX1: # %bb.0: 2427; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2428; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 2429; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 2430; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] 2431; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] 2432; AVX1-NEXT: retq 2433; 2434; AVX2-LABEL: combine_unneeded_subvector2: 2435; AVX2: # %bb.0: 2436; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 2437; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] 2438; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] 2439; AVX2-NEXT: retq 2440 %c = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> 2441 %d = shufflevector <8 x i32> %b, <8 x i32> %c, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12> 2442 ret <8 x i32> %d 2443} 2444 2445define <4 x float> @combine_insertps1(<4 x float> %a, <4 x float> %b) { 2446; SSE2-LABEL: combine_insertps1: 2447; SSE2: # %bb.0: 2448; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0] 2449; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3] 2450; SSE2-NEXT: movaps %xmm1, %xmm0 2451; SSE2-NEXT: retq 2452; 2453; SSSE3-LABEL: combine_insertps1: 2454; SSSE3: # %bb.0: 2455; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0] 2456; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3] 2457; SSSE3-NEXT: movaps %xmm1, %xmm0 2458; SSSE3-NEXT: retq 2459; 2460; SSE41-LABEL: combine_insertps1: 2461; SSE41: # %bb.0: 2462; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3] 2463; SSE41-NEXT: retq 2464; 2465; AVX-LABEL: combine_insertps1: 2466; AVX: # %bb.0: 2467; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3] 2468; AVX-NEXT: retq 2469 2470 %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 6, i32 2, i32 4> 2471 %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 5, i32 1, i32 6, i32 3> 2472 ret <4 x float> 
%d 2473} 2474 2475define <4 x float> @combine_insertps2(<4 x float> %a, <4 x float> %b) { 2476; SSE2-LABEL: combine_insertps2: 2477; SSE2: # %bb.0: 2478; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0] 2479; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] 2480; SSE2-NEXT: movaps %xmm1, %xmm0 2481; SSE2-NEXT: retq 2482; 2483; SSSE3-LABEL: combine_insertps2: 2484; SSSE3: # %bb.0: 2485; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0] 2486; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] 2487; SSSE3-NEXT: movaps %xmm1, %xmm0 2488; SSSE3-NEXT: retq 2489; 2490; SSE41-LABEL: combine_insertps2: 2491; SSE41: # %bb.0: 2492; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3] 2493; SSE41-NEXT: retq 2494; 2495; AVX-LABEL: combine_insertps2: 2496; AVX: # %bb.0: 2497; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3] 2498; AVX-NEXT: retq 2499 2500 %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 1, i32 6, i32 7> 2501 %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 4, i32 6, i32 2, i32 3> 2502 ret <4 x float> %d 2503} 2504 2505define <4 x float> @combine_insertps3(<4 x float> %a, <4 x float> %b) { 2506; SSE2-LABEL: combine_insertps3: 2507; SSE2: # %bb.0: 2508; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] 2509; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] 2510; SSE2-NEXT: retq 2511; 2512; SSSE3-LABEL: combine_insertps3: 2513; SSSE3: # %bb.0: 2514; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] 2515; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] 2516; SSSE3-NEXT: retq 2517; 2518; SSE41-LABEL: combine_insertps3: 2519; SSE41: # %bb.0: 2520; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] 2521; SSE41-NEXT: retq 2522; 2523; AVX-LABEL: combine_insertps3: 2524; AVX: # %bb.0: 2525; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] 2526; AVX-NEXT: retq 2527 2528 %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 
2, i32 5> 2529 %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 5, i32 3> 2530 ret <4 x float> %d 2531} 2532 2533define <4 x float> @combine_insertps4(<4 x float> %a, <4 x float> %b) { 2534; SSE2-LABEL: combine_insertps4: 2535; SSE2: # %bb.0: 2536; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] 2537; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] 2538; SSE2-NEXT: retq 2539; 2540; SSSE3-LABEL: combine_insertps4: 2541; SSSE3: # %bb.0: 2542; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] 2543; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] 2544; SSSE3-NEXT: retq 2545; 2546; SSE41-LABEL: combine_insertps4: 2547; SSE41: # %bb.0: 2548; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] 2549; SSE41-NEXT: retq 2550; 2551; AVX-LABEL: combine_insertps4: 2552; AVX: # %bb.0: 2553; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] 2554; AVX-NEXT: retq 2555 2556 %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5> 2557 %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 6, i32 5> 2558 ret <4 x float> %d 2559} 2560 2561define void @combine_scalar_load_with_blend_with_zero(double* %a0, <4 x float>* %a1) { 2562; SSE-LABEL: combine_scalar_load_with_blend_with_zero: 2563; SSE: # %bb.0: 2564; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero 2565; SSE-NEXT: movaps %xmm0, (%rsi) 2566; SSE-NEXT: retq 2567; 2568; AVX-LABEL: combine_scalar_load_with_blend_with_zero: 2569; AVX: # %bb.0: 2570; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero 2571; AVX-NEXT: vmovaps %xmm0, (%rsi) 2572; AVX-NEXT: retq 2573 %1 = load double, double* %a0, align 8 2574 %2 = insertelement <2 x double> undef, double %1, i32 0 2575 %3 = insertelement <2 x double> %2, double 0.000000e+00, i32 1 2576 %4 = bitcast <2 x double> %3 to <4 x float> 2577 %5 = shufflevector <4 x float> %4, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 3> 
2578 store <4 x float> %5, <4 x float>* %a1, align 16 2579 ret void 2580} 2581 2582; PR30371 2583define <4 x float> @combine_constant_insertion_v4f32(float %f) { 2584; SSE2-LABEL: combine_constant_insertion_v4f32: 2585; SSE2: # %bb.0: 2586; SSE2-NEXT: movaps {{.*#+}} xmm1 = <u,4.0E+0,5.0E+0,3.0E+0> 2587; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 2588; SSE2-NEXT: movaps %xmm1, %xmm0 2589; SSE2-NEXT: retq 2590; 2591; SSSE3-LABEL: combine_constant_insertion_v4f32: 2592; SSSE3: # %bb.0: 2593; SSSE3-NEXT: movaps {{.*#+}} xmm1 = <u,4.0E+0,5.0E+0,3.0E+0> 2594; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 2595; SSSE3-NEXT: movaps %xmm1, %xmm0 2596; SSSE3-NEXT: retq 2597; 2598; SSE41-LABEL: combine_constant_insertion_v4f32: 2599; SSE41: # %bb.0: 2600; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3] 2601; SSE41-NEXT: retq 2602; 2603; AVX-LABEL: combine_constant_insertion_v4f32: 2604; AVX: # %bb.0: 2605; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3] 2606; AVX-NEXT: retq 2607 %a0 = insertelement <4 x float> undef, float %f, i32 0 2608 %ret = shufflevector <4 x float> %a0, <4 x float> <float undef, float 4.0, float 5.0, float 3.0>, <4 x i32> <i32 0, i32 5, i32 6, i32 7> 2609 ret <4 x float> %ret 2610} 2611 2612define <4 x i32> @combine_constant_insertion_v4i32(i32 %f) { 2613; SSE2-LABEL: combine_constant_insertion_v4i32: 2614; SSE2: # %bb.0: 2615; SSE2-NEXT: movd %edi, %xmm1 2616; SSE2-NEXT: movaps {{.*#+}} xmm0 = <u,4,5,30> 2617; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] 2618; SSE2-NEXT: retq 2619; 2620; SSSE3-LABEL: combine_constant_insertion_v4i32: 2621; SSSE3: # %bb.0: 2622; SSSE3-NEXT: movd %edi, %xmm1 2623; SSSE3-NEXT: movaps {{.*#+}} xmm0 = <u,4,5,30> 2624; SSSE3-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] 2625; SSSE3-NEXT: retq 2626; 2627; SSE41-LABEL: combine_constant_insertion_v4i32: 2628; SSE41: # %bb.0: 2629; SSE41-NEXT: movdqa {{.*#+}} xmm0 = <u,4,5,30> 2630; SSE41-NEXT: pinsrd $0, %edi, %xmm0 2631; SSE41-NEXT: retq 
2632; 2633; AVX-LABEL: combine_constant_insertion_v4i32: 2634; AVX: # %bb.0: 2635; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = <u,4,5,30> 2636; AVX-NEXT: vpinsrd $0, %edi, %xmm0, %xmm0 2637; AVX-NEXT: retq 2638 %a0 = insertelement <4 x i32> undef, i32 %f, i32 0 2639 %ret = shufflevector <4 x i32> %a0, <4 x i32> <i32 undef, i32 4, i32 5, i32 30>, <4 x i32> <i32 0, i32 5, i32 6, i32 7> 2640 ret <4 x i32> %ret 2641} 2642 2643define <4 x float> @PR22377(<4 x float> %a, <4 x float> %b) { 2644; SSE2-LABEL: PR22377: 2645; SSE2: # %bb.0: # %entry 2646; SSE2-NEXT: movaps %xmm0, %xmm1 2647; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[2,3] 2648; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,0,2] 2649; SSE2-NEXT: addps %xmm0, %xmm1 2650; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2651; SSE2-NEXT: retq 2652; 2653; SSSE3-LABEL: PR22377: 2654; SSSE3: # %bb.0: # %entry 2655; SSSE3-NEXT: movaps %xmm0, %xmm1 2656; SSSE3-NEXT: haddps %xmm0, %xmm1 2657; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1] 2658; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] 2659; SSSE3-NEXT: retq 2660; 2661; SSE41-LABEL: PR22377: 2662; SSE41: # %bb.0: # %entry 2663; SSE41-NEXT: movaps %xmm0, %xmm1 2664; SSE41-NEXT: haddps %xmm0, %xmm1 2665; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1] 2666; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] 2667; SSE41-NEXT: retq 2668; 2669; AVX-LABEL: PR22377: 2670; AVX: # %bb.0: # %entry 2671; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm1 2672; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1] 2673; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] 2674; AVX-NEXT: retq 2675entry: 2676 %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 1, i32 3> 2677 %s2 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2> 2678 %r2 = fadd <4 x float> %s1, %s2 2679 %s3 = shufflevector <4 x float> %s2, <4 x float> %r2, <4 x i32> <i32 0, i32 4, i32 1, i32 5> 2680 ret <4 x float> %s3 2681} 2682 
2683define <4 x float> @PR22390(<4 x float> %a, <4 x float> %b) { 2684; SSE2-LABEL: PR22390: 2685; SSE2: # %bb.0: # %entry 2686; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0,1,2] 2687; SSE2-NEXT: movaps %xmm0, %xmm2 2688; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] 2689; SSE2-NEXT: addps %xmm0, %xmm2 2690; SSE2-NEXT: movaps %xmm2, %xmm0 2691; SSE2-NEXT: retq 2692; 2693; SSSE3-LABEL: PR22390: 2694; SSSE3: # %bb.0: # %entry 2695; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0,1,2] 2696; SSSE3-NEXT: movaps %xmm0, %xmm2 2697; SSSE3-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] 2698; SSSE3-NEXT: addps %xmm0, %xmm2 2699; SSSE3-NEXT: movaps %xmm2, %xmm0 2700; SSSE3-NEXT: retq 2701; 2702; SSE41-LABEL: PR22390: 2703; SSE41: # %bb.0: # %entry 2704; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0,1,2] 2705; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3] 2706; SSE41-NEXT: addps %xmm1, %xmm0 2707; SSE41-NEXT: retq 2708; 2709; AVX-LABEL: PR22390: 2710; AVX: # %bb.0: # %entry 2711; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,0,1,2] 2712; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3] 2713; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 2714; AVX-NEXT: retq 2715entry: 2716 %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2> 2717 %s2 = shufflevector <4 x float> %s1, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 2, i32 3> 2718 %r2 = fadd <4 x float> %s1, %s2 2719 ret <4 x float> %r2 2720} 2721 2722define <8 x float> @PR22412(<8 x float> %a, <8 x float> %b) { 2723; SSE-LABEL: PR22412: 2724; SSE: # %bb.0: # %entry 2725; SSE-NEXT: movaps %xmm3, %xmm1 2726; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2] 2727; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[3,2] 2728; SSE-NEXT: retq 2729; 2730; AVX1-LABEL: PR22412: 2731; AVX1: # %bb.0: # %entry 2732; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] 2733; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 2734; AVX1-NEXT: vshufps {{.*#+}} ymm0 = 
ymm0[1,0],ymm2[3,2],ymm0[5,4],ymm2[7,6] 2735; AVX1-NEXT: retq 2736; 2737; AVX2-LABEL: PR22412: 2738; AVX2: # %bb.0: # %entry 2739; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 2740; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,3,0,1] 2741; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[3,2],ymm0[5,4],ymm1[7,6] 2742; AVX2-NEXT: retq 2743entry: 2744 %s1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 2745 %s2 = shufflevector <8 x float> %s1, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2> 2746 ret <8 x float> %s2 2747} 2748 2749define <4 x float> @PR30264(<4 x float> %x) { 2750; SSE2-LABEL: PR30264: 2751; SSE2: # %bb.0: 2752; SSE2-NEXT: xorps %xmm1, %xmm1 2753; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 2754; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],mem[2,3] 2755; SSE2-NEXT: movaps %xmm1, %xmm0 2756; SSE2-NEXT: retq 2757; 2758; SSSE3-LABEL: PR30264: 2759; SSSE3: # %bb.0: 2760; SSSE3-NEXT: xorps %xmm1, %xmm1 2761; SSSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 2762; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],mem[2,3] 2763; SSSE3-NEXT: movaps %xmm1, %xmm0 2764; SSSE3-NEXT: retq 2765; 2766; SSE41-LABEL: PR30264: 2767; SSE41: # %bb.0: 2768; SSE41-NEXT: movaps {{.*#+}} xmm1 = <u,u,4.0E+0,1.0E+0> 2769; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm0[0],zero,xmm1[2,3] 2770; SSE41-NEXT: movaps %xmm1, %xmm0 2771; SSE41-NEXT: retq 2772; 2773; AVX-LABEL: PR30264: 2774; AVX: # %bb.0: 2775; AVX-NEXT: vmovaps {{.*#+}} xmm1 = <u,u,4.0E+0,1.0E+0> 2776; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2,3] 2777; AVX-NEXT: retq 2778 %shuf1 = shufflevector <4 x float> %x, <4 x float> <float undef, float 0.0, float undef, float undef>, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef> 2779 %shuf2 = shufflevector <4 x float> %shuf1, <4 x float> <float undef, float undef, float 4.0, float 1.0>, <4 x i32> <i32 0, i32 1, i32 6, i32 7> 2780 ret <4 x 
float> %shuf2 2781} 2782 2783define <8 x i16> @PR39549(<16 x i8> %x) { 2784; SSE-LABEL: PR39549: 2785; SSE: # %bb.0: 2786; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2787; SSE-NEXT: psraw $8, %xmm0 2788; SSE-NEXT: retq 2789; 2790; AVX-LABEL: PR39549: 2791; AVX: # %bb.0: 2792; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2793; AVX-NEXT: vpsraw $8, %xmm0, %xmm0 2794; AVX-NEXT: retq 2795 %a = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 8, i32 undef, i32 9, i32 undef, i32 10, i32 undef, i32 11, i32 undef, i32 12, i32 undef, i32 13, i32 undef, i32 14, i32 undef, i32 15, i32 undef> 2796 %b = bitcast <16 x i8> %a to <8 x i16> 2797 %c = shl <8 x i16> %b, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> 2798 %d = ashr <8 x i16> %c, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> 2799 ret <8 x i16> %d 2800} 2801 2802define <4 x i32> @PR41545(<4 x i32> %a0, <16 x i8> %a1) { 2803; SSE-LABEL: PR41545: 2804; SSE: # %bb.0: 2805; SSE-NEXT: paddd %xmm1, %xmm0 2806; SSE-NEXT: retq 2807; 2808; AVX-LABEL: PR41545: 2809; AVX: # %bb.0: 2810; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 2811; AVX-NEXT: retq 2812 %1 = shufflevector <16 x i8> %a1, <16 x i8> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12> 2813 %2 = shufflevector <16 x i8> %a1, <16 x i8> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13> 2814 %3 = shufflevector <16 x i8> %a1, <16 x i8> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14> 2815 %4 = shufflevector <16 x i8> %a1, <16 x i8> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15> 2816 %5 = zext <4 x i8> %1 to <4 x i32> 2817 %6 = zext <4 x i8> %2 to <4 x i32> 2818 %7 = zext <4 x i8> %3 to <4 x i32> 2819 %8 = zext <4 x i8> %4 to <4 x i32> 2820 %9 = shl <4 x i32> %6, <i32 8, i32 8, i32 8, i32 8> 2821 %10 = shl <4 x i32> %7, <i32 16, i32 16, i32 16, i32 16> 2822 %11 = shl <4 x i32> %8, <i32 24, i32 24, i32 24, i32 24> 2823 %12 = or <4 x i32> %5, %9 2824 %13 = or <4 x i32> %12, 
%10 2825 %14 = or <4 x i32> %13, %11 2826 %15 = add <4 x i32> %a0, %14 2827 ret <4 x i32> %15 2828} 2829 2830define <8 x i16> @shuffle_extract_insert(<8 x i16> %a) { 2831; SSE-LABEL: shuffle_extract_insert: 2832; SSE: # %bb.0: 2833; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] 2834; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] 2835; SSE-NEXT: retq 2836; 2837; AVX1-LABEL: shuffle_extract_insert: 2838; AVX1: # %bb.0: 2839; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] 2840; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] 2841; AVX1-NEXT: retq 2842; 2843; AVX2-SLOW-LABEL: shuffle_extract_insert: 2844; AVX2-SLOW: # %bb.0: 2845; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] 2846; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] 2847; AVX2-SLOW-NEXT: retq 2848; 2849; AVX2-FAST-LABEL: shuffle_extract_insert: 2850; AVX2-FAST: # %bb.0: 2851; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,0,1,6,7,12,13,10,11,8,9,14,15] 2852; AVX2-FAST-NEXT: retq 2853 %a0 = extractelement <8 x i16> %a, i32 0 2854 %a1 = extractelement <8 x i16> %a, i32 1 2855 %a3 = extractelement <8 x i16> %a, i32 3 2856 %a4 = extractelement <8 x i16> %a, i32 4 2857 %a5 = extractelement <8 x i16> %a, i32 5 2858 %a6 = extractelement <8 x i16> %a, i32 6 2859 %a7 = extractelement <8 x i16> %a, i32 7 2860 %1 = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2861 %2 = insertelement <8 x i16> %1, i16 %a1, i32 1 2862 %3 = insertelement <8 x i16> %2, i16 %a0, i32 2 2863 %4 = insertelement <8 x i16> %3, i16 %a3, i32 3 2864 %5 = insertelement <8 x i16> %4, i16 %a6, i32 4 2865 %6 = insertelement <8 x i16> %5, i16 %a5, i32 5 2866 %7 = insertelement <8 x i16> %6, i16 %a4, i32 6 2867 %8 = insertelement <8 x i16> %7, i16 %a7, i32 7 2868 ret <8 x i16> %8 2869} 2870 2871define <8 x i16> @shuffle_extract_insert_double(<8 x i16> %a, <8 x i16> %b) { 2872; 
SSE2-LABEL: shuffle_extract_insert_double: 2873; SSE2: # %bb.0: 2874; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] 2875; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] 2876; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 2877; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] 2878; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 2879; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 2880; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2881; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] 2882; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 2883; SSE2-NEXT: retq 2884; 2885; SSSE3-LABEL: shuffle_extract_insert_double: 2886; SSSE3: # %bb.0: 2887; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u] 2888; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] 2889; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 2890; SSSE3-NEXT: retq 2891; 2892; SSE41-LABEL: shuffle_extract_insert_double: 2893; SSE41: # %bb.0: 2894; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u] 2895; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] 2896; SSE41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 2897; SSE41-NEXT: retq 2898; 2899; AVX-LABEL: shuffle_extract_insert_double: 2900; AVX: # %bb.0: 2901; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u] 2902; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] 2903; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 2904; AVX-NEXT: retq 2905 %a0 = extractelement <8 x i16> %a, i32 0 2906 %a4 = extractelement <8 x i16> %a, i32 4 2907 %a6 = extractelement <8 x i16> %a, i32 6 2908 %b11 = extractelement <8 x i16> %b, i32 3 2909 %b13 = 
extractelement <8 x i16> %b, i32 5 2910 %b15 = extractelement <8 x i16> %b, i32 7 2911 %1 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2912 %2 = insertelement <8 x i16> %1, i16 %a0, i32 2 2913 %3 = insertelement <8 x i16> %2, i16 %b11, i32 3 2914 %4 = insertelement <8 x i16> %3, i16 %a6, i32 4 2915 %5 = insertelement <8 x i16> %4, i16 %b13, i32 5 2916 %6 = insertelement <8 x i16> %5, i16 %a4, i32 6 2917 %7 = insertelement <8 x i16> %6, i16 %b15, i32 7 2918 ret <8 x i16> %7 2919} 2920 2921define <8 x i16> @shuffle_extract_concat_insert(<4 x i16> %lhsa, <4 x i16> %rhsa, <8 x i16> %b) { 2922; SSE2-LABEL: shuffle_extract_concat_insert: 2923; SSE2: # %bb.0: 2924; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2925; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 2926; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 2927; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2928; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] 2929; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,3,4,5,6,7] 2930; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] 2931; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 2932; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] 2933; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 2934; SSE2-NEXT: retq 2935; 2936; SSSE3-LABEL: shuffle_extract_concat_insert: 2937; SSSE3: # %bb.0: 2938; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2939; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] 2940; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u] 2941; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 2942; SSSE3-NEXT: retq 2943; 2944; SSE41-LABEL: shuffle_extract_concat_insert: 2945; SSE41: # %bb.0: 2946; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 
2947; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] 2948; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u] 2949; SSE41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 2950; SSE41-NEXT: retq 2951; 2952; AVX-LABEL: shuffle_extract_concat_insert: 2953; AVX: # %bb.0: 2954; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2955; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] 2956; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u] 2957; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 2958; AVX-NEXT: retq 2959 %a = shufflevector <4 x i16> %lhsa, <4 x i16> %rhsa, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2960 %a0 = extractelement <8 x i16> %a, i32 0 2961 %a4 = extractelement <8 x i16> %a, i32 4 2962 %a6 = extractelement <8 x i16> %a, i32 6 2963 %b11 = extractelement <8 x i16> %b, i32 3 2964 %b13 = extractelement <8 x i16> %b, i32 5 2965 %b15 = extractelement <8 x i16> %b, i32 7 2966 %1 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2967 %2 = insertelement <8 x i16> %1, i16 %a0, i32 2 2968 %3 = insertelement <8 x i16> %2, i16 %b11, i32 3 2969 %4 = insertelement <8 x i16> %3, i16 %a6, i32 4 2970 %5 = insertelement <8 x i16> %4, i16 %b13, i32 5 2971 %6 = insertelement <8 x i16> %5, i16 %a4, i32 6 2972 %7 = insertelement <8 x i16> %6, i16 %b15, i32 7 2973 ret <8 x i16> %7 2974} 2975 2976define <8 x i16> @shuffle_scalar_to_vector_extract(<8 x i8>* %p0, i8* %p1, i8* %p2) { 2977; SSE2-LABEL: shuffle_scalar_to_vector_extract: 2978; SSE2: # %bb.0: 2979; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 2980; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 2981; SSE2-NEXT: psraw $8, %xmm1 2982; SSE2-NEXT: pextrw $7, %xmm1, %eax 2983; SSE2-NEXT: movd %eax, %xmm2 2984; SSE2-NEXT: movsbl (%rsi), %eax 2985; SSE2-NEXT: movd %eax, %xmm0 2986; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] 2987; SSE2-NEXT: movsbl (%rdx), %eax 2988; SSE2-NEXT: movd %eax, %xmm0 2989; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 2990; SSE2-NEXT: pxor %xmm0, %xmm0 2991; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 2992; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 2993; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 2994; SSE2-NEXT: retq 2995; 2996; SSSE3-LABEL: shuffle_scalar_to_vector_extract: 2997; SSSE3: # %bb.0: 2998; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 2999; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 3000; SSSE3-NEXT: psraw $8, %xmm1 3001; SSSE3-NEXT: movsbl (%rsi), %eax 3002; SSSE3-NEXT: movd %eax, %xmm2 3003; SSSE3-NEXT: palignr {{.*#+}} xmm2 = xmm1[14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] 3004; SSSE3-NEXT: movsbl (%rdx), %eax 3005; SSSE3-NEXT: movd %eax, %xmm0 3006; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 3007; SSSE3-NEXT: pxor %xmm0, %xmm0 3008; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 3009; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 3010; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 3011; SSSE3-NEXT: retq 3012; 3013; SSE41-LABEL: shuffle_scalar_to_vector_extract: 3014; SSE41: # %bb.0: 3015; SSE41-NEXT: pmovsxbw (%rdi), %xmm0 3016; SSE41-NEXT: pextrw $4, %xmm0, %eax 3017; SSE41-NEXT: pextrw 
$7, %xmm0, %ecx 3018; SSE41-NEXT: pxor %xmm0, %xmm0 3019; SSE41-NEXT: pinsrw $1, %eax, %xmm0 3020; SSE41-NEXT: movl $65531, %eax # imm = 0xFFFB 3021; SSE41-NEXT: pinsrw $2, %eax, %xmm0 3022; SSE41-NEXT: pinsrw $4, %ecx, %xmm0 3023; SSE41-NEXT: movsbl (%rsi), %eax 3024; SSE41-NEXT: pinsrw $5, %eax, %xmm0 3025; SSE41-NEXT: movsbl (%rdx), %eax 3026; SSE41-NEXT: pinsrw $6, %eax, %xmm0 3027; SSE41-NEXT: retq 3028; 3029; AVX-LABEL: shuffle_scalar_to_vector_extract: 3030; AVX: # %bb.0: 3031; AVX-NEXT: vpmovsxbw (%rdi), %xmm0 3032; AVX-NEXT: vpextrw $4, %xmm0, %eax 3033; AVX-NEXT: vpextrw $7, %xmm0, %ecx 3034; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 3035; AVX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 3036; AVX-NEXT: movl $65531, %eax # imm = 0xFFFB 3037; AVX-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 3038; AVX-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 3039; AVX-NEXT: movsbl (%rsi), %eax 3040; AVX-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 3041; AVX-NEXT: movsbl (%rdx), %eax 3042; AVX-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 3043; AVX-NEXT: retq 3044 %tmp = load <8 x i8>, <8 x i8>* %p0, align 1 3045 %tmp1 = sext <8 x i8> %tmp to <8 x i16> 3046 %tmp2 = load i8, i8* %p1, align 1 3047 %cvt1 = sext i8 %tmp2 to i16 3048 %tmp3 = load i8, i8* %p2, align 1 3049 %cvt2 = sext i8 %tmp3 to i16 3050 %tmp4 = extractelement <8 x i16> %tmp1, i32 4 3051 %tmp5 = extractelement <8 x i16> %tmp1, i32 7 3052 %tmp6 = insertelement <8 x i16> <i16 undef, i16 undef, i16 -5, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>, i16 undef, i32 0 3053 %tmp7 = insertelement <8 x i16> %tmp6, i16 %tmp4, i32 1 3054 %tmp8 = insertelement <8 x i16> %tmp7, i16 undef, i32 3 3055 %tmp9 = insertelement <8 x i16> %tmp8, i16 %tmp5, i32 4 3056 %tmp10 = insertelement <8 x i16> %tmp9, i16 %cvt1, i32 5 3057 %tmp11 = insertelement <8 x i16> %tmp10, i16 %cvt2, i32 6 3058 %tmp12 = insertelement <8 x i16> %tmp11, i16 undef, i32 7 3059 %tmp13 = shufflevector <8 x i16> %tmp12, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 10, i32 3, i32 4, i32 5, i32 
6, i32 7> 3060 ret <8 x i16> %tmp13 3061} 3062 3063define void @PR43024() { 3064; SSE-LABEL: PR43024: 3065; SSE: # %bb.0: 3066; SSE-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] 3067; SSE-NEXT: movaps %xmm0, (%rax) 3068; SSE-NEXT: addss {{.*}}(%rip), %xmm0 3069; SSE-NEXT: xorps %xmm1, %xmm1 3070; SSE-NEXT: addss %xmm1, %xmm0 3071; SSE-NEXT: addss %xmm1, %xmm0 3072; SSE-NEXT: movss %xmm0, (%rax) 3073; SSE-NEXT: retq 3074; 3075; AVX-LABEL: PR43024: 3076; AVX: # %bb.0: 3077; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] 3078; AVX-NEXT: vmovaps %xmm0, (%rax) 3079; AVX-NEXT: vaddss {{\.LCPI.*}}+{{.*}}(%rip), %xmm0, %xmm0 3080; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 3081; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 3082; AVX-NEXT: vaddss {{\.LCPI.*}}+{{.*}}(%rip), %xmm0, %xmm0 3083; AVX-NEXT: vmovss %xmm0, (%rax) 3084; AVX-NEXT: retq 3085 store <4 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000, float 0x0, float 0x0>, <4 x float>* undef, align 16 3086 %1 = load <4 x float>, <4 x float>* undef, align 16 3087 %2 = fmul <4 x float> %1, <float 0x0, float 0x0, float 0x0, float 0x0> 3088 %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> 3089 %4 = fadd <4 x float> %2, %3 3090 %5 = fadd <4 x float> zeroinitializer, %4 3091 %6 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef> 3092 %7 = fadd <4 x float> %6, %5 3093 %8 = extractelement <4 x float> %7, i32 0 3094 store float %8, float* undef, align 8 3095 ret void 3096} 3097 3098define void @PR45604(<32 x i16>* %dst, <8 x i16>* %src) { 3099; SSE2-LABEL: PR45604: 3100; SSE2: # %bb.0: 3101; SSE2-NEXT: movdqa (%rsi), %xmm1 3102; SSE2-NEXT: movd %xmm1, %eax 3103; SSE2-NEXT: movzwl %ax, %eax 3104; SSE2-NEXT: movd %eax, %xmm0 3105; SSE2-NEXT: movl $11, %eax 3106; SSE2-NEXT: pinsrw $2, %eax, %xmm0 3107; SSE2-NEXT: pextrw $1, %xmm1, %ecx 3108; SSE2-NEXT: pinsrw $4, %ecx, %xmm0 3109; SSE2-NEXT: pinsrw $6, %eax, 
%xmm0 3110; SSE2-NEXT: pextrw $2, %xmm1, %ecx 3111; SSE2-NEXT: movd %ecx, %xmm2 3112; SSE2-NEXT: pinsrw $2, %eax, %xmm2 3113; SSE2-NEXT: pextrw $3, %xmm1, %ecx 3114; SSE2-NEXT: pinsrw $4, %ecx, %xmm2 3115; SSE2-NEXT: pinsrw $6, %eax, %xmm2 3116; SSE2-NEXT: pextrw $4, %xmm1, %ecx 3117; SSE2-NEXT: movd %ecx, %xmm3 3118; SSE2-NEXT: pinsrw $2, %eax, %xmm3 3119; SSE2-NEXT: pextrw $5, %xmm1, %ecx 3120; SSE2-NEXT: pinsrw $4, %ecx, %xmm3 3121; SSE2-NEXT: pinsrw $6, %eax, %xmm3 3122; SSE2-NEXT: pextrw $6, %xmm1, %ecx 3123; SSE2-NEXT: movd %ecx, %xmm4 3124; SSE2-NEXT: pinsrw $2, %eax, %xmm4 3125; SSE2-NEXT: pextrw $7, %xmm1, %ecx 3126; SSE2-NEXT: pinsrw $4, %ecx, %xmm4 3127; SSE2-NEXT: pinsrw $6, %eax, %xmm4 3128; SSE2-NEXT: movdqa %xmm4, 48(%rdi) 3129; SSE2-NEXT: movdqa %xmm3, 32(%rdi) 3130; SSE2-NEXT: movdqa %xmm2, 16(%rdi) 3131; SSE2-NEXT: movdqa %xmm0, (%rdi) 3132; SSE2-NEXT: retq 3133; 3134; SSSE3-LABEL: PR45604: 3135; SSSE3: # %bb.0: 3136; SSSE3-NEXT: movdqa (%rsi), %xmm1 3137; SSSE3-NEXT: movd %xmm1, %eax 3138; SSSE3-NEXT: movzwl %ax, %eax 3139; SSSE3-NEXT: movd %eax, %xmm0 3140; SSSE3-NEXT: movl $11, %eax 3141; SSSE3-NEXT: pinsrw $2, %eax, %xmm0 3142; SSSE3-NEXT: pextrw $1, %xmm1, %ecx 3143; SSSE3-NEXT: pinsrw $4, %ecx, %xmm0 3144; SSSE3-NEXT: pinsrw $6, %eax, %xmm0 3145; SSSE3-NEXT: pextrw $2, %xmm1, %ecx 3146; SSSE3-NEXT: movd %ecx, %xmm2 3147; SSSE3-NEXT: pinsrw $2, %eax, %xmm2 3148; SSSE3-NEXT: pextrw $3, %xmm1, %ecx 3149; SSSE3-NEXT: pinsrw $4, %ecx, %xmm2 3150; SSSE3-NEXT: pinsrw $6, %eax, %xmm2 3151; SSSE3-NEXT: pextrw $4, %xmm1, %ecx 3152; SSSE3-NEXT: movd %ecx, %xmm3 3153; SSSE3-NEXT: pinsrw $2, %eax, %xmm3 3154; SSSE3-NEXT: pextrw $5, %xmm1, %ecx 3155; SSSE3-NEXT: pinsrw $4, %ecx, %xmm3 3156; SSSE3-NEXT: pinsrw $6, %eax, %xmm3 3157; SSSE3-NEXT: pextrw $6, %xmm1, %ecx 3158; SSSE3-NEXT: movd %ecx, %xmm4 3159; SSSE3-NEXT: pinsrw $2, %eax, %xmm4 3160; SSSE3-NEXT: pextrw $7, %xmm1, %ecx 3161; SSSE3-NEXT: pinsrw $4, %ecx, %xmm4 3162; SSSE3-NEXT: pinsrw $6, %eax, 
%xmm4 3163; SSSE3-NEXT: movdqa %xmm4, 48(%rdi) 3164; SSSE3-NEXT: movdqa %xmm3, 32(%rdi) 3165; SSSE3-NEXT: movdqa %xmm2, 16(%rdi) 3166; SSSE3-NEXT: movdqa %xmm0, (%rdi) 3167; SSSE3-NEXT: retq 3168; 3169; SSE41-LABEL: PR45604: 3170; SSE41: # %bb.0: 3171; SSE41-NEXT: movdqa (%rsi), %xmm1 3172; SSE41-NEXT: pextrw $2, %xmm1, %eax 3173; SSE41-NEXT: movd %eax, %xmm0 3174; SSE41-NEXT: movl $11, %eax 3175; SSE41-NEXT: pinsrw $2, %eax, %xmm0 3176; SSE41-NEXT: pextrw $3, %xmm1, %ecx 3177; SSE41-NEXT: pinsrw $4, %ecx, %xmm0 3178; SSE41-NEXT: pinsrw $6, %eax, %xmm0 3179; SSE41-NEXT: pextrw $4, %xmm1, %ecx 3180; SSE41-NEXT: movd %ecx, %xmm2 3181; SSE41-NEXT: pinsrw $2, %eax, %xmm2 3182; SSE41-NEXT: pextrw $5, %xmm1, %ecx 3183; SSE41-NEXT: pinsrw $4, %ecx, %xmm2 3184; SSE41-NEXT: pinsrw $6, %eax, %xmm2 3185; SSE41-NEXT: pextrw $6, %xmm1, %ecx 3186; SSE41-NEXT: movd %ecx, %xmm3 3187; SSE41-NEXT: pinsrw $2, %eax, %xmm3 3188; SSE41-NEXT: pextrw $7, %xmm1, %ecx 3189; SSE41-NEXT: pinsrw $4, %ecx, %xmm3 3190; SSE41-NEXT: pinsrw $6, %eax, %xmm3 3191; SSE41-NEXT: pxor %xmm4, %xmm4 3192; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3,4,5,6,7] 3193; SSE41-NEXT: pinsrw $2, %eax, %xmm4 3194; SSE41-NEXT: pextrw $1, %xmm1, %ecx 3195; SSE41-NEXT: pinsrw $4, %ecx, %xmm4 3196; SSE41-NEXT: pinsrw $6, %eax, %xmm4 3197; SSE41-NEXT: movdqa %xmm4, (%rdi) 3198; SSE41-NEXT: movdqa %xmm3, 48(%rdi) 3199; SSE41-NEXT: movdqa %xmm2, 32(%rdi) 3200; SSE41-NEXT: movdqa %xmm0, 16(%rdi) 3201; SSE41-NEXT: retq 3202; 3203; AVX1-LABEL: PR45604: 3204; AVX1: # %bb.0: 3205; AVX1-NEXT: vmovdqa (%rsi), %xmm0 3206; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] 3207; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 3208; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [11,11,11,0,11,11,11,0] 3209; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] 3210; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] 3211; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = 
xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero 3212; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] 3213; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 3214; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] 3215; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero 3216; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] 3217; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 3218; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 3219; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 3220; AVX1-NEXT: vmovups %ymm0, (%rdi) 3221; AVX1-NEXT: vmovups %ymm1, 32(%rdi) 3222; AVX1-NEXT: vzeroupper 3223; AVX1-NEXT: retq 3224; 3225; AVX2-LABEL: PR45604: 3226; AVX2: # %bb.0: 3227; AVX2-NEXT: vmovdqa (%rsi), %xmm0 3228; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,0,2] 3229; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15,u,u,u,u> 3230; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 3231; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,11,0,0,0,u,u,u,u,11,0,0,0,u,u,u,u,11,0,0,0,u,u,u,u,11,0,0,0> 3232; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7] 3233; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3] 3234; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 3235; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7] 3236; AVX2-NEXT: vmovdqu %ymm0, 32(%rdi) 3237; AVX2-NEXT: vmovdqu %ymm1, (%rdi) 3238; AVX2-NEXT: vzeroupper 3239; AVX2-NEXT: retq 3240 %v1 = load <8 x i16>, <8 x i16>* %src, align 16 3241 %v2 = shufflevector <8 x i16> %v1, <8 x i16> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 3242 %v3 = shufflevector <16 x i16> %v2, <16 x i16> <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 0, i16 0, 
i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31> 3243 store <32 x i16> %v3, <32 x i16>* %dst, align 16 3244 ret void 3245} 3246