; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3,SSSE3-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSSE3,SSSE3-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-SHUF

define <8 x i16> @phaddw1(<8 x i16> %x, <8 x i16> %y) {
; SSSE3-LABEL: phaddw1:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddw %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddw1:
; AVX: # %bb.0:
; AVX-NEXT: vphaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %b = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %r = add <8 x i16> %a, %b
  ret <8 x i16> %r
}

define <8 x i16> @phaddw2(<8 x i16> %x, <8 x i16> %y) {
; SSSE3-LABEL: phaddw2:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddw %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddw2:
; AVX: # %bb.0:
; AVX-NEXT: vphaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 2, i32 5, i32 6, i32 9, i32 10, i32 13, i32 14>
  %b = shufflevector <8 x i16> %y, <8 x i16> %x, <8 x i32> <i32 8, i32 11, i32 12, i32 15, i32 0, i32 3, i32 4, i32 7>
  %r = add <8 x i16> %a, %b
  ret <8 x i16> %r
}

define <4 x i32> @phaddd1(<4 x i32> %x, <4 x i32> %y) {
; SSSE3-LABEL: phaddd1:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddd1:
; AVX: # %bb.0:
; AVX-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}

define <4 x i32> @phaddd2(<4 x i32> %x, <4 x i32> %y) {
; SSSE3-LABEL: phaddd2:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddd2:
; AVX: # %bb.0:
; AVX-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
  %b = shufflevector <4 x i32> %y, <4 x i32> %x, <4 x i32> <i32 4, i32 7, i32 0, i32 3>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}

define <4 x i32> @phaddd3(<4 x i32> %x) {
; SSSE3-LABEL: phaddd3:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddd3:
; AVX: # %bb.0:
; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}

define <4 x i32> @phaddd4(<4 x i32> %x) {
; SSSE3-LABEL: phaddd4:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddd4:
; AVX: # %bb.0:
; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}

define <4 x i32> @phaddd5(<4 x i32> %x) {
; SSSE3-LABEL: phaddd5:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddd5:
; AVX: # %bb.0:
; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 undef>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 undef, i32 undef>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}

define <4 x i32> @phaddd6(<4 x i32> %x) {
; SSSE3-SLOW-LABEL: phaddd6:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: phaddd6:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: phaddd6:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: phaddd6:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
;
; AVX2-SHUF-LABEL: phaddd6:
; AVX2-SHUF: # %bb.0:
; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-SHUF-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-SHUF-NEXT: retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}

define <4 x i32> @phaddd7(<4 x i32> %x) {
; SSSE3-LABEL: phaddd7:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddd7:
; AVX: # %bb.0:
; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}

define <8 x i16> @phsubw1(<8 x i16> %x, <8 x i16> %y) {
; SSSE3-LABEL: phsubw1:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phsubw %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phsubw1:
; AVX: # %bb.0:
; AVX-NEXT: vphsubw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %b = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %r = sub <8 x i16> %a, %b
  ret <8 x i16> %r
}

define <4 x i32> @phsubd1(<4 x i32> %x, <4 x i32> %y) {
; SSSE3-LABEL: phsubd1:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phsubd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phsubd1:
; AVX: # %bb.0:
; AVX-NEXT: vphsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %r = sub <4 x i32> %a, %b
  ret <4 x i32> %r
}

define <4 x i32> @phsubd2(<4 x i32> %x) {
; SSSE3-LABEL: phsubd2:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phsubd %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phsubd2:
; AVX: # %bb.0:
; AVX-NEXT: vphsubd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
  %r = sub <4 x i32> %a, %b
  ret <4 x i32> %r
}

define <4 x i32> @phsubd3(<4 x i32> %x) {
; SSSE3-LABEL: phsubd3:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phsubd %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phsubd3:
; AVX: # %bb.0:
; AVX-NEXT: vphsubd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %r = sub <4 x i32> %a, %b
  ret <4 x i32> %r
}

define <4 x i32> @phsubd4(<4 x i32> %x) {
; SSSE3-SLOW-LABEL: phsubd4:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSSE3-SLOW-NEXT: psubd %xmm1, %xmm0
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: phsubd4:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: phsubd %xmm0, %xmm0
; SSSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: phsubd4:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: phsubd4:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
;
; AVX2-SHUF-LABEL: phsubd4:
; AVX2-SHUF: # %bb.0:
; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-SHUF-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX2-SHUF-NEXT: retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %r = sub <4 x i32> %a, %b
  ret <4 x i32> %r
}

define <8 x i16> @phsubw1_reverse(<8 x i16> %x, <8 x i16> %y) {
; SSSE3-LABEL: phsubw1_reverse:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; SSSE3-NEXT: movdqa %xmm1, %xmm4
; SSSE3-NEXT: pshufb %xmm3, %xmm4
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pshufb %xmm3, %xmm2
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3-NEXT: pshufb %xmm3, %xmm1
; SSSE3-NEXT: pshufb %xmm3, %xmm0
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: psubw %xmm0, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phsubw1_reverse:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm3
; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm2
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3],xmm1[4],xmm3[5],xmm1[6],xmm3[7]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsubw %xmm0, %xmm2, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %b = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %r = sub <8 x i16> %a, %b
  ret <8 x i16> %r
}

define <4 x i32> @phsubd1_reverse(<4 x i32> %x, <4 x i32> %y) {
; SSSE3-LABEL: phsubd1_reverse:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movaps %xmm0, %xmm2
; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[1,3]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSSE3-NEXT: psubd %xmm0, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phsubd1_reverse:
; AVX: # %bb.0:
; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,3],xmm1[1,3]
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX-NEXT: vpsubd %xmm0, %xmm2, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %b = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %r = sub <4 x i32> %a, %b
  ret <4 x i32> %r
}

define <4 x i32> @phaddd_single_source1(<4 x i32> %x) {
; SSSE3-LABEL: phaddd_single_source1:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddd_single_source1:
; AVX: # %bb.0:
; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
  %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
  %add = add <4 x i32> %l, %r
  ret <4 x i32> %add
}

define <4 x i32> @phaddd_single_source2(<4 x i32> %x) {
; SSSE3-LABEL: phaddd_single_source2:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm0, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddd_single_source2:
; AVX: # %bb.0:
; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
; AVX-NEXT: retq
  %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
  %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
  %add = add <4 x i32> %l, %r
  %shuffle2 = shufflevector <4 x i32> %add, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 undef, i32 undef>
  ret <4 x i32> %shuffle2
}

define <4 x i32> @phaddd_single_source3(<4 x i32> %x) {
; SSSE3-LABEL: phaddd_single_source3:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddd_single_source3:
; AVX: # %bb.0:
; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
  %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
  %add = add <4 x i32> %l, %r
  ret <4 x i32> %add
}

define <4 x i32> @phaddd_single_source4(<4 x i32> %x) {
; SSSE3-SLOW-LABEL: phaddd_single_source4:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: phaddd_single_source4:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: phaddd_single_source4:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: phaddd_single_source4:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
;
; AVX2-SHUF-LABEL: phaddd_single_source4:
; AVX2-SHUF: # %bb.0:
; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
; AVX2-SHUF-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX2-SHUF-NEXT: retq
  %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
  %add = add <4 x i32> %l, %x
  ret <4 x i32> %add
}

define <4 x i32> @phaddd_single_source5(<4 x i32> %x) {
; SSSE3-SLOW-LABEL: phaddd_single_source5:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm1
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: phaddd_single_source5:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: phaddd_single_source5:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: phaddd_single_source5:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-FAST-NEXT: retq
;
; AVX2-SHUF-LABEL: phaddd_single_source5:
; AVX2-SHUF: # %bb.0:
; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
; AVX2-SHUF-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-SHUF-NEXT: retq
  %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
  %add = add <4 x i32> %l, %x
  %shuffle2 = shufflevector <4 x i32> %add, <4 x i32> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
  ret <4 x i32> %shuffle2
}

define <4 x i32> @phaddd_single_source6(<4 x i32> %x) {
; SSSE3-LABEL: phaddd_single_source6:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm0, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddd_single_source6:
; AVX: # %bb.0:
; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX-NEXT: retq
  %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
  %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
  %add = add <4 x i32> %l, %r
  %shuffle2 = shufflevector <4 x i32> %add, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
  ret <4 x i32> %shuffle2
}

define <8 x i16> @phaddw_single_source1(<8 x i16> %x) {
; SSSE3-LABEL: phaddw_single_source1:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddw %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddw_single_source1:
; AVX: # %bb.0:
; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 2, i32 4, i32 6>
  %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 5, i32 7>
  %add = add <8 x i16> %l, %r
  ret <8 x i16> %add
}

define <8 x i16> @phaddw_single_source2(<8 x i16> %x) {
; SSSE3-LABEL: phaddw_single_source2:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddw %xmm0, %xmm0
; SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
; SSSE3-NEXT: retq
;
; AVX-SLOW-LABEL: phaddw_single_source2:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: phaddw_single_source2:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
; AVX-FAST-NEXT: retq
;
; AVX2-SHUF-LABEL: phaddw_single_source2:
; AVX2-SHUF: # %bb.0:
; AVX2-SHUF-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX2-SHUF-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1]
; AVX2-SHUF-NEXT: retq
  %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 2, i32 4, i32 6>
  %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 5, i32 7>
  %add = add <8 x i16> %l, %r
  %shuffle2 = shufflevector <8 x i16> %add, <8 x i16> undef, <8 x i32> <i32 5, i32 4, i32 3, i32 2, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i16> %shuffle2
}

define <8 x i16> @phaddw_single_source3(<8 x i16> %x) {
; SSSE3-LABEL: phaddw_single_source3:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddw %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddw_single_source3:
; AVX: # %bb.0:
; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 2, i32 undef, i32 undef>
  %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 undef, i32 undef>
  %add = add <8 x i16> %l, %r
  ret <8 x i16> %add
}

define <8 x i16> @phaddw_single_source4(<8 x i16> %x) {
; SSSE3-SLOW-LABEL: phaddw_single_source4:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: movdqa %xmm0, %xmm1
; SSSE3-SLOW-NEXT: pslld $16, %xmm1
; SSSE3-SLOW-NEXT: paddw %xmm0, %xmm1
; SSSE3-SLOW-NEXT: movdqa %xmm1, %xmm0
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: phaddw_single_source4:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0
; SSSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: phaddw_single_source4:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpslld $16, %xmm0, %xmm1
; AVX-SLOW-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: phaddw_single_source4:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
;
; AVX2-SHUF-LABEL: phaddw_single_source4:
; AVX2-SHUF: # %bb.0:
; AVX2-SHUF-NEXT: vpslld $16, %xmm0, %xmm1
; AVX2-SHUF-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX2-SHUF-NEXT: retq
  %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 6>
  %add = add <8 x i16> %l, %x
  ret <8 x i16> %add
}

define <8 x i16> @phaddw_single_source6(<8 x i16> %x) {
; SSSE3-LABEL: phaddw_single_source6:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddw %xmm0, %xmm0
; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddw_single_source6:
; AVX: # %bb.0:
; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX-NEXT: retq
  %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 undef, i32 undef, i32 undef>
  %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef>
  %add = add <8 x i16> %l, %r
  %shuffle2 = shufflevector <8 x i16> %add, <8 x i16> undef, <8 x i32> <i32 undef, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i16> %shuffle2
}

; PR39921 + PR39936
define i32 @PR39936_v8i32(<8 x i32>) {
; SSSE3-SLOW-LABEL: PR39936_v8i32:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT: phaddd %xmm0, %xmm0
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm1
; SSSE3-SLOW-NEXT: movd %xmm1, %eax
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: PR39936_v8i32:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm0
; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT: movd %xmm0, %eax
; SSSE3-FAST-NEXT: retq
;
; AVX1-SLOW-LABEL: PR39936_v8i32:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: PR39936_v8i32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-SLOW-LABEL: PR39936_v8i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX2-SLOW-NEXT: vmovd %xmm0, %eax
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: PR39936_v8i32:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX2-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX2-FAST-NEXT: vmovd %xmm0, %eax
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX2-SHUF-LABEL: PR39936_v8i32:
; AVX2-SHUF: # %bb.0:
; AVX2-SHUF-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-SHUF-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX2-SHUF-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-SHUF-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX2-SHUF-NEXT: vmovd %xmm0, %eax
; AVX2-SHUF-NEXT: vzeroupper
; AVX2-SHUF-NEXT: retq
  %2 = shufflevector <8 x i32> %0, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
  %3 = shufflevector <8 x i32> %0, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %4 = add <8 x i32> %2, %3
  %5 = shufflevector <8 x i32> %4, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %6 = shufflevector <8 x i32> %4, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %7 = add <8 x i32> %5, %6
  %8 = shufflevector <8 x i32> %7, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %9 = add <8 x i32> %8, %7
  %10 = extractelement <8 x i32> %9, i32 0
  ret i32 %10
}