; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+sse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSE3
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+sse3,+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2

define <4 x float> @hadd_ps_test1(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hadd_ps_test1:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_ps_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %A, i32 0
  %vecext1 = extractelement <4 x float> %A, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %A, i32 2
  %vecext3 = extractelement <4 x float> %A, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
  %vecext6 = extractelement <4 x float> %B, i32 0
  %vecext7 = extractelement <4 x float> %B, i32 1
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 2
  %vecext10 = extractelement <4 x float> %B, i32 2
  %vecext11 = extractelement <4 x float> %B, i32 3
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 3
  ret <4 x float> %vecinit13
}

define <4 x float> @hadd_ps_test2(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hadd_ps_test2:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_ps_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %A, i32 2
  %vecext1 = extractelement <4 x float> %A, i32 3
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 1
  %vecext2 = extractelement <4 x float> %A, i32 0
  %vecext3 = extractelement <4 x float> %A, i32 1
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 0
  %vecext6 = extractelement <4 x float> %B, i32 2
  %vecext7 = extractelement <4 x float> %B, i32 3
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 3
  %vecext10 = extractelement <4 x float> %B, i32 0
  %vecext11 = extractelement <4 x float> %B, i32 1
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 2
  ret <4 x float> %vecinit13
}

define <4 x float> @hsub_ps_test1(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hsub_ps_test1:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hsub_ps_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %A, i32 0
  %vecext1 = extractelement <4 x float> %A, i32 1
  %sub = fsub float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %sub, i32 0
  %vecext2 = extractelement <4 x float> %A, i32 2
  %vecext3 = extractelement <4 x float> %A, i32 3
  %sub4 = fsub float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %sub4, i32 1
  %vecext6 = extractelement <4 x float> %B, i32 0
  %vecext7 = extractelement <4 x float> %B, i32 1
  %sub8 = fsub float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %sub8, i32 2
  %vecext10 = extractelement <4 x float> %B, i32 2
  %vecext11 = extractelement <4 x float> %B, i32 3
  %sub12 = fsub float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 3
  ret <4 x float> %vecinit13
}

define <4 x float> @hsub_ps_test2(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hsub_ps_test2:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hsub_ps_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %A, i32 2
  %vecext1 = extractelement <4 x float> %A, i32 3
  %sub = fsub float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %sub, i32 1
  %vecext2 = extractelement <4 x float> %A, i32 0
  %vecext3 = extractelement <4 x float> %A, i32 1
  %sub4 = fsub float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %sub4, i32 0
  %vecext6 = extractelement <4 x float> %B, i32 2
  %vecext7 = extractelement <4 x float> %B, i32 3
  %sub8 = fsub float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %sub8, i32 3
  %vecext10 = extractelement <4 x float> %B, i32 0
  %vecext11 = extractelement <4 x float> %B, i32 1
  %sub12 = fsub float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 2
  ret <4 x float> %vecinit13
}

define <4 x i32> @phadd_d_test1(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phadd_d_test1:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE3-NEXT:    movd %xmm2, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    addl %eax, %edx
; SSE3-NEXT:    movd %xmm1, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    addl %eax, %esi
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %edi
; SSE3-NEXT:    addl %eax, %edi
; SSE3-NEXT:    movd %edi, %xmm0
; SSE3-NEXT:    movd %esi, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movd %edx, %xmm2
; SSE3-NEXT:    movd %ecx, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: phadd_d_test1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phadd_d_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x i32> %A, i32 0
  %vecext1 = extractelement <4 x i32> %A, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <4 x i32> %A, i32 2
  %vecext3 = extractelement <4 x i32> %A, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %add4, i32 1
  %vecext6 = extractelement <4 x i32> %B, i32 0
  %vecext7 = extractelement <4 x i32> %B, i32 1
  %add8 = add i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %add8, i32 2
  %vecext10 = extractelement <4 x i32> %B, i32 2
  %vecext11 = extractelement <4 x i32> %B, i32 3
  %add12 = add i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %add12, i32 3
  ret <4 x i32> %vecinit13
}

define <4 x i32> @phadd_d_test2(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phadd_d_test2:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE3-NEXT:    movd %xmm2, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    addl %eax, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    addl %eax, %esi
; SSE3-NEXT:    movd %esi, %xmm0
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; SSE3-NEXT:    movd %xmm2, %eax
; SSE3-NEXT:    movd %xmm1, %esi
; SSE3-NEXT:    addl %eax, %esi
; SSE3-NEXT:    movd %esi, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movd %ecx, %xmm2
; SSE3-NEXT:    movd %edx, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: phadd_d_test2:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phadd_d_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x i32> %A, i32 2
  %vecext1 = extractelement <4 x i32> %A, i32 3
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %add, i32 1
  %vecext2 = extractelement <4 x i32> %A, i32 0
  %vecext3 = extractelement <4 x i32> %A, i32 1
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %add4, i32 0
  %vecext6 = extractelement <4 x i32> %B, i32 3
  %vecext7 = extractelement <4 x i32> %B, i32 2
  %add8 = add i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %add8, i32 3
  %vecext10 = extractelement <4 x i32> %B, i32 1
  %vecext11 = extractelement <4 x i32> %B, i32 0
  %add12 = add i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %add12, i32 2
  ret <4 x i32> %vecinit13
}

define <4 x i32> @phsub_d_test1(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phsub_d_test1:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    subl %ecx, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    subl %edx, %ecx
; SSE3-NEXT:    movd %xmm1, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    subl %esi, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %edi
; SSE3-NEXT:    subl %edi, %esi
; SSE3-NEXT:    movd %esi, %xmm0
; SSE3-NEXT:    movd %edx, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movd %ecx, %xmm2
; SSE3-NEXT:    movd %eax, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: phsub_d_test1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsub_d_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x i32> %A, i32 0
  %vecext1 = extractelement <4 x i32> %A, i32 1
  %sub = sub i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %sub, i32 0
  %vecext2 = extractelement <4 x i32> %A, i32 2
  %vecext3 = extractelement <4 x i32> %A, i32 3
  %sub4 = sub i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %sub4, i32 1
  %vecext6 = extractelement <4 x i32> %B, i32 0
  %vecext7 = extractelement <4 x i32> %B, i32 1
  %sub8 = sub i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %sub8, i32 2
  %vecext10 = extractelement <4 x i32> %B, i32 2
  %vecext11 = extractelement <4 x i32> %B, i32 3
  %sub12 = sub i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 3
  ret <4 x i32> %vecinit13
}

define <4 x i32> @phsub_d_test2(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phsub_d_test2:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE3-NEXT:    movd %xmm2, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    subl %ecx, %eax
; SSE3-NEXT:    movd %xmm0, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    subl %edx, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    subl %esi, %edx
; SSE3-NEXT:    movd %edx, %xmm0
; SSE3-NEXT:    movd %xmm1, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; SSE3-NEXT:    movd %xmm1, %esi
; SSE3-NEXT:    subl %esi, %edx
; SSE3-NEXT:    movd %edx, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movd %eax, %xmm2
; SSE3-NEXT:    movd %ecx, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: phsub_d_test2:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsub_d_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x i32> %A, i32 2
  %vecext1 = extractelement <4 x i32> %A, i32 3
  %sub = sub i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %sub, i32 1
  %vecext2 = extractelement <4 x i32> %A, i32 0
  %vecext3 = extractelement <4 x i32> %A, i32 1
  %sub4 = sub i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %sub4, i32 0
  %vecext6 = extractelement <4 x i32> %B, i32 2
  %vecext7 = extractelement <4 x i32> %B, i32 3
  %sub8 = sub i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %sub8, i32 3
  %vecext10 = extractelement <4 x i32> %B, i32 0
  %vecext11 = extractelement <4 x i32> %B, i32 1
  %sub12 = sub i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 2
  ret <4 x i32> %vecinit13
}

define <2 x double> @hadd_pd_test1(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hadd_pd_test1:
; SSE:       # %bb.0:
; SSE-NEXT:    haddpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_pd_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <2 x double> %A, i32 0
  %vecext1 = extractelement <2 x double> %A, i32 1
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %add, i32 0
  %vecext2 = extractelement <2 x double> %B, i32 0
  %vecext3 = extractelement <2 x double> %B, i32 1
  %add2 = fadd double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %add2, i32 1
  ret <2 x double> %vecinit2
}

define <2 x double> @hadd_pd_test2(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hadd_pd_test2:
; SSE:       # %bb.0:
; SSE-NEXT:    haddpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_pd_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <2 x double> %A, i32 1
  %vecext1 = extractelement <2 x double> %A, i32 0
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %add, i32 0
  %vecext2 = extractelement <2 x double> %B, i32 1
  %vecext3 = extractelement <2 x double> %B, i32 0
  %add2 = fadd double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %add2, i32 1
  ret <2 x double> %vecinit2
}

define <2 x double> @hsub_pd_test1(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hsub_pd_test1:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hsub_pd_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <2 x double> %A, i32 0
  %vecext1 = extractelement <2 x double> %A, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %sub, i32 0
  %vecext2 = extractelement <2 x double> %B, i32 0
  %vecext3 = extractelement <2 x double> %B, i32 1
  %sub2 = fsub double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 1
  ret <2 x double> %vecinit2
}

define <2 x double> @hsub_pd_test2(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hsub_pd_test2:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hsub_pd_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <2 x double> %B, i32 0
  %vecext1 = extractelement <2 x double> %B, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %sub, i32 1
  %vecext2 = extractelement <2 x double> %A, i32 0
  %vecext3 = extractelement <2 x double> %A, i32 1
  %sub2 = fsub double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 0
  ret <2 x double> %vecinit2
}

define <4 x double> @avx_vhadd_pd_test(<4 x double> %A, <4 x double> %B) {
; SSE-LABEL: avx_vhadd_pd_test:
; SSE:       # %bb.0:
; SSE-NEXT:    haddpd %xmm1, %xmm0
; SSE-NEXT:    haddpd %xmm3, %xmm2
; SSE-NEXT:    movapd %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: avx_vhadd_pd_test:
; AVX:       # %bb.0:
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT:    vhaddpd %ymm2, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x double> %A, i32 0
  %vecext1 = extractelement <4 x double> %A, i32 1
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <4 x double> undef, double %add, i32 0
  %vecext2 = extractelement <4 x double> %A, i32 2
  %vecext3 = extractelement <4 x double> %A, i32 3
  %add4 = fadd double %vecext2, %vecext3
  %vecinit5 = insertelement <4 x double> %vecinit, double %add4, i32 1
  %vecext6 = extractelement <4 x double> %B, i32 0
  %vecext7 = extractelement <4 x double> %B, i32 1
  %add8 = fadd double %vecext6, %vecext7
  %vecinit9 = insertelement <4 x double> %vecinit5, double %add8, i32 2
  %vecext10 = extractelement <4 x double> %B, i32 2
  %vecext11 = extractelement <4 x double> %B, i32 3
  %add12 = fadd double %vecext10, %vecext11
  %vecinit13 = insertelement <4 x double> %vecinit9, double %add12, i32 3
  ret <4 x double> %vecinit13
}

define <4 x double> @avx_vhsub_pd_test(<4 x double> %A, <4 x double> %B) {
; SSE-LABEL: avx_vhsub_pd_test:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubpd %xmm1, %xmm0
; SSE-NEXT:    hsubpd %xmm3, %xmm2
; SSE-NEXT:    movapd %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: avx_vhsub_pd_test:
; AVX:       # %bb.0:
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT:    vhsubpd %ymm2, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x double> %A, i32 0
  %vecext1 = extractelement <4 x double> %A, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <4 x double> undef, double %sub, i32 0
  %vecext2 = extractelement <4 x double> %A, i32 2
  %vecext3 = extractelement <4 x double> %A, i32 3
  %sub4 = fsub double %vecext2, %vecext3
  %vecinit5 = insertelement <4 x double> %vecinit, double %sub4, i32 1
  %vecext6 = extractelement <4 x double> %B, i32 0
  %vecext7 = extractelement <4 x double> %B, i32 1
  %sub8 = fsub double %vecext6, %vecext7
  %vecinit9 = insertelement <4 x double> %vecinit5, double %sub8, i32 2
  %vecext10 = extractelement <4 x double> %B, i32 2
  %vecext11 = extractelement <4 x double> %B, i32 3
  %sub12 = fsub double %vecext10, %vecext11
  %vecinit13 = insertelement <4 x double> %vecinit9, double %sub12, i32 3
  ret <4 x double> %vecinit13
}

define <8 x i32> @avx2_vphadd_d_test(<8 x i32> %A, <8 x i32> %B) {
; SSE3-LABEL: avx2_vphadd_d_test:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movd %xmm0, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; SSE3-NEXT:    movd %xmm4, %r8d
; SSE3-NEXT:    addl %ecx, %r8d
; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; SSE3-NEXT:    movd %xmm4, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %r9d
; SSE3-NEXT:    addl %edx, %r9d
; SSE3-NEXT:    movd %xmm1, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    addl %edx, %esi
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %edi
; SSE3-NEXT:    addl %edx, %edi
; SSE3-NEXT:    movd %xmm2, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
; SSE3-NEXT:    movd %xmm0, %r10d
; SSE3-NEXT:    addl %eax, %r10d
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movd %xmm3, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    addl %eax, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
; SSE3-NEXT:    movd %xmm0, %r11d
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    addl %r11d, %eax
; SSE3-NEXT:    movd %edi, %xmm0
; SSE3-NEXT:    movd %esi, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movd %r9d, %xmm2
; SSE3-NEXT:    movd %r8d, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    movd %eax, %xmm1
; SSE3-NEXT:    movd %edx, %xmm2
; SSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE3-NEXT:    movd %ecx, %xmm3
; SSE3-NEXT:    movd %r10d, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: avx2_vphadd_d_test:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-NEXT:    phaddd %xmm3, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: avx2_vphadd_d_test:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vphaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vphaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: avx2_vphadd_d_test:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    vphaddd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %vecext = extractelement <8 x i32> %A, i32 0
  %vecext1 = extractelement <8 x i32> %A, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %A, i32 2
  %vecext3 = extractelement <8 x i32> %A, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
  %vecext6 = extractelement <8 x i32> %A, i32 4
  %vecext7 = extractelement <8 x i32> %A, i32 5
  %add8 = add i32 %vecext6, %vecext7
  %vecinit9 = insertelement <8 x i32> %vecinit5, i32 %add8, i32 2
  %vecext10 = extractelement <8 x i32> %A, i32 6
  %vecext11 = extractelement <8 x i32> %A, i32 7
  %add12 = add i32 %vecext10, %vecext11
  %vecinit13 = insertelement <8 x i32> %vecinit9, i32 %add12, i32 3
  %vecext14 = extractelement <8 x i32> %B, i32 0
  %vecext15 = extractelement <8 x i32> %B, i32 1
  %add16 = add i32 %vecext14, %vecext15
  %vecinit17 = insertelement <8 x i32> %vecinit13, i32 %add16, i32 4
  %vecext18 = extractelement <8 x i32> %B, i32 2
  %vecext19 = extractelement <8 x i32> %B, i32 3
  %add20 = add i32 %vecext18, %vecext19
  %vecinit21 = insertelement <8 x i32> %vecinit17, i32 %add20, i32 5
  %vecext22 = extractelement <8 x i32> %B, i32 4
  %vecext23 = extractelement <8 x i32> %B, i32 5
  %add24 = add i32 %vecext22, %vecext23
  %vecinit25 = insertelement <8 x i32> %vecinit21, i32 %add24, i32 6
  %vecext26 = extractelement <8 x i32> %B, i32 6
  %vecext27 = extractelement <8 x i32> %B, i32 7
  %add28 = add i32 %vecext26, %vecext27
  %vecinit29 = insertelement <8 x i32> %vecinit25, i32 %add28, i32 7
  ret <8 x i32> %vecinit29
}

define <16 x i16> @avx2_vphadd_w_test(<16 x i16> %a, <16 x i16> %b) nounwind {
; SSE3-LABEL: avx2_vphadd_w_test:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pushq %rbp
; SSE3-NEXT:    pushq %r15
; SSE3-NEXT:    pushq %r14
; SSE3-NEXT:    pushq %r13
; SSE3-NEXT:    pushq %r12
; SSE3-NEXT:    pushq %rbx
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pextrw $1, %xmm0, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE3-NEXT:    pextrw $2, %xmm0, %eax
; SSE3-NEXT:    pextrw $3, %xmm0, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE3-NEXT:    pextrw $4, %xmm0, %eax
; SSE3-NEXT:    pextrw $5, %xmm0, %r11d
; SSE3-NEXT:    addl %eax, %r11d
; SSE3-NEXT:    pextrw $6, %xmm0, %eax
; SSE3-NEXT:    pextrw $7, %xmm0, %r15d
; SSE3-NEXT:    addl %eax, %r15d
; SSE3-NEXT:    movd %xmm1, %eax
; SSE3-NEXT:    pextrw $1, %xmm1, %r13d
; SSE3-NEXT:    addl %eax, %r13d
; SSE3-NEXT:    pextrw $2, %xmm1, %eax
; SSE3-NEXT:    pextrw $3, %xmm1, %ebx
; SSE3-NEXT:    addl %eax, %ebx
; SSE3-NEXT:    pextrw $4, %xmm1, %eax
; SSE3-NEXT:    pextrw $5, %xmm1, %r8d
; SSE3-NEXT:    addl %eax, %r8d
; SSE3-NEXT:    pextrw $6, %xmm1, %eax
; SSE3-NEXT:    pextrw $7, %xmm1, %esi
; SSE3-NEXT:    addl %eax, %esi
; SSE3-NEXT:    movd %xmm2, %eax
; SSE3-NEXT:    pextrw $1, %xmm2, %r10d
; SSE3-NEXT:    addl %eax, %r10d
; SSE3-NEXT:    pextrw $2, %xmm2, %eax
; SSE3-NEXT:    pextrw $3, %xmm2, %r14d
; SSE3-NEXT:    addl %eax, %r14d
; SSE3-NEXT:    pextrw $4, %xmm2, %eax
; SSE3-NEXT:    pextrw $5, %xmm2, %r12d
; SSE3-NEXT:    addl %eax, %r12d
; SSE3-NEXT:    pextrw $6, %xmm2, %eax
; SSE3-NEXT:    pextrw $7, %xmm2, %r9d
; SSE3-NEXT:    addl %eax, %r9d
; SSE3-NEXT:    movd %xmm3, %eax
; SSE3-NEXT:    pextrw $1, %xmm3, %ebp
; SSE3-NEXT:    addl %eax, %ebp
; SSE3-NEXT:    pextrw $2, %xmm3, %edx
; SSE3-NEXT:    pextrw $3, %xmm3, %edi
; SSE3-NEXT:    addl %edx, %edi
; SSE3-NEXT:    pextrw $4, %xmm3, %edx
; SSE3-NEXT:    pextrw $5, %xmm3, %ecx
; SSE3-NEXT:    addl %edx, %ecx
; SSE3-NEXT:    pextrw $6, %xmm3, %edx
; SSE3-NEXT:    pextrw $7, %xmm3, %eax
; SSE3-NEXT:    addl %edx, %eax
; SSE3-NEXT:    movd %esi, %xmm8
; SSE3-NEXT:    movd %r8d, %xmm3
; SSE3-NEXT:    movd %ebx, %xmm9
; SSE3-NEXT:    movd %r13d, %xmm4
; SSE3-NEXT:    movd %r15d, %xmm10
; SSE3-NEXT:    movd %r11d, %xmm7
; SSE3-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 4-byte Folded Reload
; SSE3-NEXT:    # xmm11 = mem[0],zero,zero,zero
; SSE3-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE3-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE3-NEXT:    movd %eax, %xmm12
; SSE3-NEXT:    movd %ecx, %xmm6
; SSE3-NEXT:    movd %edi, %xmm13
; SSE3-NEXT:    movd %ebp, %xmm5
; SSE3-NEXT:    movd %r9d, %xmm14
; SSE3-NEXT:    movd %r12d, %xmm2
; SSE3-NEXT:    movd %r14d, %xmm15
; SSE3-NEXT:    movd %r10d, %xmm1
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
; SSE3-NEXT:    popq %rbx
; SSE3-NEXT:    popq %r12
; SSE3-NEXT:    popq %r13
; SSE3-NEXT:    popq %r14
; SSE3-NEXT:    popq %r15
; SSE3-NEXT:    popq %rbp
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: avx2_vphadd_w_test:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm1, %xmm0
; SSSE3-NEXT:    phaddw %xmm3, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: avx2_vphadd_w_test:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vphaddw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vphaddw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: avx2_vphadd_w_test:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    vphaddw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %vecext = extractelement <16 x i16> %a, i32 0
  %vecext1 = extractelement <16 x i16> %a, i32 1
  %add = add i16 %vecext, %vecext1
  %vecinit = insertelement <16 x i16> undef, i16 %add, i32 0
  %vecext4 = extractelement <16 x i16> %a, i32 2
  %vecext6 = extractelement <16 x i16> %a, i32 3
  %add8 = add i16 %vecext4, %vecext6
  %vecinit10 = insertelement <16 x i16> %vecinit, i16 %add8, i32 1
  %vecext11 = extractelement <16 x i16> %a, i32 4
  %vecext13 = extractelement <16 x i16> %a, i32 5
  %add15 = add i16 %vecext11, %vecext13
  %vecinit17 = insertelement <16 x i16> %vecinit10, i16 %add15, i32 2
  %vecext18 = extractelement <16 x i16> %a, i32 6
  %vecext20 = extractelement <16 x i16> %a, i32 7
  %add22 = add i16 %vecext18, %vecext20
  %vecinit24 = insertelement <16 x i16> %vecinit17, i16 %add22, i32 3
  %vecext25 = extractelement <16 x i16> %a, i32 8
  %vecext27 = extractelement <16 x i16> %a, i32 9
  %add29 = add i16 %vecext25, %vecext27
  %vecinit31 = insertelement <16 x i16> %vecinit24, i16 %add29, i32 4
  %vecext32 = extractelement <16 x i16> %a, i32 10
  %vecext34 = extractelement <16 x i16> %a, i32 11
  %add36 = add i16 %vecext32, %vecext34
  %vecinit38 = insertelement <16 x i16> %vecinit31, i16 %add36, i32 5
  %vecext39 = extractelement <16 x i16> %a, i32 12
  %vecext41 = extractelement <16 x i16> %a, i32 13
  %add43 = add i16 %vecext39, %vecext41
  %vecinit45 = insertelement <16 x i16> %vecinit38, i16 %add43, i32 6
  %vecext46 = extractelement <16 x i16> %a, i32 14
  %vecext48 = extractelement <16 x i16> %a, i32 15
  %add50 = add i16 %vecext46, %vecext48
  %vecinit52 = insertelement <16 x i16> %vecinit45, i16 %add50, i32 7
  %vecext53 = extractelement <16 x i16> %b, i32 0
  %vecext55 = extractelement <16 x i16> %b, i32 1
  %add57 = add i16 %vecext53, %vecext55
  %vecinit59 = insertelement <16 x i16> %vecinit52, i16 %add57, i32 8
  %vecext60 = extractelement <16 x i16> %b, i32 2
  %vecext62 = extractelement <16 x i16> %b, i32 3
  %add64 = add i16 %vecext60, %vecext62
  %vecinit66 = insertelement <16 x i16> %vecinit59, i16 %add64, i32 9
  %vecext67 = extractelement <16 x i16> %b, i32 4
  %vecext69 = extractelement <16 x i16> %b, i32 5
  %add71 = add i16 %vecext67, %vecext69
  %vecinit73 = insertelement <16 x i16> %vecinit66, i16 %add71, i32 10
  %vecext74 = extractelement <16 x i16> %b, i32 6
  %vecext76 = extractelement <16 x i16> %b, i32 7
  %add78 = add i16 %vecext74, %vecext76
  %vecinit80 = insertelement <16 x i16> %vecinit73, i16 %add78, i32 11
  %vecext81 = extractelement <16 x i16> %b, i32 8
  %vecext83 = extractelement <16 x i16> %b, i32 9
  %add85 = add i16 %vecext81, %vecext83
  %vecinit87 = insertelement <16 x i16> %vecinit80, i16 %add85, i32 12
  %vecext88 = extractelement <16 x i16> %b, i32 10
  %vecext90 = extractelement <16 x i16> %b, i32 11
  %add92 = add i16 %vecext88, %vecext90
  %vecinit94 = insertelement <16 x i16> %vecinit87, i16 %add92, i32 13
  %vecext95 = extractelement <16 x i16> %b, i32 12
  %vecext97 = extractelement <16 x i16> %b, i32 13
  %add99 = add i16 %vecext95, %vecext97
  %vecinit101 = insertelement <16 x i16> %vecinit94, i16 %add99, i32 14
  %vecext102 = extractelement <16 x i16> %b, i32 14
  %vecext104 = extractelement <16 x i16> %b, i32 15
  %add106 = add i16 %vecext102, %vecext104
  %vecinit108 = insertelement <16 x i16> %vecinit101, i16 %add106, i32 15
  ret <16 x i16> %vecinit108
}

; Verify that we don't select horizontal subs in the following functions.
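; In each test below at least one pair is subtracted with its operands
; reversed (e.g. B[1]-B[0] instead of B[0]-B[1]). hsubps/hsubpd/phsubd
; always subtract the odd-indexed element from the even-indexed element of
; each pair, so these patterns must stay scalarized. (A reversed pair is
; harmless in the hadd tests above, since addition is commutative.)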

define <4 x i32> @not_a_hsub_1(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: not_a_hsub_1:
; SSE:       # %bb.0:
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
; SSE-NEXT:    movd %xmm2, %ecx
; SSE-NEXT:    subl %ecx, %eax
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE-NEXT:    movd %xmm2, %ecx
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-NEXT:    movd %xmm0, %edx
; SSE-NEXT:    subl %edx, %ecx
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE-NEXT:    movd %xmm0, %edx
; SSE-NEXT:    movd %xmm1, %esi
; SSE-NEXT:    subl %esi, %edx
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
; SSE-NEXT:    movd %xmm0, %esi
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT:    movd %xmm0, %edi
; SSE-NEXT:    subl %edi, %esi
; SSE-NEXT:    movd %esi, %xmm0
; SSE-NEXT:    movd %edx, %xmm1
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    movd %ecx, %xmm2
; SSE-NEXT:    movd %eax, %xmm0
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: not_a_hsub_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    vpextrd $1, %xmm0, %ecx
; AVX-NEXT:    subl %ecx, %eax
; AVX-NEXT:    vpextrd $2, %xmm0, %ecx
; AVX-NEXT:    vpextrd $3, %xmm0, %edx
; AVX-NEXT:    subl %edx, %ecx
; AVX-NEXT:    vpextrd $1, %xmm1, %edx
; AVX-NEXT:    vmovd %xmm1, %esi
; AVX-NEXT:    subl %esi, %edx
; AVX-NEXT:    vpextrd $3, %xmm1, %esi
; AVX-NEXT:    vpextrd $2, %xmm1, %edi
; AVX-NEXT:    subl %edi, %esi
; AVX-NEXT:    vmovd %eax, %xmm0
; AVX-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
; AVX-NEXT:    vpinsrd $2, %edx, %xmm0, %xmm0
; AVX-NEXT:    vpinsrd $3, %esi, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x i32> %A, i32 0
  %vecext1 = extractelement <4 x i32> %A, i32 1
  %sub = sub i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %sub, i32 0
  %vecext2 = extractelement <4 x i32> %A, i32 2
  %vecext3 = extractelement <4 x i32> %A, i32 3
  %sub4 = sub i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %sub4, i32 1
  %vecext6 = extractelement <4 x i32> %B, i32 1
  %vecext7 = extractelement <4 x i32> %B, i32 0
  %sub8 = sub i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %sub8, i32 2
  %vecext10 = extractelement <4 x i32> %B, i32 3
  %vecext11 = extractelement <4 x i32> %B, i32 2
  %sub12 = sub i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 3
  ret <4 x i32> %vecinit13
}

define <4 x float> @not_a_hsub_2(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: not_a_hsub_2:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT:    movaps %xmm0, %xmm3
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3],xmm0[3,3]
; SSE-NEXT:    subss %xmm3, %xmm2
; SSE-NEXT:    movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE-NEXT:    subss %xmm3, %xmm0
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[3,3]
; SSE-NEXT:    movaps %xmm1, %xmm3
; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
; SSE-NEXT:    subss %xmm3, %xmm2
; SSE-NEXT:    movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE-NEXT:    subss %xmm3, %xmm1
; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: not_a_hsub_2:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX-NEXT:    vsubss %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX-NEXT:    vsubss %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
; AVX-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX-NEXT:    vsubss %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX-NEXT:    vsubss %xmm3, %xmm1, %xmm1
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %A, i32 2
  %vecext1 = extractelement <4 x float> %A, i32 3
  %sub = fsub float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %sub, i32 1
  %vecext2 = extractelement <4 x float> %A, i32 0
  %vecext3 = extractelement <4 x float> %A, i32 1
  %sub4 = fsub float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %sub4, i32 0
  %vecext6 = extractelement <4 x float> %B, i32 3
  %vecext7 = extractelement <4 x float> %B, i32 2
  %sub8 = fsub float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %sub8, i32 3
  %vecext10 = extractelement <4 x float> %B, i32 0
  %vecext11 = extractelement <4 x float> %B, i32 1
  %sub12 = fsub float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 2
  ret <4 x float> %vecinit13
}

define <2 x double> @not_a_hsub_3(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: not_a_hsub_3:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm1, %xmm2
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE-NEXT:    subsd %xmm2, %xmm1
; SSE-NEXT:    movapd %xmm0, %xmm2
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT:    subsd %xmm0, %xmm2
; SSE-NEXT:    unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-NEXT:    movapd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: not_a_hsub_3:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vsubsd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vsubsd %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %vecext = extractelement <2 x double> %B, i32 0
  %vecext1 = extractelement <2 x double> %B, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %sub, i32 1
  %vecext2 = extractelement <2 x double> %A, i32 1
  %vecext3 = extractelement <2 x double> %A, i32 0
  %sub2 = fsub double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 0
  ret <2 x double> %vecinit2
}

; Test AVX horizontal add/sub of packed single/double precision
; floating point values from 256-bit vectors.
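; vhaddps/vhsubps on YMM registers operate on each 128-bit lane
; independently: for vhaddps %ymm1, %ymm0 the low lane holds
; [a0+a1, a2+a3, b0+b1, b2+b3] and the high lane holds
; [a4+a5, a6+a7, b4+b5, b6+b7]. The IR below builds exactly that
; lane-interleaved result, so a single horizontal add/sub (or one per
; 128-bit half for SSE) is expected, with no extra shuffles.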

define <8 x float> @avx_vhadd_ps(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: avx_vhadd_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm2, %xmm0
; SSE-NEXT:    haddps %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: avx_vhadd_ps:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <8 x float> undef, float %add, i32 0
  %vecext2 = extractelement <8 x float> %a, i32 2
  %vecext3 = extractelement <8 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 1
  %vecext6 = extractelement <8 x float> %b, i32 0
  %vecext7 = extractelement <8 x float> %b, i32 1
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <8 x float> %vecinit5, float %add8, i32 2
  %vecext10 = extractelement <8 x float> %b, i32 2
  %vecext11 = extractelement <8 x float> %b, i32 3
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <8 x float> %vecinit9, float %add12, i32 3
  %vecext14 = extractelement <8 x float> %a, i32 4
  %vecext15 = extractelement <8 x float> %a, i32 5
  %add16 = fadd float %vecext14, %vecext15
  %vecinit17 = insertelement <8 x float> %vecinit13, float %add16, i32 4
  %vecext18 = extractelement <8 x float> %a, i32 6
  %vecext19 = extractelement <8 x float> %a, i32 7
  %add20 = fadd float %vecext18, %vecext19
  %vecinit21 = insertelement <8 x float> %vecinit17, float %add20, i32 5
  %vecext22 = extractelement <8 x float> %b, i32 4
  %vecext23 = extractelement <8 x float> %b, i32 5
  %add24 = fadd float %vecext22, %vecext23
  %vecinit25 = insertelement <8 x float> %vecinit21, float %add24, i32 6
  %vecext26 = extractelement <8 x float> %b, i32 6
  %vecext27 = extractelement <8 x float> %b, i32 7
  %add28 = fadd float %vecext26, %vecext27
  %vecinit29 = insertelement <8 x float> %vecinit25, float %add28, i32 7
  ret <8 x float> %vecinit29
}

define <8 x float> @avx_vhsub_ps(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: avx_vhsub_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubps %xmm2, %xmm0
; SSE-NEXT:    hsubps %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: avx_vhsub_ps:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %sub = fsub float %vecext, %vecext1
  %vecinit = insertelement <8 x float> undef, float %sub, i32 0
  %vecext2 = extractelement <8 x float> %a, i32 2
  %vecext3 = extractelement <8 x float> %a, i32 3
  %sub4 = fsub float %vecext2, %vecext3
  %vecinit5 = insertelement <8 x float> %vecinit, float %sub4, i32 1
  %vecext6 = extractelement <8 x float> %b, i32 0
  %vecext7 = extractelement <8 x float> %b, i32 1
  %sub8 = fsub float %vecext6, %vecext7
  %vecinit9 = insertelement <8 x float> %vecinit5, float %sub8, i32 2
  %vecext10 = extractelement <8 x float> %b, i32 2
  %vecext11 = extractelement <8 x float> %b, i32 3
  %sub12 = fsub float %vecext10, %vecext11
  %vecinit13 = insertelement <8 x float> %vecinit9, float %sub12, i32 3
  %vecext14 = extractelement <8 x float> %a, i32 4
  %vecext15 = extractelement <8 x float> %a, i32 5
  %sub16 = fsub float %vecext14, %vecext15
  %vecinit17 = insertelement <8 x float> %vecinit13, float %sub16, i32 4
  %vecext18 = extractelement <8 x float> %a, i32 6
  %vecext19 = extractelement <8 x float> %a, i32 7
  %sub20 = fsub float %vecext18, %vecext19
  %vecinit21 = insertelement <8 x float> %vecinit17, float %sub20, i32 5
  %vecext22 = extractelement <8 x float> %b, i32 4
  %vecext23 = extractelement <8 x float> %b, i32 5
  %sub24 = fsub float %vecext22, %vecext23
  %vecinit25 = insertelement <8 x float> %vecinit21, float %sub24, i32 6
  %vecext26 = extractelement <8 x float> %b, i32 6
  %vecext27 = extractelement <8 x float> %b, i32 7
  %sub28 = fsub float %vecext26, %vecext27
  %vecinit29 = insertelement <8 x float> %vecinit25, float %sub28, i32 7
  ret <8 x float> %vecinit29
}

define <4 x double> @avx_hadd_pd(<4 x double> %a, <4 x double> %b) {
; SSE-LABEL: avx_hadd_pd:
; SSE:       # %bb.0:
; SSE-NEXT:    haddpd %xmm2, %xmm0
; SSE-NEXT:    haddpd %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: avx_hadd_pd:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x double> %a, i32 0
  %vecext1 = extractelement <4 x double> %a, i32 1
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <4 x double> undef, double %add, i32 0
  %vecext2 = extractelement <4 x double> %b, i32 0
  %vecext3 = extractelement <4 x double> %b, i32 1
  %add4 = fadd double %vecext2, %vecext3
  %vecinit5 = insertelement <4 x double> %vecinit, double %add4, i32 1
  %vecext6 = extractelement <4 x double> %a, i32 2
  %vecext7 = extractelement <4 x double> %a, i32 3
  %add8 = fadd double %vecext6, %vecext7
  %vecinit9 = insertelement <4 x double> %vecinit5, double %add8, i32 2
  %vecext10 = extractelement <4 x double> %b, i32 2
  %vecext11 = extractelement <4 x double> %b, i32 3
  %add12 = fadd double %vecext10, %vecext11
  %vecinit13 = insertelement <4 x double> %vecinit9, double %add12, i32 3
  ret <4 x double> %vecinit13
}

define <4 x double> @avx_hsub_pd(<4 x double> %a, <4 x double> %b) {
; SSE-LABEL: avx_hsub_pd:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubpd %xmm2, %xmm0
; SSE-NEXT:    hsubpd %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: avx_hsub_pd:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x double> %a, i32 0
  %vecext1 = extractelement <4 x double> %a, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <4 x double> undef, double %sub, i32 0
  %vecext2 = extractelement <4 x double> %b, i32 0
  %vecext3 = extractelement <4 x double> %b, i32 1
  %sub4 = fsub double %vecext2, %vecext3
  %vecinit5 = insertelement <4 x double> %vecinit, double %sub4, i32 1
  %vecext6 = extractelement <4 x double> %a, i32 2
  %vecext7 = extractelement <4 x double> %a, i32 3
  %sub8 = fsub double %vecext6, %vecext7
  %vecinit9 = insertelement <4 x double> %vecinit5, double %sub8, i32 2
  %vecext10 = extractelement <4 x double> %b, i32 2
  %vecext11 = extractelement <4 x double> %b, i32 3
  %sub12 = fsub double %vecext10, %vecext11
  %vecinit13 = insertelement <4 x double> %vecinit9, double %sub12, i32 3
  ret <4 x double> %vecinit13
}

; Test AVX2 horizontal add of packed integer values from 256-bit vectors.
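; Like their floating-point counterparts, vphaddd/vphaddw on YMM registers
; add horizontally within each 128-bit lane. avx2_hadd_d and avx2_hadd_w
; below interleave %a and %b inside each lane, so AVX2 lowers them to a
; single vphaddd/vphaddw, whereas the avx2_vphadd_*_test functions above
; place all sums of %A before all sums of %B and therefore need a
; cross-lane vperm2i128 first.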

define <8 x i32> @avx2_hadd_d(<8 x i32> %a, <8 x i32> %b) {
; SSE3-LABEL: avx2_hadd_d:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movd %xmm0, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; SSE3-NEXT:    movd %xmm4, %r8d
; SSE3-NEXT:    addl %ecx, %r8d
; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; SSE3-NEXT:    movd %xmm4, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %r9d
; SSE3-NEXT:    addl %edx, %r9d
; SSE3-NEXT:    movd %xmm2, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    addl %edx, %esi
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %edi
; SSE3-NEXT:    addl %edx, %edi
; SSE3-NEXT:    movd %xmm1, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE3-NEXT:    movd %xmm0, %r10d
; SSE3-NEXT:    addl %eax, %r10d
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movd %xmm3, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    addl %eax, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
; SSE3-NEXT:    movd %xmm0, %r11d
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    addl %r11d, %eax
; SSE3-NEXT:    movd %edi, %xmm0
; SSE3-NEXT:    movd %esi, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movd %r9d, %xmm2
; SSE3-NEXT:    movd %r8d, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    movd %eax, %xmm1
; SSE3-NEXT:    movd %edx, %xmm2
; SSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE3-NEXT:    movd %ecx, %xmm3
; SSE3-NEXT:    movd %r10d, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: avx2_hadd_d:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm2, %xmm0
; SSSE3-NEXT:    phaddd %xmm3, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: avx2_hadd_d:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vphaddd %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: avx2_hadd_d:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %a, i32 2
  %vecext3 = extractelement <8 x i32> %a, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
  %vecext6 = extractelement <8 x i32> %b, i32 0
  %vecext7 = extractelement <8 x i32> %b, i32 1
  %add8 = add i32 %vecext6, %vecext7
  %vecinit9 = insertelement <8 x i32> %vecinit5, i32 %add8, i32 2
  %vecext10 = extractelement <8 x i32> %b, i32 2
  %vecext11 = extractelement <8 x i32> %b, i32 3
  %add12 = add i32 %vecext10, %vecext11
  %vecinit13 = insertelement <8 x i32> %vecinit9, i32 %add12, i32 3
  %vecext14 = extractelement <8 x i32> %a, i32 4
  %vecext15 = extractelement <8 x i32> %a, i32 5
  %add16 = add i32 %vecext14, %vecext15
  %vecinit17 = insertelement <8 x i32> %vecinit13, i32 %add16, i32 4
  %vecext18 = extractelement <8 x i32> %a, i32 6
  %vecext19 = extractelement <8 x i32> %a, i32 7
  %add20 = add i32 %vecext18, %vecext19
  %vecinit21 = insertelement <8 x i32> %vecinit17, i32 %add20, i32 5
  %vecext22 = extractelement <8 x i32> %b, i32 4
  %vecext23 = extractelement <8 x i32> %b, i32 5
  %add24 = add i32 %vecext22, %vecext23
  %vecinit25 = insertelement <8 x i32> %vecinit21, i32 %add24, i32 6
  %vecext26 = extractelement <8 x i32> %b, i32 6
  %vecext27 = extractelement <8 x i32> %b, i32 7
  %add28 = add i32 %vecext26, %vecext27
  %vecinit29 = insertelement <8 x i32> %vecinit25, i32 %add28, i32 7
  ret <8 x i32> %vecinit29
}

define <16 x i16> @avx2_hadd_w(<16 x i16> %a, <16 x i16> %b) nounwind {
; SSE3-LABEL: avx2_hadd_w:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pushq %rbp
; SSE3-NEXT:    pushq %r15
; SSE3-NEXT:    pushq %r14
; SSE3-NEXT:    pushq %r13
; SSE3-NEXT:    pushq %r12
; SSE3-NEXT:    pushq %rbx
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pextrw $1, %xmm0, %r10d
; SSE3-NEXT:    addl %eax, %r10d
; SSE3-NEXT:    pextrw $2, %xmm0, %eax
; SSE3-NEXT:    pextrw $3, %xmm0, %r11d
; SSE3-NEXT:    addl %eax, %r11d
; SSE3-NEXT:    pextrw $4, %xmm0, %eax
; SSE3-NEXT:    pextrw $5, %xmm0, %r12d
; SSE3-NEXT:    addl %eax, %r12d
; SSE3-NEXT:    pextrw $6, %xmm0, %eax
; SSE3-NEXT:    pextrw $7, %xmm0, %r13d
; SSE3-NEXT:    addl %eax, %r13d
; SSE3-NEXT:    movd %xmm1, %eax
; SSE3-NEXT:    pextrw $1, %xmm1, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE3-NEXT:    pextrw $2, %xmm1, %eax
; SSE3-NEXT:    pextrw $3, %xmm1, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE3-NEXT:    pextrw $4, %xmm1, %eax
; SSE3-NEXT:    pextrw $5, %xmm1, %r14d
; SSE3-NEXT:    addl %eax, %r14d
; SSE3-NEXT:    pextrw $6, %xmm1, %esi
; SSE3-NEXT:    pextrw $7, %xmm1, %r15d
; SSE3-NEXT:    addl %esi, %r15d
; SSE3-NEXT:    movd %xmm2, %esi
; SSE3-NEXT:    pextrw $1, %xmm2, %ebp
; SSE3-NEXT:    addl %esi, %ebp
; SSE3-NEXT:    pextrw $2, %xmm2, %esi
; SSE3-NEXT:    pextrw $3, %xmm2, %edi
; SSE3-NEXT:    addl %esi, %edi
; SSE3-NEXT:    pextrw $4, %xmm2, %esi
; SSE3-NEXT:    pextrw $5, %xmm2, %eax
; SSE3-NEXT:    addl %esi, %eax
; SSE3-NEXT:    pextrw $6, %xmm2, %esi
; SSE3-NEXT:    pextrw $7, %xmm2, %ecx
; SSE3-NEXT:    addl %esi, %ecx
; SSE3-NEXT:    movd %xmm3, %ebx
; SSE3-NEXT:    pextrw $1, %xmm3, %r9d
; SSE3-NEXT:    addl %ebx, %r9d
; SSE3-NEXT:    pextrw $2, %xmm3, %edx
; SSE3-NEXT:    pextrw $3, %xmm3, %ebx
; SSE3-NEXT:    addl %edx, %ebx
; SSE3-NEXT:    pextrw $4, %xmm3, %edx
; SSE3-NEXT:    pextrw $5, %xmm3, %esi
; SSE3-NEXT:    addl %edx, %esi
; SSE3-NEXT:    pextrw $6, %xmm3, %r8d
; SSE3-NEXT:    pextrw $7, %xmm3, %edx
; SSE3-NEXT:    addl %r8d, %edx
; SSE3-NEXT:    movd %ecx, %xmm8
; SSE3-NEXT:    movd %eax, %xmm3
; SSE3-NEXT:    movd %edi, %xmm9
; SSE3-NEXT:    movd %ebp, %xmm4
; SSE3-NEXT:    movd %r13d, %xmm10
; SSE3-NEXT:    movd %r12d, %xmm7
; SSE3-NEXT:    movd %r11d, %xmm11
; SSE3-NEXT:    movd %r10d, %xmm0
; SSE3-NEXT:    movd %edx, %xmm12
; SSE3-NEXT:    movd %esi, %xmm6
; SSE3-NEXT:    movd %ebx, %xmm13
; SSE3-NEXT:    movd %r9d, %xmm5
; SSE3-NEXT:    movd %r15d, %xmm14
; SSE3-NEXT:    movd %r14d, %xmm2
; SSE3-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload
; SSE3-NEXT:    # xmm15 = mem[0],zero,zero,zero
; SSE3-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload
; SSE3-NEXT:    # xmm1 = mem[0],zero,zero,zero
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
; SSE3-NEXT:    popq %rbx
; SSE3-NEXT:    popq %r12
; SSE3-NEXT:    popq %r13
; SSE3-NEXT:    popq %r14
; SSE3-NEXT:    popq %r15
; SSE3-NEXT:    popq %rbp
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: avx2_hadd_w:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm2, %xmm0
; SSSE3-NEXT:    phaddw %xmm3, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: avx2_hadd_w:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vphaddw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: avx2_hadd_w:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %vecext = extractelement <16 x i16> %a, i32 0
  %vecext1 = extractelement <16 x i16> %a, i32 1
  %add = add i16 %vecext, %vecext1
  %vecinit = insertelement <16 x i16> undef, i16 %add, i32 0
  %vecext4 = extractelement <16 x i16> %a, i32 2
  %vecext6 = extractelement <16 x i16> %a, i32 3
  %add8 = add i16 %vecext4, %vecext6
  %vecinit10 = insertelement <16 x i16> %vecinit, i16 %add8, i32 1
  %vecext11 = extractelement <16 x i16> %a, i32 4
  %vecext13 = extractelement <16 x i16> %a, i32 5
  %add15 = add i16 %vecext11, %vecext13
  %vecinit17 = insertelement <16 x i16> %vecinit10, i16 %add15, i32 2
  %vecext18 = extractelement <16 x i16> %a, i32 6
  %vecext20 = extractelement <16 x i16> %a, i32 7
  %add22 = add i16 %vecext18, %vecext20
  %vecinit24 = insertelement <16 x i16> %vecinit17, i16 %add22, i32 3
  %vecext25 = extractelement <16 x i16> %a, i32 8
  %vecext27 = extractelement <16 x i16> %a, i32 9
  %add29 = add i16 %vecext25, %vecext27
  %vecinit31 = insertelement <16 x i16> %vecinit24, i16 %add29, i32 8
  %vecext32 = extractelement <16 x i16> %a, i32 10
  %vecext34 = extractelement <16 x i16> %a, i32 11
  %add36 = add i16 %vecext32, %vecext34
  %vecinit38 = insertelement <16 x i16> %vecinit31, i16 %add36, i32 9
  %vecext39 = extractelement <16 x i16> %a, i32 12
  %vecext41 = extractelement <16 x i16> %a, i32 13
  %add43 = add i16 %vecext39, %vecext41
  %vecinit45 = insertelement <16 x i16> %vecinit38, i16 %add43, i32 10
  %vecext46 = extractelement <16 x i16> %a, i32 14
  %vecext48 = extractelement <16 x i16> %a, i32 15
  %add50 = add i16 %vecext46, %vecext48
  %vecinit52 = insertelement <16 x i16> %vecinit45, i16 %add50, i32 11
  %vecext53 = extractelement <16 x i16> %b, i32 0
  %vecext55 = extractelement <16 x i16> %b, i32 1
  %add57 = add i16 %vecext53, %vecext55
  %vecinit59 = insertelement <16 x i16> %vecinit52, i16 %add57, i32 4
  %vecext60 = extractelement <16 x i16> %b, i32 2
  %vecext62 = extractelement <16 x i16> %b, i32 3
  %add64 = add i16 %vecext60, %vecext62
  %vecinit66 = insertelement <16 x i16> %vecinit59, i16 %add64, i32 5
  %vecext67 = extractelement <16 x i16> %b, i32 4
  %vecext69 = extractelement <16 x i16> %b, i32 5
  %add71 = add i16 %vecext67, %vecext69
  %vecinit73 = insertelement <16 x i16> %vecinit66, i16 %add71, i32 6
  %vecext74 = extractelement <16 x i16> %b, i32 6
  %vecext76 = extractelement <16 x i16> %b, i32 7
  %add78 = add i16 %vecext74, %vecext76
  %vecinit80 = insertelement <16 x i16> %vecinit73, i16 %add78, i32 7
  %vecext81 = extractelement <16 x i16> %b, i32 8
  %vecext83 = extractelement <16 x i16> %b, i32 9
  %add85 = add i16 %vecext81, %vecext83
  %vecinit87 = insertelement <16 x i16> %vecinit80, i16 %add85, i32 12
  %vecext88 = extractelement <16 x i16> %b, i32 10
  %vecext90 = extractelement <16 x i16> %b, i32 11
  %add92 = add i16 %vecext88, %vecext90
  %vecinit94 = insertelement <16 x i16> %vecinit87, i16 %add92, i32 13
  %vecext95 = extractelement <16 x i16> %b, i32 12
  %vecext97 = extractelement <16 x i16> %b, i32 13
  %add99 = add i16 %vecext95, %vecext97
  %vecinit101 = insertelement <16 x i16> %vecinit94, i16 %add99, i32 14
  %vecext102 = extractelement <16 x i16> %b, i32 14
  %vecext104 = extractelement <16 x i16> %b, i32 15
  %add106 = add i16 %vecext102, %vecext104
  %vecinit108 = insertelement <16 x i16> %vecinit101, i16 %add106, i32 15
  ret <16 x i16> %vecinit108
}