; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2

; Verify that we correctly fold horizontal binop even in the presence of UNDEFs.

; Result lanes 0,1 come from pairwise fadds of %a, lane 3 from %b[2]+%b[3];
; lane 2 is left undef.  Still folds to a single (v)haddps of %a and %b.
define <4 x float> @test1_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test1_undef:
; SSE: # BB#0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test1_undef:
; AVX: # BB#0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %a, i32 2
  %vecext3 = extractelement <4 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
  %vecext10 = extractelement <4 x float> %b, i32 2
  %vecext11 = extractelement <4 x float> %b, i32 3
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit5, float %add12, i32 3
  ret <4 x float> %vecinit13
}

; Lane 0 from %a[0]+%a[1], lanes 2,3 from pairwise fadds of %b; lane 1 is
; left undef.  Still folds to a single (v)haddps.
define <4 x float> @test2_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test2_undef:
; SSE: # BB#0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test2_undef:
; AVX: # BB#0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext6 = extractelement <4 x float> %b, i32 0
  %vecext7 = extractelement <4 x float> %b, i32 1
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit, float %add8, i32 2
  %vecext10 = extractelement <4 x float> %b, i32 2
  %vecext11 = extractelement <4 x float> %b, i32 3
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 3
  ret <4 x float> %vecinit13
}

; Lanes 0,1 from pairwise fadds of %a, lane 2 from %b[0]+%b[1]; lane 3 is
; left undef.  Still folds to a single (v)haddps.
define <4 x float> @test3_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test3_undef:
; SSE: # BB#0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test3_undef:
; AVX: # BB#0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %a, i32 2
  %vecext3 = extractelement <4 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
  %vecext6 = extractelement <4 x float> %b, i32 0
  %vecext7 = extractelement <4 x float> %b, i32 1
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 2
  ret <4 x float> %vecinit9
}

; Only lane 0 is defined (%a[0]+%a[1]); a lone scalar addss is preferred
; over a horizontal add here.
define <4 x float> @test4_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test4_undef:
; SSE: # BB#0:
; SSE-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE-NEXT: addss %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test4_undef:
; AVX: # BB#0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  ret <4 x float> %vecinit
}

; v2f64 variant with only lane 0 defined; a scalar addsd is used instead of
; a horizontal add.
define <2 x double> @test5_undef(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test5_undef:
; SSE: # BB#0:
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1,0]
; SSE-NEXT: addsd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test5_undef:
; AVX: # BB#0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <2 x double> %a, i32 0
  %vecext1 = extractelement <2 x double> %a, i32 1
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %add, i32 0
  ret <2 x double> %vecinit
}

; Lanes 0,1 from pairwise fadds of %a only; lanes 2,3 undef.  Folds to a
; horizontal add of %a with itself.
define <4 x float> @test6_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test6_undef:
; SSE: # BB#0:
; SSE-NEXT: haddps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test6_undef:
; AVX: # BB#0:
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %a, i32 2
  %vecext3 = extractelement <4 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
  ret <4 x float> %vecinit5
}

; Only lanes 2,3 are defined, both from pairwise fadds of %b; lanes 0,1
; undef.  Still folds to a single (v)haddps.
define <4 x float> @test7_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test7_undef:
; SSE: # BB#0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test7_undef:
; AVX: # BB#0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x float> %b, i32 0
  %vecext1 = extractelement <4 x float> %b, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 2
  %vecext2 = extractelement <4 x float> %b, i32 2
  %vecext3 = extractelement <4 x float> %b, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 3
  ret <4 x float> %vecinit5
}

; Sums land in lanes 0 and 2, which does not match the haddps lane layout
; (0 and 1), so no horizontal add is formed; scalar code is emitted.
define <4 x float> @test8_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test8_undef:
; SSE: # BB#0:
; SSE-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE-NEXT: addss %xmm0, %xmm1
; SSE-NEXT: movaps %xmm0, %xmm2
; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1,0]
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE-NEXT: addss %xmm2, %xmm0
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1,1,3]
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test8_undef:
; AVX: # BB#0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm1
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-NEXT: vaddss %xmm0, %xmm2, %xmm0
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT: retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %a, i32 2
  %vecext3 = extractelement <4 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 2
  ret <4 x float> %vecinit5
}

; Lane 0 from %a[0]+%a[1], lane 3 from %b[2]+%b[3]; middle lanes undef.
; Matches the haddps lane layout, so a single (v)haddps is emitted.
define <4 x float> @test9_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test9_undef:
; SSE: # BB#0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test9_undef:
; AVX: # BB#0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %b, i32 2
  %vecext3 = extractelement <4 x float> %b, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 3
  ret <4 x float> %vecinit5
}

; 256-bit version of test9; only the low 128-bit lane of the result is
; defined, so SSE needs just one haddps while AVX uses a full vhaddps ymm.
define <8 x float> @test10_undef(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: test10_undef:
; SSE: # BB#0:
; SSE-NEXT: haddps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test10_undef:
; AVX: # BB#0:
; AVX-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <8 x float> undef, float %add, i32 0
  %vecext2 = extractelement <8 x float> %b, i32 2
  %vecext3 = extractelement <8 x float> %b, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 3
  ret <8 x float> %vecinit5
}

; Defined lanes 0 and 6 live in different 128-bit halves: SSE emits two
; scalar adds, while AVX can still use a single vhaddps ymm.
define <8 x float> @test11_undef(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: test11_undef:
; SSE: # BB#0:
; SSE-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE-NEXT: addss %xmm1, %xmm0
; SSE-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSE-NEXT: addss %xmm3, %xmm1
; SSE-NEXT: movddup {{.*#+}} xmm1 = xmm1[0,0]
; SSE-NEXT: retq
;
; AVX-LABEL: test11_undef:
; AVX: # BB#0:
; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0
; AVX-NEXT: retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <8 x float> undef, float %add, i32 0
  %vecext2 = extractelement <8 x float> %b, i32 4
  %vecext3 = extractelement <8 x float> %b, i32 5
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 6
  ret <8 x float> %vecinit5
}

; Only lanes 0,1 defined, both from the low half of %a; a self horizontal
; add suffices (xmm for SSE, ymm for AVX).
define <8 x float> @test12_undef(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: test12_undef:
; SSE: # BB#0:
; SSE-NEXT: haddps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test12_undef:
; AVX: # BB#0:
; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0
; AVX-NEXT: retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <8 x float> undef, float %add, i32 0
  %vecext2 = extractelement <8 x float> %a, i32 2
  %vecext3 = extractelement <8 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 1
  ret <8 x float> %vecinit5
}

; All four pairwise sums of %a land in lanes 0-3; the upper half of the
; result is undef, so a single 128-bit horizontal add of the two halves of
; %a is enough.
define <8 x float> @test13_undef(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: test13_undef:
; SSE: # BB#0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test13_undef:
; AVX: # BB#0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %add1 = fadd float %vecext, %vecext1
  %vecinit1 = insertelement <8 x float> undef, float %add1, i32 0
  %vecext2 = extractelement <8 x float> %a, i32 2
  %vecext3 = extractelement <8 x float> %a, i32 3
  %add2 = fadd float %vecext2, %vecext3
  %vecinit2 = insertelement <8 x float> %vecinit1, float %add2, i32 1
  %vecext4 = extractelement <8 x float> %a, i32 4
  %vecext5 = extractelement <8 x float> %a, i32 5
  %add3 = fadd float %vecext4, %vecext5
  %vecinit3 = insertelement <8 x float> %vecinit2, float %add3, i32 2
  %vecext6 = extractelement <8 x float> %a, i32 6
  %vecext7 = extractelement <8 x float> %a, i32 7
  %add4 = fadd float %vecext6, %vecext7
  %vecinit4 = insertelement <8 x float> %vecinit3, float %add4, i32 3
  ret <8 x float> %vecinit4
}

; Integer (v8i32) version: lane 0 from %a[0]+%a[1], lane 3 from %b[2]+%b[3].
; Folds to phaddd (128-bit on SSSE3/AVX1, 256-bit on AVX2).
define <8 x i32> @test14_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test14_undef:
; SSE: # BB#0:
; SSE-NEXT: phaddd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test14_undef:
; AVX1: # BB#0:
; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test14_undef:
; AVX2: # BB#0:
; AVX2-NEXT: vphaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %b, i32 2
  %vecext3 = extractelement <8 x i32> %b, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 3
  ret <8 x i32> %vecinit5
}

; On AVX2, the following sequence can be folded into a single horizontal add.
; If the Subtarget doesn't support AVX2, then we avoid emitting two packed
; integer horizontal adds instead of two scalar adds followed by vector inserts.
; Defined lanes 0 and 6 live in different 128-bit halves (see the comment
; above): scalar adds are emitted unless AVX2's 256-bit vphaddd is available.
define <8 x i32> @test15_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test15_undef:
; SSE: # BB#0:
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-NEXT: movd %xmm0, %ecx
; SSE-NEXT: addl %eax, %ecx
; SSE-NEXT: movd %xmm3, %eax
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
; SSE-NEXT: movd %xmm0, %edx
; SSE-NEXT: addl %eax, %edx
; SSE-NEXT: movd %ecx, %xmm0
; SSE-NEXT: movd %edx, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: test15_undef:
; AVX1: # BB#0:
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vpextrd $1, %xmm0, %ecx
; AVX1-NEXT: addl %eax, %ecx
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vpextrd $1, %xmm0, %edx
; AVX1-NEXT: addl %eax, %edx
; AVX1-NEXT: vmovd %ecx, %xmm0
; AVX1-NEXT: vmovd %edx, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test15_undef:
; AVX2: # BB#0:
; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT: retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %b, i32 4
  %vecext3 = extractelement <8 x i32> %b, i32 5
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 6
  ret <8 x i32> %vecinit5
}

; Only lanes 0,1 defined, both from the low half of %a; a self horizontal
; add suffices (xmm before AVX2, ymm with AVX2).
define <8 x i32> @test16_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test16_undef:
; SSE: # BB#0:
; SSE-NEXT: phaddd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test16_undef:
; AVX1: # BB#0:
; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test16_undef:
; AVX2: # BB#0:
; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT: retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %a, i32 2
  %vecext3 = extractelement <8 x i32> %a, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
  ret <8 x i32> %vecinit5
}

; All four pairwise sums of %a land in lanes 0-3; the upper half of the
; result is undef, so a single 128-bit phaddd of the two halves of %a is
; enough on both AVX1 and AVX2.
define <8 x i32> @test17_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test17_undef:
; SSE: # BB#0:
; SSE-NEXT: phaddd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test17_undef:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test17_undef:
; AVX2: # BB#0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add1 = add i32 %vecext, %vecext1
  %vecinit1 = insertelement <8 x i32> undef, i32 %add1, i32 0
  %vecext2 = extractelement <8 x i32> %a, i32 2
  %vecext3 = extractelement <8 x i32> %a, i32 3
  %add2 = add i32 %vecext2, %vecext3
  %vecinit2 = insertelement <8 x i32> %vecinit1, i32 %add2, i32 1
  %vecext4 = extractelement <8 x i32> %a, i32 4
  %vecext5 = extractelement <8 x i32> %a, i32 5
  %add3 = add i32 %vecext4, %vecext5
  %vecinit3 = insertelement <8 x i32> %vecinit2, i32 %add3, i32 2
  %vecext6 = extractelement <8 x i32> %a, i32 6
  %vecext7 = extractelement <8 x i32> %a, i32 7
  %add4 = add i32 %vecext6, %vecext7
  %vecinit4 = insertelement <8 x i32> %vecinit3, i32 %add4, i32 3
  ret <8 x i32> %vecinit4
}