; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse3 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512

; Verify that we correctly generate 'addsub' instructions from
; a sequence of vector extracts + float add/sub + vector inserts.

define <4 x float> @test1(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test1:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 0
  %2 = extractelement <4 x float> %B, i32 0
  %sub = fsub float %1, %2
  %3 = extractelement <4 x float> %A, i32 2
  %4 = extractelement <4 x float> %B, i32 2
  %sub2 = fsub float %3, %4
  %5 = extractelement <4 x float> %A, i32 1
  %6 = extractelement <4 x float> %B, i32 1
  %add = fadd float %5, %6
  %7 = extractelement <4 x float> %A, i32 3
  %8 = extractelement <4 x float> %B, i32 3
  %add2 = fadd float %7, %8
  %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
  %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub, i32 0
  %vecinsert4 = insertelement <4 x float> %vecinsert3, float %sub2, i32 2
  ret <4 x float> %vecinsert4
}

define <4 x float> @test2(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test2:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 2
  %2 = extractelement <4 x float> %B, i32 2
  %sub2 = fsub float %1, %2
  %3 = extractelement <4 x float> %A, i32 3
  %4 = extractelement <4 x float> %B, i32 3
  %add2 = fadd float %3, %4
  %vecinsert1 = insertelement <4 x float> undef, float %sub2, i32 2
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
  ret <4 x float> %vecinsert2
}

define <4 x float> @test3(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test3:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test3:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 0
  %2 = extractelement <4 x float> %B, i32 0
  %sub = fsub float %1, %2
  %3 = extractelement <4 x float> %A, i32 3
  %4 = extractelement <4 x float> %B, i32 3
  %add = fadd float %4, %3
  %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 0
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add, i32 3
  ret <4 x float> %vecinsert2
}

define <4 x float> @test4(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test4:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test4:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 2
  %2 = extractelement <4 x float> %B, i32 2
  %sub = fsub float %1, %2
  %3 = extractelement <4 x float> %A, i32 1
  %4 = extractelement <4 x float> %B, i32 1
  %add = fadd float %3, %4
  %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 2
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add, i32 1
  ret <4 x float> %vecinsert2
}

define <4 x float> @test5(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test5:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test5:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 0
  %2 = extractelement <4 x float> %B, i32 0
  %sub2 = fsub float %1, %2
  %3 = extractelement <4 x float> %A, i32 1
  %4 = extractelement <4 x float> %B, i32 1
  %add2 = fadd float %3, %4
  %vecinsert1 = insertelement <4 x float> undef, float %sub2, i32 0
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 1
  ret <4 x float> %vecinsert2
}

define <4 x float> @test6(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test6:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test6:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 0
  %2 = extractelement <4 x float> %B, i32 0
  %sub = fsub float %1, %2
  %3 = extractelement <4 x float> %A, i32 2
  %4 = extractelement <4 x float> %B, i32 2
  %sub2 = fsub float %3, %4
  %5 = extractelement <4 x float> %A, i32 1
  %6 = extractelement <4 x float> %B, i32 1
  %add = fadd float %5, %6
  %7 = extractelement <4 x float> %A, i32 3
  %8 = extractelement <4 x float> %B, i32 3
  %add2 = fadd float %7, %8
  %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
  %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub, i32 0
  %vecinsert4 = insertelement <4 x float> %vecinsert3, float %sub2, i32 2
  ret <4 x float> %vecinsert4
}

define <4 x double> @test7(<4 x double> %A, <4 x double> %B) {
; SSE-LABEL: test7:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubpd %xmm2, %xmm0
; SSE-NEXT:    addsubpd %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test7:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsubpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %1 = extractelement <4 x double> %A, i32 0
  %2 = extractelement <4 x double> %B, i32 0
  %sub = fsub double %1, %2
  %3 = extractelement <4 x double> %A, i32 2
  %4 = extractelement <4 x double> %B, i32 2
  %sub2 = fsub double %3, %4
  %5 = extractelement <4 x double> %A, i32 1
  %6 = extractelement <4 x double> %B, i32 1
  %add = fadd double %5, %6
  %7 = extractelement <4 x double> %A, i32 3
  %8 = extractelement <4 x double> %B, i32 3
  %add2 = fadd double %7, %8
  %vecinsert1 = insertelement <4 x double> undef, double %add, i32 1
  %vecinsert2 = insertelement <4 x double> %vecinsert1, double %add2, i32 3
  %vecinsert3 = insertelement <4 x double> %vecinsert2, double %sub, i32 0
  %vecinsert4 = insertelement <4 x double> %vecinsert3, double %sub2, i32 2
  ret <4 x double> %vecinsert4
}

define <2 x double> @test8(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: test8:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test8:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsubpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <2 x double> %A, i32 0
  %2 = extractelement <2 x double> %B, i32 0
  %sub = fsub double %1, %2
  %3 = extractelement <2 x double> %A, i32 1
  %4 = extractelement <2 x double> %B, i32 1
  %add = fadd double %3, %4
  %vecinsert1 = insertelement <2 x double> undef, double %sub, i32 0
  %vecinsert2 = insertelement <2 x double> %vecinsert1, double %add, i32 1
  ret <2 x double> %vecinsert2
}

define <8 x float> @test9(<8 x float> %A, <8 x float> %B) {
; SSE-LABEL: test9:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubps %xmm2, %xmm0
; SSE-NEXT:    addsubps %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test9:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsubps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %1 = extractelement <8 x float> %A, i32 0
  %2 = extractelement <8 x float> %B, i32 0
  %sub = fsub float %1, %2
  %3 = extractelement <8 x float> %A, i32 2
  %4 = extractelement <8 x float> %B, i32 2
  %sub2 = fsub float %3, %4
  %5 = extractelement <8 x float> %A, i32 1
  %6 = extractelement <8 x float> %B, i32 1
  %add = fadd float %5, %6
  %7 = extractelement <8 x float> %A, i32 3
  %8 = extractelement <8 x float> %B, i32 3
  %add2 = fadd float %7, %8
  %9 = extractelement <8 x float> %A, i32 4
  %10 = extractelement <8 x float> %B, i32 4
  %sub3 = fsub float %9, %10
  %11 = extractelement <8 x float> %A, i32 6
  %12 = extractelement <8 x float> %B, i32 6
  %sub4 = fsub float %11, %12
  %13 = extractelement <8 x float> %A, i32 5
  %14 = extractelement <8 x float> %B, i32 5
  %add3 = fadd float %13, %14
  %15 = extractelement <8 x float> %A, i32 7
  %16 = extractelement <8 x float> %B, i32 7
  %add4 = fadd float %15, %16
  %vecinsert1 = insertelement <8 x float> undef, float %add, i32 1
  %vecinsert2 = insertelement <8 x float> %vecinsert1, float %add2, i32 3
  %vecinsert3 = insertelement <8 x float> %vecinsert2, float %sub, i32 0
  %vecinsert4 = insertelement <8 x float> %vecinsert3, float %sub2, i32 2
  %vecinsert5 = insertelement <8 x float> %vecinsert4, float %add3, i32 5
  %vecinsert6 = insertelement <8 x float> %vecinsert5, float %add4, i32 7
  %vecinsert7 = insertelement <8 x float> %vecinsert6, float %sub3, i32 4
  %vecinsert8 = insertelement <8 x float> %vecinsert7, float %sub4, i32 6
  ret <8 x float> %vecinsert8
}

; Verify that we don't generate addsub instruction for the following
; functions.

define <4 x float> @test10(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test10:
; SSE:       # %bb.0:
; SSE-NEXT:    subss %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test10:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 0
  %2 = extractelement <4 x float> %B, i32 0
  %sub = fsub float %1, %2
  %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 0
  ret <4 x float> %vecinsert1
}

define <4 x float> @test11(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test11:
; SSE:       # %bb.0:
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    subss %xmm1, %xmm0
; SSE-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test11:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1-NEXT:    retq
;
; AVX512-LABEL: test11:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vbroadcastss %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 2
  %2 = extractelement <4 x float> %B, i32 2
  %sub = fsub float %1, %2
  %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 2
  ret <4 x float> %vecinsert1
}

define <4 x float> @test12(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test12:
; SSE:       # %bb.0:
; SSE-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE-NEXT:    movshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE-NEXT:    addss %xmm0, %xmm1
; SSE-NEXT:    movsldup {{.*#+}} xmm0 = xmm1[0,0,2,2]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test12:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX1-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; AVX1-NEXT:    retq
;
; AVX512-LABEL: test12:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vbroadcastss %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 1
  %2 = extractelement <4 x float> %B, i32 1
  %add = fadd float %1, %2
  %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
  ret <4 x float> %vecinsert1
}

define <4 x float> @test13(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test13:
; SSE:       # %bb.0:
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE-NEXT:    addss %xmm0, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test13:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX1-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT:    retq
;
; AVX512-LABEL: test13:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vbroadcastss %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 3
  %2 = extractelement <4 x float> %B, i32 3
  %add = fadd float %1, %2
  %vecinsert1 = insertelement <4 x float> undef, float %add, i32 3
  ret <4 x float> %vecinsert1
}

define <4 x float> @test14(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test14:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    subss %xmm1, %xmm2
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    subss %xmm1, %xmm0
; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test14:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1],xmm0[0],xmm2[3]
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 0
  %2 = extractelement <4 x float> %B, i32 0
  %sub = fsub float %1, %2
  %3 = extractelement <4 x float> %A, i32 2
  %4 = extractelement <4 x float> %B, i32 2
  %sub2 = fsub float %3, %4
  %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 0
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %sub2, i32 2
  ret <4 x float> %vecinsert2
}

define <4 x float> @test15(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test15:
; SSE:       # %bb.0:
; SSE-NEXT:    movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE-NEXT:    movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE-NEXT:    addss %xmm3, %xmm2
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE-NEXT:    addss %xmm0, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[0,0]
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test15:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX1-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX1-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovsldup {{.*#+}} xmm1 = xmm2[0,0,2,2]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    retq
;
; AVX512-LABEL: test15:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vbroadcastss %xmm2, %xmm1
; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 1
  %2 = extractelement <4 x float> %B, i32 1
  %add = fadd float %1, %2
  %3 = extractelement <4 x float> %A, i32 3
  %4 = extractelement <4 x float> %B, i32 3
  %add2 = fadd float %3, %4
  %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
  ret <4 x float> %vecinsert2
}

define <4 x float> @test16(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test16:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    subss %xmm3, %xmm2
; SSE-NEXT:    movaps %xmm0, %xmm4
; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
; SSE-NEXT:    movaps %xmm1, %xmm5
; SSE-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
; SSE-NEXT:    subss %xmm5, %xmm4
; SSE-NEXT:    movshdup {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSE-NEXT:    addss %xmm3, %xmm5
; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE-NEXT:    addss %xmm0, %xmm1
; SSE-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0]
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test16:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX-NEXT:    vsubss %xmm2, %xmm0, %xmm3
; AVX-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
; AVX-NEXT:    vsubss %xmm5, %xmm4, %xmm4
; AVX-NEXT:    vmovshdup {{.*#+}} xmm5 = xmm0[1,1,3,3]
; AVX-NEXT:    vaddss %xmm2, %xmm5, %xmm2
; AVX-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 0
  %2 = extractelement <4 x float> %B, i32 0
  %sub = fsub float %1, 42.0
  %3 = extractelement <4 x float> %A, i32 2
  %4 = extractelement <4 x float> %B, i32 2
  %sub2 = fsub float %3, %4
  %5 = extractelement <4 x float> %A, i32 1
  %6 = extractelement <4 x float> %B, i32 1
  %add = fadd float %5, 42.0
  %7 = extractelement <4 x float> %A, i32 3
  %8 = extractelement <4 x float> %B, i32 3
  %add2 = fadd float %7, %8
  %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
  %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub, i32 0
  %vecinsert4 = insertelement <4 x float> %vecinsert3, float %sub2, i32 2
  ret <4 x float> %vecinsert4
}

define <2 x float> @test_v2f32(<2 x float> %v0, <2 x float> %v1) {
; SSE-LABEL: test_v2f32:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %v2 = extractelement <2 x float> %v0, i32 0
  %v3 = extractelement <2 x float> %v1, i32 0
  %v4 = extractelement <2 x float> %v0, i32 1
  %v5 = extractelement <2 x float> %v1, i32 1
  %sub = fsub float %v2, %v3
  %add = fadd float %v5, %v4
  %res0 = insertelement <2 x float> undef, float %sub, i32 0
  %res1 = insertelement <2 x float> %res0, float %add, i32 1
  ret <2 x float> %res1
}

define <16 x float> @test17(<16 x float> %A, <16 x float> %B) {
; SSE-LABEL: test17:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubps %xmm4, %xmm0
; SSE-NEXT:    addsubps %xmm5, %xmm1
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    movaps %xmm1, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: test17:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vaddsubps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vmovaps %ymm0, %ymm1
; AVX1-NEXT:    retq
;
; AVX512-LABEL: test17:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vsubss %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX512-NEXT:    vsubss %xmm4, %xmm3, %xmm3
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm5, %xmm4, %xmm4
; AVX512-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3]
; AVX512-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
; AVX512-NEXT:    vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX512-NEXT:    vpermilps {{.*#+}} xmm4 = xmm1[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm4, %xmm3, %xmm3
; AVX512-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX512-NEXT:    vsubss %xmm1, %xmm0, %xmm3
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
; AVX512-NEXT:    vsubss %xmm5, %xmm4, %xmm4
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm5 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm6 = xmm1[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm6, %xmm5, %xmm5
; AVX512-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3]
; AVX512-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0]
; AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %1 = extractelement <16 x float> %A, i32 0
  %2 = extractelement <16 x float> %B, i32 0
  %sub = fsub float %1, %2
  %3 = extractelement <16 x float> %A, i32 2
  %4 = extractelement <16 x float> %B, i32 2
  %sub2 = fsub float %3, %4
  %5 = extractelement <16 x float> %A, i32 1
  %6 = extractelement <16 x float> %B, i32 1
  %add = fadd float %5, %6
  %7 = extractelement <16 x float> %A, i32 3
  %8 = extractelement <16 x float> %B, i32 3
  %add2 = fadd float %7, %8
  %9 = extractelement <16 x float> %A, i32 4
  %10 = extractelement <16 x float> %B, i32 4
  %sub3 = fsub float %9, %10
  %11 = extractelement <16 x float> %A, i32 6
  %12 = extractelement <16 x float> %B, i32 6
  %sub4 = fsub float %11, %12
  %13 = extractelement <16 x float> %A, i32 5
  %14 = extractelement <16 x float> %B, i32 5
  %add3 = fadd float %13, %14
  %15 = extractelement <16 x float> %A, i32 7
  %16 = extractelement <16 x float> %B, i32 7
  %add4 = fadd float %15, %16
  %17 = extractelement <16 x float> %A, i32 8
  %18 = extractelement <16 x float> %B, i32 8
  %sub5 = fsub float %1, %2
  %19 = extractelement <16 x float> %A, i32 10
  %20 = extractelement <16 x float> %B, i32 10
  %sub6 = fsub float %3, %4
  %21 = extractelement <16 x float> %A, i32 9
  %22 = extractelement <16 x float> %B, i32 9
  %add5 = fadd float %5, %6
  %23 = extractelement <16 x float> %A, i32 11
  %24 = extractelement <16 x float> %B, i32 11
  %add6 = fadd float %7, %8
  %25 = extractelement <16 x float> %A, i32 12
  %26 = extractelement <16 x float> %B, i32 12
  %sub7 = fsub float %9, %10
  %27 = extractelement <16 x float> %A, i32 14
  %28 = extractelement <16 x float> %B, i32 14
  %sub8 = fsub float %11, %12
  %29 = extractelement <16 x float> %A, i32 13
  %30 = extractelement <16 x float> %B, i32 13
  %add7 = fadd float %13, %14
  %31 = extractelement <16 x float> %A, i32 15
  %32 = extractelement <16 x float> %B, i32 15
  %add8 = fadd float %15, %16
  %vecinsert1 = insertelement <16 x float> undef, float %add, i32 1
  %vecinsert2 = insertelement <16 x float> %vecinsert1, float %add2, i32 3
  %vecinsert3 = insertelement <16 x float> %vecinsert2, float %sub, i32 0
  %vecinsert4 = insertelement <16 x float> %vecinsert3, float %sub2, i32 2
  %vecinsert5 = insertelement <16 x float> %vecinsert4, float %add3, i32 5
  %vecinsert6 = insertelement <16 x float> %vecinsert5, float %add4, i32 7
  %vecinsert7 = insertelement <16 x float> %vecinsert6, float %sub3, i32 4
  %vecinsert8 = insertelement <16 x float> %vecinsert7, float %sub4, i32 6
  %vecinsert9 = insertelement <16 x float> %vecinsert8, float %add5, i32 9
  %vecinsert10 = insertelement <16 x float> %vecinsert9, float %add6, i32 11
  %vecinsert11 = insertelement <16 x float> %vecinsert10, float %sub5, i32 8
  %vecinsert12 = insertelement <16 x float> %vecinsert11, float %sub6, i32 10
  %vecinsert13 = insertelement <16 x float> %vecinsert12, float %add7, i32 13
  %vecinsert14 = insertelement <16 x float> %vecinsert13, float %add8, i32 15
  %vecinsert15 = insertelement <16 x float> %vecinsert14, float %sub7, i32 12
  %vecinsert16 = insertelement <16 x float> %vecinsert15, float %sub8, i32 14
  ret <16 x float> %vecinsert16
}

define <8 x double> @test18(<8 x double> %A, <8 x double> %B) {
; SSE-LABEL: test18:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubpd %xmm4, %xmm0
; SSE-NEXT:    addsubpd %xmm5, %xmm1
; SSE-NEXT:    addsubpd %xmm6, %xmm2
; SSE-NEXT:    addsubpd %xmm7, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: test18:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vaddsubpd %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vaddsubpd %ymm3, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX512-LABEL: test18:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vsubsd %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm4
; AVX512-NEXT:    vsubsd %xmm4, %xmm3, %xmm5
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm6 = xmm0[1,0]
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm7 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm7, %xmm6, %xmm6
; AVX512-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm6[0]
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
; AVX512-NEXT:    vaddsd %xmm4, %xmm3, %xmm3
; AVX512-NEXT:    vunpcklpd {{.*#+}} xmm3 = xmm5[0],xmm3[0]
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm4
; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm5
; AVX512-NEXT:    vsubsd %xmm5, %xmm4, %xmm6
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
; AVX512-NEXT:    vsubsd %xmm1, %xmm0, %xmm7
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
; AVX512-NEXT:    vaddsd %xmm5, %xmm4, %xmm4
; AVX512-NEXT:    vunpcklpd {{.*#+}} xmm4 = xmm6[0],xmm4[0]
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm7[0],xmm0[0]
; AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm4, %ymm0
; AVX512-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm1
; AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512-NEXT:    retq
  %1 = extractelement <8 x double> %A, i32 0
  %2 = extractelement <8 x double> %B, i32 0
  %sub = fsub double %1, %2
  %3 = extractelement <8 x double> %A, i32 2
  %4 = extractelement <8 x double> %B, i32 2
  %sub2 = fsub double %3, %4
  %5 = extractelement <8 x double> %A, i32 1
  %6 = extractelement <8 x double> %B, i32 1
  %add = fadd double %5, %6
  %7 = extractelement <8 x double> %A, i32 3
  %8 = extractelement <8 x double> %B, i32 3
  %add2 = fadd double %7, %8
  %9 = extractelement <8 x double> %A, i32 4
  %10 = extractelement <8 x double> %B, i32 4
  %sub3 = fsub double %9, %10
  %11 = extractelement <8 x double> %A, i32 6
  %12 = extractelement <8 x double> %B, i32 6
  %sub4 = fsub double %11, %12
  %13 = extractelement <8 x double> %A, i32 5
  %14 = extractelement <8 x double> %B, i32 5
  %add3 = fadd double %13, %14
  %15 = extractelement <8 x double> %A, i32 7
  %16 = extractelement <8 x double> %B, i32 7
  %add4 = fadd double %15, %16
  %vecinsert1 = insertelement <8 x double> undef, double %add, i32 1
  %vecinsert2 = insertelement <8 x double> %vecinsert1, double %add2, i32 3
  %vecinsert3 = insertelement <8 x double> %vecinsert2, double %sub, i32 0
  %vecinsert4 = insertelement <8 x double> %vecinsert3, double %sub2, i32 2
  %vecinsert5 = insertelement <8 x double> %vecinsert4, double %add3, i32 5
  %vecinsert6 = insertelement <8 x double> %vecinsert5, double %add4, i32 7
  %vecinsert7 = insertelement <8 x double> %vecinsert6, double %sub3, i32 4
  %vecinsert8 = insertelement <8 x double> %vecinsert7, double %sub4, i32 6
  ret <8 x double> %vecinsert8
}