; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSE-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSE,SSE-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512vl,fast-hops | FileCheck %s --check-prefixes=AVX,AVX512

; Verify that we correctly fold horizontal binops even in the presence of UNDEFs.

define <8 x i32> @test14_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test14_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test14_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %b, i32 2
  %vecext3 = extractelement <8 x i32> %b, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 3
  ret <8 x i32> %vecinit5
}

; Verify that we select integer horizontal adds instead of two scalar adds followed by vector inserts.
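;
; Reminder of the phaddd semantics relied on throughout (AT&T operand order):
;   phaddd %src, %dst
;   => dst = [ dst[0]+dst[1], dst[2]+dst[3], src[0]+src[1], src[2]+src[3] ]
; so a single phaddd can cover two pairwise adds drawn from two registers.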
define <8 x i32> @test15_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-SLOW-LABEL: test15_undef:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    movd %xmm0, %eax
; SSE-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE-SLOW-NEXT:    movd %xmm0, %ecx
; SSE-SLOW-NEXT:    addl %eax, %ecx
; SSE-SLOW-NEXT:    movd %xmm3, %eax
; SSE-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1]
; SSE-SLOW-NEXT:    movd %xmm0, %edx
; SSE-SLOW-NEXT:    addl %eax, %edx
; SSE-SLOW-NEXT:    movd %ecx, %xmm0
; SSE-SLOW-NEXT:    movd %edx, %xmm1
; SSE-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: test15_undef:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    movdqa %xmm3, %xmm1
; SSE-FAST-NEXT:    phaddd %xmm0, %xmm0
; SSE-FAST-NEXT:    phaddd %xmm3, %xmm1
; SSE-FAST-NEXT:    retq
;
; AVX1-SLOW-LABEL: test15_undef:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    vpextrd $1, %xmm0, %ecx
; AVX1-SLOW-NEXT:    addl %eax, %ecx
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    vpextrd $1, %xmm0, %edx
; AVX1-SLOW-NEXT:    addl %eax, %edx
; AVX1-SLOW-NEXT:    vmovd %ecx, %xmm0
; AVX1-SLOW-NEXT:    vmovd %edx, %xmm1
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test15_undef:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-FAST-NEXT:    vphaddd %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test15_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test15_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %b, i32 4
  %vecext3 = extractelement <8 x i32> %b, i32 5
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 6
  ret <8 x i32> %vecinit5
}

define <8 x i32> @PR40243_alt(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: PR40243_alt:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddd %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: PR40243_alt:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: PR40243_alt:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: PR40243_alt:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %a4 = extractelement <8 x i32> %a, i32 4
  %a5 = extractelement <8 x i32> %a, i32 5
  %add4 = add i32 %a4, %a5
  %b6 = extractelement <8 x i32> %b, i32 6
  %b7 = extractelement <8 x i32> %b, i32 7
  %add7 = add i32 %b6, %b7
  %r4 = insertelement <8 x i32> undef, i32 %add4, i32 4
  %r = insertelement <8 x i32> %r4, i32 %add7, i32 7
  ret <8 x i32> %r
}

define <8 x i32> @test16_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test16_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test16_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %a, i32 2
  %vecext3 = extractelement <8 x i32> %a, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
  ret <8 x i32> %vecinit5
}

define <16 x i32> @test16_v16i32_undef(<16 x i32> %a, <16 x i32> %b) {
; SSE-LABEL: test16_v16i32_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test16_v16i32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <16 x i32> %a, i32 0
  %vecext1 = extractelement <16 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <16 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <16 x i32> %a, i32 2
  %vecext3 = extractelement <16 x i32> %a, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <16 x i32> %vecinit, i32 %add4, i32 1
  ret <16 x i32> %vecinit5
}

define <8 x i32> @test17_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test17_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test17_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test17_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test17_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add1 = add i32 %vecext, %vecext1
  %vecinit1 = insertelement <8 x i32> undef, i32 %add1, i32 0
  %vecext2 = extractelement <8 x i32> %a, i32 2
  %vecext3 = extractelement <8 x i32> %a, i32 3
  %add2 = add i32 %vecext2, %vecext3
  %vecinit2 = insertelement <8 x i32> %vecinit1, i32 %add2, i32 1
  %vecext4 = extractelement <8 x i32> %a, i32 4
  %vecext5 = extractelement <8 x i32> %a, i32 5
  %add3 = add i32 %vecext4, %vecext5
  %vecinit3 = insertelement <8 x i32> %vecinit2, i32 %add3, i32 2
  %vecext6 = extractelement <8 x i32> %a, i32 6
  %vecext7 = extractelement <8 x i32> %a, i32 7
  %add4 = add i32 %vecext6, %vecext7
  %vecinit4 = insertelement <8 x i32> %vecinit3, i32 %add4, i32 3
  ret <8 x i32> %vecinit4
}

define <16 x i32> @test17_v16i32_undef(<16 x i32> %a, <16 x i32> %b) {
; SSE-LABEL: test17_v16i32_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test17_v16i32_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test17_v16i32_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test17_v16i32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %vecext = extractelement <16 x i32> %a, i32 0
  %vecext1 = extractelement <16 x i32> %a, i32 1
  %add1 = add i32 %vecext, %vecext1
  %vecinit1 = insertelement <16 x i32> undef, i32 %add1, i32 0
  %vecext2 = extractelement <16 x i32> %a, i32 2
  %vecext3 = extractelement <16 x i32> %a, i32 3
  %add2 = add i32 %vecext2, %vecext3
  %vecinit2 = insertelement <16 x i32> %vecinit1, i32 %add2, i32 1
  %vecext4 = extractelement <16 x i32> %a, i32 4
  %vecext5 = extractelement <16 x i32> %a, i32 5
  %add3 = add i32 %vecext4, %vecext5
  %vecinit3 = insertelement <16 x i32> %vecinit2, i32 %add3, i32 2
  %vecext6 = extractelement <16 x i32> %a, i32 6
  %vecext7 = extractelement <16 x i32> %a, i32 7
  %add4 = add i32 %vecext6, %vecext7
  %vecinit4 = insertelement <16 x i32> %vecinit3, i32 %add4, i32 3
  ret <16 x i32> %vecinit4
}
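
; For reference, a minimal sketch of the fully-defined pattern the UNDEF tests
; above are derived from: all four result lanes are pairwise sums, which folds
; to a single phaddd on its own. The function name is illustrative only and
; carries no autogenerated assertions.
define <4 x i32> @hadd_v4i32_sketch(<4 x i32> %a, <4 x i32> %b) {
  ; lanes 0..1: adjacent pairs of %a
  %a0 = extractelement <4 x i32> %a, i32 0
  %a1 = extractelement <4 x i32> %a, i32 1
  %s0 = add i32 %a0, %a1
  %a2 = extractelement <4 x i32> %a, i32 2
  %a3 = extractelement <4 x i32> %a, i32 3
  %s1 = add i32 %a2, %a3
  ; lanes 2..3: adjacent pairs of %b
  %b0 = extractelement <4 x i32> %b, i32 0
  %b1 = extractelement <4 x i32> %b, i32 1
  %s2 = add i32 %b0, %b1
  %b2 = extractelement <4 x i32> %b, i32 2
  %b3 = extractelement <4 x i32> %b, i32 3
  %s3 = add i32 %b2, %b3
  %r0 = insertelement <4 x i32> undef, i32 %s0, i32 0
  %r1 = insertelement <4 x i32> %r0, i32 %s1, i32 1
  %r2 = insertelement <4 x i32> %r1, i32 %s2, i32 2
  %r3 = insertelement <4 x i32> %r2, i32 %s3, i32 3
  ret <4 x i32> %r3
}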