; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512

;
; vXi64
;
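; i64 reductions are lowered by repeatedly halving the vector: wider inputs
; are first narrowed with 256/512-bit extracts and full-width adds, then a
; shuffle folds the high quadword onto the low one for a final PADDQ before
; MOVQ copies element 0 into %rax.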

define i64 @test_v2i64(<2 x i64> %a0) {
; SSE-LABEL: test_v2i64:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: paddq %xmm0, %xmm1
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2i64:
; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: retq
  %1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a0)
  ret i64 %1
}

define i64 @test_v4i64(<4 x i64> %a0) {
; SSE-LABEL: test_v4i64:
; SSE: # %bb.0:
; SSE-NEXT: paddq %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: paddq %xmm0, %xmm1
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v4i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a0)
  ret i64 %1
}

define i64 @test_v8i64(<8 x i64> %a0) {
; SSE-LABEL: test_v8i64:
; SSE: # %bb.0:
; SSE-NEXT: paddq %xmm3, %xmm1
; SSE-NEXT: paddq %xmm2, %xmm1
; SSE-NEXT: paddq %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT: paddq %xmm1, %xmm0
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v8i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v8i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a0)
  ret i64 %1
}

define i64 @test_v16i64(<16 x i64> %a0) {
; SSE-LABEL: test_v16i64:
; SSE: # %bb.0:
; SSE-NEXT: paddq %xmm6, %xmm2
; SSE-NEXT: paddq %xmm7, %xmm3
; SSE-NEXT: paddq %xmm5, %xmm3
; SSE-NEXT: paddq %xmm1, %xmm3
; SSE-NEXT: paddq %xmm4, %xmm2
; SSE-NEXT: paddq %xmm3, %xmm2
; SSE-NEXT: paddq %xmm0, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE-NEXT: paddq %xmm2, %xmm0
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v16i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v16i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a0)
  ret i64 %1
}

;
; vXi32
;
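; i32 reductions use the same halving strategy, ending with a [1,1,1,1]
; shuffle to fold in the last odd element; the +fast-hops AVX1 runs
; (AVX1-FAST) replace the trailing shuffle+VPADDD pairs with VPHADDD.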

define i32 @test_v2i32(<2 x i32> %a0) {
; SSE-LABEL: test_v2i32:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE-NEXT: paddd %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v2i32:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v2i32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v2i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v2i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: retq
  %1 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a0)
  ret i32 %1
}

define i32 @test_v4i32(<4 x i32> %a0) {
; SSE-LABEL: test_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: paddd %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE-NEXT: paddd %xmm1, %xmm0
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v4i32:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v4i32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: retq
  %1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a0)
  ret i32 %1
}

define i32 @test_v8i32(<8 x i32> %a0) {
; SSE-LABEL: test_v8i32:
; SSE: # %bb.0:
; SSE-NEXT: paddd %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: paddd %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE-NEXT: paddd %xmm1, %xmm0
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v8i32:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v8i32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a0)
  ret i32 %1
}

define i32 @test_v16i32(<16 x i32> %a0) {
; SSE-LABEL: test_v16i32:
; SSE: # %bb.0:
; SSE-NEXT: paddd %xmm3, %xmm1
; SSE-NEXT: paddd %xmm2, %xmm1
; SSE-NEXT: paddd %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT: paddd %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE-NEXT: paddd %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v16i32:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2
; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v16i32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2
; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v16i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a0)
  ret i32 %1
}

define i32 @test_v32i32(<32 x i32> %a0) {
; SSE-LABEL: test_v32i32:
; SSE: # %bb.0:
; SSE-NEXT: paddd %xmm6, %xmm2
; SSE-NEXT: paddd %xmm7, %xmm3
; SSE-NEXT: paddd %xmm5, %xmm3
; SSE-NEXT: paddd %xmm1, %xmm3
; SSE-NEXT: paddd %xmm4, %xmm2
; SSE-NEXT: paddd %xmm3, %xmm2
; SSE-NEXT: paddd %xmm0, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE-NEXT: paddd %xmm2, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE-NEXT: paddd %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v32i32:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm1, %xmm4
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm2, %xmm2
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v32i32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm4
; AVX1-FAST-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm1
; AVX1-FAST-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; AVX1-FAST-NEXT: vpaddd %xmm4, %xmm2, %xmm2
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v32i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v32i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %a0)
  ret i32 %1
}

;
; vXi16
;
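; i16 reductions again halve down to a single dword, then use PSRLD $16 to
; bring the remaining odd word into element 0 for the last PADDW; AVX1-FAST
; finishes with VPHADDW instead.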

define i16 @test_v2i16(<2 x i16> %a0) {
; SSE-LABEL: test_v2i16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrld $16, %xmm1
; SSE-NEXT: paddw %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v2i16:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v2i16:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v2i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v2i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: retq
  %1 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a0)
  ret i16 %1
}

define i16 @test_v4i16(<4 x i16> %a0) {
; SSE-LABEL: test_v4i16:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE-NEXT: paddw %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: psrld $16, %xmm0
; SSE-NEXT: paddw %xmm1, %xmm0
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v4i16:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v4i16:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v4i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v4i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: retq
  %1 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a0)
  ret i16 %1
}

define i16 @test_v8i16(<8 x i16> %a0) {
; SSE-LABEL: test_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: paddw %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE-NEXT: paddw %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrld $16, %xmm1
; SSE-NEXT: paddw %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v8i16:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v8i16:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: retq
  %1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a0)
  ret i16 %1
}

define i16 @test_v16i16(<16 x i16> %a0) {
; SSE-LABEL: test_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: paddw %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: paddw %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE-NEXT: paddw %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrld $16, %xmm1
; SSE-NEXT: paddw %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v16i16:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v16i16:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a0)
  ret i16 %1
}

define i16 @test_v32i16(<32 x i16> %a0) {
; SSE-LABEL: test_v32i16:
; SSE: # %bb.0:
; SSE-NEXT: paddw %xmm3, %xmm1
; SSE-NEXT: paddw %xmm2, %xmm1
; SSE-NEXT: paddw %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT: paddw %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE-NEXT: paddw %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: psrld $16, %xmm0
; SSE-NEXT: paddw %xmm1, %xmm0
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v32i16:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-SLOW-NEXT: vpaddw %xmm2, %xmm3, %xmm2
; AVX1-SLOW-NEXT: vpaddw %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v32i16:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-FAST-NEXT: vpaddw %xmm2, %xmm3, %xmm2
; AVX1-FAST-NEXT: vpaddw %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v32i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v32i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %a0)
  ret i16 %1
}

define i16 @test_v64i16(<64 x i16> %a0) {
; SSE-LABEL: test_v64i16:
; SSE: # %bb.0:
; SSE-NEXT: paddw %xmm6, %xmm2
; SSE-NEXT: paddw %xmm7, %xmm3
; SSE-NEXT: paddw %xmm5, %xmm3
; SSE-NEXT: paddw %xmm1, %xmm3
; SSE-NEXT: paddw %xmm4, %xmm2
; SSE-NEXT: paddw %xmm3, %xmm2
; SSE-NEXT: paddw %xmm0, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE-NEXT: paddw %xmm2, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE-NEXT: paddw %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: psrld $16, %xmm0
; SSE-NEXT: paddw %xmm1, %xmm0
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v64i16:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vpaddw %xmm3, %xmm1, %xmm4
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-SLOW-NEXT: vpaddw %xmm3, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm3, %xmm1
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm3, %xmm1
; AVX1-SLOW-NEXT: vpaddw %xmm4, %xmm2, %xmm2
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm2, %xmm1
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v64i16:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vpaddw %xmm3, %xmm1, %xmm4
; AVX1-FAST-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-FAST-NEXT: vpaddw %xmm3, %xmm1, %xmm1
; AVX1-FAST-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm3, %xmm1
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm3, %xmm1
; AVX1-FAST-NEXT: vpaddw %xmm4, %xmm2, %xmm2
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm2, %xmm1
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v64i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpaddw %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v64i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %a0)
  ret i16 %1
}

;
; vXi8
;
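; i8 reductions halve down to 64 bits with PADDB, then finish with a single
; PSADBW against an all-zeroes register: the sum of absolute differences
; with zero adds each group of eight bytes into one 64-bit lane.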

define i8 @test_v2i8(<2 x i8> %a0) {
; SSE-LABEL: test_v2i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrlw $8, %xmm1
; SSE-NEXT: paddb %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: # kill: def $al killed $al killed $eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2i8:
; AVX: # %bb.0:
; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: retq
  %1 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %a0)
  ret i8 %1
}

define i8 @test_v2i8_load(<2 x i8>* %p) {
; SSE-LABEL: test_v2i8_load:
; SSE: # %bb.0:
; SSE-NEXT: movzwl (%rdi), %eax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrlw $8, %xmm1
; SSE-NEXT: paddb %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: # kill: def $al killed $al killed $eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2i8_load:
; AVX: # %bb.0:
; AVX-NEXT: movzwl (%rdi), %eax
; AVX-NEXT: vmovd %eax, %xmm0
; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2i8_load:
; AVX512: # %bb.0:
; AVX512-NEXT: movzwl (%rdi), %eax
; AVX512-NEXT: vmovd %eax, %xmm0
; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: retq
  %a0 = load <2 x i8>, <2 x i8>* %p
  %1 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %a0)
  ret i8 %1
}

define i8 @test_v4i8(<4 x i8> %a0) {
; SSE2-LABEL: test_v4i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: psadbw %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; SSE41-NEXT: psadbw %xmm1, %xmm0
; SSE41-NEXT: movd %xmm0, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4i8:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: retq
  %1 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %a0)
  ret i8 %1
}
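
; When the <4 x i8> operand is loaded from memory, MOVD zero-extends it to a
; full XMM register, so PSADBW can be applied directly without the
; blend-with-zero masking that the register form of test_v4i8 needs above.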
define i8 @test_v4i8_load(<4 x i8>* %p) {
; SSE-LABEL: test_v4i8_load:
; SSE: # %bb.0:
; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: psadbw %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: # kill: def $al killed $al killed $eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4i8_load:
; AVX: # %bb.0:
; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4i8_load:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: retq
  %a0 = load <4 x i8>, <4 x i8>* %p
  %1 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %a0)
  ret i8 %1
}

define i8 @test_v8i8(<8 x i8> %a0) {
; SSE-LABEL: test_v8i8:
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: psadbw %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: # kill: def $al killed $al killed $eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v8i8:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: retq
  %1 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a0)
  ret i8 %1
}

define i8 @test_v8i8_load(<8 x i8>* %p) {
; SSE-LABEL: test_v8i8_load:
; SSE: # %bb.0:
; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: psadbw %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: # kill: def $al killed $al killed $eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v8i8_load:
; AVX: # %bb.0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8i8_load:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: retq
  %a0 = load <8 x i8>, <8 x i8>* %p
  %1 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a0)
  ret i8 %1
}

define i8 @test_v16i8(<16 x i8> %a0) {
; SSE-LABEL: test_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: paddb %xmm0, %xmm1
; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: psadbw %xmm1, %xmm0
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: # kill: def $al killed $al killed $eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v16i8:
; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: retq
  %1 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a0)
  ret i8 %1
}

define i8 @test_v32i8(<32 x i8> %a0) {
; SSE-LABEL: test_v32i8:
; SSE: # %bb.0:
; SSE-NEXT: paddb %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: paddb %xmm0, %xmm1
; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: psadbw %xmm1, %xmm0
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: # kill: def $al killed $al killed $eax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: # kill: def $al killed $al killed $eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: # kill: def $al killed $al killed $eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v32i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %a0)
  ret i8 %1
}

define i8 @test_v64i8(<64 x i8> %a0) {
; SSE-LABEL: test_v64i8:
; SSE: # %bb.0:
; SSE-NEXT: paddb %xmm3, %xmm1
; SSE-NEXT: paddb %xmm2, %xmm1
; SSE-NEXT: paddb %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT: paddb %xmm1, %xmm0
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: psadbw %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: # kill: def $al killed $al killed $eax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v64i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: # kill: def $al killed $al killed $eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v64i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: # kill: def $al killed $al killed $eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v64i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> %a0)
  ret i8 %1
}

define i8 @test_v128i8(<128 x i8> %a0) {
; SSE-LABEL: test_v128i8:
; SSE: # %bb.0:
; SSE-NEXT: paddb %xmm7, %xmm3
; SSE-NEXT: paddb %xmm5, %xmm3
; SSE-NEXT: paddb %xmm1, %xmm3
; SSE-NEXT: paddb %xmm6, %xmm2
; SSE-NEXT: paddb %xmm4, %xmm2
; SSE-NEXT: paddb %xmm3, %xmm2
; SSE-NEXT: paddb %xmm0, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE-NEXT: paddb %xmm2, %xmm0
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: psadbw %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: # kill: def $al killed $al killed $eax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v128i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
; AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: # kill: def $al killed $al killed $eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v128i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddb %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: # kill: def $al killed $al killed $eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v128i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> %a0)
  ret i8 %1
}

declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)

declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)

declare i16 @llvm.vector.reduce.add.v2i16(<2 x i16>)
declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.add.v64i16(<64 x i16>)

declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>)
declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>)
declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>)