; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX256,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX256,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX256,AVX512,AVX512BW

define i32 @_Z10test_shortPsS_i_128(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 {
; SSE2-LABEL: _Z10test_shortPsS_i_128:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movl %edx, %eax
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: xorl %ecx, %ecx
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB0_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pmulhw %xmm1, %xmm3
; SSE2-NEXT: pmullw %xmm1, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: addq $8, %rcx
; SSE2-NEXT: cmpq %rcx, %rax
; SSE2-NEXT: jne .LBB0_1
; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
; AVX-LABEL: _Z10test_shortPsS_i_128:
; AVX: # %bb.0: # %entry
; AVX-NEXT: movl %edx, %eax
; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT: xorl %ecx, %ecx
; AVX-NEXT: .p2align 4, 0x90
; AVX-NEXT: .LBB0_1: # %vector.body
; AVX-NEXT: # =>This Inner Loop Header: Depth=1
; AVX-NEXT: vpmovsxwd (%rdi,%rcx,2), %xmm1
; AVX-NEXT: vpmovsxwd (%rsi,%rcx,2), %xmm2
; AVX-NEXT: vpmulld %xmm1, %xmm2, %xmm1
; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX-NEXT: addq $8, %rcx
; AVX-NEXT: cmpq %rcx, %rax
; AVX-NEXT: jne .LBB0_1
; AVX-NEXT: # %bb.2: # %middle.block
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
entry:
  %3 = zext i32 %2 to i64
  br label %vector.body

vector.body:
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <4 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
  %4 = getelementptr inbounds i16, i16* %0, i64 %index
  %5 = bitcast i16* %4 to <4 x i16>*
  %wide.load = load <4 x i16>, <4 x i16>* %5, align 2
  %6 = sext <4 x i16> %wide.load to <4 x i32>
  %7 = getelementptr inbounds i16, i16* %1, i64 %index
  %8 = bitcast i16* %7 to <4 x i16>*
  %wide.load14 = load <4 x i16>, <4 x i16>* %8, align 2
  %9 = sext <4 x i16> %wide.load14 to <4 x i32>
  %10 = mul nsw <4 x i32> %9, %6
  %11 = add nsw <4 x i32> %10, %vec.phi
  %index.next = add i64 %index, 8
  %12 = icmp eq i64 %index.next, %3
  br i1 %12, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf15 = shufflevector <4 x i32> %11, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx16 = add <4 x i32> %11, %rdx.shuf15
  %rdx.shuf17 = shufflevector <4 x i32> %bin.rdx16, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx18 = add <4 x i32> %bin.rdx16, %rdx.shuf17
  %13 = extractelement <4 x i32> %bin.rdx18, i32 0
  ret i32 %13
}

define i32 @_Z10test_shortPsS_i_256(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 {
; SSE2-LABEL: _Z10test_shortPsS_i_256:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movl %edx, %eax
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: xorl %ecx, %ecx
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB1_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm2
; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm3
; SSE2-NEXT: pmaddwd %xmm2, %xmm3
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: addq $8, %rcx
; SSE2-NEXT: cmpq %rcx, %rax
; SSE2-NEXT: jne .LBB1_1
; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; AVX1-LABEL: _Z10test_shortPsS_i_256:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: movl %edx, %eax
; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX1-NEXT: xorl %ecx, %ecx
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB1_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
; AVX1-NEXT: vmovdqu (%rsi,%rcx,2), %xmm1
; AVX1-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-NEXT: addq $8, %rcx
; AVX1-NEXT: cmpq %rcx, %rax
; AVX1-NEXT: jne .LBB1_1
; AVX1-NEXT: # %bb.2: # %middle.block
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX256-LABEL: _Z10test_shortPsS_i_256:
; AVX256: # %bb.0: # %entry
; AVX256-NEXT: movl %edx, %eax
; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX256-NEXT: xorl %ecx, %ecx
; AVX256-NEXT: .p2align 4, 0x90
; AVX256-NEXT: .LBB1_1: # %vector.body
; AVX256-NEXT: # =>This Inner Loop Header: Depth=1
; AVX256-NEXT: vmovdqu (%rsi,%rcx,2), %xmm1
; AVX256-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm1, %xmm1
; AVX256-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX256-NEXT: addq $8, %rcx
; AVX256-NEXT: cmpq %rcx, %rax
; AVX256-NEXT: jne .LBB1_1
; AVX256-NEXT: # %bb.2: # %middle.block
; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX256-NEXT: vphaddd %ymm0, %ymm0, %ymm0
; AVX256-NEXT: vmovd %xmm0, %eax
; AVX256-NEXT: vzeroupper
; AVX256-NEXT: retq
entry:
  %3 = zext i32 %2 to i64
  br label %vector.body

vector.body:
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <8 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
  %4 = getelementptr inbounds i16, i16* %0, i64 %index
  %5 = bitcast i16* %4 to <8 x i16>*
  %wide.load = load <8 x i16>, <8 x i16>* %5, align 2
  %6 = sext <8 x i16> %wide.load to <8 x i32>
  %7 = getelementptr inbounds i16, i16* %1, i64 %index
  %8 = bitcast i16* %7 to <8 x i16>*
  %wide.load14 = load <8 x i16>, <8 x i16>* %8, align 2
  %9 = sext <8 x i16> %wide.load14 to <8 x i32>
  %10 = mul nsw <8 x i32> %9, %6
  %11 = add nsw <8 x i32> %10, %vec.phi
  %index.next = add i64 %index, 8
  %12 = icmp eq i64 %index.next, %3
  br i1 %12, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf = shufflevector <8 x i32> %11, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i32> %11, %rdx.shuf
  %rdx.shuf15 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx16 = add <8 x i32> %bin.rdx, %rdx.shuf15
  %rdx.shuf17 = shufflevector <8 x i32> %bin.rdx16, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx18 = add <8 x i32> %bin.rdx16, %rdx.shuf17
  %13 = extractelement <8 x i32> %bin.rdx18, i32 0
  ret i32 %13
}

define i32 @_Z10test_shortPsS_i_512(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 {
; SSE2-LABEL: _Z10test_shortPsS_i_512:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movl %edx, %eax
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: xorl %ecx, %ecx
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB2_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm3
; SSE2-NEXT: movdqu 16(%rdi,%rcx,2), %xmm4
; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm5
; SSE2-NEXT: pmaddwd %xmm3, %xmm5
; SSE2-NEXT: paddd %xmm5, %xmm2
; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), %xmm3
; SSE2-NEXT: pmaddwd %xmm4, %xmm3
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: addq $16, %rcx
; SSE2-NEXT: cmpq %rcx, %rax
; SSE2-NEXT: jne .LBB2_1
; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: paddd %xmm0, %xmm2
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: paddd %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; AVX1-LABEL: _Z10test_shortPsS_i_512:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: movl %edx, %eax
; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX1-NEXT: xorl %ecx, %ecx
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB2_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
; AVX1-NEXT: vmovdqu (%rdi,%rcx,2), %ymm2
; AVX1-NEXT: vmovdqu (%rsi,%rcx,2), %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpmaddwd %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpmaddwd %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT: addq $16, %rcx
; AVX1-NEXT: cmpq %rcx, %rax
; AVX1-NEXT: jne .LBB2_1
; AVX1-NEXT: # %bb.2: # %middle.block
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: _Z10test_shortPsS_i_512:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX2-NEXT: xorl %ecx, %ecx
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: .p2align 4, 0x90
; AVX2-NEXT: .LBB2_1: # %vector.body
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
; AVX2-NEXT: vmovdqu (%rsi,%rcx,2), %ymm2
; AVX2-NEXT: vpmaddwd (%rdi,%rcx,2), %ymm2, %ymm2
; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
; AVX2-NEXT: addq $16, %rcx
; AVX2-NEXT: cmpq %rcx, %rax
; AVX2-NEXT: jne .LBB2_1
; AVX2-NEXT: # %bb.2: # %middle.block
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: _Z10test_shortPsS_i_512:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: movl %edx, %eax
; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512-NEXT: xorl %ecx, %ecx
; AVX512-NEXT: .p2align 4, 0x90
; AVX512-NEXT: .LBB2_1: # %vector.body
; AVX512-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512-NEXT: vmovdqu (%rsi,%rcx,2), %ymm1
; AVX512-NEXT: vpmaddwd (%rdi,%rcx,2), %ymm1, %ymm1
; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; AVX512-NEXT: addq $16, %rcx
; AVX512-NEXT: cmpq %rcx, %rax
; AVX512-NEXT: jne .LBB2_1
; AVX512-NEXT: # %bb.2: # %middle.block
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
  %3 = zext i32 %2 to i64
  br label %vector.body

vector.body:
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <16 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
  %4 = getelementptr inbounds i16, i16* %0, i64 %index
  %5 = bitcast i16* %4 to <16 x i16>*
  %wide.load = load <16 x i16>, <16 x i16>* %5, align 2
  %6 = sext <16 x i16> %wide.load to <16 x i32>
  %7 = getelementptr inbounds i16, i16* %1, i64 %index
  %8 = bitcast i16* %7 to <16 x i16>*
  %wide.load14 = load <16 x i16>, <16 x i16>* %8, align 2
  %9 = sext <16 x i16> %wide.load14 to <16 x i32>
  %10 = mul nsw <16 x i32> %9, %6
  %11 = add nsw <16 x i32> %10, %vec.phi
  %index.next = add i64 %index, 16
  %12 = icmp eq i64 %index.next, %3
  br i1 %12, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf1 = shufflevector <16 x i32> %11, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx1 = add <16 x i32> %11, %rdx.shuf1
  %rdx.shuf = shufflevector <16 x i32> %bin.rdx1, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <16 x i32> %bin.rdx1, %rdx.shuf
  %rdx.shuf15 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx16 = add <16 x i32> %bin.rdx, %rdx.shuf15
  %rdx.shuf17 = shufflevector <16 x i32> %bin.rdx16, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx18 = add <16 x i32> %bin.rdx16, %rdx.shuf17
  %13 = extractelement <16 x i32> %bin.rdx18, i32 0
  ret i32 %13
}

define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 {
; SSE2-LABEL: _Z10test_shortPsS_i_1024:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movl %edx, %eax
; SSE2-NEXT: pxor %xmm8, %xmm8
; SSE2-NEXT: xorl %ecx, %ecx
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB3_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm5
; SSE2-NEXT: movdqu 16(%rdi,%rcx,2), %xmm6
; SSE2-NEXT: movdqu 32(%rdi,%rcx,2), %xmm7
; SSE2-NEXT: movdqu 48(%rdi,%rcx,2), %xmm9
; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm0
; SSE2-NEXT: pmaddwd %xmm5, %xmm0
; SSE2-NEXT: paddd %xmm0, %xmm2
; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), %xmm0
; SSE2-NEXT: pmaddwd %xmm6, %xmm0
; SSE2-NEXT: paddd %xmm0, %xmm4
; SSE2-NEXT: movdqu 32(%rsi,%rcx,2), %xmm0
; SSE2-NEXT: pmaddwd %xmm7, %xmm0
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movdqu 48(%rsi,%rcx,2), %xmm0
; SSE2-NEXT: pmaddwd %xmm9, %xmm0
; SSE2-NEXT: paddd %xmm0, %xmm3
; SSE2-NEXT: addq $16, %rcx
; SSE2-NEXT: cmpq %rcx, %rax
; SSE2-NEXT: jne .LBB3_1
; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: paddd %xmm8, %xmm4
; SSE2-NEXT: paddd %xmm8, %xmm3
; SSE2-NEXT: paddd %xmm4, %xmm3
; SSE2-NEXT: paddd %xmm8, %xmm2
; SSE2-NEXT: paddd %xmm8, %xmm1
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: paddd %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; AVX1-LABEL: _Z10test_shortPsS_i_1024:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: movl %edx, %eax
; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8
; AVX1-NEXT: xorl %ecx, %ecx
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB3_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
; AVX1-NEXT: vmovdqu (%rdi,%rcx,2), %ymm3
; AVX1-NEXT: vmovdqu 32(%rdi,%rcx,2), %ymm4
; AVX1-NEXT: vmovdqu (%rsi,%rcx,2), %ymm5
; AVX1-NEXT: vmovdqu 32(%rsi,%rcx,2), %ymm6
; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm7
; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm0
; AVX1-NEXT: vpmaddwd %xmm7, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
; AVX1-NEXT: vpaddd %xmm7, %xmm0, %xmm0
; AVX1-NEXT: vpmaddwd %xmm4, %xmm6, %xmm4
; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm4
; AVX1-NEXT: vpmaddwd %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpmaddwd %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; AVX1-NEXT: addq $16, %rcx
; AVX1-NEXT: cmpq %rcx, %rax
; AVX1-NEXT: jne .LBB3_1
; AVX1-NEXT: # %bb.2: # %middle.block
; AVX1-NEXT: vpaddd %xmm8, %xmm2, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm5
; AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpaddd %xmm0, %xmm8, %xmm0
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: _Z10test_shortPsS_i_1024:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX2-NEXT: xorl %ecx, %ecx
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: .p2align 4, 0x90
; AVX2-NEXT: .LBB3_1: # %vector.body
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
; AVX2-NEXT: vmovdqu (%rsi,%rcx,2), %ymm3
; AVX2-NEXT: vmovdqu 32(%rsi,%rcx,2), %ymm4
; AVX2-NEXT: vpmaddwd 32(%rdi,%rcx,2), %ymm4, %ymm4
; AVX2-NEXT: vpaddd %ymm2, %ymm4, %ymm2
; AVX2-NEXT: vpmaddwd (%rdi,%rcx,2), %ymm3, %ymm3
; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1
; AVX2-NEXT: addq $16, %rcx
; AVX2-NEXT: cmpq %rcx, %rax
; AVX2-NEXT: jne .LBB3_1
; AVX2-NEXT: # %bb.2: # %middle.block
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: _Z10test_shortPsS_i_1024:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: movl %edx, %eax
; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: xorl %ecx, %ecx
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: .p2align 4, 0x90
; AVX512F-NEXT: .LBB3_1: # %vector.body
; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512F-NEXT: vmovdqu (%rsi,%rcx,2), %ymm2
; AVX512F-NEXT: vmovdqu 32(%rsi,%rcx,2), %ymm3
; AVX512F-NEXT: vpmaddwd 32(%rdi,%rcx,2), %ymm3, %ymm3
; AVX512F-NEXT: vpmaddwd (%rdi,%rcx,2), %ymm2, %ymm2
; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; AVX512F-NEXT: vpaddd %zmm1, %zmm2, %zmm1
; AVX512F-NEXT: addq $16, %rcx
; AVX512F-NEXT: cmpq %rcx, %rax
; AVX512F-NEXT: jne .LBB3_1
; AVX512F-NEXT: # %bb.2: # %middle.block
; AVX512F-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: _Z10test_shortPsS_i_1024:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: movl %edx, %eax
; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: xorl %ecx, %ecx
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: .p2align 4, 0x90
; AVX512BW-NEXT: .LBB3_1: # %vector.body
; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512BW-NEXT: vmovdqu64 (%rsi,%rcx,2), %zmm2
; AVX512BW-NEXT: vpmaddwd (%rdi,%rcx,2), %zmm2, %zmm2
; AVX512BW-NEXT: vpaddd %zmm1, %zmm2, %zmm1
; AVX512BW-NEXT: addq $16, %rcx
; AVX512BW-NEXT: cmpq %rcx, %rax
; AVX512BW-NEXT: jne .LBB3_1
; AVX512BW-NEXT: # %bb.2: # %middle.block
; AVX512BW-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovd %xmm0, %eax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
entry:
  %3 = zext i32 %2 to i64
  br label %vector.body

vector.body:
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
  %4 = getelementptr inbounds i16, i16* %0, i64 %index
  %5 = bitcast i16* %4 to <32 x i16>*
  %wide.load = load <32 x i16>, <32 x i16>* %5, align 2
  %6 = sext <32 x i16> %wide.load to <32 x i32>
  %7 = getelementptr inbounds i16, i16* %1, i64 %index
  %8 = bitcast i16* %7 to <32 x i16>*
  %wide.load14 = load <32 x i16>, <32 x i16>* %8, align 2
  %9 = sext <32 x i16> %wide.load14 to <32 x i32>
  %10 = mul nsw <32 x i32> %9, %6
  %11 = add nsw <32 x i32> %10, %vec.phi
  %index.next = add i64 %index, 16
  %12 = icmp eq i64 %index.next, %3
  br i1 %12, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf2 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx2 = add <32 x i32> %11, %rdx.shuf2
  %rdx.shuf1 = shufflevector <32 x i32> %bin.rdx2, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx1 = add <32 x i32> %bin.rdx2, %rdx.shuf1
  %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf
  %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx16 = add <32 x i32> %bin.rdx, %rdx.shuf15
  %rdx.shuf17 = shufflevector <32 x i32> %bin.rdx16, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx18 = add <32 x i32> %bin.rdx16, %rdx.shuf17
  %13 = extractelement <32 x i32> %bin.rdx18, i32 0
  ret i32 %13
}

define i32 @_Z9test_charPcS_i_128(i8* nocapture readonly, i8* nocapture readonly, i32) local_unnamed_addr #0 {
; SSE2-LABEL: _Z9test_charPcS_i_128:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movl %edx, %eax
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: xorl %ecx, %ecx
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB4_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; SSE2-NEXT: psrad $24, %xmm1
; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSE2-NEXT: psrad $24, %xmm2
; SSE2-NEXT: pmullw %xmm1, %xmm2
; SSE2-NEXT: pslld $16, %xmm2
; SSE2-NEXT: psrad $16, %xmm2
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: addq $16, %rcx
; SSE2-NEXT: cmpq %rcx, %rax
; SSE2-NEXT: jne .LBB4_1
; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
; AVX-LABEL: _Z9test_charPcS_i_128:
; AVX: # %bb.0: # %entry
; AVX-NEXT: movl %edx, %eax
; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT: xorl %ecx, %ecx
; AVX-NEXT: .p2align 4, 0x90
; AVX-NEXT: .LBB4_1: # %vector.body
; AVX-NEXT: # =>This Inner Loop Header: Depth=1
; AVX-NEXT: vpmovsxbd (%rdi,%rcx), %xmm1
; AVX-NEXT: vpmovsxbd (%rsi,%rcx), %xmm2
; AVX-NEXT: vpmulld %xmm1, %xmm2, %xmm1
; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX-NEXT: addq $16, %rcx
; AVX-NEXT: cmpq %rcx, %rax
; AVX-NEXT: jne .LBB4_1
; AVX-NEXT: # %bb.2: # %middle.block
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
entry:
  %3 = zext i32 %2 to i64
  br label %vector.body

vector.body:
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <4 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
  %4 = getelementptr inbounds i8, i8* %0, i64 %index
  %5 = bitcast i8* %4 to <4 x i8>*
  %wide.load = load <4 x i8>, <4 x i8>* %5, align 1
  %6 = sext <4 x i8> %wide.load to <4 x i32>
  %7 = getelementptr inbounds i8, i8* %1, i64 %index
  %8 = bitcast i8* %7 to <4 x i8>*
  %wide.load14 = load <4 x i8>, <4 x i8>* %8, align 1
  %9 = sext <4 x i8> %wide.load14 to <4 x i32>
  %10 = mul nsw <4 x i32> %9, %6
  %11 = add nsw <4 x i32> %10, %vec.phi
  %index.next = add i64 %index, 16
  %12 = icmp eq i64 %index.next, %3
  br i1 %12, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf17 = shufflevector <4 x i32> %11, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx18 = add <4 x i32> %11, %rdx.shuf17
  %rdx.shuf19 = shufflevector <4 x i32> %bin.rdx18, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx20 = add <4 x i32> %bin.rdx18, %rdx.shuf19
  %13 = extractelement <4 x i32> %bin.rdx20, i32 0
  ret i32 %13
}

define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly, i32) local_unnamed_addr #0 {
; SSE2-LABEL: _Z9test_charPcS_i_256:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movl %edx, %eax
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: xorl %ecx, %ecx
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB5_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm3
; SSE2-NEXT: pmaddwd %xmm2, %xmm3
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: addq $16, %rcx
; SSE2-NEXT: cmpq %rcx, %rax
; SSE2-NEXT: jne .LBB5_1
; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; AVX1-LABEL: _Z9test_charPcS_i_256:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: movl %edx, %eax
; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX1-NEXT: xorl %ecx, %ecx
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB5_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
; AVX1-NEXT: vpmovsxbw (%rdi,%rcx), %xmm1
; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm2
; AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-NEXT: addq $16, %rcx
; AVX1-NEXT: cmpq %rcx, %rax
; AVX1-NEXT: jne .LBB5_1
; AVX1-NEXT: # %bb.2: # %middle.block
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX256-LABEL: _Z9test_charPcS_i_256:
; AVX256: # %bb.0: # %entry
; AVX256-NEXT: movl %edx, %eax
; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX256-NEXT: xorl %ecx, %ecx
; AVX256-NEXT: .p2align 4, 0x90
; AVX256-NEXT: .LBB5_1: # %vector.body
; AVX256-NEXT: # =>This Inner Loop Header: Depth=1
; AVX256-NEXT: vpmovsxbw (%rdi,%rcx), %xmm1
; AVX256-NEXT: vpmovsxbw (%rsi,%rcx), %xmm2
; AVX256-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1
; AVX256-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX256-NEXT: addq $16, %rcx
; AVX256-NEXT: cmpq %rcx, %rax
; AVX256-NEXT: jne .LBB5_1
; AVX256-NEXT: # %bb.2: # %middle.block
; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX256-NEXT: vphaddd %ymm0, %ymm0, %ymm0
; AVX256-NEXT: vmovd %xmm0, %eax
; AVX256-NEXT: vzeroupper
; AVX256-NEXT: retq
entry:
  %3 = zext i32 %2 to i64
  br label %vector.body

vector.body:
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <8 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
  %4 = getelementptr inbounds i8, i8* %0, i64 %index
  %5 = bitcast i8* %4 to <8 x i8>*
  %wide.load = load <8 x i8>, <8 x i8>* %5, align 1
  %6 = sext <8 x i8> %wide.load to <8 x i32>
  %7 = getelementptr inbounds i8, i8* %1, i64 %index
  %8 = bitcast i8* %7 to <8 x i8>*
  %wide.load14 = load <8 x i8>, <8 x i8>* %8, align 1
  %9 = sext <8 x i8> %wide.load14 to <8 x i32>
  %10 = mul nsw <8 x i32> %9, %6
  %11 = add nsw <8 x i32> %10, %vec.phi
  %index.next = add i64 %index, 16
  %12 = icmp eq i64 %index.next, %3
  br i1 %12, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf15 = shufflevector <8 x i32> %11, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx16 = add <8 x i32> %11, %rdx.shuf15
  %rdx.shuf17 = shufflevector <8 x i32> %bin.rdx16, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx18 = add <8 x i32> %bin.rdx16, %rdx.shuf17
  %rdx.shuf19 = shufflevector <8 x i32> %bin.rdx18, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx20 = add <8 x i32> %bin.rdx18, %rdx.shuf19
  %13 = extractelement <8 x i32> %bin.rdx20, i32 0
  ret i32 %13
}

define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly, i32) local_unnamed_addr #0 {
; SSE2-LABEL: _Z9test_charPcS_i_512:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movl %edx, %eax
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: xorl %ecx, %ecx
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB6_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm3
; SSE2-NEXT: movq {{.*#+}} xmm4 = mem[0],zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm4
; SSE2-NEXT: movq {{.*#+}} xmm5 = mem[0],zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm5
; SSE2-NEXT: pmaddwd %xmm3, %xmm5
; SSE2-NEXT: paddd %xmm5, %xmm2
; SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm3
; SSE2-NEXT: pmaddwd %xmm4, %xmm3
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: addq $16, %rcx
; SSE2-NEXT: cmpq %rcx, %rax
; SSE2-NEXT: jne .LBB6_1
; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: paddd %xmm0, %xmm2
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: paddd %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; AVX1-LABEL: _Z9test_charPcS_i_512:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: movl %edx, %eax
; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX1-NEXT: xorl %ecx, %ecx
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB6_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
; AVX1-NEXT: vpmovsxbw (%rdi,%rcx), %xmm2
; AVX1-NEXT: vpmovsxbw 8(%rdi,%rcx), %xmm3
; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm4
; AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpmovsxbw 8(%rsi,%rcx), %xmm4
; AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: addq $16, %rcx
; AVX1-NEXT: cmpq %rcx, %rax
; AVX1-NEXT: jne .LBB6_1
; AVX1-NEXT: # %bb.2: # %middle.block
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: _Z9test_charPcS_i_512:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX2-NEXT: xorl %ecx, %ecx
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: .p2align 4, 0x90
; AVX2-NEXT: .LBB6_1: # %vector.body
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
; AVX2-NEXT: vpmovsxbw (%rdi,%rcx), %ymm2
; AVX2-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3
; AVX2-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
; AVX2-NEXT: addq $16, %rcx
; AVX2-NEXT: cmpq %rcx, %rax
; AVX2-NEXT: jne .LBB6_1
; AVX2-NEXT: # %bb.2: # %middle.block
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: _Z9test_charPcS_i_512:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: movl %edx, %eax
; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512-NEXT: xorl %ecx, %ecx
; AVX512-NEXT: .p2align 4, 0x90
; AVX512-NEXT: .LBB6_1: # %vector.body
; AVX512-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512-NEXT: vpmovsxbw (%rdi,%rcx), %ymm1
; AVX512-NEXT: vpmovsxbw (%rsi,%rcx), %ymm2
; AVX512-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1
; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; AVX512-NEXT: addq $16, %rcx
; AVX512-NEXT: cmpq %rcx, %rax
; AVX512-NEXT: jne .LBB6_1
; AVX512-NEXT: # %bb.2: # %middle.block
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
  %3 = zext i32 %2 to i64
  br label %vector.body

vector.body:
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <16 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
  %4 = getelementptr inbounds i8, i8* %0, i64 %index
  %5 = bitcast i8* %4 to <16 x i8>*
  %wide.load = load <16 x i8>, <16 x i8>* %5, align 1
  %6 = sext <16 x i8> %wide.load to <16 x i32>
  %7 = getelementptr inbounds i8, i8* %1, i64 %index
  %8 = bitcast i8* %7 to <16 x i8>*
  %wide.load14 = load <16 x i8>, <16 x i8>* %8, align 1
  %9 = sext <16 x i8> %wide.load14 to <16 x i32>
  %10 = mul nsw <16 x i32> %9, %6
  %11 = add nsw <16 x i32> %10, %vec.phi
  %index.next = add i64 %index, 16
  %12 = icmp eq i64 %index.next, %3
  br i1 %12, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf = shufflevector <16 x i32> %11, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <16 x i32> %11, %rdx.shuf
  %rdx.shuf15 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx16 = add <16 x i32> %bin.rdx, %rdx.shuf15
  %rdx.shuf17 = shufflevector <16 x i32> %bin.rdx16, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx18 = add <16 x i32> %bin.rdx16, %rdx.shuf17
  %rdx.shuf19 = shufflevector <16 x i32> %bin.rdx18, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx20 = add <16 x i32> %bin.rdx18, %rdx.shuf19
  %13 = extractelement <16 x i32> %bin.rdx20, i32 0
  ret i32 %13
}

define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonly, i32) local_unnamed_addr #0 {
; SSE2-LABEL: _Z9test_charPcS_i_1024:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movl %edx, %eax
; SSE2-NEXT: pxor %xmm8, %xmm8
; SSE2-NEXT: xorl %ecx, %ecx
; SSE2-NEXT: pxor %xmm9, %xmm9
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB7_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movq {{.*#+}} xmm5 = mem[0],zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm5
; SSE2-NEXT: movq {{.*#+}} xmm6 = mem[0],zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm6
; SSE2-NEXT: movq {{.*#+}} xmm7 = mem[0],zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm7
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm0
; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: pmaddwd %xmm5, %xmm2
; SSE2-NEXT: paddd %xmm2, %xmm9
; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: pmaddwd %xmm6, %xmm2
; SSE2-NEXT: paddd %xmm2, %xmm4
; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: pmaddwd %xmm7, %xmm2
; SSE2-NEXT: paddd %xmm2, %xmm1
; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: pmaddwd %xmm0, %xmm2
; SSE2-NEXT: paddd %xmm2, %xmm3
; SSE2-NEXT: addq $32, %rcx
; SSE2-NEXT: cmpq %rcx, %rax
; SSE2-NEXT: jne .LBB7_1
; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: paddd %xmm8, %xmm4
; SSE2-NEXT: paddd %xmm8, %xmm3
; SSE2-NEXT: paddd %xmm4, %xmm3
; SSE2-NEXT: paddd %xmm8, %xmm9
; SSE2-NEXT: paddd %xmm8, %xmm1
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: paddd %xmm9, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; AVX1-LABEL: _Z9test_charPcS_i_1024:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: movl %edx, %eax
; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX1-NEXT: xorl %ecx, %ecx
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB7_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
; AVX1-NEXT: vpmovsxbw (%rdi,%rcx), %xmm3
; AVX1-NEXT: vpmovsxbw 8(%rdi,%rcx), %xmm4
; AVX1-NEXT: vpmovsxbw 16(%rdi,%rcx), %xmm5
; AVX1-NEXT: vpmovsxbw 24(%rdi,%rcx), %xmm6
; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm7
; AVX1-NEXT: vpmaddwd %xmm3, %xmm7, %xmm3
; AVX1-NEXT: vpmovsxbw 8(%rsi,%rcx), %xmm7
; AVX1-NEXT: vpmaddwd %xmm4, %xmm7, %xmm4
; AVX1-NEXT: vpmovsxbw 16(%rsi,%rcx), %xmm7
; AVX1-NEXT: vpmaddwd %xmm5, %xmm7, %xmm5
; AVX1-NEXT: vpmovsxbw 24(%rsi,%rcx), %xmm7
; AVX1-NEXT: vpmaddwd %xmm6, %xmm7, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT: addq $32, %rcx
; AVX1-NEXT: cmpq %rcx, %rax
; AVX1-NEXT: jne .LBB7_1
; AVX1-NEXT: # %bb.2: # %middle.block
; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
; AVX1-NEXT: vpaddd %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: _Z9test_charPcS_i_1024:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX2-NEXT: xorl %ecx, %ecx
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: .p2align 4, 0x90
; AVX2-NEXT: .LBB7_1: # %vector.body
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
; AVX2-NEXT: vpmovsxbw (%rdi,%rcx), %ymm3
; AVX2-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm4
; AVX2-NEXT: vpmovsxbw (%rsi,%rcx), %ymm5
; AVX2-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3
; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm3
; AVX2-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2
; AVX2-NEXT: addq $32, %rcx
; AVX2-NEXT: cmpq %rcx, %rax
; AVX2-NEXT: jne .LBB7_1
; AVX2-NEXT: # %bb.2: # %middle.block
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: _Z9test_charPcS_i_1024:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: movl %edx, %eax
; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: xorl %ecx, %ecx
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: .p2align 4, 0x90
; AVX512F-NEXT: .LBB7_1: # %vector.body
; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512F-NEXT: vpmovsxbw (%rdi,%rcx), %ymm2
; AVX512F-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm3
; AVX512F-NEXT: vpmovsxbw (%rsi,%rcx), %ymm4
; AVX512F-NEXT: vpmaddwd %ymm2, %ymm4, %ymm2
; AVX512F-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm4
; AVX512F-NEXT: vpmaddwd %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; AVX512F-NEXT: vpaddd %zmm1, %zmm2, %zmm1
; AVX512F-NEXT: addq $32, %rcx
; AVX512F-NEXT: cmpq %rcx, %rax
; AVX512F-NEXT: jne .LBB7_1
; AVX512F-NEXT: # %bb.2: # %middle.block
; AVX512F-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: _Z9test_charPcS_i_1024:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: movl %edx, %eax
; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: xorl %ecx, %ecx
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: .p2align 4, 0x90
; AVX512BW-NEXT: .LBB7_1: # %vector.body
; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512BW-NEXT: vpmovsxbw (%rdi,%rcx), %zmm2
; AVX512BW-NEXT: vpmovsxbw (%rsi,%rcx), %zmm3
; AVX512BW-NEXT: vpmaddwd %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpaddd %zmm1, %zmm2, %zmm1
; AVX512BW-NEXT: addq $32, %rcx
; AVX512BW-NEXT: cmpq %rcx, %rax
; AVX512BW-NEXT: jne .LBB7_1
; AVX512BW-NEXT: # %bb.2: # %middle.block
; AVX512BW-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovd %xmm0, %eax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
entry:
  %3 = zext i32 %2 to i64
  br label %vector.body

vector.body:
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
  %4 = getelementptr inbounds i8, i8* %0, i64 %index
  %5 = bitcast i8* %4 to <32 x i8>*
  %wide.load = load <32 x i8>, <32 x i8>* %5, align 1
  %6 = sext <32 x i8> %wide.load to <32 x i32>
  %7 = getelementptr inbounds i8, i8* %1, i64 %index
  %8 = bitcast i8* %7 to <32 x i8>*
  %wide.load14 = load <32 x i8>, <32 x i8>* %8, align 1
  %9 = sext <32 x i8> %wide.load14 to <32 x i32>
  %10 = mul nsw <32 x i32> %9, %6
  %11 = add nsw <32 x i32> %10, %vec.phi
  %index.next = add i64 %index, 32
  %12 = icmp eq i64 %index.next, %3
  br i1 %12, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf1 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx1 = add <32 x i32> %11, %rdx.shuf1
  %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf
  %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx32 = add <32 x i32> %bin.rdx, %rdx.shuf15
  %rdx.shuf17 = shufflevector <32 x i32> %bin.rdx32, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx18 = add <32 x i32> %bin.rdx32, %rdx.shuf17
  %rdx.shuf19 = shufflevector <32 x i32> %bin.rdx18, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx20 = add <32 x i32> %bin.rdx18, %rdx.shuf19
  %13 = extractelement <32 x i32> %bin.rdx20, i32 0
  ret i32 %13
}

define i32 @test_unsigned_short_128(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 {
; SSE2-LABEL: test_unsigned_short_128:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movl %edx, %eax
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: xorl %ecx, %ecx
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB8_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pmulhuw %xmm1, %xmm3
; SSE2-NEXT: pmullw %xmm1, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: addq $16, %rcx
; SSE2-NEXT: cmpq %rcx, %rax
; SSE2-NEXT: jne .LBB8_1
; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
; AVX-LABEL: test_unsigned_short_128:
; AVX: # %bb.0: # %entry
; AVX-NEXT: movl %edx, %eax
; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT: xorl %ecx, %ecx
; AVX-NEXT: .p2align 4, 0x90
; AVX-NEXT: .LBB8_1: # %vector.body
; AVX-NEXT: # =>This Inner Loop Header: Depth=1
; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX-NEXT: vpmulld %xmm1, %xmm2, %xmm1
; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX-NEXT: addq $16, %rcx
; AVX-NEXT: cmpq %rcx, %rax
; AVX-NEXT: jne .LBB8_1
; AVX-NEXT: # %bb.2: # %middle.block
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
entry:
  %3 = zext i32 %2 to i64
  br label %vector.body

vector.body:
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <4 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
  %4 = getelementptr inbounds i16, i16* %0, i64 %index
  %5 = bitcast i16* %4 to <4 x i16>*
  %wide.load = load <4 x i16>, <4 x i16>* %5, align 2
  %6 = zext <4 x i16> %wide.load to <4 x i32>
  %7 = getelementptr inbounds i16, i16* %1, i64 %index
  %8 = bitcast i16* %7 to <4 x i16>*
  %wide.load14 = load <4 x i16>, <4 x i16>* %8, align 2
  %9 = zext <4 x i16> %wide.load14 to <4 x i32>
  %10 = mul nsw <4 x i32> %9, %6
  %11 = add nsw <4 x i32> %10, %vec.phi
  %index.next = add i64 %index, 16
  %12 = icmp eq i64 %index.next, %3
  br i1 %12, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf15 = shufflevector <4 x i32> %11, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx16 = add <4 x i32> %11, %rdx.shuf15
  %rdx.shuf17 = shufflevector <4 x i32> %bin.rdx16, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx18 = add <4 x i32> %bin.rdx16, %rdx.shuf17
  %13 = extractelement <4 x i32> %bin.rdx18, i32 0
  ret i32 %13
}

define i32 @test_unsigned_short_256(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 {
; SSE2-LABEL: test_unsigned_short_256:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movl %edx, %eax
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: xorl %ecx, %ecx
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB9_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm2
; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pmulhuw %xmm2, %xmm4
; SSE2-NEXT: pmullw %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: addq $16, %rcx
; SSE2-NEXT: cmpq %rcx, %rax
; SSE2-NEXT: jne .LBB9_1
; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
; AVX1-LABEL: test_unsigned_short_256:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: movl %edx, %eax
; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX1-NEXT: xorl %ecx, %ecx
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB9_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vpmulld %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vpmulld %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: addq $16, %rcx
; AVX1-NEXT: cmpq %rcx, %rax
; AVX1-NEXT: jne .LBB9_1
; AVX1-NEXT: # %bb.2: # %middle.block
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX256-LABEL: test_unsigned_short_256:
; AVX256: # %bb.0: # %entry
; AVX256-NEXT: movl %edx, %eax
; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0
AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0 1325; AVX256-NEXT: xorl %ecx, %ecx 1326; AVX256-NEXT: .p2align 4, 0x90 1327; AVX256-NEXT: .LBB9_1: # %vector.body 1328; AVX256-NEXT: # =>This Inner Loop Header: Depth=1 1329; AVX256-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1330; AVX256-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1331; AVX256-NEXT: vpmulld %ymm1, %ymm2, %ymm1 1332; AVX256-NEXT: vpaddd %ymm0, %ymm1, %ymm0 1333; AVX256-NEXT: addq $16, %rcx 1334; AVX256-NEXT: cmpq %rcx, %rax 1335; AVX256-NEXT: jne .LBB9_1 1336; AVX256-NEXT: # %bb.2: # %middle.block 1337; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 1338; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 1339; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 1340; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 1341; AVX256-NEXT: vphaddd %ymm0, %ymm0, %ymm0 1342; AVX256-NEXT: vmovd %xmm0, %eax 1343; AVX256-NEXT: vzeroupper 1344; AVX256-NEXT: retq 1345entry: 1346 %3 = zext i32 %2 to i64 1347 br label %vector.body 1348 1349vector.body: 1350 %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] 1351 %vec.phi = phi <8 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ] 1352 %4 = getelementptr inbounds i16, i16* %0, i64 %index 1353 %5 = bitcast i16* %4 to <8 x i16>* 1354 %wide.load = load <8 x i16>, <8 x i16>* %5, align 2 1355 %6 = zext <8 x i16> %wide.load to <8 x i32> 1356 %7 = getelementptr inbounds i16, i16* %1, i64 %index 1357 %8 = bitcast i16* %7 to <8 x i16>* 1358 %wide.load14 = load <8 x i16>, <8 x i16>* %8, align 2 1359 %9 = zext <8 x i16> %wide.load14 to <8 x i32> 1360 %10 = mul nsw <8 x i32> %9, %6 1361 %11 = add nsw <8 x i32> %10, %vec.phi 1362 %index.next = add i64 %index, 16 1363 %12 = icmp eq i64 %index.next, %3 1364 br i1 %12, label %middle.block, label %vector.body 1365 1366middle.block: 1367 %rdx.shuf = shufflevector <8 x i32> %11, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> 1368 %bin.rdx = add <8 x i32> %11, %rdx.shuf 1369 %rdx.shuf15 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1370 %bin.rdx16 = add <8 x i32> %bin.rdx, %rdx.shuf15 1371 %rdx.shuf17 = shufflevector <8 x i32> %bin.rdx16, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1372 %bin.rdx18 = add <8 x i32> %bin.rdx16, %rdx.shuf17 1373 %13 = extractelement <8 x i32> %bin.rdx18, i32 0 1374 ret i32 %13 1375} 1376 1377define i32 @test_unsigned_short_512(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 { 1378; SSE2-LABEL: test_unsigned_short_512: 1379; SSE2: # %bb.0: # %entry 1380; SSE2-NEXT: movl %edx, %eax 1381; SSE2-NEXT: pxor %xmm0, %xmm0 1382; SSE2-NEXT: xorl %ecx, %ecx 1383; SSE2-NEXT: pxor %xmm1, %xmm1 1384; SSE2-NEXT: pxor %xmm3, %xmm3 1385; SSE2-NEXT: pxor %xmm2, %xmm2 1386; SSE2-NEXT: .p2align 4, 0x90 1387; SSE2-NEXT: .LBB10_1: # %vector.body 1388; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 1389; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm4 1390; SSE2-NEXT: movdqu 16(%rdi,%rcx,2), %xmm8 1391; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm6 1392; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), %xmm7 1393; SSE2-NEXT: movdqa %xmm6, %xmm5 1394; SSE2-NEXT: pmulhuw %xmm4, %xmm5 1395; SSE2-NEXT: pmullw %xmm4, %xmm6 1396; SSE2-NEXT: movdqa %xmm6, %xmm4 1397; SSE2-NEXT: punpcklwd {{.*#+}} 
xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] 1398; SSE2-NEXT: paddd %xmm4, %xmm0 1399; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] 1400; SSE2-NEXT: paddd %xmm6, %xmm1 1401; SSE2-NEXT: movdqa %xmm7, %xmm4 1402; SSE2-NEXT: pmulhuw %xmm8, %xmm4 1403; SSE2-NEXT: pmullw %xmm8, %xmm7 1404; SSE2-NEXT: movdqa %xmm7, %xmm5 1405; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] 1406; SSE2-NEXT: paddd %xmm5, %xmm3 1407; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] 1408; SSE2-NEXT: paddd %xmm7, %xmm2 1409; SSE2-NEXT: addq $16, %rcx 1410; SSE2-NEXT: cmpq %rcx, %rax 1411; SSE2-NEXT: jne .LBB10_1 1412; SSE2-NEXT: # %bb.2: # %middle.block 1413; SSE2-NEXT: paddd %xmm3, %xmm0 1414; SSE2-NEXT: paddd %xmm2, %xmm1 1415; SSE2-NEXT: paddd %xmm0, %xmm1 1416; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] 1417; SSE2-NEXT: paddd %xmm1, %xmm0 1418; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 1419; SSE2-NEXT: paddd %xmm0, %xmm1 1420; SSE2-NEXT: movd %xmm1, %eax 1421; SSE2-NEXT: retq 1422; 1423; AVX1-LABEL: test_unsigned_short_512: 1424; AVX1: # %bb.0: # %entry 1425; AVX1-NEXT: movl %edx, %eax 1426; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 1427; AVX1-NEXT: xorl %ecx, %ecx 1428; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 1429; AVX1-NEXT: .p2align 4, 0x90 1430; AVX1-NEXT: .LBB10_1: # %vector.body 1431; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 1432; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1433; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1434; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1435; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1436; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1437; AVX1-NEXT: vpmulld %xmm2, %xmm6, %xmm2 1438; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1439; AVX1-NEXT: vpmulld %xmm3, %xmm6, %xmm3 1440; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1441; AVX1-NEXT: vpmulld %xmm4, %xmm6, %xmm4 1442; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1443; AVX1-NEXT: vpmulld %xmm5, %xmm6, %xmm5 1444; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 1445; AVX1-NEXT: vpaddd %xmm6, %xmm2, %xmm2 1446; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 1447; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1448; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1449; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2 1450; AVX1-NEXT: vpaddd %xmm0, %xmm5, %xmm0 1451; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1452; AVX1-NEXT: addq $16, %rcx 1453; AVX1-NEXT: cmpq %rcx, %rax 1454; AVX1-NEXT: jne .LBB10_1 1455; AVX1-NEXT: # %bb.2: # %middle.block 1456; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1457; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 1458; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 1459; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 1460; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1461; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 1462; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1463; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 1464; AVX1-NEXT: vmovd %xmm0, %eax 1465; AVX1-NEXT: vzeroupper 1466; AVX1-NEXT: retq 1467; 1468; AVX2-LABEL: test_unsigned_short_512: 1469; AVX2: # %bb.0: # %entry 1470; AVX2-NEXT: movl %edx, %eax 1471; AVX2-NEXT: vpxor %xmm0, %xmm0, 
%xmm0 1472; AVX2-NEXT: xorl %ecx, %ecx 1473; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 1474; AVX2-NEXT: .p2align 4, 0x90 1475; AVX2-NEXT: .LBB10_1: # %vector.body 1476; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 1477; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1478; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1479; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1480; AVX2-NEXT: vpmulld %ymm2, %ymm4, %ymm2 1481; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1 1482; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1483; AVX2-NEXT: vpmulld %ymm3, %ymm2, %ymm2 1484; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 1485; AVX2-NEXT: addq $16, %rcx 1486; AVX2-NEXT: cmpq %rcx, %rax 1487; AVX2-NEXT: jne .LBB10_1 1488; AVX2-NEXT: # %bb.2: # %middle.block 1489; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 1490; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1491; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 1492; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 1493; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 1494; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 1495; AVX2-NEXT: vmovd %xmm0, %eax 1496; AVX2-NEXT: vzeroupper 1497; AVX2-NEXT: retq 1498; 1499; AVX512-LABEL: test_unsigned_short_512: 1500; AVX512: # %bb.0: # %entry 1501; AVX512-NEXT: movl %edx, %eax 1502; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 1503; AVX512-NEXT: xorl %ecx, %ecx 1504; AVX512-NEXT: .p2align 4, 0x90 1505; AVX512-NEXT: .LBB10_1: # %vector.body 1506; AVX512-NEXT: # =>This Inner Loop Header: Depth=1 1507; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero 1508; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero 1509; AVX512-NEXT: vpmulld %zmm1, %zmm2, %zmm1 1510; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0 1511; AVX512-NEXT: addq $16, %rcx 1512; AVX512-NEXT: cmpq %rcx, %rax 1513; AVX512-NEXT: jne .LBB10_1 1514; AVX512-NEXT: # %bb.2: # %middle.block 1515; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 1516; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 1517; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 1518; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 1519; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 1520; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 1521; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 1522; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 1523; AVX512-NEXT: vmovd %xmm0, %eax 1524; AVX512-NEXT: vzeroupper 1525; AVX512-NEXT: retq 1526entry: 1527 %3 = zext i32 %2 to i64 1528 br label %vector.body 1529 1530vector.body: 1531 %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] 1532 %vec.phi = phi <16 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ] 1533 %4 = getelementptr inbounds i16, i16* %0, i64 %index 1534 %5 = bitcast i16* %4 to <16 x i16>* 1535 %wide.load = load <16 x i16>, <16 x i16>* %5, align 2 1536 %6 = zext <16 x i16> %wide.load to <16 x i32> 1537 %7 = getelementptr inbounds i16, i16* %1, i64 %index 1538 %8 = bitcast i16* %7 to <16 x i16>* 1539 %wide.load14 
= load <16 x i16>, <16 x i16>* %8, align 2 1540 %9 = zext <16 x i16> %wide.load14 to <16 x i32> 1541 %10 = mul nsw <16 x i32> %9, %6 1542 %11 = add nsw <16 x i32> %10, %vec.phi 1543 %index.next = add i64 %index, 16 1544 %12 = icmp eq i64 %index.next, %3 1545 br i1 %12, label %middle.block, label %vector.body 1546 1547middle.block: 1548 %rdx.shuf1 = shufflevector <16 x i32> %11, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1549 %bin.rdx1 = add <16 x i32> %11, %rdx.shuf1 1550 %rdx.shuf = shufflevector <16 x i32> %bin.rdx1, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1551 %bin.rdx = add <16 x i32> %bin.rdx1, %rdx.shuf 1552 %rdx.shuf15 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1553 %bin.rdx16 = add <16 x i32> %bin.rdx, %rdx.shuf15 1554 %rdx.shuf17 = shufflevector <16 x i32> %bin.rdx16, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1555 %bin.rdx18 = add <16 x i32> %bin.rdx16, %rdx.shuf17 1556 %13 = extractelement <16 x i32> %bin.rdx18, i32 0 1557 ret i32 %13 1558} 1559 1560define i32 @test_unsigned_short_1024(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 { 1561; SSE2-LABEL: test_unsigned_short_1024: 1562; SSE2: # %bb.0: # %entry 1563; SSE2-NEXT: movl %edx, %eax 1564; SSE2-NEXT: pxor %xmm8, %xmm8 1565; SSE2-NEXT: xorl %ecx, %ecx 1566; SSE2-NEXT: pxor %xmm3, %xmm3 1567; SSE2-NEXT: pxor %xmm9, %xmm9 1568; SSE2-NEXT: pxor %xmm10, %xmm10 1569; SSE2-NEXT: pxor %xmm4, %xmm4 1570; SSE2-NEXT: pxor %xmm6, %xmm6 1571; SSE2-NEXT: pxor %xmm5, %xmm5 1572; SSE2-NEXT: pxor %xmm7, %xmm7 1573; SSE2-NEXT: .p2align 4, 0x90 1574; SSE2-NEXT: .LBB11_1: # %vector.body 1575; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 1576; SSE2-NEXT: movdqu 48(%rdi,%rcx,2), %xmm0 1577; SSE2-NEXT: movdqu 48(%rsi,%rcx,2), %xmm1 1578; SSE2-NEXT: movdqa %xmm1, %xmm2 1579; SSE2-NEXT: pmulhuw %xmm0, %xmm2 1580; SSE2-NEXT: pmullw %xmm0, %xmm1 1581; SSE2-NEXT: movdqa %xmm1, %xmm0 1582; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 1583; SSE2-NEXT: paddd %xmm0, %xmm7 1584; SSE2-NEXT: movdqu 32(%rdi,%rcx,2), %xmm0 1585; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] 1586; SSE2-NEXT: movdqu 32(%rsi,%rcx,2), %xmm2 1587; SSE2-NEXT: paddd %xmm1, %xmm5 1588; SSE2-NEXT: movdqa %xmm2, %xmm1 1589; SSE2-NEXT: pmulhuw %xmm0, %xmm1 1590; SSE2-NEXT: pmullw %xmm0, %xmm2 1591; SSE2-NEXT: movdqa %xmm2, %xmm0 1592; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1593; SSE2-NEXT: paddd %xmm0, %xmm6 1594; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm0 1595; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 1596; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm1 1597; SSE2-NEXT: paddd %xmm2, %xmm4 1598; SSE2-NEXT: movdqa %xmm1, %xmm2 1599; SSE2-NEXT: pmulhuw %xmm0, %xmm2 1600; SSE2-NEXT: pmullw %xmm0, %xmm1 1601; 
SSE2-NEXT: movdqa %xmm1, %xmm0 1602; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 1603; SSE2-NEXT: paddd %xmm0, %xmm8 1604; SSE2-NEXT: movdqu 16(%rdi,%rcx,2), %xmm0 1605; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 1606; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), %xmm2 1607; SSE2-NEXT: paddd %xmm1, %xmm3 1608; SSE2-NEXT: movdqa %xmm2, %xmm1 1609; SSE2-NEXT: pmulhuw %xmm0, %xmm1 1610; SSE2-NEXT: pmullw %xmm0, %xmm2 1611; SSE2-NEXT: movdqa %xmm2, %xmm0 1612; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1613; SSE2-NEXT: paddd %xmm0, %xmm9 1614; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 1615; SSE2-NEXT: paddd %xmm2, %xmm10 1616; SSE2-NEXT: addq $16, %rcx 1617; SSE2-NEXT: cmpq %rcx, %rax 1618; SSE2-NEXT: jne .LBB11_1 1619; SSE2-NEXT: # %bb.2: # %middle.block 1620; SSE2-NEXT: paddd %xmm6, %xmm3 1621; SSE2-NEXT: paddd %xmm7, %xmm10 1622; SSE2-NEXT: paddd %xmm3, %xmm10 1623; SSE2-NEXT: paddd %xmm4, %xmm8 1624; SSE2-NEXT: paddd %xmm5, %xmm9 1625; SSE2-NEXT: paddd %xmm10, %xmm9 1626; SSE2-NEXT: paddd %xmm8, %xmm9 1627; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,0,1] 1628; SSE2-NEXT: paddd %xmm9, %xmm0 1629; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 1630; SSE2-NEXT: paddd %xmm0, %xmm1 1631; SSE2-NEXT: movd %xmm1, %eax 1632; SSE2-NEXT: retq 1633; 1634; AVX1-LABEL: test_unsigned_short_1024: 1635; AVX1: # %bb.0: # %entry 1636; AVX1-NEXT: movl %edx, %eax 1637; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 1638; AVX1-NEXT: xorl %ecx, %ecx 1639; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1640; AVX1-NEXT: vpxor %xmm9, %xmm9, %xmm9 1641; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 1642; AVX1-NEXT: .p2align 4, 0x90 1643; AVX1-NEXT: .LBB11_1: # %vector.body 1644; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 1645; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1646; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1647; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1648; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1649; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1650; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1651; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm10 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1652; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm11 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1653; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1654; AVX1-NEXT: vpmulld %xmm4, %xmm1, %xmm1 1655; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1656; AVX1-NEXT: vpmulld %xmm5, %xmm4, %xmm4 1657; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1658; AVX1-NEXT: vpmulld %xmm6, %xmm5, %xmm5 1659; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1660; AVX1-NEXT: vpmulld %xmm7, %xmm6, %xmm6 1661; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1662; AVX1-NEXT: vpmulld %xmm0, %xmm7, %xmm13 1663; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1664; AVX1-NEXT: vpmulld %xmm12, %xmm7, %xmm7 1665; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = 
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1666; AVX1-NEXT: vpmulld %xmm10, %xmm0, %xmm10 1667; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1668; AVX1-NEXT: vpmulld %xmm11, %xmm0, %xmm11 1669; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0 1670; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 1671; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm1 1672; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2 1673; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm0 1674; AVX1-NEXT: vpaddd %xmm0, %xmm5, %xmm0 1675; AVX1-NEXT: vpaddd %xmm8, %xmm6, %xmm1 1676; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm8 1677; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm0 1678; AVX1-NEXT: vpaddd %xmm0, %xmm13, %xmm0 1679; AVX1-NEXT: vpaddd %xmm9, %xmm7, %xmm1 1680; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm9 1681; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm0 1682; AVX1-NEXT: vpaddd %xmm0, %xmm10, %xmm0 1683; AVX1-NEXT: vpaddd %xmm3, %xmm11, %xmm1 1684; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm3 1685; AVX1-NEXT: addq $16, %rcx 1686; AVX1-NEXT: cmpq %rcx, %rax 1687; AVX1-NEXT: jne .LBB11_1 1688; AVX1-NEXT: # %bb.2: # %middle.block 1689; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm0 1690; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm1 1691; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm4 1692; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 1693; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 1694; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 1695; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2 1696; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 1697; AVX1-NEXT: vpaddd %xmm0, %xmm9, %xmm0 1698; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1699; AVX1-NEXT: vpaddd %xmm0, %xmm8, %xmm0 1700; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 1701; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1702; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 1703; AVX1-NEXT: vmovd %xmm0, %eax 1704; AVX1-NEXT: vzeroupper 1705; AVX1-NEXT: retq 1706; 1707; AVX2-LABEL: test_unsigned_short_1024: 1708; AVX2: # %bb.0: # %entry 1709; AVX2-NEXT: movl %edx, %eax 1710; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 1711; AVX2-NEXT: xorl %ecx, %ecx 1712; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 1713; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 1714; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 1715; AVX2-NEXT: .p2align 4, 0x90 1716; AVX2-NEXT: .LBB11_1: # %vector.body 1717; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 1718; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1719; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1720; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1721; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1722; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm8 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1723; AVX2-NEXT: vpmulld %ymm4, %ymm8, %ymm4 1724; AVX2-NEXT: vpaddd %ymm2, %ymm4, %ymm2 1725; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1726; AVX2-NEXT: vpmulld %ymm5, %ymm4, %ymm4 1727; AVX2-NEXT: vpaddd %ymm1, %ymm4, %ymm1 1728; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1729; AVX2-NEXT: vpmulld %ymm6, %ymm4, %ymm4 1730; AVX2-NEXT: vpaddd %ymm0, %ymm4, %ymm0 1731; AVX2-NEXT: 
vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1732; AVX2-NEXT: vpmulld %ymm7, %ymm4, %ymm4 1733; AVX2-NEXT: vpaddd %ymm3, %ymm4, %ymm3 1734; AVX2-NEXT: addq $16, %rcx 1735; AVX2-NEXT: cmpq %rcx, %rax 1736; AVX2-NEXT: jne .LBB11_1 1737; AVX2-NEXT: # %bb.2: # %middle.block 1738; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 1739; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 1740; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 1741; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1742; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 1743; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 1744; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 1745; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 1746; AVX2-NEXT: vmovd %xmm0, %eax 1747; AVX2-NEXT: vzeroupper 1748; AVX2-NEXT: retq 1749; 1750; AVX512-LABEL: test_unsigned_short_1024: 1751; AVX512: # %bb.0: # %entry 1752; AVX512-NEXT: movl %edx, %eax 1753; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 1754; AVX512-NEXT: xorl %ecx, %ecx 1755; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 1756; AVX512-NEXT: .p2align 4, 0x90 1757; AVX512-NEXT: .LBB11_1: # %vector.body 1758; AVX512-NEXT: # =>This Inner Loop Header: Depth=1 1759; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero 1760; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero 1761; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero 1762; AVX512-NEXT: vpmulld %zmm2, %zmm4, %zmm2 1763; AVX512-NEXT: vpaddd %zmm1, %zmm2, %zmm1 1764; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero 1765; AVX512-NEXT: vpmulld %zmm3, %zmm2, %zmm2 1766; AVX512-NEXT: vpaddd %zmm0, %zmm2, %zmm0 1767; AVX512-NEXT: addq $16, %rcx 1768; AVX512-NEXT: cmpq %rcx, %rax 1769; AVX512-NEXT: jne .LBB11_1 1770; AVX512-NEXT: # %bb.2: # %middle.block 1771; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 1772; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 1773; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 1774; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 1775; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 1776; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 1777; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 1778; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 1779; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 1780; AVX512-NEXT: vmovd %xmm0, %eax 1781; AVX512-NEXT: vzeroupper 1782; AVX512-NEXT: retq 1783entry: 1784 %3 = zext i32 %2 to i64 1785 br label %vector.body 1786 1787vector.body: 1788 %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] 1789 %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ] 1790 %4 = getelementptr inbounds i16, i16* %0, i64 %index 1791 %5 = bitcast i16* %4 to <32 x i16>* 1792 %wide.load = load <32 x i16>, <32 x i16>* %5, align 2 1793 %6 = zext <32 x i16> %wide.load to <32 x i32> 1794 %7 = getelementptr inbounds i16, i16* %1, i64 %index 1795 %8 = bitcast i16* %7 to <32 
x i16>* 1796 %wide.load14 = load <32 x i16>, <32 x i16>* %8, align 2 1797 %9 = zext <32 x i16> %wide.load14 to <32 x i32> 1798 %10 = mul nsw <32 x i32> %9, %6 1799 %11 = add nsw <32 x i32> %10, %vec.phi 1800 %index.next = add i64 %index, 16 1801 %12 = icmp eq i64 %index.next, %3 1802 br i1 %12, label %middle.block, label %vector.body 1803 1804middle.block: 1805 %rdx.shuf2 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1806 %bin.rdx2 = add <32 x i32> %11, %rdx.shuf2 1807 %rdx.shuf1 = shufflevector <32 x i32> %bin.rdx2, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1808 %bin.rdx1 = add <32 x i32> %bin.rdx2, %rdx.shuf1 1809 %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1810 %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf 1811 %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1812 %bin.rdx16 = add <32 x i32> %bin.rdx, %rdx.shuf15 1813 %rdx.shuf17 = shufflevector <32 x i32> %bin.rdx16, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1814 %bin.rdx18 = add <32 x i32> %bin.rdx16, %rdx.shuf17 1815 %13 = extractelement <32 x i32> %bin.rdx18, i32 0 1816 ret i32 %13 1817} 1818 1819define <4 x i32> @pmaddwd_8(<8 x i16> %A, <8 x i16> %B) { 1820; SSE2-LABEL: pmaddwd_8: 1821; SSE2: # %bb.0: 1822; SSE2-NEXT: pmaddwd %xmm1, %xmm0 1823; SSE2-NEXT: retq 1824; 1825; AVX-LABEL: pmaddwd_8: 1826; AVX: # %bb.0: 1827; AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 1828; AVX-NEXT: retq 1829 %a = sext <8 x i16> %A to <8 x i32> 1830 %b = sext <8 x i16> %B to <8 x i32> 1831 %m = mul nsw <8 x i32> %a, %b 1832 %odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 1833 %even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 1834 %ret = add <4 x i32> %odd, %even 1835 ret <4 x i32> %ret 1836} 1837 1838define <4 x i32> @pmaddwd_8_swapped(<8 x i16> %A, <8 x i16> %B) { 1839; SSE2-LABEL: 
pmaddwd_8_swapped: 1840; SSE2: # %bb.0: 1841; SSE2-NEXT: pmaddwd %xmm1, %xmm0 1842; SSE2-NEXT: retq 1843; 1844; AVX-LABEL: pmaddwd_8_swapped: 1845; AVX: # %bb.0: 1846; AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 1847; AVX-NEXT: retq 1848 %a = sext <8 x i16> %A to <8 x i32> 1849 %b = sext <8 x i16> %B to <8 x i32> 1850 %m = mul nsw <8 x i32> %a, %b 1851 %odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 1852 %even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 1853 %ret = add <4 x i32> %even, %odd 1854 ret <4 x i32> %ret 1855} 1856 1857define <4 x i32> @larger_mul(<16 x i16> %A, <16 x i16> %B) { 1858; SSE2-LABEL: larger_mul: 1859; SSE2: # %bb.0: 1860; SSE2-NEXT: movdqa %xmm0, %xmm1 1861; SSE2-NEXT: pmulhw %xmm2, %xmm1 1862; SSE2-NEXT: pmullw %xmm2, %xmm0 1863; SSE2-NEXT: movdqa %xmm0, %xmm2 1864; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 1865; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1866; SSE2-NEXT: movdqa %xmm0, %xmm1 1867; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] 1868; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3] 1869; SSE2-NEXT: paddd %xmm1, %xmm0 1870; SSE2-NEXT: retq 1871; 1872; AVX1-LABEL: larger_mul: 1873; AVX1: # %bb.0: 1874; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2 1875; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 1876; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 1877; AVX1-NEXT: vpackssdw %xmm0, %xmm2, %xmm0 1878; AVX1-NEXT: vpmovsxwd %xmm1, %xmm2 1879; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 1880; AVX1-NEXT: vpmovsxwd %xmm1, %xmm1 1881; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1 1882; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 1883; AVX1-NEXT: vzeroupper 1884; AVX1-NEXT: retq 1885; 1886; AVX2-LABEL: larger_mul: 1887; AVX2: # %bb.0: 1888; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 1889; AVX2-NEXT: vpmovsxwd %xmm1, %ymm1 1890; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 1891; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 1892; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 1893; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 1894; AVX2-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 1895; AVX2-NEXT: vzeroupper 1896; AVX2-NEXT: retq 1897; 1898; AVX512-LABEL: larger_mul: 1899; AVX512: # %bb.0: 1900; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0 1901; AVX512-NEXT: vpmovsxwd %ymm1, %zmm1 1902; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 1903; AVX512-NEXT: vpextrd $2, %xmm0, %eax 1904; AVX512-NEXT: vpinsrd $1, %eax, %xmm0, %xmm1 1905; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2 1906; AVX512-NEXT: vmovd %xmm2, %eax 1907; AVX512-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 1908; AVX512-NEXT: vpextrd $2, %xmm2, %eax 1909; AVX512-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 1910; AVX512-NEXT: vpextrd $3, %xmm0, %eax 1911; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] 1912; AVX512-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 1913; AVX512-NEXT: vpextrd $1, %xmm2, %eax 1914; AVX512-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 1915; AVX512-NEXT: vpextrd $3, %xmm2, %eax 1916; AVX512-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 1917; AVX512-NEXT: vpaddd %xmm0, %xmm1, %xmm0 1918; AVX512-NEXT: vzeroupper 1919; AVX512-NEXT: retq 1920 %a = sext <16 x i16> %A to <16 x i32> 1921 %b = sext <16 x i16> %B to <16 x i32> 1922 %m = mul nsw <16 x i32> %a, %b 1923 %odd = shufflevector <16 x i32> %m, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 1924 %even = shufflevector <16 x i32> %m, <16 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 1925 %ret = add <4 x i32> %odd, %even 1926 
  ret <4 x i32> %ret
}

define <8 x i32> @pmaddwd_16(<16 x i16> %A, <16 x i16> %B) {
; SSE2-LABEL: pmaddwd_16:
; SSE2: # %bb.0:
; SSE2-NEXT: pmaddwd %xmm2, %xmm0
; SSE2-NEXT: pmaddwd %xmm3, %xmm1
; SSE2-NEXT: retq
;
; AVX1-LABEL: pmaddwd_16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpmaddwd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX256-LABEL: pmaddwd_16:
; AVX256: # %bb.0:
; AVX256-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
; AVX256-NEXT: retq
  %a = sext <16 x i16> %A to <16 x i32>
  %b = sext <16 x i16> %B to <16 x i32>
  %m = mul nsw <16 x i32> %a, %b
  %odd = shufflevector <16 x i32> %m, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %even = shufflevector <16 x i32> %m, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %ret = add <8 x i32> %odd, %even
  ret <8 x i32> %ret
}

define <16 x i32> @pmaddwd_32(<32 x i16> %A, <32 x i16> %B) {
; SSE2-LABEL: pmaddwd_32:
; SSE2: # %bb.0:
; SSE2-NEXT: pmaddwd %xmm4, %xmm0
; SSE2-NEXT: pmaddwd %xmm5, %xmm1
; SSE2-NEXT: pmaddwd %xmm6, %xmm2
; SSE2-NEXT: pmaddwd %xmm7, %xmm3
; SSE2-NEXT: retq
;
; AVX1-LABEL: pmaddwd_32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6
; AVX1-NEXT: vpmaddwd %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
; AVX1-NEXT: vpmaddwd %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpmaddwd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
; AVX1-NEXT: vpmaddwd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: pmaddwd_32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmaddwd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: pmaddwd_32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmaddwd %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: pmaddwd_32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmaddwd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
  %a = sext <32 x i16> %A to <32 x i32>
  %b = sext <32 x i16> %B to <32 x i32>
  %m = mul nsw <32 x i32> %a, %b
  %odd = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  %even = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  %ret = add <16 x i32> %odd, %even
  ret <16 x i32> %ret
}

define <4 x i32> @pmaddwd_const(<8 x i16> %A) {
; SSE2-LABEL: pmaddwd_const:
; SSE2: # %bb.0:
; SSE2-NEXT: pmaddwd {{.*}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: pmaddwd_const:
; AVX: # %bb.0:
; AVX-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
  %a = sext <8 x i16> %A to <8 x i32>
  %m = mul nsw <8 x i32> %a, <i32 32767, i32 -32768, i32 0, i32 0, i32 1, i32 7, i32 42, i32 32>
  %odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %ret = add <4 x i32> %odd, %even
  ret <4 x i32> %ret
}

; Do not select unsigned i16 multiplication
define <4 x i32> @pmaddwd_negative1(<8 x i16> %A, <8 x i16> %B) {
; SSE2-LABEL: pmaddwd_negative1:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pmulhuw %xmm1, %xmm2
; SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX1-LABEL: pmaddwd_negative1:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
; AVX1-NEXT: vpmulld %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vphaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX256-LABEL: pmaddwd_negative1:
; AVX256: # %bb.0:
; AVX256-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX256-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX256-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX256-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX256-NEXT: vzeroupper
; AVX256-NEXT: retq
  %a = zext <8 x i16> %A to <8 x i32>
  %b = zext <8 x i16> %B to <8 x i32>
  %m = mul nuw <8 x i32> %a, %b
  %odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %ret = add <4 x i32> %odd, %even
  ret <4 x i32> %ret
}

; Do not select if constant is too large
define <4 x i32> @pmaddwd_negative2(<8 x i16> %A) {
; SSE2-LABEL: pmaddwd_negative2:
; SSE2: # %bb.0:
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,7,42,32]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm3, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,4294934528,0,0]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm3, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; AVX1-LABEL: pmaddwd_negative2:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxwd %xmm1, %xmm1
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX256-LABEL: pmaddwd_negative2:
; AVX256: # %bb.0:
; AVX256-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX256-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0
; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX256-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX256-NEXT: vzeroupper
; AVX256-NEXT: retq
  %a = sext <8 x i16> %A to <8 x i32>
  %m = mul nsw <8 x i32> %a, <i32 32768, i32 -32768, i32 0, i32 0, i32 1, i32 7, i32 42, i32 32>
  %odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %ret = add <4 x i32> %odd, %even
  ret <4 x i32> %ret
}

define <4 x i32> @jumbled_indices4(<8 x i16> %A, <8 x i16> %B) {
; SSE2-LABEL: jumbled_indices4:
; SSE2: # %bb.0:
; SSE2-NEXT: pmaddwd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: jumbled_indices4:
; AVX: # %bb.0:
; AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %exta = sext <8 x i16> %A to <8 x i32>
  %extb = sext <8 x i16> %B to <8 x i32>
  %m = mul <8 x i32> %exta, %extb
  %sa = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 3, i32 1, i32 5, i32 6>
  %sb = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 2, i32 0, i32 4, i32 7>
  %a = add <4 x i32> %sa, %sb
  ret <4 x i32> %a
}

define <8 x i32> @jumbled_indices8(<16 x i16> %A, <16 x i16> %B) {
; SSE2-LABEL: jumbled_indices8:
; SSE2: # %bb.0:
; SSE2-NEXT: pmaddwd %xmm2, %xmm0
; SSE2-NEXT: pmaddwd %xmm3, %xmm1
; SSE2-NEXT: retq
;
; AVX1-LABEL: jumbled_indices8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpmaddwd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX256-LABEL: jumbled_indices8:
; AVX256: # %bb.0:
; AVX256-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
; AVX256-NEXT: retq
  %exta = sext <16 x i16> %A to <16 x i32>
  %extb = sext <16 x i16> %B to <16 x i32>
  %m = mul <16 x i32> %exta, %extb
  %sa = shufflevector <16 x i32> %m, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 7, i32 4, i32 11, i32 8, i32 15, i32 12>
  %sb = shufflevector <16 x i32> %m, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 6, i32 5, i32 10, i32 9, i32 14, i32 13>
  %a = add <8 x i32> %sa, %sb
  ret <8 x i32> %a
}

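; The jumbled_indices tests above and below use permuted even/odd shuffle masks; each add
; still sums the two products coming from one adjacent pair of i16 elements, so pmaddwd is
; still expected in the CHECK lines.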
2176define <16 x i32> @jumbled_indices16(<32 x i16> %A, <32 x i16> %B) { 2177; SSE2-LABEL: jumbled_indices16: 2178; SSE2: # %bb.0: 2179; SSE2-NEXT: pmaddwd %xmm4, %xmm0 2180; SSE2-NEXT: pmaddwd %xmm5, %xmm1 2181; SSE2-NEXT: pmaddwd %xmm6, %xmm2 2182; SSE2-NEXT: pmaddwd %xmm7, %xmm3 2183; SSE2-NEXT: retq 2184; 2185; AVX1-LABEL: jumbled_indices16: 2186; AVX1: # %bb.0: 2187; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 2188; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 2189; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6 2190; AVX1-NEXT: vpmaddwd %xmm6, %xmm4, %xmm4 2191; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 2192; AVX1-NEXT: vpmaddwd %xmm6, %xmm5, %xmm5 2193; AVX1-NEXT: vpmaddwd %xmm2, %xmm0, %xmm0 2194; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 2195; AVX1-NEXT: vpmaddwd %xmm3, %xmm1, %xmm1 2196; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 2197; AVX1-NEXT: retq 2198; 2199; AVX2-LABEL: jumbled_indices16: 2200; AVX2: # %bb.0: 2201; AVX2-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0 2202; AVX2-NEXT: vpmaddwd %ymm3, %ymm1, %ymm1 2203; AVX2-NEXT: retq 2204; 2205; AVX512F-LABEL: jumbled_indices16: 2206; AVX512F: # %bb.0: 2207; AVX512F-NEXT: vpmaddwd %ymm3, %ymm1, %ymm1 2208; AVX512F-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0 2209; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 2210; AVX512F-NEXT: retq 2211; 2212; AVX512BW-LABEL: jumbled_indices16: 2213; AVX512BW: # %bb.0: 2214; AVX512BW-NEXT: vpmaddwd %zmm1, %zmm0, %zmm0 2215; AVX512BW-NEXT: retq 2216 %exta = sext <32 x i16> %A to <32 x i32> 2217 %extb = sext <32 x i16> %B to <32 x i32> 2218 %m = mul <32 x i32> %exta, %extb 2219 %sa = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 2, i32 0, i32 5, i32 6, i32 11, i32 9, i32 15, i32 12, i32 17, i32 18, i32 20, i32 23, i32 27, i32 24, i32 31, i32 29> 2220 %sb = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 3, i32 1, i32 4, i32 7, i32 10, i32 8, i32 14, i32 13, i32 16, i32 19, i32 21, i32 22, i32 26, i32 25, i32 30, i32 28> 2221 %a = add <16 x i32> %sa, %sb 2222 ret <16 x i32> %a 2223} 2224 2225define <32 x i32> @jumbled_indices32(<64 x i16> %A, <64 x i16> %B) { 2226; SSE2-LABEL: jumbled_indices32: 2227; SSE2: # %bb.0: 2228; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm0 2229; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm1 2230; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm2 2231; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm3 2232; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm4 2233; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm5 2234; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm6 2235; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm7 2236; SSE2-NEXT: movdqa %xmm7, 112(%rdi) 2237; SSE2-NEXT: movdqa %xmm6, 96(%rdi) 2238; SSE2-NEXT: movdqa %xmm5, 80(%rdi) 2239; SSE2-NEXT: movdqa %xmm4, 64(%rdi) 2240; SSE2-NEXT: movdqa %xmm3, 48(%rdi) 2241; SSE2-NEXT: movdqa %xmm2, 32(%rdi) 2242; SSE2-NEXT: movdqa %xmm1, 16(%rdi) 2243; SSE2-NEXT: movdqa %xmm0, (%rdi) 2244; SSE2-NEXT: movq %rdi, %rax 2245; SSE2-NEXT: retq 2246; 2247; AVX1-LABEL: jumbled_indices32: 2248; AVX1: # %bb.0: 2249; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm8 2250; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm9 2251; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm10 2252; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm11 2253; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm12 2254; AVX1-NEXT: vpmaddwd %xmm12, %xmm8, %xmm8 2255; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm12 2256; AVX1-NEXT: vpmaddwd %xmm12, %xmm9, %xmm9 2257; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm12 2258; AVX1-NEXT: vpmaddwd %xmm12, %xmm10, %xmm10 2259; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm12 2260; AVX1-NEXT: vpmaddwd %xmm12, 
%xmm11, %xmm11 2261; AVX1-NEXT: vpmaddwd %xmm4, %xmm0, %xmm0 2262; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm0 2263; AVX1-NEXT: vpmaddwd %xmm5, %xmm1, %xmm1 2264; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm1, %ymm1 2265; AVX1-NEXT: vpmaddwd %xmm6, %xmm2, %xmm2 2266; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm2 2267; AVX1-NEXT: vpmaddwd %xmm7, %xmm3, %xmm3 2268; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3 2269; AVX1-NEXT: retq 2270; 2271; AVX2-LABEL: jumbled_indices32: 2272; AVX2: # %bb.0: 2273; AVX2-NEXT: vpmaddwd %ymm4, %ymm0, %ymm0 2274; AVX2-NEXT: vpmaddwd %ymm5, %ymm1, %ymm1 2275; AVX2-NEXT: vpmaddwd %ymm6, %ymm2, %ymm2 2276; AVX2-NEXT: vpmaddwd %ymm7, %ymm3, %ymm3 2277; AVX2-NEXT: retq 2278; 2279; AVX512F-LABEL: jumbled_indices32: 2280; AVX512F: # %bb.0: 2281; AVX512F-NEXT: vpmaddwd %ymm5, %ymm1, %ymm1 2282; AVX512F-NEXT: vpmaddwd %ymm4, %ymm0, %ymm0 2283; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 2284; AVX512F-NEXT: vpmaddwd %ymm7, %ymm3, %ymm1 2285; AVX512F-NEXT: vpmaddwd %ymm6, %ymm2, %ymm2 2286; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 2287; AVX512F-NEXT: retq 2288; 2289; AVX512BW-LABEL: jumbled_indices32: 2290; AVX512BW: # %bb.0: 2291; AVX512BW-NEXT: vpmaddwd %zmm2, %zmm0, %zmm0 2292; AVX512BW-NEXT: vpmaddwd %zmm3, %zmm1, %zmm1 2293; AVX512BW-NEXT: retq 2294 %exta = sext <64 x i16> %A to <64 x i32> 2295 %extb = sext <64 x i16> %B to <64 x i32> 2296 %m = mul <64 x i32> %exta, %extb 2297 %sa = shufflevector <64 x i32> %m, <64 x i32> undef, <32 x i32> <i32 1, i32 2, i32 6, i32 5, i32 10, i32 8, i32 14, i32 12, i32 19, i32 17, i32 22, i32 20, i32 25, i32 27, i32 30, i32 28, i32 32, i32 34, i32 37, i32 38, i32 41, i32 43, i32 45, i32 47, i32 50, i32 48, i32 52, i32 54, i32 59, i32 56, i32 61, i32 63> 2298 %sb = shufflevector <64 x i32> %m, <64 x i32> undef, <32 x i32> <i32 0, i32 3, i32 7, i32 4, i32 11, i32 9, i32 15, i32 13, i32 18, i32 16, i32 23, i32 21, i32 24, i32 26, i32 31, i32 29, i32 33, i32 35, i32 36, i32 39, i32 40, i32 42, i32 44, i32 46, i32 51, i32 49, i32 53, i32 55, i32 58, i32 57, i32 60, i32 62> 2299 %a = add <32 x i32> %sa, %sb 2300 ret <32 x i32> %a 2301} 2302 2303; NOTE: We're testing with loads because ABI lowering creates a concat_vectors that extract_vector_elt creation can see through. 2304; This would require the combine to recreate the concat_vectors. 
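; The pmaddwd_* tests below spell the pmaddwd pattern out on loaded vectors: the i16 inputs
; are split into even and odd elements, each half is sign extended to i32, the halves are
; multiplied, and the even and odd products are added.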
2305define <4 x i32> @pmaddwd_128(<8 x i16>* %Aptr, <8 x i16>* %Bptr) { 2306; SSE2-LABEL: pmaddwd_128: 2307; SSE2: # %bb.0: 2308; SSE2-NEXT: movdqa (%rdi), %xmm0 2309; SSE2-NEXT: pmaddwd (%rsi), %xmm0 2310; SSE2-NEXT: retq 2311; 2312; AVX-LABEL: pmaddwd_128: 2313; AVX: # %bb.0: 2314; AVX-NEXT: vmovdqa (%rdi), %xmm0 2315; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0 2316; AVX-NEXT: retq 2317 %A = load <8 x i16>, <8 x i16>* %Aptr 2318 %B = load <8 x i16>, <8 x i16>* %Bptr 2319 %A_even = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 2320 %A_odd = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 2321 %B_even = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 2322 %B_odd = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 2323 %A_even_ext = sext <4 x i16> %A_even to <4 x i32> 2324 %B_even_ext = sext <4 x i16> %B_even to <4 x i32> 2325 %A_odd_ext = sext <4 x i16> %A_odd to <4 x i32> 2326 %B_odd_ext = sext <4 x i16> %B_odd to <4 x i32> 2327 %even_mul = mul <4 x i32> %A_even_ext, %B_even_ext 2328 %odd_mul = mul <4 x i32> %A_odd_ext, %B_odd_ext 2329 %add = add <4 x i32> %even_mul, %odd_mul 2330 ret <4 x i32> %add 2331} 2332 2333define <8 x i32> @pmaddwd_256(<16 x i16>* %Aptr, <16 x i16>* %Bptr) { 2334; SSE2-LABEL: pmaddwd_256: 2335; SSE2: # %bb.0: 2336; SSE2-NEXT: movdqa (%rdi), %xmm0 2337; SSE2-NEXT: movdqa 16(%rdi), %xmm1 2338; SSE2-NEXT: pmaddwd (%rsi), %xmm0 2339; SSE2-NEXT: pmaddwd 16(%rsi), %xmm1 2340; SSE2-NEXT: retq 2341; 2342; AVX1-LABEL: pmaddwd_256: 2343; AVX1: # %bb.0: 2344; AVX1-NEXT: vmovdqa (%rdi), %ymm0 2345; AVX1-NEXT: vmovdqa (%rsi), %ymm1 2346; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2347; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 2348; AVX1-NEXT: vpmaddwd %xmm2, %xmm3, %xmm2 2349; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 2350; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2351; AVX1-NEXT: retq 2352; 2353; AVX256-LABEL: pmaddwd_256: 2354; AVX256: # %bb.0: 2355; AVX256-NEXT: vmovdqa (%rdi), %ymm0 2356; AVX256-NEXT: vpmaddwd (%rsi), %ymm0, %ymm0 2357; AVX256-NEXT: retq 2358 %A = load <16 x i16>, <16 x i16>* %Aptr 2359 %B = load <16 x i16>, <16 x i16>* %Bptr 2360 %A_even = shufflevector <16 x i16> %A, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> 2361 %A_odd = shufflevector <16 x i16> %A, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> 2362 %B_even = shufflevector <16 x i16> %B, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> 2363 %B_odd = shufflevector <16 x i16> %B, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> 2364 %A_even_ext = sext <8 x i16> %A_even to <8 x i32> 2365 %B_even_ext = sext <8 x i16> %B_even to <8 x i32> 2366 %A_odd_ext = sext <8 x i16> %A_odd to <8 x i32> 2367 %B_odd_ext = sext <8 x i16> %B_odd to <8 x i32> 2368 %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext 2369 %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext 2370 %add = add <8 x i32> %even_mul, %odd_mul 2371 ret <8 x i32> %add 2372} 2373 2374define <16 x i32> @pmaddwd_512(<32 x i16>* %Aptr, <32 x i16>* %Bptr) { 2375; SSE2-LABEL: pmaddwd_512: 2376; SSE2: # %bb.0: 2377; SSE2-NEXT: movdqa (%rdi), %xmm0 2378; SSE2-NEXT: movdqa 16(%rdi), %xmm1 2379; SSE2-NEXT: movdqa 32(%rdi), %xmm2 2380; SSE2-NEXT: movdqa 48(%rdi), %xmm3 2381; SSE2-NEXT: pmaddwd (%rsi), %xmm0 2382; SSE2-NEXT: pmaddwd 16(%rsi), %xmm1 2383; SSE2-NEXT: 
pmaddwd 32(%rsi), %xmm2 2384; SSE2-NEXT: pmaddwd 48(%rsi), %xmm3 2385; SSE2-NEXT: retq 2386; 2387; AVX1-LABEL: pmaddwd_512: 2388; AVX1: # %bb.0: 2389; AVX1-NEXT: vmovdqa (%rdi), %ymm0 2390; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1 2391; AVX1-NEXT: vmovdqa (%rsi), %ymm2 2392; AVX1-NEXT: vmovdqa 32(%rsi), %ymm3 2393; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 2394; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 2395; AVX1-NEXT: vpmaddwd %xmm4, %xmm5, %xmm4 2396; AVX1-NEXT: vpmaddwd %xmm2, %xmm0, %xmm0 2397; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 2398; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 2399; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 2400; AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2 2401; AVX1-NEXT: vpmaddwd %xmm3, %xmm1, %xmm1 2402; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2403; AVX1-NEXT: retq 2404; 2405; AVX2-LABEL: pmaddwd_512: 2406; AVX2: # %bb.0: 2407; AVX2-NEXT: vmovdqa (%rdi), %ymm0 2408; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 2409; AVX2-NEXT: vpmaddwd (%rsi), %ymm0, %ymm0 2410; AVX2-NEXT: vpmaddwd 32(%rsi), %ymm1, %ymm1 2411; AVX2-NEXT: retq 2412; 2413; AVX512F-LABEL: pmaddwd_512: 2414; AVX512F: # %bb.0: 2415; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 2416; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 2417; AVX512F-NEXT: vpmaddwd 32(%rsi), %ymm1, %ymm1 2418; AVX512F-NEXT: vpmaddwd (%rsi), %ymm0, %ymm0 2419; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 2420; AVX512F-NEXT: retq 2421; 2422; AVX512BW-LABEL: pmaddwd_512: 2423; AVX512BW: # %bb.0: 2424; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 2425; AVX512BW-NEXT: vpmaddwd (%rsi), %zmm0, %zmm0 2426; AVX512BW-NEXT: retq 2427 %A = load <32 x i16>, <32 x i16>* %Aptr 2428 %B = load <32 x i16>, <32 x i16>* %Bptr 2429 %A_even = shufflevector <32 x i16> %A, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> 2430 %A_odd = shufflevector <32 x i16> %A, <32 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31> 2431 %B_even = shufflevector <32 x i16> %B, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> 2432 %B_odd = shufflevector <32 x i16> %B, <32 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31> 2433 %A_even_ext = sext <16 x i16> %A_even to <16 x i32> 2434 %B_even_ext = sext <16 x i16> %B_even to <16 x i32> 2435 %A_odd_ext = sext <16 x i16> %A_odd to <16 x i32> 2436 %B_odd_ext = sext <16 x i16> %B_odd to <16 x i32> 2437 %even_mul = mul <16 x i32> %A_even_ext, %B_even_ext 2438 %odd_mul = mul <16 x i32> %A_odd_ext, %B_odd_ext 2439 %add = add <16 x i32> %even_mul, %odd_mul 2440 ret <16 x i32> %add 2441} 2442 2443define <32 x i32> @pmaddwd_1024(<64 x i16>* %Aptr, <64 x i16>* %Bptr) { 2444; SSE2-LABEL: pmaddwd_1024: 2445; SSE2: # %bb.0: 2446; SSE2-NEXT: movdqa 112(%rsi), %xmm0 2447; SSE2-NEXT: movdqa 96(%rsi), %xmm1 2448; SSE2-NEXT: movdqa 80(%rsi), %xmm2 2449; SSE2-NEXT: movdqa 64(%rsi), %xmm3 2450; SSE2-NEXT: movdqa (%rsi), %xmm4 2451; SSE2-NEXT: movdqa 16(%rsi), %xmm5 2452; SSE2-NEXT: movdqa 32(%rsi), %xmm6 2453; SSE2-NEXT: movdqa 48(%rsi), %xmm7 2454; SSE2-NEXT: pmaddwd (%rdx), %xmm4 2455; SSE2-NEXT: pmaddwd 16(%rdx), %xmm5 2456; SSE2-NEXT: pmaddwd 32(%rdx), %xmm6 2457; SSE2-NEXT: pmaddwd 48(%rdx), %xmm7 2458; SSE2-NEXT: pmaddwd 64(%rdx), %xmm3 2459; SSE2-NEXT: pmaddwd 
80(%rdx), %xmm2 2460; SSE2-NEXT: pmaddwd 96(%rdx), %xmm1 2461; SSE2-NEXT: pmaddwd 112(%rdx), %xmm0 2462; SSE2-NEXT: movdqa %xmm0, 112(%rdi) 2463; SSE2-NEXT: movdqa %xmm1, 96(%rdi) 2464; SSE2-NEXT: movdqa %xmm2, 80(%rdi) 2465; SSE2-NEXT: movdqa %xmm3, 64(%rdi) 2466; SSE2-NEXT: movdqa %xmm7, 48(%rdi) 2467; SSE2-NEXT: movdqa %xmm6, 32(%rdi) 2468; SSE2-NEXT: movdqa %xmm5, 16(%rdi) 2469; SSE2-NEXT: movdqa %xmm4, (%rdi) 2470; SSE2-NEXT: movq %rdi, %rax 2471; SSE2-NEXT: retq 2472; 2473; AVX1-LABEL: pmaddwd_1024: 2474; AVX1: # %bb.0: 2475; AVX1-NEXT: vmovdqa (%rdi), %ymm0 2476; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1 2477; AVX1-NEXT: vmovdqa 64(%rdi), %ymm2 2478; AVX1-NEXT: vmovdqa 96(%rdi), %ymm8 2479; AVX1-NEXT: vmovdqa (%rsi), %ymm4 2480; AVX1-NEXT: vmovdqa 32(%rsi), %ymm5 2481; AVX1-NEXT: vmovdqa 64(%rsi), %ymm6 2482; AVX1-NEXT: vmovdqa 96(%rsi), %ymm9 2483; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm3 2484; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 2485; AVX1-NEXT: vpmaddwd %xmm3, %xmm7, %xmm3 2486; AVX1-NEXT: vpmaddwd %xmm4, %xmm0, %xmm0 2487; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 2488; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm3 2489; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 2490; AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3 2491; AVX1-NEXT: vpmaddwd %xmm5, %xmm1, %xmm1 2492; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 2493; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm3 2494; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 2495; AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3 2496; AVX1-NEXT: vpmaddwd %xmm6, %xmm2, %xmm2 2497; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 2498; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3 2499; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm4 2500; AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3 2501; AVX1-NEXT: vpmaddwd %xmm9, %xmm8, %xmm4 2502; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 2503; AVX1-NEXT: retq 2504; 2505; AVX2-LABEL: pmaddwd_1024: 2506; AVX2: # %bb.0: 2507; AVX2-NEXT: vmovdqa (%rdi), %ymm0 2508; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 2509; AVX2-NEXT: vmovdqa 64(%rdi), %ymm2 2510; AVX2-NEXT: vmovdqa 96(%rdi), %ymm3 2511; AVX2-NEXT: vpmaddwd (%rsi), %ymm0, %ymm0 2512; AVX2-NEXT: vpmaddwd 32(%rsi), %ymm1, %ymm1 2513; AVX2-NEXT: vpmaddwd 64(%rsi), %ymm2, %ymm2 2514; AVX2-NEXT: vpmaddwd 96(%rsi), %ymm3, %ymm3 2515; AVX2-NEXT: retq 2516; 2517; AVX512F-LABEL: pmaddwd_1024: 2518; AVX512F: # %bb.0: 2519; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 2520; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 2521; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm2 2522; AVX512F-NEXT: vmovdqa 96(%rdi), %ymm3 2523; AVX512F-NEXT: vpmaddwd 32(%rsi), %ymm1, %ymm1 2524; AVX512F-NEXT: vpmaddwd (%rsi), %ymm0, %ymm0 2525; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 2526; AVX512F-NEXT: vpmaddwd 96(%rsi), %ymm3, %ymm1 2527; AVX512F-NEXT: vpmaddwd 64(%rsi), %ymm2, %ymm2 2528; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 2529; AVX512F-NEXT: retq 2530; 2531; AVX512BW-LABEL: pmaddwd_1024: 2532; AVX512BW: # %bb.0: 2533; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 2534; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 2535; AVX512BW-NEXT: vpmaddwd (%rsi), %zmm0, %zmm0 2536; AVX512BW-NEXT: vpmaddwd 64(%rsi), %zmm1, %zmm1 2537; AVX512BW-NEXT: retq 2538 %A = load <64 x i16>, <64 x i16>* %Aptr 2539 %B = load <64 x i16>, <64 x i16>* %Bptr 2540 %A_even = shufflevector <64 x i16> %A, <64 x i16> undef, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 
  %A_odd = shufflevector <64 x i16> %A, <64 x i16> undef, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
  %B_even = shufflevector <64 x i16> %B, <64 x i16> undef, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
  %B_odd = shufflevector <64 x i16> %B, <64 x i16> undef, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
  %A_even_ext = sext <32 x i16> %A_even to <32 x i32>
  %B_even_ext = sext <32 x i16> %B_even to <32 x i32>
  %A_odd_ext = sext <32 x i16> %A_odd to <32 x i32>
  %B_odd_ext = sext <32 x i16> %B_odd to <32 x i32>
  %even_mul = mul <32 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <32 x i32> %A_odd_ext, %B_odd_ext
  %add = add <32 x i32> %even_mul, %odd_mul
  ret <32 x i32> %add
}

; The odd-element multiply has its operands commuted relative to the even-element
; multiply; pmaddwd should still be formed.
define <4 x i32> @pmaddwd_commuted_mul(<8 x i16>* %Aptr, <8 x i16>* %Bptr) {
; SSE2-LABEL: pmaddwd_commuted_mul:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pmaddwd (%rsi), %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: pmaddwd_commuted_mul:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0
; AVX-NEXT: retq
  %A = load <8 x i16>, <8 x i16>* %Aptr
  %B = load <8 x i16>, <8 x i16>* %Bptr
  %A_even = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %A_odd = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %B_even = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %B_odd = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %A_even_ext = sext <4 x i16> %A_even to <4 x i32>
  %B_even_ext = sext <4 x i16> %B_even to <4 x i32>
  %A_odd_ext = sext <4 x i16> %A_odd to <4 x i32>
  %B_odd_ext = sext <4 x i16> %B_odd to <4 x i32>
  %even_mul = mul <4 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <4 x i32> %B_odd_ext, %A_odd_ext ; Different order than previous mul
  %add = add <4 x i32> %even_mul, %odd_mul
  ret <4 x i32> %add
}

; The shuffle indices are not split into strictly even/odd halves, but A and B use
; the same index pattern, so the elements still pair up and pmaddwd can be formed.
define <4 x i32> @pmaddwd_swapped_indices(<8 x i16>* %Aptr, <8 x i16>* %Bptr) {
; SSE2-LABEL: pmaddwd_swapped_indices:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pmaddwd (%rsi), %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: pmaddwd_swapped_indices:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0
; AVX-NEXT: retq
  %A = load <8 x i16>, <8 x i16>* %Aptr
  %B = load <8 x i16>, <8 x i16>* %Bptr
  %A_even = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 1, i32 2, i32 5, i32 6> ; indices aren't all even
  %A_odd = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 7> ; indices aren't all odd
  %B_even = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 2, i32 5, i32 6> ; same indices as A
  %B_odd = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 7> ; same indices as A
  %A_even_ext = sext <4 x i16> %A_even to <4 x i32>
  %B_even_ext = sext <4 x i16> %B_even to <4 x i32>
  %A_odd_ext = sext <4 x i16> %A_odd to <4 x i32>
  %B_odd_ext = sext <4 x i16> %B_odd to <4 x i32>
  %even_mul = mul <4 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <4 x i32> %A_odd_ext, %B_odd_ext
  %add = add <4 x i32> %even_mul, %odd_mul
  ret <4 x i32> %add
}

; Negative test where the indices aren't paired properly
define <4 x i32> @pmaddwd_bad_indices(<8 x i16>* %Aptr, <8 x i16>* %Bptr) {
; SSE2-LABEL: pmaddwd_bad_indices:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa (%rsi), %xmm1
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[2,1,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7]
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pmulhw %xmm2, %xmm4
; SSE2-NEXT: pmullw %xmm2, %xmm3
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pmulhw %xmm1, %xmm2
; SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT: paddd %xmm3, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: pmaddwd_bad_indices:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa (%rsi), %xmm1
; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,3,4,5,10,11,12,13,12,13,10,11,12,13,14,15]
; AVX-NEXT: vpmovsxwd %xmm2, %xmm2
; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX-NEXT: vpmovsxwd %xmm3, %xmm3
; AVX-NEXT: vpmulld %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,8,9,14,15,8,9,14,15,12,13,14,15]
; AVX-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX-NEXT: vpmovsxwd %xmm1, %xmm1
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpaddd %xmm0, %xmm2, %xmm0
; AVX-NEXT: retq
  %A = load <8 x i16>, <8 x i16>* %Aptr
  %B = load <8 x i16>, <8 x i16>* %Bptr
  %A_even = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
  %A_odd = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
  %B_even = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> ; different indices than A
  %B_odd = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> ; different indices than A
  %A_even_ext = sext <4 x i16> %A_even to <4 x i32>
  %B_even_ext = sext <4 x i16> %B_even to <4 x i32>
  %A_odd_ext = sext <4 x i16> %A_odd to <4 x i32>
  %B_odd_ext = sext <4 x i16> %B_odd to <4 x i32>
  %even_mul = mul <4 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <4 x i32> %A_odd_ext, %B_odd_ext
  %add = add <4 x i32> %even_mul, %odd_mul
  ret <4 x i32> %add
}