1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX256,AVX2
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX256,AVX512,AVX512F
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX256,AVX512,AVX512BW
7
8define i32 @_Z10test_shortPsS_i_128(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 {
9; SSE2-LABEL: _Z10test_shortPsS_i_128:
10; SSE2:       # %bb.0: # %entry
11; SSE2-NEXT:    movl %edx, %eax
12; SSE2-NEXT:    pxor %xmm0, %xmm0
13; SSE2-NEXT:    xorl %ecx, %ecx
14; SSE2-NEXT:    .p2align 4, 0x90
15; SSE2-NEXT:  .LBB0_1: # %vector.body
16; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
17; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
18; SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
19; SSE2-NEXT:    movdqa %xmm2, %xmm3
20; SSE2-NEXT:    pmulhw %xmm1, %xmm3
21; SSE2-NEXT:    pmullw %xmm1, %xmm2
22; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
23; SSE2-NEXT:    paddd %xmm2, %xmm0
24; SSE2-NEXT:    addq $8, %rcx
25; SSE2-NEXT:    cmpq %rcx, %rax
26; SSE2-NEXT:    jne .LBB0_1
27; SSE2-NEXT:  # %bb.2: # %middle.block
28; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
29; SSE2-NEXT:    paddd %xmm0, %xmm1
30; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
31; SSE2-NEXT:    paddd %xmm1, %xmm0
32; SSE2-NEXT:    movd %xmm0, %eax
33; SSE2-NEXT:    retq
34;
35; AVX-LABEL: _Z10test_shortPsS_i_128:
36; AVX:       # %bb.0: # %entry
37; AVX-NEXT:    movl %edx, %eax
38; AVX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
39; AVX-NEXT:    xorl %ecx, %ecx
40; AVX-NEXT:    .p2align 4, 0x90
41; AVX-NEXT:  .LBB0_1: # %vector.body
42; AVX-NEXT:    # =>This Inner Loop Header: Depth=1
43; AVX-NEXT:    vpmovsxwd (%rdi,%rcx,2), %xmm1
44; AVX-NEXT:    vpmovsxwd (%rsi,%rcx,2), %xmm2
45; AVX-NEXT:    vpmulld %xmm1, %xmm2, %xmm1
46; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
47; AVX-NEXT:    addq $8, %rcx
48; AVX-NEXT:    cmpq %rcx, %rax
49; AVX-NEXT:    jne .LBB0_1
50; AVX-NEXT:  # %bb.2: # %middle.block
51; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
52; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
53; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
54; AVX-NEXT:    vmovd %xmm0, %eax
55; AVX-NEXT:    retq
56entry:
57  %3 = zext i32 %2 to i64
58  br label %vector.body
59
60vector.body:
61  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
62  %vec.phi = phi <4 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
63  %4 = getelementptr inbounds i16, i16* %0, i64 %index
64  %5 = bitcast i16* %4 to <4 x i16>*
65  %wide.load = load <4 x i16>, <4 x i16>* %5, align 2
66  %6 = sext <4 x i16> %wide.load to <4 x i32>
67  %7 = getelementptr inbounds i16, i16* %1, i64 %index
68  %8 = bitcast i16* %7 to <4 x i16>*
69  %wide.load14 = load <4 x i16>, <4 x i16>* %8, align 2
70  %9 = sext <4 x i16> %wide.load14 to <4 x i32>
71  %10 = mul nsw <4 x i32> %9, %6
72  %11 = add nsw <4 x i32> %10, %vec.phi
73  %index.next = add i64 %index, 8
74  %12 = icmp eq i64 %index.next, %3
75  br i1 %12, label %middle.block, label %vector.body
76
77middle.block:
78  %rdx.shuf15 = shufflevector <4 x i32> %11, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
79  %bin.rdx16 = add <4 x i32> %11, %rdx.shuf15
80  %rdx.shuf17 = shufflevector <4 x i32> %bin.rdx16, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
81  %bin.rdx18 = add <4 x i32> %bin.rdx16, %rdx.shuf17
82  %13 = extractelement <4 x i32> %bin.rdx18, i32 0
83  ret i32 %13
84}
85
86define i32 @_Z10test_shortPsS_i_256(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 {
87; SSE2-LABEL: _Z10test_shortPsS_i_256:
88; SSE2:       # %bb.0: # %entry
89; SSE2-NEXT:    movl %edx, %eax
90; SSE2-NEXT:    pxor %xmm0, %xmm0
91; SSE2-NEXT:    xorl %ecx, %ecx
92; SSE2-NEXT:    pxor %xmm1, %xmm1
93; SSE2-NEXT:    .p2align 4, 0x90
94; SSE2-NEXT:  .LBB1_1: # %vector.body
95; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
96; SSE2-NEXT:    movdqu (%rdi,%rcx,2), %xmm2
97; SSE2-NEXT:    movdqu (%rsi,%rcx,2), %xmm3
98; SSE2-NEXT:    pmaddwd %xmm2, %xmm3
99; SSE2-NEXT:    paddd %xmm3, %xmm1
100; SSE2-NEXT:    addq $8, %rcx
101; SSE2-NEXT:    cmpq %rcx, %rax
102; SSE2-NEXT:    jne .LBB1_1
103; SSE2-NEXT:  # %bb.2: # %middle.block
104; SSE2-NEXT:    paddd %xmm0, %xmm1
105; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
106; SSE2-NEXT:    paddd %xmm1, %xmm0
107; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
108; SSE2-NEXT:    paddd %xmm0, %xmm1
109; SSE2-NEXT:    movd %xmm1, %eax
110; SSE2-NEXT:    retq
111;
112; AVX1-LABEL: _Z10test_shortPsS_i_256:
113; AVX1:       # %bb.0: # %entry
114; AVX1-NEXT:    movl %edx, %eax
115; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
116; AVX1-NEXT:    xorl %ecx, %ecx
117; AVX1-NEXT:    .p2align 4, 0x90
118; AVX1-NEXT:  .LBB1_1: # %vector.body
119; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
120; AVX1-NEXT:    vmovdqu (%rsi,%rcx,2), %xmm1
121; AVX1-NEXT:    vpmaddwd (%rdi,%rcx,2), %xmm1, %xmm1
122; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm1
123; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
124; AVX1-NEXT:    addq $8, %rcx
125; AVX1-NEXT:    cmpq %rcx, %rax
126; AVX1-NEXT:    jne .LBB1_1
127; AVX1-NEXT:  # %bb.2: # %middle.block
128; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
129; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
130; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
131; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
132; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
133; AVX1-NEXT:    vmovd %xmm0, %eax
134; AVX1-NEXT:    vzeroupper
135; AVX1-NEXT:    retq
136;
137; AVX256-LABEL: _Z10test_shortPsS_i_256:
138; AVX256:       # %bb.0: # %entry
139; AVX256-NEXT:    movl %edx, %eax
140; AVX256-NEXT:    vpxor %xmm0, %xmm0, %xmm0
141; AVX256-NEXT:    xorl %ecx, %ecx
142; AVX256-NEXT:    .p2align 4, 0x90
143; AVX256-NEXT:  .LBB1_1: # %vector.body
144; AVX256-NEXT:    # =>This Inner Loop Header: Depth=1
145; AVX256-NEXT:    vmovdqu (%rsi,%rcx,2), %xmm1
146; AVX256-NEXT:    vpmaddwd (%rdi,%rcx,2), %xmm1, %xmm1
147; AVX256-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
148; AVX256-NEXT:    addq $8, %rcx
149; AVX256-NEXT:    cmpq %rcx, %rax
150; AVX256-NEXT:    jne .LBB1_1
151; AVX256-NEXT:  # %bb.2: # %middle.block
152; AVX256-NEXT:    vextracti128 $1, %ymm0, %xmm1
153; AVX256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
154; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
155; AVX256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
156; AVX256-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
157; AVX256-NEXT:    vmovd %xmm0, %eax
158; AVX256-NEXT:    vzeroupper
159; AVX256-NEXT:    retq
160entry:
161  %3 = zext i32 %2 to i64
162  br label %vector.body
163
164vector.body:
165  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
166  %vec.phi = phi <8 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
167  %4 = getelementptr inbounds i16, i16* %0, i64 %index
168  %5 = bitcast i16* %4 to <8 x i16>*
169  %wide.load = load <8 x i16>, <8 x i16>* %5, align 2
170  %6 = sext <8 x i16> %wide.load to <8 x i32>
171  %7 = getelementptr inbounds i16, i16* %1, i64 %index
172  %8 = bitcast i16* %7 to <8 x i16>*
173  %wide.load14 = load <8 x i16>, <8 x i16>* %8, align 2
174  %9 = sext <8 x i16> %wide.load14 to <8 x i32>
175  %10 = mul nsw <8 x i32> %9, %6
176  %11 = add nsw <8 x i32> %10, %vec.phi
177  %index.next = add i64 %index, 8
178  %12 = icmp eq i64 %index.next, %3
179  br i1 %12, label %middle.block, label %vector.body
180
181middle.block:
182  %rdx.shuf = shufflevector <8 x i32> %11, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
183  %bin.rdx = add <8 x i32> %11, %rdx.shuf
184  %rdx.shuf15 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
185  %bin.rdx16 = add <8 x i32> %bin.rdx, %rdx.shuf15
186  %rdx.shuf17 = shufflevector <8 x i32> %bin.rdx16, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
187  %bin.rdx18 = add <8 x i32> %bin.rdx16, %rdx.shuf17
188  %13 = extractelement <8 x i32> %bin.rdx18, i32 0
189  ret i32 %13
190}
191
192define i32 @_Z10test_shortPsS_i_512(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 {
193; SSE2-LABEL: _Z10test_shortPsS_i_512:
194; SSE2:       # %bb.0: # %entry
195; SSE2-NEXT:    movl %edx, %eax
196; SSE2-NEXT:    pxor %xmm0, %xmm0
197; SSE2-NEXT:    xorl %ecx, %ecx
198; SSE2-NEXT:    pxor %xmm2, %xmm2
199; SSE2-NEXT:    pxor %xmm1, %xmm1
200; SSE2-NEXT:    .p2align 4, 0x90
201; SSE2-NEXT:  .LBB2_1: # %vector.body
202; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
203; SSE2-NEXT:    movdqu (%rdi,%rcx,2), %xmm3
204; SSE2-NEXT:    movdqu 16(%rdi,%rcx,2), %xmm4
205; SSE2-NEXT:    movdqu (%rsi,%rcx,2), %xmm5
206; SSE2-NEXT:    pmaddwd %xmm3, %xmm5
207; SSE2-NEXT:    paddd %xmm5, %xmm2
208; SSE2-NEXT:    movdqu 16(%rsi,%rcx,2), %xmm3
209; SSE2-NEXT:    pmaddwd %xmm4, %xmm3
210; SSE2-NEXT:    paddd %xmm3, %xmm1
211; SSE2-NEXT:    addq $16, %rcx
212; SSE2-NEXT:    cmpq %rcx, %rax
213; SSE2-NEXT:    jne .LBB2_1
214; SSE2-NEXT:  # %bb.2: # %middle.block
215; SSE2-NEXT:    paddd %xmm0, %xmm2
216; SSE2-NEXT:    paddd %xmm0, %xmm1
217; SSE2-NEXT:    paddd %xmm2, %xmm1
218; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
219; SSE2-NEXT:    paddd %xmm1, %xmm0
220; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
221; SSE2-NEXT:    paddd %xmm0, %xmm1
222; SSE2-NEXT:    movd %xmm1, %eax
223; SSE2-NEXT:    retq
224;
225; AVX1-LABEL: _Z10test_shortPsS_i_512:
226; AVX1:       # %bb.0: # %entry
227; AVX1-NEXT:    movl %edx, %eax
228; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
229; AVX1-NEXT:    xorl %ecx, %ecx
230; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
231; AVX1-NEXT:    .p2align 4, 0x90
232; AVX1-NEXT:  .LBB2_1: # %vector.body
233; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
234; AVX1-NEXT:    vmovdqu (%rdi,%rcx,2), %ymm2
235; AVX1-NEXT:    vmovdqu (%rsi,%rcx,2), %ymm3
236; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
237; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
238; AVX1-NEXT:    vpmaddwd %xmm4, %xmm5, %xmm4
239; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
240; AVX1-NEXT:    vpaddd %xmm5, %xmm4, %xmm4
241; AVX1-NEXT:    vpmaddwd %xmm2, %xmm3, %xmm2
242; AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
243; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
244; AVX1-NEXT:    addq $16, %rcx
245; AVX1-NEXT:    cmpq %rcx, %rax
246; AVX1-NEXT:    jne .LBB2_1
247; AVX1-NEXT:  # %bb.2: # %middle.block
248; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
249; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
250; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
251; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
252; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
253; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
254; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
255; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
256; AVX1-NEXT:    vmovd %xmm0, %eax
257; AVX1-NEXT:    vzeroupper
258; AVX1-NEXT:    retq
259;
260; AVX2-LABEL: _Z10test_shortPsS_i_512:
261; AVX2:       # %bb.0: # %entry
262; AVX2-NEXT:    movl %edx, %eax
263; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0
264; AVX2-NEXT:    xorl %ecx, %ecx
265; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
266; AVX2-NEXT:    .p2align 4, 0x90
267; AVX2-NEXT:  .LBB2_1: # %vector.body
268; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
269; AVX2-NEXT:    vmovdqu (%rsi,%rcx,2), %ymm2
270; AVX2-NEXT:    vpmaddwd (%rdi,%rcx,2), %ymm2, %ymm2
271; AVX2-NEXT:    vpaddd %ymm1, %ymm2, %ymm1
272; AVX2-NEXT:    addq $16, %rcx
273; AVX2-NEXT:    cmpq %rcx, %rax
274; AVX2-NEXT:    jne .LBB2_1
275; AVX2-NEXT:  # %bb.2: # %middle.block
276; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
277; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
278; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
279; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
280; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
281; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
282; AVX2-NEXT:    vmovd %xmm0, %eax
283; AVX2-NEXT:    vzeroupper
284; AVX2-NEXT:    retq
285;
286; AVX512-LABEL: _Z10test_shortPsS_i_512:
287; AVX512:       # %bb.0: # %entry
288; AVX512-NEXT:    movl %edx, %eax
289; AVX512-NEXT:    vpxor %xmm0, %xmm0, %xmm0
290; AVX512-NEXT:    xorl %ecx, %ecx
291; AVX512-NEXT:    .p2align 4, 0x90
292; AVX512-NEXT:  .LBB2_1: # %vector.body
293; AVX512-NEXT:    # =>This Inner Loop Header: Depth=1
294; AVX512-NEXT:    vmovdqu (%rsi,%rcx,2), %ymm1
295; AVX512-NEXT:    vpmaddwd (%rdi,%rcx,2), %ymm1, %ymm1
296; AVX512-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
297; AVX512-NEXT:    addq $16, %rcx
298; AVX512-NEXT:    cmpq %rcx, %rax
299; AVX512-NEXT:    jne .LBB2_1
300; AVX512-NEXT:  # %bb.2: # %middle.block
301; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
302; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
303; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
304; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
305; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
306; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
307; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
308; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
309; AVX512-NEXT:    vmovd %xmm0, %eax
310; AVX512-NEXT:    vzeroupper
311; AVX512-NEXT:    retq
312entry:
313  %3 = zext i32 %2 to i64
314  br label %vector.body
315
316vector.body:
317  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
318  %vec.phi = phi <16 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
319  %4 = getelementptr inbounds i16, i16* %0, i64 %index
320  %5 = bitcast i16* %4 to <16 x i16>*
321  %wide.load = load <16 x i16>, <16 x i16>* %5, align 2
322  %6 = sext <16 x i16> %wide.load to <16 x i32>
323  %7 = getelementptr inbounds i16, i16* %1, i64 %index
324  %8 = bitcast i16* %7 to <16 x i16>*
325  %wide.load14 = load <16 x i16>, <16 x i16>* %8, align 2
326  %9 = sext <16 x i16> %wide.load14 to <16 x i32>
327  %10 = mul nsw <16 x i32> %9, %6
328  %11 = add nsw <16 x i32> %10, %vec.phi
329  %index.next = add i64 %index, 16
330  %12 = icmp eq i64 %index.next, %3
331  br i1 %12, label %middle.block, label %vector.body
332
333middle.block:
334  %rdx.shuf1 = shufflevector <16 x i32> %11, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
335  %bin.rdx1 = add <16 x i32> %11, %rdx.shuf1
336  %rdx.shuf = shufflevector <16 x i32> %bin.rdx1, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
337  %bin.rdx = add <16 x i32> %bin.rdx1, %rdx.shuf
338  %rdx.shuf15 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
339  %bin.rdx16 = add <16 x i32> %bin.rdx, %rdx.shuf15
340  %rdx.shuf17 = shufflevector <16 x i32> %bin.rdx16, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
341  %bin.rdx18 = add <16 x i32> %bin.rdx16, %rdx.shuf17
342  %13 = extractelement <16 x i32> %bin.rdx18, i32 0
343  ret i32 %13
344}
345
346define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 {
347; SSE2-LABEL: _Z10test_shortPsS_i_1024:
348; SSE2:       # %bb.0: # %entry
349; SSE2-NEXT:    movl %edx, %eax
350; SSE2-NEXT:    pxor %xmm8, %xmm8
351; SSE2-NEXT:    xorl %ecx, %ecx
352; SSE2-NEXT:    pxor %xmm2, %xmm2
353; SSE2-NEXT:    pxor %xmm4, %xmm4
354; SSE2-NEXT:    pxor %xmm1, %xmm1
355; SSE2-NEXT:    pxor %xmm3, %xmm3
356; SSE2-NEXT:    .p2align 4, 0x90
357; SSE2-NEXT:  .LBB3_1: # %vector.body
358; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
359; SSE2-NEXT:    movdqu (%rdi,%rcx,2), %xmm5
360; SSE2-NEXT:    movdqu 16(%rdi,%rcx,2), %xmm6
361; SSE2-NEXT:    movdqu 32(%rdi,%rcx,2), %xmm7
362; SSE2-NEXT:    movdqu 48(%rdi,%rcx,2), %xmm9
363; SSE2-NEXT:    movdqu (%rsi,%rcx,2), %xmm0
364; SSE2-NEXT:    pmaddwd %xmm5, %xmm0
365; SSE2-NEXT:    paddd %xmm0, %xmm2
366; SSE2-NEXT:    movdqu 16(%rsi,%rcx,2), %xmm0
367; SSE2-NEXT:    pmaddwd %xmm6, %xmm0
368; SSE2-NEXT:    paddd %xmm0, %xmm4
369; SSE2-NEXT:    movdqu 32(%rsi,%rcx,2), %xmm0
370; SSE2-NEXT:    pmaddwd %xmm7, %xmm0
371; SSE2-NEXT:    paddd %xmm0, %xmm1
372; SSE2-NEXT:    movdqu 48(%rsi,%rcx,2), %xmm0
373; SSE2-NEXT:    pmaddwd %xmm9, %xmm0
374; SSE2-NEXT:    paddd %xmm0, %xmm3
375; SSE2-NEXT:    addq $16, %rcx
376; SSE2-NEXT:    cmpq %rcx, %rax
377; SSE2-NEXT:    jne .LBB3_1
378; SSE2-NEXT:  # %bb.2: # %middle.block
379; SSE2-NEXT:    paddd %xmm8, %xmm4
380; SSE2-NEXT:    paddd %xmm8, %xmm3
381; SSE2-NEXT:    paddd %xmm4, %xmm3
382; SSE2-NEXT:    paddd %xmm8, %xmm2
383; SSE2-NEXT:    paddd %xmm8, %xmm1
384; SSE2-NEXT:    paddd %xmm3, %xmm1
385; SSE2-NEXT:    paddd %xmm2, %xmm1
386; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
387; SSE2-NEXT:    paddd %xmm1, %xmm0
388; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
389; SSE2-NEXT:    paddd %xmm0, %xmm1
390; SSE2-NEXT:    movd %xmm1, %eax
391; SSE2-NEXT:    retq
392;
393; AVX1-LABEL: _Z10test_shortPsS_i_1024:
394; AVX1:       # %bb.0: # %entry
395; AVX1-NEXT:    movl %edx, %eax
396; AVX1-NEXT:    vpxor %xmm8, %xmm8, %xmm8
397; AVX1-NEXT:    xorl %ecx, %ecx
398; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
399; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
400; AVX1-NEXT:    .p2align 4, 0x90
401; AVX1-NEXT:  .LBB3_1: # %vector.body
402; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
403; AVX1-NEXT:    vmovdqu (%rdi,%rcx,2), %ymm3
404; AVX1-NEXT:    vmovdqu 32(%rdi,%rcx,2), %ymm4
405; AVX1-NEXT:    vmovdqu (%rsi,%rcx,2), %ymm5
406; AVX1-NEXT:    vmovdqu 32(%rsi,%rcx,2), %ymm6
407; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm7
408; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm0
409; AVX1-NEXT:    vpmaddwd %xmm7, %xmm0, %xmm0
410; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm7
411; AVX1-NEXT:    vpaddd %xmm7, %xmm0, %xmm0
412; AVX1-NEXT:    vpmaddwd %xmm4, %xmm6, %xmm4
413; AVX1-NEXT:    vpaddd %xmm2, %xmm4, %xmm2
414; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm2
415; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm0
416; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm4
417; AVX1-NEXT:    vpmaddwd %xmm0, %xmm4, %xmm0
418; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
419; AVX1-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
420; AVX1-NEXT:    vpmaddwd %xmm3, %xmm5, %xmm3
421; AVX1-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
422; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
423; AVX1-NEXT:    addq $16, %rcx
424; AVX1-NEXT:    cmpq %rcx, %rax
425; AVX1-NEXT:    jne .LBB3_1
426; AVX1-NEXT:  # %bb.2: # %middle.block
427; AVX1-NEXT:    vpaddd %xmm8, %xmm2, %xmm0
428; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
429; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm4
430; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
431; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm5
432; AVX1-NEXT:    vpaddd %xmm5, %xmm2, %xmm2
433; AVX1-NEXT:    vpaddd %xmm2, %xmm4, %xmm2
434; AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
435; AVX1-NEXT:    vpaddd %xmm0, %xmm8, %xmm0
436; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
437; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
438; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
439; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
440; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
441; AVX1-NEXT:    vmovd %xmm0, %eax
442; AVX1-NEXT:    vzeroupper
443; AVX1-NEXT:    retq
444;
445; AVX2-LABEL: _Z10test_shortPsS_i_1024:
446; AVX2:       # %bb.0: # %entry
447; AVX2-NEXT:    movl %edx, %eax
448; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0
449; AVX2-NEXT:    xorl %ecx, %ecx
450; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
451; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
452; AVX2-NEXT:    .p2align 4, 0x90
453; AVX2-NEXT:  .LBB3_1: # %vector.body
454; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
455; AVX2-NEXT:    vmovdqu (%rsi,%rcx,2), %ymm3
456; AVX2-NEXT:    vmovdqu 32(%rsi,%rcx,2), %ymm4
457; AVX2-NEXT:    vpmaddwd 32(%rdi,%rcx,2), %ymm4, %ymm4
458; AVX2-NEXT:    vpaddd %ymm2, %ymm4, %ymm2
459; AVX2-NEXT:    vpmaddwd (%rdi,%rcx,2), %ymm3, %ymm3
460; AVX2-NEXT:    vpaddd %ymm1, %ymm3, %ymm1
461; AVX2-NEXT:    addq $16, %rcx
462; AVX2-NEXT:    cmpq %rcx, %rax
463; AVX2-NEXT:    jne .LBB3_1
464; AVX2-NEXT:  # %bb.2: # %middle.block
465; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm1
466; AVX2-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
467; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
468; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
469; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
470; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
471; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
472; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
473; AVX2-NEXT:    vmovd %xmm0, %eax
474; AVX2-NEXT:    vzeroupper
475; AVX2-NEXT:    retq
476;
477; AVX512F-LABEL: _Z10test_shortPsS_i_1024:
478; AVX512F:       # %bb.0: # %entry
479; AVX512F-NEXT:    movl %edx, %eax
480; AVX512F-NEXT:    vpxor %xmm0, %xmm0, %xmm0
481; AVX512F-NEXT:    xorl %ecx, %ecx
482; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
483; AVX512F-NEXT:    .p2align 4, 0x90
484; AVX512F-NEXT:  .LBB3_1: # %vector.body
485; AVX512F-NEXT:    # =>This Inner Loop Header: Depth=1
486; AVX512F-NEXT:    vmovdqu (%rsi,%rcx,2), %ymm2
487; AVX512F-NEXT:    vmovdqu 32(%rsi,%rcx,2), %ymm3
488; AVX512F-NEXT:    vpmaddwd 32(%rdi,%rcx,2), %ymm3, %ymm3
489; AVX512F-NEXT:    vpmaddwd (%rdi,%rcx,2), %ymm2, %ymm2
490; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
491; AVX512F-NEXT:    vpaddd %zmm1, %zmm2, %zmm1
492; AVX512F-NEXT:    addq $16, %rcx
493; AVX512F-NEXT:    cmpq %rcx, %rax
494; AVX512F-NEXT:    jne .LBB3_1
495; AVX512F-NEXT:  # %bb.2: # %middle.block
496; AVX512F-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
497; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
498; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
499; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
500; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
501; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
502; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
503; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
504; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
505; AVX512F-NEXT:    vmovd %xmm0, %eax
506; AVX512F-NEXT:    vzeroupper
507; AVX512F-NEXT:    retq
508;
509; AVX512BW-LABEL: _Z10test_shortPsS_i_1024:
510; AVX512BW:       # %bb.0: # %entry
511; AVX512BW-NEXT:    movl %edx, %eax
512; AVX512BW-NEXT:    vpxor %xmm0, %xmm0, %xmm0
513; AVX512BW-NEXT:    xorl %ecx, %ecx
514; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
515; AVX512BW-NEXT:    .p2align 4, 0x90
516; AVX512BW-NEXT:  .LBB3_1: # %vector.body
517; AVX512BW-NEXT:    # =>This Inner Loop Header: Depth=1
518; AVX512BW-NEXT:    vmovdqu64 (%rsi,%rcx,2), %zmm2
519; AVX512BW-NEXT:    vpmaddwd (%rdi,%rcx,2), %zmm2, %zmm2
520; AVX512BW-NEXT:    vpaddd %zmm1, %zmm2, %zmm1
521; AVX512BW-NEXT:    addq $16, %rcx
522; AVX512BW-NEXT:    cmpq %rcx, %rax
523; AVX512BW-NEXT:    jne .LBB3_1
524; AVX512BW-NEXT:  # %bb.2: # %middle.block
525; AVX512BW-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
526; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
527; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
528; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
529; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
530; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
531; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
532; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
533; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
534; AVX512BW-NEXT:    vmovd %xmm0, %eax
535; AVX512BW-NEXT:    vzeroupper
536; AVX512BW-NEXT:    retq
537entry:
538  %3 = zext i32 %2 to i64
539  br label %vector.body
540
541vector.body:
542  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
543  %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
544  %4 = getelementptr inbounds i16, i16* %0, i64 %index
545  %5 = bitcast i16* %4 to <32 x i16>*
546  %wide.load = load <32 x i16>, <32 x i16>* %5, align 2
547  %6 = sext <32 x i16> %wide.load to <32 x i32>
548  %7 = getelementptr inbounds i16, i16* %1, i64 %index
549  %8 = bitcast i16* %7 to <32 x i16>*
550  %wide.load14 = load <32 x i16>, <32 x i16>* %8, align 2
551  %9 = sext <32 x i16> %wide.load14 to <32 x i32>
552  %10 = mul nsw <32 x i32> %9, %6
553  %11 = add nsw <32 x i32> %10, %vec.phi
554  %index.next = add i64 %index, 16
555  %12 = icmp eq i64 %index.next, %3
556  br i1 %12, label %middle.block, label %vector.body
557
558middle.block:
559  %rdx.shuf2 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
560  %bin.rdx2 = add <32 x i32> %11, %rdx.shuf2
561  %rdx.shuf1 = shufflevector <32 x i32> %bin.rdx2, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
562  %bin.rdx1 = add <32 x i32> %bin.rdx2, %rdx.shuf1
563  %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
564  %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf
565  %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
566  %bin.rdx16 = add <32 x i32> %bin.rdx, %rdx.shuf15
567  %rdx.shuf17 = shufflevector <32 x i32> %bin.rdx16, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
568  %bin.rdx18 = add <32 x i32> %bin.rdx16, %rdx.shuf17
569  %13 = extractelement <32 x i32> %bin.rdx18, i32 0
570  ret i32 %13
571}
572
573define i32 @_Z9test_charPcS_i_128(i8* nocapture readonly, i8* nocapture readonly, i32) local_unnamed_addr #0 {
574; SSE2-LABEL: _Z9test_charPcS_i_128:
575; SSE2:       # %bb.0: # %entry
576; SSE2-NEXT:    movl %edx, %eax
577; SSE2-NEXT:    pxor %xmm0, %xmm0
578; SSE2-NEXT:    xorl %ecx, %ecx
579; SSE2-NEXT:    .p2align 4, 0x90
580; SSE2-NEXT:  .LBB4_1: # %vector.body
581; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
582; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
583; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
584; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
585; SSE2-NEXT:    psrad $24, %xmm1
586; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
587; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
588; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
589; SSE2-NEXT:    psrad $24, %xmm2
590; SSE2-NEXT:    pmullw %xmm1, %xmm2
591; SSE2-NEXT:    pslld $16, %xmm2
592; SSE2-NEXT:    psrad $16, %xmm2
593; SSE2-NEXT:    paddd %xmm2, %xmm0
594; SSE2-NEXT:    addq $16, %rcx
595; SSE2-NEXT:    cmpq %rcx, %rax
596; SSE2-NEXT:    jne .LBB4_1
597; SSE2-NEXT:  # %bb.2: # %middle.block
598; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
599; SSE2-NEXT:    paddd %xmm0, %xmm1
600; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
601; SSE2-NEXT:    paddd %xmm1, %xmm0
602; SSE2-NEXT:    movd %xmm0, %eax
603; SSE2-NEXT:    retq
604;
605; AVX-LABEL: _Z9test_charPcS_i_128:
606; AVX:       # %bb.0: # %entry
607; AVX-NEXT:    movl %edx, %eax
608; AVX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
609; AVX-NEXT:    xorl %ecx, %ecx
610; AVX-NEXT:    .p2align 4, 0x90
611; AVX-NEXT:  .LBB4_1: # %vector.body
612; AVX-NEXT:    # =>This Inner Loop Header: Depth=1
613; AVX-NEXT:    vpmovsxbd (%rdi,%rcx), %xmm1
614; AVX-NEXT:    vpmovsxbd (%rsi,%rcx), %xmm2
615; AVX-NEXT:    vpmulld %xmm1, %xmm2, %xmm1
616; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
617; AVX-NEXT:    addq $16, %rcx
618; AVX-NEXT:    cmpq %rcx, %rax
619; AVX-NEXT:    jne .LBB4_1
620; AVX-NEXT:  # %bb.2: # %middle.block
621; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
622; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
623; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
624; AVX-NEXT:    vmovd %xmm0, %eax
625; AVX-NEXT:    retq
626entry:
627  %3 = zext i32 %2 to i64
628  br label %vector.body
629
630vector.body:
631  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
632  %vec.phi = phi <4 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
633  %4 = getelementptr inbounds i8, i8* %0, i64 %index
634  %5 = bitcast i8* %4 to <4 x i8>*
635  %wide.load = load <4 x i8>, <4 x i8>* %5, align 1
636  %6 = sext <4 x i8> %wide.load to <4 x i32>
637  %7 = getelementptr inbounds i8, i8* %1, i64 %index
638  %8 = bitcast i8* %7 to <4 x i8>*
639  %wide.load14 = load <4 x i8>, <4 x i8>* %8, align 1
640  %9 = sext <4 x i8> %wide.load14 to <4 x i32>
641  %10 = mul nsw <4 x i32> %9, %6
642  %11 = add nsw <4 x i32> %10, %vec.phi
643  %index.next = add i64 %index, 16
644  %12 = icmp eq i64 %index.next, %3
645  br i1 %12, label %middle.block, label %vector.body
646
647middle.block:
648  %rdx.shuf17 = shufflevector <4 x i32> %11, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
649  %bin.rdx18 = add <4 x i32> %11, %rdx.shuf17
650  %rdx.shuf19 = shufflevector <4 x i32> %bin.rdx18, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
651  %bin.rdx20 = add <4 x i32> %bin.rdx18, %rdx.shuf19
652  %13 = extractelement <4 x i32> %bin.rdx20, i32 0
653  ret i32 %13
654}
655
656define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly, i32) local_unnamed_addr #0 {
657; SSE2-LABEL: _Z9test_charPcS_i_256:
658; SSE2:       # %bb.0: # %entry
659; SSE2-NEXT:    movl %edx, %eax
660; SSE2-NEXT:    pxor %xmm0, %xmm0
661; SSE2-NEXT:    xorl %ecx, %ecx
662; SSE2-NEXT:    pxor %xmm1, %xmm1
663; SSE2-NEXT:    .p2align 4, 0x90
664; SSE2-NEXT:  .LBB5_1: # %vector.body
665; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
666; SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
667; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
668; SSE2-NEXT:    psraw $8, %xmm2
669; SSE2-NEXT:    movq {{.*#+}} xmm3 = mem[0],zero
670; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
671; SSE2-NEXT:    psraw $8, %xmm3
672; SSE2-NEXT:    pmaddwd %xmm2, %xmm3
673; SSE2-NEXT:    paddd %xmm3, %xmm1
674; SSE2-NEXT:    addq $16, %rcx
675; SSE2-NEXT:    cmpq %rcx, %rax
676; SSE2-NEXT:    jne .LBB5_1
677; SSE2-NEXT:  # %bb.2: # %middle.block
678; SSE2-NEXT:    paddd %xmm0, %xmm1
679; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
680; SSE2-NEXT:    paddd %xmm1, %xmm0
681; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
682; SSE2-NEXT:    paddd %xmm0, %xmm1
683; SSE2-NEXT:    movd %xmm1, %eax
684; SSE2-NEXT:    retq
685;
686; AVX1-LABEL: _Z9test_charPcS_i_256:
687; AVX1:       # %bb.0: # %entry
688; AVX1-NEXT:    movl %edx, %eax
689; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
690; AVX1-NEXT:    xorl %ecx, %ecx
691; AVX1-NEXT:    .p2align 4, 0x90
692; AVX1-NEXT:  .LBB5_1: # %vector.body
693; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
694; AVX1-NEXT:    vpmovsxbw (%rdi,%rcx), %xmm1
695; AVX1-NEXT:    vpmovsxbw (%rsi,%rcx), %xmm2
696; AVX1-NEXT:    vpmaddwd %xmm1, %xmm2, %xmm1
697; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm1
698; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
699; AVX1-NEXT:    addq $16, %rcx
700; AVX1-NEXT:    cmpq %rcx, %rax
701; AVX1-NEXT:    jne .LBB5_1
702; AVX1-NEXT:  # %bb.2: # %middle.block
703; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
704; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
705; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
706; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
707; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
708; AVX1-NEXT:    vmovd %xmm0, %eax
709; AVX1-NEXT:    vzeroupper
710; AVX1-NEXT:    retq
711;
712; AVX256-LABEL: _Z9test_charPcS_i_256:
713; AVX256:       # %bb.0: # %entry
714; AVX256-NEXT:    movl %edx, %eax
715; AVX256-NEXT:    vpxor %xmm0, %xmm0, %xmm0
716; AVX256-NEXT:    xorl %ecx, %ecx
717; AVX256-NEXT:    .p2align 4, 0x90
718; AVX256-NEXT:  .LBB5_1: # %vector.body
719; AVX256-NEXT:    # =>This Inner Loop Header: Depth=1
720; AVX256-NEXT:    vpmovsxbw (%rdi,%rcx), %xmm1
721; AVX256-NEXT:    vpmovsxbw (%rsi,%rcx), %xmm2
722; AVX256-NEXT:    vpmaddwd %xmm1, %xmm2, %xmm1
723; AVX256-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
724; AVX256-NEXT:    addq $16, %rcx
725; AVX256-NEXT:    cmpq %rcx, %rax
726; AVX256-NEXT:    jne .LBB5_1
727; AVX256-NEXT:  # %bb.2: # %middle.block
728; AVX256-NEXT:    vextracti128 $1, %ymm0, %xmm1
729; AVX256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
730; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
731; AVX256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
732; AVX256-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
733; AVX256-NEXT:    vmovd %xmm0, %eax
734; AVX256-NEXT:    vzeroupper
735; AVX256-NEXT:    retq
736entry:
737  %3 = zext i32 %2 to i64
738  br label %vector.body
739
740vector.body:
741  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
742  %vec.phi = phi <8 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
743  %4 = getelementptr inbounds i8, i8* %0, i64 %index
744  %5 = bitcast i8* %4 to <8 x i8>*
745  %wide.load = load <8 x i8>, <8 x i8>* %5, align 1
746  %6 = sext <8 x i8> %wide.load to <8 x i32>
747  %7 = getelementptr inbounds i8, i8* %1, i64 %index
748  %8 = bitcast i8* %7 to <8 x i8>*
749  %wide.load14 = load <8 x i8>, <8 x i8>* %8, align 1
750  %9 = sext <8 x i8> %wide.load14 to <8 x i32>
751  %10 = mul nsw <8 x i32> %9, %6
752  %11 = add nsw <8 x i32> %10, %vec.phi
753  %index.next = add i64 %index, 16
754  %12 = icmp eq i64 %index.next, %3
755  br i1 %12, label %middle.block, label %vector.body
756
757middle.block:
758  %rdx.shuf15 = shufflevector <8 x i32> %11, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
759  %bin.rdx16 = add <8 x i32> %11, %rdx.shuf15
760  %rdx.shuf17 = shufflevector <8 x i32> %bin.rdx16, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
761  %bin.rdx18 = add <8 x i32> %bin.rdx16, %rdx.shuf17
762  %rdx.shuf19 = shufflevector <8 x i32> %bin.rdx18, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
763  %bin.rdx20 = add <8 x i32> %bin.rdx18, %rdx.shuf19
764  %13 = extractelement <8 x i32> %bin.rdx20, i32 0
765  ret i32 %13
766}
767
768define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly, i32) local_unnamed_addr #0 {
769; SSE2-LABEL: _Z9test_charPcS_i_512:
770; SSE2:       # %bb.0: # %entry
771; SSE2-NEXT:    movl %edx, %eax
772; SSE2-NEXT:    pxor %xmm0, %xmm0
773; SSE2-NEXT:    xorl %ecx, %ecx
774; SSE2-NEXT:    pxor %xmm2, %xmm2
775; SSE2-NEXT:    pxor %xmm1, %xmm1
776; SSE2-NEXT:    .p2align 4, 0x90
777; SSE2-NEXT:  .LBB6_1: # %vector.body
778; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
779; SSE2-NEXT:    movq {{.*#+}} xmm3 = mem[0],zero
780; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
781; SSE2-NEXT:    psraw $8, %xmm3
782; SSE2-NEXT:    movq {{.*#+}} xmm4 = mem[0],zero
783; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
784; SSE2-NEXT:    psraw $8, %xmm4
785; SSE2-NEXT:    movq {{.*#+}} xmm5 = mem[0],zero
786; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
787; SSE2-NEXT:    psraw $8, %xmm5
788; SSE2-NEXT:    pmaddwd %xmm3, %xmm5
789; SSE2-NEXT:    paddd %xmm5, %xmm2
790; SSE2-NEXT:    movq {{.*#+}} xmm3 = mem[0],zero
791; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
792; SSE2-NEXT:    psraw $8, %xmm3
793; SSE2-NEXT:    pmaddwd %xmm4, %xmm3
794; SSE2-NEXT:    paddd %xmm3, %xmm1
795; SSE2-NEXT:    addq $16, %rcx
796; SSE2-NEXT:    cmpq %rcx, %rax
797; SSE2-NEXT:    jne .LBB6_1
798; SSE2-NEXT:  # %bb.2: # %middle.block
799; SSE2-NEXT:    paddd %xmm0, %xmm2
800; SSE2-NEXT:    paddd %xmm0, %xmm1
801; SSE2-NEXT:    paddd %xmm2, %xmm1
802; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
803; SSE2-NEXT:    paddd %xmm1, %xmm0
804; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
805; SSE2-NEXT:    paddd %xmm0, %xmm1
806; SSE2-NEXT:    movd %xmm1, %eax
807; SSE2-NEXT:    retq
808;
809; AVX1-LABEL: _Z9test_charPcS_i_512:
810; AVX1:       # %bb.0: # %entry
811; AVX1-NEXT:    movl %edx, %eax
812; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
813; AVX1-NEXT:    xorl %ecx, %ecx
814; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
815; AVX1-NEXT:    .p2align 4, 0x90
816; AVX1-NEXT:  .LBB6_1: # %vector.body
817; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
818; AVX1-NEXT:    vpmovsxbw (%rdi,%rcx), %xmm2
819; AVX1-NEXT:    vpmovsxbw 8(%rdi,%rcx), %xmm3
820; AVX1-NEXT:    vpmovsxbw (%rsi,%rcx), %xmm4
821; AVX1-NEXT:    vpmaddwd %xmm2, %xmm4, %xmm2
822; AVX1-NEXT:    vpmovsxbw 8(%rsi,%rcx), %xmm4
823; AVX1-NEXT:    vpmaddwd %xmm3, %xmm4, %xmm3
824; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
825; AVX1-NEXT:    vpaddd %xmm4, %xmm3, %xmm3
826; AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
827; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
828; AVX1-NEXT:    addq $16, %rcx
829; AVX1-NEXT:    cmpq %rcx, %rax
830; AVX1-NEXT:    jne .LBB6_1
831; AVX1-NEXT:  # %bb.2: # %middle.block
832; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
833; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
834; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
835; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
836; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
837; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
838; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
839; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
840; AVX1-NEXT:    vmovd %xmm0, %eax
841; AVX1-NEXT:    vzeroupper
842; AVX1-NEXT:    retq
843;
844; AVX2-LABEL: _Z9test_charPcS_i_512:
845; AVX2:       # %bb.0: # %entry
846; AVX2-NEXT:    movl %edx, %eax
847; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0
848; AVX2-NEXT:    xorl %ecx, %ecx
849; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
850; AVX2-NEXT:    .p2align 4, 0x90
851; AVX2-NEXT:  .LBB6_1: # %vector.body
852; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
853; AVX2-NEXT:    vpmovsxbw (%rdi,%rcx), %ymm2
854; AVX2-NEXT:    vpmovsxbw (%rsi,%rcx), %ymm3
855; AVX2-NEXT:    vpmaddwd %ymm2, %ymm3, %ymm2
856; AVX2-NEXT:    vpaddd %ymm1, %ymm2, %ymm1
857; AVX2-NEXT:    addq $16, %rcx
858; AVX2-NEXT:    cmpq %rcx, %rax
859; AVX2-NEXT:    jne .LBB6_1
860; AVX2-NEXT:  # %bb.2: # %middle.block
861; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
862; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
863; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
864; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
865; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
866; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
867; AVX2-NEXT:    vmovd %xmm0, %eax
868; AVX2-NEXT:    vzeroupper
869; AVX2-NEXT:    retq
870;
871; AVX512-LABEL: _Z9test_charPcS_i_512:
872; AVX512:       # %bb.0: # %entry
873; AVX512-NEXT:    movl %edx, %eax
874; AVX512-NEXT:    vpxor %xmm0, %xmm0, %xmm0
875; AVX512-NEXT:    xorl %ecx, %ecx
876; AVX512-NEXT:    .p2align 4, 0x90
877; AVX512-NEXT:  .LBB6_1: # %vector.body
878; AVX512-NEXT:    # =>This Inner Loop Header: Depth=1
879; AVX512-NEXT:    vpmovsxbw (%rdi,%rcx), %ymm1
880; AVX512-NEXT:    vpmovsxbw (%rsi,%rcx), %ymm2
881; AVX512-NEXT:    vpmaddwd %ymm1, %ymm2, %ymm1
882; AVX512-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
883; AVX512-NEXT:    addq $16, %rcx
884; AVX512-NEXT:    cmpq %rcx, %rax
885; AVX512-NEXT:    jne .LBB6_1
886; AVX512-NEXT:  # %bb.2: # %middle.block
887; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
888; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
889; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
890; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
891; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
892; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
893; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
894; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
895; AVX512-NEXT:    vmovd %xmm0, %eax
896; AVX512-NEXT:    vzeroupper
897; AVX512-NEXT:    retq
898entry:
899  %3 = zext i32 %2 to i64
900  br label %vector.body
901
902vector.body:
903  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
904  %vec.phi = phi <16 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
905  %4 = getelementptr inbounds i8, i8* %0, i64 %index
906  %5 = bitcast i8* %4 to <16 x i8>*
907  %wide.load = load <16 x i8>, <16 x i8>* %5, align 1
908  %6 = sext <16 x i8> %wide.load to <16 x i32>
909  %7 = getelementptr inbounds i8, i8* %1, i64 %index
910  %8 = bitcast i8* %7 to <16 x i8>*
911  %wide.load14 = load <16 x i8>, <16 x i8>* %8, align 1
912  %9 = sext <16 x i8> %wide.load14 to <16 x i32>
913  %10 = mul nsw <16 x i32> %9, %6
914  %11 = add nsw <16 x i32> %10, %vec.phi
915  %index.next = add i64 %index, 16
916  %12 = icmp eq i64 %index.next, %3
917  br i1 %12, label %middle.block, label %vector.body
918
919middle.block:
920  %rdx.shuf = shufflevector <16 x i32> %11, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
921  %bin.rdx = add <16 x i32> %11, %rdx.shuf
922  %rdx.shuf15 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
923  %bin.rdx16 = add <16 x i32> %bin.rdx, %rdx.shuf15
924  %rdx.shuf17 = shufflevector <16 x i32> %bin.rdx16, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
925  %bin.rdx18 = add <16 x i32> %bin.rdx16, %rdx.shuf17
926  %rdx.shuf19 = shufflevector <16 x i32> %bin.rdx18, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
927  %bin.rdx20 = add <16 x i32> %bin.rdx18, %rdx.shuf19
928  %13 = extractelement <16 x i32> %bin.rdx20, i32 0
929  ret i32 %13
930}
931
932define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonly, i32) local_unnamed_addr #0 {
933; SSE2-LABEL: _Z9test_charPcS_i_1024:
934; SSE2:       # %bb.0: # %entry
935; SSE2-NEXT:    movl %edx, %eax
936; SSE2-NEXT:    pxor %xmm8, %xmm8
937; SSE2-NEXT:    xorl %ecx, %ecx
938; SSE2-NEXT:    pxor %xmm9, %xmm9
939; SSE2-NEXT:    pxor %xmm4, %xmm4
940; SSE2-NEXT:    pxor %xmm1, %xmm1
941; SSE2-NEXT:    pxor %xmm3, %xmm3
942; SSE2-NEXT:    .p2align 4, 0x90
943; SSE2-NEXT:  .LBB7_1: # %vector.body
944; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
945; SSE2-NEXT:    movq {{.*#+}} xmm5 = mem[0],zero
946; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
947; SSE2-NEXT:    psraw $8, %xmm5
948; SSE2-NEXT:    movq {{.*#+}} xmm6 = mem[0],zero
949; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
950; SSE2-NEXT:    psraw $8, %xmm6
951; SSE2-NEXT:    movq {{.*#+}} xmm7 = mem[0],zero
952; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
953; SSE2-NEXT:    psraw $8, %xmm7
954; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
955; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
956; SSE2-NEXT:    psraw $8, %xmm0
957; SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
958; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
959; SSE2-NEXT:    psraw $8, %xmm2
960; SSE2-NEXT:    pmaddwd %xmm5, %xmm2
961; SSE2-NEXT:    paddd %xmm2, %xmm9
962; SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
963; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
964; SSE2-NEXT:    psraw $8, %xmm2
965; SSE2-NEXT:    pmaddwd %xmm6, %xmm2
966; SSE2-NEXT:    paddd %xmm2, %xmm4
967; SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
968; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
969; SSE2-NEXT:    psraw $8, %xmm2
970; SSE2-NEXT:    pmaddwd %xmm7, %xmm2
971; SSE2-NEXT:    paddd %xmm2, %xmm1
972; SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
973; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
974; SSE2-NEXT:    psraw $8, %xmm2
975; SSE2-NEXT:    pmaddwd %xmm0, %xmm2
976; SSE2-NEXT:    paddd %xmm2, %xmm3
977; SSE2-NEXT:    addq $32, %rcx
978; SSE2-NEXT:    cmpq %rcx, %rax
979; SSE2-NEXT:    jne .LBB7_1
980; SSE2-NEXT:  # %bb.2: # %middle.block
981; SSE2-NEXT:    paddd %xmm8, %xmm4
982; SSE2-NEXT:    paddd %xmm8, %xmm3
983; SSE2-NEXT:    paddd %xmm4, %xmm3
984; SSE2-NEXT:    paddd %xmm8, %xmm9
985; SSE2-NEXT:    paddd %xmm8, %xmm1
986; SSE2-NEXT:    paddd %xmm3, %xmm1
987; SSE2-NEXT:    paddd %xmm9, %xmm1
988; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
989; SSE2-NEXT:    paddd %xmm1, %xmm0
990; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
991; SSE2-NEXT:    paddd %xmm0, %xmm1
992; SSE2-NEXT:    movd %xmm1, %eax
993; SSE2-NEXT:    retq
994;
995; AVX1-LABEL: _Z9test_charPcS_i_1024:
996; AVX1:       # %bb.0: # %entry
997; AVX1-NEXT:    movl %edx, %eax
998; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
999; AVX1-NEXT:    xorl %ecx, %ecx
1000; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1001; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1002; AVX1-NEXT:    .p2align 4, 0x90
1003; AVX1-NEXT:  .LBB7_1: # %vector.body
1004; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
1005; AVX1-NEXT:    vpmovsxbw (%rdi,%rcx), %xmm3
1006; AVX1-NEXT:    vpmovsxbw 8(%rdi,%rcx), %xmm4
1007; AVX1-NEXT:    vpmovsxbw 16(%rdi,%rcx), %xmm5
1008; AVX1-NEXT:    vpmovsxbw 24(%rdi,%rcx), %xmm6
1009; AVX1-NEXT:    vpmovsxbw (%rsi,%rcx), %xmm7
1010; AVX1-NEXT:    vpmaddwd %xmm3, %xmm7, %xmm3
1011; AVX1-NEXT:    vpmovsxbw 8(%rsi,%rcx), %xmm7
1012; AVX1-NEXT:    vpmaddwd %xmm4, %xmm7, %xmm4
1013; AVX1-NEXT:    vpmovsxbw 16(%rsi,%rcx), %xmm7
1014; AVX1-NEXT:    vpmaddwd %xmm5, %xmm7, %xmm5
1015; AVX1-NEXT:    vpmovsxbw 24(%rsi,%rcx), %xmm7
1016; AVX1-NEXT:    vpmaddwd %xmm6, %xmm7, %xmm6
1017; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm7
1018; AVX1-NEXT:    vpaddd %xmm7, %xmm6, %xmm6
1019; AVX1-NEXT:    vpaddd %xmm2, %xmm5, %xmm2
1020; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm2, %ymm2
1021; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
1022; AVX1-NEXT:    vpaddd %xmm5, %xmm4, %xmm4
1023; AVX1-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
1024; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
1025; AVX1-NEXT:    addq $32, %rcx
1026; AVX1-NEXT:    cmpq %rcx, %rax
1027; AVX1-NEXT:    jne .LBB7_1
1028; AVX1-NEXT:  # %bb.2: # %middle.block
1029; AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm3
1030; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
1031; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
1032; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
1033; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
1034; AVX1-NEXT:    vpaddd %xmm6, %xmm2, %xmm2
1035; AVX1-NEXT:    vpaddd %xmm2, %xmm5, %xmm2
1036; AVX1-NEXT:    vpaddd %xmm2, %xmm4, %xmm2
1037; AVX1-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
1038; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
1039; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
1040; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1041; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1042; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
1043; AVX1-NEXT:    vmovd %xmm0, %eax
1044; AVX1-NEXT:    vzeroupper
1045; AVX1-NEXT:    retq
1046;
1047; AVX2-LABEL: _Z9test_charPcS_i_1024:
1048; AVX2:       # %bb.0: # %entry
1049; AVX2-NEXT:    movl %edx, %eax
1050; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0
1051; AVX2-NEXT:    xorl %ecx, %ecx
1052; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1053; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1054; AVX2-NEXT:    .p2align 4, 0x90
1055; AVX2-NEXT:  .LBB7_1: # %vector.body
1056; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
1057; AVX2-NEXT:    vpmovsxbw (%rdi,%rcx), %ymm3
1058; AVX2-NEXT:    vpmovsxbw 16(%rdi,%rcx), %ymm4
1059; AVX2-NEXT:    vpmovsxbw (%rsi,%rcx), %ymm5
1060; AVX2-NEXT:    vpmaddwd %ymm3, %ymm5, %ymm3
1061; AVX2-NEXT:    vpaddd %ymm1, %ymm3, %ymm1
1062; AVX2-NEXT:    vpmovsxbw 16(%rsi,%rcx), %ymm3
1063; AVX2-NEXT:    vpmaddwd %ymm4, %ymm3, %ymm3
1064; AVX2-NEXT:    vpaddd %ymm2, %ymm3, %ymm2
1065; AVX2-NEXT:    addq $32, %rcx
1066; AVX2-NEXT:    cmpq %rcx, %rax
1067; AVX2-NEXT:    jne .LBB7_1
1068; AVX2-NEXT:  # %bb.2: # %middle.block
1069; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm1
1070; AVX2-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
1071; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
1072; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1073; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
1074; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1075; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
1076; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
1077; AVX2-NEXT:    vmovd %xmm0, %eax
1078; AVX2-NEXT:    vzeroupper
1079; AVX2-NEXT:    retq
1080;
1081; AVX512F-LABEL: _Z9test_charPcS_i_1024:
1082; AVX512F:       # %bb.0: # %entry
1083; AVX512F-NEXT:    movl %edx, %eax
1084; AVX512F-NEXT:    vpxor %xmm0, %xmm0, %xmm0
1085; AVX512F-NEXT:    xorl %ecx, %ecx
1086; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1087; AVX512F-NEXT:    .p2align 4, 0x90
1088; AVX512F-NEXT:  .LBB7_1: # %vector.body
1089; AVX512F-NEXT:    # =>This Inner Loop Header: Depth=1
1090; AVX512F-NEXT:    vpmovsxbw (%rdi,%rcx), %ymm2
1091; AVX512F-NEXT:    vpmovsxbw 16(%rdi,%rcx), %ymm3
1092; AVX512F-NEXT:    vpmovsxbw (%rsi,%rcx), %ymm4
1093; AVX512F-NEXT:    vpmaddwd %ymm2, %ymm4, %ymm2
1094; AVX512F-NEXT:    vpmovsxbw 16(%rsi,%rcx), %ymm4
1095; AVX512F-NEXT:    vpmaddwd %ymm3, %ymm4, %ymm3
1096; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
1097; AVX512F-NEXT:    vpaddd %zmm1, %zmm2, %zmm1
1098; AVX512F-NEXT:    addq $32, %rcx
1099; AVX512F-NEXT:    cmpq %rcx, %rax
1100; AVX512F-NEXT:    jne .LBB7_1
1101; AVX512F-NEXT:  # %bb.2: # %middle.block
1102; AVX512F-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
1103; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
1104; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
1105; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
1106; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
1107; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1108; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
1109; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1110; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
1111; AVX512F-NEXT:    vmovd %xmm0, %eax
1112; AVX512F-NEXT:    vzeroupper
1113; AVX512F-NEXT:    retq
1114;
1115; AVX512BW-LABEL: _Z9test_charPcS_i_1024:
1116; AVX512BW:       # %bb.0: # %entry
1117; AVX512BW-NEXT:    movl %edx, %eax
1118; AVX512BW-NEXT:    vpxor %xmm0, %xmm0, %xmm0
1119; AVX512BW-NEXT:    xorl %ecx, %ecx
1120; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1121; AVX512BW-NEXT:    .p2align 4, 0x90
1122; AVX512BW-NEXT:  .LBB7_1: # %vector.body
1123; AVX512BW-NEXT:    # =>This Inner Loop Header: Depth=1
1124; AVX512BW-NEXT:    vpmovsxbw (%rdi,%rcx), %zmm2
1125; AVX512BW-NEXT:    vpmovsxbw (%rsi,%rcx), %zmm3
1126; AVX512BW-NEXT:    vpmaddwd %zmm2, %zmm3, %zmm2
1127; AVX512BW-NEXT:    vpaddd %zmm1, %zmm2, %zmm1
1128; AVX512BW-NEXT:    addq $32, %rcx
1129; AVX512BW-NEXT:    cmpq %rcx, %rax
1130; AVX512BW-NEXT:    jne .LBB7_1
1131; AVX512BW-NEXT:  # %bb.2: # %middle.block
1132; AVX512BW-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
1133; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
1134; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
1135; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
1136; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
1137; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1138; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
1139; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1140; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
1141; AVX512BW-NEXT:    vmovd %xmm0, %eax
1142; AVX512BW-NEXT:    vzeroupper
1143; AVX512BW-NEXT:    retq
1144entry:
1145  %3 = zext i32 %2 to i64
1146  br label %vector.body
1147
1148vector.body:
1149  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
1150  %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
1151  %4 = getelementptr inbounds i8, i8* %0, i64 %index
1152  %5 = bitcast i8* %4 to <32 x i8>*
1153  %wide.load = load <32 x i8>, <32 x i8>* %5, align 1
1154  %6 = sext <32 x i8> %wide.load to <32 x i32>
1155  %7 = getelementptr inbounds i8, i8* %1, i64 %index
1156  %8 = bitcast i8* %7 to <32 x i8>*
1157  %wide.load14 = load <32 x i8>, <32 x i8>* %8, align 1
1158  %9 = sext <32 x i8> %wide.load14 to <32 x i32>
1159  %10 = mul nsw <32 x i32> %9, %6
1160  %11 = add nsw <32 x i32> %10, %vec.phi
1161  %index.next = add i64 %index, 32
1162  %12 = icmp eq i64 %index.next, %3
1163  br i1 %12, label %middle.block, label %vector.body
1164
1165middle.block:
1166  %rdx.shuf1 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1167  %bin.rdx1 = add <32 x i32> %11, %rdx.shuf1
1168  %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1169  %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf
1170  %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1171  %bin.rdx32 = add <32 x i32> %bin.rdx, %rdx.shuf15
1172  %rdx.shuf17 = shufflevector <32 x i32> %bin.rdx32, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1173  %bin.rdx18 = add <32 x i32> %bin.rdx32, %rdx.shuf17
1174  %rdx.shuf19 = shufflevector <32 x i32> %bin.rdx18, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1175  %bin.rdx20 = add <32 x i32> %bin.rdx18, %rdx.shuf19
1176  %13 = extractelement <32 x i32> %bin.rdx20, i32 0
1177  ret i32 %13
1178}
1179
1180define i32 @test_unsigned_short_128(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 {
1181; SSE2-LABEL: test_unsigned_short_128:
1182; SSE2:       # %bb.0: # %entry
1183; SSE2-NEXT:    movl %edx, %eax
1184; SSE2-NEXT:    pxor %xmm0, %xmm0
1185; SSE2-NEXT:    xorl %ecx, %ecx
1186; SSE2-NEXT:    .p2align 4, 0x90
1187; SSE2-NEXT:  .LBB8_1: # %vector.body
1188; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
1189; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
1190; SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
1191; SSE2-NEXT:    movdqa %xmm2, %xmm3
1192; SSE2-NEXT:    pmulhuw %xmm1, %xmm3
1193; SSE2-NEXT:    pmullw %xmm1, %xmm2
1194; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
1195; SSE2-NEXT:    paddd %xmm2, %xmm0
1196; SSE2-NEXT:    addq $16, %rcx
1197; SSE2-NEXT:    cmpq %rcx, %rax
1198; SSE2-NEXT:    jne .LBB8_1
1199; SSE2-NEXT:  # %bb.2: # %middle.block
1200; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1201; SSE2-NEXT:    paddd %xmm0, %xmm1
1202; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
1203; SSE2-NEXT:    paddd %xmm1, %xmm0
1204; SSE2-NEXT:    movd %xmm0, %eax
1205; SSE2-NEXT:    retq
1206;
1207; AVX-LABEL: test_unsigned_short_128:
1208; AVX:       # %bb.0: # %entry
1209; AVX-NEXT:    movl %edx, %eax
1210; AVX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
1211; AVX-NEXT:    xorl %ecx, %ecx
1212; AVX-NEXT:    .p2align 4, 0x90
1213; AVX-NEXT:  .LBB8_1: # %vector.body
1214; AVX-NEXT:    # =>This Inner Loop Header: Depth=1
1215; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1216; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1217; AVX-NEXT:    vpmulld %xmm1, %xmm2, %xmm1
1218; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
1219; AVX-NEXT:    addq $16, %rcx
1220; AVX-NEXT:    cmpq %rcx, %rax
1221; AVX-NEXT:    jne .LBB8_1
1222; AVX-NEXT:  # %bb.2: # %middle.block
1223; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1224; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1225; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
1226; AVX-NEXT:    vmovd %xmm0, %eax
1227; AVX-NEXT:    retq
1228entry:
1229  %3 = zext i32 %2 to i64
1230  br label %vector.body
1231
1232vector.body:
1233  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
1234  %vec.phi = phi <4 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
1235  %4 = getelementptr inbounds i16, i16* %0, i64 %index
1236  %5 = bitcast i16* %4 to <4 x i16>*
1237  %wide.load = load <4 x i16>, <4 x i16>* %5, align 2
1238  %6 = zext <4 x i16> %wide.load to <4 x i32>
1239  %7 = getelementptr inbounds i16, i16* %1, i64 %index
1240  %8 = bitcast i16* %7 to <4 x i16>*
1241  %wide.load14 = load <4 x i16>, <4 x i16>* %8, align 2
1242  %9 = zext <4 x i16> %wide.load14 to <4 x i32>
1243  %10 = mul nsw <4 x i32> %9, %6
1244  %11 = add nsw <4 x i32> %10, %vec.phi
1245  %index.next = add i64 %index, 16
1246  %12 = icmp eq i64 %index.next, %3
1247  br i1 %12, label %middle.block, label %vector.body
1248
1249middle.block:
1250  %rdx.shuf15 = shufflevector <4 x i32> %11, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
1251  %bin.rdx16 = add <4 x i32> %11, %rdx.shuf15
1252  %rdx.shuf17 = shufflevector <4 x i32> %bin.rdx16, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
1253  %bin.rdx18 = add <4 x i32> %bin.rdx16, %rdx.shuf17
1254  %13 = extractelement <4 x i32> %bin.rdx18, i32 0
1255  ret i32 %13
1256}
1257
1258define i32 @test_unsigned_short_256(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 {
1259; SSE2-LABEL: test_unsigned_short_256:
1260; SSE2:       # %bb.0: # %entry
1261; SSE2-NEXT:    movl %edx, %eax
1262; SSE2-NEXT:    pxor %xmm0, %xmm0
1263; SSE2-NEXT:    xorl %ecx, %ecx
1264; SSE2-NEXT:    pxor %xmm1, %xmm1
1265; SSE2-NEXT:    .p2align 4, 0x90
1266; SSE2-NEXT:  .LBB9_1: # %vector.body
1267; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
1268; SSE2-NEXT:    movdqu (%rdi,%rcx,2), %xmm2
1269; SSE2-NEXT:    movdqu (%rsi,%rcx,2), %xmm3
1270; SSE2-NEXT:    movdqa %xmm3, %xmm4
1271; SSE2-NEXT:    pmulhuw %xmm2, %xmm4
1272; SSE2-NEXT:    pmullw %xmm2, %xmm3
1273; SSE2-NEXT:    movdqa %xmm3, %xmm2
1274; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
1275; SSE2-NEXT:    paddd %xmm2, %xmm0
1276; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
1277; SSE2-NEXT:    paddd %xmm3, %xmm1
1278; SSE2-NEXT:    addq $16, %rcx
1279; SSE2-NEXT:    cmpq %rcx, %rax
1280; SSE2-NEXT:    jne .LBB9_1
1281; SSE2-NEXT:  # %bb.2: # %middle.block
1282; SSE2-NEXT:    paddd %xmm1, %xmm0
1283; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1284; SSE2-NEXT:    paddd %xmm0, %xmm1
1285; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
1286; SSE2-NEXT:    paddd %xmm1, %xmm0
1287; SSE2-NEXT:    movd %xmm0, %eax
1288; SSE2-NEXT:    retq
1289;
1290; AVX1-LABEL: test_unsigned_short_256:
1291; AVX1:       # %bb.0: # %entry
1292; AVX1-NEXT:    movl %edx, %eax
1293; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
1294; AVX1-NEXT:    xorl %ecx, %ecx
1295; AVX1-NEXT:    .p2align 4, 0x90
1296; AVX1-NEXT:  .LBB9_1: # %vector.body
1297; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
1298; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1299; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1300; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1301; AVX1-NEXT:    vpmulld %xmm1, %xmm3, %xmm1
1302; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1303; AVX1-NEXT:    vpmulld %xmm2, %xmm3, %xmm2
1304; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
1305; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
1306; AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
1307; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1308; AVX1-NEXT:    addq $16, %rcx
1309; AVX1-NEXT:    cmpq %rcx, %rax
1310; AVX1-NEXT:    jne .LBB9_1
1311; AVX1-NEXT:  # %bb.2: # %middle.block
1312; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1313; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1314; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1315; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1316; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
1317; AVX1-NEXT:    vmovd %xmm0, %eax
1318; AVX1-NEXT:    vzeroupper
1319; AVX1-NEXT:    retq
1320;
1321; AVX256-LABEL: test_unsigned_short_256:
1322; AVX256:       # %bb.0: # %entry
1323; AVX256-NEXT:    movl %edx, %eax
1324; AVX256-NEXT:    vpxor %xmm0, %xmm0, %xmm0
1325; AVX256-NEXT:    xorl %ecx, %ecx
1326; AVX256-NEXT:    .p2align 4, 0x90
1327; AVX256-NEXT:  .LBB9_1: # %vector.body
1328; AVX256-NEXT:    # =>This Inner Loop Header: Depth=1
1329; AVX256-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1330; AVX256-NEXT:    vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1331; AVX256-NEXT:    vpmulld %ymm1, %ymm2, %ymm1
1332; AVX256-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
1333; AVX256-NEXT:    addq $16, %rcx
1334; AVX256-NEXT:    cmpq %rcx, %rax
1335; AVX256-NEXT:    jne .LBB9_1
1336; AVX256-NEXT:  # %bb.2: # %middle.block
1337; AVX256-NEXT:    vextracti128 $1, %ymm0, %xmm1
1338; AVX256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
1339; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1340; AVX256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
1341; AVX256-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
1342; AVX256-NEXT:    vmovd %xmm0, %eax
1343; AVX256-NEXT:    vzeroupper
1344; AVX256-NEXT:    retq
1345entry:
1346  %3 = zext i32 %2 to i64
1347  br label %vector.body
1348
1349vector.body:
1350  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
1351  %vec.phi = phi <8 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
1352  %4 = getelementptr inbounds i16, i16* %0, i64 %index
1353  %5 = bitcast i16* %4 to <8 x i16>*
1354  %wide.load = load <8 x i16>, <8 x i16>* %5, align 2
1355  %6 = zext <8 x i16> %wide.load to <8 x i32>
1356  %7 = getelementptr inbounds i16, i16* %1, i64 %index
1357  %8 = bitcast i16* %7 to <8 x i16>*
1358  %wide.load14 = load <8 x i16>, <8 x i16>* %8, align 2
1359  %9 = zext <8 x i16> %wide.load14 to <8 x i32>
1360  %10 = mul nsw <8 x i32> %9, %6
1361  %11 = add nsw <8 x i32> %10, %vec.phi
1362  %index.next = add i64 %index, 16
1363  %12 = icmp eq i64 %index.next, %3
1364  br i1 %12, label %middle.block, label %vector.body
1365
1366middle.block:
1367  %rdx.shuf = shufflevector <8 x i32> %11, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
1368  %bin.rdx = add <8 x i32> %11, %rdx.shuf
1369  %rdx.shuf15 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1370  %bin.rdx16 = add <8 x i32> %bin.rdx, %rdx.shuf15
1371  %rdx.shuf17 = shufflevector <8 x i32> %bin.rdx16, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1372  %bin.rdx18 = add <8 x i32> %bin.rdx16, %rdx.shuf17
1373  %13 = extractelement <8 x i32> %bin.rdx18, i32 0
1374  ret i32 %13
1375}
1376
1377define i32 @test_unsigned_short_512(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 {
1378; SSE2-LABEL: test_unsigned_short_512:
1379; SSE2:       # %bb.0: # %entry
1380; SSE2-NEXT:    movl %edx, %eax
1381; SSE2-NEXT:    pxor %xmm0, %xmm0
1382; SSE2-NEXT:    xorl %ecx, %ecx
1383; SSE2-NEXT:    pxor %xmm1, %xmm1
1384; SSE2-NEXT:    pxor %xmm3, %xmm3
1385; SSE2-NEXT:    pxor %xmm2, %xmm2
1386; SSE2-NEXT:    .p2align 4, 0x90
1387; SSE2-NEXT:  .LBB10_1: # %vector.body
1388; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
1389; SSE2-NEXT:    movdqu (%rdi,%rcx,2), %xmm4
1390; SSE2-NEXT:    movdqu 16(%rdi,%rcx,2), %xmm8
1391; SSE2-NEXT:    movdqu (%rsi,%rcx,2), %xmm6
1392; SSE2-NEXT:    movdqu 16(%rsi,%rcx,2), %xmm7
1393; SSE2-NEXT:    movdqa %xmm6, %xmm5
1394; SSE2-NEXT:    pmulhuw %xmm4, %xmm5
1395; SSE2-NEXT:    pmullw %xmm4, %xmm6
1396; SSE2-NEXT:    movdqa %xmm6, %xmm4
1397; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
1398; SSE2-NEXT:    paddd %xmm4, %xmm0
1399; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
1400; SSE2-NEXT:    paddd %xmm6, %xmm1
1401; SSE2-NEXT:    movdqa %xmm7, %xmm4
1402; SSE2-NEXT:    pmulhuw %xmm8, %xmm4
1403; SSE2-NEXT:    pmullw %xmm8, %xmm7
1404; SSE2-NEXT:    movdqa %xmm7, %xmm5
1405; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
1406; SSE2-NEXT:    paddd %xmm5, %xmm3
1407; SSE2-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
1408; SSE2-NEXT:    paddd %xmm7, %xmm2
1409; SSE2-NEXT:    addq $16, %rcx
1410; SSE2-NEXT:    cmpq %rcx, %rax
1411; SSE2-NEXT:    jne .LBB10_1
1412; SSE2-NEXT:  # %bb.2: # %middle.block
1413; SSE2-NEXT:    paddd %xmm3, %xmm0
1414; SSE2-NEXT:    paddd %xmm2, %xmm1
1415; SSE2-NEXT:    paddd %xmm0, %xmm1
1416; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
1417; SSE2-NEXT:    paddd %xmm1, %xmm0
1418; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1419; SSE2-NEXT:    paddd %xmm0, %xmm1
1420; SSE2-NEXT:    movd %xmm1, %eax
1421; SSE2-NEXT:    retq
1422;
1423; AVX1-LABEL: test_unsigned_short_512:
1424; AVX1:       # %bb.0: # %entry
1425; AVX1-NEXT:    movl %edx, %eax
1426; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
1427; AVX1-NEXT:    xorl %ecx, %ecx
1428; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1429; AVX1-NEXT:    .p2align 4, 0x90
1430; AVX1-NEXT:  .LBB10_1: # %vector.body
1431; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
1432; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1433; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1434; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1435; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1436; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1437; AVX1-NEXT:    vpmulld %xmm2, %xmm6, %xmm2
1438; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1439; AVX1-NEXT:    vpmulld %xmm3, %xmm6, %xmm3
1440; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1441; AVX1-NEXT:    vpmulld %xmm4, %xmm6, %xmm4
1442; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1443; AVX1-NEXT:    vpmulld %xmm5, %xmm6, %xmm5
1444; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
1445; AVX1-NEXT:    vpaddd %xmm6, %xmm2, %xmm2
1446; AVX1-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
1447; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
1448; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1449; AVX1-NEXT:    vpaddd %xmm2, %xmm4, %xmm2
1450; AVX1-NEXT:    vpaddd %xmm0, %xmm5, %xmm0
1451; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1452; AVX1-NEXT:    addq $16, %rcx
1453; AVX1-NEXT:    cmpq %rcx, %rax
1454; AVX1-NEXT:    jne .LBB10_1
1455; AVX1-NEXT:  # %bb.2: # %middle.block
1456; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1457; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
1458; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
1459; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
1460; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1461; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1462; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1463; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
1464; AVX1-NEXT:    vmovd %xmm0, %eax
1465; AVX1-NEXT:    vzeroupper
1466; AVX1-NEXT:    retq
1467;
1468; AVX2-LABEL: test_unsigned_short_512:
1469; AVX2:       # %bb.0: # %entry
1470; AVX2-NEXT:    movl %edx, %eax
1471; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0
1472; AVX2-NEXT:    xorl %ecx, %ecx
1473; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1474; AVX2-NEXT:    .p2align 4, 0x90
1475; AVX2-NEXT:  .LBB10_1: # %vector.body
1476; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
1477; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1478; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1479; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1480; AVX2-NEXT:    vpmulld %ymm2, %ymm4, %ymm2
1481; AVX2-NEXT:    vpaddd %ymm1, %ymm2, %ymm1
1482; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1483; AVX2-NEXT:    vpmulld %ymm3, %ymm2, %ymm2
1484; AVX2-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
1485; AVX2-NEXT:    addq $16, %rcx
1486; AVX2-NEXT:    cmpq %rcx, %rax
1487; AVX2-NEXT:    jne .LBB10_1
1488; AVX2-NEXT:  # %bb.2: # %middle.block
1489; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
1490; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1491; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
1492; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1493; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
1494; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
1495; AVX2-NEXT:    vmovd %xmm0, %eax
1496; AVX2-NEXT:    vzeroupper
1497; AVX2-NEXT:    retq
1498;
1499; AVX512-LABEL: test_unsigned_short_512:
1500; AVX512:       # %bb.0: # %entry
1501; AVX512-NEXT:    movl %edx, %eax
1502; AVX512-NEXT:    vpxor %xmm0, %xmm0, %xmm0
1503; AVX512-NEXT:    xorl %ecx, %ecx
1504; AVX512-NEXT:    .p2align 4, 0x90
1505; AVX512-NEXT:  .LBB10_1: # %vector.body
1506; AVX512-NEXT:    # =>This Inner Loop Header: Depth=1
1507; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
1508; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
1509; AVX512-NEXT:    vpmulld %zmm1, %zmm2, %zmm1
1510; AVX512-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
1511; AVX512-NEXT:    addq $16, %rcx
1512; AVX512-NEXT:    cmpq %rcx, %rax
1513; AVX512-NEXT:    jne .LBB10_1
1514; AVX512-NEXT:  # %bb.2: # %middle.block
1515; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
1516; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
1517; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
1518; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
1519; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1520; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
1521; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1522; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
1523; AVX512-NEXT:    vmovd %xmm0, %eax
1524; AVX512-NEXT:    vzeroupper
1525; AVX512-NEXT:    retq
1526entry:
1527  %3 = zext i32 %2 to i64
1528  br label %vector.body
1529
1530vector.body:
1531  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
1532  %vec.phi = phi <16 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
1533  %4 = getelementptr inbounds i16, i16* %0, i64 %index
1534  %5 = bitcast i16* %4 to <16 x i16>*
1535  %wide.load = load <16 x i16>, <16 x i16>* %5, align 2
1536  %6 = zext <16 x i16> %wide.load to <16 x i32>
1537  %7 = getelementptr inbounds i16, i16* %1, i64 %index
1538  %8 = bitcast i16* %7 to <16 x i16>*
1539  %wide.load14 = load <16 x i16>, <16 x i16>* %8, align 2
1540  %9 = zext <16 x i16> %wide.load14 to <16 x i32>
1541  %10 = mul nsw <16 x i32> %9, %6
1542  %11 = add nsw <16 x i32> %10, %vec.phi
1543  %index.next = add i64 %index, 16
1544  %12 = icmp eq i64 %index.next, %3
1545  br i1 %12, label %middle.block, label %vector.body
1546
1547middle.block:
1548  %rdx.shuf1 = shufflevector <16 x i32> %11, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1549  %bin.rdx1 = add <16 x i32> %11, %rdx.shuf1
1550  %rdx.shuf = shufflevector <16 x i32> %bin.rdx1, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1551  %bin.rdx = add <16 x i32> %bin.rdx1, %rdx.shuf
1552  %rdx.shuf15 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1553  %bin.rdx16 = add <16 x i32> %bin.rdx, %rdx.shuf15
1554  %rdx.shuf17 = shufflevector <16 x i32> %bin.rdx16, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1555  %bin.rdx18 = add <16 x i32> %bin.rdx16, %rdx.shuf17
1556  %13 = extractelement <16 x i32> %bin.rdx18, i32 0
1557  ret i32 %13
1558}
1559
1560define i32 @test_unsigned_short_1024(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 {
1561; SSE2-LABEL: test_unsigned_short_1024:
1562; SSE2:       # %bb.0: # %entry
1563; SSE2-NEXT:    movl %edx, %eax
1564; SSE2-NEXT:    pxor %xmm8, %xmm8
1565; SSE2-NEXT:    xorl %ecx, %ecx
1566; SSE2-NEXT:    pxor %xmm3, %xmm3
1567; SSE2-NEXT:    pxor %xmm9, %xmm9
1568; SSE2-NEXT:    pxor %xmm10, %xmm10
1569; SSE2-NEXT:    pxor %xmm4, %xmm4
1570; SSE2-NEXT:    pxor %xmm6, %xmm6
1571; SSE2-NEXT:    pxor %xmm5, %xmm5
1572; SSE2-NEXT:    pxor %xmm7, %xmm7
1573; SSE2-NEXT:    .p2align 4, 0x90
1574; SSE2-NEXT:  .LBB11_1: # %vector.body
1575; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
1576; SSE2-NEXT:    movdqu 48(%rdi,%rcx,2), %xmm0
1577; SSE2-NEXT:    movdqu 48(%rsi,%rcx,2), %xmm1
1578; SSE2-NEXT:    movdqa %xmm1, %xmm2
1579; SSE2-NEXT:    pmulhuw %xmm0, %xmm2
1580; SSE2-NEXT:    pmullw %xmm0, %xmm1
1581; SSE2-NEXT:    movdqa %xmm1, %xmm0
1582; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1583; SSE2-NEXT:    paddd %xmm0, %xmm7
1584; SSE2-NEXT:    movdqu 32(%rdi,%rcx,2), %xmm0
1585; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1586; SSE2-NEXT:    movdqu 32(%rsi,%rcx,2), %xmm2
1587; SSE2-NEXT:    paddd %xmm1, %xmm5
1588; SSE2-NEXT:    movdqa %xmm2, %xmm1
1589; SSE2-NEXT:    pmulhuw %xmm0, %xmm1
1590; SSE2-NEXT:    pmullw %xmm0, %xmm2
1591; SSE2-NEXT:    movdqa %xmm2, %xmm0
1592; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1593; SSE2-NEXT:    paddd %xmm0, %xmm6
1594; SSE2-NEXT:    movdqu (%rdi,%rcx,2), %xmm0
1595; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
1596; SSE2-NEXT:    movdqu (%rsi,%rcx,2), %xmm1
1597; SSE2-NEXT:    paddd %xmm2, %xmm4
1598; SSE2-NEXT:    movdqa %xmm1, %xmm2
1599; SSE2-NEXT:    pmulhuw %xmm0, %xmm2
1600; SSE2-NEXT:    pmullw %xmm0, %xmm1
1601; SSE2-NEXT:    movdqa %xmm1, %xmm0
1602; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1603; SSE2-NEXT:    paddd %xmm0, %xmm8
1604; SSE2-NEXT:    movdqu 16(%rdi,%rcx,2), %xmm0
1605; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1606; SSE2-NEXT:    movdqu 16(%rsi,%rcx,2), %xmm2
1607; SSE2-NEXT:    paddd %xmm1, %xmm3
1608; SSE2-NEXT:    movdqa %xmm2, %xmm1
1609; SSE2-NEXT:    pmulhuw %xmm0, %xmm1
1610; SSE2-NEXT:    pmullw %xmm0, %xmm2
1611; SSE2-NEXT:    movdqa %xmm2, %xmm0
1612; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1613; SSE2-NEXT:    paddd %xmm0, %xmm9
1614; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
1615; SSE2-NEXT:    paddd %xmm2, %xmm10
1616; SSE2-NEXT:    addq $16, %rcx
1617; SSE2-NEXT:    cmpq %rcx, %rax
1618; SSE2-NEXT:    jne .LBB11_1
1619; SSE2-NEXT:  # %bb.2: # %middle.block
1620; SSE2-NEXT:    paddd %xmm6, %xmm3
1621; SSE2-NEXT:    paddd %xmm7, %xmm10
1622; SSE2-NEXT:    paddd %xmm3, %xmm10
1623; SSE2-NEXT:    paddd %xmm4, %xmm8
1624; SSE2-NEXT:    paddd %xmm5, %xmm9
1625; SSE2-NEXT:    paddd %xmm10, %xmm9
1626; SSE2-NEXT:    paddd %xmm8, %xmm9
1627; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[2,3,0,1]
1628; SSE2-NEXT:    paddd %xmm9, %xmm0
1629; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1630; SSE2-NEXT:    paddd %xmm0, %xmm1
1631; SSE2-NEXT:    movd %xmm1, %eax
1632; SSE2-NEXT:    retq
1633;
1634; AVX1-LABEL: test_unsigned_short_1024:
1635; AVX1:       # %bb.0: # %entry
1636; AVX1-NEXT:    movl %edx, %eax
1637; AVX1-NEXT:    vpxor %xmm8, %xmm8, %xmm8
1638; AVX1-NEXT:    xorl %ecx, %ecx
1639; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1640; AVX1-NEXT:    vpxor %xmm9, %xmm9, %xmm9
1641; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
1642; AVX1-NEXT:    .p2align 4, 0x90
1643; AVX1-NEXT:  .LBB11_1: # %vector.body
1644; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
1645; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1646; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1647; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1648; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1649; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1650; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1651; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm10 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1652; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm11 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1653; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1654; AVX1-NEXT:    vpmulld %xmm4, %xmm1, %xmm1
1655; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1656; AVX1-NEXT:    vpmulld %xmm5, %xmm4, %xmm4
1657; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1658; AVX1-NEXT:    vpmulld %xmm6, %xmm5, %xmm5
1659; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1660; AVX1-NEXT:    vpmulld %xmm7, %xmm6, %xmm6
1661; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1662; AVX1-NEXT:    vpmulld %xmm0, %xmm7, %xmm13
1663; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1664; AVX1-NEXT:    vpmulld %xmm12, %xmm7, %xmm7
1665; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1666; AVX1-NEXT:    vpmulld %xmm10, %xmm0, %xmm10
1667; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1668; AVX1-NEXT:    vpmulld %xmm11, %xmm0, %xmm11
1669; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm0
1670; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
1671; AVX1-NEXT:    vpaddd %xmm2, %xmm4, %xmm1
1672; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm2
1673; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm0
1674; AVX1-NEXT:    vpaddd %xmm0, %xmm5, %xmm0
1675; AVX1-NEXT:    vpaddd %xmm8, %xmm6, %xmm1
1676; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm8
1677; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm0
1678; AVX1-NEXT:    vpaddd %xmm0, %xmm13, %xmm0
1679; AVX1-NEXT:    vpaddd %xmm9, %xmm7, %xmm1
1680; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm9
1681; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm0
1682; AVX1-NEXT:    vpaddd %xmm0, %xmm10, %xmm0
1683; AVX1-NEXT:    vpaddd %xmm3, %xmm11, %xmm1
1684; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm3
1685; AVX1-NEXT:    addq $16, %rcx
1686; AVX1-NEXT:    cmpq %rcx, %rax
1687; AVX1-NEXT:    jne .LBB11_1
1688; AVX1-NEXT:  # %bb.2: # %middle.block
1689; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm0
1690; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm1
1691; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm4
1692; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
1693; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
1694; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
1695; AVX1-NEXT:    vpaddd %xmm2, %xmm4, %xmm2
1696; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
1697; AVX1-NEXT:    vpaddd %xmm0, %xmm9, %xmm0
1698; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1699; AVX1-NEXT:    vpaddd %xmm0, %xmm8, %xmm0
1700; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1701; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1702; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
1703; AVX1-NEXT:    vmovd %xmm0, %eax
1704; AVX1-NEXT:    vzeroupper
1705; AVX1-NEXT:    retq
1706;
1707; AVX2-LABEL: test_unsigned_short_1024:
1708; AVX2:       # %bb.0: # %entry
1709; AVX2-NEXT:    movl %edx, %eax
1710; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0
1711; AVX2-NEXT:    xorl %ecx, %ecx
1712; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1713; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1714; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
1715; AVX2-NEXT:    .p2align 4, 0x90
1716; AVX2-NEXT:  .LBB11_1: # %vector.body
1717; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
1718; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1719; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1720; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1721; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1722; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm8 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1723; AVX2-NEXT:    vpmulld %ymm4, %ymm8, %ymm4
1724; AVX2-NEXT:    vpaddd %ymm2, %ymm4, %ymm2
1725; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1726; AVX2-NEXT:    vpmulld %ymm5, %ymm4, %ymm4
1727; AVX2-NEXT:    vpaddd %ymm1, %ymm4, %ymm1
1728; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1729; AVX2-NEXT:    vpmulld %ymm6, %ymm4, %ymm4
1730; AVX2-NEXT:    vpaddd %ymm0, %ymm4, %ymm0
1731; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1732; AVX2-NEXT:    vpmulld %ymm7, %ymm4, %ymm4
1733; AVX2-NEXT:    vpaddd %ymm3, %ymm4, %ymm3
1734; AVX2-NEXT:    addq $16, %rcx
1735; AVX2-NEXT:    cmpq %rcx, %rax
1736; AVX2-NEXT:    jne .LBB11_1
1737; AVX2-NEXT:  # %bb.2: # %middle.block
1738; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
1739; AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm1
1740; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
1741; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1742; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
1743; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1744; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
1745; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
1746; AVX2-NEXT:    vmovd %xmm0, %eax
1747; AVX2-NEXT:    vzeroupper
1748; AVX2-NEXT:    retq
1749;
1750; AVX512-LABEL: test_unsigned_short_1024:
1751; AVX512:       # %bb.0: # %entry
1752; AVX512-NEXT:    movl %edx, %eax
1753; AVX512-NEXT:    vpxor %xmm0, %xmm0, %xmm0
1754; AVX512-NEXT:    xorl %ecx, %ecx
1755; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1756; AVX512-NEXT:    .p2align 4, 0x90
1757; AVX512-NEXT:  .LBB11_1: # %vector.body
1758; AVX512-NEXT:    # =>This Inner Loop Header: Depth=1
1759; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
1760; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
1761; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
1762; AVX512-NEXT:    vpmulld %zmm2, %zmm4, %zmm2
1763; AVX512-NEXT:    vpaddd %zmm1, %zmm2, %zmm1
1764; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
1765; AVX512-NEXT:    vpmulld %zmm3, %zmm2, %zmm2
1766; AVX512-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
1767; AVX512-NEXT:    addq $16, %rcx
1768; AVX512-NEXT:    cmpq %rcx, %rax
1769; AVX512-NEXT:    jne .LBB11_1
1770; AVX512-NEXT:  # %bb.2: # %middle.block
1771; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
1772; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
1773; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
1774; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
1775; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
1776; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1777; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
1778; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1779; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
1780; AVX512-NEXT:    vmovd %xmm0, %eax
1781; AVX512-NEXT:    vzeroupper
1782; AVX512-NEXT:    retq
1783entry:
1784  %3 = zext i32 %2 to i64
1785  br label %vector.body
1786
1787vector.body:
1788  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
1789  %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
1790  %4 = getelementptr inbounds i16, i16* %0, i64 %index
1791  %5 = bitcast i16* %4 to <32 x i16>*
1792  %wide.load = load <32 x i16>, <32 x i16>* %5, align 2
1793  %6 = zext <32 x i16> %wide.load to <32 x i32>
1794  %7 = getelementptr inbounds i16, i16* %1, i64 %index
1795  %8 = bitcast i16* %7 to <32 x i16>*
1796  %wide.load14 = load <32 x i16>, <32 x i16>* %8, align 2
1797  %9 = zext <32 x i16> %wide.load14 to <32 x i32>
1798  %10 = mul nsw <32 x i32> %9, %6
1799  %11 = add nsw <32 x i32> %10, %vec.phi
1800  %index.next = add i64 %index, 16
1801  %12 = icmp eq i64 %index.next, %3
1802  br i1 %12, label %middle.block, label %vector.body
1803
1804middle.block:
1805  %rdx.shuf2 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1806  %bin.rdx2 = add <32 x i32> %11, %rdx.shuf2
1807  %rdx.shuf1 = shufflevector <32 x i32> %bin.rdx2, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1808  %bin.rdx1 = add <32 x i32> %bin.rdx2, %rdx.shuf1
1809  %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1810  %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf
1811  %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1812  %bin.rdx16 = add <32 x i32> %bin.rdx, %rdx.shuf15
1813  %rdx.shuf17 = shufflevector <32 x i32> %bin.rdx16, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1814  %bin.rdx18 = add <32 x i32> %bin.rdx16, %rdx.shuf17
1815  %13 = extractelement <32 x i32> %bin.rdx18, i32 0
1816  ret i32 %13
1817}
1818
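; A multiply of sign-extended i16 values whose even and odd products are then added pairwise should select a single pmaddwd.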
1819define <4 x i32> @pmaddwd_8(<8 x i16> %A, <8 x i16> %B) {
1820; SSE2-LABEL: pmaddwd_8:
1821; SSE2:       # %bb.0:
1822; SSE2-NEXT:    pmaddwd %xmm1, %xmm0
1823; SSE2-NEXT:    retq
1824;
1825; AVX-LABEL: pmaddwd_8:
1826; AVX:       # %bb.0:
1827; AVX-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
1828; AVX-NEXT:    retq
1829   %a = sext <8 x i16> %A to <8 x i32>
1830   %b = sext <8 x i16> %B to <8 x i32>
1831   %m = mul nsw <8 x i32> %a, %b
1832   %odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1833   %even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
1834   %ret = add <4 x i32> %odd, %even
1835   ret <4 x i32> %ret
1836}
1837
1838define <4 x i32> @pmaddwd_8_swapped(<8 x i16> %A, <8 x i16> %B) {
1839; SSE2-LABEL: pmaddwd_8_swapped:
1840; SSE2:       # %bb.0:
1841; SSE2-NEXT:    pmaddwd %xmm1, %xmm0
1842; SSE2-NEXT:    retq
1843;
1844; AVX-LABEL: pmaddwd_8_swapped:
1845; AVX:       # %bb.0:
1846; AVX-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
1847; AVX-NEXT:    retq
1848   %a = sext <8 x i16> %A to <8 x i32>
1849   %b = sext <8 x i16> %B to <8 x i32>
1850   %m = mul nsw <8 x i32> %a, %b
1851   %odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1852   %even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
1853   %ret = add <4 x i32> %even, %odd
1854   ret <4 x i32> %ret
1855}
1856
1857define <4 x i32> @larger_mul(<16 x i16> %A, <16 x i16> %B) {
1858; SSE2-LABEL: larger_mul:
1859; SSE2:       # %bb.0:
1860; SSE2-NEXT:    movdqa %xmm0, %xmm1
1861; SSE2-NEXT:    pmulhw %xmm2, %xmm1
1862; SSE2-NEXT:    pmullw %xmm2, %xmm0
1863; SSE2-NEXT:    movdqa %xmm0, %xmm2
1864; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
1865; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1866; SSE2-NEXT:    movdqa %xmm0, %xmm1
1867; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
1868; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
1869; SSE2-NEXT:    paddd %xmm1, %xmm0
1870; SSE2-NEXT:    retq
1871;
1872; AVX1-LABEL: larger_mul:
1873; AVX1:       # %bb.0:
1874; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm2
1875; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1876; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
1877; AVX1-NEXT:    vpackssdw %xmm0, %xmm2, %xmm0
1878; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm2
1879; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
1880; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm1
1881; AVX1-NEXT:    vpackssdw %xmm1, %xmm2, %xmm1
1882; AVX1-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
1883; AVX1-NEXT:    vzeroupper
1884; AVX1-NEXT:    retq
1885;
1886; AVX2-LABEL: larger_mul:
1887; AVX2:       # %bb.0:
1888; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
1889; AVX2-NEXT:    vpmovsxwd %xmm1, %ymm1
1890; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
1891; AVX2-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
1892; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
1893; AVX2-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
1894; AVX2-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
1895; AVX2-NEXT:    vzeroupper
1896; AVX2-NEXT:    retq
1897;
1898; AVX512-LABEL: larger_mul:
1899; AVX512:       # %bb.0:
1900; AVX512-NEXT:    vpmovsxwd %ymm0, %zmm0
1901; AVX512-NEXT:    vpmovsxwd %ymm1, %zmm1
1902; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
1903; AVX512-NEXT:    vpextrd $2, %xmm0, %eax
1904; AVX512-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm1
1905; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
1906; AVX512-NEXT:    vmovd %xmm2, %eax
1907; AVX512-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
1908; AVX512-NEXT:    vpextrd $2, %xmm2, %eax
1909; AVX512-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm1
1910; AVX512-NEXT:    vpextrd $3, %xmm0, %eax
1911; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
1912; AVX512-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
1913; AVX512-NEXT:    vpextrd $1, %xmm2, %eax
1914; AVX512-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
1915; AVX512-NEXT:    vpextrd $3, %xmm2, %eax
1916; AVX512-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
1917; AVX512-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
1918; AVX512-NEXT:    vzeroupper
1919; AVX512-NEXT:    retq
1920   %a = sext <16 x i16> %A to <16 x i32>
1921   %b = sext <16 x i16> %B to <16 x i32>
1922   %m = mul nsw <16 x i32> %a, %b
1923   %odd = shufflevector <16 x i32> %m, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1924   %even = shufflevector <16 x i32> %m, <16 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
1925   %ret = add <4 x i32> %odd, %even
1926   ret <4 x i32> %ret
1927}
1928
1929define <8 x i32> @pmaddwd_16(<16 x i16> %A, <16 x i16> %B) {
1930; SSE2-LABEL: pmaddwd_16:
1931; SSE2:       # %bb.0:
1932; SSE2-NEXT:    pmaddwd %xmm2, %xmm0
1933; SSE2-NEXT:    pmaddwd %xmm3, %xmm1
1934; SSE2-NEXT:    retq
1935;
1936; AVX1-LABEL: pmaddwd_16:
1937; AVX1:       # %bb.0:
1938; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1939; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
1940; AVX1-NEXT:    vpmaddwd %xmm3, %xmm2, %xmm2
1941; AVX1-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
1942; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1943; AVX1-NEXT:    retq
1944;
1945; AVX256-LABEL: pmaddwd_16:
1946; AVX256:       # %bb.0:
1947; AVX256-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0
1948; AVX256-NEXT:    retq
1949   %a = sext <16 x i16> %A to <16 x i32>
1950   %b = sext <16 x i16> %B to <16 x i32>
1951   %m = mul nsw <16 x i32> %a, %b
1952   %odd = shufflevector <16 x i32> %m, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
1953   %even = shufflevector <16 x i32> %m, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
1954   %ret = add <8 x i32> %odd, %even
1955   ret <8 x i32> %ret
1956}
1957
1958define <16 x i32> @pmaddwd_32(<32 x i16> %A, <32 x i16> %B) {
1959; SSE2-LABEL: pmaddwd_32:
1960; SSE2:       # %bb.0:
1961; SSE2-NEXT:    pmaddwd %xmm4, %xmm0
1962; SSE2-NEXT:    pmaddwd %xmm5, %xmm1
1963; SSE2-NEXT:    pmaddwd %xmm6, %xmm2
1964; SSE2-NEXT:    pmaddwd %xmm7, %xmm3
1965; SSE2-NEXT:    retq
1966;
1967; AVX1-LABEL: pmaddwd_32:
1968; AVX1:       # %bb.0:
1969; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
1970; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
1971; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm6
1972; AVX1-NEXT:    vpmaddwd %xmm6, %xmm4, %xmm4
1973; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
1974; AVX1-NEXT:    vpmaddwd %xmm6, %xmm5, %xmm5
1975; AVX1-NEXT:    vpmaddwd %xmm2, %xmm0, %xmm0
1976; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm0
1977; AVX1-NEXT:    vpmaddwd %xmm3, %xmm1, %xmm1
1978; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
1979; AVX1-NEXT:    retq
1980;
1981; AVX2-LABEL: pmaddwd_32:
1982; AVX2:       # %bb.0:
1983; AVX2-NEXT:    vpmaddwd %ymm2, %ymm0, %ymm0
1984; AVX2-NEXT:    vpmaddwd %ymm3, %ymm1, %ymm1
1985; AVX2-NEXT:    retq
1986;
1987; AVX512F-LABEL: pmaddwd_32:
1988; AVX512F:       # %bb.0:
1989; AVX512F-NEXT:    vpmaddwd %ymm3, %ymm1, %ymm1
1990; AVX512F-NEXT:    vpmaddwd %ymm2, %ymm0, %ymm0
1991; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
1992; AVX512F-NEXT:    retq
1993;
1994; AVX512BW-LABEL: pmaddwd_32:
1995; AVX512BW:       # %bb.0:
1996; AVX512BW-NEXT:    vpmaddwd %zmm1, %zmm0, %zmm0
1997; AVX512BW-NEXT:    retq
1998   %a = sext <32 x i16> %A to <32 x i32>
1999   %b = sext <32 x i16> %B to <32 x i32>
2000   %m = mul nsw <32 x i32> %a, %b
2001   %odd = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
2002   %even = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
2003   %ret = add <16 x i32> %odd, %even
2004   ret <16 x i32> %ret
2005}
2006
2007define <4 x i32> @pmaddwd_const(<8 x i16> %A) {
2008; SSE2-LABEL: pmaddwd_const:
2009; SSE2:       # %bb.0:
2010; SSE2-NEXT:    pmaddwd {{.*}}(%rip), %xmm0
2011; SSE2-NEXT:    retq
2012;
2013; AVX-LABEL: pmaddwd_const:
2014; AVX:       # %bb.0:
2015; AVX-NEXT:    vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
2016; AVX-NEXT:    retq
2017   %a = sext <8 x i16> %A to <8 x i32>
2018   %m = mul nsw <8 x i32> %a, <i32 32767, i32 -32768, i32 0, i32 0, i32 1, i32 7, i32 42, i32 32>
2019   %odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
2020   %even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
2021   %ret = add <4 x i32> %odd, %even
2022   ret <4 x i32> %ret
2023}
2024
2025; Do not select pmaddwd for unsigned (zero-extended) i16 multiplication
2026define <4 x i32> @pmaddwd_negative1(<8 x i16> %A, <8 x i16> %B) {
2027; SSE2-LABEL: pmaddwd_negative1:
2028; SSE2:       # %bb.0:
2029; SSE2-NEXT:    movdqa %xmm0, %xmm2
2030; SSE2-NEXT:    pmulhuw %xmm1, %xmm2
2031; SSE2-NEXT:    pmullw %xmm1, %xmm0
2032; SSE2-NEXT:    movdqa %xmm0, %xmm1
2033; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
2034; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
2035; SSE2-NEXT:    movdqa %xmm0, %xmm2
2036; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
2037; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
2038; SSE2-NEXT:    paddd %xmm2, %xmm0
2039; SSE2-NEXT:    retq
2040;
2041; AVX1-LABEL: pmaddwd_negative1:
2042; AVX1:       # %bb.0:
2043; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
2044; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
2045; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
2046; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
2047; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
2048; AVX1-NEXT:    vpmulld %xmm3, %xmm2, %xmm2
2049; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
2050; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
2051; AVX1-NEXT:    vphaddd %xmm2, %xmm0, %xmm0
2052; AVX1-NEXT:    retq
2053;
2054; AVX256-LABEL: pmaddwd_negative1:
2055; AVX256:       # %bb.0:
2056; AVX256-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2057; AVX256-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2058; AVX256-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
2059; AVX256-NEXT:    vextracti128 $1, %ymm0, %xmm1
2060; AVX256-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
2061; AVX256-NEXT:    vzeroupper
2062; AVX256-NEXT:    retq
2063   %a = zext <8 x i16> %A to <8 x i32>
2064   %b = zext <8 x i16> %B to <8 x i32>
2065   %m = mul nuw <8 x i32> %a, %b
2066   %odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
2067   %even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
2068   %ret = add <4 x i32> %odd, %even
2069   ret <4 x i32> %ret
2070}
2071
2072; Do not select pmaddwd if a constant does not fit in a signed 16-bit immediate (32768 is out of range)
2073define <4 x i32> @pmaddwd_negative2(<8 x i16> %A) {
2074; SSE2-LABEL: pmaddwd_negative2:
2075; SSE2:       # %bb.0:
2076; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2077; SSE2-NEXT:    psrad $16, %xmm1
2078; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
2079; SSE2-NEXT:    psrad $16, %xmm0
2080; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1,7,42,32]
2081; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
2082; SSE2-NEXT:    pmuludq %xmm2, %xmm0
2083; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3]
2084; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
2085; SSE2-NEXT:    pmuludq %xmm3, %xmm0
2086; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2087; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
2088; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32768,4294934528,0,0]
2089; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
2090; SSE2-NEXT:    pmuludq %xmm2, %xmm1
2091; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
2092; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
2093; SSE2-NEXT:    pmuludq %xmm3, %xmm1
2094; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2095; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2096; SSE2-NEXT:    movdqa %xmm0, %xmm1
2097; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2]
2098; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3]
2099; SSE2-NEXT:    paddd %xmm1, %xmm0
2100; SSE2-NEXT:    retq
2101;
2102; AVX1-LABEL: pmaddwd_negative2:
2103; AVX1:       # %bb.0:
2104; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
2105; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm1
2106; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
2107; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
2108; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
2109; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
2110; AVX1-NEXT:    retq
2111;
2112; AVX256-LABEL: pmaddwd_negative2:
2113; AVX256:       # %bb.0:
2114; AVX256-NEXT:    vpmovsxwd %xmm0, %ymm0
2115; AVX256-NEXT:    vpmulld {{.*}}(%rip), %ymm0, %ymm0
2116; AVX256-NEXT:    vextracti128 $1, %ymm0, %xmm1
2117; AVX256-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
2118; AVX256-NEXT:    vzeroupper
2119; AVX256-NEXT:    retq
2120   %a = sext <8 x i16> %A to <8 x i32>
2121   %m = mul nsw <8 x i32> %a, <i32 32768, i32 -32768, i32 0, i32 0, i32 1, i32 7, i32 42, i32 32>
2122   %odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
2123   %even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
2124   %ret = add <4 x i32> %odd, %even
2125   ret <4 x i32> %ret
2126}
2127
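; The even/odd products may be paired in a jumbled order; pmaddwd should still be selected as long as each result lane adds the two products from one 16-bit pair.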
2128define <4 x i32> @jumbled_indices4(<8 x i16> %A, <8 x i16> %B) {
2129; SSE2-LABEL: jumbled_indices4:
2130; SSE2:       # %bb.0:
2131; SSE2-NEXT:    pmaddwd %xmm1, %xmm0
2132; SSE2-NEXT:    retq
2133;
2134; AVX-LABEL: jumbled_indices4:
2135; AVX:       # %bb.0:
2136; AVX-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
2137; AVX-NEXT:    retq
2138  %exta = sext <8 x i16> %A to <8 x i32>
2139  %extb = sext <8 x i16> %B to <8 x i32>
2140  %m = mul <8 x i32> %exta, %extb
2141  %sa = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 3, i32 1, i32 5, i32 6>
2142  %sb = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 2, i32 0, i32 4, i32 7>
2143  %a = add <4 x i32> %sa, %sb
2144  ret <4 x i32> %a
2145}
2146
2147define <8 x i32> @jumbled_indices8(<16 x i16> %A, <16 x i16> %B) {
2148; SSE2-LABEL: jumbled_indices8:
2149; SSE2:       # %bb.0:
2150; SSE2-NEXT:    pmaddwd %xmm2, %xmm0
2151; SSE2-NEXT:    pmaddwd %xmm3, %xmm1
2152; SSE2-NEXT:    retq
2153;
2154; AVX1-LABEL: jumbled_indices8:
2155; AVX1:       # %bb.0:
2156; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2157; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
2158; AVX1-NEXT:    vpmaddwd %xmm3, %xmm2, %xmm2
2159; AVX1-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
2160; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2161; AVX1-NEXT:    retq
2162;
2163; AVX256-LABEL: jumbled_indices8:
2164; AVX256:       # %bb.0:
2165; AVX256-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0
2166; AVX256-NEXT:    retq
2167  %exta = sext <16 x i16> %A to <16 x i32>
2168  %extb = sext <16 x i16> %B to <16 x i32>
2169  %m = mul <16 x i32> %exta, %extb
2170  %sa = shufflevector <16 x i32> %m, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 7, i32 4, i32 11, i32 8, i32 15, i32 12>
2171  %sb = shufflevector <16 x i32> %m, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 6, i32 5, i32 10, i32 9, i32 14, i32 13>
2172  %a = add <8 x i32> %sa, %sb
2173  ret <8 x i32> %a
2174}
2175
2176define <16 x i32> @jumbled_indices16(<32 x i16> %A, <32 x i16> %B) {
2177; SSE2-LABEL: jumbled_indices16:
2178; SSE2:       # %bb.0:
2179; SSE2-NEXT:    pmaddwd %xmm4, %xmm0
2180; SSE2-NEXT:    pmaddwd %xmm5, %xmm1
2181; SSE2-NEXT:    pmaddwd %xmm6, %xmm2
2182; SSE2-NEXT:    pmaddwd %xmm7, %xmm3
2183; SSE2-NEXT:    retq
2184;
2185; AVX1-LABEL: jumbled_indices16:
2186; AVX1:       # %bb.0:
2187; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
2188; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
2189; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm6
2190; AVX1-NEXT:    vpmaddwd %xmm6, %xmm4, %xmm4
2191; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
2192; AVX1-NEXT:    vpmaddwd %xmm6, %xmm5, %xmm5
2193; AVX1-NEXT:    vpmaddwd %xmm2, %xmm0, %xmm0
2194; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm0
2195; AVX1-NEXT:    vpmaddwd %xmm3, %xmm1, %xmm1
2196; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
2197; AVX1-NEXT:    retq
2198;
2199; AVX2-LABEL: jumbled_indices16:
2200; AVX2:       # %bb.0:
2201; AVX2-NEXT:    vpmaddwd %ymm2, %ymm0, %ymm0
2202; AVX2-NEXT:    vpmaddwd %ymm3, %ymm1, %ymm1
2203; AVX2-NEXT:    retq
2204;
2205; AVX512F-LABEL: jumbled_indices16:
2206; AVX512F:       # %bb.0:
2207; AVX512F-NEXT:    vpmaddwd %ymm3, %ymm1, %ymm1
2208; AVX512F-NEXT:    vpmaddwd %ymm2, %ymm0, %ymm0
2209; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2210; AVX512F-NEXT:    retq
2211;
2212; AVX512BW-LABEL: jumbled_indices16:
2213; AVX512BW:       # %bb.0:
2214; AVX512BW-NEXT:    vpmaddwd %zmm1, %zmm0, %zmm0
2215; AVX512BW-NEXT:    retq
2216  %exta = sext <32 x i16> %A to <32 x i32>
2217  %extb = sext <32 x i16> %B to <32 x i32>
2218  %m = mul <32 x i32> %exta, %extb
2219  %sa = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 2, i32 0, i32 5, i32 6, i32 11, i32 9, i32 15, i32 12, i32 17, i32 18, i32 20, i32 23, i32 27, i32 24, i32 31, i32 29>
2220  %sb = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 3, i32 1, i32 4, i32 7, i32 10, i32 8, i32 14, i32 13, i32 16, i32 19, i32 21, i32 22, i32 26, i32 25, i32 30, i32 28>
2221  %a = add <16 x i32> %sa, %sb
2222  ret <16 x i32> %a
2223}
2224
2225define <32 x i32> @jumbled_indices32(<64 x i16> %A, <64 x i16> %B) {
2226; SSE2-LABEL: jumbled_indices32:
2227; SSE2:       # %bb.0:
2228; SSE2-NEXT:    pmaddwd {{[0-9]+}}(%rsp), %xmm0
2229; SSE2-NEXT:    pmaddwd {{[0-9]+}}(%rsp), %xmm1
; SSE2-NEXT:    pmaddwd {{[0-9]+}}(%rsp), %xmm2
; SSE2-NEXT:    pmaddwd {{[0-9]+}}(%rsp), %xmm3
; SSE2-NEXT:    pmaddwd {{[0-9]+}}(%rsp), %xmm4
; SSE2-NEXT:    pmaddwd {{[0-9]+}}(%rsp), %xmm5
; SSE2-NEXT:    pmaddwd {{[0-9]+}}(%rsp), %xmm6
; SSE2-NEXT:    pmaddwd {{[0-9]+}}(%rsp), %xmm7
; SSE2-NEXT:    movdqa %xmm7, 112(%rdi)
; SSE2-NEXT:    movdqa %xmm6, 96(%rdi)
; SSE2-NEXT:    movdqa %xmm5, 80(%rdi)
; SSE2-NEXT:    movdqa %xmm4, 64(%rdi)
; SSE2-NEXT:    movdqa %xmm3, 48(%rdi)
; SSE2-NEXT:    movdqa %xmm2, 32(%rdi)
; SSE2-NEXT:    movdqa %xmm1, 16(%rdi)
; SSE2-NEXT:    movdqa %xmm0, (%rdi)
; SSE2-NEXT:    movq %rdi, %rax
; SSE2-NEXT:    retq
;
; AVX1-LABEL: jumbled_indices32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm8
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm9
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm10
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm11
; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm12
; AVX1-NEXT:    vpmaddwd %xmm12, %xmm8, %xmm8
; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm12
; AVX1-NEXT:    vpmaddwd %xmm12, %xmm9, %xmm9
; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm12
; AVX1-NEXT:    vpmaddwd %xmm12, %xmm10, %xmm10
; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm12
; AVX1-NEXT:    vpmaddwd %xmm12, %xmm11, %xmm11
; AVX1-NEXT:    vpmaddwd %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm11, %ymm0, %ymm0
; AVX1-NEXT:    vpmaddwd %xmm5, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm10, %ymm1, %ymm1
; AVX1-NEXT:    vpmaddwd %xmm6, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm2, %ymm2
; AVX1-NEXT:    vpmaddwd %xmm7, %xmm3, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm3, %ymm3
; AVX1-NEXT:    retq
;
; AVX2-LABEL: jumbled_indices32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmaddwd %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpmaddwd %ymm5, %ymm1, %ymm1
; AVX2-NEXT:    vpmaddwd %ymm6, %ymm2, %ymm2
; AVX2-NEXT:    vpmaddwd %ymm7, %ymm3, %ymm3
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: jumbled_indices32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmaddwd %ymm5, %ymm1, %ymm1
; AVX512F-NEXT:    vpmaddwd %ymm4, %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT:    vpmaddwd %ymm7, %ymm3, %ymm1
; AVX512F-NEXT:    vpmaddwd %ymm6, %ymm2, %ymm2
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: jumbled_indices32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmaddwd %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmaddwd %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT:    retq
  %exta = sext <64 x i16> %A to <64 x i32>
  %extb = sext <64 x i16> %B to <64 x i32>
  %m = mul <64 x i32> %exta, %extb
  %sa = shufflevector <64 x i32> %m, <64 x i32> undef, <32 x i32> <i32 1, i32 2, i32 6, i32 5, i32 10, i32 8, i32 14, i32 12, i32 19, i32 17, i32 22, i32 20, i32 25, i32 27, i32 30, i32 28, i32 32, i32 34, i32 37, i32 38, i32 41, i32 43, i32 45, i32 47, i32 50, i32 48, i32 52, i32 54, i32 59, i32 56, i32 61, i32 63>
  %sb = shufflevector <64 x i32> %m, <64 x i32> undef, <32 x i32> <i32 0, i32 3, i32 7, i32 4, i32 11, i32 9, i32 15, i32 13, i32 18, i32 16, i32 23, i32 21, i32 24, i32 26, i32 31, i32 29, i32 33, i32 35, i32 36, i32 39, i32 40, i32 42, i32 44, i32 46, i32 51, i32 49, i32 53, i32 55, i32 58, i32 57, i32 60, i32 62>
  %a = add <32 x i32> %sa, %sb
  ret <32 x i32> %a
}

; NOTE: We're testing with loads because ABI lowering creates a concat_vectors that extract_vector_elt creation can see through.
; This would require the combine to recreate the concat_vectors.
define <4 x i32> @pmaddwd_128(<8 x i16>* %Aptr, <8 x i16>* %Bptr) {
; SSE2-LABEL: pmaddwd_128:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pmaddwd (%rsi), %xmm0
; SSE2-NEXT:    retq
;
; AVX-LABEL: pmaddwd_128:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpmaddwd (%rsi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %A = load <8 x i16>, <8 x i16>* %Aptr
  %B = load <8 x i16>, <8 x i16>* %Bptr
  %A_even = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %A_odd = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %B_even = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %B_odd = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %A_even_ext = sext <4 x i16> %A_even to <4 x i32>
  %B_even_ext = sext <4 x i16> %B_even to <4 x i32>
  %A_odd_ext = sext <4 x i16> %A_odd to <4 x i32>
  %B_odd_ext = sext <4 x i16> %B_odd to <4 x i32>
  %even_mul = mul <4 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <4 x i32> %A_odd_ext, %B_odd_ext
  %add = add <4 x i32> %even_mul, %odd_mul
  ret <4 x i32> %add
}

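; 256-bit version of the same even/odd multiply-add pattern; a single ymm vpmaddwd on AVX2 and later.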
define <8 x i32> @pmaddwd_256(<16 x i16>* %Aptr, <16 x i16>* %Bptr) {
; SSE2-LABEL: pmaddwd_256:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
; SSE2-NEXT:    pmaddwd (%rsi), %xmm0
; SSE2-NEXT:    pmaddwd 16(%rsi), %xmm1
; SSE2-NEXT:    retq
;
; AVX1-LABEL: pmaddwd_256:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vmovdqa (%rsi), %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpmaddwd %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX256-LABEL: pmaddwd_256:
; AVX256:       # %bb.0:
; AVX256-NEXT:    vmovdqa (%rdi), %ymm0
; AVX256-NEXT:    vpmaddwd (%rsi), %ymm0, %ymm0
; AVX256-NEXT:    retq
  %A = load <16 x i16>, <16 x i16>* %Aptr
  %B = load <16 x i16>, <16 x i16>* %Bptr
  %A_even = shufflevector <16 x i16> %A, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %A_odd = shufflevector <16 x i16> %A, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %B_even = shufflevector <16 x i16> %B, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %B_odd = shufflevector <16 x i16> %B, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %A_even_ext = sext <8 x i16> %A_even to <8 x i32>
  %B_even_ext = sext <8 x i16> %B_even to <8 x i32>
  %A_odd_ext = sext <8 x i16> %A_odd to <8 x i32>
  %B_odd_ext = sext <8 x i16> %B_odd to <8 x i32>
  %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext
  %add = add <8 x i32> %even_mul, %odd_mul
  ret <8 x i32> %add
}

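; 512-bit version of the same pattern; AVX512BW can use a single zmm vpmaddwd.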
define <16 x i32> @pmaddwd_512(<32 x i16>* %Aptr, <32 x i16>* %Bptr) {
; SSE2-LABEL: pmaddwd_512:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
; SSE2-NEXT:    movdqa 32(%rdi), %xmm2
; SSE2-NEXT:    movdqa 48(%rdi), %xmm3
; SSE2-NEXT:    pmaddwd (%rsi), %xmm0
; SSE2-NEXT:    pmaddwd 16(%rsi), %xmm1
; SSE2-NEXT:    pmaddwd 32(%rsi), %xmm2
; SSE2-NEXT:    pmaddwd 48(%rsi), %xmm3
; SSE2-NEXT:    retq
;
; AVX1-LABEL: pmaddwd_512:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX1-NEXT:    vmovdqa (%rsi), %ymm2
; AVX1-NEXT:    vmovdqa 32(%rsi), %ymm3
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT:    vpmaddwd %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpmaddwd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT:    vpmaddwd %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpmaddwd %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: pmaddwd_512:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX2-NEXT:    vpmaddwd (%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vpmaddwd 32(%rsi), %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: pmaddwd_512:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vpmaddwd 32(%rsi), %ymm1, %ymm1
; AVX512F-NEXT:    vpmaddwd (%rsi), %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: pmaddwd_512:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vpmaddwd (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %A = load <32 x i16>, <32 x i16>* %Aptr
  %B = load <32 x i16>, <32 x i16>* %Bptr
  %A_even = shufflevector <32 x i16> %A, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  %A_odd = shufflevector <32 x i16> %A, <32 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  %B_even = shufflevector <32 x i16> %B, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  %B_odd = shufflevector <32 x i16> %B, <32 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  %A_even_ext = sext <16 x i16> %A_even to <16 x i32>
  %B_even_ext = sext <16 x i16> %B_even to <16 x i32>
  %A_odd_ext = sext <16 x i16> %A_odd to <16 x i32>
  %B_odd_ext = sext <16 x i16> %B_odd to <16 x i32>
  %even_mul = mul <16 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <16 x i32> %A_odd_ext, %B_odd_ext
  %add = add <16 x i32> %even_mul, %odd_mul
  ret <16 x i32> %add
}

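; 1024-bit version; on SSE2 the <32 x i32> result is returned indirectly (sret in %rdi), so the pointer arguments move to %rsi/%rdx.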
define <32 x i32> @pmaddwd_1024(<64 x i16>* %Aptr, <64 x i16>* %Bptr) {
; SSE2-LABEL: pmaddwd_1024:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa 112(%rsi), %xmm0
; SSE2-NEXT:    movdqa 96(%rsi), %xmm1
; SSE2-NEXT:    movdqa 80(%rsi), %xmm2
; SSE2-NEXT:    movdqa 64(%rsi), %xmm3
; SSE2-NEXT:    movdqa (%rsi), %xmm4
; SSE2-NEXT:    movdqa 16(%rsi), %xmm5
; SSE2-NEXT:    movdqa 32(%rsi), %xmm6
; SSE2-NEXT:    movdqa 48(%rsi), %xmm7
; SSE2-NEXT:    pmaddwd (%rdx), %xmm4
; SSE2-NEXT:    pmaddwd 16(%rdx), %xmm5
; SSE2-NEXT:    pmaddwd 32(%rdx), %xmm6
; SSE2-NEXT:    pmaddwd 48(%rdx), %xmm7
; SSE2-NEXT:    pmaddwd 64(%rdx), %xmm3
; SSE2-NEXT:    pmaddwd 80(%rdx), %xmm2
; SSE2-NEXT:    pmaddwd 96(%rdx), %xmm1
; SSE2-NEXT:    pmaddwd 112(%rdx), %xmm0
; SSE2-NEXT:    movdqa %xmm0, 112(%rdi)
; SSE2-NEXT:    movdqa %xmm1, 96(%rdi)
; SSE2-NEXT:    movdqa %xmm2, 80(%rdi)
; SSE2-NEXT:    movdqa %xmm3, 64(%rdi)
; SSE2-NEXT:    movdqa %xmm7, 48(%rdi)
; SSE2-NEXT:    movdqa %xmm6, 32(%rdi)
; SSE2-NEXT:    movdqa %xmm5, 16(%rdi)
; SSE2-NEXT:    movdqa %xmm4, (%rdi)
; SSE2-NEXT:    movq %rdi, %rax
; SSE2-NEXT:    retq
;
; AVX1-LABEL: pmaddwd_1024:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX1-NEXT:    vmovdqa 64(%rdi), %ymm2
; AVX1-NEXT:    vmovdqa 96(%rdi), %ymm8
; AVX1-NEXT:    vmovdqa (%rsi), %ymm4
; AVX1-NEXT:    vmovdqa 32(%rsi), %ymm5
; AVX1-NEXT:    vmovdqa 64(%rsi), %ymm6
; AVX1-NEXT:    vmovdqa 96(%rsi), %ymm9
; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm7
; AVX1-NEXT:    vpmaddwd %xmm3, %xmm7, %xmm3
; AVX1-NEXT:    vpmaddwd %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT:    vpmaddwd %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpmaddwd %xmm5, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT:    vpmaddwd %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpmaddwd %xmm6, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm4
; AVX1-NEXT:    vpmaddwd %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpmaddwd %xmm9, %xmm8, %xmm4
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
; AVX1-NEXT:    retq
;
; AVX2-LABEL: pmaddwd_1024:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm2
; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm3
; AVX2-NEXT:    vpmaddwd (%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vpmaddwd 32(%rsi), %ymm1, %ymm1
; AVX2-NEXT:    vpmaddwd 64(%rsi), %ymm2, %ymm2
; AVX2-NEXT:    vpmaddwd 96(%rsi), %ymm3, %ymm3
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: pmaddwd_1024:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vmovdqa 64(%rdi), %ymm2
; AVX512F-NEXT:    vmovdqa 96(%rdi), %ymm3
; AVX512F-NEXT:    vpmaddwd 32(%rsi), %ymm1, %ymm1
; AVX512F-NEXT:    vpmaddwd (%rsi), %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT:    vpmaddwd 96(%rsi), %ymm3, %ymm1
; AVX512F-NEXT:    vpmaddwd 64(%rsi), %ymm2, %ymm2
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: pmaddwd_1024:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm1
; AVX512BW-NEXT:    vpmaddwd (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT:    vpmaddwd 64(%rsi), %zmm1, %zmm1
; AVX512BW-NEXT:    retq
  %A = load <64 x i16>, <64 x i16>* %Aptr
  %B = load <64 x i16>, <64 x i16>* %Bptr
  %A_even = shufflevector <64 x i16> %A, <64 x i16> undef, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
  %A_odd = shufflevector <64 x i16> %A, <64 x i16> undef, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
  %B_even = shufflevector <64 x i16> %B, <64 x i16> undef, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
  %B_odd = shufflevector <64 x i16> %B, <64 x i16> undef, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
  %A_even_ext = sext <32 x i16> %A_even to <32 x i32>
  %B_even_ext = sext <32 x i16> %B_even to <32 x i32>
  %A_odd_ext = sext <32 x i16> %A_odd to <32 x i32>
  %B_odd_ext = sext <32 x i16> %B_odd to <32 x i32>
  %even_mul = mul <32 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <32 x i32> %A_odd_ext, %B_odd_ext
  %add = add <32 x i32> %even_mul, %odd_mul
  ret <32 x i32> %add
}

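; Same as pmaddwd_128 except the operands of the odd-element multiply are commuted; still combines to a single pmaddwd.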
define <4 x i32> @pmaddwd_commuted_mul(<8 x i16>* %Aptr, <8 x i16>* %Bptr) {
; SSE2-LABEL: pmaddwd_commuted_mul:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pmaddwd (%rsi), %xmm0
; SSE2-NEXT:    retq
;
; AVX-LABEL: pmaddwd_commuted_mul:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpmaddwd (%rsi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %A = load <8 x i16>, <8 x i16>* %Aptr
  %B = load <8 x i16>, <8 x i16>* %Bptr
  %A_even = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %A_odd = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %B_even = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %B_odd = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %A_even_ext = sext <4 x i16> %A_even to <4 x i32>
  %B_even_ext = sext <4 x i16> %B_even to <4 x i32>
  %A_odd_ext = sext <4 x i16> %A_odd to <4 x i32>
  %B_odd_ext = sext <4 x i16> %B_odd to <4 x i32>
  %even_mul = mul <4 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <4 x i32> %B_odd_ext, %A_odd_ext ; Different order than previous mul
  %add = add <4 x i32> %even_mul, %odd_mul
  ret <4 x i32> %add
}

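; The shuffle indices are not strictly even/odd, but adjacent elements stay paired and A and B use matching indices, so this still combines to pmaddwd.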
define <4 x i32> @pmaddwd_swapped_indices(<8 x i16>* %Aptr, <8 x i16>* %Bptr) {
; SSE2-LABEL: pmaddwd_swapped_indices:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pmaddwd (%rsi), %xmm0
; SSE2-NEXT:    retq
;
; AVX-LABEL: pmaddwd_swapped_indices:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpmaddwd (%rsi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %A = load <8 x i16>, <8 x i16>* %Aptr
  %B = load <8 x i16>, <8 x i16>* %Bptr
  %A_even = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 1, i32 2, i32 5, i32 6> ; indices aren't all even
  %A_odd = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 7> ; indices aren't all odd
  %B_even = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 2, i32 5, i32 6> ; same indices as A
  %B_odd = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 7> ; same indices as A
  %A_even_ext = sext <4 x i16> %A_even to <4 x i32>
  %B_even_ext = sext <4 x i16> %B_even to <4 x i32>
  %A_odd_ext = sext <4 x i16> %A_odd to <4 x i32>
  %B_odd_ext = sext <4 x i16> %B_odd to <4 x i32>
  %even_mul = mul <4 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <4 x i32> %A_odd_ext, %B_odd_ext
  %add = add <4 x i32> %even_mul, %odd_mul
  ret <4 x i32> %add
}

; Negative test where the indices aren't paired properly, so pmaddwd should not be formed.
define <4 x i32> @pmaddwd_bad_indices(<8 x i16>* %Aptr, <8 x i16>* %Bptr) {
; SSE2-LABEL: pmaddwd_bad_indices:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    movdqa (%rsi), %xmm1
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm0[2,1,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    pmulhw %xmm2, %xmm4
; SSE2-NEXT:    pmullw %xmm2, %xmm3
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pmulhw %xmm1, %xmm2
; SSE2-NEXT:    pmullw %xmm1, %xmm0
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT:    paddd %xmm3, %xmm0
; SSE2-NEXT:    retq
;
; AVX-LABEL: pmaddwd_bad_indices:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa (%rsi), %xmm1
; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[2,3,4,5,10,11,12,13,12,13,10,11,12,13,14,15]
; AVX-NEXT:    vpmovsxwd %xmm2, %xmm2
; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX-NEXT:    vpmovsxwd %xmm3, %xmm3
; AVX-NEXT:    vpmulld %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,8,9,14,15,8,9,14,15,12,13,14,15]
; AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX-NEXT:    vpmovsxwd %xmm1, %xmm1
; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; AVX-NEXT:    retq
  %A = load <8 x i16>, <8 x i16>* %Aptr
  %B = load <8 x i16>, <8 x i16>* %Bptr
  %A_even = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
  %A_odd = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
  %B_even = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> ; different indices than A
  %B_odd = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> ; different indices than A
  %A_even_ext = sext <4 x i16> %A_even to <4 x i32>
  %B_even_ext = sext <4 x i16> %B_even to <4 x i32>
  %A_odd_ext = sext <4 x i16> %A_odd to <4 x i32>
  %B_odd_ext = sext <4 x i16> %B_odd to <4 x i32>
  %even_mul = mul <4 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <4 x i32> %A_odd_ext, %B_odd_ext
  %add = add <4 x i32> %even_mul, %odd_mul
  ret <4 x i32> %add
}
