; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -dce -instcombine -S | FileCheck %s

; Each function below is an integer add-reduction whose accumulator is held in
; i32 but masked back to 8 or 16 bits on every iteration. The FileCheck
; expectations require the vectorized reduction to operate on the narrow
; element type (<16 x i8> or <8 x i16>).

target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-gnu"

; CHECK-LABEL: @reduction_i8
;
; C source corresponding to the IR below:
;
; char reduction_i8(char *a, char *b, int n) {
;   char sum = 0;
;   for (int i = 0; i < n; ++i)
;     sum += (a[i] + b[i]);
;   return sum;
; }
;
; The scalar loop accumulates in i32 but keeps only the low 8 bits of the
; running sum each iteration, so the expectations require the loop body and
; the final horizontal reduction to use <16 x i8>, with the widening to i32
; happening only after the extractelement.
;
; CHECK: vector.body:
; CHECK:   phi <16 x i8>
; CHECK:   load <16 x i8>
; CHECK:   load <16 x i8>
; CHECK:   add <16 x i8>
; CHECK:   add <16 x i8>
;
; CHECK: middle.block:
; CHECK:   shufflevector <16 x i8>
; CHECK:   add <16 x i8>
; CHECK:   shufflevector <16 x i8>
; CHECK:   add <16 x i8>
; CHECK:   shufflevector <16 x i8>
; CHECK:   add <16 x i8>
; CHECK:   shufflevector <16 x i8>
; CHECK:   add <16 x i8>
; CHECK:   [[Rdx:%[a-zA-Z0-9.]+]] = extractelement <16 x i8>
; CHECK:   zext i8 [[Rdx]] to i32
;
define i8 @reduction_i8(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) {
entry:
  ; Guard: when n <= 0 the loop is skipped and the function returns 0.
  %cmp.12 = icmp sgt i32 %n, 0
  br i1 %cmp.12, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:
  br label %for.body

for.cond.for.cond.cleanup_crit_edge:
  ; Loop exit: narrow the i32 accumulator to the i8 return type.
  %add5.lcssa = phi i32 [ %add5, %for.body ]
  %conv6 = trunc i32 %add5.lcssa to i8
  br label %for.cond.cleanup

for.cond.cleanup:
  %sum.0.lcssa = phi i8 [ %conv6, %for.cond.for.cond.cleanup_crit_edge ], [ 0, %entry ]
  ret i8 %sum.0.lcssa

for.body:
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
  %sum.013 = phi i32 [ %add5, %for.body ], [ 0, %for.body.preheader ]
  ; Load a[i] and b[i] as bytes and zero-extend each to i32.
  %arrayidx = getelementptr inbounds i8, i8* %a, i64 %indvars.iv
  %0 = load i8, i8* %arrayidx, align 1
  %conv = zext i8 %0 to i32
  %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %indvars.iv
  %1 = load i8, i8* %arrayidx2, align 1
  %conv3 = zext i8 %1 to i32
  ; Keep only the low 8 bits of the running sum before adding the new terms.
  %conv4 = and i32 %sum.013, 255
  %add = add nuw nsw i32 %conv, %conv4
  %add5 = add nuw nsw i32 %add, %conv3
  ; Advance the 64-bit induction variable; the exit test compares its
  ; truncated value against the i32 trip count n.
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %for.cond.for.cond.cleanup_crit_edge, label %for.body
}

; CHECK-LABEL: @reduction_i16_1
;
; C source corresponding to the IR below:
;
; short reduction_i16_1(short *a, short *b, int n) {
;   short sum = 0;
;   for (int i = 0; i < n; ++i)
;     sum += (a[i] + b[i]);
;   return sum;
; }
;
; The scalar loop accumulates in i32 but keeps only the low 16 bits of the
; running sum each iteration, so the expectations require the loop body and
; the final horizontal reduction to use <8 x i16>, with the widening to i32
; happening only after the extractelement.
;
; CHECK: vector.body:
; CHECK:   phi <8 x i16>
; CHECK:   load <8 x i16>
; CHECK:   load <8 x i16>
; CHECK:   add <8 x i16>
; CHECK:   add <8 x i16>
;
; CHECK: middle.block:
; CHECK:   shufflevector <8 x i16>
; CHECK:   add <8 x i16>
; CHECK:   shufflevector <8 x i16>
; CHECK:   add <8 x i16>
; CHECK:   shufflevector <8 x i16>
; CHECK:   add <8 x i16>
; CHECK:   [[Rdx:%[a-zA-Z0-9.]+]] = extractelement <8 x i16>
; CHECK:   zext i16 [[Rdx]] to i32
;
define i16 @reduction_i16_1(i16* nocapture readonly %a, i16* nocapture readonly %b, i32 %n) {
entry:
  ; Guard: when n <= 0 the loop is skipped and the function returns 0.
  %cmp.16 = icmp sgt i32 %n, 0
  br i1 %cmp.16, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:
  br label %for.body

for.cond.for.cond.cleanup_crit_edge:
  ; Loop exit: narrow the i32 accumulator to the i16 return type.
  %add5.lcssa = phi i32 [ %add5, %for.body ]
  %conv6 = trunc i32 %add5.lcssa to i16
  br label %for.cond.cleanup

for.cond.cleanup:
  %sum.0.lcssa = phi i16 [ %conv6, %for.cond.for.cond.cleanup_crit_edge ], [ 0, %entry ]
  ret i16 %sum.0.lcssa

for.body:
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
  %sum.017 = phi i32 [ %add5, %for.body ], [ 0, %for.body.preheader ]
  ; Load a[i] and b[i] as 16-bit values and zero-extend each to i32.
  %arrayidx = getelementptr inbounds i16, i16* %a, i64 %indvars.iv
  %0 = load i16, i16* %arrayidx, align 2
  %conv.14 = zext i16 %0 to i32
  %arrayidx2 = getelementptr inbounds i16, i16* %b, i64 %indvars.iv
  %1 = load i16, i16* %arrayidx2, align 2
  %conv3.15 = zext i16 %1 to i32
  ; Keep only the low 16 bits of the running sum before adding the new terms.
  %conv4.13 = and i32 %sum.017, 65535
  %add = add nuw nsw i32 %conv.14, %conv4.13
  %add5 = add nuw nsw i32 %add, %conv3.15
  ; Advance the 64-bit induction variable; the exit test compares its
  ; truncated value against the i32 trip count n.
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %for.cond.for.cond.cleanup_crit_edge, label %for.body
}

; CHECK-LABEL: @reduction_i16_2
;
; C source corresponding to the IR below:
;
; short reduction_i16_2(char *a, char *b, int n) {
;   short sum = 0;
;   for (int i = 0; i < n; ++i)
;     sum += (a[i] + b[i]);
;   return sum;
; }
;
; Here the inputs are bytes while the running sum is masked to 16 bits, so the
; expectations require <8 x i8> loads that are zero-extended to <8 x i16>
; before being added into an <8 x i16> reduction.
;
; CHECK: vector.body:
; CHECK:   phi <8 x i16>
; CHECK:   [[Ld1:%[a-zA-Z0-9.]+]] = load <8 x i8>
; CHECK:   zext <8 x i8> [[Ld1]] to <8 x i16>
; CHECK:   [[Ld2:%[a-zA-Z0-9.]+]] = load <8 x i8>
; CHECK:   zext <8 x i8> [[Ld2]] to <8 x i16>
; CHECK:   add <8 x i16>
; CHECK:   add <8 x i16>
;
; CHECK: middle.block:
; CHECK:   shufflevector <8 x i16>
; CHECK:   add <8 x i16>
; CHECK:   shufflevector <8 x i16>
; CHECK:   add <8 x i16>
; CHECK:   shufflevector <8 x i16>
; CHECK:   add <8 x i16>
; CHECK:   [[Rdx:%[a-zA-Z0-9.]+]] = extractelement <8 x i16>
; CHECK:   zext i16 [[Rdx]] to i32
;
define i16 @reduction_i16_2(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) {
entry:
  ; Guard: when n <= 0 the loop is skipped and the function returns 0.
  %cmp.14 = icmp sgt i32 %n, 0
  br i1 %cmp.14, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:
  br label %for.body

for.cond.for.cond.cleanup_crit_edge:
  ; Loop exit: narrow the i32 accumulator to the i16 return type.
  %add5.lcssa = phi i32 [ %add5, %for.body ]
  %conv6 = trunc i32 %add5.lcssa to i16
  br label %for.cond.cleanup

for.cond.cleanup:
  %sum.0.lcssa = phi i16 [ %conv6, %for.cond.for.cond.cleanup_crit_edge ], [ 0, %entry ]
  ret i16 %sum.0.lcssa

for.body:
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
  %sum.015 = phi i32 [ %add5, %for.body ], [ 0, %for.body.preheader ]
  ; Load a[i] and b[i] as bytes and zero-extend each to i32.
  %arrayidx = getelementptr inbounds i8, i8* %a, i64 %indvars.iv
  %0 = load i8, i8* %arrayidx, align 1
  %conv = zext i8 %0 to i32
  %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %indvars.iv
  %1 = load i8, i8* %arrayidx2, align 1
  %conv3 = zext i8 %1 to i32
  ; Keep only the low 16 bits of the running sum (wider than the i8 inputs).
  %conv4.13 = and i32 %sum.015, 65535
  %add = add nuw nsw i32 %conv, %conv4.13
  %add5 = add nuw nsw i32 %add, %conv3
  ; Advance the 64-bit induction variable; the exit test compares its
  ; truncated value against the i32 trip count n.
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %for.cond.for.cond.cleanup_crit_edge, label %for.body
}
