; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=haswell | FileCheck %s

;unsigned load_le32(unsigned char *data) {
;    unsigned le32 = (data[0]<<0) | (data[1]<<8) | (data[2]<<16) | (data[3]<<24);
;    return le32;
;}

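; The four byte loads feed a chain of shifts and ors that builds a little-endian i32; the CHECK lines show SLP leaves them scalar, presumably so later passes can merge them into a single i32 load.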
define i32 @_Z9load_le32Ph(i8* nocapture readonly %data) {
; CHECK-LABEL: @_Z9load_le32Ph(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load i8, i8* [[DATA:%.*]], align 1
; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP0]] to i32
; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, i8* [[DATA]], i64 1
; CHECK-NEXT:    [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX1]], align 1
; CHECK-NEXT:    [[CONV2:%.*]] = zext i8 [[TMP1]] to i32
; CHECK-NEXT:    [[SHL3:%.*]] = shl nuw nsw i32 [[CONV2]], 8
; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHL3]], [[CONV]]
; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, i8* [[DATA]], i64 2
; CHECK-NEXT:    [[TMP2:%.*]] = load i8, i8* [[ARRAYIDX4]], align 1
; CHECK-NEXT:    [[CONV5:%.*]] = zext i8 [[TMP2]] to i32
; CHECK-NEXT:    [[SHL6:%.*]] = shl nuw nsw i32 [[CONV5]], 16
; CHECK-NEXT:    [[OR7:%.*]] = or i32 [[OR]], [[SHL6]]
; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i8, i8* [[DATA]], i64 3
; CHECK-NEXT:    [[TMP3:%.*]] = load i8, i8* [[ARRAYIDX8]], align 1
; CHECK-NEXT:    [[CONV9:%.*]] = zext i8 [[TMP3]] to i32
; CHECK-NEXT:    [[SHL10:%.*]] = shl nuw i32 [[CONV9]], 24
; CHECK-NEXT:    [[OR11:%.*]] = or i32 [[OR7]], [[SHL10]]
; CHECK-NEXT:    ret i32 [[OR11]]
;
entry:
  %0 = load i8, i8* %data, align 1
  %conv = zext i8 %0 to i32
  %arrayidx1 = getelementptr inbounds i8, i8* %data, i64 1
  %1 = load i8, i8* %arrayidx1, align 1
  %conv2 = zext i8 %1 to i32
  %shl3 = shl nuw nsw i32 %conv2, 8
  %or = or i32 %shl3, %conv
  %arrayidx4 = getelementptr inbounds i8, i8* %data, i64 2
  %2 = load i8, i8* %arrayidx4, align 1
  %conv5 = zext i8 %2 to i32
  %shl6 = shl nuw nsw i32 %conv5, 16
  %or7 = or i32 %or, %shl6
  %arrayidx8 = getelementptr inbounds i8, i8* %data, i64 3
  %3 = load i8, i8* %arrayidx8, align 1
  %conv9 = zext i8 %3 to i32
  %shl10 = shl nuw i32 %conv9, 24
  %or11 = or i32 %or7, %shl10
  ret i32 %or11
}

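; Partial vectorization: the CHECK lines show the loads of elements 0 and 1 combined into a single <2 x float> load, while element 2 is loaded as a scalar and reused for lanes 2 and 3.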
define <4 x float> @PR16739_byref(<4 x float>* nocapture readonly dereferenceable(16) %x) {
; CHECK-LABEL: @PR16739_byref(
; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X:%.*]], i64 0, i64 0
; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 1
; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 2
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[GEP0]] to <2 x float>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
; CHECK-NEXT:    [[X2:%.*]] = load float, float* [[GEP2]], align 4
; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
; CHECK-NEXT:    [[I0:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0
; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
; CHECK-NEXT:    [[I1:%.*]] = insertelement <4 x float> [[I0]], float [[TMP4]], i32 1
; CHECK-NEXT:    [[I2:%.*]] = insertelement <4 x float> [[I1]], float [[X2]], i32 2
; CHECK-NEXT:    [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[X2]], i32 3
; CHECK-NEXT:    ret <4 x float> [[I3]]
;
  %gep0 = getelementptr inbounds <4 x float>, <4 x float>* %x, i64 0, i64 0
  %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %x, i64 0, i64 1
  %gep2 = getelementptr inbounds <4 x float>, <4 x float>* %x, i64 0, i64 2
  %x0 = load float, float* %gep0
  %x1 = load float, float* %gep1
  %x2 = load float, float* %gep2
  %i0 = insertelement <4 x float> undef, float %x0, i32 0
  %i1 = insertelement <4 x float> %i0, float %x1, i32 1
  %i2 = insertelement <4 x float> %i1, float %x2, i32 2
  %i3 = insertelement <4 x float> %i2, float %x2, i32 3
  ret <4 x float> %i3
}

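; Here both scalar loads are combined into a <2 x float> load, and the duplicated lanes are produced with a shufflevector before being inserted into the result.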
define <4 x float> @PR16739_byref_alt(<4 x float>* nocapture readonly dereferenceable(16) %x) {
; CHECK-LABEL: @PR16739_byref_alt(
; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X:%.*]], i64 0, i64 0
; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 1
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[GEP0]] to <2 x float>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 0
; CHECK-NEXT:    [[I0:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0
; CHECK-NEXT:    [[I1:%.*]] = insertelement <4 x float> [[I0]], float [[TMP3]], i32 1
; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 2
; CHECK-NEXT:    [[I2:%.*]] = insertelement <4 x float> [[I1]], float [[TMP4]], i32 2
; CHECK-NEXT:    [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[TMP4]], i32 3
; CHECK-NEXT:    ret <4 x float> [[I3]]
;
  %gep0 = getelementptr inbounds <4 x float>, <4 x float>* %x, i64 0, i64 0
  %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %x, i64 0, i64 1
  %x0 = load float, float* %gep0
  %x1 = load float, float* %gep1
  %i0 = insertelement <4 x float> undef, float %x0, i32 0
  %i1 = insertelement <4 x float> %i0, float %x0, i32 1
  %i2 = insertelement <4 x float> %i1, float %x1, i32 2
  %i3 = insertelement <4 x float> %i2, float %x1, i32 3
  ret <4 x float> %i3
}

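; The same value is read as two i64 chunks and split apart with trunc/lshr/bitcast; the CHECK lines show this scalar sequence is left untouched.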
define <4 x float> @PR16739_byval(<4 x float>* nocapture readonly dereferenceable(16) %x) {
; CHECK-LABEL: @PR16739_byval(
; CHECK-NEXT:    [[T0:%.*]] = bitcast <4 x float>* [[X:%.*]] to i64*
; CHECK-NEXT:    [[T1:%.*]] = load i64, i64* [[T0]], align 16
; CHECK-NEXT:    [[T2:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 2
; CHECK-NEXT:    [[T3:%.*]] = bitcast float* [[T2]] to i64*
; CHECK-NEXT:    [[T4:%.*]] = load i64, i64* [[T3]], align 8
; CHECK-NEXT:    [[T5:%.*]] = trunc i64 [[T1]] to i32
; CHECK-NEXT:    [[T6:%.*]] = bitcast i32 [[T5]] to float
; CHECK-NEXT:    [[T7:%.*]] = insertelement <4 x float> undef, float [[T6]], i32 0
; CHECK-NEXT:    [[T8:%.*]] = lshr i64 [[T1]], 32
; CHECK-NEXT:    [[T9:%.*]] = trunc i64 [[T8]] to i32
; CHECK-NEXT:    [[T10:%.*]] = bitcast i32 [[T9]] to float
; CHECK-NEXT:    [[T11:%.*]] = insertelement <4 x float> [[T7]], float [[T10]], i32 1
; CHECK-NEXT:    [[T12:%.*]] = trunc i64 [[T4]] to i32
; CHECK-NEXT:    [[T13:%.*]] = bitcast i32 [[T12]] to float
; CHECK-NEXT:    [[T14:%.*]] = insertelement <4 x float> [[T11]], float [[T13]], i32 2
; CHECK-NEXT:    [[T15:%.*]] = insertelement <4 x float> [[T14]], float [[T13]], i32 3
; CHECK-NEXT:    ret <4 x float> [[T15]]
;
  %t0 = bitcast <4 x float>* %x to i64*
  %t1 = load i64, i64* %t0, align 16
  %t2 = getelementptr inbounds <4 x float>, <4 x float>* %x, i64 0, i64 2
  %t3 = bitcast float* %t2 to i64*
  %t4 = load i64, i64* %t3, align 8
  %t5 = trunc i64 %t1 to i32
  %t6 = bitcast i32 %t5 to float
  %t7 = insertelement <4 x float> undef, float %t6, i32 0
  %t8 = lshr i64 %t1, 32
  %t9 = trunc i64 %t8 to i32
  %t10 = bitcast i32 %t9 to float
  %t11 = insertelement <4 x float> %t7, float %t10, i32 1
  %t12 = trunc i64 %t4 to i32
  %t13 = bitcast i32 %t12 to float
  %t14 = insertelement <4 x float> %t11, float %t13, i32 2
  %t15 = insertelement <4 x float> %t14, float %t13, i32 3
  ret <4 x float> %t15
}

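; With "prefer-vector-width"="128" (attribute #0 below), the four i64 subtracts are vectorized as two <2 x i64> operations rather than a single <4 x i64>.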
define void @PR43578_prefer128(i32* %r, i64* %p, i64* %q) #0 {
; CHECK-LABEL: @PR43578_prefer128(
; CHECK-NEXT:    [[P0:%.*]] = getelementptr inbounds i64, i64* [[P:%.*]], i64 0
; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 1
; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 2
; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 3
; CHECK-NEXT:    [[Q0:%.*]] = getelementptr inbounds i64, i64* [[Q:%.*]], i64 0
; CHECK-NEXT:    [[Q1:%.*]] = getelementptr inbounds i64, i64* [[Q]], i64 1
; CHECK-NEXT:    [[Q2:%.*]] = getelementptr inbounds i64, i64* [[Q]], i64 2
; CHECK-NEXT:    [[Q3:%.*]] = getelementptr inbounds i64, i64* [[Q]], i64 3
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64* [[P0]] to <2 x i64>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 2
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[P2]] to <2 x i64>*
; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 2
; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i64* [[Q0]] to <2 x i64>*
; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP5]], align 2
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i64* [[Q2]] to <2 x i64>*
; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 2
; CHECK-NEXT:    [[TMP9:%.*]] = sub nsw <2 x i64> [[TMP2]], [[TMP6]]
; CHECK-NEXT:    [[TMP10:%.*]] = sub nsw <2 x i64> [[TMP4]], [[TMP8]]
; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0
; CHECK-NEXT:    [[G0:%.*]] = getelementptr inbounds i32, i32* [[R:%.*]], i64 [[TMP11]]
; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1
; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[TMP12]]
; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i64> [[TMP10]], i32 0
; CHECK-NEXT:    [[G2:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[TMP13]]
; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP10]], i32 1
; CHECK-NEXT:    [[G3:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[TMP14]]
; CHECK-NEXT:    ret void
;
  %p0 = getelementptr inbounds i64, i64* %p, i64 0
  %p1 = getelementptr inbounds i64, i64* %p, i64 1
  %p2 = getelementptr inbounds i64, i64* %p, i64 2
  %p3 = getelementptr inbounds i64, i64* %p, i64 3

  %q0 = getelementptr inbounds i64, i64* %q, i64 0
  %q1 = getelementptr inbounds i64, i64* %q, i64 1
  %q2 = getelementptr inbounds i64, i64* %q, i64 2
  %q3 = getelementptr inbounds i64, i64* %q, i64 3

  %x0 = load i64, i64* %p0, align 2
  %x1 = load i64, i64* %p1, align 2
  %x2 = load i64, i64* %p2, align 2
  %x3 = load i64, i64* %p3, align 2

  %y0 = load i64, i64* %q0, align 2
  %y1 = load i64, i64* %q1, align 2
  %y2 = load i64, i64* %q2, align 2
  %y3 = load i64, i64* %q3, align 2

  %sub0 = sub nsw i64 %x0, %y0
  %sub1 = sub nsw i64 %x1, %y1
  %sub2 = sub nsw i64 %x2, %y2
  %sub3 = sub nsw i64 %x3, %y3

  %g0 = getelementptr inbounds i32, i32* %r, i64 %sub0
  %g1 = getelementptr inbounds i32, i32* %r, i64 %sub1
  %g2 = getelementptr inbounds i32, i32* %r, i64 %sub2
  %g3 = getelementptr inbounds i32, i32* %r, i64 %sub3
  ret void
}

attributes #0 = { "prefer-vector-width"="128" }