; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=SSE2
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=SLM
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+avx512bw -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX

;
; vXi8
;

; Builds a <2 x i64> from two adjacent bytes (%p0 and %p0 + 1), each loaded as
; a scalar i8 and zero-extended to i64. The assertions below expect the SSE2
; and AVX runs to produce a single <2 x i8> load followed by one vector zext,
; and the SLM run to leave the scalar loads and extensions in place.
define <2 x i64> @loadext_2i8_to_2i64(i8* %p0) {
; SSE2-LABEL: @loadext_2i8_to_2i64(
; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
; SSE2-NEXT:    [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
; SSE2-NEXT:    [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64>
; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
; SSE2-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
; SSE2-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
; SSE2-NEXT:    ret <2 x i64> [[V1]]
;
; SLM-LABEL: @loadext_2i8_to_2i64(
; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
; SLM-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
; SLM-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
; SLM-NEXT:    [[X0:%.*]] = zext i8 [[I0]] to i64
; SLM-NEXT:    [[X1:%.*]] = zext i8 [[I1]] to i64
; SLM-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
; SLM-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
; SLM-NEXT:    ret <2 x i64> [[V1]]
;
; AVX-LABEL: @loadext_2i8_to_2i64(
; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
; AVX-NEXT:    [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
; AVX-NEXT:    [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64>
; AVX-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
; AVX-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
; AVX-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
; AVX-NEXT:    ret <2 x i64> [[V1]]
;
  ; Address of the byte that follows %p0.
  %p1 = getelementptr inbounds i8, i8* %p0, i64 1
  ; One scalar byte load per result lane.
  %i0 = load i8, i8* %p0, align 1
  %i1 = load i8, i8* %p1, align 1
  ; Widen each byte with zero extension.
  %x0 = zext i8 %i0 to i64
  %x1 = zext i8 %i1 to i64
  ; Assemble the result vector lane by lane.
  %v0 = insertelement <2 x i64> undef, i64 %x0, i32 0
  %v1 = insertelement <2 x i64>   %v0, i64 %x1, i32 1
  ret <2 x i64> %v1
}

; Builds a <4 x i32> from four adjacent bytes (%p0 .. %p0 + 3), each loaded as
; a scalar i8 and zero-extended to i32. The assertions below expect the SSE2
; and AVX runs to produce a single <4 x i8> load followed by one vector zext,
; and the SLM run to leave the scalar loads and extensions in place.
define <4 x i32> @loadext_4i8_to_4i32(i8* %p0) {
; SSE2-LABEL: @loadext_4i8_to_4i32(
; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
; SSE2-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32>
; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
; SSE2-NEXT:    ret <4 x i32> [[V3]]
;
; SLM-LABEL: @loadext_4i8_to_4i32(
; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
; SLM-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
; SLM-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
; SLM-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
; SLM-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
; SLM-NEXT:    [[X0:%.*]] = zext i8 [[I0]] to i32
; SLM-NEXT:    [[X1:%.*]] = zext i8 [[I1]] to i32
; SLM-NEXT:    [[X2:%.*]] = zext i8 [[I2]] to i32
; SLM-NEXT:    [[X3:%.*]] = zext i8 [[I3]] to i32
; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[X0]], i32 0
; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1
; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2
; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3
; SLM-NEXT:    ret <4 x i32> [[V3]]
;
; AVX-LABEL: @loadext_4i8_to_4i32(
; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
; AVX-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32>
; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
; AVX-NEXT:    ret <4 x i32> [[V3]]
;
  ; Addresses of the bytes that follow %p0.
  %p1 = getelementptr inbounds i8, i8* %p0, i64 1
  %p2 = getelementptr inbounds i8, i8* %p0, i64 2
  %p3 = getelementptr inbounds i8, i8* %p0, i64 3
  ; One scalar byte load per result lane.
  %i0 = load i8, i8* %p0, align 1
  %i1 = load i8, i8* %p1, align 1
  %i2 = load i8, i8* %p2, align 1
  %i3 = load i8, i8* %p3, align 1
  ; Widen each byte with zero extension.
  %x0 = zext i8 %i0 to i32
  %x1 = zext i8 %i1 to i32
  %x2 = zext i8 %i2 to i32
  %x3 = zext i8 %i3 to i32
  ; Assemble the result vector lane by lane.
  %v0 = insertelement <4 x i32> undef, i32 %x0, i32 0
  %v1 = insertelement <4 x i32>   %v0, i32 %x1, i32 1
  %v2 = insertelement <4 x i32>   %v1, i32 %x2, i32 2
  %v3 = insertelement <4 x i32>   %v2, i32 %x3, i32 3
  ret <4 x i32> %v3
}

; Builds a <4 x i64> from four adjacent bytes (%p0 .. %p0 + 3), each loaded as
; a scalar i8 and zero-extended to i64. The assertions below expect the SSE2
; and AVX runs to produce a single <4 x i8> load followed by one vector zext,
; and the SLM run to leave the scalar loads and extensions in place.
define <4 x i64> @loadext_4i8_to_4i64(i8* %p0) {
; SSE2-LABEL: @loadext_4i8_to_4i64(
; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
; SSE2-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
; SSE2-NEXT:    ret <4 x i64> [[V3]]
;
; SLM-LABEL: @loadext_4i8_to_4i64(
; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
; SLM-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
; SLM-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
; SLM-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
; SLM-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
; SLM-NEXT:    [[X0:%.*]] = zext i8 [[I0]] to i64
; SLM-NEXT:    [[X1:%.*]] = zext i8 [[I1]] to i64
; SLM-NEXT:    [[X2:%.*]] = zext i8 [[I2]] to i64
; SLM-NEXT:    [[X3:%.*]] = zext i8 [[I3]] to i64
; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
; SLM-NEXT:    ret <4 x i64> [[V3]]
;
; AVX-LABEL: @loadext_4i8_to_4i64(
; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
; AVX-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
; AVX-NEXT:    ret <4 x i64> [[V3]]
;
  ; Addresses of the bytes that follow %p0.
  %p1 = getelementptr inbounds i8, i8* %p0, i64 1
  %p2 = getelementptr inbounds i8, i8* %p0, i64 2
  %p3 = getelementptr inbounds i8, i8* %p0, i64 3
  ; One scalar byte load per result lane.
  %i0 = load i8, i8* %p0, align 1
  %i1 = load i8, i8* %p1, align 1
  %i2 = load i8, i8* %p2, align 1
  %i3 = load i8, i8* %p3, align 1
  ; Widen each byte with zero extension.
  %x0 = zext i8 %i0 to i64
  %x1 = zext i8 %i1 to i64
  %x2 = zext i8 %i2 to i64
  %x3 = zext i8 %i3 to i64
  ; Assemble the result vector lane by lane.
  %v0 = insertelement <4 x i64> undef, i64 %x0, i32 0
  %v1 = insertelement <4 x i64>   %v0, i64 %x1, i32 1
  %v2 = insertelement <4 x i64>   %v1, i64 %x2, i32 2
  %v3 = insertelement <4 x i64>   %v2, i64 %x3, i32 3
  ret <4 x i64> %v3
}

; Builds an <8 x i16> from eight adjacent bytes (%p0 .. %p0 + 7), each loaded
; as a scalar i8 and zero-extended to i16. The assertions below expect the SSE2
; and AVX runs to produce a single <8 x i8> load followed by one vector zext,
; and the SLM run to leave the scalar loads and extensions in place.
define <8 x i16> @loadext_8i8_to_8i16(i8* %p0) {
; SSE2-LABEL: @loadext_8i8_to_8i16(
; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
; SSE2-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
; SSE2-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
; SSE2-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
; SSE2-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
; SSE2-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
; SSE2-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16>
; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
; SSE2-NEXT:    [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP4]], i32 0
; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1
; SSE2-NEXT:    [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1
; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2
; SSE2-NEXT:    [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2
; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
; SSE2-NEXT:    [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3
; SSE2-NEXT:    [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4
; SSE2-NEXT:    [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4
; SSE2-NEXT:    [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5
; SSE2-NEXT:    [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5
; SSE2-NEXT:    [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6
; SSE2-NEXT:    [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6
; SSE2-NEXT:    [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7
; SSE2-NEXT:    [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7
; SSE2-NEXT:    ret <8 x i16> [[V7]]
;
; SLM-LABEL: @loadext_8i8_to_8i16(
; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
; SLM-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
; SLM-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
; SLM-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
; SLM-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
; SLM-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
; SLM-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
; SLM-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
; SLM-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
; SLM-NEXT:    [[I4:%.*]] = load i8, i8* [[P4]], align 1
; SLM-NEXT:    [[I5:%.*]] = load i8, i8* [[P5]], align 1
; SLM-NEXT:    [[I6:%.*]] = load i8, i8* [[P6]], align 1
; SLM-NEXT:    [[I7:%.*]] = load i8, i8* [[P7]], align 1
; SLM-NEXT:    [[X0:%.*]] = zext i8 [[I0]] to i16
; SLM-NEXT:    [[X1:%.*]] = zext i8 [[I1]] to i16
; SLM-NEXT:    [[X2:%.*]] = zext i8 [[I2]] to i16
; SLM-NEXT:    [[X3:%.*]] = zext i8 [[I3]] to i16
; SLM-NEXT:    [[X4:%.*]] = zext i8 [[I4]] to i16
; SLM-NEXT:    [[X5:%.*]] = zext i8 [[I5]] to i16
; SLM-NEXT:    [[X6:%.*]] = zext i8 [[I6]] to i16
; SLM-NEXT:    [[X7:%.*]] = zext i8 [[I7]] to i16
; SLM-NEXT:    [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[X0]], i32 0
; SLM-NEXT:    [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[X1]], i32 1
; SLM-NEXT:    [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[X2]], i32 2
; SLM-NEXT:    [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[X3]], i32 3
; SLM-NEXT:    [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[X4]], i32 4
; SLM-NEXT:    [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[X5]], i32 5
; SLM-NEXT:    [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[X6]], i32 6
; SLM-NEXT:    [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[X7]], i32 7
; SLM-NEXT:    ret <8 x i16> [[V7]]
;
; AVX-LABEL: @loadext_8i8_to_8i16(
; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
; AVX-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
; AVX-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
; AVX-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
; AVX-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
; AVX-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16>
; AVX-NEXT:    [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
; AVX-NEXT:    [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP4]], i32 0
; AVX-NEXT:    [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1
; AVX-NEXT:    [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1
; AVX-NEXT:    [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2
; AVX-NEXT:    [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2
; AVX-NEXT:    [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
; AVX-NEXT:    [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3
; AVX-NEXT:    [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4
; AVX-NEXT:    [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4
; AVX-NEXT:    [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5
; AVX-NEXT:    [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5
; AVX-NEXT:    [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6
; AVX-NEXT:    [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6
; AVX-NEXT:    [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7
; AVX-NEXT:    [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7
; AVX-NEXT:    ret <8 x i16> [[V7]]
;
  ; Addresses of the bytes that follow %p0.
  %p1 = getelementptr inbounds i8, i8* %p0, i64 1
  %p2 = getelementptr inbounds i8, i8* %p0, i64 2
  %p3 = getelementptr inbounds i8, i8* %p0, i64 3
  %p4 = getelementptr inbounds i8, i8* %p0, i64 4
  %p5 = getelementptr inbounds i8, i8* %p0, i64 5
  %p6 = getelementptr inbounds i8, i8* %p0, i64 6
  %p7 = getelementptr inbounds i8, i8* %p0, i64 7
  ; One scalar byte load per result lane.
  %i0 = load i8, i8* %p0, align 1
  %i1 = load i8, i8* %p1, align 1
  %i2 = load i8, i8* %p2, align 1
  %i3 = load i8, i8* %p3, align 1
  %i4 = load i8, i8* %p4, align 1
  %i5 = load i8, i8* %p5, align 1
  %i6 = load i8, i8* %p6, align 1
  %i7 = load i8, i8* %p7, align 1
  ; Widen each byte with zero extension.
  %x0 = zext i8 %i0 to i16
  %x1 = zext i8 %i1 to i16
  %x2 = zext i8 %i2 to i16
  %x3 = zext i8 %i3 to i16
  %x4 = zext i8 %i4 to i16
  %x5 = zext i8 %i5 to i16
  %x6 = zext i8 %i6 to i16
  %x7 = zext i8 %i7 to i16
  ; Assemble the result vector lane by lane.
  %v0 = insertelement <8 x i16> undef, i16 %x0, i32 0
  %v1 = insertelement <8 x i16>   %v0, i16 %x1, i32 1
  %v2 = insertelement <8 x i16>   %v1, i16 %x2, i32 2
  %v3 = insertelement <8 x i16>   %v2, i16 %x3, i32 3
  %v4 = insertelement <8 x i16>   %v3, i16 %x4, i32 4
  %v5 = insertelement <8 x i16>   %v4, i16 %x5, i32 5
  %v6 = insertelement <8 x i16>   %v5, i16 %x6, i32 6
  %v7 = insertelement <8 x i16>   %v6, i16 %x7, i32 7
  ret <8 x i16> %v7
}

; Builds an <8 x i32> from eight adjacent bytes (%p0 .. %p0 + 7), each loaded
; as a scalar i8 and zero-extended to i32. The assertions below expect the SSE2
; and AVX runs to produce a single <8 x i8> load followed by one vector zext,
; and the SLM run to leave the scalar loads and extensions in place.
define <8 x i32> @loadext_8i8_to_8i32(i8* %p0) {
; SSE2-LABEL: @loadext_8i8_to_8i32(
; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
; SSE2-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
; SSE2-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
; SSE2-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
; SSE2-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
; SSE2-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
; SSE2-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i32>
; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
; SSE2-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
; SSE2-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
; SSE2-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
; SSE2-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
; SSE2-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
; SSE2-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
; SSE2-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
; SSE2-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
; SSE2-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
; SSE2-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
; SSE2-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
; SSE2-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
; SSE2-NEXT:    ret <8 x i32> [[V7]]
;
; SLM-LABEL: @loadext_8i8_to_8i32(
; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
; SLM-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
; SLM-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
; SLM-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
; SLM-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
; SLM-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
; SLM-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
; SLM-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
; SLM-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
; SLM-NEXT:    [[I4:%.*]] = load i8, i8* [[P4]], align 1
; SLM-NEXT:    [[I5:%.*]] = load i8, i8* [[P5]], align 1
; SLM-NEXT:    [[I6:%.*]] = load i8, i8* [[P6]], align 1
; SLM-NEXT:    [[I7:%.*]] = load i8, i8* [[P7]], align 1
; SLM-NEXT:    [[X0:%.*]] = zext i8 [[I0]] to i32
; SLM-NEXT:    [[X1:%.*]] = zext i8 [[I1]] to i32
; SLM-NEXT:    [[X2:%.*]] = zext i8 [[I2]] to i32
; SLM-NEXT:    [[X3:%.*]] = zext i8 [[I3]] to i32
; SLM-NEXT:    [[X4:%.*]] = zext i8 [[I4]] to i32
; SLM-NEXT:    [[X5:%.*]] = zext i8 [[I5]] to i32
; SLM-NEXT:    [[X6:%.*]] = zext i8 [[I6]] to i32
; SLM-NEXT:    [[X7:%.*]] = zext i8 [[I7]] to i32
; SLM-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[X0]], i32 0
; SLM-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[X1]], i32 1
; SLM-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[X2]], i32 2
; SLM-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[X3]], i32 3
; SLM-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[X4]], i32 4
; SLM-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[X5]], i32 5
; SLM-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[X6]], i32 6
; SLM-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[X7]], i32 7
; SLM-NEXT:    ret <8 x i32> [[V7]]
;
; AVX-LABEL: @loadext_8i8_to_8i32(
; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
; AVX-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
; AVX-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
; AVX-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
; AVX-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
; AVX-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i32>
; AVX-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
; AVX-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
; AVX-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
; AVX-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
; AVX-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
; AVX-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
; AVX-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
; AVX-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
; AVX-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
; AVX-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
; AVX-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
; AVX-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
; AVX-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
; AVX-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
; AVX-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
; AVX-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
; AVX-NEXT:    ret <8 x i32> [[V7]]
;
  ; Addresses of the bytes that follow %p0.
  %p1 = getelementptr inbounds i8, i8* %p0, i64 1
  %p2 = getelementptr inbounds i8, i8* %p0, i64 2
  %p3 = getelementptr inbounds i8, i8* %p0, i64 3
  %p4 = getelementptr inbounds i8, i8* %p0, i64 4
  %p5 = getelementptr inbounds i8, i8* %p0, i64 5
  %p6 = getelementptr inbounds i8, i8* %p0, i64 6
  %p7 = getelementptr inbounds i8, i8* %p0, i64 7
  ; One scalar byte load per result lane.
  %i0 = load i8, i8* %p0, align 1
  %i1 = load i8, i8* %p1, align 1
  %i2 = load i8, i8* %p2, align 1
  %i3 = load i8, i8* %p3, align 1
  %i4 = load i8, i8* %p4, align 1
  %i5 = load i8, i8* %p5, align 1
  %i6 = load i8, i8* %p6, align 1
  %i7 = load i8, i8* %p7, align 1
  ; Widen each byte with zero extension.
  %x0 = zext i8 %i0 to i32
  %x1 = zext i8 %i1 to i32
  %x2 = zext i8 %i2 to i32
  %x3 = zext i8 %i3 to i32
  %x4 = zext i8 %i4 to i32
  %x5 = zext i8 %i5 to i32
  %x6 = zext i8 %i6 to i32
  %x7 = zext i8 %i7 to i32
  ; Assemble the result vector lane by lane.
  %v0 = insertelement <8 x i32> undef, i32 %x0, i32 0
  %v1 = insertelement <8 x i32>   %v0, i32 %x1, i32 1
  %v2 = insertelement <8 x i32>   %v1, i32 %x2, i32 2
  %v3 = insertelement <8 x i32>   %v2, i32 %x3, i32 3
  %v4 = insertelement <8 x i32>   %v3, i32 %x4, i32 4
  %v5 = insertelement <8 x i32>   %v4, i32 %x5, i32 5
  %v6 = insertelement <8 x i32>   %v5, i32 %x6, i32 6
  %v7 = insertelement <8 x i32>   %v6, i32 %x7, i32 7
  ret <8 x i32> %v7
}

452define <16 x i16> @loadext_16i8_to_16i16(i8* %p0) {
453; SSE2-LABEL: @loadext_16i8_to_16i16(
454; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
455; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
456; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
457; SSE2-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
458; SSE2-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
459; SSE2-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
460; SSE2-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
461; SSE2-NEXT:    [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8
462; SSE2-NEXT:    [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9
463; SSE2-NEXT:    [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10
464; SSE2-NEXT:    [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11
465; SSE2-NEXT:    [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12
466; SSE2-NEXT:    [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13
467; SSE2-NEXT:    [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14
468; SSE2-NEXT:    [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15
469; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>*
470; SSE2-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
471; SSE2-NEXT:    [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i16>
472; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0
473; SSE2-NEXT:    [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0
474; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1
475; SSE2-NEXT:    [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1
476; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2
477; SSE2-NEXT:    [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2
478; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3
479; SSE2-NEXT:    [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3
480; SSE2-NEXT:    [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4
481; SSE2-NEXT:    [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4
482; SSE2-NEXT:    [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5
483; SSE2-NEXT:    [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5
484; SSE2-NEXT:    [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6
485; SSE2-NEXT:    [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6
486; SSE2-NEXT:    [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7
487; SSE2-NEXT:    [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7
488; SSE2-NEXT:    [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8
489; SSE2-NEXT:    [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8
490; SSE2-NEXT:    [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9
491; SSE2-NEXT:    [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9
492; SSE2-NEXT:    [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10
493; SSE2-NEXT:    [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10
494; SSE2-NEXT:    [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11
495; SSE2-NEXT:    [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11
496; SSE2-NEXT:    [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12
497; SSE2-NEXT:    [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12
498; SSE2-NEXT:    [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13
499; SSE2-NEXT:    [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13
500; SSE2-NEXT:    [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14
501; SSE2-NEXT:    [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14
502; SSE2-NEXT:    [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15
503; SSE2-NEXT:    [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15
504; SSE2-NEXT:    ret <16 x i16> [[V15]]
505;
506; SLM-LABEL: @loadext_16i8_to_16i16(
507; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
508; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
509; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
510; SLM-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
511; SLM-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
512; SLM-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
513; SLM-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
514; SLM-NEXT:    [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8
515; SLM-NEXT:    [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9
516; SLM-NEXT:    [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10
517; SLM-NEXT:    [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11
518; SLM-NEXT:    [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12
519; SLM-NEXT:    [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13
520; SLM-NEXT:    [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14
521; SLM-NEXT:    [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15
522; SLM-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
523; SLM-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
524; SLM-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
525; SLM-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
526; SLM-NEXT:    [[I4:%.*]] = load i8, i8* [[P4]], align 1
527; SLM-NEXT:    [[I5:%.*]] = load i8, i8* [[P5]], align 1
528; SLM-NEXT:    [[I6:%.*]] = load i8, i8* [[P6]], align 1
529; SLM-NEXT:    [[I7:%.*]] = load i8, i8* [[P7]], align 1
530; SLM-NEXT:    [[I8:%.*]] = load i8, i8* [[P8]], align 1
531; SLM-NEXT:    [[I9:%.*]] = load i8, i8* [[P9]], align 1
532; SLM-NEXT:    [[I10:%.*]] = load i8, i8* [[P10]], align 1
533; SLM-NEXT:    [[I11:%.*]] = load i8, i8* [[P11]], align 1
534; SLM-NEXT:    [[I12:%.*]] = load i8, i8* [[P12]], align 1
535; SLM-NEXT:    [[I13:%.*]] = load i8, i8* [[P13]], align 1
536; SLM-NEXT:    [[I14:%.*]] = load i8, i8* [[P14]], align 1
537; SLM-NEXT:    [[I15:%.*]] = load i8, i8* [[P15]], align 1
538; SLM-NEXT:    [[X0:%.*]] = zext i8 [[I0]] to i16
539; SLM-NEXT:    [[X1:%.*]] = zext i8 [[I1]] to i16
540; SLM-NEXT:    [[X2:%.*]] = zext i8 [[I2]] to i16
541; SLM-NEXT:    [[X3:%.*]] = zext i8 [[I3]] to i16
542; SLM-NEXT:    [[X4:%.*]] = zext i8 [[I4]] to i16
543; SLM-NEXT:    [[X5:%.*]] = zext i8 [[I5]] to i16
544; SLM-NEXT:    [[X6:%.*]] = zext i8 [[I6]] to i16
545; SLM-NEXT:    [[X7:%.*]] = zext i8 [[I7]] to i16
546; SLM-NEXT:    [[X8:%.*]] = zext i8 [[I8]] to i16
547; SLM-NEXT:    [[X9:%.*]] = zext i8 [[I9]] to i16
548; SLM-NEXT:    [[X10:%.*]] = zext i8 [[I10]] to i16
549; SLM-NEXT:    [[X11:%.*]] = zext i8 [[I11]] to i16
550; SLM-NEXT:    [[X12:%.*]] = zext i8 [[I12]] to i16
551; SLM-NEXT:    [[X13:%.*]] = zext i8 [[I13]] to i16
552; SLM-NEXT:    [[X14:%.*]] = zext i8 [[I14]] to i16
553; SLM-NEXT:    [[X15:%.*]] = zext i8 [[I15]] to i16
554; SLM-NEXT:    [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[X0]], i32 0
555; SLM-NEXT:    [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[X1]], i32 1
556; SLM-NEXT:    [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[X2]], i32 2
557; SLM-NEXT:    [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[X3]], i32 3
558; SLM-NEXT:    [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[X4]], i32 4
559; SLM-NEXT:    [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[X5]], i32 5
560; SLM-NEXT:    [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[X6]], i32 6
561; SLM-NEXT:    [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[X7]], i32 7
562; SLM-NEXT:    [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[X8]], i32 8
563; SLM-NEXT:    [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[X9]], i32 9
564; SLM-NEXT:    [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[X10]], i32 10
565; SLM-NEXT:    [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[X11]], i32 11
566; SLM-NEXT:    [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[X12]], i32 12
567; SLM-NEXT:    [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[X13]], i32 13
568; SLM-NEXT:    [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[X14]], i32 14
569; SLM-NEXT:    [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[X15]], i32 15
570; SLM-NEXT:    ret <16 x i16> [[V15]]
571;
572; AVX-LABEL: @loadext_16i8_to_16i16(
573; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
574; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
575; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
576; AVX-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
577; AVX-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
578; AVX-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
579; AVX-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
580; AVX-NEXT:    [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8
581; AVX-NEXT:    [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9
582; AVX-NEXT:    [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10
583; AVX-NEXT:    [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11
584; AVX-NEXT:    [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12
585; AVX-NEXT:    [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13
586; AVX-NEXT:    [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14
587; AVX-NEXT:    [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15
588; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>*
589; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
590; AVX-NEXT:    [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i16>
591; AVX-NEXT:    [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0
592; AVX-NEXT:    [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0
593; AVX-NEXT:    [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1
594; AVX-NEXT:    [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1
595; AVX-NEXT:    [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2
596; AVX-NEXT:    [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2
597; AVX-NEXT:    [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3
598; AVX-NEXT:    [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3
599; AVX-NEXT:    [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4
600; AVX-NEXT:    [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4
601; AVX-NEXT:    [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5
602; AVX-NEXT:    [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5
603; AVX-NEXT:    [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6
604; AVX-NEXT:    [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6
605; AVX-NEXT:    [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7
606; AVX-NEXT:    [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7
607; AVX-NEXT:    [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8
608; AVX-NEXT:    [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8
609; AVX-NEXT:    [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9
610; AVX-NEXT:    [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9
611; AVX-NEXT:    [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10
612; AVX-NEXT:    [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10
613; AVX-NEXT:    [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11
614; AVX-NEXT:    [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11
615; AVX-NEXT:    [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12
616; AVX-NEXT:    [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12
617; AVX-NEXT:    [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13
618; AVX-NEXT:    [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13
619; AVX-NEXT:    [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14
620; AVX-NEXT:    [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14
621; AVX-NEXT:    [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15
622; AVX-NEXT:    [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15
623; AVX-NEXT:    ret <16 x i16> [[V15]]
624;
625  %p1  = getelementptr inbounds i8, i8* %p0, i64 1
626  %p2  = getelementptr inbounds i8, i8* %p0, i64 2
627  %p3  = getelementptr inbounds i8, i8* %p0, i64 3
628  %p4  = getelementptr inbounds i8, i8* %p0, i64 4
629  %p5  = getelementptr inbounds i8, i8* %p0, i64 5
630  %p6  = getelementptr inbounds i8, i8* %p0, i64 6
631  %p7  = getelementptr inbounds i8, i8* %p0, i64 7
632  %p8  = getelementptr inbounds i8, i8* %p0, i64 8
633  %p9  = getelementptr inbounds i8, i8* %p0, i64 9
634  %p10 = getelementptr inbounds i8, i8* %p0, i64 10
635  %p11 = getelementptr inbounds i8, i8* %p0, i64 11
636  %p12 = getelementptr inbounds i8, i8* %p0, i64 12
637  %p13 = getelementptr inbounds i8, i8* %p0, i64 13
638  %p14 = getelementptr inbounds i8, i8* %p0, i64 14
639  %p15 = getelementptr inbounds i8, i8* %p0, i64 15
640  %i0  = load i8, i8* %p0,  align 1
641  %i1  = load i8, i8* %p1,  align 1
642  %i2  = load i8, i8* %p2,  align 1
643  %i3  = load i8, i8* %p3,  align 1
644  %i4  = load i8, i8* %p4,  align 1
645  %i5  = load i8, i8* %p5,  align 1
646  %i6  = load i8, i8* %p6,  align 1
647  %i7  = load i8, i8* %p7,  align 1
648  %i8  = load i8, i8* %p8,  align 1
649  %i9  = load i8, i8* %p9,  align 1
650  %i10 = load i8, i8* %p10, align 1
651  %i11 = load i8, i8* %p11, align 1
652  %i12 = load i8, i8* %p12, align 1
653  %i13 = load i8, i8* %p13, align 1
654  %i14 = load i8, i8* %p14, align 1
655  %i15 = load i8, i8* %p15, align 1
656  %x0  = zext i8 %i0  to i16
657  %x1  = zext i8 %i1  to i16
658  %x2  = zext i8 %i2  to i16
659  %x3  = zext i8 %i3  to i16
660  %x4  = zext i8 %i4  to i16
661  %x5  = zext i8 %i5  to i16
662  %x6  = zext i8 %i6  to i16
663  %x7  = zext i8 %i7  to i16
664  %x8  = zext i8 %i8  to i16
665  %x9  = zext i8 %i9  to i16
666  %x10 = zext i8 %i10 to i16
667  %x11 = zext i8 %i11 to i16
668  %x12 = zext i8 %i12 to i16
669  %x13 = zext i8 %i13 to i16
670  %x14 = zext i8 %i14 to i16
671  %x15 = zext i8 %i15 to i16
672  %v0  = insertelement <16 x i16> undef, i16 %x0,  i32 0
673  %v1  = insertelement <16 x i16>  %v0,  i16 %x1,  i32 1
674  %v2  = insertelement <16 x i16>  %v1,  i16 %x2,  i32 2
675  %v3  = insertelement <16 x i16>  %v2,  i16 %x3,  i32 3
676  %v4  = insertelement <16 x i16>  %v3,  i16 %x4,  i32 4
677  %v5  = insertelement <16 x i16>  %v4,  i16 %x5,  i32 5
678  %v6  = insertelement <16 x i16>  %v5,  i16 %x6,  i32 6
679  %v7  = insertelement <16 x i16>  %v6,  i16 %x7,  i32 7
680  %v8  = insertelement <16 x i16>  %v7,  i16 %x8,  i32 8
681  %v9  = insertelement <16 x i16>  %v8,  i16 %x9,  i32 9
682  %v10 = insertelement <16 x i16>  %v9,  i16 %x10, i32 10
683  %v11 = insertelement <16 x i16>  %v10, i16 %x11, i32 11
684  %v12 = insertelement <16 x i16>  %v11, i16 %x12, i32 12
685  %v13 = insertelement <16 x i16>  %v12, i16 %x13, i32 13
686  %v14 = insertelement <16 x i16>  %v13, i16 %x14, i32 14
687  %v15 = insertelement <16 x i16>  %v14, i16 %x15, i32 15
688  ret <16 x i16> %v15
689}
690
691;
692; vXi16
693;
694
; Zero-extending load of two adjacent i16 elements into a <2 x i64> result.
; The input IR loads the elements at %p0 and %p0+1 as scalars (align 1),
; zero-extends each to i64, and inserts them into lanes 0 and 1 of the
; returned vector.
; The autogenerated checks expect the SSE2 and AVX runs to use one <2 x i16>
; load followed by a vector zext, while the SLM run keeps the scalar loads
; and extensions.
695define <2 x i64> @loadext_2i16_to_2i64(i16* %p0) {
696; SSE2-LABEL: @loadext_2i16_to_2i64(
697; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
698; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
699; SSE2-NEXT:    [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
700; SSE2-NEXT:    [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64>
701; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
702; SSE2-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
703; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
704; SSE2-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
705; SSE2-NEXT:    ret <2 x i64> [[V1]]
706;
707; SLM-LABEL: @loadext_2i16_to_2i64(
708; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
709; SLM-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
710; SLM-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
711; SLM-NEXT:    [[X0:%.*]] = zext i16 [[I0]] to i64
712; SLM-NEXT:    [[X1:%.*]] = zext i16 [[I1]] to i64
713; SLM-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
714; SLM-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
715; SLM-NEXT:    ret <2 x i64> [[V1]]
716;
717; AVX-LABEL: @loadext_2i16_to_2i64(
718; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
719; AVX-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
720; AVX-NEXT:    [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
721; AVX-NEXT:    [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64>
722; AVX-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
723; AVX-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
724; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
725; AVX-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
726; AVX-NEXT:    ret <2 x i64> [[V1]]
727;
728  %p1 = getelementptr inbounds i16, i16* %p0, i64 1
729  %i0 = load i16, i16* %p0, align 1
730  %i1 = load i16, i16* %p1, align 1
731  %x0 = zext i16 %i0 to i64
732  %x1 = zext i16 %i1 to i64
733  %v0 = insertelement <2 x i64> undef, i64 %x0, i32 0
734  %v1 = insertelement <2 x i64>   %v0, i64 %x1, i32 1
735  ret <2 x i64> %v1
736}
737
; Zero-extending load of four adjacent i16 elements into a <4 x i32> result.
; The input IR loads the elements at %p0 .. %p0+3 as scalars (align 1),
; zero-extends each to i32, and inserts them into lanes 0-3 of the returned
; vector.
; The autogenerated checks expect the SSE2 and AVX runs to use one <4 x i16>
; load followed by a vector zext, while the SLM run keeps the scalar loads
; and extensions.
738define <4 x i32> @loadext_4i16_to_4i32(i16* %p0) {
739; SSE2-LABEL: @loadext_4i16_to_4i32(
740; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
741; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
742; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
743; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
744; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
745; SSE2-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
746; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
747; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
748; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
749; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
750; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
751; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
752; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
753; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
754; SSE2-NEXT:    ret <4 x i32> [[V3]]
755;
756; SLM-LABEL: @loadext_4i16_to_4i32(
757; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
758; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
759; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
760; SLM-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
761; SLM-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
762; SLM-NEXT:    [[I2:%.*]] = load i16, i16* [[P2]], align 1
763; SLM-NEXT:    [[I3:%.*]] = load i16, i16* [[P3]], align 1
764; SLM-NEXT:    [[X0:%.*]] = zext i16 [[I0]] to i32
765; SLM-NEXT:    [[X1:%.*]] = zext i16 [[I1]] to i32
766; SLM-NEXT:    [[X2:%.*]] = zext i16 [[I2]] to i32
767; SLM-NEXT:    [[X3:%.*]] = zext i16 [[I3]] to i32
768; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[X0]], i32 0
769; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1
770; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2
771; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3
772; SLM-NEXT:    ret <4 x i32> [[V3]]
773;
774; AVX-LABEL: @loadext_4i16_to_4i32(
775; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
776; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
777; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
778; AVX-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
779; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
780; AVX-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
781; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
782; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
783; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
784; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
785; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
786; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
787; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
788; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
789; AVX-NEXT:    ret <4 x i32> [[V3]]
790;
791  %p1 = getelementptr inbounds i16, i16* %p0, i64 1
792  %p2 = getelementptr inbounds i16, i16* %p0, i64 2
793  %p3 = getelementptr inbounds i16, i16* %p0, i64 3
794  %i0 = load i16, i16* %p0, align 1
795  %i1 = load i16, i16* %p1, align 1
796  %i2 = load i16, i16* %p2, align 1
797  %i3 = load i16, i16* %p3, align 1
798  %x0 = zext i16 %i0 to i32
799  %x1 = zext i16 %i1 to i32
800  %x2 = zext i16 %i2 to i32
801  %x3 = zext i16 %i3 to i32
802  %v0 = insertelement <4 x i32> undef, i32 %x0, i32 0
803  %v1 = insertelement <4 x i32>   %v0, i32 %x1, i32 1
804  %v2 = insertelement <4 x i32>   %v1, i32 %x2, i32 2
805  %v3 = insertelement <4 x i32>   %v2, i32 %x3, i32 3
806  ret <4 x i32> %v3
807}
808
; Zero-extending load of four adjacent i16 elements into a <4 x i64> result.
; The input IR loads the elements at %p0 .. %p0+3 as scalars (align 1),
; zero-extends each to i64, and inserts them into lanes 0-3 of the returned
; vector.
; The autogenerated checks expect the SSE2 and AVX runs to use one <4 x i16>
; load followed by a vector zext to <4 x i64>, while the SLM run keeps the
; scalar loads and extensions.
809define <4 x i64> @loadext_4i16_to_4i64(i16* %p0) {
810; SSE2-LABEL: @loadext_4i16_to_4i64(
811; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
812; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
813; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
814; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
815; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
816; SSE2-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64>
817; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
818; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
819; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
820; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
821; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
822; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
823; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
824; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
825; SSE2-NEXT:    ret <4 x i64> [[V3]]
826;
827; SLM-LABEL: @loadext_4i16_to_4i64(
828; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
829; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
830; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
831; SLM-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
832; SLM-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
833; SLM-NEXT:    [[I2:%.*]] = load i16, i16* [[P2]], align 1
834; SLM-NEXT:    [[I3:%.*]] = load i16, i16* [[P3]], align 1
835; SLM-NEXT:    [[X0:%.*]] = zext i16 [[I0]] to i64
836; SLM-NEXT:    [[X1:%.*]] = zext i16 [[I1]] to i64
837; SLM-NEXT:    [[X2:%.*]] = zext i16 [[I2]] to i64
838; SLM-NEXT:    [[X3:%.*]] = zext i16 [[I3]] to i64
839; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
840; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
841; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
842; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
843; SLM-NEXT:    ret <4 x i64> [[V3]]
844;
845; AVX-LABEL: @loadext_4i16_to_4i64(
846; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
847; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
848; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
849; AVX-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
850; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
851; AVX-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64>
852; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
853; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
854; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
855; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
856; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
857; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
858; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
859; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
860; AVX-NEXT:    ret <4 x i64> [[V3]]
861;
862  %p1 = getelementptr inbounds i16, i16* %p0, i64 1
863  %p2 = getelementptr inbounds i16, i16* %p0, i64 2
864  %p3 = getelementptr inbounds i16, i16* %p0, i64 3
865  %i0 = load i16, i16* %p0, align 1
866  %i1 = load i16, i16* %p1, align 1
867  %i2 = load i16, i16* %p2, align 1
868  %i3 = load i16, i16* %p3, align 1
869  %x0 = zext i16 %i0 to i64
870  %x1 = zext i16 %i1 to i64
871  %x2 = zext i16 %i2 to i64
872  %x3 = zext i16 %i3 to i64
873  %v0 = insertelement <4 x i64> undef, i64 %x0, i32 0
874  %v1 = insertelement <4 x i64>   %v0, i64 %x1, i32 1
875  %v2 = insertelement <4 x i64>   %v1, i64 %x2, i32 2
876  %v3 = insertelement <4 x i64>   %v2, i64 %x3, i32 3
877  ret <4 x i64> %v3
878}
879
; Zero-extending load of eight adjacent i16 elements into an <8 x i32> result.
; The input IR loads the elements at %p0 .. %p0+7 as scalars (align 1),
; zero-extends each to i32, and inserts them into lanes 0-7 of the returned
; vector.
; The autogenerated checks expect the SSE2 and AVX runs to use one <8 x i16>
; load followed by a vector zext, while the SLM run keeps the scalar loads
; and extensions.
880define <8 x i32> @loadext_8i16_to_8i32(i16* %p0) {
881; SSE2-LABEL: @loadext_8i16_to_8i32(
882; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
883; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
884; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
885; SSE2-NEXT:    [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4
886; SSE2-NEXT:    [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5
887; SSE2-NEXT:    [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6
888; SSE2-NEXT:    [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7
889; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>*
890; SSE2-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1
891; SSE2-NEXT:    [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32>
892; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
893; SSE2-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
894; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
895; SSE2-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
896; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
897; SSE2-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
898; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
899; SSE2-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
900; SSE2-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
901; SSE2-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
902; SSE2-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
903; SSE2-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
904; SSE2-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
905; SSE2-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
906; SSE2-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
907; SSE2-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
908; SSE2-NEXT:    ret <8 x i32> [[V7]]
909;
910; SLM-LABEL: @loadext_8i16_to_8i32(
911; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
912; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
913; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
914; SLM-NEXT:    [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4
915; SLM-NEXT:    [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5
916; SLM-NEXT:    [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6
917; SLM-NEXT:    [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7
918; SLM-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
919; SLM-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
920; SLM-NEXT:    [[I2:%.*]] = load i16, i16* [[P2]], align 1
921; SLM-NEXT:    [[I3:%.*]] = load i16, i16* [[P3]], align 1
922; SLM-NEXT:    [[I4:%.*]] = load i16, i16* [[P4]], align 1
923; SLM-NEXT:    [[I5:%.*]] = load i16, i16* [[P5]], align 1
924; SLM-NEXT:    [[I6:%.*]] = load i16, i16* [[P6]], align 1
925; SLM-NEXT:    [[I7:%.*]] = load i16, i16* [[P7]], align 1
926; SLM-NEXT:    [[X0:%.*]] = zext i16 [[I0]] to i32
927; SLM-NEXT:    [[X1:%.*]] = zext i16 [[I1]] to i32
928; SLM-NEXT:    [[X2:%.*]] = zext i16 [[I2]] to i32
929; SLM-NEXT:    [[X3:%.*]] = zext i16 [[I3]] to i32
930; SLM-NEXT:    [[X4:%.*]] = zext i16 [[I4]] to i32
931; SLM-NEXT:    [[X5:%.*]] = zext i16 [[I5]] to i32
932; SLM-NEXT:    [[X6:%.*]] = zext i16 [[I6]] to i32
933; SLM-NEXT:    [[X7:%.*]] = zext i16 [[I7]] to i32
934; SLM-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[X0]], i32 0
935; SLM-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[X1]], i32 1
936; SLM-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[X2]], i32 2
937; SLM-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[X3]], i32 3
938; SLM-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[X4]], i32 4
939; SLM-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[X5]], i32 5
940; SLM-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[X6]], i32 6
941; SLM-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[X7]], i32 7
942; SLM-NEXT:    ret <8 x i32> [[V7]]
943;
944; AVX-LABEL: @loadext_8i16_to_8i32(
945; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
946; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
947; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
948; AVX-NEXT:    [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4
949; AVX-NEXT:    [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5
950; AVX-NEXT:    [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6
951; AVX-NEXT:    [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7
952; AVX-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>*
953; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1
954; AVX-NEXT:    [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32>
955; AVX-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
956; AVX-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
957; AVX-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
958; AVX-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
959; AVX-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
960; AVX-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
961; AVX-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
962; AVX-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
963; AVX-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
964; AVX-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
965; AVX-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
966; AVX-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
967; AVX-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
968; AVX-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
969; AVX-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
970; AVX-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
971; AVX-NEXT:    ret <8 x i32> [[V7]]
972;
973  %p1 = getelementptr inbounds i16, i16* %p0, i64 1
974  %p2 = getelementptr inbounds i16, i16* %p0, i64 2
975  %p3 = getelementptr inbounds i16, i16* %p0, i64 3
976  %p4 = getelementptr inbounds i16, i16* %p0, i64 4
977  %p5 = getelementptr inbounds i16, i16* %p0, i64 5
978  %p6 = getelementptr inbounds i16, i16* %p0, i64 6
979  %p7 = getelementptr inbounds i16, i16* %p0, i64 7
980  %i0 = load i16, i16* %p0, align 1
981  %i1 = load i16, i16* %p1, align 1
982  %i2 = load i16, i16* %p2, align 1
983  %i3 = load i16, i16* %p3, align 1
984  %i4 = load i16, i16* %p4, align 1
985  %i5 = load i16, i16* %p5, align 1
986  %i6 = load i16, i16* %p6, align 1
987  %i7 = load i16, i16* %p7, align 1
988  %x0 = zext i16 %i0 to i32
989  %x1 = zext i16 %i1 to i32
990  %x2 = zext i16 %i2 to i32
991  %x3 = zext i16 %i3 to i32
992  %x4 = zext i16 %i4 to i32
993  %x5 = zext i16 %i5 to i32
994  %x6 = zext i16 %i6 to i32
995  %x7 = zext i16 %i7 to i32
996  %v0 = insertelement <8 x i32> undef, i32 %x0, i32 0
997  %v1 = insertelement <8 x i32>   %v0, i32 %x1, i32 1
998  %v2 = insertelement <8 x i32>   %v1, i32 %x2, i32 2
999  %v3 = insertelement <8 x i32>   %v2, i32 %x3, i32 3
1000  %v4 = insertelement <8 x i32>   %v3, i32 %x4, i32 4
1001  %v5 = insertelement <8 x i32>   %v4, i32 %x5, i32 5
1002  %v6 = insertelement <8 x i32>   %v5, i32 %x6, i32 6
1003  %v7 = insertelement <8 x i32>   %v6, i32 %x7, i32 7
1004  ret <8 x i32> %v7
1005}
1006
1007;
1008; vXi32
1009;
1010
; Zero-extending load of two adjacent i32 elements into a <2 x i64> result.
; The input IR loads the elements at %p0 and %p0+1 as scalars (align 1),
; zero-extends each to i64, and inserts them into lanes 0 and 1 of the
; returned vector.
; The autogenerated checks expect the SSE2 and AVX runs to use one <2 x i32>
; load followed by a vector zext, while the SLM run keeps the scalar loads
; and extensions.
1011define <2 x i64> @loadext_2i32_to_2i64(i32* %p0) {
1012; SSE2-LABEL: @loadext_2i32_to_2i64(
1013; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
1014; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
1015; SSE2-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
1016; SSE2-NEXT:    [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
1017; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
1018; SSE2-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
1019; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
1020; SSE2-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
1021; SSE2-NEXT:    ret <2 x i64> [[V1]]
1022;
1023; SLM-LABEL: @loadext_2i32_to_2i64(
1024; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
1025; SLM-NEXT:    [[I0:%.*]] = load i32, i32* [[P0]], align 1
1026; SLM-NEXT:    [[I1:%.*]] = load i32, i32* [[P1]], align 1
1027; SLM-NEXT:    [[X0:%.*]] = zext i32 [[I0]] to i64
1028; SLM-NEXT:    [[X1:%.*]] = zext i32 [[I1]] to i64
1029; SLM-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
1030; SLM-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
1031; SLM-NEXT:    ret <2 x i64> [[V1]]
1032;
1033; AVX-LABEL: @loadext_2i32_to_2i64(
1034; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
1035; AVX-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
1036; AVX-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
1037; AVX-NEXT:    [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
1038; AVX-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
1039; AVX-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
1040; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
1041; AVX-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
1042; AVX-NEXT:    ret <2 x i64> [[V1]]
1043;
1044  %p1 = getelementptr inbounds i32, i32* %p0, i64 1
1045  %i0 = load i32, i32* %p0, align 1
1046  %i1 = load i32, i32* %p1, align 1
1047  %x0 = zext i32 %i0 to i64
1048  %x1 = zext i32 %i1 to i64
1049  %v0 = insertelement <2 x i64> undef, i64 %x0, i32 0
1050  %v1 = insertelement <2 x i64>   %v0, i64 %x1, i32 1
1051  ret <2 x i64> %v1
1052}
1053
; Four i32 values at consecutive element offsets 0..3 from %p0 are loaded with
; align 1, zero-extended to i64 and inserted into lanes 0..3 of a <4 x i64>.
; The SSE2 and AVX check prefixes expect a single <4 x i32> load (still align 1)
; followed by one vector zext, with each lane extracted and re-inserted; the SLM
; check prefix expects the scalar loads and zexts to be left as written.
1054define <4 x i64> @loadext_4i32_to_4i64(i32* %p0) {
1055; SSE2-LABEL: @loadext_4i32_to_4i64(
1056; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
1057; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
1058; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
1059; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
1060; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
1061; SSE2-NEXT:    [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
1062; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
1063; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
1064; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
1065; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
1066; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
1067; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
1068; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
1069; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
1070; SSE2-NEXT:    ret <4 x i64> [[V3]]
1071;
1072; SLM-LABEL: @loadext_4i32_to_4i64(
1073; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
1074; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
1075; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
1076; SLM-NEXT:    [[I0:%.*]] = load i32, i32* [[P0]], align 1
1077; SLM-NEXT:    [[I1:%.*]] = load i32, i32* [[P1]], align 1
1078; SLM-NEXT:    [[I2:%.*]] = load i32, i32* [[P2]], align 1
1079; SLM-NEXT:    [[I3:%.*]] = load i32, i32* [[P3]], align 1
1080; SLM-NEXT:    [[X0:%.*]] = zext i32 [[I0]] to i64
1081; SLM-NEXT:    [[X1:%.*]] = zext i32 [[I1]] to i64
1082; SLM-NEXT:    [[X2:%.*]] = zext i32 [[I2]] to i64
1083; SLM-NEXT:    [[X3:%.*]] = zext i32 [[I3]] to i64
1084; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
1085; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
1086; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
1087; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
1088; SLM-NEXT:    ret <4 x i64> [[V3]]
1089;
1090; AVX-LABEL: @loadext_4i32_to_4i64(
1091; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
1092; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
1093; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
1094; AVX-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
1095; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
1096; AVX-NEXT:    [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
1097; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
1098; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
1099; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
1100; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
1101; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
1102; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
1103; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
1104; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
1105; AVX-NEXT:    ret <4 x i64> [[V3]]
1106;
; Scalar input pattern under test: %p1..%p3 address the three i32 elements that
; follow %p0; each loaded value is zero-extended to i64 and inserted, in order,
; into lanes 0..3 of the returned vector (starting from an undef vector).
1107  %p1 = getelementptr inbounds i32, i32* %p0, i64 1
1108  %p2 = getelementptr inbounds i32, i32* %p0, i64 2
1109  %p3 = getelementptr inbounds i32, i32* %p0, i64 3
1110  %i0 = load i32, i32* %p0, align 1
1111  %i1 = load i32, i32* %p1, align 1
1112  %i2 = load i32, i32* %p2, align 1
1113  %i3 = load i32, i32* %p3, align 1
1114  %x0 = zext i32 %i0 to i64
1115  %x1 = zext i32 %i1 to i64
1116  %x2 = zext i32 %i2 to i64
1117  %x3 = zext i32 %i3 to i64
1118  %v0 = insertelement <4 x i64> undef, i64 %x0, i32 0
1119  %v1 = insertelement <4 x i64>   %v0, i64 %x1, i32 1
1120  %v2 = insertelement <4 x i64>   %v1, i64 %x2, i32 2
1121  %v3 = insertelement <4 x i64>   %v2, i64 %x3, i32 3
1122  ret <4 x i64> %v3
1123}
1124