1; RUN: llc -aarch64-sve-vector-bits-min=128  -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
2; RUN: llc -aarch64-sve-vector-bits-min=256  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
3; RUN: llc -aarch64-sve-vector-bits-min=384  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
4; RUN: llc -aarch64-sve-vector-bits-min=512  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
5; RUN: llc -aarch64-sve-vector-bits-min=640  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
6; RUN: llc -aarch64-sve-vector-bits-min=768  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
7; RUN: llc -aarch64-sve-vector-bits-min=896  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
8; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
9; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
10; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
11; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
12; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
13; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
14; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
15; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
16; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
17
18target triple = "aarch64-unknown-linux-gnu"
19
20; Don't use SVE when its registers are no bigger than NEON.
21; NO_SVE-NOT: z{{[0-9]}}
22
23;
24; sext i1 -> i32
25;
26
27; NOTE: Covers the scenario where a SIGN_EXTEND_INREG is required, whose inreg
28; type's element type is not byte based and thus cannot be lowered directly to
29; an SVE instruction.
30define void @sext_v8i1_v8i32(<8 x i1> %a, <8 x i32>* %out) #0 {
; Sign-extends the <8 x i1> argument to <8 x i32> and stores it to %out.
; Expected under the common prefix: two uunpklo widening steps, then an
; in-register sign extend done as lsl/asr by 31, and an st1w under a vl8
; predicate.
31; CHECK-LABEL: sext_v8i1_v8i32:
32; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
33; CHECK-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, z0.b
34; CHECK-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
35; CHECK-NEXT: lsl [[A_WORDS]].s, [[PG]]/m, [[A_WORDS]].s, #31
36; CHECK-NEXT: asr [[A_WORDS]].s, [[PG]]/m, [[A_WORDS]].s, #31
37; CHECK-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x0]
38; CHECK-NEXT: ret
39  %b = sext <8 x i1> %a to <8 x i32>
40  store <8 x i32> %b, <8 x i32>* %out
41  ret void
42}
43
44;
45; sext i3 -> i64
46;
47
48; NOTE: Covers the scenario where a SIGN_EXTEND_INREG is required, whose inreg
49; type's element type is not power-of-2 based and thus cannot be lowered
50; directly to an SVE instruction.
51define void @sext_v4i3_v4i64(<4 x i3> %a, <4 x i64>* %out) #0 {
; Sign-extends the <4 x i3> argument to <4 x i64> and stores it to %out.
; Expected under the common prefix: two uunpklo widening steps, then an
; in-register sign extend done as lsl/asr by 61, and an st1d under a vl4
; predicate. The store is matched against the doubleword capture (A_DWORDS),
; i.e. the value produced by the shifts, not the intermediate word capture.
52; CHECK-LABEL: sext_v4i3_v4i64:
53; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
54; CHECK-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, z0.h
55; CHECK-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
56; CHECK-NEXT: lsl [[A_DWORDS]].d, [[PG]]/m, [[A_DWORDS]].d, #61
57; CHECK-NEXT: asr [[A_DWORDS]].d, [[PG]]/m, [[A_DWORDS]].d, #61
58; CHECK-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
59; CHECK-NEXT: ret
60  %b = sext <4 x i3> %a to <4 x i64>
61  store <4 x i64> %b, <4 x i64>* %out
62  ret void
63}
64
65;
66; sext i8 -> i16
67;
68
69define void @sext_v16i8_v16i16(<16 x i8> %a, <16 x i16>* %out) #0 {
; Sign-extends the <16 x i8> argument to <16 x i16> and stores it to %out.
; Expected under the common prefix: a single sunpklo (.b -> .h) and an st1h
; under a vl16 predicate.
70; CHECK-LABEL: sext_v16i8_v16i16:
71; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
72; CHECK-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, z0.b
73; CHECK-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x0]
74; CHECK-NEXT: ret
75  %b = sext <16 x i8> %a to <16 x i16>
76  store <16 x i16>%b, <16 x i16>* %out
77  ret void
78}
79
80; NOTE: Extra 'add' is to prevent the extend being combined with the load.
81define void @sext_v32i8_v32i16(<32 x i8>* %in, <32 x i16>* %out) #0 {
; Loads <32 x i8> from %in, adds it to itself, sign-extends the sum to
; <32 x i16> and stores it to %out. The body is only checked under the
; VBITS_GE_512 prefix: add, one sunpklo (.b -> .h), st1h with a vl32 predicate.
82; CHECK-LABEL: sext_v32i8_v32i16:
83; VBITS_GE_512: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b
84; VBITS_GE_512-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
85; VBITS_GE_512-NEXT: ptrue [[PG:p[0-9]+]].h, vl32
86; VBITS_GE_512-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x1]
87; VBITS_GE_512-NEXT: ret
88  %a = load <32 x i8>, <32 x i8>* %in
89  %b = add <32 x i8> %a, %a
90  %c = sext <32 x i8> %b to <32 x i16>
91  store <32 x i16> %c, <32 x i16>* %out
92  ret void
93}
94
95define void @sext_v64i8_v64i16(<64 x i8>* %in, <64 x i16>* %out) #0 {
; Loads <64 x i8> from %in, adds it to itself, sign-extends the sum to
; <64 x i16> and stores it to %out. The body is only checked under the
; VBITS_GE_1024 prefix: add, one sunpklo (.b -> .h), st1h with a vl64 predicate.
96; CHECK-LABEL: sext_v64i8_v64i16:
97; VBITS_GE_1024: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b
98; VBITS_GE_1024-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
99; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].h, vl64
100; VBITS_GE_1024-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x1]
101; VBITS_GE_1024-NEXT: ret
102  %a = load <64 x i8>, <64 x i8>* %in
103  %b = add <64 x i8> %a, %a
104  %c = sext <64 x i8> %b to <64 x i16>
105  store <64 x i16> %c, <64 x i16>* %out
106  ret void
107}
108
109define void @sext_v128i8_v128i16(<128 x i8>* %in, <128 x i16>* %out) #0 {
; Loads <128 x i8> from %in, adds it to itself, sign-extends the sum to
; <128 x i16> and stores it to %out. The body is only checked under the
; VBITS_GE_2048 prefix: add, one sunpklo (.b -> .h), st1h with a vl128
; predicate.
110; CHECK-LABEL: sext_v128i8_v128i16:
111; VBITS_GE_2048: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b
112; VBITS_GE_2048-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
113; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].h, vl128
114; VBITS_GE_2048-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x1]
115; VBITS_GE_2048-NEXT: ret
116  %a = load <128 x i8>, <128 x i8>* %in
117  %b = add <128 x i8> %a, %a
118  %c = sext <128 x i8> %b to <128 x i16>
119  store <128 x i16> %c, <128 x i16>* %out
120  ret void
121}
122
123;
124; sext i8 -> i32
125;
126
127define void @sext_v8i8_v8i32(<8 x i8> %a, <8 x i32>* %out) #0 {
; Sign-extends the <8 x i8> argument to <8 x i32> and stores it to %out.
; Expected under the common prefix: two sunpklo steps (.b -> .h -> .s) and an
; st1w under a vl8 predicate. The store is matched against the word capture
; (A_WORDS), i.e. the result of the second unpack, not the halfword capture.
128; CHECK-LABEL: sext_v8i8_v8i32:
129; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
130; CHECK-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, z0.b
131; CHECK-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
132; CHECK-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x0]
133; CHECK-NEXT: ret
134  %b = sext <8 x i8> %a to <8 x i32>
135  store <8 x i32>%b, <8 x i32>* %out
136  ret void
137}
138
139define void @sext_v16i8_v16i32(<16 x i8> %a, <16 x i32>* %out) #0 {
; Sign-extends the <16 x i8> argument to <16 x i32> and stores it to %out.
; Under the VBITS_GE_512 prefix the whole result fits one register: two
; sunpklo steps and a single st1w with a vl16 predicate. The first unpack
; reads the argument register z0 directly; this function defines no byte
; capture of its own, so none is referenced.
; Under the VBITS_EQ_256 prefix the operation is split into low and high
; halves, each unpacked twice and stored separately (high half at %out + 32).
140; CHECK-LABEL: sext_v16i8_v16i32:
141; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
142; VBITS_GE_512-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, z0.b
143; VBITS_GE_512-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
144; VBITS_GE_512-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x0]
145; VBITS_GE_512-NEXT: ret
146
147; Ensure sensible type legalisation.
148; VBITS_EQ_256: ext v[[A_HI:[0-9]+]].16b, v0.16b, v0.16b, #8
149; VBITS_EQ_256-DAG: sunpklo [[A_HALFS_LO:z[0-9]+]].h, z0.b
150; VBITS_EQ_256-DAG: sunpklo [[A_HALFS_HI:z[0-9]+]].h, z[[A_HI]].b
151; VBITS_EQ_256-DAG: sunpklo [[A_WORDS_LO:z[0-9]+]].s, [[A_HALFS_LO]].h
152; VBITS_EQ_256-DAG: sunpklo [[A_WORDS_HI:z[0-9]+]].s, [[A_HALFS_HI]].h
153; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
154; VBITS_EQ_256-DAG: add x[[OUT_HI:[0-9]+]], x0, #32
155; VBITS_EQ_256-DAG: st1w { [[A_WORDS_LO]].s }, [[PG]], [x0]
156; VBITS_EQ_256-DAG: st1w { [[A_WORDS_HI]].s }, [[PG]], [x[[OUT_HI]]]
157; VBITS_EQ_256-NEXT: ret
158  %b = sext <16 x i8> %a to <16 x i32>
159  store <16 x i32> %b, <16 x i32>* %out
160  ret void
161}
162
163define void @sext_v32i8_v32i32(<32 x i8>* %in, <32 x i32>* %out) #0 {
; Loads <32 x i8> from %in, adds it to itself, sign-extends the sum to
; <32 x i32> and stores it to %out. The body is only checked under the
; VBITS_GE_1024 prefix: add, two sunpklo steps, st1w with a vl32 predicate.
164; CHECK-LABEL: sext_v32i8_v32i32:
165; VBITS_GE_1024: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b
166; VBITS_GE_1024-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
167; VBITS_GE_1024-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
168; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].s, vl32
169; VBITS_GE_1024-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1]
170; VBITS_GE_1024-NEXT: ret
171  %a = load <32 x i8>, <32 x i8>* %in
172  %b = add <32 x i8> %a, %a
173  %c = sext <32 x i8> %b to <32 x i32>
174  store <32 x i32> %c, <32 x i32>* %out
175  ret void
176}
177
178define void @sext_v64i8_v64i32(<64 x i8>* %in, <64 x i32>* %out) #0 {
; Loads <64 x i8> from %in, adds it to itself, sign-extends the sum to
; <64 x i32> and stores it to %out. The body is only checked under the
; VBITS_GE_2048 prefix: add, two sunpklo steps, st1w with a vl64 predicate.
179; CHECK-LABEL: sext_v64i8_v64i32:
180; VBITS_GE_2048: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b
181; VBITS_GE_2048-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
182; VBITS_GE_2048-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
183; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].s, vl64
184; VBITS_GE_2048-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1]
185; VBITS_GE_2048-NEXT: ret
186  %a = load <64 x i8>, <64 x i8>* %in
187  %b = add <64 x i8> %a, %a
188  %c = sext <64 x i8> %b to <64 x i32>
189  store <64 x i32> %c, <64 x i32>* %out
190  ret void
191}
192
193;
194; sext i8 -> i64
195;
196
197; NOTE: v4i8 is an unpacked type stored within a v4i16 container. The sign
198; extend is a two-step process where the container is any_extend'd with the
199; result feeding an inreg sign extend.
200define void @sext_v4i8_v4i64(<4 x i8> %a, <4 x i64>* %out) #0 {
; Sign-extends the <4 x i8> argument to <4 x i64> and stores it to %out.
; Expected under the common prefix: two uunpklo steps starting from z0.h,
; a predicated sxtb on the doubleword lanes, and an st1d under a vl4
; predicate.
201; CHECK-LABEL: sext_v4i8_v4i64:
202; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
203; CHECK-NEXT: uunpklo [[ANYEXT_W:z[0-9]+]].s, z0.h
204; CHECK-NEXT: uunpklo [[ANYEXT_D:z[0-9]+]].d, [[ANYEXT_W]].s
205; CHECK-NEXT: sxtb [[A_DWORDS:z[0-9]+]].d, [[PG]]/m, [[ANYEXT_D]].d
206; CHECK-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
207; CHECK-NEXT: ret
208  %b = sext <4 x i8> %a to <4 x i64>
209  store <4 x i64>%b, <4 x i64>* %out
210  ret void
211}
212
213define void @sext_v8i8_v8i64(<8 x i8> %a, <8 x i64>* %out) #0 {
; Sign-extends the <8 x i8> argument to <8 x i64> and stores it to %out.
; The body is only checked under the VBITS_GE_512 prefix: three sunpklo steps
; (.b -> .h -> .s -> .d) and an st1d with a vl8 predicate. The first unpack
; reads the argument register z0 directly; this function defines no byte
; capture of its own, so none is referenced.
214; CHECK-LABEL: sext_v8i8_v8i64:
215; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
216; VBITS_GE_512-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, z0.b
217; VBITS_GE_512-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
218; VBITS_GE_512-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
219; VBITS_GE_512-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
220; VBITS_GE_512-NEXT: ret
221  %b = sext <8 x i8> %a to <8 x i64>
222  store <8 x i64>%b, <8 x i64>* %out
223  ret void
224}
225
226define void @sext_v16i8_v16i64(<16 x i8> %a, <16 x i64>* %out) #0 {
; Sign-extends the <16 x i8> argument to <16 x i64> and stores it to %out.
; The body is only checked under the VBITS_GE_1024 prefix: three sunpklo
; steps (.b -> .h -> .s -> .d) and an st1d with a vl16 predicate. The first
; unpack reads the argument register z0 directly; this function defines no
; byte capture of its own, so none is referenced.
227; CHECK-LABEL: sext_v16i8_v16i64:
228; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
229; VBITS_GE_1024-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, z0.b
230; VBITS_GE_1024-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
231; VBITS_GE_1024-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
232; VBITS_GE_1024-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
233; VBITS_GE_1024-NEXT: ret
234  %b = sext <16 x i8> %a to <16 x i64>
235  store <16 x i64> %b, <16 x i64>* %out
236  ret void
237}
238
239define void @sext_v32i8_v32i64(<32 x i8>* %in, <32 x i64>* %out) #0 {
; Loads <32 x i8> from %in, adds it to itself, sign-extends the sum to
; <32 x i64> and stores it to %out. The body is only checked under the
; VBITS_GE_2048 prefix: add, three sunpklo steps, st1d with a vl32 predicate.
240; CHECK-LABEL: sext_v32i8_v32i64:
241; VBITS_GE_2048: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b
242; VBITS_GE_2048-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
243; VBITS_GE_2048-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
244; VBITS_GE_2048-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
245; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].d, vl32
246; VBITS_GE_2048-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
247; VBITS_GE_2048-NEXT: ret
248  %a = load <32 x i8>, <32 x i8>* %in
249  %b = add <32 x i8> %a, %a
250  %c = sext <32 x i8> %b to <32 x i64>
251  store <32 x i64> %c, <32 x i64>* %out
252  ret void
253}
254
255;
256; sext i16 -> i32
257;
258
259define void @sext_v8i16_v8i32(<8 x i16> %a, <8 x i32>* %out) #0 {
; Sign-extends the <8 x i16> argument to <8 x i32> and stores it to %out.
; Expected under the common prefix: a single sunpklo (.h -> .s) and an st1w
; under a vl8 predicate.
260; CHECK-LABEL: sext_v8i16_v8i32:
261; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
262; CHECK-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, z0.h
263; CHECK-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x0]
264; CHECK-NEXT: ret
265  %b = sext <8 x i16> %a to <8 x i32>
266  store <8 x i32>%b, <8 x i32>* %out
267  ret void
268}
269
270define void @sext_v16i16_v16i32(<16 x i16>* %in, <16 x i32>* %out) #0 {
; Loads <16 x i16> from %in, adds it to itself, sign-extends the sum to
; <16 x i32> and stores it to %out. The body is only checked under the
; VBITS_GE_512 prefix: add, one sunpklo (.h -> .s), st1w with a vl16 predicate.
271; CHECK-LABEL: sext_v16i16_v16i32:
272; VBITS_GE_512: add [[A_HALFS:z[0-9]+]].h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h
273; VBITS_GE_512-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
274; VBITS_GE_512-NEXT: ptrue [[PG:p[0-9]+]].s, vl16
275; VBITS_GE_512-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1]
276; VBITS_GE_512-NEXT: ret
277  %a = load <16 x i16>, <16 x i16>* %in
278  %b = add <16 x i16> %a, %a
279  %c = sext <16 x i16> %b to <16 x i32>
280  store <16 x i32> %c, <16 x i32>* %out
281  ret void
282}
283
284define void @sext_v32i16_v32i32(<32 x i16>* %in, <32 x i32>* %out) #0 {
; Loads <32 x i16> from %in, adds it to itself, sign-extends the sum to
; <32 x i32> and stores it to %out. The body is only checked under the
; VBITS_GE_1024 prefix: add, one sunpklo (.h -> .s), st1w with a vl32 predicate.
285; CHECK-LABEL: sext_v32i16_v32i32:
286; VBITS_GE_1024: add [[A_HALFS:z[0-9]+]].h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h
287; VBITS_GE_1024-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
288; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].s, vl32
289; VBITS_GE_1024-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1]
290; VBITS_GE_1024-NEXT: ret
291  %a = load <32 x i16>, <32 x i16>* %in
292  %b = add <32 x i16> %a, %a
293  %c = sext <32 x i16> %b to <32 x i32>
294  store <32 x i32> %c, <32 x i32>* %out
295  ret void
296}
297
298define void @sext_v64i16_v64i32(<64 x i16>* %in, <64 x i32>* %out) #0 {
; Loads <64 x i16> from %in, adds it to itself, sign-extends the sum to
; <64 x i32> and stores it to %out. The body is only checked under the
; VBITS_GE_2048 prefix: add, one sunpklo (.h -> .s), st1w with a vl64 predicate.
299; CHECK-LABEL: sext_v64i16_v64i32:
300; VBITS_GE_2048: add [[A_HALFS:z[0-9]+]].h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h
301; VBITS_GE_2048-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
302; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].s, vl64
303; VBITS_GE_2048-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1]
304; VBITS_GE_2048-NEXT: ret
305  %a = load <64 x i16>, <64 x i16>* %in
306  %b = add <64 x i16> %a, %a
307  %c = sext <64 x i16> %b to <64 x i32>
308  store <64 x i32> %c, <64 x i32>* %out
309  ret void
310}
311
312;
313; sext i16 -> i64
314;
315
316define void @sext_v4i16_v4i64(<4 x i16> %a, <4 x i64>* %out) #0 {
; Sign-extends the <4 x i16> argument to <4 x i64> and stores it to %out.
; Expected under the common prefix: two sunpklo steps (.h -> .s -> .d) and an
; st1d under a vl4 predicate.
317; CHECK-LABEL: sext_v4i16_v4i64:
318; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
319; CHECK-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, z0.h
320; CHECK-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
321; CHECK-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
322; CHECK-NEXT: ret
323  %b = sext <4 x i16> %a to <4 x i64>
324  store <4 x i64>%b, <4 x i64>* %out
325  ret void
326}
327
328define void @sext_v8i16_v8i64(<8 x i16> %a, <8 x i64>* %out) #0 {
; Sign-extends the <8 x i16> argument to <8 x i64> and stores it to %out.
; The body is only checked under the VBITS_GE_512 prefix: two sunpklo steps
; (.h -> .s -> .d) and an st1d with a vl8 predicate.
329; CHECK-LABEL: sext_v8i16_v8i64:
330; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
331; VBITS_GE_512-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, z0.h
332; VBITS_GE_512-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
333; VBITS_GE_512-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
334; VBITS_GE_512-NEXT: ret
335  %b = sext <8 x i16> %a to <8 x i64>
336  store <8 x i64>%b, <8 x i64>* %out
337  ret void
338}
339
340define void @sext_v16i16_v16i64(<16 x i16>* %in, <16 x i64>* %out) #0 {
; Loads <16 x i16> from %in, adds it to itself, sign-extends the sum to
; <16 x i64> and stores it to %out. The body is only checked under the
; VBITS_GE_1024 prefix: add, two sunpklo steps, st1d with a vl16 predicate.
341; CHECK-LABEL: sext_v16i16_v16i64:
342; VBITS_GE_1024: add [[A_HALFS:z[0-9]+]].h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h
343; VBITS_GE_1024-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
344; VBITS_GE_1024-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
345; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].d, vl16
346; VBITS_GE_1024-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
347; VBITS_GE_1024-NEXT: ret
348  %a = load <16 x i16>, <16 x i16>* %in
349  %b = add <16 x i16> %a, %a
350  %c = sext <16 x i16> %b to <16 x i64>
351  store <16 x i64> %c, <16 x i64>* %out
352  ret void
353}
354
355define void @sext_v32i16_v32i64(<32 x i16>* %in, <32 x i64>* %out) #0 {
; Loads <32 x i16> from %in, adds it to itself, sign-extends the sum to
; <32 x i64> and stores it to %out. The body is only checked under the
; VBITS_GE_2048 prefix: add, two sunpklo steps, st1d with a vl32 predicate.
356; CHECK-LABEL: sext_v32i16_v32i64:
357; VBITS_GE_2048: add [[A_HALFS:z[0-9]+]].h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h
358; VBITS_GE_2048-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
359; VBITS_GE_2048-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
360; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].d, vl32
361; VBITS_GE_2048-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
362; VBITS_GE_2048-NEXT: ret
363  %a = load <32 x i16>, <32 x i16>* %in
364  %b = add <32 x i16> %a, %a
365  %c = sext <32 x i16> %b to <32 x i64>
366  store <32 x i64> %c, <32 x i64>* %out
367  ret void
368}
369
370;
371; sext i32 -> i64
372;
373
374define void @sext_v4i32_v4i64(<4 x i32> %a, <4 x i64>* %out) #0 {
; Sign-extends the <4 x i32> argument to <4 x i64> and stores it to %out.
; Expected under the common prefix: a single sunpklo (.s -> .d) and an st1d
; under a vl4 predicate.
375; CHECK-LABEL: sext_v4i32_v4i64:
376; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
377; CHECK-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, z0.s
378; CHECK-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
379; CHECK-NEXT: ret
380  %b = sext <4 x i32> %a to <4 x i64>
381  store <4 x i64>%b, <4 x i64>* %out
382  ret void
383}
384
385define void @sext_v8i32_v8i64(<8 x i32>* %in, <8 x i64>* %out) #0 {
; Loads <8 x i32> from %in, adds it to itself, sign-extends the sum to
; <8 x i64> and stores it to %out. The body is only checked under the
; VBITS_GE_512 prefix: add, one sunpklo (.s -> .d), st1d with a vl8 predicate.
386; CHECK-LABEL: sext_v8i32_v8i64:
387; VBITS_GE_512: add [[A_WORDS:z[0-9]+]].s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s
388; VBITS_GE_512-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
389; VBITS_GE_512-NEXT: ptrue [[PG:p[0-9]+]].d, vl8
390; VBITS_GE_512-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
391; VBITS_GE_512-NEXT: ret
392  %a = load <8 x i32>, <8 x i32>* %in
393  %b = add <8 x i32> %a, %a
394  %c = sext <8 x i32> %b to <8 x i64>
395  store <8 x i64> %c, <8 x i64>* %out
396  ret void
397}
398
399define void @sext_v16i32_v16i64(<16 x i32>* %in, <16 x i64>* %out) #0 {
; Loads <16 x i32> from %in, adds it to itself, sign-extends the sum to
; <16 x i64> and stores it to %out. The body is only checked under the
; VBITS_GE_1024 prefix: add, one sunpklo (.s -> .d), st1d with a vl16 predicate.
400; CHECK-LABEL: sext_v16i32_v16i64:
401; VBITS_GE_1024: add [[A_WORDS:z[0-9]+]].s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s
402; VBITS_GE_1024-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
403; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].d, vl16
404; VBITS_GE_1024-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
405; VBITS_GE_1024-NEXT: ret
406  %a = load <16 x i32>, <16 x i32>* %in
407  %b = add <16 x i32> %a, %a
408  %c = sext <16 x i32> %b to <16 x i64>
409  store <16 x i64> %c, <16 x i64>* %out
410  ret void
411}
412
413define void @sext_v32i32_v32i64(<32 x i32>* %in, <32 x i64>* %out) #0 {
; Loads <32 x i32> from %in, adds it to itself, sign-extends the sum to
; <32 x i64> and stores it to %out. The body is only checked under the
; VBITS_GE_2048 prefix: add, one sunpklo (.s -> .d), st1d with a vl32 predicate.
414; CHECK-LABEL: sext_v32i32_v32i64:
415; VBITS_GE_2048: add [[A_WORDS:z[0-9]+]].s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s
416; VBITS_GE_2048-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
417; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].d, vl32
418; VBITS_GE_2048-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
419; VBITS_GE_2048-NEXT: ret
420  %a = load <32 x i32>, <32 x i32>* %in
421  %b = add <32 x i32> %a, %a
422  %c = sext <32 x i32> %b to <32 x i64>
423  store <32 x i64> %c, <32 x i64>* %out
424  ret void
425}
426
427;
428; zext i8 -> i16
429;
430
431define void @zext_v16i8_v16i16(<16 x i8> %a, <16 x i16>* %out) #0 {
; Zero-extends the <16 x i8> argument to <16 x i16> and stores it to %out.
; Expected under the common prefix: a single uunpklo (.b -> .h) and an st1h
; under a vl16 predicate.
432; CHECK-LABEL: zext_v16i8_v16i16:
433; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
434; CHECK-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, z0.b
435; CHECK-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x0]
436; CHECK-NEXT: ret
437  %b = zext <16 x i8> %a to <16 x i16>
438  store <16 x i16>%b, <16 x i16>* %out
439  ret void
440}
441
442; NOTE: Extra 'add' is to prevent the extend being combined with the load.
443define void @zext_v32i8_v32i16(<32 x i8>* %in, <32 x i16>* %out) #0 {
; Loads <32 x i8> from %in, adds it to itself, zero-extends the sum to
; <32 x i16> and stores it to %out. The body is only checked under the
; VBITS_GE_512 prefix: add, one uunpklo (.b -> .h), st1h with a vl32 predicate.
444; CHECK-LABEL: zext_v32i8_v32i16:
445; VBITS_GE_512: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b
446; VBITS_GE_512-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
447; VBITS_GE_512-NEXT: ptrue [[PG:p[0-9]+]].h, vl32
448; VBITS_GE_512-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x1]
449; VBITS_GE_512-NEXT: ret
450  %a = load <32 x i8>, <32 x i8>* %in
451  %b = add <32 x i8> %a, %a
452  %c = zext <32 x i8> %b to <32 x i16>
453  store <32 x i16> %c, <32 x i16>* %out
454  ret void
455}
456
457define void @zext_v64i8_v64i16(<64 x i8>* %in, <64 x i16>* %out) #0 {
; Loads <64 x i8> from %in, adds it to itself, zero-extends the sum to
; <64 x i16> and stores it to %out. The body is only checked under the
; VBITS_GE_1024 prefix: add, one uunpklo (.b -> .h), st1h with a vl64 predicate.
458; CHECK-LABEL: zext_v64i8_v64i16:
459; VBITS_GE_1024: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b
460; VBITS_GE_1024-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
461; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].h, vl64
462; VBITS_GE_1024-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x1]
463; VBITS_GE_1024-NEXT: ret
464  %a = load <64 x i8>, <64 x i8>* %in
465  %b = add <64 x i8> %a, %a
466  %c = zext <64 x i8> %b to <64 x i16>
467  store <64 x i16> %c, <64 x i16>* %out
468  ret void
469}
470
471define void @zext_v128i8_v128i16(<128 x i8>* %in, <128 x i16>* %out) #0 {
; Loads <128 x i8> from %in, adds it to itself, zero-extends the sum to
; <128 x i16> and stores it to %out. The body is only checked under the
; VBITS_GE_2048 prefix: add, one uunpklo (.b -> .h), st1h with a vl128
; predicate.
472; CHECK-LABEL: zext_v128i8_v128i16:
473; VBITS_GE_2048: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b
474; VBITS_GE_2048-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
475; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].h, vl128
476; VBITS_GE_2048-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x1]
477; VBITS_GE_2048-NEXT: ret
478  %a = load <128 x i8>, <128 x i8>* %in
479  %b = add <128 x i8> %a, %a
480  %c = zext <128 x i8> %b to <128 x i16>
481  store <128 x i16> %c, <128 x i16>* %out
482  ret void
483}
484
485;
486; zext i8 -> i32
487;
488
489define void @zext_v8i8_v8i32(<8 x i8> %a, <8 x i32>* %out) #0 {
; Zero-extends the <8 x i8> argument to <8 x i32> and stores it to %out.
; Expected under the common prefix: two uunpklo steps (.b -> .h -> .s) and an
; st1w under a vl8 predicate. The store is matched against the word capture
; (A_WORDS), i.e. the result of the second unpack, not the halfword capture.
490; CHECK-LABEL: zext_v8i8_v8i32:
491; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
492; CHECK-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, z0.b
493; CHECK-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
494; CHECK-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x0]
495; CHECK-NEXT: ret
496  %b = zext <8 x i8> %a to <8 x i32>
497  store <8 x i32>%b, <8 x i32>* %out
498  ret void
499}
500
501define void @zext_v16i8_v16i32(<16 x i8> %a, <16 x i32>* %out) #0 {
; Zero-extends the <16 x i8> argument to <16 x i32> and stores it to %out.
; Under the VBITS_GE_512 prefix the whole result fits one register: two
; uunpklo steps and a single st1w with a vl16 predicate. The first unpack
; reads the argument register z0 directly; this function defines no byte
; capture of its own, so none is referenced.
; Under the VBITS_EQ_256 prefix the operation is split into low and high
; halves, each unpacked twice and stored separately (high half at %out + 32).
502; CHECK-LABEL: zext_v16i8_v16i32:
503; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
504; VBITS_GE_512-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, z0.b
505; VBITS_GE_512-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
506; VBITS_GE_512-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x0]
507; VBITS_GE_512-NEXT: ret
508
509; Ensure sensible type legalisation.
510; VBITS_EQ_256: ext v[[A_HI:[0-9]+]].16b, v0.16b, v0.16b, #8
511; VBITS_EQ_256-DAG: uunpklo [[A_HALFS_LO:z[0-9]+]].h, z0.b
512; VBITS_EQ_256-DAG: uunpklo [[A_HALFS_HI:z[0-9]+]].h, z[[A_HI]].b
513; VBITS_EQ_256-DAG: uunpklo [[A_WORDS_LO:z[0-9]+]].s, [[A_HALFS_LO]].h
514; VBITS_EQ_256-DAG: uunpklo [[A_WORDS_HI:z[0-9]+]].s, [[A_HALFS_HI]].h
515; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
516; VBITS_EQ_256-DAG: add x[[OUT_HI:[0-9]+]], x0, #32
517; VBITS_EQ_256-DAG: st1w { [[A_WORDS_LO]].s }, [[PG]], [x0]
518; VBITS_EQ_256-DAG: st1w { [[A_WORDS_HI]].s }, [[PG]], [x[[OUT_HI]]]
519; VBITS_EQ_256-NEXT: ret
520  %b = zext <16 x i8> %a to <16 x i32>
521  store <16 x i32> %b, <16 x i32>* %out
522  ret void
523}
524
525define void @zext_v32i8_v32i32(<32 x i8>* %in, <32 x i32>* %out) #0 {
; Loads <32 x i8> from %in, adds it to itself, zero-extends the sum to
; <32 x i32> and stores it to %out. The body is only checked under the
; VBITS_GE_1024 prefix: add, two uunpklo steps, st1w with a vl32 predicate.
526; CHECK-LABEL: zext_v32i8_v32i32:
527; VBITS_GE_1024: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b
528; VBITS_GE_1024-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
529; VBITS_GE_1024-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
530; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].s, vl32
531; VBITS_GE_1024-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1]
532; VBITS_GE_1024-NEXT: ret
533  %a = load <32 x i8>, <32 x i8>* %in
534  %b = add <32 x i8> %a, %a
535  %c = zext <32 x i8> %b to <32 x i32>
536  store <32 x i32> %c, <32 x i32>* %out
537  ret void
538}
539
540define void @zext_v64i8_v64i32(<64 x i8>* %in, <64 x i32>* %out) #0 {
; Loads <64 x i8> from %in, adds it to itself, zero-extends the sum to
; <64 x i32> and stores it to %out. The body is only checked under the
; VBITS_GE_2048 prefix: add, two uunpklo steps, st1w with a vl64 predicate.
541; CHECK-LABEL: zext_v64i8_v64i32:
542; VBITS_GE_2048: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b
543; VBITS_GE_2048-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
544; VBITS_GE_2048-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
545; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].s, vl64
546; VBITS_GE_2048-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1]
547; VBITS_GE_2048-NEXT: ret
548  %a = load <64 x i8>, <64 x i8>* %in
549  %b = add <64 x i8> %a, %a
550  %c = zext <64 x i8> %b to <64 x i32>
551  store <64 x i32> %c, <64 x i32>* %out
552  ret void
553}
554
555;
556; zext i8 -> i64
557;
558
559; NOTE: v4i8 is an unpacked type stored within a v4i16 container. The zero
560; extend is a two-step process where the container is zero_extend_inreg'd with
561; the result feeding a normal zero extend from halfs to doublewords.
562define void @zext_v4i8_v4i64(<4 x i8> %a, <4 x i64>* %out) #0 {
; Zero-extends the <4 x i8> argument to <4 x i64> and stores it to %out.
; Expected under the common prefix: a NEON bic clearing the upper byte of each
; halfword lane of v0, two uunpklo steps (.h -> .s -> .d) and an st1d under a
; vl4 predicate.
563; CHECK-LABEL: zext_v4i8_v4i64:
564; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
565; CHECK-NEXT: bic v0.4h, #255, lsl #8
566; CHECK-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, z0.h
567; CHECK-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
568; CHECK-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
569; CHECK-NEXT: ret
570  %b = zext <4 x i8> %a to <4 x i64>
571  store <4 x i64>%b, <4 x i64>* %out
572  ret void
573}
574
575define void @zext_v8i8_v8i64(<8 x i8> %a, <8 x i64>* %out) #0 {
; Zero-extends the <8 x i8> argument to <8 x i64> and stores it to %out.
; The body is only checked under the VBITS_GE_512 prefix: three uunpklo steps
; (.b -> .h -> .s -> .d) and an st1d with a vl8 predicate. The first unpack
; reads the argument register z0 directly; this function defines no byte
; capture of its own, so none is referenced.
576; CHECK-LABEL: zext_v8i8_v8i64:
577; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
578; VBITS_GE_512-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, z0.b
579; VBITS_GE_512-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
580; VBITS_GE_512-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
581; VBITS_GE_512-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
582; VBITS_GE_512-NEXT: ret
583  %b = zext <8 x i8> %a to <8 x i64>
584  store <8 x i64>%b, <8 x i64>* %out
585  ret void
586}
587
588define void @zext_v16i8_v16i64(<16 x i8> %a, <16 x i64>* %out) #0 {
; Zero-extends the <16 x i8> argument to <16 x i64> and stores it to %out.
; The body is only checked under the VBITS_GE_1024 prefix: three uunpklo
; steps (.b -> .h -> .s -> .d) and an st1d with a vl16 predicate. The first
; unpack reads the argument register z0 directly; this function defines no
; byte capture of its own, so none is referenced.
589; CHECK-LABEL: zext_v16i8_v16i64:
590; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
591; VBITS_GE_1024-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, z0.b
592; VBITS_GE_1024-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
593; VBITS_GE_1024-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
594; VBITS_GE_1024-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
595; VBITS_GE_1024-NEXT: ret
596  %b = zext <16 x i8> %a to <16 x i64>
597  store <16 x i64> %b, <16 x i64>* %out
598  ret void
599}
600
601define void @zext_v32i8_v32i64(<32 x i8>* %in, <32 x i64>* %out) #0 {
; Loads <32 x i8> from %in, adds it to itself, zero-extends the sum to
; <32 x i64> and stores it to %out. The body is only checked under the
; VBITS_GE_2048 prefix: add, three uunpklo steps, st1d with a vl32 predicate.
602; CHECK-LABEL: zext_v32i8_v32i64:
603; VBITS_GE_2048: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b
604; VBITS_GE_2048-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
605; VBITS_GE_2048-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
606; VBITS_GE_2048-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
607; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].d, vl32
608; VBITS_GE_2048-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
609; VBITS_GE_2048-NEXT: ret
610  %a = load <32 x i8>, <32 x i8>* %in
611  %b = add <32 x i8> %a, %a
612  %c = zext <32 x i8> %b to <32 x i64>
613  store <32 x i64> %c, <32 x i64>* %out
614  ret void
615}
616
617;
618; zext i16 -> i32
619;
620
621define void @zext_v8i16_v8i32(<8 x i16> %a, <8 x i32>* %out) #0 {
; Zero-extends the <8 x i16> argument to <8 x i32> and stores it to %out.
; Expected under the common prefix: a single uunpklo (.h -> .s) and an st1w
; under a vl8 predicate.
622; CHECK-LABEL: zext_v8i16_v8i32:
623; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
624; CHECK-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, z0.h
625; CHECK-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x0]
626; CHECK-NEXT: ret
627  %b = zext <8 x i16> %a to <8 x i32>
628  store <8 x i32>%b, <8 x i32>* %out
629  ret void
630}
631
632define void @zext_v16i16_v16i32(<16 x i16>* %in, <16 x i32>* %out) #0 {
; Loads <16 x i16> from %in, adds it to itself, zero-extends the sum to
; <16 x i32> and stores it to %out. The body is only checked under the
; VBITS_GE_512 prefix: add, one uunpklo (.h -> .s), st1w with a vl16 predicate.
633; CHECK-LABEL: zext_v16i16_v16i32:
634; VBITS_GE_512: add [[A_HALFS:z[0-9]+]].h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h
635; VBITS_GE_512-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
636; VBITS_GE_512-NEXT: ptrue [[PG:p[0-9]+]].s, vl16
637; VBITS_GE_512-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1]
638; VBITS_GE_512-NEXT: ret
639  %a = load <16 x i16>, <16 x i16>* %in
640  %b = add <16 x i16> %a, %a
641  %c = zext <16 x i16> %b to <16 x i32>
642  store <16 x i32> %c, <16 x i32>* %out
643  ret void
644}
645
646define void @zext_v32i16_v32i32(<32 x i16>* %in, <32 x i32>* %out) #0 {
; Loads <32 x i16> from %in, adds it to itself, zero-extends the sum to
; <32 x i32> and stores it to %out. The body is only checked under the
; VBITS_GE_1024 prefix: add, one uunpklo (.h -> .s), st1w with a vl32 predicate.
647; CHECK-LABEL: zext_v32i16_v32i32:
648; VBITS_GE_1024: add [[A_HALFS:z[0-9]+]].h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h
649; VBITS_GE_1024-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
650; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].s, vl32
651; VBITS_GE_1024-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1]
652; VBITS_GE_1024-NEXT: ret
653  %a = load <32 x i16>, <32 x i16>* %in
654  %b = add <32 x i16> %a, %a
655  %c = zext <32 x i16> %b to <32 x i32>
656  store <32 x i32> %c, <32 x i32>* %out
657  ret void
658}
659
660define void @zext_v64i16_v64i32(<64 x i16>* %in, <64 x i32>* %out) #0 {
; Loads <64 x i16> from %in, adds it to itself, zero-extends the sum to
; <64 x i32> and stores it to %out. The body is only checked under the
; VBITS_GE_2048 prefix: add, one uunpklo (.h -> .s), st1w with a vl64 predicate.
661; CHECK-LABEL: zext_v64i16_v64i32:
662; VBITS_GE_2048: add [[A_HALFS:z[0-9]+]].h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h
663; VBITS_GE_2048-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
664; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].s, vl64
665; VBITS_GE_2048-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1]
666; VBITS_GE_2048-NEXT: ret
667  %a = load <64 x i16>, <64 x i16>* %in
668  %b = add <64 x i16> %a, %a
669  %c = zext <64 x i16> %b to <64 x i32>
670  store <64 x i32> %c, <64 x i32>* %out
671  ret void
672}
673
674;
675; zext i16 -> i64
676;
677
678define void @zext_v4i16_v4i64(<4 x i16> %a, <4 x i64>* %out) #0 {
; Zero-extends the <4 x i16> argument to <4 x i64> and stores it to %out.
; Expected under the common prefix: two uunpklo steps (.h -> .s -> .d) and an
; st1d under a vl4 predicate.
679; CHECK-LABEL: zext_v4i16_v4i64:
680; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
681; CHECK-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, z0.h
682; CHECK-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
683; CHECK-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
684; CHECK-NEXT: ret
685  %b = zext <4 x i16> %a to <4 x i64>
686  store <4 x i64>%b, <4 x i64>* %out
687  ret void
688}
689
690define void @zext_v8i16_v8i64(<8 x i16> %a, <8 x i64>* %out) #0 {
; Zero-extends the <8 x i16> argument to <8 x i64> and stores it to %out.
; The body is only checked under the VBITS_GE_512 prefix: two uunpklo steps
; (.h -> .s -> .d) and an st1d with a vl8 predicate.
691; CHECK-LABEL: zext_v8i16_v8i64:
692; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
693; VBITS_GE_512-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, z0.h
694; VBITS_GE_512-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
695; VBITS_GE_512-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
696; VBITS_GE_512-NEXT: ret
697  %b = zext <8 x i16> %a to <8 x i64>
698  store <8 x i64>%b, <8 x i64>* %out
699  ret void
700}
701
702define void @zext_v16i16_v16i64(<16 x i16>* %in, <16 x i64>* %out) #0 {
; Loads <16 x i16> from %in, adds it to itself, zero-extends the sum to
; <16 x i64> and stores it to %out. The body is only checked under the
; VBITS_GE_1024 prefix: add, two uunpklo steps, st1d with a vl16 predicate.
703; CHECK-LABEL: zext_v16i16_v16i64:
704; VBITS_GE_1024: add [[A_HALFS:z[0-9]+]].h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h
705; VBITS_GE_1024-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
706; VBITS_GE_1024-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
707; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].d, vl16
708; VBITS_GE_1024-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
709; VBITS_GE_1024-NEXT: ret
710  %a = load <16 x i16>, <16 x i16>* %in
711  %b = add <16 x i16> %a, %a
712  %c = zext <16 x i16> %b to <16 x i64>
713  store <16 x i64> %c, <16 x i64>* %out
714  ret void
715}
716
717define void @zext_v32i16_v32i64(<32 x i16>* %in, <32 x i64>* %out) #0 {
; Loads <32 x i16> from %in, adds it to itself, zero-extends the sum to
; <32 x i64> and stores it to %out. The body is only checked under the
; VBITS_GE_2048 prefix: add, two uunpklo steps, st1d with a vl32 predicate.
718; CHECK-LABEL: zext_v32i16_v32i64:
719; VBITS_GE_2048: add [[A_HALFS:z[0-9]+]].h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h
720; VBITS_GE_2048-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
721; VBITS_GE_2048-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
722; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].d, vl32
723; VBITS_GE_2048-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
724; VBITS_GE_2048-NEXT: ret
725  %a = load <32 x i16>, <32 x i16>* %in
726  %b = add <32 x i16> %a, %a
727  %c = zext <32 x i16> %b to <32 x i64>
728  store <32 x i64> %c, <32 x i64>* %out
729  ret void
730}
731
732;
733; zext i32 -> i64
734;
735
736define void @zext_v4i32_v4i64(<4 x i32> %a, <4 x i64>* %out) #0 {
; Zero-extends the <4 x i32> argument to <4 x i64> and stores it to %out.
; Expected under the common prefix: a single uunpklo (.s -> .d) and an st1d
; under a vl4 predicate.
737; CHECK-LABEL: zext_v4i32_v4i64:
738; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
739; CHECK-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, z0.s
740; CHECK-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
741; CHECK-NEXT: ret
742  %b = zext <4 x i32> %a to <4 x i64>
743  store <4 x i64>%b, <4 x i64>* %out
744  ret void
745}
746
747define void @zext_v8i32_v8i64(<8 x i32>* %in, <8 x i64>* %out) #0 {
; Loads <8 x i32> from %in, adds it to itself, zero-extends the sum to
; <8 x i64> and stores it to %out. The body is only checked under the
; VBITS_GE_512 prefix: add, one uunpklo (.s -> .d), st1d with a vl8 predicate.
748; CHECK-LABEL: zext_v8i32_v8i64:
749; VBITS_GE_512: add [[A_WORDS:z[0-9]+]].s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s
750; VBITS_GE_512-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
751; VBITS_GE_512-NEXT: ptrue [[PG:p[0-9]+]].d, vl8
752; VBITS_GE_512-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
753; VBITS_GE_512-NEXT: ret
754  %a = load <8 x i32>, <8 x i32>* %in
755  %b = add <8 x i32> %a, %a
756  %c = zext <8 x i32> %b to <8 x i64>
757  store <8 x i64> %c, <8 x i64>* %out
758  ret void
759}
760
761define void @zext_v16i32_v16i64(<16 x i32>* %in, <16 x i64>* %out) #0 {
; Loads <16 x i32> from %in, adds it to itself, zero-extends the sum to
; <16 x i64> and stores it to %out. The body is only checked under the
; VBITS_GE_1024 prefix: add, one uunpklo (.s -> .d), st1d with a vl16 predicate.
762; CHECK-LABEL: zext_v16i32_v16i64:
763; VBITS_GE_1024: add [[A_WORDS:z[0-9]+]].s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s
764; VBITS_GE_1024-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
765; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].d, vl16
766; VBITS_GE_1024-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
767; VBITS_GE_1024-NEXT: ret
768  %a = load <16 x i32>, <16 x i32>* %in
769  %b = add <16 x i32> %a, %a
770  %c = zext <16 x i32> %b to <16 x i64>
771  store <16 x i64> %c, <16 x i64>* %out
772  ret void
773}
774
775define void @zext_v32i32_v32i64(<32 x i32>* %in, <32 x i64>* %out) #0 {
; Loads <32 x i32> from %in, adds it to itself, zero-extends the sum to
; <32 x i64> and stores it to %out. The body is only checked under the
; VBITS_GE_2048 prefix: add, one uunpklo (.s -> .d), st1d with a vl32 predicate.
776; CHECK-LABEL: zext_v32i32_v32i64:
777; VBITS_GE_2048: add [[A_WORDS:z[0-9]+]].s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s
778; VBITS_GE_2048-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
779; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].d, vl32
780; VBITS_GE_2048-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
781; VBITS_GE_2048-NEXT: ret
782  %a = load <32 x i32>, <32 x i32>* %in
783  %b = add <32 x i32> %a, %a
784  %c = zext <32 x i32> %b to <32 x i64>
785  store <32 x i64> %c, <32 x i64>* %out
786  ret void
787}
788
789attributes #0 = { nounwind "target-features"="+sve" }
790