; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=1 < %s | FileCheck %s

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; unscaled unpacked 32-bit offsets
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

define <vscale x 2 x i64> @masked_gather_nxv2i8(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT:    ret
  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
  %ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
  %vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
  %vals.zext = zext <vscale x 2 x i8> %vals to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %vals.zext
}

define <vscale x 2 x i64> @masked_gather_nxv2i16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT:    ret
  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
  %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
  %vals.zext = zext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %vals.zext
}

define <vscale x 2 x i64> @masked_gather_nxv2i32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT:    ret
  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
  %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
  %vals.zext = zext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %vals.zext
}

define <vscale x 2 x i64> @masked_gather_nxv2i64(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT:    ret
  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i64*>
  %vals = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
  ret <vscale x 2 x i64> %vals
}

define <vscale x 2 x half> @masked_gather_nxv2f16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT:    ret
  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x half*>
  %vals = call <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x half> undef)
  ret <vscale x 2 x half> %vals
}

define <vscale x 2 x float> @masked_gather_nxv2f32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT:    ret
  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x float*>
  %vals = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
  ret <vscale x 2 x float> %vals
}

define <vscale x 2 x double> @masked_gather_nxv2f64(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT:    ret
  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x double*>
  %vals = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x double> undef)
  ret <vscale x 2 x double> %vals
}

define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1sb { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT:    ret
  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
  %ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
  %vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
  %vals.sext = sext <vscale x 2 x i8> %vals to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %vals.sext
}

define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1sh { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT:    ret
  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
  %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
  %vals.sext = sext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %vals.sext
}

define <vscale x 2 x i64> @masked_sgather_nxv2i32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1sw { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT:    ret
  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
  %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
  %vals.sext = sext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %vals.sext
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; unscaled packed 32-bit offsets
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

define <vscale x 4 x i32> @masked_gather_nxv4i8(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv4i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x0, z0.s, uxtw]
; CHECK-NEXT:    ret
  %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
  %ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
  %vals = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*> %ptrs, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
  %vals.zext = zext <vscale x 4 x i8> %vals to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %vals.zext
}

define <vscale x 4 x i32> @masked_gather_nxv4i16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, z0.s, uxtw]
; CHECK-NEXT:    ret
  %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
  %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i16*>
  %vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
  %vals.zext = zext <vscale x 4 x i16> %vals to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %vals.zext
}

define <vscale x 4 x i32> @masked_gather_nxv4i32(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, z0.s, uxtw]
; CHECK-NEXT:    ret
  %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
  %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i32*>
  %vals = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
  ret <vscale x 4 x i32> %vals
}

define <vscale x 4 x half> @masked_gather_nxv4f16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv4f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, z0.s, uxtw]
; CHECK-NEXT:    ret
  %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
  %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x half*>
  %vals = call <vscale x 4 x half> @llvm.masked.gather.nxv4f16(<vscale x 4 x half*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x half> undef)
  ret <vscale x 4 x half> %vals
}

define <vscale x 4 x float> @masked_gather_nxv4f32(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, z0.s, uxtw]
; CHECK-NEXT:    ret
  %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
  %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x float*>
  %vals = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x float> undef)
  ret <vscale x 4 x float> %vals
}

define <vscale x 4 x i32> @masked_sgather_nxv4i8(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv4i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1sb { z0.s }, p0/z, [x0, z0.s, uxtw]
; CHECK-NEXT:    ret
  %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
  %ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
  %vals = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*> %ptrs, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
  %vals.sext = sext <vscale x 4 x i8> %vals to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %vals.sext
}

define <vscale x 4 x i32> @masked_sgather_nxv4i16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1sh { z0.s }, p0/z, [x0, z0.s, uxtw]
; CHECK-NEXT:    ret
  %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
  %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i16*>
  %vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
  %vals.sext = sext <vscale x 4 x i16> %vals to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %vals.sext
}

declare <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*>, i32, <vscale x 2 x i1>, <vscale x 2 x i8>)
declare <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*>, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
declare <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*>, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
declare <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*>, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
declare <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*>, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
declare <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*>, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
declare <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*>, i32, <vscale x 2 x i1>, <vscale x 2 x double>)

declare <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*>, i32, <vscale x 4 x i1>, <vscale x 4 x i8>)
declare <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*>, i32, <vscale x 4 x i1>, <vscale x 4 x i16>)
declare <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*>, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
declare <vscale x 4 x half> @llvm.masked.gather.nxv4f16(<vscale x 4 x half*>, i32, <vscale x 4 x i1>, <vscale x 4 x half>)
declare <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*>, i32, <vscale x 4 x i1>, <vscale x 4 x float>)