1 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
2 // RUN:  -ffp-contract=fast -emit-llvm -o - %s | opt -S -mem2reg \
3 // RUN:  | FileCheck %s
4 
5 // Test new aarch64 intrinsics with poly64
6 
7 #include <arm_neon.h>
8 
9 // CHECK-LABEL: define <1 x i64> @test_vceq_p64(<1 x i64> %a, <1 x i64> %b) #0 {
10 // CHECK:   [[CMP_I:%.*]] = icmp eq <1 x i64> %a, %b
11 // CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
12 // CHECK:   ret <1 x i64> [[SEXT_I]]
test_vceq_p64(poly64x1_t a,poly64x1_t b)13 uint64x1_t test_vceq_p64(poly64x1_t a, poly64x1_t b) {
14   return vceq_p64(a, b);
15 }
16 
17 // CHECK-LABEL: define <2 x i64> @test_vceqq_p64(<2 x i64> %a, <2 x i64> %b) #0 {
18 // CHECK:   [[CMP_I:%.*]] = icmp eq <2 x i64> %a, %b
19 // CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
20 // CHECK:   ret <2 x i64> [[SEXT_I]]
test_vceqq_p64(poly64x2_t a,poly64x2_t b)21 uint64x2_t test_vceqq_p64(poly64x2_t a, poly64x2_t b) {
22   return vceqq_p64(a, b);
23 }
24 
25 // CHECK-LABEL: define <1 x i64> @test_vtst_p64(<1 x i64> %a, <1 x i64> %b) #0 {
26 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
27 // CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
28 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
29 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
30 // CHECK:   [[TMP4:%.*]] = and <1 x i64> [[TMP2]], [[TMP3]]
31 // CHECK:   [[TMP5:%.*]] = icmp ne <1 x i64> [[TMP4]], zeroinitializer
32 // CHECK:   [[VTST_I:%.*]] = sext <1 x i1> [[TMP5]] to <1 x i64>
33 // CHECK:   ret <1 x i64> [[VTST_I]]
test_vtst_p64(poly64x1_t a,poly64x1_t b)34 uint64x1_t test_vtst_p64(poly64x1_t a, poly64x1_t b) {
35   return vtst_p64(a, b);
36 }
37 
38 // CHECK-LABEL: define <2 x i64> @test_vtstq_p64(<2 x i64> %a, <2 x i64> %b) #0 {
39 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
40 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
41 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
42 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
43 // CHECK:   [[TMP4:%.*]] = and <2 x i64> [[TMP2]], [[TMP3]]
44 // CHECK:   [[TMP5:%.*]] = icmp ne <2 x i64> [[TMP4]], zeroinitializer
45 // CHECK:   [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i64>
46 // CHECK:   ret <2 x i64> [[VTST_I]]
test_vtstq_p64(poly64x2_t a,poly64x2_t b)47 uint64x2_t test_vtstq_p64(poly64x2_t a, poly64x2_t b) {
48   return vtstq_p64(a, b);
49 }
50 
51 // CHECK-LABEL: define <1 x i64> @test_vbsl_p64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) #0 {
52 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
53 // CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
54 // CHECK:   [[TMP2:%.*]] = bitcast <1 x i64> %c to <8 x i8>
55 // CHECK:   [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
56 // CHECK:   [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
57 // CHECK:   [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64>
58 // CHECK:   [[VBSL3_I:%.*]] = and <1 x i64> [[VBSL_I]], [[VBSL1_I]]
59 // CHECK:   [[TMP3:%.*]] = xor <1 x i64> [[VBSL_I]], <i64 -1>
60 // CHECK:   [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], [[VBSL2_I]]
61 // CHECK:   [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]]
62 // CHECK:   ret <1 x i64> [[VBSL5_I]]
test_vbsl_p64(poly64x1_t a,poly64x1_t b,poly64x1_t c)63 poly64x1_t test_vbsl_p64(poly64x1_t a, poly64x1_t b, poly64x1_t c) {
64   return vbsl_p64(a, b, c);
65 }
66 
67 // CHECK-LABEL: define <2 x i64> @test_vbslq_p64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) #0 {
68 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
69 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
70 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i64> %c to <16 x i8>
71 // CHECK:   [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
72 // CHECK:   [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
73 // CHECK:   [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
74 // CHECK:   [[VBSL3_I:%.*]] = and <2 x i64> [[VBSL_I]], [[VBSL1_I]]
75 // CHECK:   [[TMP3:%.*]] = xor <2 x i64> [[VBSL_I]], <i64 -1, i64 -1>
76 // CHECK:   [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], [[VBSL2_I]]
77 // CHECK:   [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]]
78 // CHECK:   ret <2 x i64> [[VBSL5_I]]
test_vbslq_p64(poly64x2_t a,poly64x2_t b,poly64x2_t c)79 poly64x2_t test_vbslq_p64(poly64x2_t a, poly64x2_t b, poly64x2_t c) {
80   return vbslq_p64(a, b, c);
81 }
82 
83 // CHECK-LABEL: define i64 @test_vget_lane_p64(<1 x i64> %v) #0 {
84 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %v to <8 x i8>
85 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
86 // CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
87 // CHECK:   ret i64 [[VGET_LANE]]
test_vget_lane_p64(poly64x1_t v)88 poly64_t test_vget_lane_p64(poly64x1_t v) {
89   return vget_lane_p64(v, 0);
90 }
91 
92 // CHECK-LABEL: define i64 @test_vgetq_lane_p64(<2 x i64> %v) #0 {
93 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %v to <16 x i8>
94 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
95 // CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
96 // CHECK:   ret i64 [[VGETQ_LANE]]
test_vgetq_lane_p64(poly64x2_t v)97 poly64_t test_vgetq_lane_p64(poly64x2_t v) {
98   return vgetq_lane_p64(v, 1);
99 }
100 
101 // CHECK-LABEL: define <1 x i64> @test_vset_lane_p64(i64 %a, <1 x i64> %v) #0 {
102 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %v to <8 x i8>
103 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
104 // CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 %a, i32 0
105 // CHECK:   ret <1 x i64> [[VSET_LANE]]
test_vset_lane_p64(poly64_t a,poly64x1_t v)106 poly64x1_t test_vset_lane_p64(poly64_t a, poly64x1_t v) {
107   return vset_lane_p64(a, v, 0);
108 }
109 
110 // CHECK-LABEL: define <2 x i64> @test_vsetq_lane_p64(i64 %a, <2 x i64> %v) #0 {
111 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %v to <16 x i8>
112 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
113 // CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 %a, i32 1
114 // CHECK:   ret <2 x i64> [[VSET_LANE]]
test_vsetq_lane_p64(poly64_t a,poly64x2_t v)115 poly64x2_t test_vsetq_lane_p64(poly64_t a, poly64x2_t v) {
116   return vsetq_lane_p64(a, v, 1);
117 }
118 
119 // CHECK-LABEL: define <1 x i64> @test_vcopy_lane_p64(<1 x i64> %a, <1 x i64> %b) #0 {
120 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %b to <8 x i8>
121 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
122 // CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
123 // CHECK:   [[TMP2:%.*]] = bitcast <1 x i64> %a to <8 x i8>
124 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64>
125 // CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x i64> [[TMP3]], i64 [[VGET_LANE]], i32 0
126 // CHECK:   ret <1 x i64> [[VSET_LANE]]
test_vcopy_lane_p64(poly64x1_t a,poly64x1_t b)127 poly64x1_t test_vcopy_lane_p64(poly64x1_t a, poly64x1_t b) {
128   return vcopy_lane_p64(a, 0, b, 0);
129 
130 }
131 
132 // CHECK-LABEL: define <2 x i64> @test_vcopyq_lane_p64(<2 x i64> %a, <1 x i64> %b) #0 {
133 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %b to <8 x i8>
134 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
135 // CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
136 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i64> %a to <16 x i8>
137 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
138 // CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[VGET_LANE]], i32 1
139 // CHECK:   ret <2 x i64> [[VSET_LANE]]
test_vcopyq_lane_p64(poly64x2_t a,poly64x1_t b)140 poly64x2_t test_vcopyq_lane_p64(poly64x2_t a, poly64x1_t b) {
141   return vcopyq_lane_p64(a, 1, b, 0);
142 }
143 
144 // CHECK-LABEL: define <2 x i64> @test_vcopyq_laneq_p64(<2 x i64> %a, <2 x i64> %b) #0 {
145 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
146 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
147 // CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
148 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i64> %a to <16 x i8>
149 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
150 // CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[VGETQ_LANE]], i32 1
151 // CHECK:   ret <2 x i64> [[VSET_LANE]]
test_vcopyq_laneq_p64(poly64x2_t a,poly64x2_t b)152 poly64x2_t test_vcopyq_laneq_p64(poly64x2_t a, poly64x2_t b) {
153   return vcopyq_laneq_p64(a, 1, b, 1);
154 }
155 
156 // CHECK-LABEL: define <1 x i64> @test_vcreate_p64(i64 %a) #0 {
157 // CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <1 x i64>
158 // CHECK:   ret <1 x i64> [[TMP0]]
test_vcreate_p64(uint64_t a)159 poly64x1_t test_vcreate_p64(uint64_t a) {
160   return vcreate_p64(a);
161 }
162 
163 // CHECK-LABEL: define <1 x i64> @test_vdup_n_p64(i64 %a) #0 {
164 // CHECK:   [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0
165 // CHECK:   ret <1 x i64> [[VECINIT_I]]
test_vdup_n_p64(poly64_t a)166 poly64x1_t test_vdup_n_p64(poly64_t a) {
167   return vdup_n_p64(a);
168 }
169 // CHECK-LABEL: define <2 x i64> @test_vdupq_n_p64(i64 %a) #0 {
170 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0
171 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
172 // CHECK:   ret <2 x i64> [[VECINIT1_I]]
test_vdupq_n_p64(poly64_t a)173 poly64x2_t test_vdupq_n_p64(poly64_t a) {
174   return vdupq_n_p64(a);
175 }
176 
177 // CHECK-LABEL: define <1 x i64> @test_vmov_n_p64(i64 %a) #0 {
178 // CHECK:   [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0
179 // CHECK:   ret <1 x i64> [[VECINIT_I]]
test_vmov_n_p64(poly64_t a)180 poly64x1_t test_vmov_n_p64(poly64_t a) {
181   return vmov_n_p64(a);
182 }
183 
184 // CHECK-LABEL: define <2 x i64> @test_vmovq_n_p64(i64 %a) #0 {
185 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0
186 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
187 // CHECK:   ret <2 x i64> [[VECINIT1_I]]
test_vmovq_n_p64(poly64_t a)188 poly64x2_t test_vmovq_n_p64(poly64_t a) {
189   return vmovq_n_p64(a);
190 }
191 
192 // CHECK-LABEL: define <1 x i64> @test_vdup_lane_p64(<1 x i64> %vec) #0 {
193 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x i64> %vec, <1 x i64> %vec, <1 x i32> zeroinitializer
194 // CHECK:   ret <1 x i64> [[SHUFFLE]]
test_vdup_lane_p64(poly64x1_t vec)195 poly64x1_t test_vdup_lane_p64(poly64x1_t vec) {
196   return vdup_lane_p64(vec, 0);
197 }
198 
199 // CHECK-LABEL: define <2 x i64> @test_vdupq_lane_p64(<1 x i64> %vec) #0 {
200 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x i64> %vec, <1 x i64> %vec, <2 x i32> zeroinitializer
201 // CHECK:   ret <2 x i64> [[SHUFFLE]]
test_vdupq_lane_p64(poly64x1_t vec)202 poly64x2_t test_vdupq_lane_p64(poly64x1_t vec) {
203   return vdupq_lane_p64(vec, 0);
204 }
205 
206 // CHECK-LABEL: define <2 x i64> @test_vdupq_laneq_p64(<2 x i64> %vec) #0 {
207 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i64> %vec, <2 x i64> %vec, <2 x i32> <i32 1, i32 1>
208 // CHECK:   ret <2 x i64> [[SHUFFLE]]
test_vdupq_laneq_p64(poly64x2_t vec)209 poly64x2_t test_vdupq_laneq_p64(poly64x2_t vec) {
210   return vdupq_laneq_p64(vec, 1);
211 }
212 
213 // CHECK-LABEL: define <2 x i64> @test_vcombine_p64(<1 x i64> %low, <1 x i64> %high) #0 {
214 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %low, <1 x i64> %high, <2 x i32> <i32 0, i32 1>
215 // CHECK:   ret <2 x i64> [[SHUFFLE_I]]
test_vcombine_p64(poly64x1_t low,poly64x1_t high)216 poly64x2_t test_vcombine_p64(poly64x1_t low, poly64x1_t high) {
217   return vcombine_p64(low, high);
218 }
219 
220 // CHECK-LABEL: define <1 x i64> @test_vld1_p64(i64* %ptr) #0 {
221 // CHECK:   [[TMP0:%.*]] = bitcast i64* %ptr to i8*
222 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>*
223 // CHECK:   [[TMP2:%.*]] = load <1 x i64>, <1 x i64>* [[TMP1]]
224 // CHECK:   ret <1 x i64> [[TMP2]]
test_vld1_p64(poly64_t const * ptr)225 poly64x1_t test_vld1_p64(poly64_t const * ptr) {
226   return vld1_p64(ptr);
227 }
228 
229 // CHECK-LABEL: define <2 x i64> @test_vld1q_p64(i64* %ptr) #0 {
230 // CHECK:   [[TMP0:%.*]] = bitcast i64* %ptr to i8*
231 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>*
232 // CHECK:   [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]]
233 // CHECK:   ret <2 x i64> [[TMP2]]
test_vld1q_p64(poly64_t const * ptr)234 poly64x2_t test_vld1q_p64(poly64_t const * ptr) {
235   return vld1q_p64(ptr);
236 }
237 
238 // CHECK-LABEL: define void @test_vst1_p64(i64* %ptr, <1 x i64> %val) #0 {
239 // CHECK:   [[TMP0:%.*]] = bitcast i64* %ptr to i8*
240 // CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %val to <8 x i8>
241 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>*
242 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
243 // CHECK:   store <1 x i64> [[TMP3]], <1 x i64>* [[TMP2]]
244 // CHECK:   ret void
test_vst1_p64(poly64_t * ptr,poly64x1_t val)245 void test_vst1_p64(poly64_t * ptr, poly64x1_t val) {
246   return vst1_p64(ptr, val);
247 }
248 
249 // CHECK-LABEL: define void @test_vst1q_p64(i64* %ptr, <2 x i64> %val) #0 {
250 // CHECK:   [[TMP0:%.*]] = bitcast i64* %ptr to i8*
251 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %val to <16 x i8>
252 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>*
253 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
254 // CHECK:   store <2 x i64> [[TMP3]], <2 x i64>* [[TMP2]]
255 // CHECK:   ret void
test_vst1q_p64(poly64_t * ptr,poly64x2_t val)256 void test_vst1q_p64(poly64_t * ptr, poly64x2_t val) {
257   return vst1q_p64(ptr, val);
258 }
259 
260 // CHECK-LABEL: define %struct.poly64x1x2_t @test_vld2_p64(i64* %ptr) #0 {
261 // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8
262 // CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8
263 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
264 // CHECK:   [[TMP1:%.*]] = bitcast i64* %ptr to i8*
265 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
266 // CHECK:   [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
267 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
268 // CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
269 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x1x2_t* [[RETVAL]] to i8*
270 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
271 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
272 // CHECK:   [[TMP6:%.*]] = load %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[RETVAL]], align 8
273 // CHECK:   ret %struct.poly64x1x2_t [[TMP6]]
test_vld2_p64(poly64_t const * ptr)274 poly64x1x2_t test_vld2_p64(poly64_t const * ptr) {
275   return vld2_p64(ptr);
276 }
277 
278 // CHECK-LABEL: define %struct.poly64x2x2_t @test_vld2q_p64(i64* %ptr) #0 {
279 // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16
280 // CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16
281 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
282 // CHECK:   [[TMP1:%.*]] = bitcast i64* %ptr to i8*
283 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>*
284 // CHECK:   [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0v2i64(<2 x i64>* [[TMP2]])
285 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
286 // CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
287 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x2x2_t* [[RETVAL]] to i8*
288 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
289 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
290 // CHECK:   [[TMP6:%.*]] = load %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[RETVAL]], align 16
291 // CHECK:   ret %struct.poly64x2x2_t [[TMP6]]
test_vld2q_p64(poly64_t const * ptr)292 poly64x2x2_t test_vld2q_p64(poly64_t const * ptr) {
293   return vld2q_p64(ptr);
294 }
295 
296 // CHECK-LABEL: define %struct.poly64x1x3_t @test_vld3_p64(i64* %ptr) #0 {
297 // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8
298 // CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8
299 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
300 // CHECK:   [[TMP1:%.*]] = bitcast i64* %ptr to i8*
301 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
302 // CHECK:   [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
303 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
304 // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
305 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x1x3_t* [[RETVAL]] to i8*
306 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
307 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
308 // CHECK:   [[TMP6:%.*]] = load %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[RETVAL]], align 8
309 // CHECK:   ret %struct.poly64x1x3_t [[TMP6]]
test_vld3_p64(poly64_t const * ptr)310 poly64x1x3_t test_vld3_p64(poly64_t const * ptr) {
311   return vld3_p64(ptr);
312 }
313 
314 // CHECK-LABEL: define %struct.poly64x2x3_t @test_vld3q_p64(i64* %ptr) #0 {
315 // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16
316 // CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16
317 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
318 // CHECK:   [[TMP1:%.*]] = bitcast i64* %ptr to i8*
319 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>*
320 // CHECK:   [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0v2i64(<2 x i64>* [[TMP2]])
321 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
322 // CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
323 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x2x3_t* [[RETVAL]] to i8*
324 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
325 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
326 // CHECK:   [[TMP6:%.*]] = load %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[RETVAL]], align 16
327 // CHECK:   ret %struct.poly64x2x3_t [[TMP6]]
test_vld3q_p64(poly64_t const * ptr)328 poly64x2x3_t test_vld3q_p64(poly64_t const * ptr) {
329   return vld3q_p64(ptr);
330 }
331 
332 // CHECK-LABEL: define %struct.poly64x1x4_t @test_vld4_p64(i64* %ptr) #0 {
333 // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8
334 // CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8
335 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
336 // CHECK:   [[TMP1:%.*]] = bitcast i64* %ptr to i8*
337 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
338 // CHECK:   [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
339 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
340 // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
341 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x1x4_t* [[RETVAL]] to i8*
342 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
343 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
344 // CHECK:   [[TMP6:%.*]] = load %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[RETVAL]], align 8
345 // CHECK:   ret %struct.poly64x1x4_t [[TMP6]]
test_vld4_p64(poly64_t const * ptr)346 poly64x1x4_t test_vld4_p64(poly64_t const * ptr) {
347   return vld4_p64(ptr);
348 }
349 
350 // CHECK-LABEL: define %struct.poly64x2x4_t @test_vld4q_p64(i64* %ptr) #0 {
351 // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16
352 // CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16
353 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
354 // CHECK:   [[TMP1:%.*]] = bitcast i64* %ptr to i8*
355 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>*
356 // CHECK:   [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0v2i64(<2 x i64>* [[TMP2]])
357 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
358 // CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
359 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x2x4_t* [[RETVAL]] to i8*
360 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
361 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
362 // CHECK:   [[TMP6:%.*]] = load %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[RETVAL]], align 16
363 // CHECK:   ret %struct.poly64x2x4_t [[TMP6]]
test_vld4q_p64(poly64_t const * ptr)364 poly64x2x4_t test_vld4q_p64(poly64_t const * ptr) {
365   return vld4q_p64(ptr);
366 }
367 
368 // CHECK-LABEL: define void @test_vst2_p64(i64* %ptr, [2 x <1 x i64>] %val.coerce) #0 {
369 // CHECK:   [[VAL:%.*]] = alloca %struct.poly64x1x2_t, align 8
370 // CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 8
371 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[VAL]], i32 0, i32 0
372 // CHECK:   store [2 x <1 x i64>] [[VAL]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
373 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__S1]] to i8*
374 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x2_t* [[VAL]] to i8*
375 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
376 // CHECK:   [[TMP2:%.*]] = bitcast i64* %ptr to i8*
377 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
378 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 0
379 // CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
380 // CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
381 // CHECK:   [[VAL2:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
382 // CHECK:   [[ARRAYIDX3:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL2]], i64 0, i64 1
383 // CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX3]], align 8
384 // CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
385 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
386 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
387 // CHECK:   call void @llvm.aarch64.neon.st2.v1i64.p0i8(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i8* [[TMP2]])
388 // CHECK:   ret void
test_vst2_p64(poly64_t * ptr,poly64x1x2_t val)389 void test_vst2_p64(poly64_t * ptr, poly64x1x2_t val) {
390   return vst2_p64(ptr, val);
391 }
392 
393 // CHECK-LABEL: define void @test_vst2q_p64(i64* %ptr, [2 x <2 x i64>] %val.coerce) #0 {
394 // CHECK:   [[VAL:%.*]] = alloca %struct.poly64x2x2_t, align 16
395 // CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16
396 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[VAL]], i32 0, i32 0
397 // CHECK:   store [2 x <2 x i64>] [[VAL]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
398 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__S1]] to i8*
399 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x2_t* [[VAL]] to i8*
400 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
401 // CHECK:   [[TMP2:%.*]] = bitcast i64* %ptr to i8*
402 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
403 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 0
404 // CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
405 // CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
406 // CHECK:   [[VAL2:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
407 // CHECK:   [[ARRAYIDX3:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL2]], i64 0, i64 1
408 // CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX3]], align 16
409 // CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
410 // CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
411 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
412 // CHECK:   call void @llvm.aarch64.neon.st2.v2i64.p0i8(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i8* [[TMP2]])
413 // CHECK:   ret void
test_vst2q_p64(poly64_t * ptr,poly64x2x2_t val)414 void test_vst2q_p64(poly64_t * ptr, poly64x2x2_t val) {
415   return vst2q_p64(ptr, val);
416 }
417 
418 // CHECK-LABEL: define void @test_vst3_p64(i64* %ptr, [3 x <1 x i64>] %val.coerce) #0 {
419 // CHECK:   [[VAL:%.*]] = alloca %struct.poly64x1x3_t, align 8
420 // CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8
421 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[VAL]], i32 0, i32 0
422 // CHECK:   store [3 x <1 x i64>] [[VAL]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
423 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__S1]] to i8*
424 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x3_t* [[VAL]] to i8*
425 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
426 // CHECK:   [[TMP2:%.*]] = bitcast i64* %ptr to i8*
427 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
428 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 0
429 // CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
430 // CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
431 // CHECK:   [[VAL2:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
432 // CHECK:   [[ARRAYIDX3:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL2]], i64 0, i64 1
433 // CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX3]], align 8
434 // CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
435 // CHECK:   [[VAL4:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
436 // CHECK:   [[ARRAYIDX5:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL4]], i64 0, i64 2
437 // CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX5]], align 8
438 // CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
439 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
440 // CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
441 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
442 // CHECK:   call void @llvm.aarch64.neon.st3.v1i64.p0i8(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i8* [[TMP2]])
443 // CHECK:   ret void
test_vst3_p64(poly64_t * ptr,poly64x1x3_t val)444 void test_vst3_p64(poly64_t * ptr, poly64x1x3_t val) {
445   return vst3_p64(ptr, val);
446 }
447 
448 // CHECK-LABEL: define void @test_vst3q_p64(i64* %ptr, [3 x <2 x i64>] %val.coerce) #0 {
449 // CHECK:   [[VAL:%.*]] = alloca %struct.poly64x2x3_t, align 16
450 // CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16
451 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[VAL]], i32 0, i32 0
452 // CHECK:   store [3 x <2 x i64>] [[VAL]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
453 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__S1]] to i8*
454 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x3_t* [[VAL]] to i8*
455 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
456 // CHECK:   [[TMP2:%.*]] = bitcast i64* %ptr to i8*
457 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
458 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 0
459 // CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
460 // CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
461 // CHECK:   [[VAL2:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
462 // CHECK:   [[ARRAYIDX3:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL2]], i64 0, i64 1
463 // CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX3]], align 16
464 // CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
465 // CHECK:   [[VAL4:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
466 // CHECK:   [[ARRAYIDX5:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL4]], i64 0, i64 2
467 // CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX5]], align 16
468 // CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
469 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
470 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
471 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
472 // CHECK:   call void @llvm.aarch64.neon.st3.v2i64.p0i8(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i8* [[TMP2]])
473 // CHECK:   ret void
test_vst3q_p64(poly64_t * ptr,poly64x2x3_t val)474 void test_vst3q_p64(poly64_t * ptr, poly64x2x3_t val) {
475   return vst3q_p64(ptr, val);
476 }
477 
478 // CHECK-LABEL: define void @test_vst4_p64(i64* %ptr, [4 x <1 x i64>] %val.coerce) #0 {
479 // CHECK:   [[VAL:%.*]] = alloca %struct.poly64x1x4_t, align 8
480 // CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8
481 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[VAL]], i32 0, i32 0
482 // CHECK:   store [4 x <1 x i64>] [[VAL]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
483 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__S1]] to i8*
484 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x4_t* [[VAL]] to i8*
485 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
486 // CHECK:   [[TMP2:%.*]] = bitcast i64* %ptr to i8*
487 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
488 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 0
489 // CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
490 // CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
491 // CHECK:   [[VAL2:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
492 // CHECK:   [[ARRAYIDX3:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL2]], i64 0, i64 1
493 // CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX3]], align 8
494 // CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
495 // CHECK:   [[VAL4:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
496 // CHECK:   [[ARRAYIDX5:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL4]], i64 0, i64 2
497 // CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX5]], align 8
498 // CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
499 // CHECK:   [[VAL6:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
500 // CHECK:   [[ARRAYIDX7:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL6]], i64 0, i64 3
501 // CHECK:   [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX7]], align 8
502 // CHECK:   [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
503 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
504 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
505 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
506 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
507 // CHECK:   call void @llvm.aarch64.neon.st4.v1i64.p0i8(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i8* [[TMP2]])
508 // CHECK:   ret void
test_vst4_p64(poly64_t * ptr,poly64x1x4_t val)509 void test_vst4_p64(poly64_t * ptr, poly64x1x4_t val) {
510   return vst4_p64(ptr, val);
511 }
512 
513 // CHECK-LABEL: define void @test_vst4q_p64(i64* %ptr, [4 x <2 x i64>] %val.coerce) #0 {
514 // CHECK:   [[VAL:%.*]] = alloca %struct.poly64x2x4_t, align 16
515 // CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16
516 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[VAL]], i32 0, i32 0
517 // CHECK:   store [4 x <2 x i64>] [[VAL]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
518 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__S1]] to i8*
519 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x4_t* [[VAL]] to i8*
520 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
521 // CHECK:   [[TMP2:%.*]] = bitcast i64* %ptr to i8*
522 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
523 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 0
524 // CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
525 // CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
526 // CHECK:   [[VAL2:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
527 // CHECK:   [[ARRAYIDX3:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL2]], i64 0, i64 1
528 // CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX3]], align 16
529 // CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
530 // CHECK:   [[VAL4:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
531 // CHECK:   [[ARRAYIDX5:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL4]], i64 0, i64 2
532 // CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX5]], align 16
533 // CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
534 // CHECK:   [[VAL6:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
535 // CHECK:   [[ARRAYIDX7:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL6]], i64 0, i64 3
536 // CHECK:   [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX7]], align 16
537 // CHECK:   [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
538 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
539 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
540 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
541 // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
542 // CHECK:   call void @llvm.aarch64.neon.st4.v2i64.p0i8(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i8* [[TMP2]])
543 // CHECK:   ret void
test_vst4q_p64(poly64_t * ptr,poly64x2x4_t val)544 void test_vst4q_p64(poly64_t * ptr, poly64x2x4_t val) {
545   return vst4q_p64(ptr, val);
546 }
547 
548 // CHECK-LABEL: define <1 x i64> @test_vext_p64(<1 x i64> %a, <1 x i64> %b) #0 {
549 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
550 // CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
551 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
552 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
553 // CHECK:   [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
554 // CHECK:   ret <1 x i64> [[VEXT]]
test_vext_p64(poly64x1_t a,poly64x1_t b)555 poly64x1_t test_vext_p64(poly64x1_t a, poly64x1_t b) {
556   return vext_u64(a, b, 0);
557 
558 }
559 
560 // CHECK-LABEL: define <2 x i64> @test_vextq_p64(<2 x i64> %a, <2 x i64> %b) #0 {
561 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
562 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
563 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
564 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
565 // CHECK:   [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 1, i32 2>
566 // CHECK:   ret <2 x i64> [[VEXT]]
test_vextq_p64(poly64x2_t a,poly64x2_t b)567 poly64x2_t test_vextq_p64(poly64x2_t a, poly64x2_t b) {
568   return vextq_p64(a, b, 1);
569 }
570 
571 // CHECK-LABEL: define <2 x i64> @test_vzip1q_p64(<2 x i64> %a, <2 x i64> %b) #0 {
572 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
573 // CHECK:   ret <2 x i64> [[SHUFFLE_I]]
test_vzip1q_p64(poly64x2_t a,poly64x2_t b)574 poly64x2_t test_vzip1q_p64(poly64x2_t a, poly64x2_t b) {
575   return vzip1q_p64(a, b);
576 }
577 
578 // CHECK-LABEL: define <2 x i64> @test_vzip2q_p64(<2 x i64> %a, <2 x i64> %b) #0 {
579 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
580 // CHECK:   ret <2 x i64> [[SHUFFLE_I]]
test_vzip2q_p64(poly64x2_t a,poly64x2_t b)581 poly64x2_t test_vzip2q_p64(poly64x2_t a, poly64x2_t b) {
582   return vzip2q_u64(a, b);
583 }
584 
585 // CHECK-LABEL: define <2 x i64> @test_vuzp1q_p64(<2 x i64> %a, <2 x i64> %b) #0 {
586 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
587 // CHECK:   ret <2 x i64> [[SHUFFLE_I]]
test_vuzp1q_p64(poly64x2_t a,poly64x2_t b)588 poly64x2_t test_vuzp1q_p64(poly64x2_t a, poly64x2_t b) {
589   return vuzp1q_p64(a, b);
590 }
591 
592 // CHECK-LABEL: define <2 x i64> @test_vuzp2q_p64(<2 x i64> %a, <2 x i64> %b) #0 {
593 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
594 // CHECK:   ret <2 x i64> [[SHUFFLE_I]]
test_vuzp2q_p64(poly64x2_t a,poly64x2_t b)595 poly64x2_t test_vuzp2q_p64(poly64x2_t a, poly64x2_t b) {
596   return vuzp2q_u64(a, b);
597 }
598 
599 // CHECK-LABEL: define <2 x i64> @test_vtrn1q_p64(<2 x i64> %a, <2 x i64> %b) #0 {
600 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
601 // CHECK:   ret <2 x i64> [[SHUFFLE_I]]
test_vtrn1q_p64(poly64x2_t a,poly64x2_t b)602 poly64x2_t test_vtrn1q_p64(poly64x2_t a, poly64x2_t b) {
603   return vtrn1q_p64(a, b);
604 }
605 
606 // CHECK-LABEL: define <2 x i64> @test_vtrn2q_p64(<2 x i64> %a, <2 x i64> %b) #0 {
607 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
608 // CHECK:   ret <2 x i64> [[SHUFFLE_I]]
test_vtrn2q_p64(poly64x2_t a,poly64x2_t b)609 poly64x2_t test_vtrn2q_p64(poly64x2_t a, poly64x2_t b) {
610   return vtrn2q_u64(a, b);
611 }
612 
613 // CHECK-LABEL: define <1 x i64> @test_vsri_n_p64(<1 x i64> %a, <1 x i64> %b) #0 {
614 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
615 // CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
616 // CHECK:   [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
617 // CHECK:   [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
618 // CHECK:   [[VSRI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRI_N]], <1 x i64> [[VSRI_N1]], i32 33)
619 // CHECK:   ret <1 x i64> [[VSRI_N2]]
test_vsri_n_p64(poly64x1_t a,poly64x1_t b)620 poly64x1_t test_vsri_n_p64(poly64x1_t a, poly64x1_t b) {
621   return vsri_n_p64(a, b, 33);
622 }
623 
624 // CHECK-LABEL: define <2 x i64> @test_vsriq_n_p64(<2 x i64> %a, <2 x i64> %b) #0 {
625 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
626 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
627 // CHECK:   [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
628 // CHECK:   [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
629 // CHECK:   [[VSRI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64> [[VSRI_N]], <2 x i64> [[VSRI_N1]], i32 64)
630 // CHECK:   ret <2 x i64> [[VSRI_N2]]
test_vsriq_n_p64(poly64x2_t a,poly64x2_t b)631 poly64x2_t test_vsriq_n_p64(poly64x2_t a, poly64x2_t b) {
632   return vsriq_n_p64(a, b, 64);
633 }
634 
635