1 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-cpu cyclone \
2 // RUN:  -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
3 
4 // Test new aarch64 intrinsics and types
5 
6 #include <arm_neon.h>
7 
8 
9 // CHECK-LABEL: define float @test_vmuls_lane_f32(float %a, <2 x float> %b) #0 {
10 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %b to <8 x i8>
11 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
12 // CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
13 // CHECK:   [[MUL:%.*]] = fmul float %a, [[VGET_LANE]]
14 // CHECK:   ret float [[MUL]]
// Scalar-by-lane multiply: a times lane 1 of the 2-element vector b.
// The expected IR above is an extractelement of lane 1 followed by an fmul.
float32_t test_vmuls_lane_f32(float32_t a, float32x2_t b) {
  return vmuls_lane_f32(a, b, 1);
}
18 
19 // CHECK-LABEL: define double @test_vmuld_lane_f64(double %a, <1 x double> %b) #0 {
20 // CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %b to <8 x i8>
21 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
22 // CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
23 // CHECK:   [[MUL:%.*]] = fmul double %a, [[VGET_LANE]]
24 // CHECK:   ret double [[MUL]]
// Scalar-by-lane multiply: a times lane 0 of the 1-element vector b.
// The expected IR above is an extractelement of lane 0 followed by an fmul.
float64_t test_vmuld_lane_f64(float64_t a, float64x1_t b) {
  return vmuld_lane_f64(a, b, 0);
}
28 
29 // CHECK-LABEL: define float @test_vmuls_laneq_f32(float %a, <4 x float> %b) #0 {
30 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %b to <16 x i8>
31 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
32 // CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
33 // CHECK:   [[MUL:%.*]] = fmul float %a, [[VGETQ_LANE]]
34 // CHECK:   ret float [[MUL]]
// Scalar-by-lane multiply: a times lane 3 of the 4-element vector b.
// The expected IR above is an extractelement of lane 3 followed by an fmul.
float32_t test_vmuls_laneq_f32(float32_t a, float32x4_t b) {
  return vmuls_laneq_f32(a, b, 3);
}
38 
39 // CHECK-LABEL: define double @test_vmuld_laneq_f64(double %a, <2 x double> %b) #0 {
40 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %b to <16 x i8>
41 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
42 // CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
43 // CHECK:   [[MUL:%.*]] = fmul double %a, [[VGETQ_LANE]]
44 // CHECK:   ret double [[MUL]]
// Scalar-by-lane multiply: a times lane 1 of the 2-element vector b.
// The expected IR above is an extractelement of lane 1 followed by an fmul.
float64_t test_vmuld_laneq_f64(float64_t a, float64x2_t b) {
  return vmuld_laneq_f64(a, b, 1);
}
48 
49 // CHECK-LABEL: define <1 x double> @test_vmul_n_f64(<1 x double> %a, double %b) #0 {
50 // CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
51 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
52 // CHECK:   [[TMP2:%.*]] = bitcast <1 x double> [[TMP1]] to double
53 // CHECK:   [[TMP3:%.*]] = fmul double [[TMP2]], %b
54 // CHECK:   [[TMP4:%.*]] = bitcast double [[TMP3]] to <1 x double>
55 // CHECK:   ret <1 x double> [[TMP4]]
// Multiplies the 1-element vector a by the scalar b.
// The expected IR above bitcasts a to a double, performs a scalar fmul with
// b, and bitcasts the product back to <1 x double>.
float64x1_t test_vmul_n_f64(float64x1_t a, float64_t b) {
  return vmul_n_f64(a, b);
}
59 
60 // CHECK-LABEL: define float @test_vmulxs_lane_f32(float %a, <2 x float> %b) #0 {
61 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %b to <8 x i8>
62 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
63 // CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
64 // CHECK:   [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float %a, float [[VGET_LANE]]) #2
65 // CHECK:   ret float [[VMULXS_F32_I]]
// Scalar fmulx of a with lane 1 of the 2-element vector b.
// The expected IR above extracts lane 1 and calls
// llvm.aarch64.neon.fmulx.f32.
float32_t test_vmulxs_lane_f32(float32_t a, float32x2_t b) {
  return vmulxs_lane_f32(a, b, 1);
}
69 
70 // CHECK-LABEL: define float @test_vmulxs_laneq_f32(float %a, <4 x float> %b) #0 {
71 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %b to <16 x i8>
72 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
73 // CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
74 // CHECK:   [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float %a, float [[VGETQ_LANE]]) #2
75 // CHECK:   ret float [[VMULXS_F32_I]]
// Scalar fmulx of a with lane 3 of the 4-element vector b.
// The expected IR above extracts lane 3 and calls
// llvm.aarch64.neon.fmulx.f32.
float32_t test_vmulxs_laneq_f32(float32_t a, float32x4_t b) {
  return vmulxs_laneq_f32(a, b, 3);
}
79 
80 // CHECK-LABEL: define double @test_vmulxd_lane_f64(double %a, <1 x double> %b) #0 {
81 // CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %b to <8 x i8>
82 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
83 // CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
84 // CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double %a, double [[VGET_LANE]]) #2
85 // CHECK:   ret double [[VMULXD_F64_I]]
// Scalar fmulx of a with lane 0 of the 1-element vector b.
// The expected IR above extracts lane 0 and calls
// llvm.aarch64.neon.fmulx.f64.
float64_t test_vmulxd_lane_f64(float64_t a, float64x1_t b) {
  return vmulxd_lane_f64(a, b, 0);
}
89 
90 // CHECK-LABEL: define double @test_vmulxd_laneq_f64(double %a, <2 x double> %b) #0 {
91 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %b to <16 x i8>
92 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
93 // CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
94 // CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double %a, double [[VGETQ_LANE]]) #2
95 // CHECK:   ret double [[VMULXD_F64_I]]
// Scalar fmulx of a with lane 1 of the 2-element vector b.
// The expected IR above extracts lane 1 and calls
// llvm.aarch64.neon.fmulx.f64.
float64_t test_vmulxd_laneq_f64(float64_t a, float64x2_t b) {
  return vmulxd_laneq_f64(a, b, 1);
}
99 
100 // CHECK-LABEL: define <1 x double> @test_vmulx_lane_f64(<1 x double> %a, <1 x double> %b) #0 {
101 // CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
102 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
103 // CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
104 // CHECK:   [[TMP2:%.*]] = bitcast <1 x double> %b to <8 x i8>
105 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
106 // CHECK:   [[VGET_LANE6:%.*]] = extractelement <1 x double> [[TMP3]], i32 0
107 // CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE6]]) #2
108 // CHECK:   [[TMP4:%.*]] = bitcast <1 x double> %a to <8 x i8>
109 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
110 // CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP5]], double [[VMULXD_F64_I]], i32 0
111 // CHECK:   ret <1 x double> [[VSET_LANE]]
// Vector-form fmulx on 1-element vectors, selecting lane 0 of b.
// The expected IR above extracts lane 0 of both a and b, calls the scalar
// llvm.aarch64.neon.fmulx.f64, and inserts the result into lane 0 of a.
float64x1_t test_vmulx_lane_f64(float64x1_t a, float64x1_t b) {
  return vmulx_lane_f64(a, b, 0);
}
115 
116 
117 // CHECK-LABEL: define <1 x double> @test_vmulx_laneq_f64_0(<1 x double> %a, <2 x double> %b) #0 {
118 // CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
119 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
120 // CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
121 // CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %b to <16 x i8>
122 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
123 // CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
124 // CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]]) #2
125 // CHECK:   [[TMP4:%.*]] = bitcast <1 x double> %a to <8 x i8>
126 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
127 // CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP5]], double [[VMULXD_F64_I]], i32 0
128 // CHECK:   ret <1 x double> [[VSET_LANE]]
// Vector-form fmulx of the 1-element vector a with lane 0 of the 2-element
// vector b. The expected IR above extracts lane 0 of a and lane 0 of b,
// calls llvm.aarch64.neon.fmulx.f64, and inserts the result into lane 0 of a.
float64x1_t test_vmulx_laneq_f64_0(float64x1_t a, float64x2_t b) {
  return vmulx_laneq_f64(a, b, 0);
}
132 
133 // CHECK-LABEL: define <1 x double> @test_vmulx_laneq_f64_1(<1 x double> %a, <2 x double> %b) #0 {
134 // CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
135 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
136 // CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
137 // CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %b to <16 x i8>
138 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
139 // CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
140 // CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]]) #2
141 // CHECK:   [[TMP4:%.*]] = bitcast <1 x double> %a to <8 x i8>
142 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
143 // CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP5]], double [[VMULXD_F64_I]], i32 0
144 // CHECK:   ret <1 x double> [[VSET_LANE]]
// Vector-form fmulx of the 1-element vector a with lane 1 of the 2-element
// vector b. The expected IR above extracts lane 0 of a and lane 1 of b,
// calls llvm.aarch64.neon.fmulx.f64, and inserts the result into lane 0 of a.
float64x1_t test_vmulx_laneq_f64_1(float64x1_t a, float64x2_t b) {
  return vmulx_laneq_f64(a, b, 1);
}
148 
149 
150 // CHECK-LABEL: define float @test_vfmas_lane_f32(float %a, float %b, <2 x float> %c) #0 {
151 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %c to <8 x i8>
152 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
153 // CHECK:   [[EXTRACT:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
154 // CHECK:   [[TMP2:%.*]] = call float @llvm.fma.f32(float %b, float [[EXTRACT]], float %a)
155 // CHECK:   ret float [[TMP2]]
// Scalar fused multiply-add using lane 1 of the 2-element vector c.
// The expected IR above is llvm.fma.f32(b, c[1], a).
float32_t test_vfmas_lane_f32(float32_t a, float32_t b, float32x2_t c) {
  return vfmas_lane_f32(a, b, c, 1);
}
159 
160 // CHECK-LABEL: define double @test_vfmad_lane_f64(double %a, double %b, <1 x double> %c) #0 {
161 // CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %c to <8 x i8>
162 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
163 // CHECK:   [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
164 // CHECK:   [[TMP2:%.*]] = call double @llvm.fma.f64(double %b, double [[EXTRACT]], double %a)
165 // CHECK:   ret double [[TMP2]]
// Scalar fused multiply-add using lane 0 of the 1-element vector c.
// The expected IR above is llvm.fma.f64(b, c[0], a).
float64_t test_vfmad_lane_f64(float64_t a, float64_t b, float64x1_t c) {
  return vfmad_lane_f64(a, b, c, 0);
}
169 
170 // CHECK-LABEL: define double @test_vfmad_laneq_f64(double %a, double %b, <2 x double> %c) #0 {
171 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %c to <16 x i8>
172 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
173 // CHECK:   [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
174 // CHECK:   [[TMP2:%.*]] = call double @llvm.fma.f64(double %b, double [[EXTRACT]], double %a)
175 // CHECK:   ret double [[TMP2]]
// Scalar fused multiply-add using lane 1 of the 2-element vector c.
// The expected IR above is llvm.fma.f64(b, c[1], a).
float64_t test_vfmad_laneq_f64(float64_t a, float64_t b, float64x2_t c) {
  return vfmad_laneq_f64(a, b, c, 1);
}
179 
180 // CHECK-LABEL: define float @test_vfmss_lane_f32(float %a, float %b, <2 x float> %c) #0 {
181 // CHECK:   [[SUB:%.*]] = fsub float -0.000000e+00, %b
182 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %c to <8 x i8>
183 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
184 // CHECK:   [[EXTRACT:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
185 // CHECK:   [[TMP2:%.*]] = call float @llvm.fma.f32(float [[SUB]], float [[EXTRACT]], float %a)
186 // CHECK:   ret float [[TMP2]]
// Scalar fused multiply-subtract using lane 1 of the 2-element vector c.
// The expected IR above negates b (fsub from -0.0) and then emits
// llvm.fma.f32(-b, c[1], a).
float32_t test_vfmss_lane_f32(float32_t a, float32_t b, float32x2_t c) {
  return vfmss_lane_f32(a, b, c, 1);
}
190 
191 // CHECK-LABEL: define <1 x double> @test_vfma_lane_f64(<1 x double> %a, <1 x double> %b, <1 x double> %v) #0 {
192 // CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
193 // CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
194 // CHECK:   [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
195 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
196 // CHECK:   [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
197 // CHECK:   [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
198 // CHECK:   [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
199 // CHECK:   [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
200 // CHECK:   ret <1 x double> [[FMLA2]]
// Vector fused multiply-add on 1-element vectors, selecting lane 0 of v.
// The expected IR above forms the lane operand with a shufflevector of v and
// emits llvm.fma.v1f64(b, lane, a).
float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
  return vfma_lane_f64(a, b, v, 0);
}
204 
205 // CHECK-LABEL: define <1 x double> @test_vfms_lane_f64(<1 x double> %a, <1 x double> %b, <1 x double> %v) #0 {
206 // CHECK:   [[SUB:%.*]] = fsub <1 x double> <double -0.000000e+00>, %b
207 // CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
208 // CHECK:   [[TMP1:%.*]] = bitcast <1 x double> [[SUB]] to <8 x i8>
209 // CHECK:   [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
210 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
211 // CHECK:   [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
212 // CHECK:   [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
213 // CHECK:   [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
214 // CHECK:   [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
215 // CHECK:   ret <1 x double> [[FMLA2]]
// Vector fused multiply-subtract on 1-element vectors, selecting lane 0 of v.
// The expected IR above negates b (fsub from -0.0), forms the lane operand
// with a shufflevector of v, and emits llvm.fma.v1f64(-b, lane, a).
float64x1_t test_vfms_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
  return vfms_lane_f64(a, b, v, 0);
}
219 
220 // CHECK-LABEL: define <1 x double> @test_vfma_laneq_f64(<1 x double> %a, <1 x double> %b, <2 x double> %v) #0 {
221 // CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
222 // CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
223 // CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
224 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double
225 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double
226 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
227 // CHECK:   [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
228 // CHECK:   [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
229 // CHECK:   [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
230 // CHECK:   ret <1 x double> [[TMP7]]
// Fused multiply-add of 1-element vectors a and b with lane 0 of the
// 2-element vector v. The expected IR above takes a scalar path: a and b are
// bitcast to double, lane 0 of v is extracted, llvm.fma.f64(b, v[0], a) is
// called, and the result is bitcast back to <1 x double>.
float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
  return vfma_laneq_f64(a, b, v, 0);
}
234 
235 // CHECK-LABEL: define <1 x double> @test_vfms_laneq_f64(<1 x double> %a, <1 x double> %b, <2 x double> %v) #0 {
236 // CHECK:   [[SUB:%.*]] = fsub <1 x double> <double -0.000000e+00>, %b
237 // CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
238 // CHECK:   [[TMP1:%.*]] = bitcast <1 x double> [[SUB]] to <8 x i8>
239 // CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
240 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double
241 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double
242 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
243 // CHECK:   [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
244 // CHECK:   [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
245 // CHECK:   [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
246 // CHECK:   ret <1 x double> [[TMP7]]
// Fused multiply-subtract of 1-element vectors a and b with lane 0 of the
// 2-element vector v. The expected IR above negates b (fsub from -0.0), then
// takes the same scalar path as the fma variant: bitcasts to double,
// extractelement of lane 0 of v, llvm.fma.f64, and a bitcast back to
// <1 x double>.
float64x1_t test_vfms_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
  return vfms_laneq_f64(a, b, v, 0);
}
250 
251 // CHECK-LABEL: define i32 @test_vqdmullh_lane_s16(i16 %a, <4 x i16> %b) #0 {
252 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
253 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
254 // CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
255 // CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
256 // CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0
257 // CHECK:   [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2
258 // CHECK:   [[TMP4:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0
259 // CHECK:   ret i32 [[TMP4]]
test_vqdmullh_lane_s16(int16_t a,int16x4_t b)260 int32_t test_vqdmullh_lane_s16(int16_t a, int16x4_t b) {
261   return vqdmullh_lane_s16(a, b, 3);
262 }
263 
264 // CHECK-LABEL: define i64 @test_vqdmulls_lane_s32(i32 %a, <2 x i32> %b) #0 {
265 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
266 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
267 // CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
268 // CHECK:   [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %a, i32 [[VGET_LANE]]) #2
269 // CHECK:   ret i64 [[VQDMULLS_S32_I]]
test_vqdmulls_lane_s32(int32_t a,int32x2_t b)270 int64_t test_vqdmulls_lane_s32(int32_t a, int32x2_t b) {
271   return vqdmulls_lane_s32(a, b, 1);
272 }
273 
274 // CHECK-LABEL: define i32 @test_vqdmullh_laneq_s16(i16 %a, <8 x i16> %b) #0 {
275 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
276 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
277 // CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
278 // CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
279 // CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0
280 // CHECK:   [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2
281 // CHECK:   [[TMP4:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0
282 // CHECK:   ret i32 [[TMP4]]
test_vqdmullh_laneq_s16(int16_t a,int16x8_t b)283 int32_t test_vqdmullh_laneq_s16(int16_t a, int16x8_t b) {
284   return vqdmullh_laneq_s16(a, b, 7);
285 }
286 
287 // CHECK-LABEL: define i64 @test_vqdmulls_laneq_s32(i32 %a, <4 x i32> %b) #0 {
288 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
289 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
290 // CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
291 // CHECK:   [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %a, i32 [[VGETQ_LANE]]) #2
292 // CHECK:   ret i64 [[VQDMULLS_S32_I]]
test_vqdmulls_laneq_s32(int32_t a,int32x4_t b)293 int64_t test_vqdmulls_laneq_s32(int32_t a, int32x4_t b) {
294   return vqdmulls_laneq_s32(a, b, 3);
295 }
296 
297 // CHECK-LABEL: define i16 @test_vqdmulhh_lane_s16(i16 %a, <4 x i16> %b) #0 {
298 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
299 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
300 // CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
301 // CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
302 // CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0
303 // CHECK:   [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2
304 // CHECK:   [[TMP4:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0
305 // CHECK:   ret i16 [[TMP4]]
// Scalar vqdmulhh of a with lane 3 of the 4-element vector b.
// The expected IR above places both i16 operands in lane 0 of <4 x i16>
// vectors, calls llvm.aarch64.neon.sqdmulh.v4i16, and extracts lane 0 of the
// result.
int16_t test_vqdmulhh_lane_s16(int16_t a, int16x4_t b) {
  return vqdmulhh_lane_s16(a, b, 3);
}
309 
310 // CHECK-LABEL: define i32 @test_vqdmulhs_lane_s32(i32 %a, <2 x i32> %b) #0 {
311 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
312 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
313 // CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
314 // CHECK:   [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %a, i32 [[VGET_LANE]]) #2
315 // CHECK:   ret i32 [[VQDMULHS_S32_I]]
// Scalar vqdmulhs of a with lane 1 of the 2-element vector b.
// The expected IR above extracts lane 1 and calls
// llvm.aarch64.neon.sqdmulh.i32.
int32_t test_vqdmulhs_lane_s32(int32_t a, int32x2_t b) {
  return vqdmulhs_lane_s32(a, b, 1);
}
319 
320 
321 // CHECK-LABEL: define i16 @test_vqdmulhh_laneq_s16(i16 %a, <8 x i16> %b) #0 {
322 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
323 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
324 // CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
325 // CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
326 // CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0
327 // CHECK:   [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2
328 // CHECK:   [[TMP4:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0
329 // CHECK:   ret i16 [[TMP4]]
// Scalar vqdmulhh of a with lane 7 of the 8-element vector b.
// The expected IR above places both i16 operands in lane 0 of <4 x i16>
// vectors, calls llvm.aarch64.neon.sqdmulh.v4i16, and extracts lane 0 of the
// result.
int16_t test_vqdmulhh_laneq_s16(int16_t a, int16x8_t b) {
  return vqdmulhh_laneq_s16(a, b, 7);
}
333 
334 
335 // CHECK-LABEL: define i32 @test_vqdmulhs_laneq_s32(i32 %a, <4 x i32> %b) #0 {
336 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
337 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
338 // CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
339 // CHECK:   [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %a, i32 [[VGETQ_LANE]]) #2
340 // CHECK:   ret i32 [[VQDMULHS_S32_I]]
// Scalar vqdmulhs of a with lane 3 of the 4-element vector b.
// The expected IR above extracts lane 3 and calls
// llvm.aarch64.neon.sqdmulh.i32.
int32_t test_vqdmulhs_laneq_s32(int32_t a, int32x4_t b) {
  return vqdmulhs_laneq_s32(a, b, 3);
}
344 
345 // CHECK-LABEL: define i16 @test_vqrdmulhh_lane_s16(i16 %a, <4 x i16> %b) #0 {
346 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
347 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
348 // CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
349 // CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
350 // CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0
351 // CHECK:   [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2
352 // CHECK:   [[TMP4:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
353 // CHECK:   ret i16 [[TMP4]]
// Scalar vqrdmulhh of a with lane 3 of the 4-element vector b.
// The expected IR above places both i16 operands in lane 0 of <4 x i16>
// vectors, calls llvm.aarch64.neon.sqrdmulh.v4i16, and extracts lane 0 of
// the result.
int16_t test_vqrdmulhh_lane_s16(int16_t a, int16x4_t b) {
  return vqrdmulhh_lane_s16(a, b, 3);
}
357 
358 // CHECK-LABEL: define i32 @test_vqrdmulhs_lane_s32(i32 %a, <2 x i32> %b) #0 {
359 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
360 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
361 // CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
362 // CHECK:   [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %a, i32 [[VGET_LANE]]) #2
363 // CHECK:   ret i32 [[VQRDMULHS_S32_I]]
// Scalar vqrdmulhs of a with lane 1 of the 2-element vector b.
// The expected IR above extracts lane 1 and calls
// llvm.aarch64.neon.sqrdmulh.i32.
int32_t test_vqrdmulhs_lane_s32(int32_t a, int32x2_t b) {
  return vqrdmulhs_lane_s32(a, b, 1);
}
367 
368 
369 // CHECK-LABEL: define i16 @test_vqrdmulhh_laneq_s16(i16 %a, <8 x i16> %b) #0 {
370 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
371 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
372 // CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
373 // CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
374 // CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0
375 // CHECK:   [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2
376 // CHECK:   [[TMP4:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
377 // CHECK:   ret i16 [[TMP4]]
// Scalar vqrdmulhh of a with lane 7 of the 8-element vector b.
// The expected IR above places both i16 operands in lane 0 of <4 x i16>
// vectors, calls llvm.aarch64.neon.sqrdmulh.v4i16, and extracts lane 0 of
// the result.
int16_t test_vqrdmulhh_laneq_s16(int16_t a, int16x8_t b) {
  return vqrdmulhh_laneq_s16(a, b, 7);
}
381 
382 
383 // CHECK-LABEL: define i32 @test_vqrdmulhs_laneq_s32(i32 %a, <4 x i32> %b) #0 {
384 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
385 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
386 // CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
387 // CHECK:   [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %a, i32 [[VGETQ_LANE]]) #2
388 // CHECK:   ret i32 [[VQRDMULHS_S32_I]]
// Scalar vqrdmulhs of a with lane 3 of the 4-element vector b.
// The expected IR above extracts lane 3 and calls
// llvm.aarch64.neon.sqrdmulh.i32.
int32_t test_vqrdmulhs_laneq_s32(int32_t a, int32x4_t b) {
  return vqrdmulhs_laneq_s32(a, b, 3);
}
392 
393 // CHECK-LABEL: define i32 @test_vqdmlalh_lane_s16(i32 %a, i16 %b, <4 x i16> %c) #0 {
394 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %c to <8 x i8>
395 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
396 // CHECK:   [[LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
397 // CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
398 // CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0
399 // CHECK:   [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
400 // CHECK:   [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
401 // CHECK:   [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 [[LANE0]])
402 // CHECK:   ret i32 [[VQDMLXL1]]
// Scalar vqdmlalh: accumulates into a the vqdmull of b with lane 3 of the
// 4-element vector c. The expected IR above computes the product through
// llvm.aarch64.neon.sqdmull.v4i32 (operands in lane 0 of <4 x i16> vectors),
// extracts lane 0, and combines it with a via llvm.aarch64.neon.sqadd.i32.
int32_t test_vqdmlalh_lane_s16(int32_t a, int16_t b, int16x4_t c) {
  return vqdmlalh_lane_s16(a, b, c, 3);
}
406 
407 // CHECK-LABEL: define i64 @test_vqdmlals_lane_s32(i64 %a, i32 %b, <2 x i32> %c) #0 {
408 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %c to <8 x i8>
409 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
410 // CHECK:   [[LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
411 // CHECK:   [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
412 // CHECK:   [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 [[VQDMLXL]])
413 // CHECK:   ret i64 [[VQDMLXL1]]
// Scalar vqdmlals: accumulates into a the vqdmull of b with lane 1 of the
// 2-element vector c. The expected IR above calls
// llvm.aarch64.neon.sqdmulls.scalar and then llvm.aarch64.neon.sqadd.i64.
int64_t test_vqdmlals_lane_s32(int64_t a, int32_t b, int32x2_t c) {
  return vqdmlals_lane_s32(a, b, c, 1);
}
417 
418 // CHECK-LABEL: define i32 @test_vqdmlalh_laneq_s16(i32 %a, i16 %b, <8 x i16> %c) #0 {
419 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %c to <16 x i8>
420 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
421 // CHECK:   [[LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
422 // CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
423 // CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0
424 // CHECK:   [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
425 // CHECK:   [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
426 // CHECK:   [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 [[LANE0]])
427 // CHECK:   ret i32 [[VQDMLXL1]]
// Scalar vqdmlalh: accumulates into a the vqdmull of b with lane 7 of the
// 8-element vector c. The expected IR above computes the product through
// llvm.aarch64.neon.sqdmull.v4i32 (operands in lane 0 of <4 x i16> vectors),
// extracts lane 0, and combines it with a via llvm.aarch64.neon.sqadd.i32.
int32_t test_vqdmlalh_laneq_s16(int32_t a, int16_t b, int16x8_t c) {
  return vqdmlalh_laneq_s16(a, b, c, 7);
}
431 
432 // CHECK-LABEL: define i64 @test_vqdmlals_laneq_s32(i64 %a, i32 %b, <4 x i32> %c) #0 {
433 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %c to <16 x i8>
434 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
435 // CHECK:   [[LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
436 // CHECK:   [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
437 // CHECK:   [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 [[VQDMLXL]])
438 // CHECK:   ret i64 [[VQDMLXL1]]
// Scalar vqdmlals: accumulates into a the vqdmull of b with lane 3 of the
// 4-element vector c. The expected IR above calls
// llvm.aarch64.neon.sqdmulls.scalar and then llvm.aarch64.neon.sqadd.i64.
int64_t test_vqdmlals_laneq_s32(int64_t a, int32_t b, int32x4_t c) {
  return vqdmlals_laneq_s32(a, b, c, 3);
}
442 
443 // CHECK-LABEL: define i32 @test_vqdmlslh_lane_s16(i32 %a, i16 %b, <4 x i16> %c) #0 {
444 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %c to <8 x i8>
445 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
446 // CHECK:   [[LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
447 // CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
448 // CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0
449 // CHECK:   [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
450 // CHECK:   [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
451 // CHECK:   [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 [[LANE0]])
452 // CHECK:   ret i32 [[VQDMLXL1]]
// Scalar vqdmlslh: subtracts from a the vqdmull of b with lane 3 of the
// 4-element vector c. The expected IR above computes the product through
// llvm.aarch64.neon.sqdmull.v4i32 (operands in lane 0 of <4 x i16> vectors),
// extracts lane 0, and combines it with a via llvm.aarch64.neon.sqsub.i32.
int32_t test_vqdmlslh_lane_s16(int32_t a, int16_t b, int16x4_t c) {
  return vqdmlslh_lane_s16(a, b, c, 3);
}
456 
457 // CHECK-LABEL: define i64 @test_vqdmlsls_lane_s32(i64 %a, i32 %b, <2 x i32> %c) #0 {
458 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %c to <8 x i8>
459 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
460 // CHECK:   [[LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
461 // CHECK:   [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
462 // CHECK:   [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 [[VQDMLXL]])
463 // CHECK:   ret i64 [[VQDMLXL1]]
// Scalar vqdmlsls: subtracts from a the vqdmull of b with lane 1 of the
// 2-element vector c. The expected IR above calls
// llvm.aarch64.neon.sqdmulls.scalar and then llvm.aarch64.neon.sqsub.i64.
int64_t test_vqdmlsls_lane_s32(int64_t a, int32_t b, int32x2_t c) {
  return vqdmlsls_lane_s32(a, b, c, 1);
}
467 
468 // CHECK-LABEL: define i32 @test_vqdmlslh_laneq_s16(i32 %a, i16 %b, <8 x i16> %c) #0 {
469 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %c to <16 x i8>
470 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
471 // CHECK:   [[LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
472 // CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
473 // CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0
474 // CHECK:   [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
475 // CHECK:   [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
476 // CHECK:   [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 [[LANE0]])
477 // CHECK:   ret i32 [[VQDMLXL1]]
// Scalar vqdmlslh: subtracts from a the vqdmull of b with lane 7 of the
// 8-element vector c. The expected IR above computes the product through
// llvm.aarch64.neon.sqdmull.v4i32 (operands in lane 0 of <4 x i16> vectors),
// extracts lane 0, and combines it with a via llvm.aarch64.neon.sqsub.i32.
int32_t test_vqdmlslh_laneq_s16(int32_t a, int16_t b, int16x8_t c) {
  return vqdmlslh_laneq_s16(a, b, c, 7);
}
481 
482 // CHECK-LABEL: define i64 @test_vqdmlsls_laneq_s32(i64 %a, i32 %b, <4 x i32> %c) #0 {
483 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %c to <16 x i8>
484 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
485 // CHECK:   [[LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
486 // CHECK:   [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
487 // CHECK:   [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 [[VQDMLXL]])
488 // CHECK:   ret i64 [[VQDMLXL1]]
// Scalar vqdmlsls: subtracts from a the vqdmull of b with lane 3 of the
// 4-element vector c. The expected IR above calls
// llvm.aarch64.neon.sqdmulls.scalar and then llvm.aarch64.neon.sqsub.i64.
int64_t test_vqdmlsls_laneq_s32(int64_t a, int32_t b, int32x4_t c) {
  return vqdmlsls_laneq_s32(a, b, c, 3);
}
492 
493 // CHECK-LABEL: define <1 x double> @test_vmulx_lane_f64_0() #0 {
494 // CHECK:   [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double>
495 // CHECK:   [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double>
496 // CHECK:   [[TMP2:%.*]] = bitcast <1 x double> [[TMP0]] to <8 x i8>
497 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
498 // CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP3]], i32 0
499 // CHECK:   [[TMP4:%.*]] = bitcast <1 x double> [[TMP1]] to <8 x i8>
500 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
501 // CHECK:   [[VGET_LANE7:%.*]] = extractelement <1 x double> [[TMP5]], i32 0
502 // CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE7]]) #2
503 // CHECK:   [[TMP6:%.*]] = bitcast <1 x double> [[TMP0]] to <8 x i8>
504 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
505 // CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP7]], double [[VMULXD_F64_I]], i32 0
506 // CHECK:   ret <1 x double> [[VSET_LANE]]
test_vmulx_lane_f64_0()507 float64x1_t test_vmulx_lane_f64_0() {
508       float64x1_t arg1;
509       float64x1_t arg2;
510       float64x1_t result;
511       float64_t sarg1, sarg2, sres;
512       arg1 = vcreate_f64(UINT64_C(0x3fd6304bc43ab5c2));
513       arg2 = vcreate_f64(UINT64_C(0x3fee211e215aeef3));
514       result = vmulx_lane_f64(arg1, arg2, 0);
515       return result;
516 }
517 
518 // CHECK-LABEL: define <1 x double> @test_vmulx_laneq_f64_2() #0 {
519 // CHECK:   [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double>
520 // CHECK:   [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double>
521 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <1 x double> [[TMP0]], <1 x double> [[TMP1]], <2 x i32> <i32 0, i32 1>
522 // CHECK:   [[TMP2:%.*]] = bitcast <1 x double> [[TMP0]] to <8 x i8>
523 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
524 // CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP3]], i32 0
525 // CHECK:   [[TMP4:%.*]] = bitcast <2 x double> [[SHUFFLE_I]] to <16 x i8>
526 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
527 // CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
528 // CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]]) #2
529 // CHECK:   [[TMP6:%.*]] = bitcast <1 x double> [[TMP0]] to <8 x i8>
530 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
531 // CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP7]], double [[VMULXD_F64_I]], i32 0
532 // CHECK:   ret <1 x double> [[VSET_LANE]]
test_vmulx_laneq_f64_2()533 float64x1_t test_vmulx_laneq_f64_2() {
534       float64x1_t arg1;
535       float64x1_t arg2;
536       float64x2_t arg3;
537       float64x1_t result;
538       float64_t sarg1, sarg2, sres;
539       arg1 = vcreate_f64(UINT64_C(0x3fd6304bc43ab5c2));
540       arg2 = vcreate_f64(UINT64_C(0x3fee211e215aeef3));
541       arg3 = vcombine_f64(arg1, arg2);
542       result = vmulx_laneq_f64(arg1, arg3, 1);
543       return result;
544 }
545