; Checks instruction selection of SQRDMLAH/SQRDMLSH on AArch64: once without
; v8.1a (only the separate sqrdmulh is expected) and twice with +v8.1a, using
; the generic and the apple NEON assembly syntax.
; The plain CHECK prefix is enabled on every run; without it the per-function
; label directives in this file belong to no active prefix and are ignored.
; RUN: llc < %s -verify-machineinstrs -march=arm64 -aarch64-neon-syntax=generic | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-V8a
; RUN: llc < %s -verify-machineinstrs -march=arm64 -mattr=+v8.1a -aarch64-neon-syntax=generic | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-V81a
; RUN: llc < %s -verify-machineinstrs -march=arm64 -mattr=+v8.1a -aarch64-neon-syntax=apple | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-V81a-apple
4
5declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>)
6declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>)
7declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>)
8declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
9declare i32 @llvm.aarch64.neon.sqrdmulh.i32(i32, i32)
10declare i16 @llvm.aarch64.neon.sqrdmulh.i16(i16, i16)
11
12declare <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16>, <4 x i16>)
13declare <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16>, <8 x i16>)
14declare <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32>, <2 x i32>)
15declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
16declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32)
17declare i16 @llvm.aarch64.neon.sqadd.i16(i16, i16)
18
19declare <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16>, <4 x i16>)
20declare <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16>, <8 x i16>)
21declare <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32>, <2 x i32>)
22declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
23declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32)
24declare i16 @llvm.aarch64.neon.sqsub.i16(i16, i16)
25
26;-----------------------------------------------------------------------------
27; RDMA Vector
28; test for SIMDThreeSameVectorSQRDMLxHTiedHS
29
; <4 x i16>: sqadd(acc, sqrdmulh(mhs, rhs)). The v8.1a runs expect a single
; sqrdmlah; the v8.0a run expects the standalone sqrdmulh.
define <4 x i16> @test_sqrdmlah_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
; CHECK-LABEL: test_sqrdmlah_v4i16:
  %mulh = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs)
  %res = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %mulh)
; CHECK-V8a:        sqrdmulh    v1.4h, v1.4h, v2.4h
; CHECK-V81a:       sqrdmlah    v0.4h, v1.4h, v2.4h
; CHECK-V81a-apple: sqrdmlah.4h v0,    v1,    v2
  ret <4 x i16> %res
}
39
; <8 x i16>: sqadd(acc, sqrdmulh(mhs, rhs)). The v8.1a runs expect a single
; sqrdmlah; the v8.0a run expects the standalone sqrdmulh.
define <8 x i16> @test_sqrdmlah_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
; CHECK-LABEL: test_sqrdmlah_v8i16:
  %mulh = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
  %res = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %mulh)
; CHECK-V8a:        sqrdmulh    v1.8h, v1.8h, v2.8h
; CHECK-V81a:       sqrdmlah    v0.8h, v1.8h, v2.8h
; CHECK-V81a-apple: sqrdmlah.8h v0, v1, v2
  ret <8 x i16> %res
}
49
; <2 x i32>: sqadd(acc, sqrdmulh(mhs, rhs)). The v8.1a runs expect a single
; sqrdmlah; the v8.0a run expects the standalone sqrdmulh.
define <2 x i32> @test_sqrdmlah_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
; CHECK-LABEL: test_sqrdmlah_v2i32:
  %mulh = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
  %res = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %mulh)
; CHECK-V8a:        sqrdmulh    v1.2s, v1.2s, v2.2s
; CHECK-V81a:       sqrdmlah    v0.2s, v1.2s, v2.2s
; CHECK-V81a-apple: sqrdmlah.2s v0,    v1,    v2
  ret <2 x i32> %res
}
59
; <4 x i32>: sqadd(acc, sqrdmulh(mhs, rhs)). The v8.1a runs expect a single
; sqrdmlah; the v8.0a run expects the standalone sqrdmulh.
; The v8.0a directive previously used a prefix that no run enables, so it was
; never evaluated; it now uses the V8a prefix like the sibling tests.
define <4 x i32> @test_sqrdmlah_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
; CHECK-LABEL: test_sqrdmlah_v4i32:
   %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
   %retval =  call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
; CHECK-V8a:        sqrdmulh    v1.4s, v1.4s, v2.4s
; CHECK-V81a:       sqrdmlah    v0.4s, v1.4s, v2.4s
; CHECK-V81a-apple: sqrdmlah.4s v0,    v1,    v2
   ret <4 x i32> %retval
}
69
; <4 x i16>: sqsub(acc, sqrdmulh(mhs, rhs)). The v8.1a runs expect a single
; sqrdmlsh; the v8.0a run expects the standalone sqrdmulh.
define <4 x i16> @test_sqrdmlsh_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
; CHECK-LABEL: test_sqrdmlsh_v4i16:
  %mulh = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs)
  %res = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %mulh)
; CHECK-V8a:        sqrdmulh    v1.4h, v1.4h, v2.4h
; CHECK-V81a:       sqrdmlsh    v0.4h, v1.4h, v2.4h
; CHECK-V81a-apple: sqrdmlsh.4h v0,    v1,    v2
  ret <4 x i16> %res
}
79
; <8 x i16>: sqsub(acc, sqrdmulh(mhs, rhs)). The v8.1a runs expect a single
; sqrdmlsh; the v8.0a run expects the standalone sqrdmulh.
define <8 x i16> @test_sqrdmlsh_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
; CHECK-LABEL: test_sqrdmlsh_v8i16:
  %mulh = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
  %res = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %mulh)
; CHECK-V8a:        sqrdmulh    v1.8h, v1.8h, v2.8h
; CHECK-V81a:       sqrdmlsh    v0.8h, v1.8h, v2.8h
; CHECK-V81a-apple: sqrdmlsh.8h v0,    v1,    v2
  ret <8 x i16> %res
}
89
; <2 x i32>: sqsub(acc, sqrdmulh(mhs, rhs)). The v8.1a runs expect a single
; sqrdmlsh; the v8.0a run expects the standalone sqrdmulh.
define <2 x i32> @test_sqrdmlsh_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
; CHECK-LABEL: test_sqrdmlsh_v2i32:
  %mulh = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
  %res = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %mulh)
; CHECK-V8a:        sqrdmulh    v1.2s, v1.2s, v2.2s
; CHECK-V81a:       sqrdmlsh    v0.2s, v1.2s, v2.2s
; CHECK-V81a-apple: sqrdmlsh.2s v0,    v1,    v2
  ret <2 x i32> %res
}
99
; <4 x i32>: sqsub(acc, sqrdmulh(mhs, rhs)). The v8.1a runs expect a single
; sqrdmlsh; the v8.0a run expects the standalone sqrdmulh.
define <4 x i32> @test_sqrdmlsh_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
; CHECK-LABEL: test_sqrdmlsh_v4i32:
  %mulh = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
  %res = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %mulh)
; CHECK-V8a:        sqrdmulh    v1.4s, v1.4s, v2.4s
; CHECK-V81a:       sqrdmlsh    v0.4s, v1.4s, v2.4s
; CHECK-V81a-apple: sqrdmlsh.4s v0,    v1,    v2
  ret <4 x i32> %res
}
109
110;-----------------------------------------------------------------------------
111; RDMA Vector, by element
112; tests for vXiYY_indexed in SIMDIndexedSQRDMLxHSDTied
113
; <4 x i16> by element: lane 3 of %v is splatted, multiplied with %x via
; sqrdmulh and accumulated with sqadd. The v8.1a runs expect the indexed
; sqrdmlah form; the v8.0a run expects the indexed sqrdmulh.
; The v8.0a directive previously had a space before its colon, which made
; FileCheck skip it; the space is removed so the check is evaluated.
define <4 x i16> @test_sqrdmlah_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) {
; CHECK-LABEL: test_sqrdmlah_lane_s16:
entry:
  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
  %retval =  call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %prod)
; CHECK-V8a:        sqrdmulh    v1.4h, v1.4h, v2.h[3]
; CHECK-V81a:       sqrdmlah    v0.4h, v1.4h, v2.h[3]
; CHECK-V81a-apple: sqrdmlah.4h v0,    v1,    v2[3]
  ret <4 x i16> %retval
}
125
; <8 x i16> by element: splat of lane 2 of %v feeds sqrdmulh, whose result is
; accumulated with sqadd. The v8.1a runs expect the indexed sqrdmlah.
define <8 x i16> @test_sqrdmlahq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) {
; CHECK-LABEL: test_sqrdmlahq_lane_s16:
entry:
  %splat = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %mulh = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %splat)
  %res = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %mulh)
; CHECK-V8a:        sqrdmulh    v1.8h, v1.8h, v2.h[2]
; CHECK-V81a:       sqrdmlah    v0.8h, v1.8h, v2.h[2]
; CHECK-V81a-apple: sqrdmlah.8h v0,    v1,    v2[2]
  ret <8 x i16> %res
}
137
; <2 x i32> by element: splat of lane 1 of %v feeds sqrdmulh, whose result is
; accumulated with sqadd. The v8.1a runs expect the indexed sqrdmlah.
define <2 x i32> @test_sqrdmlah_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) {
; CHECK-LABEL: test_sqrdmlah_lane_s32:
entry:
  %splat = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
  %mulh = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %splat)
  %res = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %mulh)
; CHECK-V8a:        sqrdmulh    v1.2s, v1.2s, v2.s[1]
; CHECK-V81a:       sqrdmlah    v0.2s, v1.2s, v2.s[1]
; CHECK-V81a-apple: sqrdmlah.2s v0,    v1,    v2[1]
  ret <2 x i32> %res
}
149
; <4 x i32> by element: splat of lane 0 of %v feeds sqrdmulh, whose result is
; accumulated with sqadd. The v8.1a runs expect the indexed sqrdmlah.
define <4 x i32> @test_sqrdmlahq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <4 x i32> %v) {
; CHECK-LABEL: test_sqrdmlahq_lane_s32:
entry:
  %splat = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
  %mulh = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %splat)
  %res = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %mulh)
; CHECK-V8a:        sqrdmulh    v1.4s, v1.4s, v2.s[0]
; CHECK-V81a:       sqrdmlah    v0.4s, v1.4s, v2.s[0]
; CHECK-V81a-apple: sqrdmlah.4s v0,    v1,    v2[0]
  ret <4 x i32> %res
}
161
; <4 x i16> by element: splat of lane 3 of %v feeds sqrdmulh, whose result is
; subtracted from %acc with sqsub. The v8.1a runs expect the indexed sqrdmlsh.
define <4 x i16> @test_sqrdmlsh_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) {
; CHECK-LABEL: test_sqrdmlsh_lane_s16:
entry:
  %splat = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  %mulh = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %splat)
  %res = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %mulh)
; CHECK-V8a:        sqrdmulh    v1.4h, v1.4h, v2.h[3]
; CHECK-V81a:       sqrdmlsh    v0.4h, v1.4h, v2.h[3]
; CHECK-V81a-apple: sqrdmlsh.4h v0,    v1,    v2[3]
  ret <4 x i16> %res
}
173
; <8 x i16> by element: splat of lane 2 of %v feeds sqrdmulh, whose result is
; subtracted from %acc with sqsub. The v8.1a runs expect the indexed sqrdmlsh.
define <8 x i16> @test_sqrdmlshq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) {
; CHECK-LABEL: test_sqrdmlshq_lane_s16:
entry:
  %splat = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %mulh = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %splat)
  %res = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %mulh)
; CHECK-V8a:        sqrdmulh    v1.8h, v1.8h, v2.h[2]
; CHECK-V81a:       sqrdmlsh    v0.8h, v1.8h, v2.h[2]
; CHECK-V81a-apple: sqrdmlsh.8h v0,    v1,    v2[2]
  ret <8 x i16> %res
}
185
; <2 x i32> by element: splat of lane 1 of %v feeds sqrdmulh, whose result is
; subtracted from %acc with sqsub. The v8.1a runs expect the indexed sqrdmlsh.
define <2 x i32> @test_sqrdmlsh_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) {
; CHECK-LABEL: test_sqrdmlsh_lane_s32:
entry:
  %splat = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
  %mulh = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %splat)
  %res = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %mulh)
; CHECK-V8a:        sqrdmulh    v1.2s, v1.2s, v2.s[1]
; CHECK-V81a:       sqrdmlsh    v0.2s, v1.2s, v2.s[1]
; CHECK-V81a-apple: sqrdmlsh.2s v0,    v1,    v2[1]
  ret <2 x i32> %res
}
197
; <4 x i32> by element: splat of lane 0 of %v feeds sqrdmulh, whose result is
; subtracted from %acc with sqsub. The v8.1a runs expect the indexed sqrdmlsh.
define <4 x i32> @test_sqrdmlshq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <4 x i32> %v) {
; CHECK-LABEL: test_sqrdmlshq_lane_s32:
entry:
  %splat = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
  %mulh = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %splat)
  %res = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %mulh)
; CHECK-V8a:        sqrdmulh    v1.4s, v1.4s, v2.s[0]
; CHECK-V81a:       sqrdmlsh    v0.4s, v1.4s, v2.s[0]
; CHECK-V81a-apple: sqrdmlsh.4s v0,    v1,    v2[0]
  ret <4 x i32> %res
}
209
210;-----------------------------------------------------------------------------
211; RDMA Vector, by element, extracted
212; i16 tests are for vXi16_indexed in SIMDIndexedSQRDMLxHSDTied, with IR in ACLE style
213; i32 tests are for   "def : Pat" in SIMDIndexedSQRDMLxHSDTied
214
; Scalar i16 accumulator with <4 x i16> operands: %acc is inserted into lane 0
; of an otherwise undef vector, sqadd is applied to the lane-1 sqrdmulh
; product, and lane 0 of the sum is returned.
define i16 @test_sqrdmlah_extracted_lane_s16(i16 %acc,<4 x i16> %x, <4 x i16> %v) {
; CHECK-LABEL: test_sqrdmlah_extracted_lane_s16:
entry:
  %splat = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %mulh = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %splat)
  %acc.v = insertelement <4 x i16> undef, i16 %acc, i64 0
  %sum.v = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc.v, <4 x i16> %mulh)
  %res = extractelement <4 x i16> %sum.v, i64 0
; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4h, v0.4h, v1.h[1]
; CHECK-V81a:       sqrdmlah    {{v[2-9]+}}.4h, v0.4h, v1.h[1]
; CHECK-V81a-apple: sqrdmlah.4h {{v[2-9]+}},    v0,    v1[1]
  ret i16 %res
}
228
; Scalar i16 accumulator with <8 x i16> operands: %acc is inserted into lane 0
; of an otherwise undef vector, sqadd is applied to the lane-1 sqrdmulh
; product, and lane 0 of the sum is returned.
define i16 @test_sqrdmlahq_extracted_lane_s16(i16 %acc,<8 x i16> %x, <8 x i16> %v) {
; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s16:
entry:
  %splat = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %mulh = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %splat)
  %acc.v = insertelement <8 x i16> undef, i16 %acc, i64 0
  %sum.v = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc.v, <8 x i16> %mulh)
  %res = extractelement <8 x i16> %sum.v, i64 0
; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.8h, v0.8h, v1.h[1]
; CHECK-V81a:       sqrdmlah    {{v[2-9]+}}.8h, v0.8h, v1.h[1]
; CHECK-V81a-apple: sqrdmlah.8h {{v[2-9]+}},    v0,    v1[1]
  ret i16 %res
}
242
; Scalar i32 accumulator with <2 x i32> operands: lane 0 of the by-element
; sqrdmulh product is extracted and combined with %acc by the scalar sqadd.
define i32 @test_sqrdmlah_extracted_lane_s32(i32 %acc,<2 x i32> %x, <2 x i32> %v) {
; CHECK-LABEL: test_sqrdmlah_extracted_lane_s32:
entry:
  %splat = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
  %mulh = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %splat)
  %lane0 = extractelement <2 x i32> %mulh, i64 0
  %res = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %lane0)
; CHECK-V8a:        sqrdmulh    v0.2s, v0.2s, v1.s[0]
; CHECK-V81a:       sqrdmlah    v2.2s, v0.2s, v1.s[0]
; CHECK-V81a-apple: sqrdmlah.2s v2,    v0,    v1[0]
  ret i32 %res
}
255
; Scalar i32 accumulator with <4 x i32> operands: lane 0 of the by-element
; sqrdmulh product is extracted and combined with %acc by the scalar sqadd.
define i32 @test_sqrdmlahq_extracted_lane_s32(i32 %acc,<4 x i32> %x, <4 x i32> %v) {
; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s32:
entry:
  %splat = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
  %mulh = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %splat)
  %lane0 = extractelement <4 x i32> %mulh, i64 0
  %res = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %lane0)
; CHECK-V8a:        sqrdmulh    v0.4s, v0.4s, v1.s[0]
; CHECK-V81a:       sqrdmlah    v2.4s, v0.4s, v1.s[0]
; CHECK-V81a-apple: sqrdmlah.4s v2,    v0,    v1[0]
  ret i32 %res
}
268
; Scalar i16 accumulator with <4 x i16> operands: %acc is inserted into lane 0
; of an otherwise undef vector, sqsub is applied with the lane-1 sqrdmulh
; product, and lane 0 of the difference is returned.
define i16 @test_sqrdmlsh_extracted_lane_s16(i16 %acc,<4 x i16> %x, <4 x i16> %v) {
; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s16:
entry:
  %splat = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %mulh = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %splat)
  %acc.v = insertelement <4 x i16> undef, i16 %acc, i64 0
  %diff.v = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc.v, <4 x i16> %mulh)
  %res = extractelement <4 x i16> %diff.v, i64 0
; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4h, v0.4h, v1.h[1]
; CHECK-V81a:       sqrdmlsh    {{v[2-9]+}}.4h, v0.4h, v1.h[1]
; CHECK-V81a-apple: sqrdmlsh.4h {{v[2-9]+}},    v0,    v1[1]
  ret i16 %res
}
282
; Scalar i16 accumulator with <8 x i16> operands: %acc is inserted into lane 0
; of an otherwise undef vector, sqsub is applied with the lane-1 sqrdmulh
; product, and lane 0 of the difference is returned.
define i16 @test_sqrdmlshq_extracted_lane_s16(i16 %acc,<8 x i16> %x, <8 x i16> %v) {
; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s16:
entry:
  %splat = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %mulh = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %splat)
  %acc.v = insertelement <8 x i16> undef, i16 %acc, i64 0
  %diff.v = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc.v, <8 x i16> %mulh)
  %res = extractelement <8 x i16> %diff.v, i64 0
; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.8h, v0.8h, v1.h[1]
; CHECK-V81a:       sqrdmlsh    {{v[2-9]+}}.8h, v0.8h, v1.h[1]
; CHECK-V81a-apple: sqrdmlsh.8h {{v[2-9]+}},    v0,    v1[1]
  ret i16 %res
}
296
; Scalar i32 accumulator with <2 x i32> operands: lane 0 of the by-element
; sqrdmulh product is extracted and subtracted from %acc by the scalar sqsub.
define i32 @test_sqrdmlsh_extracted_lane_s32(i32 %acc,<2 x i32> %x, <2 x i32> %v) {
; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s32:
entry:
  %splat = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
  %mulh = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %splat)
  %lane0 = extractelement <2 x i32> %mulh, i64 0
  %res = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %lane0)
; CHECK-V8a:        sqrdmulh    v0.2s, v0.2s, v1.s[0]
; CHECK-V81a:       sqrdmlsh    v2.2s, v0.2s, v1.s[0]
; CHECK-V81a-apple: sqrdmlsh.2s v2,    v0,    v1[0]
  ret i32 %res
}
309
; Scalar i32 accumulator with <4 x i32> operands: lane 0 of the by-element
; sqrdmulh product is extracted and subtracted from %acc by the scalar sqsub.
define i32 @test_sqrdmlshq_extracted_lane_s32(i32 %acc,<4 x i32> %x, <4 x i32> %v) {
; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s32:
entry:
  %splat = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
  %mulh = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %splat)
  %lane0 = extractelement <4 x i32> %mulh, i64 0
  %res = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %lane0)
; CHECK-V8a:        sqrdmulh    v0.4s, v0.4s, v1.s[0]
; CHECK-V81a:       sqrdmlsh    v2.4s, v0.4s, v1.s[0]
; CHECK-V81a-apple: sqrdmlsh.4s v2,    v0,    v1[0]
  ret i32 %res
}
322
323;-----------------------------------------------------------------------------
324; RDMA Scalar
; test for "def : Pat" near SIMDThreeScalarHSTied in AArch64InstrInfo.td
326
; Scalar i16 operands: each is placed in lane 0 of an undef <4 x i16>, the
; vector sqrdmulh and sqadd are applied, and lane 0 of the sum is returned.
define i16 @test_sqrdmlah_v1i16(i16 %acc, i16 %x, i16 %y) {
; CHECK-LABEL: test_sqrdmlah_v1i16:
  %x.v = insertelement <4 x i16> undef, i16 %x, i64 0
  %y.v = insertelement <4 x i16> undef, i16 %y, i64 0
  %mulh.v = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x.v, <4 x i16> %y.v)
  %acc.v = insertelement <4 x i16> undef, i16 %acc, i64 0
  %sum.v = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc.v, <4 x i16> %mulh.v)
  %res = extractelement <4 x i16> %sum.v, i64 0
; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
; CHECK-V81a:       sqrdmlah    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
; CHECK-V81a-apple: sqrdmlah.4h {{v[0-9]+}},    {{v[0-9]+}},    {{v[0-9]+}}
  ret i16 %res
}
340
; Scalar i32 operands: each is placed in lane 0 of an undef <4 x i32>, the
; vector sqrdmulh and sqadd are applied, and lane 0 of the sum is returned.
define i32 @test_sqrdmlah_v1i32(i32 %acc, i32 %x, i32 %y) {
; CHECK-LABEL: test_sqrdmlah_v1i32:
  %x.v = insertelement <4 x i32> undef, i32 %x, i64 0
  %y.v = insertelement <4 x i32> undef, i32 %y, i64 0
  %mulh.v = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x.v, <4 x i32> %y.v)
  %acc.v = insertelement <4 x i32> undef, i32 %acc, i64 0
  %sum.v = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc.v, <4 x i32> %mulh.v)
  %res = extractelement <4 x i32> %sum.v, i64 0
; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK-V81a:       sqrdmlah    {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK-V81a-apple: sqrdmlah.4s {{v[0-9]+}},    {{v[0-9]+}},    {{v[0-9]+}}
  ret i32 %res
}
354
355
; Scalar i16 operands: each is placed in lane 0 of an undef <4 x i16>, the
; vector sqrdmulh and sqsub are applied, and lane 0 of the difference is
; returned.
define i16 @test_sqrdmlsh_v1i16(i16 %acc, i16 %x, i16 %y) {
; CHECK-LABEL: test_sqrdmlsh_v1i16:
  %x.v = insertelement <4 x i16> undef, i16 %x, i64 0
  %y.v = insertelement <4 x i16> undef, i16 %y, i64 0
  %mulh.v = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x.v, <4 x i16> %y.v)
  %acc.v = insertelement <4 x i16> undef, i16 %acc, i64 0
  %diff.v = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc.v, <4 x i16> %mulh.v)
  %res = extractelement <4 x i16> %diff.v, i64 0
; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
; CHECK-V81a:       sqrdmlsh    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
; CHECK-V81a-apple: sqrdmlsh.4h {{v[0-9]+}},    {{v[0-9]+}},    {{v[0-9]+}}
  ret i16 %res
}
369
; Scalar i32 operands: each is placed in lane 0 of an undef <4 x i32>, the
; vector sqrdmulh and sqsub are applied, and lane 0 of the difference is
; returned.
define i32 @test_sqrdmlsh_v1i32(i32 %acc, i32 %x, i32 %y) {
; CHECK-LABEL: test_sqrdmlsh_v1i32:
  %x.v = insertelement <4 x i32> undef, i32 %x, i64 0
  %y.v = insertelement <4 x i32> undef, i32 %y, i64 0
  %mulh.v = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x.v, <4 x i32> %y.v)
  %acc.v = insertelement <4 x i32> undef, i32 %acc, i64 0
  %diff.v = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc.v, <4 x i32> %mulh.v)
  %res = extractelement <4 x i32> %diff.v, i64 0
; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK-V81a:       sqrdmlsh    {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK-V81a-apple: sqrdmlsh.4s {{v[0-9]+}},    {{v[0-9]+}},    {{v[0-9]+}}
  ret i32 %res
}
; Pure scalar i32: sqadd.i32(acc, sqrdmulh.i32(mhs, rhs)). The v8.1a runs
; expect the scalar sqrdmlah on s-registers.
define i32 @test_sqrdmlah_i32(i32 %acc, i32 %mhs, i32 %rhs) {
; CHECK-LABEL: test_sqrdmlah_i32:
  %mulh = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %rhs)
  %res = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %mulh)
; CHECK-V8a:        sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
; CHECK-V81a:       sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
; CHECK-V81a-apple: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
  ret i32 %res
}
392
; Pure scalar i32: sqsub.i32(acc, sqrdmulh.i32(mhs, rhs)). The v8.1a runs
; expect the scalar sqrdmlsh on s-registers.
define i32 @test_sqrdmlsh_i32(i32 %acc, i32 %mhs, i32 %rhs) {
; CHECK-LABEL: test_sqrdmlsh_i32:
  %mulh = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %rhs)
  %res = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %mulh)
; CHECK-V8a:        sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
; CHECK-V81a:       sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
; CHECK-V81a-apple: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
  ret i32 %res
}
402
403;-----------------------------------------------------------------------------
404; RDMA Scalar, by element
405; i16 tests are performed via tests in above chapter, with IR in ACLE style
406; i32 tests are for i32_indexed in SIMDIndexedSQRDMLxHSDTied
407
; Scalar i16 %acc and %x with a <4 x i16> element operand: lane 1 of %y_vec is
; splatted, %x and %acc are each inserted into lane 0 of an undef vector, and
; lane 0 of sqadd(acc, sqrdmulh(x, splat)) is returned.
define i16 @test_sqrdmlah_extract_i16(i16 %acc, i16 %x, <4 x i16> %y_vec) {
; CHECK-LABEL: test_sqrdmlah_extract_i16:
  %splat = shufflevector <4 x i16> %y_vec, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %x.v = insertelement <4 x i16> undef, i16 %x, i64 0
  %mulh = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x.v, <4 x i16> %splat)
  %acc.v = insertelement <4 x i16> undef, i16 %acc, i64 0
  %sum.v = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc.v, <4 x i16> %mulh)
  %res = extractelement <4 x i16> %sum.v, i32 0
; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, v0.h[1]
; CHECK-V81a:       sqrdmlah    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, v0.h[1]
; CHECK-V81a-apple: sqrdmlah.4h {{v[0-9]+}},    {{v[0-9]+}}, v0[1]
  ret i16 %res
}
421
; Scalar i32 by element: lane 3 of %rhs is extracted and fed to the scalar
; sqrdmulh, whose result is accumulated into %acc by the scalar sqadd.
define i32 @test_sqrdmlah_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) {
; CHECK-LABEL: test_sqrdmlah_extract_i32:
  %lane3 = extractelement <4 x i32> %rhs, i32 3
  %mulh = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %lane3)
  %res = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %mulh)
; CHECK-V8a:        sqrdmulh   {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
; CHECK-V81a:       sqrdmlah   {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
; CHECK-V81a-apple: sqrdmlah.s {{s[0-9]+}}, {{s[0-9]+}}, v0[3]
  ret i32 %res
}
432
; Scalar i16 %acc and %x with an <8 x i16> element operand: lane 1 of %y_vec is
; splatted, %x and %acc are each inserted into lane 0 of an undef vector, and
; lane 0 of sqsub(acc, sqrdmulh(x, splat)) is returned.
define i16 @test_sqrdmlshq_extract_i16(i16 %acc, i16 %x, <8 x i16> %y_vec) {
; CHECK-LABEL: test_sqrdmlshq_extract_i16:
  %splat = shufflevector <8 x i16> %y_vec, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %x.v = insertelement <8 x i16> undef, i16 %x, i64 0
  %mulh = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x.v, <8 x i16> %splat)
  %acc.v = insertelement <8 x i16> undef, i16 %acc, i64 0
  %diff.v = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc.v, <8 x i16> %mulh)
  %res = extractelement <8 x i16> %diff.v, i32 0
; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, v0.h[1]
; CHECK-V81a:       sqrdmlsh    {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, v0.h[1]
; CHECK-V81a-apple: sqrdmlsh.8h {{v[0-9]+}},    {{v[0-9]+}}, v0[1]
  ret i16 %res
}
446
; Scalar i32 by element: lane 3 of %rhs is extracted and fed to the scalar
; sqrdmulh, whose result is subtracted from %acc by the scalar sqsub.
define i32 @test_sqrdmlsh_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) {
; CHECK-LABEL: test_sqrdmlsh_extract_i32:
  %lane3 = extractelement <4 x i32> %rhs, i32 3
  %mulh = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %lane3)
  %res = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %mulh)
; CHECK-V8a:        sqrdmulh   {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
; CHECK-V81a:       sqrdmlsh   {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
; CHECK-V81a-apple: sqrdmlsh.s {{s[0-9]+}}, {{s[0-9]+}}, v0[3]
  ret i32 %res
}
457