; Test the MSA intrinsics that are encoded with the 3R instruction format,
; use the result as a third operand, and produce wider elements than their
; operands.
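;
; The dpadd tests below use a constant splat for the accumulator operand, so
; it is materialized with an ldi instruction rather than loaded from memory.
; As a rough sketch of the lane-wise semantics (paraphrasing the MSA
; dot-product definition, not part of this test): for dpadd_s.h, result
; element i of the <8 x i16> output is
;   acc[i] + sext(a[2i]) * sext(b[2i]) + sext(a[2i+1]) * sext(b[2i+1])
; (dpadd_u zero-extends instead). For example, with the splat-4 accumulator
; and the ARG2/ARG3 values below, element 0 would be 4 + 8*24 + 9*25 = 421.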

; RUN: llc -march=mips -mattr=+msa,+fp64,+mips32r2 < %s | FileCheck %s
; RUN: llc -march=mipsel -mattr=+msa,+fp64,+mips32r2 < %s | FileCheck %s

@llvm_mips_dpadd_s_h_ARG2 = global <16 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23>, align 16
@llvm_mips_dpadd_s_h_ARG3 = global <16 x i8> <i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38, i8 39>, align 16
@llvm_mips_dpadd_s_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16

define void @llvm_mips_dpadd_s_h_test() nounwind {
entry:
  %0 = load <16 x i8>, <16 x i8>* @llvm_mips_dpadd_s_h_ARG2
  %1 = load <16 x i8>, <16 x i8>* @llvm_mips_dpadd_s_h_ARG3
  %2 = tail call <8 x i16> @llvm.mips.dpadd.s.h(<8 x i16> <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>, <16 x i8> %0, <16 x i8> %1)
  store <8 x i16> %2, <8 x i16>* @llvm_mips_dpadd_s_h_RES
  ret void
}

declare <8 x i16> @llvm.mips.dpadd.s.h(<8 x i16>, <16 x i8>, <16 x i8>) nounwind

; CHECK: llvm_mips_dpadd_s_h_test:
; CHECK: ld.b
; CHECK: ld.b
; CHECK: ldi.h [[R1:\$w[0-9]+]],
; CHECK: dpadd_s.h [[R1]],
; CHECK: st.h
; CHECK: .size llvm_mips_dpadd_s_h_test
;
@llvm_mips_dpadd_s_w_ARG2 = global <8 x i16> <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>, align 16
@llvm_mips_dpadd_s_w_ARG3 = global <8 x i16> <i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>, align 16
@llvm_mips_dpadd_s_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16

define void @llvm_mips_dpadd_s_w_test() nounwind {
entry:
  %0 = load <8 x i16>, <8 x i16>* @llvm_mips_dpadd_s_w_ARG2
  %1 = load <8 x i16>, <8 x i16>* @llvm_mips_dpadd_s_w_ARG3
  %2 = tail call <4 x i32> @llvm.mips.dpadd.s.w(<4 x i32> <i32 4, i32 4, i32 4, i32 4>, <8 x i16> %0, <8 x i16> %1)
  store <4 x i32> %2, <4 x i32>* @llvm_mips_dpadd_s_w_RES
  ret void
}

declare <4 x i32> @llvm.mips.dpadd.s.w(<4 x i32>, <8 x i16>, <8 x i16>) nounwind

; CHECK: llvm_mips_dpadd_s_w_test:
; CHECK: ld.h
; CHECK: ld.h
; CHECK: ldi.w [[R1:\$w[0-9]+]],
; CHECK: dpadd_s.w [[R1]],
; CHECK: st.w
; CHECK: .size llvm_mips_dpadd_s_w_test
;
@llvm_mips_dpadd_s_d_ARG2 = global <4 x i32> <i32 2, i32 3, i32 4, i32 5>, align 16
@llvm_mips_dpadd_s_d_ARG3 = global <4 x i32> <i32 6, i32 7, i32 8, i32 9>, align 16
@llvm_mips_dpadd_s_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16

define void @llvm_mips_dpadd_s_d_test() nounwind {
entry:
  %0 = load <4 x i32>, <4 x i32>* @llvm_mips_dpadd_s_d_ARG2
  %1 = load <4 x i32>, <4 x i32>* @llvm_mips_dpadd_s_d_ARG3
  %2 = tail call <2 x i64> @llvm.mips.dpadd.s.d(<2 x i64> <i64 4, i64 4>, <4 x i32> %0, <4 x i32> %1)
  store <2 x i64> %2, <2 x i64>* @llvm_mips_dpadd_s_d_RES
  ret void
}

declare <2 x i64> @llvm.mips.dpadd.s.d(<2 x i64>, <4 x i32>, <4 x i32>) nounwind

; CHECK: llvm_mips_dpadd_s_d_test:
; CHECK: ldi.d [[R1:\$w[0-9]+]],
; CHECK: ld.w
; CHECK: ld.w
; CHECK: dpadd_s.d [[R1]],
; CHECK: st.d
; CHECK: .size llvm_mips_dpadd_s_d_test
;
@llvm_mips_dpadd_u_h_ARG2 = global <16 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23>, align 16
@llvm_mips_dpadd_u_h_ARG3 = global <16 x i8> <i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38, i8 39>, align 16
@llvm_mips_dpadd_u_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16

define void @llvm_mips_dpadd_u_h_test() nounwind {
entry:
  %0 = load <16 x i8>, <16 x i8>* @llvm_mips_dpadd_u_h_ARG2
  %1 = load <16 x i8>, <16 x i8>* @llvm_mips_dpadd_u_h_ARG3
  %2 = tail call <8 x i16> @llvm.mips.dpadd.u.h(<8 x i16> <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>, <16 x i8> %0, <16 x i8> %1)
  store <8 x i16> %2, <8 x i16>* @llvm_mips_dpadd_u_h_RES
  ret void
}

declare <8 x i16> @llvm.mips.dpadd.u.h(<8 x i16>, <16 x i8>, <16 x i8>) nounwind

; CHECK: llvm_mips_dpadd_u_h_test:
; CHECK: ld.b
; CHECK: ld.b
; CHECK: ldi.h [[R1:\$w[0-9]+]],
; CHECK: dpadd_u.h [[R1]],
; CHECK: st.h
; CHECK: .size llvm_mips_dpadd_u_h_test
;
@llvm_mips_dpadd_u_w_ARG2 = global <8 x i16> <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>, align 16
@llvm_mips_dpadd_u_w_ARG3 = global <8 x i16> <i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>, align 16
@llvm_mips_dpadd_u_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16

define void @llvm_mips_dpadd_u_w_test() nounwind {
entry:
  %0 = load <8 x i16>, <8 x i16>* @llvm_mips_dpadd_u_w_ARG2
  %1 = load <8 x i16>, <8 x i16>* @llvm_mips_dpadd_u_w_ARG3
  %2 = tail call <4 x i32> @llvm.mips.dpadd.u.w(<4 x i32> <i32 4, i32 4, i32 4, i32 4>, <8 x i16> %0, <8 x i16> %1)
  store <4 x i32> %2, <4 x i32>* @llvm_mips_dpadd_u_w_RES
  ret void
}

declare <4 x i32> @llvm.mips.dpadd.u.w(<4 x i32>, <8 x i16>, <8 x i16>) nounwind

; CHECK: llvm_mips_dpadd_u_w_test:
; CHECK: ld.h
; CHECK: ld.h
; CHECK: ldi.w [[R1:\$w[0-9]+]],
; CHECK: dpadd_u.w [[R1]],
; CHECK: st.w
; CHECK: .size llvm_mips_dpadd_u_w_test
;
@llvm_mips_dpadd_u_d_ARG2 = global <4 x i32> <i32 2, i32 3, i32 4, i32 5>, align 16
@llvm_mips_dpadd_u_d_ARG3 = global <4 x i32> <i32 6, i32 7, i32 8, i32 9>, align 16
@llvm_mips_dpadd_u_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16

define void @llvm_mips_dpadd_u_d_test() nounwind {
entry:
  %0 = load <4 x i32>, <4 x i32>* @llvm_mips_dpadd_u_d_ARG2
  %1 = load <4 x i32>, <4 x i32>* @llvm_mips_dpadd_u_d_ARG3
  %2 = tail call <2 x i64> @llvm.mips.dpadd.u.d(<2 x i64> <i64 4, i64 4>, <4 x i32> %0, <4 x i32> %1)
  store <2 x i64> %2, <2 x i64>* @llvm_mips_dpadd_u_d_RES
  ret void
}

declare <2 x i64> @llvm.mips.dpadd.u.d(<2 x i64>, <4 x i32>, <4 x i32>) nounwind

; CHECK: llvm_mips_dpadd_u_d_test:
; CHECK: ldi.d [[R1:\$w[0-9]+]],
; CHECK: ld.w
; CHECK: ld.w
; CHECK: dpadd_u.d [[R1]],
; CHECK: st.d
; CHECK: .size llvm_mips_dpadd_u_d_test
;
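; The dpsub tests below follow the same pattern, except that the accumulator
; is loaded from an ARG1 global and the dot product is subtracted. As a rough
; sketch (again paraphrasing the MSA definition): for dpsub_s.h, result
; element i is
;   acc[i] - (sext(a[2i]) * sext(b[2i]) + sext(a[2i+1]) * sext(b[2i+1]))
; so with the ARG1/ARG2/ARG3 values below, element 0 would be
; 0 - (8*24 + 9*25) = -417.
;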
@llvm_mips_dpsub_s_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
@llvm_mips_dpsub_s_h_ARG2 = global <16 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23>, align 16
@llvm_mips_dpsub_s_h_ARG3 = global <16 x i8> <i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38, i8 39>, align 16
@llvm_mips_dpsub_s_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16

define void @llvm_mips_dpsub_s_h_test() nounwind {
entry:
  %0 = load <8 x i16>, <8 x i16>* @llvm_mips_dpsub_s_h_ARG1
  %1 = load <16 x i8>, <16 x i8>* @llvm_mips_dpsub_s_h_ARG2
  %2 = load <16 x i8>, <16 x i8>* @llvm_mips_dpsub_s_h_ARG3
  %3 = tail call <8 x i16> @llvm.mips.dpsub.s.h(<8 x i16> %0, <16 x i8> %1, <16 x i8> %2)
  store <8 x i16> %3, <8 x i16>* @llvm_mips_dpsub_s_h_RES
  ret void
}

declare <8 x i16> @llvm.mips.dpsub.s.h(<8 x i16>, <16 x i8>, <16 x i8>) nounwind

; CHECK: llvm_mips_dpsub_s_h_test:
; CHECK: ld.b
; CHECK: ld.b
; CHECK: ld.h
; CHECK: dpsub_s.h
; CHECK: st.h
; CHECK: .size llvm_mips_dpsub_s_h_test
;
@llvm_mips_dpsub_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
@llvm_mips_dpsub_s_w_ARG2 = global <8 x i16> <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>, align 16
@llvm_mips_dpsub_s_w_ARG3 = global <8 x i16> <i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>, align 16
@llvm_mips_dpsub_s_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16

define void @llvm_mips_dpsub_s_w_test() nounwind {
entry:
  %0 = load <4 x i32>, <4 x i32>* @llvm_mips_dpsub_s_w_ARG1
  %1 = load <8 x i16>, <8 x i16>* @llvm_mips_dpsub_s_w_ARG2
  %2 = load <8 x i16>, <8 x i16>* @llvm_mips_dpsub_s_w_ARG3
  %3 = tail call <4 x i32> @llvm.mips.dpsub.s.w(<4 x i32> %0, <8 x i16> %1, <8 x i16> %2)
  store <4 x i32> %3, <4 x i32>* @llvm_mips_dpsub_s_w_RES
  ret void
}

declare <4 x i32> @llvm.mips.dpsub.s.w(<4 x i32>, <8 x i16>, <8 x i16>) nounwind

; CHECK: llvm_mips_dpsub_s_w_test:
; CHECK: ld.h
; CHECK: ld.h
; CHECK: ld.w
; CHECK: dpsub_s.w
; CHECK: st.w
; CHECK: .size llvm_mips_dpsub_s_w_test
;
@llvm_mips_dpsub_s_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
@llvm_mips_dpsub_s_d_ARG2 = global <4 x i32> <i32 2, i32 3, i32 4, i32 5>, align 16
@llvm_mips_dpsub_s_d_ARG3 = global <4 x i32> <i32 6, i32 7, i32 8, i32 9>, align 16
@llvm_mips_dpsub_s_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16

define void @llvm_mips_dpsub_s_d_test() nounwind {
entry:
  %0 = load <2 x i64>, <2 x i64>* @llvm_mips_dpsub_s_d_ARG1
  %1 = load <4 x i32>, <4 x i32>* @llvm_mips_dpsub_s_d_ARG2
  %2 = load <4 x i32>, <4 x i32>* @llvm_mips_dpsub_s_d_ARG3
  %3 = tail call <2 x i64> @llvm.mips.dpsub.s.d(<2 x i64> %0, <4 x i32> %1, <4 x i32> %2)
  store <2 x i64> %3, <2 x i64>* @llvm_mips_dpsub_s_d_RES
  ret void
}

declare <2 x i64> @llvm.mips.dpsub.s.d(<2 x i64>, <4 x i32>, <4 x i32>) nounwind

; CHECK: llvm_mips_dpsub_s_d_test:
; CHECK: ld.w
; CHECK: ld.w
; CHECK: ld.d
; CHECK: dpsub_s.d
; CHECK: st.d
; CHECK: .size llvm_mips_dpsub_s_d_test
;
@llvm_mips_dpsub_u_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
@llvm_mips_dpsub_u_h_ARG2 = global <16 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23>, align 16
@llvm_mips_dpsub_u_h_ARG3 = global <16 x i8> <i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38, i8 39>, align 16
@llvm_mips_dpsub_u_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16

define void @llvm_mips_dpsub_u_h_test() nounwind {
entry:
  %0 = load <8 x i16>, <8 x i16>* @llvm_mips_dpsub_u_h_ARG1
  %1 = load <16 x i8>, <16 x i8>* @llvm_mips_dpsub_u_h_ARG2
  %2 = load <16 x i8>, <16 x i8>* @llvm_mips_dpsub_u_h_ARG3
  %3 = tail call <8 x i16> @llvm.mips.dpsub.u.h(<8 x i16> %0, <16 x i8> %1, <16 x i8> %2)
  store <8 x i16> %3, <8 x i16>* @llvm_mips_dpsub_u_h_RES
  ret void
}

declare <8 x i16> @llvm.mips.dpsub.u.h(<8 x i16>, <16 x i8>, <16 x i8>) nounwind

; CHECK: llvm_mips_dpsub_u_h_test:
; CHECK: ld.b
; CHECK: ld.b
; CHECK: ld.h
; CHECK: dpsub_u.h
; CHECK: st.h
; CHECK: .size llvm_mips_dpsub_u_h_test
;
@llvm_mips_dpsub_u_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
@llvm_mips_dpsub_u_w_ARG2 = global <8 x i16> <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>, align 16
@llvm_mips_dpsub_u_w_ARG3 = global <8 x i16> <i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>, align 16
@llvm_mips_dpsub_u_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16

define void @llvm_mips_dpsub_u_w_test() nounwind {
entry:
  %0 = load <4 x i32>, <4 x i32>* @llvm_mips_dpsub_u_w_ARG1
  %1 = load <8 x i16>, <8 x i16>* @llvm_mips_dpsub_u_w_ARG2
  %2 = load <8 x i16>, <8 x i16>* @llvm_mips_dpsub_u_w_ARG3
  %3 = tail call <4 x i32> @llvm.mips.dpsub.u.w(<4 x i32> %0, <8 x i16> %1, <8 x i16> %2)
  store <4 x i32> %3, <4 x i32>* @llvm_mips_dpsub_u_w_RES
  ret void
}

declare <4 x i32> @llvm.mips.dpsub.u.w(<4 x i32>, <8 x i16>, <8 x i16>) nounwind

; CHECK: llvm_mips_dpsub_u_w_test:
; CHECK: ld.h
; CHECK: ld.h
; CHECK: ld.w
; CHECK: dpsub_u.w
; CHECK: st.w
; CHECK: .size llvm_mips_dpsub_u_w_test
;
@llvm_mips_dpsub_u_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
@llvm_mips_dpsub_u_d_ARG2 = global <4 x i32> <i32 2, i32 3, i32 4, i32 5>, align 16
@llvm_mips_dpsub_u_d_ARG3 = global <4 x i32> <i32 6, i32 7, i32 8, i32 9>, align 16
@llvm_mips_dpsub_u_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16

define void @llvm_mips_dpsub_u_d_test() nounwind {
entry:
  %0 = load <2 x i64>, <2 x i64>* @llvm_mips_dpsub_u_d_ARG1
  %1 = load <4 x i32>, <4 x i32>* @llvm_mips_dpsub_u_d_ARG2
  %2 = load <4 x i32>, <4 x i32>* @llvm_mips_dpsub_u_d_ARG3
  %3 = tail call <2 x i64> @llvm.mips.dpsub.u.d(<2 x i64> %0, <4 x i32> %1, <4 x i32> %2)
  store <2 x i64> %3, <2 x i64>* @llvm_mips_dpsub_u_d_RES
  ret void
}

declare <2 x i64> @llvm.mips.dpsub.u.d(<2 x i64>, <4 x i32>, <4 x i32>) nounwind

; CHECK: llvm_mips_dpsub_u_d_test:
; CHECK: ld.w
; CHECK: ld.w
; CHECK: ld.d
; CHECK: dpsub_u.d
; CHECK: st.d
; CHECK: .size llvm_mips_dpsub_u_d_test
;