; RUN: llc -march=mips -mattr=+msa,+fp64,+mips32r2 < %s | FileCheck %s
; RUN: llc -march=mipsel -mattr=+msa,+fp64,+mips32r2 < %s | FileCheck %s

; Bitwise AND of two <16 x i8> vectors: the operands are loaded from %a and %b
; and the result is stored to %c.
; Expected code: two ld.b loads, and.v, then st.b.
define void @and_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
  ; CHECK-LABEL: and_v16i8:

  %1 = load <16 x i8>, <16 x i8>* %a
  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
  %2 = load <16 x i8>, <16 x i8>* %b
  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
  %3 = and <16 x i8> %1, %2
  ; CHECK-DAG: and.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <16 x i8> %3, <16 x i8>* %c
  ; CHECK-DAG: st.b [[R3]], 0($4)

  ret void
  ; CHECK: .size and_v16i8
}

; Bitwise AND of two <8 x i16> vectors: the operands are loaded from %a and %b
; and the result is stored to %c.
; Expected code: two ld.h loads, and.v, then st.h.
define void @and_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
  ; CHECK-LABEL: and_v8i16:

  %1 = load <8 x i16>, <8 x i16>* %a
  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
  %2 = load <8 x i16>, <8 x i16>* %b
  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
  %3 = and <8 x i16> %1, %2
  ; CHECK-DAG: and.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <8 x i16> %3, <8 x i16>* %c
  ; CHECK-DAG: st.h [[R3]], 0($4)

  ret void
  ; CHECK: .size and_v8i16
}

; Bitwise AND of two <4 x i32> vectors: the operands are loaded from %a and %b
; and the result is stored to %c.
; Expected code: two ld.w loads, and.v, then st.w.
define void @and_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
  ; CHECK-LABEL: and_v4i32:

  %1 = load <4 x i32>, <4 x i32>* %a
  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
  %2 = load <4 x i32>, <4 x i32>* %b
  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
  %3 = and <4 x i32> %1, %2
  ; CHECK-DAG: and.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <4 x i32> %3, <4 x i32>* %c
  ; CHECK-DAG: st.w [[R3]], 0($4)

  ret void
  ; CHECK: .size and_v4i32
}

; Bitwise AND of two <2 x i64> vectors: the operands are loaded from %a and %b
; and the result is stored to %c.
; Expected code: two ld.d loads, and.v, then st.d.
define void @and_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
  ; CHECK-LABEL: and_v2i64:

  %1 = load <2 x i64>, <2 x i64>* %a
  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
  %2 = load <2 x i64>, <2 x i64>* %b
  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
  %3 = and <2 x i64> %1, %2
  ; CHECK-DAG: and.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <2 x i64> %3, <2 x i64>* %c
  ; CHECK-DAG: st.d [[R3]], 0($4)

  ret void
  ; CHECK: .size and_v2i64
}

; Bitwise AND of the <16 x i8> vector loaded from %a with a splat of 1; the
; result is stored to %c.
; Expected code: ld.b, the immediate form andi.b with operand 1, then st.b.
define void @and_v16i8_i(<16 x i8>* %c, <16 x i8>* %a) nounwind {
  ; CHECK-LABEL: and_v16i8_i:

  %1 = load <16 x i8>, <16 x i8>* %a
  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
  %2 = and <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  ; CHECK-DAG: andi.b [[R4:\$w[0-9]+]], [[R1]], 1
  store <16 x i8> %2, <16 x i8>* %c
  ; CHECK-DAG: st.b [[R4]], 0($4)

  ret void
  ; CHECK: .size and_v16i8_i
}

; Bitwise AND of the <8 x i16> vector loaded from %a with a splat of 1; the
; result is stored to %c.
; Expected code: ld.h, the splat built with ldi.h, and.v, then st.h.
define void @and_v8i16_i(<8 x i16>* %c, <8 x i16>* %a) nounwind {
  ; CHECK-LABEL: and_v8i16_i:

  %1 = load <8 x i16>, <8 x i16>* %a
  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
  %2 = and <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  ; CHECK-DAG: ldi.h [[R3:\$w[0-9]+]], 1
  ; CHECK-DAG: and.v [[R4:\$w[0-9]+]], [[R1]], [[R3]]
  store <8 x i16> %2, <8 x i16>* %c
  ; CHECK-DAG: st.h [[R4]], 0($4)

  ret void
  ; CHECK: .size and_v8i16_i
}

; Bitwise AND of the <4 x i32> vector loaded from %a with a splat of 1; the
; result is stored to %c.
; Expected code: ld.w, the splat built with ldi.w, and.v, then st.w.
define void @and_v4i32_i(<4 x i32>* %c, <4 x i32>* %a) nounwind {
  ; CHECK-LABEL: and_v4i32_i:

  %1 = load <4 x i32>, <4 x i32>* %a
  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
  %2 = and <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
  ; CHECK-DAG: ldi.w [[R3:\$w[0-9]+]], 1
  ; CHECK-DAG: and.v [[R4:\$w[0-9]+]], [[R1]], [[R3]]
  store <4 x i32> %2, <4 x i32>* %c
  ; CHECK-DAG: st.w [[R4]], 0($4)

  ret void
  ; CHECK: .size and_v4i32_i
}

; Bitwise AND of the <2 x i64> vector loaded from %a with a splat of 1; the
; result is stored to %c.
; Expected code: ld.d, the splat built with ldi.d, and.v, then st.d.
define void @and_v2i64_i(<2 x i64>* %c, <2 x i64>* %a) nounwind {
  ; CHECK-LABEL: and_v2i64_i:

  %1 = load <2 x i64>, <2 x i64>* %a
  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
  %2 = and <2 x i64> %1, <i64 1, i64 1>
  ; CHECK-DAG: ldi.d [[R3:\$w[0-9]+]], 1
  ; CHECK-DAG: and.v [[R4:\$w[0-9]+]], [[R1]], [[R3]]
  store <2 x i64> %2, <2 x i64>* %c
  ; CHECK-DAG: st.d [[R4]], 0($4)

  ret void
  ; CHECK: .size and_v2i64_i
}

; Bitwise OR of two <16 x i8> vectors: the operands are loaded from %a and %b
; and the result is stored to %c.
; Expected code: two ld.b loads, or.v, then st.b.
define void @or_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
  ; CHECK-LABEL: or_v16i8:

  %1 = load <16 x i8>, <16 x i8>* %a
  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
  %2 = load <16 x i8>, <16 x i8>* %b
  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
  %3 = or <16 x i8> %1, %2
  ; CHECK-DAG: or.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <16 x i8> %3, <16 x i8>* %c
  ; CHECK-DAG: st.b [[R3]], 0($4)

  ret void
  ; CHECK: .size or_v16i8
}

; Bitwise OR of two <8 x i16> vectors: the operands are loaded from %a and %b
; and the result is stored to %c.
; Expected code: two ld.h loads, or.v, then st.h.
define void @or_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
  ; CHECK-LABEL: or_v8i16:

  %1 = load <8 x i16>, <8 x i16>* %a
  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
  %2 = load <8 x i16>, <8 x i16>* %b
  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
  %3 = or <8 x i16> %1, %2
  ; CHECK-DAG: or.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <8 x i16> %3, <8 x i16>* %c
  ; CHECK-DAG: st.h [[R3]], 0($4)

  ret void
  ; CHECK: .size or_v8i16
}

; Bitwise OR of two <4 x i32> vectors: the operands are loaded from %a and %b
; and the result is stored to %c.
; Expected code: two ld.w loads, or.v, then st.w.
define void @or_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
  ; CHECK-LABEL: or_v4i32:

  %1 = load <4 x i32>, <4 x i32>* %a
  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
  %2 = load <4 x i32>, <4 x i32>* %b
  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
  %3 = or <4 x i32> %1, %2
  ; CHECK-DAG: or.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <4 x i32> %3, <4 x i32>* %c
  ; CHECK-DAG: st.w [[R3]], 0($4)

  ret void
  ; CHECK: .size or_v4i32
}

; Bitwise OR of two <2 x i64> vectors: the operands are loaded from %a and %b
; and the result is stored to %c.
; Expected code: two ld.d loads, or.v, then st.d.
define void @or_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
  ; CHECK-LABEL: or_v2i64:

  %1 = load <2 x i64>, <2 x i64>* %a
  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
  %2 = load <2 x i64>, <2 x i64>* %b
  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
  %3 = or <2 x i64> %1, %2
  ; CHECK-DAG: or.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <2 x i64> %3, <2 x i64>* %c
  ; CHECK-DAG: st.d [[R3]], 0($4)

  ret void
  ; CHECK: .size or_v2i64
}

; Bitwise OR of the <16 x i8> vector loaded from %a with a splat of 3; the
; result is stored to %c.
; Expected code: ld.b, the immediate form ori.b with operand 3, then st.b.
define void @or_v16i8_i(<16 x i8>* %c, <16 x i8>* %a) nounwind {
  ; CHECK-LABEL: or_v16i8_i:

  %1 = load <16 x i8>, <16 x i8>* %a
  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
  %2 = or <16 x i8> %1, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  ; CHECK-DAG: ori.b [[R4:\$w[0-9]+]], [[R1]], 3
  store <16 x i8> %2, <16 x i8>* %c
  ; CHECK-DAG: st.b [[R4]], 0($4)

  ret void
  ; CHECK: .size or_v16i8_i
}

; Bitwise OR of the <8 x i16> vector loaded from %a with a splat of 3; the
; result is stored to %c.
; Expected code: ld.h, the splat built with ldi.h, or.v, then st.h.
define void @or_v8i16_i(<8 x i16>* %c, <8 x i16>* %a) nounwind {
  ; CHECK-LABEL: or_v8i16_i:

  %1 = load <8 x i16>, <8 x i16>* %a
  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
  %2 = or <8 x i16> %1, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  ; CHECK-DAG: ldi.h [[R3:\$w[0-9]+]], 3
  ; CHECK-DAG: or.v [[R4:\$w[0-9]+]], [[R1]], [[R3]]
  store <8 x i16> %2, <8 x i16>* %c
  ; CHECK-DAG: st.h [[R4]], 0($4)

  ret void
  ; CHECK: .size or_v8i16_i
}

; Bitwise OR of the <4 x i32> vector loaded from %a with a splat of 3; the
; result is stored to %c.
; Expected code: ld.w, the splat built with ldi.w, or.v, then st.w.
define void @or_v4i32_i(<4 x i32>* %c, <4 x i32>* %a) nounwind {
  ; CHECK-LABEL: or_v4i32_i:

  %1 = load <4 x i32>, <4 x i32>* %a
  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
  %2 = or <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
  ; CHECK-DAG: ldi.w [[R3:\$w[0-9]+]], 3
  ; CHECK-DAG: or.v [[R4:\$w[0-9]+]], [[R1]], [[R3]]
  store <4 x i32> %2, <4 x i32>* %c
  ; CHECK-DAG: st.w [[R4]], 0($4)

  ret void
  ; CHECK: .size or_v4i32_i
}

; Bitwise OR of the <2 x i64> vector loaded from %a with a splat of 3; the
; result is stored to %c.
; Expected code: ld.d, the splat built with ldi.d, or.v, then st.d.
define void @or_v2i64_i(<2 x i64>* %c, <2 x i64>* %a) nounwind {
  ; CHECK-LABEL: or_v2i64_i:

  %1 = load <2 x i64>, <2 x i64>* %a
  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
  %2 = or <2 x i64> %1, <i64 3, i64 3>
  ; CHECK-DAG: ldi.d [[R3:\$w[0-9]+]], 3
  ; CHECK-DAG: or.v [[R4:\$w[0-9]+]], [[R1]], [[R3]]
  store <2 x i64> %2, <2 x i64>* %c
  ; CHECK-DAG: st.d [[R4]], 0($4)

  ret void
  ; CHECK: .size or_v2i64_i
}

; Bitwise NOR of two <16 x i8> vectors, written in IR as an or followed by an
; xor with all-ones. The operands are loaded from %a and %b and the result is
; stored to %c.
; Expected code: two ld.b loads, nor.v, then st.b.
define void @nor_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
  ; CHECK-LABEL: nor_v16i8:

  %1 = load <16 x i8>, <16 x i8>* %a
  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
  %2 = load <16 x i8>, <16 x i8>* %b
  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
  %3 = or <16 x i8> %1, %2
  %4 = xor <16 x i8> %3, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  ; CHECK-DAG: nor.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <16 x i8> %4, <16 x i8>* %c
  ; CHECK-DAG: st.b [[R3]], 0($4)

  ret void
  ; CHECK: .size nor_v16i8
}

; Bitwise NOR of two <8 x i16> vectors, written in IR as an or followed by an
; xor with all-ones. The operands are loaded from %a and %b and the result is
; stored to %c.
; Expected code: two ld.h loads, nor.v, then st.h.
define void @nor_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
  ; CHECK-LABEL: nor_v8i16:

  %1 = load <8 x i16>, <8 x i16>* %a
  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
  %2 = load <8 x i16>, <8 x i16>* %b
  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
  %3 = or <8 x i16> %1, %2
  %4 = xor <8 x i16> %3, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
  ; CHECK-DAG: nor.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <8 x i16> %4, <8 x i16>* %c
  ; CHECK-DAG: st.h [[R3]], 0($4)

  ret void
  ; CHECK: .size nor_v8i16
}

; Bitwise NOR of two <4 x i32> vectors, written in IR as an or followed by an
; xor with all-ones. The operands are loaded from %a and %b and the result is
; stored to %c.
; Expected code: two ld.w loads, nor.v, then st.w.
define void @nor_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
  ; CHECK-LABEL: nor_v4i32:

  %1 = load <4 x i32>, <4 x i32>* %a
  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
  %2 = load <4 x i32>, <4 x i32>* %b
  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
  %3 = or <4 x i32> %1, %2
  %4 = xor <4 x i32> %3, <i32 -1, i32 -1, i32 -1, i32 -1>
  ; CHECK-DAG: nor.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <4 x i32> %4, <4 x i32>* %c
  ; CHECK-DAG: st.w [[R3]], 0($4)

  ret void
  ; CHECK: .size nor_v4i32
}

; Bitwise NOR of two <2 x i64> vectors, written in IR as an or followed by an
; xor with all-ones. The operands are loaded from %a and %b and the result is
; stored to %c.
; Expected code: two ld.d loads, nor.v, then st.d.
define void @nor_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
  ; CHECK-LABEL: nor_v2i64:

  %1 = load <2 x i64>, <2 x i64>* %a
  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
  %2 = load <2 x i64>, <2 x i64>* %b
  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
  %3 = or <2 x i64> %1, %2
  %4 = xor <2 x i64> %3, <i64 -1, i64 -1>
  ; CHECK-DAG: nor.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <2 x i64> %4, <2 x i64>* %c
  ; CHECK-DAG: st.d [[R3]], 0($4)

  ret void
  ; CHECK: .size nor_v2i64
}

; Bitwise NOR of the <16 x i8> vector loaded from %a with a splat of 1, written
; in IR as an or followed by an xor with all-ones; the result is stored to %c.
; Expected code: ld.b, the immediate form nori.b with operand 1, then st.b.
define void @nor_v16i8_i(<16 x i8>* %c, <16 x i8>* %a) nounwind {
  ; CHECK-LABEL: nor_v16i8_i:

  %1 = load <16 x i8>, <16 x i8>* %a
  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
  %2 = or <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %3 = xor <16 x i8> %2, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  ; The mnemonic is spelled out in full: FileCheck matches substrings, so a
  ; bare ori.b pattern would also match inside nori.b and could not tell the
  ; two instructions apart.
  ; CHECK-DAG: nori.b [[R4:\$w[0-9]+]], [[R1]], 1
  store <16 x i8> %3, <16 x i8>* %c
  ; CHECK-DAG: st.b [[R4]], 0($4)

  ret void
  ; CHECK: .size nor_v16i8_i
}

; Bitwise NOR of the <8 x i16> vector loaded from %a with a splat of 1, written
; in IR as an or followed by an xor with all-ones; the result is stored to %c.
; Expected code: ld.h, the splat built with ldi.h, nor.v, then st.h.
define void @nor_v8i16_i(<8 x i16>* %c, <8 x i16>* %a) nounwind {
  ; CHECK-LABEL: nor_v8i16_i:

  %1 = load <8 x i16>, <8 x i16>* %a
  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
  %2 = or <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %3 = xor <8 x i16> %2, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
  ; CHECK-DAG: ldi.h [[R3:\$w[0-9]+]], 1
  ; CHECK-DAG: nor.v [[R4:\$w[0-9]+]], [[R1]], [[R3]]
  store <8 x i16> %3, <8 x i16>* %c
  ; CHECK-DAG: st.h [[R4]], 0($4)

  ret void
  ; CHECK: .size nor_v8i16_i
}

; Bitwise NOR of the <4 x i32> vector loaded from %a with a splat of 1, written
; in IR as an or followed by an xor with all-ones; the result is stored to %c.
; Expected code: ld.w, the splat built with ldi.w, nor.v, then st.w.
define void @nor_v4i32_i(<4 x i32>* %c, <4 x i32>* %a) nounwind {
  ; CHECK-LABEL: nor_v4i32_i:

  %1 = load <4 x i32>, <4 x i32>* %a
  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
  %2 = or <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
  %3 = xor <4 x i32> %2, <i32 -1, i32 -1, i32 -1, i32 -1>
  ; CHECK-DAG: ldi.w [[R3:\$w[0-9]+]], 1
  ; CHECK-DAG: nor.v [[R4:\$w[0-9]+]], [[R1]], [[R3]]
  store <4 x i32> %3, <4 x i32>* %c
  ; CHECK-DAG: st.w [[R4]], 0($4)

  ret void
  ; CHECK: .size nor_v4i32_i
}

; Bitwise NOR of the <2 x i64> vector loaded from %a with a splat of 1, written
; in IR as an or followed by an xor with all-ones; the result is stored to %c.
; Expected code: ld.d, the splat built with ldi.d, nor.v, then st.d.
define void @nor_v2i64_i(<2 x i64>* %c, <2 x i64>* %a) nounwind {
  ; CHECK-LABEL: nor_v2i64_i:

  %1 = load <2 x i64>, <2 x i64>* %a
  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
  %2 = or <2 x i64> %1, <i64 1, i64 1>
  %3 = xor <2 x i64> %2, <i64 -1, i64 -1>
  ; CHECK-DAG: ldi.d [[R3:\$w[0-9]+]], 1
  ; CHECK-DAG: nor.v [[R4:\$w[0-9]+]], [[R1]], [[R3]]
  store <2 x i64> %3, <2 x i64>* %c
  ; CHECK-DAG: st.d [[R4]], 0($4)

  ret void
  ; CHECK: .size nor_v2i64_i
}

; Bitwise XOR of two <16 x i8> vectors: the operands are loaded from %a and %b
; and the result is stored to %c.
; Expected code: two ld.b loads, xor.v, then st.b.
define void @xor_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
  ; CHECK-LABEL: xor_v16i8:

  %1 = load <16 x i8>, <16 x i8>* %a
  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
  %2 = load <16 x i8>, <16 x i8>* %b
  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
  %3 = xor <16 x i8> %1, %2
  ; CHECK-DAG: xor.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <16 x i8> %3, <16 x i8>* %c
  ; CHECK-DAG: st.b [[R3]], 0($4)

  ret void
  ; CHECK: .size xor_v16i8
}

; Bitwise XOR of two <8 x i16> vectors: the operands are loaded from %a and %b
; and the result is stored to %c.
; Expected code: two ld.h loads, xor.v, then st.h.
define void @xor_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
  ; CHECK-LABEL: xor_v8i16:

  %1 = load <8 x i16>, <8 x i16>* %a
  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
  %2 = load <8 x i16>, <8 x i16>* %b
  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
  %3 = xor <8 x i16> %1, %2
  ; CHECK-DAG: xor.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <8 x i16> %3, <8 x i16>* %c
  ; CHECK-DAG: st.h [[R3]], 0($4)

  ret void
  ; CHECK: .size xor_v8i16
}

; Bitwise XOR of two <4 x i32> vectors: the operands are loaded from %a and %b
; and the result is stored to %c.
; Expected code: two ld.w loads, xor.v, then st.w.
define void @xor_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
  ; CHECK-LABEL: xor_v4i32:

  %1 = load <4 x i32>, <4 x i32>* %a
  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
  %2 = load <4 x i32>, <4 x i32>* %b
  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
  %3 = xor <4 x i32> %1, %2
  ; CHECK-DAG: xor.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <4 x i32> %3, <4 x i32>* %c
  ; CHECK-DAG: st.w [[R3]], 0($4)

  ret void
  ; CHECK: .size xor_v4i32
}

; Bitwise XOR of two <2 x i64> vectors: the operands are loaded from %a and %b
; and the result is stored to %c.
; Expected code: two ld.d loads, xor.v, then st.d.
define void @xor_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
  ; CHECK-LABEL: xor_v2i64:

  %1 = load <2 x i64>, <2 x i64>* %a
  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
  %2 = load <2 x i64>, <2 x i64>* %b
  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
  %3 = xor <2 x i64> %1, %2
  ; CHECK-DAG: xor.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <2 x i64> %3, <2 x i64>* %c
  ; CHECK-DAG: st.d [[R3]], 0($4)

  ret void
  ; CHECK: .size xor_v2i64
}

; Bitwise XOR of the <16 x i8> vector loaded from %a with a splat of 3; the
; result is stored to %c.
; Expected code: ld.b, the immediate form xori.b with operand 3, then st.b.
define void @xor_v16i8_i(<16 x i8>* %c, <16 x i8>* %a) nounwind {
  ; CHECK-LABEL: xor_v16i8_i:

  %1 = load <16 x i8>, <16 x i8>* %a
  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
  %2 = xor <16 x i8> %1, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  ; CHECK-DAG: xori.b [[R4:\$w[0-9]+]], [[R1]], 3
  store <16 x i8> %2, <16 x i8>* %c
  ; CHECK-DAG: st.b [[R4]], 0($4)

  ret void
  ; CHECK: .size xor_v16i8_i
}

; Bitwise XOR of the <8 x i16> vector loaded from %a with a splat of 3; the
; result is stored to %c.
; Expected code: ld.h, the splat built with ldi.h, xor.v, then st.h.
define void @xor_v8i16_i(<8 x i16>* %c, <8 x i16>* %a) nounwind {
  ; CHECK-LABEL: xor_v8i16_i:

  %1 = load <8 x i16>, <8 x i16>* %a
  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
  %2 = xor <8 x i16> %1, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  ; CHECK-DAG: ldi.h [[R3:\$w[0-9]+]], 3
  ; CHECK-DAG: xor.v [[R4:\$w[0-9]+]], [[R1]], [[R3]]
  store <8 x i16> %2, <8 x i16>* %c
  ; CHECK-DAG: st.h [[R4]], 0($4)

  ret void
  ; CHECK: .size xor_v8i16_i
}

; Bitwise XOR of the <4 x i32> vector loaded from %a with a splat of 3; the
; result is stored to %c.
; Expected code: ld.w, the splat built with ldi.w, xor.v, then st.w.
define void @xor_v4i32_i(<4 x i32>* %c, <4 x i32>* %a) nounwind {
  ; CHECK-LABEL: xor_v4i32_i:

  %1 = load <4 x i32>, <4 x i32>* %a
  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
  %2 = xor <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
  ; CHECK-DAG: ldi.w [[R3:\$w[0-9]+]], 3
  ; CHECK-DAG: xor.v [[R4:\$w[0-9]+]], [[R1]], [[R3]]
  store <4 x i32> %2, <4 x i32>* %c
  ; CHECK-DAG: st.w [[R4]], 0($4)

  ret void
  ; CHECK: .size xor_v4i32_i
}

; Bitwise XOR of the <2 x i64> vector loaded from %a with a splat of 3; the
; result is stored to %c.
; Expected code: ld.d, the splat built with ldi.d, xor.v, then st.d.
define void @xor_v2i64_i(<2 x i64>* %c, <2 x i64>* %a) nounwind {
  ; CHECK-LABEL: xor_v2i64_i:

  %1 = load <2 x i64>, <2 x i64>* %a
  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
  %2 = xor <2 x i64> %1, <i64 3, i64 3>
  ; CHECK-DAG: ldi.d [[R3:\$w[0-9]+]], 3
  ; CHECK-DAG: xor.v [[R4:\$w[0-9]+]], [[R1]], [[R3]]
  store <2 x i64> %2, <2 x i64>* %c
  ; CHECK-DAG: st.d [[R4]], 0($4)

  ret void
  ; CHECK: .size xor_v2i64_i
}

; Element-wise left shift (shl) of the <16 x i8> vector loaded from %a by the
; per-element amounts loaded from %b; the result is stored to %c.
; Expected code: two ld.b loads, sll.b, then st.b.
define void @sll_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
  ; CHECK-LABEL: sll_v16i8:

  %1 = load <16 x i8>, <16 x i8>* %a
  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
  %2 = load <16 x i8>, <16 x i8>* %b
  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
  %3 = shl <16 x i8> %1, %2
  ; CHECK-DAG: sll.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <16 x i8> %3, <16 x i8>* %c
  ; CHECK-DAG: st.b [[R3]], 0($4)

  ret void
  ; CHECK: .size sll_v16i8
}

; Element-wise left shift (shl) of the <8 x i16> vector loaded from %a by the
; per-element amounts loaded from %b; the result is stored to %c.
; Expected code: two ld.h loads, sll.h, then st.h.
define void @sll_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
  ; CHECK-LABEL: sll_v8i16:

  %1 = load <8 x i16>, <8 x i16>* %a
  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
  %2 = load <8 x i16>, <8 x i16>* %b
  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
  %3 = shl <8 x i16> %1, %2
  ; CHECK-DAG: sll.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <8 x i16> %3, <8 x i16>* %c
  ; CHECK-DAG: st.h [[R3]], 0($4)

  ret void
  ; CHECK: .size sll_v8i16
}

; Element-wise left shift (shl) of the <4 x i32> vector loaded from %a by the
; per-element amounts loaded from %b; the result is stored to %c.
; Expected code: two ld.w loads, sll.w, then st.w.
define void @sll_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
  ; CHECK-LABEL: sll_v4i32:

  %1 = load <4 x i32>, <4 x i32>* %a
  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
  %2 = load <4 x i32>, <4 x i32>* %b
  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
  %3 = shl <4 x i32> %1, %2
  ; CHECK-DAG: sll.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <4 x i32> %3, <4 x i32>* %c
  ; CHECK-DAG: st.w [[R3]], 0($4)

  ret void
  ; CHECK: .size sll_v4i32
}

; Element-wise left shift (shl) of the <2 x i64> vector loaded from %a by the
; per-element amounts loaded from %b; the result is stored to %c.
; Expected code: two ld.d loads, sll.d, then st.d.
define void @sll_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
  ; CHECK-LABEL: sll_v2i64:

  %1 = load <2 x i64>, <2 x i64>* %a
  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
  %2 = load <2 x i64>, <2 x i64>* %b
  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
  %3 = shl <2 x i64> %1, %2
  ; CHECK-DAG: sll.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <2 x i64> %3, <2 x i64>* %c
  ; CHECK-DAG: st.d [[R3]], 0($4)

  ret void
  ; CHECK: .size sll_v2i64
}

; Left shift (shl) of every element of the <16 x i8> vector loaded from %a by
; the constant 1; the result is stored to %c.
; Expected code: ld.b, the immediate form slli.b with operand 1, then st.b.
define void @sll_v16i8_i(<16 x i8>* %c, <16 x i8>* %a) nounwind {
  ; CHECK-LABEL: sll_v16i8_i:

  %1 = load <16 x i8>, <16 x i8>* %a
  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
  %2 = shl <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  ; CHECK-DAG: slli.b [[R4:\$w[0-9]+]], [[R1]], 1
  store <16 x i8> %2, <16 x i8>* %c
  ; CHECK-DAG: st.b [[R4]], 0($4)

  ret void
  ; CHECK: .size sll_v16i8_i
}

; Left shift (shl) of every element of the <8 x i16> vector loaded from %a by
; the constant 1; the result is stored to %c.
; Expected code: ld.h, the immediate form slli.h with operand 1, then st.h.
define void @sll_v8i16_i(<8 x i16>* %c, <8 x i16>* %a) nounwind {
  ; CHECK-LABEL: sll_v8i16_i:

  %1 = load <8 x i16>, <8 x i16>* %a
  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
  %2 = shl <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  ; CHECK-DAG: slli.h [[R4:\$w[0-9]+]], [[R1]], 1
  store <8 x i16> %2, <8 x i16>* %c
  ; CHECK-DAG: st.h [[R4]], 0($4)

  ret void
  ; CHECK: .size sll_v8i16_i
}

; Left shift (shl) of every element of the <4 x i32> vector loaded from %a by
; the constant 1; the result is stored to %c.
; Expected code: ld.w, the immediate form slli.w with operand 1, then st.w.
define void @sll_v4i32_i(<4 x i32>* %c, <4 x i32>* %a) nounwind {
  ; CHECK-LABEL: sll_v4i32_i:

  %1 = load <4 x i32>, <4 x i32>* %a
  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
  %2 = shl <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
  ; CHECK-DAG: slli.w [[R4:\$w[0-9]+]], [[R1]], 1
  store <4 x i32> %2, <4 x i32>* %c
  ; CHECK-DAG: st.w [[R4]], 0($4)

  ret void
  ; CHECK: .size sll_v4i32_i
}

; Left shift (shl) of every element of the <2 x i64> vector loaded from %a by
; the constant 1; the result is stored to %c.
; Expected code: ld.d, the immediate form slli.d with operand 1, then st.d.
define void @sll_v2i64_i(<2 x i64>* %c, <2 x i64>* %a) nounwind {
  ; CHECK-LABEL: sll_v2i64_i:

  %1 = load <2 x i64>, <2 x i64>* %a
  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
  %2 = shl <2 x i64> %1, <i64 1, i64 1>
  ; CHECK-DAG: slli.d [[R4:\$w[0-9]+]], [[R1]], 1
  store <2 x i64> %2, <2 x i64>* %c
  ; CHECK-DAG: st.d [[R4]], 0($4)

  ret void
  ; CHECK: .size sll_v2i64_i
}

; Element-wise arithmetic right shift (ashr) of the <16 x i8> vector loaded
; from %a by the per-element amounts loaded from %b; the result is stored to %c.
; Expected code: two ld.b loads, sra.b, then st.b.
define void @sra_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
  ; CHECK-LABEL: sra_v16i8:

  %1 = load <16 x i8>, <16 x i8>* %a
  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
  %2 = load <16 x i8>, <16 x i8>* %b
  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
  %3 = ashr <16 x i8> %1, %2
  ; CHECK-DAG: sra.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <16 x i8> %3, <16 x i8>* %c
  ; CHECK-DAG: st.b [[R3]], 0($4)

  ret void
  ; CHECK: .size sra_v16i8
}

; Element-wise arithmetic right shift (ashr) of the <8 x i16> vector loaded
; from %a by the per-element amounts loaded from %b; the result is stored to %c.
; Expected code: two ld.h loads, sra.h, then st.h.
define void @sra_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
  ; CHECK-LABEL: sra_v8i16:

  %1 = load <8 x i16>, <8 x i16>* %a
  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
  %2 = load <8 x i16>, <8 x i16>* %b
  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
  %3 = ashr <8 x i16> %1, %2
  ; CHECK-DAG: sra.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <8 x i16> %3, <8 x i16>* %c
  ; CHECK-DAG: st.h [[R3]], 0($4)

  ret void
  ; CHECK: .size sra_v8i16
}

; Element-wise arithmetic right shift (ashr) of the <4 x i32> vector loaded
; from %a by the per-element amounts loaded from %b; the result is stored to %c.
; Expected code: two ld.w loads, sra.w, then st.w.
define void @sra_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
  ; CHECK-LABEL: sra_v4i32:

  %1 = load <4 x i32>, <4 x i32>* %a
  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
  %2 = load <4 x i32>, <4 x i32>* %b
  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
  %3 = ashr <4 x i32> %1, %2
  ; CHECK-DAG: sra.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <4 x i32> %3, <4 x i32>* %c
  ; CHECK-DAG: st.w [[R3]], 0($4)

  ret void
  ; CHECK: .size sra_v4i32
}

; Element-wise arithmetic right shift (ashr) of the <2 x i64> vector loaded
; from %a by the per-element amounts loaded from %b; the result is stored to %c.
; Expected code: two ld.d loads, sra.d, then st.d.
define void @sra_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
  ; CHECK-LABEL: sra_v2i64:

  %1 = load <2 x i64>, <2 x i64>* %a
  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
  %2 = load <2 x i64>, <2 x i64>* %b
  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
  %3 = ashr <2 x i64> %1, %2
  ; CHECK-DAG: sra.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <2 x i64> %3, <2 x i64>* %c
  ; CHECK-DAG: st.d [[R3]], 0($4)

  ret void
  ; CHECK: .size sra_v2i64
}

; Arithmetic right shift (ashr) of every element of the <16 x i8> vector loaded
; from %a by the constant 1; the result is stored to %c.
; Expected code: ld.b, the immediate form srai.b with operand 1, then st.b.
define void @sra_v16i8_i(<16 x i8>* %c, <16 x i8>* %a) nounwind {
  ; CHECK-LABEL: sra_v16i8_i:

  %1 = load <16 x i8>, <16 x i8>* %a
  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
  %2 = ashr <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  ; CHECK-DAG: srai.b [[R4:\$w[0-9]+]], [[R1]], 1
  store <16 x i8> %2, <16 x i8>* %c
  ; CHECK-DAG: st.b [[R4]], 0($4)

  ret void
  ; CHECK: .size sra_v16i8_i
}

; Arithmetic right shift (ashr) of every element of the <8 x i16> vector loaded
; from %a by the constant 1; the result is stored to %c.
; Expected code: ld.h, the immediate form srai.h with operand 1, then st.h.
define void @sra_v8i16_i(<8 x i16>* %c, <8 x i16>* %a) nounwind {
  ; CHECK-LABEL: sra_v8i16_i:

  %1 = load <8 x i16>, <8 x i16>* %a
  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
  %2 = ashr <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  ; CHECK-DAG: srai.h [[R4:\$w[0-9]+]], [[R1]], 1
  store <8 x i16> %2, <8 x i16>* %c
  ; CHECK-DAG: st.h [[R4]], 0($4)

  ret void
  ; CHECK: .size sra_v8i16_i
}

; Arithmetic right shift (ashr) of every element of the <4 x i32> vector loaded
; from %a by the constant 1; the result is stored to %c.
; Expected code: ld.w, the immediate form srai.w with operand 1, then st.w.
define void @sra_v4i32_i(<4 x i32>* %c, <4 x i32>* %a) nounwind {
  ; CHECK-LABEL: sra_v4i32_i:

  %1 = load <4 x i32>, <4 x i32>* %a
  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
  %2 = ashr <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
  ; CHECK-DAG: srai.w [[R4:\$w[0-9]+]], [[R1]], 1
  store <4 x i32> %2, <4 x i32>* %c
  ; CHECK-DAG: st.w [[R4]], 0($4)

  ret void
  ; CHECK: .size sra_v4i32_i
}

; Arithmetic right shift (ashr) of every element of the <2 x i64> vector loaded
; from %a by the constant 1; the result is stored to %c.
; Expected code: ld.d, the immediate form srai.d with operand 1, then st.d.
define void @sra_v2i64_i(<2 x i64>* %c, <2 x i64>* %a) nounwind {
  ; CHECK-LABEL: sra_v2i64_i:

  %1 = load <2 x i64>, <2 x i64>* %a
  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
  %2 = ashr <2 x i64> %1, <i64 1, i64 1>
  ; CHECK-DAG: srai.d [[R4:\$w[0-9]+]], [[R1]], 1
  store <2 x i64> %2, <2 x i64>* %c
  ; CHECK-DAG: st.d [[R4]], 0($4)

  ret void
  ; CHECK: .size sra_v2i64_i
}

; Element-wise logical right shift (lshr) of the <16 x i8> vector loaded from
; %a by the per-element amounts loaded from %b; the result is stored to %c.
; Expected code: two ld.b loads, srl.b, then st.b.
define void @srl_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
  ; CHECK-LABEL: srl_v16i8:

  %1 = load <16 x i8>, <16 x i8>* %a
  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
  %2 = load <16 x i8>, <16 x i8>* %b
  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
  %3 = lshr <16 x i8> %1, %2
  ; CHECK-DAG: srl.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <16 x i8> %3, <16 x i8>* %c
  ; CHECK-DAG: st.b [[R3]], 0($4)

  ret void
  ; CHECK: .size srl_v16i8
}

; Element-wise logical right shift (lshr) of the <8 x i16> vector loaded from
; %a by the per-element amounts loaded from %b; the result is stored to %c.
; Expected code: two ld.h loads, srl.h, then st.h.
define void @srl_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
  ; CHECK-LABEL: srl_v8i16:

  %1 = load <8 x i16>, <8 x i16>* %a
  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
  %2 = load <8 x i16>, <8 x i16>* %b
  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
  %3 = lshr <8 x i16> %1, %2
  ; CHECK-DAG: srl.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <8 x i16> %3, <8 x i16>* %c
  ; CHECK-DAG: st.h [[R3]], 0($4)

  ret void
  ; CHECK: .size srl_v8i16
}

; Element-wise logical right shift (lshr) of the <4 x i32> vector loaded from
; %a by the per-element amounts loaded from %b; the result is stored to %c.
; Expected code: two ld.w loads, srl.w, then st.w.
define void @srl_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
  ; CHECK-LABEL: srl_v4i32:

  %1 = load <4 x i32>, <4 x i32>* %a
  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
  %2 = load <4 x i32>, <4 x i32>* %b
  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
  %3 = lshr <4 x i32> %1, %2
  ; CHECK-DAG: srl.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <4 x i32> %3, <4 x i32>* %c
  ; CHECK-DAG: st.w [[R3]], 0($4)

  ret void
  ; CHECK: .size srl_v4i32
}

; Element-wise logical right shift (lshr) of the <2 x i64> vector loaded from
; %a by the per-element amounts loaded from %b; the result is stored to %c.
; Expected code: two ld.d loads, srl.d, then st.d.
define void @srl_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
  ; CHECK-LABEL: srl_v2i64:

  %1 = load <2 x i64>, <2 x i64>* %a
  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
  %2 = load <2 x i64>, <2 x i64>* %b
  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
  %3 = lshr <2 x i64> %1, %2
  ; CHECK-DAG: srl.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <2 x i64> %3, <2 x i64>* %c
  ; CHECK-DAG: st.d [[R3]], 0($4)

  ret void
  ; CHECK: .size srl_v2i64
}

; Logical right shift (lshr) of every element of the <16 x i8> vector loaded
; from %a by the constant 1; the result is stored to %c.
; Expected code: ld.b, the immediate form srli.b with operand 1, then st.b.
define void @srl_v16i8_i(<16 x i8>* %c, <16 x i8>* %a) nounwind {
  ; CHECK-LABEL: srl_v16i8_i:

  %1 = load <16 x i8>, <16 x i8>* %a
  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
  %2 = lshr <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  ; CHECK-DAG: srli.b [[R4:\$w[0-9]+]], [[R1]], 1
  store <16 x i8> %2, <16 x i8>* %c
  ; CHECK-DAG: st.b [[R4]], 0($4)

  ret void
  ; CHECK: .size srl_v16i8_i
}

; Logical right shift (lshr) of every element of the <8 x i16> vector loaded
; from %a by the constant 1; the result is stored to %c.
; Expected code: ld.h, the immediate form srli.h with operand 1, then st.h.
define void @srl_v8i16_i(<8 x i16>* %c, <8 x i16>* %a) nounwind {
  ; CHECK-LABEL: srl_v8i16_i:

  %1 = load <8 x i16>, <8 x i16>* %a
  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
  %2 = lshr <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  ; CHECK-DAG: srli.h [[R4:\$w[0-9]+]], [[R1]], 1
  store <8 x i16> %2, <8 x i16>* %c
  ; CHECK-DAG: st.h [[R4]], 0($4)

  ret void
  ; CHECK: .size srl_v8i16_i
}

; Logical right shift (lshr) of every element of the <4 x i32> vector loaded
; from %a by the constant 1; the result is stored to %c.
; Expected code: ld.w, the immediate form srli.w with operand 1, then st.w.
define void @srl_v4i32_i(<4 x i32>* %c, <4 x i32>* %a) nounwind {
  ; CHECK-LABEL: srl_v4i32_i:

  %1 = load <4 x i32>, <4 x i32>* %a
  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
  %2 = lshr <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
  ; CHECK-DAG: srli.w [[R4:\$w[0-9]+]], [[R1]], 1
  store <4 x i32> %2, <4 x i32>* %c
  ; CHECK-DAG: st.w [[R4]], 0($4)

  ret void
  ; CHECK: .size srl_v4i32_i
}

; Logical right shift (lshr) of every element of the <2 x i64> vector loaded
; from %a by the constant 1; the result is stored to %c.
; Expected code: ld.d, the immediate form srli.d with operand 1, then st.d.
define void @srl_v2i64_i(<2 x i64>* %c, <2 x i64>* %a) nounwind {
  ; CHECK-LABEL: srl_v2i64_i:

  %1 = load <2 x i64>, <2 x i64>* %a
  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
  %2 = lshr <2 x i64> %1, <i64 1, i64 1>
  ; CHECK-DAG: srli.d [[R4:\$w[0-9]+]], [[R1]], 1
  store <2 x i64> %2, <2 x i64>* %c
  ; CHECK-DAG: st.d [[R4]], 0($4)

  ret void
  ; CHECK: .size srl_v2i64_i
}

; Per-element population count of the <16 x i8> vector loaded from %a, computed
; with the llvm.ctpop.v16i8 intrinsic; the result is stored to %c.
; Expected code: ld.b, pcnt.b, then st.b.
define void @ctpop_v16i8(<16 x i8>* %c, <16 x i8>* %a) nounwind {
  ; CHECK-LABEL: ctpop_v16i8:

  %1 = load <16 x i8>, <16 x i8>* %a
  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
  %2 = tail call <16 x i8> @llvm.ctpop.v16i8 (<16 x i8> %1)
  ; CHECK-DAG: pcnt.b [[R3:\$w[0-9]+]], [[R1]]
  store <16 x i8> %2, <16 x i8>* %c
  ; CHECK-DAG: st.b [[R3]], 0($4)

  ret void
  ; CHECK: .size ctpop_v16i8
}

; Per-element population count of the <8 x i16> vector loaded from %a, computed
; with the llvm.ctpop.v8i16 intrinsic; the result is stored to %c.
; Expected code: ld.h, pcnt.h, then st.h.
define void @ctpop_v8i16(<8 x i16>* %c, <8 x i16>* %a) nounwind {
  ; CHECK-LABEL: ctpop_v8i16:

  %1 = load <8 x i16>, <8 x i16>* %a
  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
  %2 = tail call <8 x i16> @llvm.ctpop.v8i16 (<8 x i16> %1)
  ; CHECK-DAG: pcnt.h [[R3:\$w[0-9]+]], [[R1]]
  store <8 x i16> %2, <8 x i16>* %c
  ; CHECK-DAG: st.h [[R3]], 0($4)

  ret void
  ; CHECK: .size ctpop_v8i16
}

; Per-element population count of the <4 x i32> vector loaded from %a, computed
; with the llvm.ctpop.v4i32 intrinsic; the result is stored to %c.
; Expected code: ld.w, pcnt.w, then st.w.
define void @ctpop_v4i32(<4 x i32>* %c, <4 x i32>* %a) nounwind {
  ; CHECK-LABEL: ctpop_v4i32:

  %1 = load <4 x i32>, <4 x i32>* %a
  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
  %2 = tail call <4 x i32> @llvm.ctpop.v4i32 (<4 x i32> %1)
  ; CHECK-DAG: pcnt.w [[R3:\$w[0-9]+]], [[R1]]
  store <4 x i32> %2, <4 x i32>* %c
  ; CHECK-DAG: st.w [[R3]], 0($4)

  ret void
  ; CHECK: .size ctpop_v4i32
}

; Per-element population count of the <2 x i64> vector loaded from %a, computed
; with the llvm.ctpop.v2i64 intrinsic; the result is stored to %c.
; Expected code: ld.d, pcnt.d, then st.d.
define void @ctpop_v2i64(<2 x i64>* %c, <2 x i64>* %a) nounwind {
  ; CHECK-LABEL: ctpop_v2i64:

  %1 = load <2 x i64>, <2 x i64>* %a
  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
  %2 = tail call <2 x i64> @llvm.ctpop.v2i64 (<2 x i64> %1)
  ; CHECK-DAG: pcnt.d [[R3:\$w[0-9]+]], [[R1]]
  store <2 x i64> %2, <2 x i64>* %c
  ; CHECK-DAG: st.d [[R3]], 0($4)

  ret void
  ; CHECK: .size ctpop_v2i64
}

; Per-element leading-zero count of the <16 x i8> vector loaded from %a,
; computed with the llvm.ctlz.v16i8 intrinsic; the result is stored to %c.
; Expected code: ld.b, nlzc.b, then st.b.
; NOTE(review): the call passes a single operand; the intrinsic declaration is
; outside this view, so confirm the two agree.
define void @ctlz_v16i8(<16 x i8>* %c, <16 x i8>* %a) nounwind {
  ; CHECK-LABEL: ctlz_v16i8:

  %1 = load <16 x i8>, <16 x i8>* %a
  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
  %2 = tail call <16 x i8> @llvm.ctlz.v16i8 (<16 x i8> %1)
  ; CHECK-DAG: nlzc.b [[R3:\$w[0-9]+]], [[R1]]
  store <16 x i8> %2, <16 x i8>* %c
  ; CHECK-DAG: st.b [[R3]], 0($4)

  ret void
  ; CHECK: .size ctlz_v16i8
}

; Per-element leading-zero count of the <8 x i16> vector loaded from %a,
; computed with the llvm.ctlz.v8i16 intrinsic; the result is stored to %c.
; Expected code: ld.h, nlzc.h, then st.h.
; NOTE(review): the call passes a single operand; the intrinsic declaration is
; outside this view, so confirm the two agree.
define void @ctlz_v8i16(<8 x i16>* %c, <8 x i16>* %a) nounwind {
  ; CHECK-LABEL: ctlz_v8i16:

  %1 = load <8 x i16>, <8 x i16>* %a
  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
  %2 = tail call <8 x i16> @llvm.ctlz.v8i16 (<8 x i16> %1)
  ; CHECK-DAG: nlzc.h [[R3:\$w[0-9]+]], [[R1]]
  store <8 x i16> %2, <8 x i16>* %c
  ; CHECK-DAG: st.h [[R3]], 0($4)

  ret void
  ; CHECK: .size ctlz_v8i16
}

; Per-element leading-zero count of the <4 x i32> vector loaded from %a,
; computed with the llvm.ctlz.v4i32 intrinsic; the result is stored to %c.
; Expected code: ld.w, nlzc.w, then st.w.
; NOTE(review): the call passes a single operand; the intrinsic declaration is
; outside this view, so confirm the two agree.
define void @ctlz_v4i32(<4 x i32>* %c, <4 x i32>* %a) nounwind {
  ; CHECK-LABEL: ctlz_v4i32:

  %1 = load <4 x i32>, <4 x i32>* %a
  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
  %2 = tail call <4 x i32> @llvm.ctlz.v4i32 (<4 x i32> %1)
  ; CHECK-DAG: nlzc.w [[R3:\$w[0-9]+]], [[R1]]
  store <4 x i32> %2, <4 x i32>* %c
  ; CHECK-DAG: st.w [[R3]], 0($4)

  ret void
  ; CHECK: .size ctlz_v4i32
}

; Per-element leading-zero count of the <2 x i64> vector loaded from %a,
; computed with the llvm.ctlz.v2i64 intrinsic; the result is stored to %c.
; Expected code: ld.d, nlzc.d, then st.d.
; NOTE(review): the call passes a single operand; the intrinsic declaration is
; outside this view, so confirm the two agree.
define void @ctlz_v2i64(<2 x i64>* %c, <2 x i64>* %a) nounwind {
  ; CHECK-LABEL: ctlz_v2i64:

  %1 = load <2 x i64>, <2 x i64>* %a
  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
  %2 = tail call <2 x i64> @llvm.ctlz.v2i64 (<2 x i64> %1)
  ; CHECK-DAG: nlzc.d [[R3:\$w[0-9]+]], [[R1]]
  store <2 x i64> %2, <2 x i64>* %c
  ; CHECK-DAG: st.d [[R3]], 0($4)

  ret void
  ; CHECK: .size ctlz_v2i64
}

; Bit select on <16 x i8>: computes (a & m) | (b & ~m), where a, b and m are
; loaded from %a, %b and %m, and stores the result to %c.
; Expected code: three ld.b loads, bmnz.v writing into the register that holds
; b, then st.b of that register.
define void @bsel_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b, <16 x i8>* %m) nounwind {
  ; CHECK-LABEL: bsel_v16i8:

  %1 = load <16 x i8>, <16 x i8>* %a
  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
  %2 = load <16 x i8>, <16 x i8>* %b
  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
  %3 = load <16 x i8>, <16 x i8>* %m
  ; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], 0($7)
  %4 = xor <16 x i8> %3, <i8 -1, i8 -1, i8 -1, i8 -1,
                          i8 -1, i8 -1, i8 -1, i8 -1,
                          i8 -1, i8 -1, i8 -1, i8 -1,
                          i8 -1, i8 -1, i8 -1, i8 -1>
  %5 = and <16 x i8> %1, %3
  %6 = and <16 x i8> %2, %4
  %7 = or <16 x i8> %5, %6
  ; bmnz is the same operation
  ; (vselect Mask, IfSet, IfClr) -> (BMNZ IfClr, IfSet, Mask)
  ; CHECK-DAG: bmnz.v [[R2]], [[R1]], [[R3]]
  store <16 x i8> %7, <16 x i8>* %c
  ; CHECK-DAG: st.b [[R2]], 0($4)

  ret void
  ; CHECK: .size bsel_v16i8
}

; Test: the bitwise select (%a & ~%m) | (splat(6) & %m) on <16 x i8>, i.e. the
; "mask set" operand is a constant splat. The expected output uses bseli.b
; with immediate 6, overwriting the register that holds the mask ([[R3]]),
; which is then stored to %c. Here %m is the third argument, so the expected
; load of the mask goes through $6.
1002define void @bsel_v16i8_i(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %m) nounwind {
1003  ; CHECK: bsel_v16i8_i:
1004
1005  %1 = load <16 x i8>, <16 x i8>* %a
1006  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
1007  %2 = load <16 x i8>, <16 x i8>* %m
1008  ; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], 0($6)
  ; %3 is the complement of the mask (xor with all-ones).
1009  %3 = xor <16 x i8> %2, <i8 -1, i8 -1, i8 -1, i8 -1,
1010                          i8 -1, i8 -1, i8 -1, i8 -1,
1011                          i8 -1, i8 -1, i8 -1, i8 -1,
1012                          i8 -1, i8 -1, i8 -1, i8 -1>
  ; Bits of %a where the mask is clear, bits of the constant 6 where it is set.
1013  %4 = and <16 x i8> %1, %3
1014  %5 = and <16 x i8> <i8 6, i8 6, i8 6, i8 6,
1015                      i8 6, i8 6, i8 6, i8 6,
1016                      i8 6, i8 6, i8 6, i8 6,
1017                      i8 6, i8 6, i8 6, i8 6>, %2
1018  %6 = or <16 x i8> %4, %5
1019  ; CHECK-DAG: bseli.b [[R3]], [[R1]], 6
1020  store <16 x i8> %6, <16 x i8>* %c
1021  ; CHECK-DAG: st.b [[R3]], 0($4)
1022
1023  ret void
1024  ; CHECK: .size bsel_v16i8_i
1025}
1026
; Test: (%a & splat(6)) | (%b & splat(65529)) on <8 x i16>. 65529 is 0xFFF9,
; the 16-bit complement of 6, so the two constant masks are complementary.
; The expected output materialises the mask 6 with ldi.h and then combines
; the inputs with bsel.v, whose destination is the mask register ([[R3]]).
1027define void @bsel_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
1028  ; CHECK: bsel_v8i16:
1029
1030  %1 = load <8 x i16>, <8 x i16>* %a
1031  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
1032  %2 = load <8 x i16>, <8 x i16>* %b
1033  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
1034  %3 = and <8 x i16> %1, <i16 6, i16 6, i16 6, i16 6,
1035                          i16 6, i16 6, i16 6, i16 6>
1036  %4 = and <8 x i16> %2, <i16 65529, i16 65529, i16 65529, i16 65529,
1037                          i16 65529, i16 65529, i16 65529, i16 65529>
1038  %5 = or <8 x i16> %3, %4
1039  ; CHECK-DAG: ldi.h [[R3:\$w[0-9]+]], 6
1040  ; CHECK-DAG: bsel.v [[R3]], [[R2]], [[R1]]
1041  store <8 x i16> %5, <8 x i16>* %c
1042  ; CHECK-DAG: st.h [[R3]], 0($4)
1043
1044  ret void
1045  ; CHECK: .size bsel_v8i16
1046}
1047
; Test: (%a & splat(6)) | (%b & splat(4294967289)) on <4 x i32>. 4294967289 is
; 0xFFFFFFF9, the 32-bit complement of 6, so the two constant masks are
; complementary. The expected output materialises the mask 6 with ldi.w and
; then combines the inputs with bsel.v, whose destination is the mask
; register ([[R3]]).
1048define void @bsel_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
1049  ; CHECK: bsel_v4i32:
1050
1051  %1 = load <4 x i32>, <4 x i32>* %a
1052  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
1053  %2 = load <4 x i32>, <4 x i32>* %b
1054  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
1055  %3 = and <4 x i32> %1, <i32 6, i32 6, i32 6, i32 6>
1056  %4 = and <4 x i32> %2, <i32 4294967289, i32 4294967289, i32 4294967289, i32 4294967289>
1057  %5 = or <4 x i32> %3, %4
1058  ; CHECK-DAG: ldi.w [[R3:\$w[0-9]+]], 6
1059  ; CHECK-DAG: bsel.v [[R3]], [[R2]], [[R1]]
1060  store <4 x i32> %5, <4 x i32>* %c
1061  ; CHECK-DAG: st.w [[R3]], 0($4)
1062
1063  ret void
1064  ; CHECK: .size bsel_v4i32
1065}
1066
; Test: (%a & splat(6)) | (%b & splat(18446744073709551609)) on <2 x i64>.
; 18446744073709551609 is 0xFFFFFFFFFFFFFFF9, the 64-bit complement of 6, so
; the two constant masks are complementary. The expected output materialises
; the mask 6 with ldi.d and then combines the inputs with bsel.v, whose
; destination is the mask register ([[R3]]).
1067define void @bsel_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
1068  ; CHECK: bsel_v2i64:
1069
1070  %1 = load <2 x i64>, <2 x i64>* %a
1071  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
1072  %2 = load <2 x i64>, <2 x i64>* %b
1073  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
1074  %3 = and <2 x i64> %1, <i64 6, i64 6>
1075  %4 = and <2 x i64> %2, <i64 18446744073709551609, i64 18446744073709551609>
1076  %5 = or <2 x i64> %3, %4
1077  ; CHECK-DAG: ldi.d [[R3:\$w[0-9]+]], 6
1078  ; CHECK-DAG: bsel.v [[R3]], [[R2]], [[R1]]
1079  store <2 x i64> %5, <2 x i64>* %c
1080  ; CHECK-DAG: st.d [[R3]], 0($4)
1081
1082  ret void
1083  ; CHECK: .size bsel_v2i64
1084}
1085
; Test: (%a & splat(192)) | (%b & splat(63)) on <16 x i8>. 192 is 0xC0 and 63
; is 0x3F, so the top two bits of every element come from %a and the low six
; bits from %b. The expected output is binsli.b with immediate 1, overwriting
; the register that holds %b ([[R2]]).
; NOTE(review): the immediate looks like "number of inserted bits minus one"
; (2 bits -> 1) - confirm against the MSA instruction reference.
1086define void @binsl_v16i8_i(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
1087  ; CHECK: binsl_v16i8_i:
1088
1089  %1 = load <16 x i8>, <16 x i8>* %a
1090  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
1091  %2 = load <16 x i8>, <16 x i8>* %b
1092  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
1093  %3 = and <16 x i8> %1, <i8 192, i8 192, i8 192, i8 192,
1094                          i8 192, i8 192, i8 192, i8 192,
1095                          i8 192, i8 192, i8 192, i8 192,
1096                          i8 192, i8 192, i8 192, i8 192>
1097  %4 = and <16 x i8> %2, <i8 63, i8 63, i8 63, i8 63,
1098                          i8 63, i8 63, i8 63, i8 63,
1099                          i8 63, i8 63, i8 63, i8 63,
1100                          i8 63, i8 63, i8 63, i8 63>
1101  %5 = or <16 x i8> %3, %4
1102  ; CHECK-DAG: binsli.b [[R2]], [[R1]], 1
1103  store <16 x i8> %5, <16 x i8>* %c
1104  ; CHECK-DAG: st.b [[R2]], 0($4)
1105
1106  ret void
1107  ; CHECK: .size binsl_v16i8_i
1108}
1109
; Test: (%a & splat(49152)) | (%b & splat(16383)) on <8 x i16>. 49152 is
; 0xC000 and 16383 is 0x3FFF, so the top two bits of every element come from
; %a and the remaining low bits from %b. The expected output is binsli.h with
; immediate 1, overwriting the register that holds %b ([[R2]]).
; NOTE(review): the immediate looks like "number of inserted bits minus one"
; (2 bits -> 1) - confirm against the MSA instruction reference.
1110define void @binsl_v8i16_i(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
1111  ; CHECK: binsl_v8i16_i:
1112
1113  %1 = load <8 x i16>, <8 x i16>* %a
1114  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
1115  %2 = load <8 x i16>, <8 x i16>* %b
1116  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
1117  %3 = and <8 x i16> %1, <i16 49152, i16 49152, i16 49152, i16 49152,
1118                          i16 49152, i16 49152, i16 49152, i16 49152>
1119  %4 = and <8 x i16> %2, <i16 16383, i16 16383, i16 16383, i16 16383,
1120                          i16 16383, i16 16383, i16 16383, i16 16383>
1121  %5 = or <8 x i16> %3, %4
1122  ; CHECK-DAG: binsli.h [[R2]], [[R1]], 1
1123  store <8 x i16> %5, <8 x i16>* %c
1124  ; CHECK-DAG: st.h [[R2]], 0($4)
1125
1126  ret void
1127  ; CHECK: .size binsl_v8i16_i
1128}
1129
; Test: (%a & splat(3221225472)) | (%b & splat(1073741823)) on <4 x i32>.
; 3221225472 is 0xC0000000 and 1073741823 is 0x3FFFFFFF, so the top two bits
; of every element come from %a and the remaining low bits from %b. The
; expected output is binsli.w with immediate 1, overwriting the register that
; holds %b ([[R2]]).
; NOTE(review): the immediate looks like "number of inserted bits minus one"
; (2 bits -> 1) - confirm against the MSA instruction reference.
1130define void @binsl_v4i32_i(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
1131  ; CHECK: binsl_v4i32_i:
1132
1133  %1 = load <4 x i32>, <4 x i32>* %a
1134  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
1135  %2 = load <4 x i32>, <4 x i32>* %b
1136  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
1137  %3 = and <4 x i32> %1, <i32 3221225472, i32 3221225472, i32 3221225472, i32 3221225472>
1138  %4 = and <4 x i32> %2, <i32 1073741823, i32 1073741823, i32 1073741823, i32 1073741823>
1139  %5 = or <4 x i32> %3, %4
1140  ; CHECK-DAG: binsli.w [[R2]], [[R1]], 1
1141  store <4 x i32> %5, <4 x i32>* %c
1142  ; CHECK-DAG: st.w [[R2]], 0($4)
1143
1144  ret void
1145  ; CHECK: .size binsl_v4i32_i
1146}
1147
; Test: (%a & splat(18446744073709551608)) | (%b & splat(7)) on <2 x i64>.
; 18446744073709551608 is 0xFFFFFFFFFFFFFFF8, so the top 61 bits of every
; element come from %a and the low three bits from %b. The expected output is
; binsli.d with immediate 60, overwriting the register that holds %b ([[R2]]).
; Unlike the narrower binsl tests, which insert only two bits, this one uses a
; wide mask; the TODO below explains why.
; NOTE(review): the immediate looks like "number of inserted bits minus one"
; (61 bits -> 60) - confirm against the MSA instruction reference.
1148define void @binsl_v2i64_i(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
1149  ; CHECK: binsl_v2i64_i:
1150
1151  %1 = load <2 x i64>, <2 x i64>* %a
1152  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
1153  %2 = load <2 x i64>, <2 x i64>* %b
1154  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
1155  %3 = and <2 x i64> %1, <i64 18446744073709551608, i64 18446744073709551608>
1156  %4 = and <2 x i64> %2, <i64 7, i64 7>
1157  %5 = or <2 x i64> %3, %4
1158  ; TODO: We use a particularly wide mask here to work around a legalization
1159  ;       issue. If the mask doesn't fit within a 10-bit immediate, it gets
1160  ;       legalized into a constant pool. We should add a test to cover the
1161  ;       other cases once they correctly select binsli.d.
1162  ; CHECK-DAG: binsli.d [[R2]], [[R1]], 60
1163  store <2 x i64> %5, <2 x i64>* %c
1164  ; CHECK-DAG: st.d [[R2]], 0($4)
1165
1166  ret void
1167  ; CHECK: .size binsl_v2i64_i
1168}
1169
; Test: (%a & splat(3)) | (%b & splat(252)) on <16 x i8>. 252 is 0xFC, the
; 8-bit complement of 3, so the low two bits of every element come from %a
; and the upper six bits from %b. The expected output is binsri.b with
; immediate 1, overwriting the register that holds %b ([[R2]]).
; NOTE(review): the immediate looks like "number of inserted bits minus one"
; (2 bits -> 1) - confirm against the MSA instruction reference.
1170define void @binsr_v16i8_i(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
1171  ; CHECK: binsr_v16i8_i:
1172
1173  %1 = load <16 x i8>, <16 x i8>* %a
1174  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
1175  %2 = load <16 x i8>, <16 x i8>* %b
1176  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
1177  %3 = and <16 x i8> %1, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3,
1178                          i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
1179  %4 = and <16 x i8> %2, <i8 252, i8 252, i8 252, i8 252,
1180                          i8 252, i8 252, i8 252, i8 252,
1181                          i8 252, i8 252, i8 252, i8 252,
1182                          i8 252, i8 252, i8 252, i8 252>
1183  %5 = or <16 x i8> %3, %4
1184  ; CHECK-DAG: binsri.b [[R2]], [[R1]], 1
1185  store <16 x i8> %5, <16 x i8>* %c
1186  ; CHECK-DAG: st.b [[R2]], 0($4)
1187
1188  ret void
1189  ; CHECK: .size binsr_v16i8_i
1190}
1191
; Test: (%a & splat(3)) | (%b & splat(65532)) on <8 x i16>. 65532 is 0xFFFC,
; the 16-bit complement of 3, so the low two bits of every element come from
; %a and the remaining upper bits from %b. The expected output is binsri.h
; with immediate 1, overwriting the register that holds %b ([[R2]]).
; NOTE(review): the immediate looks like "number of inserted bits minus one"
; (2 bits -> 1) - confirm against the MSA instruction reference.
1192define void @binsr_v8i16_i(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
1193  ; CHECK: binsr_v8i16_i:
1194
1195  %1 = load <8 x i16>, <8 x i16>* %a
1196  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
1197  %2 = load <8 x i16>, <8 x i16>* %b
1198  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
1199  %3 = and <8 x i16> %1, <i16 3, i16 3, i16 3, i16 3,
1200                          i16 3, i16 3, i16 3, i16 3>
1201  %4 = and <8 x i16> %2, <i16 65532, i16 65532, i16 65532, i16 65532,
1202                          i16 65532, i16 65532, i16 65532, i16 65532>
1203  %5 = or <8 x i16> %3, %4
1204  ; CHECK-DAG: binsri.h [[R2]], [[R1]], 1
1205  store <8 x i16> %5, <8 x i16>* %c
1206  ; CHECK-DAG: st.h [[R2]], 0($4)
1207
1208  ret void
1209  ; CHECK: .size binsr_v8i16_i
1210}
1211
; Test: (%a & splat(3)) | (%b & splat(4294967292)) on <4 x i32>. 4294967292 is
; 0xFFFFFFFC, the 32-bit complement of 3, so the low two bits of every element
; come from %a and the remaining upper bits from %b. The expected output is
; binsri.w with immediate 1, overwriting the register that holds %b ([[R2]]).
; NOTE(review): the immediate looks like "number of inserted bits minus one"
; (2 bits -> 1) - confirm against the MSA instruction reference.
1212define void @binsr_v4i32_i(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
1213  ; CHECK: binsr_v4i32_i:
1214
1215  %1 = load <4 x i32>, <4 x i32>* %a
1216  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
1217  %2 = load <4 x i32>, <4 x i32>* %b
1218  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
1219  %3 = and <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
1220  %4 = and <4 x i32> %2, <i32 4294967292, i32 4294967292, i32 4294967292, i32 4294967292>
1221  %5 = or <4 x i32> %3, %4
1222  ; CHECK-DAG: binsri.w [[R2]], [[R1]], 1
1223  store <4 x i32> %5, <4 x i32>* %c
1224  ; CHECK-DAG: st.w [[R2]], 0($4)
1225
1226  ret void
1227  ; CHECK: .size binsr_v4i32_i
1228}
1229
; Test: (%a & splat(3)) | (%b & splat(18446744073709551612)) on <2 x i64>.
; 18446744073709551612 is 0xFFFFFFFFFFFFFFFC, the 64-bit complement of 3, so
; the low two bits of every element come from %a and the remaining upper bits
; from %b. The expected output is binsri.d with immediate 1, overwriting the
; register that holds %b ([[R2]]).
; NOTE(review): the immediate looks like "number of inserted bits minus one"
; (2 bits -> 1) - confirm against the MSA instruction reference.
1230define void @binsr_v2i64_i(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
1231  ; CHECK: binsr_v2i64_i:
1232
1233  %1 = load <2 x i64>, <2 x i64>* %a
1234  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
1235  %2 = load <2 x i64>, <2 x i64>* %b
1236  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
1237  %3 = and <2 x i64> %1, <i64 3, i64 3>
1238  %4 = and <2 x i64> %2, <i64 18446744073709551612, i64 18446744073709551612>
1239  %5 = or <2 x i64> %3, %4
1240  ; CHECK-DAG: binsri.d [[R2]], [[R1]], 1
1241  store <2 x i64> %5, <2 x i64>* %c
1242  ; CHECK-DAG: st.d [[R2]], 0($4)
1243
1244  ret void
1245  ; CHECK: .size binsr_v2i64_i
1246}
1247
; Test: %a & ~(splat(1) << %b) on <16 x i8>, i.e. in every element of %a the
; bit whose index is given by the matching element of %b is cleared. The whole
; shl/xor/and sequence is expected to be emitted as a single bclr.b.
1248define void @bclr_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
1249  ; CHECK: bclr_v16i8:
1250
1251  %1 = load <16 x i8>, <16 x i8>* %a
1252  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
1253  %2 = load <16 x i8>, <16 x i8>* %b
1254  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
  ; %3 = single-bit mask per element, %4 = its complement.
1255  %3 = shl <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, %2
1256  %4 = xor <16 x i8> %3, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
1257  %5 = and <16 x i8> %1, %4
1258  ; CHECK-DAG: bclr.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
1259  store <16 x i8> %5, <16 x i8>* %c
1260  ; CHECK-DAG: st.b [[R3]], 0($4)
1261
1262  ret void
1263  ; CHECK: .size bclr_v16i8
1264}
1265
; Test: %a & ~(splat(1) << %b) on <8 x i16>, i.e. in every element of %a the
; bit whose index is given by the matching element of %b is cleared. The whole
; shl/xor/and sequence is expected to be emitted as a single bclr.h.
1266define void @bclr_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
1267  ; CHECK: bclr_v8i16:
1268
1269  %1 = load <8 x i16>, <8 x i16>* %a
1270  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
1271  %2 = load <8 x i16>, <8 x i16>* %b
1272  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
  ; %3 = single-bit mask per element, %4 = its complement.
1273  %3 = shl <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, %2
1274  %4 = xor <8 x i16> %3, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
1275  %5 = and <8 x i16> %1, %4
1276  ; CHECK-DAG: bclr.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
1277  store <8 x i16> %5, <8 x i16>* %c
1278  ; CHECK-DAG: st.h [[R3]], 0($4)
1279
1280  ret void
1281  ; CHECK: .size bclr_v8i16
1282}
1283
; Test: %a & ~(splat(1) << %b) on <4 x i32>, i.e. in every element of %a the
; bit whose index is given by the matching element of %b is cleared. The whole
; shl/xor/and sequence is expected to be emitted as a single bclr.w.
1284define void @bclr_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
1285  ; CHECK: bclr_v4i32:
1286
1287  %1 = load <4 x i32>, <4 x i32>* %a
1288  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
1289  %2 = load <4 x i32>, <4 x i32>* %b
1290  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
  ; %3 = single-bit mask per element, %4 = its complement.
1291  %3 = shl <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %2
1292  %4 = xor <4 x i32> %3, <i32 -1, i32 -1, i32 -1, i32 -1>
1293  %5 = and <4 x i32> %1, %4
1294  ; CHECK-DAG: bclr.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
1295  store <4 x i32> %5, <4 x i32>* %c
1296  ; CHECK-DAG: st.w [[R3]], 0($4)
1297
1298  ret void
1299  ; CHECK: .size bclr_v4i32
1300}
1301
; Test: %a & ~(splat(1) << %b) on <2 x i64>, i.e. in every element of %a the
; bit whose index is given by the matching element of %b is cleared. The whole
; shl/xor/and sequence is expected to be emitted as a single bclr.d.
1302define void @bclr_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
1303  ; CHECK: bclr_v2i64:
1304
1305  %1 = load <2 x i64>, <2 x i64>* %a
1306  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
1307  %2 = load <2 x i64>, <2 x i64>* %b
1308  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
  ; %3 = single-bit mask per element, %4 = its complement.
1309  %3 = shl <2 x i64> <i64 1, i64 1>, %2
1310  %4 = xor <2 x i64> %3, <i64 -1, i64 -1>
1311  %5 = and <2 x i64> %1, %4
1312  ; CHECK-DAG: bclr.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
1313  store <2 x i64> %5, <2 x i64>* %c
1314  ; CHECK-DAG: st.d [[R3]], 0($4)
1315
1316  ret void
1317  ; CHECK: .size bclr_v2i64
1318}
1319
; Test: %a | (splat(1) << %b) on <16 x i8>, i.e. in every element of %a the
; bit whose index is given by the matching element of %b is set. The shl/or
; pair is expected to be emitted as a single bset.b.
1320define void @bset_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
1321  ; CHECK: bset_v16i8:
1322
1323  %1 = load <16 x i8>, <16 x i8>* %a
1324  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
1325  %2 = load <16 x i8>, <16 x i8>* %b
1326  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
1327  %3 = shl <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, %2
1328  %4 = or <16 x i8> %1, %3
1329  ; CHECK-DAG: bset.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
1330  store <16 x i8> %4, <16 x i8>* %c
1331  ; CHECK-DAG: st.b [[R3]], 0($4)
1332
1333  ret void
1334  ; CHECK: .size bset_v16i8
1335}
1336
; Test: %a | (splat(1) << %b) on <8 x i16>, i.e. in every element of %a the
; bit whose index is given by the matching element of %b is set. The shl/or
; pair is expected to be emitted as a single bset.h.
1337define void @bset_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
1338  ; CHECK: bset_v8i16:
1339
1340  %1 = load <8 x i16>, <8 x i16>* %a
1341  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
1342  %2 = load <8 x i16>, <8 x i16>* %b
1343  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
1344  %3 = shl <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, %2
1345  %4 = or <8 x i16> %1, %3
1346  ; CHECK-DAG: bset.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
1347  store <8 x i16> %4, <8 x i16>* %c
1348  ; CHECK-DAG: st.h [[R3]], 0($4)
1349
1350  ret void
1351  ; CHECK: .size bset_v8i16
1352}
1353
; Test: %a | (splat(1) << %b) on <4 x i32>, i.e. in every element of %a the
; bit whose index is given by the matching element of %b is set. The shl/or
; pair is expected to be emitted as a single bset.w.
1354define void @bset_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
1355  ; CHECK: bset_v4i32:
1356
1357  %1 = load <4 x i32>, <4 x i32>* %a
1358  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
1359  %2 = load <4 x i32>, <4 x i32>* %b
1360  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
1361  %3 = shl <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %2
1362  %4 = or <4 x i32> %1, %3
1363  ; CHECK-DAG: bset.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
1364  store <4 x i32> %4, <4 x i32>* %c
1365  ; CHECK-DAG: st.w [[R3]], 0($4)
1366
1367  ret void
1368  ; CHECK: .size bset_v4i32
1369}
1370
; Test: %a | (splat(1) << %b) on <2 x i64>, i.e. in every element of %a the
; bit whose index is given by the matching element of %b is set. The shl/or
; pair is expected to be emitted as a single bset.d.
1371define void @bset_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
1372  ; CHECK: bset_v2i64:
1373
1374  %1 = load <2 x i64>, <2 x i64>* %a
1375  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
1376  %2 = load <2 x i64>, <2 x i64>* %b
1377  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
1378  %3 = shl <2 x i64> <i64 1, i64 1>, %2
1379  %4 = or <2 x i64> %1, %3
1380  ; CHECK-DAG: bset.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
1381  store <2 x i64> %4, <2 x i64>* %c
1382  ; CHECK-DAG: st.d [[R3]], 0($4)
1383
1384  ret void
1385  ; CHECK: .size bset_v2i64
1386}
1387
; Test: %a ^ (splat(1) << %b) on <16 x i8>, i.e. in every element of %a the
; bit whose index is given by the matching element of %b is inverted. The
; shl/xor pair is expected to be emitted as a single bneg.b.
1388define void @bneg_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
1389  ; CHECK: bneg_v16i8:
1390
1391  %1 = load <16 x i8>, <16 x i8>* %a
1392  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
1393  %2 = load <16 x i8>, <16 x i8>* %b
1394  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
1395  %3 = shl <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, %2
1396  %4 = xor <16 x i8> %1, %3
1397  ; CHECK-DAG: bneg.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
1398  store <16 x i8> %4, <16 x i8>* %c
1399  ; CHECK-DAG: st.b [[R3]], 0($4)
1400
1401  ret void
1402  ; CHECK: .size bneg_v16i8
1403}
1404
; Test: %a ^ (splat(1) << %b) on <8 x i16>, i.e. in every element of %a the
; bit whose index is given by the matching element of %b is inverted. The
; shl/xor pair is expected to be emitted as a single bneg.h.
1405define void @bneg_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
1406  ; CHECK: bneg_v8i16:
1407
1408  %1 = load <8 x i16>, <8 x i16>* %a
1409  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
1410  %2 = load <8 x i16>, <8 x i16>* %b
1411  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
1412  %3 = shl <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, %2
1413  %4 = xor <8 x i16> %1, %3
1414  ; CHECK-DAG: bneg.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
1415  store <8 x i16> %4, <8 x i16>* %c
1416  ; CHECK-DAG: st.h [[R3]], 0($4)
1417
1418  ret void
1419  ; CHECK: .size bneg_v8i16
1420}
1421
; Test: %a ^ (splat(1) << %b) on <4 x i32>, i.e. in every element of %a the
; bit whose index is given by the matching element of %b is inverted. The
; shl/xor pair is expected to be emitted as a single bneg.w.
1422define void @bneg_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
1423  ; CHECK: bneg_v4i32:
1424
1425  %1 = load <4 x i32>, <4 x i32>* %a
1426  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
1427  %2 = load <4 x i32>, <4 x i32>* %b
1428  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
1429  %3 = shl <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %2
1430  %4 = xor <4 x i32> %1, %3
1431  ; CHECK-DAG: bneg.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
1432  store <4 x i32> %4, <4 x i32>* %c
1433  ; CHECK-DAG: st.w [[R3]], 0($4)
1434
1435  ret void
1436  ; CHECK: .size bneg_v4i32
1437}
1438
; Test: %a ^ (splat(1) << %b) on <2 x i64>, i.e. in every element of %a the
; bit whose index is given by the matching element of %b is inverted. The
; shl/xor pair is expected to be emitted as a single bneg.d.
1439define void @bneg_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
1440  ; CHECK: bneg_v2i64:
1441
1442  %1 = load <2 x i64>, <2 x i64>* %a
1443  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
1444  %2 = load <2 x i64>, <2 x i64>* %b
1445  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
1446  %3 = shl <2 x i64> <i64 1, i64 1>, %2
1447  %4 = xor <2 x i64> %1, %3
1448  ; CHECK-DAG: bneg.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
1449  store <2 x i64> %4, <2 x i64>* %c
1450  ; CHECK-DAG: st.d [[R3]], 0($4)
1451
1452  ret void
1453  ; CHECK: .size bneg_v2i64
1454}
1455
; Test: %a & ~splat(8) on <16 x i8>, i.e. bit 3 of every element is cleared
; using a constant mask. For byte elements the expected output is andi.b with
; immediate 247 (0xF7, the 8-bit complement of 8) rather than bclri.b; the
; existing comment below notes the two are equivalent.
1456define void @bclri_v16i8(<16 x i8>* %c, <16 x i8>* %a) nounwind {
1457  ; CHECK: bclri_v16i8:
1458
1459  %1 = load <16 x i8>, <16 x i8>* %a
1460  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
  ; Constant expression: splat(8) xor all-ones == splat(~8).
1461  %2 = xor <16 x i8> <i8  8, i8  8, i8  8, i8  8, i8  8, i8  8, i8  8, i8  8, i8  8, i8  8, i8  8, i8  8, i8  8, i8  8, i8  8, i8  8>,
1462                     <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
1463  %3 = and <16 x i8> %1, %2
1464  ; bclri.b and andi.b are exactly equivalent.
1465  ; CHECK-DAG: andi.b [[R3:\$w[0-9]+]], [[R1]], 247
1466  store <16 x i8> %3, <16 x i8>* %c
1467  ; CHECK-DAG: st.b [[R3]], 0($4)
1468
1469  ret void
1470  ; CHECK: .size bclri_v16i8
1471}
1472
; Test: %a & ~splat(8) on <8 x i16>, i.e. bit 3 of every element is cleared
; using a constant mask. The expected output is bclri.h with immediate 3
; (8 == 1 << 3).
1473define void @bclri_v8i16(<8 x i16>* %c, <8 x i16>* %a) nounwind {
1474  ; CHECK: bclri_v8i16:
1475
1476  %1 = load <8 x i16>, <8 x i16>* %a
1477  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
  ; Constant expression: splat(8) xor all-ones == splat(~8).
1478  %2 = xor <8 x i16> <i16  8, i16  8, i16  8, i16  8, i16  8, i16  8, i16  8, i16  8>,
1479                     <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
1480  %3 = and <8 x i16> %1, %2
1481  ; CHECK-DAG: bclri.h [[R3:\$w[0-9]+]], [[R1]], 3
1482  store <8 x i16> %3, <8 x i16>* %c
1483  ; CHECK-DAG: st.h [[R3]], 0($4)
1484
1485  ret void
1486  ; CHECK: .size bclri_v8i16
1487}
1488
; Test: %a & ~splat(8) on <4 x i32>, i.e. bit 3 of every element is cleared
; using a constant mask. The expected output is bclri.w with immediate 3
; (8 == 1 << 3).
1489define void @bclri_v4i32(<4 x i32>* %c, <4 x i32>* %a) nounwind {
1490  ; CHECK: bclri_v4i32:
1491
1492  %1 = load <4 x i32>, <4 x i32>* %a
1493  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
  ; Constant expression: splat(8) xor all-ones == splat(~8).
1494  %2 = xor <4 x i32> <i32  8, i32  8, i32  8, i32  8>,
1495                     <i32 -1, i32 -1, i32 -1, i32 -1>
1496  %3 = and <4 x i32> %1, %2
1497  ; CHECK-DAG: bclri.w [[R3:\$w[0-9]+]], [[R1]], 3
1498  store <4 x i32> %3, <4 x i32>* %c
1499  ; CHECK-DAG: st.w [[R3]], 0($4)
1500
1501  ret void
1502  ; CHECK: .size bclri_v4i32
1503}
1504
; Test: %a & ~splat(8) on <2 x i64>, i.e. bit 3 of every element is cleared
; using a constant mask. The expected output is bclri.d with immediate 3
; (8 == 1 << 3).
1505define void @bclri_v2i64(<2 x i64>* %c, <2 x i64>* %a) nounwind {
1506  ; CHECK: bclri_v2i64:
1507
1508  %1 = load <2 x i64>, <2 x i64>* %a
1509  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
  ; Constant expression: splat(8) xor all-ones == splat(~8).
1510  %2 = xor <2 x i64> <i64  8, i64  8>,
1511                     <i64 -1, i64 -1>
1512  %3 = and <2 x i64> %1, %2
1513  ; CHECK-DAG: bclri.d [[R3:\$w[0-9]+]], [[R1]], 3
1514  store <2 x i64> %3, <2 x i64>* %c
1515  ; CHECK-DAG: st.d [[R3]], 0($4)
1516
1517  ret void
1518  ; CHECK: .size bclri_v2i64
1519}
1520
; Test: %a | splat(8) on <16 x i8>, i.e. bit 3 of every element is set using a
; constant mask. The expected output is bseti.b with immediate 3
; (8 == 1 << 3).
1521define void @bseti_v16i8(<16 x i8>* %c, <16 x i8>* %a) nounwind {
1522  ; CHECK: bseti_v16i8:
1523
1524  %1 = load <16 x i8>, <16 x i8>* %a
1525  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
1526  %2 = or <16 x i8> %1, <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>
1527  ; CHECK-DAG: bseti.b [[R3:\$w[0-9]+]], [[R1]], 3
1528  store <16 x i8> %2, <16 x i8>* %c
1529  ; CHECK-DAG: st.b [[R3]], 0($4)
1530
1531  ret void
1532  ; CHECK: .size bseti_v16i8
1533}
1534
; Test: %a | splat(8) on <8 x i16>, i.e. bit 3 of every element is set using a
; constant mask. The expected output is bseti.h with immediate 3
; (8 == 1 << 3).
1535define void @bseti_v8i16(<8 x i16>* %c, <8 x i16>* %a) nounwind {
1536  ; CHECK: bseti_v8i16:
1537
1538  %1 = load <8 x i16>, <8 x i16>* %a
1539  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
1540  %2 = or <8 x i16> %1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
1541  ; CHECK-DAG: bseti.h [[R3:\$w[0-9]+]], [[R1]], 3
1542  store <8 x i16> %2, <8 x i16>* %c
1543  ; CHECK-DAG: st.h [[R3]], 0($4)
1544
1545  ret void
1546  ; CHECK: .size bseti_v8i16
1547}
1548
; Test: %a | splat(8) on <4 x i32>, i.e. bit 3 of every element is set using a
; constant mask. The expected output is bseti.w with immediate 3
; (8 == 1 << 3).
1549define void @bseti_v4i32(<4 x i32>* %c, <4 x i32>* %a) nounwind {
1550  ; CHECK: bseti_v4i32:
1551
1552  %1 = load <4 x i32>, <4 x i32>* %a
1553  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
1554  %2 = or <4 x i32> %1, <i32 8, i32 8, i32 8, i32 8>
1555  ; CHECK-DAG: bseti.w [[R3:\$w[0-9]+]], [[R1]], 3
1556  store <4 x i32> %2, <4 x i32>* %c
1557  ; CHECK-DAG: st.w [[R3]], 0($4)
1558
1559  ret void
1560  ; CHECK: .size bseti_v4i32
1561}
1562
; Test: %a | splat(8) on <2 x i64>, i.e. bit 3 of every element is set using a
; constant mask. The expected output is bseti.d with immediate 3
; (8 == 1 << 3).
1563define void @bseti_v2i64(<2 x i64>* %c, <2 x i64>* %a) nounwind {
1564  ; CHECK: bseti_v2i64:
1565
1566  %1 = load <2 x i64>, <2 x i64>* %a
1567  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
1568  %2 = or <2 x i64> %1, <i64 8, i64 8>
1569  ; CHECK-DAG: bseti.d [[R3:\$w[0-9]+]], [[R1]], 3
1570  store <2 x i64> %2, <2 x i64>* %c
1571  ; CHECK-DAG: st.d [[R3]], 0($4)
1572
1573  ret void
1574  ; CHECK: .size bseti_v2i64
1575}
1576
; Test: %a ^ splat(8) on <16 x i8>, i.e. bit 3 of every element is inverted
; using a constant mask. The expected output is bnegi.b with immediate 3
; (8 == 1 << 3).
1577define void @bnegi_v16i8(<16 x i8>* %c, <16 x i8>* %a) nounwind {
1578  ; CHECK: bnegi_v16i8:
1579
1580  %1 = load <16 x i8>, <16 x i8>* %a
1581  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
1582  %2 = xor <16 x i8> %1, <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>
1583  ; CHECK-DAG: bnegi.b [[R3:\$w[0-9]+]], [[R1]], 3
1584  store <16 x i8> %2, <16 x i8>* %c
1585  ; CHECK-DAG: st.b [[R3]], 0($4)
1586
1587  ret void
1588  ; CHECK: .size bnegi_v16i8
1589}
1590
; Test: %a ^ splat(8) on <8 x i16>, i.e. bit 3 of every element is inverted
; using a constant mask. The expected output is bnegi.h with immediate 3
; (8 == 1 << 3).
1591define void @bnegi_v8i16(<8 x i16>* %c, <8 x i16>* %a) nounwind {
1592  ; CHECK: bnegi_v8i16:
1593
1594  %1 = load <8 x i16>, <8 x i16>* %a
1595  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
1596  %2 = xor <8 x i16> %1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
1597  ; CHECK-DAG: bnegi.h [[R3:\$w[0-9]+]], [[R1]], 3
1598  store <8 x i16> %2, <8 x i16>* %c
1599  ; CHECK-DAG: st.h [[R3]], 0($4)
1600
1601  ret void
1602  ; CHECK: .size bnegi_v8i16
1603}
1604
; Test: %a ^ splat(8) on <4 x i32>, i.e. bit 3 of every element is inverted
; using a constant mask. The expected output is bnegi.w with immediate 3
; (8 == 1 << 3).
1605define void @bnegi_v4i32(<4 x i32>* %c, <4 x i32>* %a) nounwind {
1606  ; CHECK: bnegi_v4i32:
1607
1608  %1 = load <4 x i32>, <4 x i32>* %a
1609  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
1610  %2 = xor <4 x i32> %1, <i32 8, i32 8, i32 8, i32 8>
1611  ; CHECK-DAG: bnegi.w [[R3:\$w[0-9]+]], [[R1]], 3
1612  store <4 x i32> %2, <4 x i32>* %c
1613  ; CHECK-DAG: st.w [[R3]], 0($4)
1614
1615  ret void
1616  ; CHECK: .size bnegi_v4i32
1617}
1618
; Test: %a ^ splat(8) on <2 x i64>, i.e. bit 3 of every element is inverted
; using a constant mask. The expected output is bnegi.d with immediate 3
; (8 == 1 << 3).
1619define void @bnegi_v2i64(<2 x i64>* %c, <2 x i64>* %a) nounwind {
1620  ; CHECK: bnegi_v2i64:
1621
1622  %1 = load <2 x i64>, <2 x i64>* %a
1623  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
1624  %2 = xor <2 x i64> %1, <i64 8, i64 8>
1625  ; CHECK-DAG: bnegi.d [[R3:\$w[0-9]+]], [[R1]], 3
1626  store <2 x i64> %2, <2 x i64>* %c
1627  ; CHECK-DAG: st.d [[R3]], 0($4)
1628
1629  ret void
1630  ; CHECK: .size bnegi_v2i64
1631}
1632
; Declarations of the vector intrinsics called by the tests in this file, one
; per element width. The llvm.ctlz.* declarations are used by the ctlz_* tests
; above, which call them with a single vector operand to match these
; single-parameter signatures.
; NOTE(review): the llvm.ctpop.* declarations are presumably used by ctpop
; tests earlier in the file - confirm before removing any of them.
1633declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %val)
1634declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %val)
1635declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %val)
1636declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val)
1637declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %val)
1638declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %val)
1639declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val)
1640declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %val)
1641