1; Test vector intrinsics added with z14.
2;
3; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
4
; Integer and memory intrinsics exercised below.
declare <2 x i64> @llvm.s390.vbperm(<16 x i8>, <16 x i8>)
declare <16 x i8> @llvm.s390.vmslg(<2 x i64>, <2 x i64>, <16 x i8>, i32)
declare <16 x i8> @llvm.s390.vlrl(i32, i8*)
declare void @llvm.s390.vstrl(<16 x i8>, i32, i8*)

; Single-precision compare/test intrinsics.  Each returns a { vector, i32 }
; pair; the tests below treat the i32 member as the condition code.
declare { <4 x i32>, i32 } @llvm.s390.vfcesbs(<4 x float>, <4 x float>)
declare { <4 x i32>, i32 } @llvm.s390.vfchsbs(<4 x float>, <4 x float>)
declare { <4 x i32>, i32 } @llvm.s390.vfchesbs(<4 x float>, <4 x float>)
declare { <4 x i32>, i32 } @llvm.s390.vftcisb(<4 x float>, i32)
; Single-precision VFISB; the two i32 operands are emitted as immediates.
declare <4 x float> @llvm.s390.vfisb(<4 x float>, i32, i32)

; Maximum/minimum intrinsics; the trailing i32 is emitted as an immediate.
declare <2 x double> @llvm.s390.vfmaxdb(<2 x double>, <2 x double>, i32)
declare <2 x double> @llvm.s390.vfmindb(<2 x double>, <2 x double>, i32)
declare <4 x float> @llvm.s390.vfmaxsb(<4 x float>, <4 x float>, i32)
declare <4 x float> @llvm.s390.vfminsb(<4 x float>, <4 x float>, i32)
20
; VBPERM: the intrinsic should select to one instruction operating on the
; incoming argument registers.
define <2 x i64> @test_vbperm(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: test_vbperm:
; CHECK: vbperm %v24, %v24, %v26
; CHECK: br %r14
  %perm = call <2 x i64> @llvm.s390.vbperm(<16 x i8> %a, <16 x i8> %b)
  ret <2 x i64> %perm
}
29
; VMSLG with no shifts (immediate operand 0).
define <16 x i8> @test_vmslg1(<2 x i64> %a, <2 x i64> %b, <16 x i8> %c) {
; CHECK-LABEL: test_vmslg1:
; CHECK: vmslg %v24, %v24, %v26, %v28, 0
; CHECK: br %r14
  %sum = call <16 x i8> @llvm.s390.vmslg(<2 x i64> %a, <2 x i64> %b,
                                         <16 x i8> %c, i32 0)
  ret <16 x i8> %sum
}
38
; VMSLG with both shifts (immediate operand 12).
define <16 x i8> @test_vmslg2(<2 x i64> %a, <2 x i64> %b, <16 x i8> %c) {
; CHECK-LABEL: test_vmslg2:
; CHECK: vmslg %v24, %v24, %v26, %v28, 12
; CHECK: br %r14
  %sum = call <16 x i8> @llvm.s390.vmslg(<2 x i64> %a, <2 x i64> %b,
                                         <16 x i8> %c, i32 12)
  ret <16 x i8> %sum
}
47
; VLRLR (length supplied in a register) with the lowest in-range
; displacement, 0.
define <16 x i8> @test_vlrlr1(i8* %ptr, i32 %length) {
; CHECK-LABEL: test_vlrlr1:
; CHECK: vlrlr %v24, %r3, 0(%r2)
; CHECK: br %r14
  %vec = call <16 x i8> @llvm.s390.vlrl(i32 %length, i8* %ptr)
  ret <16 x i8> %vec
}
56
; VLRLR with the highest in-range displacement, 4095, which should be folded
; into the instruction's address operand.
define <16 x i8> @test_vlrlr2(i8* %base, i32 %length) {
; CHECK-LABEL: test_vlrlr2:
; CHECK: vlrlr %v24, %r3, 4095(%r2)
; CHECK: br %r14
  %addr = getelementptr i8, i8* %base, i64 4095
  %vec = call <16 x i8> @llvm.s390.vlrl(i32 %length, i8* %addr)
  ret <16 x i8> %vec
}
66
; VLRLR with an out-of-range displacement (4096): the instruction is expected
; to use a zero displacement off some low-numbered register instead.
define <16 x i8> @test_vlrlr3(i8* %base, i32 %length) {
; CHECK-LABEL: test_vlrlr3:
; CHECK: vlrlr %v24, %r3, 0({{%r[1-5]}})
; CHECK: br %r14
  %addr = getelementptr i8, i8* %base, i64 4096
  %vec = call <16 x i8> @llvm.s390.vlrl(i32 %length, i8* %addr)
  ret <16 x i8> %vec
}
76
; Check that VLRLR doesn't allow an index: a base+index address must end up
; as a single register with zero displacement.
define <16 x i8> @test_vlrlr4(i8* %base, i64 %index, i32 %length) {
; CHECK-LABEL: test_vlrlr4:
; CHECK: vlrlr %v24, %r4, 0({{%r[1-5]}})
; CHECK: br %r14
  %addr = getelementptr i8, i8* %base, i64 %index
  %vec = call <16 x i8> @llvm.s390.vlrl(i32 %length, i8* %addr)
  ret <16 x i8> %vec
}
86
; VLRL (constant length, here 0) with the lowest in-range displacement, 0.
define <16 x i8> @test_vlrl1(i8* %ptr) {
; CHECK-LABEL: test_vlrl1:
; CHECK: vlrl %v24, 0(%r2), 0
; CHECK: br %r14
  %vec = call <16 x i8> @llvm.s390.vlrl(i32 0, i8* %ptr)
  ret <16 x i8> %vec
}
95
; VLRL with the highest in-range displacement, 4095, folded into the
; address operand.
define <16 x i8> @test_vlrl2(i8* %base) {
; CHECK-LABEL: test_vlrl2:
; CHECK: vlrl %v24, 4095(%r2), 0
; CHECK: br %r14
  %addr = getelementptr i8, i8* %base, i64 4095
  %vec = call <16 x i8> @llvm.s390.vlrl(i32 0, i8* %addr)
  ret <16 x i8> %vec
}
105
; VLRL with an out-of-range displacement (4096): expect a zero displacement
; off some low-numbered register instead.
define <16 x i8> @test_vlrl3(i8* %base) {
; CHECK-LABEL: test_vlrl3:
; CHECK: vlrl %v24, 0({{%r[1-5]}}), 0
; CHECK: br %r14
  %addr = getelementptr i8, i8* %base, i64 4096
  %vec = call <16 x i8> @llvm.s390.vlrl(i32 0, i8* %addr)
  ret <16 x i8> %vec
}
115
; Check that VLRL doesn't allow an index: a base+index address must end up
; as a single register with zero displacement.
define <16 x i8> @test_vlrl4(i8* %base, i64 %index) {
; CHECK-LABEL: test_vlrl4:
; CHECK: vlrl %v24, 0({{%r[1-5]}}), 0
; CHECK: br %r14
  %addr = getelementptr i8, i8* %base, i64 %index
  %vec = call <16 x i8> @llvm.s390.vlrl(i32 0, i8* %addr)
  ret <16 x i8> %vec
}
125
; VLRL with length >= 15 should become VL.  The length exercised here is
; exactly 15.
define <16 x i8> @test_vlrl5(i8* %ptr) {
; CHECK-LABEL: test_vlrl5:
; CHECK: vl %v24, 0({{%r[1-5]}})
; CHECK: br %r14
  %vec = call <16 x i8> @llvm.s390.vlrl(i32 15, i8* %ptr)
  ret <16 x i8> %vec
}
134
; VSTRLR (length supplied in a register) with the lowest in-range
; displacement, 0.
define void @test_vstrlr1(<16 x i8> %vec, i8* %ptr, i32 %length) {
; CHECK-LABEL: test_vstrlr1:
; CHECK: vstrlr %v24, %r3, 0(%r2)
; CHECK: br %r14
  call void @llvm.s390.vstrl(<16 x i8> %vec, i32 %length, i8* %ptr)
  ret void
}
143
; VSTRLR with the highest in-range displacement, 4095, folded into the
; address operand.
define void @test_vstrlr2(<16 x i8> %vec, i8* %base, i32 %length) {
; CHECK-LABEL: test_vstrlr2:
; CHECK: vstrlr %v24, %r3, 4095(%r2)
; CHECK: br %r14
  %addr = getelementptr i8, i8* %base, i64 4095
  call void @llvm.s390.vstrl(<16 x i8> %vec, i32 %length, i8* %addr)
  ret void
}
153
; VSTRLR with an out-of-range displacement (4096): expect a zero
; displacement off some low-numbered register instead.
define void @test_vstrlr3(<16 x i8> %vec, i8* %base, i32 %length) {
; CHECK-LABEL: test_vstrlr3:
; CHECK: vstrlr %v24, %r3, 0({{%r[1-5]}})
; CHECK: br %r14
  %addr = getelementptr i8, i8* %base, i64 4096
  call void @llvm.s390.vstrl(<16 x i8> %vec, i32 %length, i8* %addr)
  ret void
}
163
; Check that VSTRLR doesn't allow an index: a base+index address must end up
; as a single register with zero displacement.
define void @test_vstrlr4(<16 x i8> %vec, i8* %base, i64 %index, i32 %length) {
; CHECK-LABEL: test_vstrlr4:
; CHECK: vstrlr %v24, %r4, 0({{%r[1-5]}})
; CHECK: br %r14
  %addr = getelementptr i8, i8* %base, i64 %index
  call void @llvm.s390.vstrl(<16 x i8> %vec, i32 %length, i8* %addr)
  ret void
}
173
; VSTRL (constant length, here 8) with the lowest in-range displacement, 0.
define void @test_vstrl1(<16 x i8> %vec, i8* %ptr) {
; CHECK-LABEL: test_vstrl1:
; CHECK: vstrl %v24, 0(%r2), 8
; CHECK: br %r14
  call void @llvm.s390.vstrl(<16 x i8> %vec, i32 8, i8* %ptr)
  ret void
}
182
; VSTRL with the highest in-range displacement, 4095, folded into the
; address operand.
define void @test_vstrl2(<16 x i8> %vec, i8* %base) {
; CHECK-LABEL: test_vstrl2:
; CHECK: vstrl %v24, 4095(%r2), 8
; CHECK: br %r14
  %addr = getelementptr i8, i8* %base, i64 4095
  call void @llvm.s390.vstrl(<16 x i8> %vec, i32 8, i8* %addr)
  ret void
}
192
; VSTRL with an out-of-range displacement (4096): expect a zero displacement
; off some low-numbered register instead.
define void @test_vstrl3(<16 x i8> %vec, i8* %base) {
; CHECK-LABEL: test_vstrl3:
; CHECK: vstrl %v24, 0({{%r[1-5]}}), 8
; CHECK: br %r14
  %addr = getelementptr i8, i8* %base, i64 4096
  call void @llvm.s390.vstrl(<16 x i8> %vec, i32 8, i8* %addr)
  ret void
}
202
; Check that VSTRL doesn't allow an index: a base+index address must end up
; as a single register with zero displacement.
define void @test_vstrl4(<16 x i8> %vec, i8* %base, i64 %index) {
; CHECK-LABEL: test_vstrl4:
; CHECK: vstrl %v24, 0({{%r[1-5]}}), 8
; CHECK: br %r14
  %addr = getelementptr i8, i8* %base, i64 %index
  call void @llvm.s390.vstrl(<16 x i8> %vec, i32 8, i8* %addr)
  ret void
}
212
; VSTRL with length >= 15 should become VST.  The length exercised here is
; exactly 15.
define void @test_vstrl5(<16 x i8> %vec, i8* %ptr) {
; CHECK-LABEL: test_vstrl5:
; CHECK: vst %v24, 0({{%r[1-5]}})
; CHECK: br %r14
  call void @llvm.s390.vstrl(<16 x i8> %vec, i32 15, i8* %ptr)
  ret void
}
221
; VFCESBS with no processing of the result: the i32 member of the returned
; pair is handed back as-is, which should be done with IPM plus a shift.
define i32 @test_vfcesbs(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: test_vfcesbs:
; CHECK: vfcesbs {{%v[0-9]+}}, %v24, %v26
; CHECK: ipm %r2
; CHECK: srl %r2, 28
; CHECK: br %r14
  %pair = call { <4 x i32>, i32 } @llvm.s390.vfcesbs(<4 x float> %a, <4 x float> %b)
  %cc = extractvalue { <4 x i32>, i32 } %pair, 1
  ret i32 %cc
}
234
; VFCESBS, returning 1 if any elements are equal (CC != 3).  The boolean
; should be materialised with a conditional load-immediate, not a branch.
define i32 @test_vfcesbs_any_bool(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: test_vfcesbs_any_bool:
; CHECK: vfcesbs {{%v[0-9]+}}, %v24, %v26
; CHECK: lhi %r2, 0
; CHECK: lochile %r2, 1
; CHECK: br %r14
  %pair = call { <4 x i32>, i32 } @llvm.s390.vfcesbs(<4 x float> %a, <4 x float> %b)
  %cc = extractvalue { <4 x i32>, i32 } %pair, 1
  %any = icmp ne i32 %cc, 3
  %bool = zext i1 %any to i32
  ret i32 %bool
}
249
; VFCESBS, storing to %ptr if any elements are equal (CC <= 2).  The vector
; member of the pair is also returned, so the compare is expected to write
; %v24 directly and to be followed immediately by a conditional return.
define <4 x i32> @test_vfcesbs_any_store(<4 x float> %a, <4 x float> %b, i32* %ptr) {
; CHECK-LABEL: test_vfcesbs_any_store:
; CHECK-NOT: %r
; CHECK: vfcesbs %v24, %v24, %v26
; CHECK-NEXT: {{bor|bnler}} %r14
; CHECK: mvhi 0(%r2), 0
; CHECK: br %r14
  %pair = call { <4 x i32>, i32 } @llvm.s390.vfcesbs(<4 x float> %a, <4 x float> %b)
  %mask = extractvalue { <4 x i32>, i32 } %pair, 0
  %status = extractvalue { <4 x i32>, i32 } %pair, 1
  %any = icmp ule i32 %status, 2
  br i1 %any, label %store, label %exit

store:
  store i32 0, i32* %ptr
  br label %exit

exit:
  ret <4 x i32> %mask
}
273
; VFCHSBS with no processing of the result: the i32 member of the returned
; pair is handed back as-is, which should be done with IPM plus a shift.
define i32 @test_vfchsbs(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: test_vfchsbs:
; CHECK: vfchsbs {{%v[0-9]+}}, %v24, %v26
; CHECK: ipm %r2
; CHECK: srl %r2, 28
; CHECK: br %r14
  %pair = call { <4 x i32>, i32 } @llvm.s390.vfchsbs(<4 x float> %a, <4 x float> %b)
  %cc = extractvalue { <4 x i32>, i32 } %pair, 1
  ret i32 %cc
}
286
; VFCHSBS, returning 1 if not all elements are higher (CC >= 1, written as a
; signed comparison here).
define i32 @test_vfchsbs_notall_bool(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: test_vfchsbs_notall_bool:
; CHECK: vfchsbs {{%v[0-9]+}}, %v24, %v26
; CHECK: lhi %r2, 0
; CHECK: lochinhe %r2, 1
; CHECK: br %r14
  %pair = call { <4 x i32>, i32 } @llvm.s390.vfchsbs(<4 x float> %a, <4 x float> %b)
  %cc = extractvalue { <4 x i32>, i32 } %pair, 1
  %notall = icmp sge i32 %cc, 1
  %bool = zext i1 %notall to i32
  ret i32 %bool
}
301
; VFCHSBS, storing to %ptr if not all elements are higher (CC > 0).  The
; vector member of the pair is also returned, so the compare is expected to
; write %v24 directly and to be followed immediately by a conditional return.
define <4 x i32> @test_vfchsbs_notall_store(<4 x float> %a, <4 x float> %b, i32* %ptr) {
; CHECK-LABEL: test_vfchsbs_notall_store:
; CHECK-NOT: %r
; CHECK: vfchsbs %v24, %v24, %v26
; CHECK-NEXT: {{bher|ber}} %r14
; CHECK: mvhi 0(%r2), 0
; CHECK: br %r14
  %pair = call { <4 x i32>, i32 } @llvm.s390.vfchsbs(<4 x float> %a, <4 x float> %b)
  %mask = extractvalue { <4 x i32>, i32 } %pair, 0
  %status = extractvalue { <4 x i32>, i32 } %pair, 1
  %notall = icmp ugt i32 %status, 0
  br i1 %notall, label %store, label %exit

store:
  store i32 0, i32* %ptr
  br label %exit

exit:
  ret <4 x i32> %mask
}
325
; VFCHESBS with no processing of the result: the i32 member of the returned
; pair is handed back as-is, which should be done with IPM plus a shift.
define i32 @test_vfchesbs(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: test_vfchesbs:
; CHECK: vfchesbs {{%v[0-9]+}}, %v24, %v26
; CHECK: ipm %r2
; CHECK: srl %r2, 28
; CHECK: br %r14
  %pair = call { <4 x i32>, i32 } @llvm.s390.vfchesbs(<4 x float> %a, <4 x float> %b)
  %cc = extractvalue { <4 x i32>, i32 } %pair, 1
  ret i32 %cc
}
338
; VFCHESBS, returning 1 if no element is higher or equal (CC == 3).
define i32 @test_vfchesbs_none_bool(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: test_vfchesbs_none_bool:
; CHECK: vfchesbs {{%v[0-9]+}}, %v24, %v26
; CHECK: lhi %r2, 0
; CHECK: lochio %r2, 1
; CHECK: br %r14
  %pair = call { <4 x i32>, i32 } @llvm.s390.vfchesbs(<4 x float> %a, <4 x float> %b)
  %cc = extractvalue { <4 x i32>, i32 } %pair, 1
  %none = icmp eq i32 %cc, 3
  %bool = zext i1 %none to i32
  ret i32 %bool
}
353
; VFCHESBS, storing to %ptr if no element is higher or equal (CC >= 3).  The
; vector member of the pair is also returned, so the compare is expected to
; write %v24 directly and to be followed immediately by a conditional return.
define <4 x i32> @test_vfchesbs_none_store(<4 x float> %a, <4 x float> %b, i32* %ptr) {
; CHECK-LABEL: test_vfchesbs_none_store:
; CHECK-NOT: %r
; CHECK: vfchesbs %v24, %v24, %v26
; CHECK-NEXT: {{bnor|bler}} %r14
; CHECK: mvhi 0(%r2), 0
; CHECK: br %r14
  %pair = call { <4 x i32>, i32 } @llvm.s390.vfchesbs(<4 x float> %a, <4 x float> %b)
  %mask = extractvalue { <4 x i32>, i32 } %pair, 0
  %status = extractvalue { <4 x i32>, i32 } %pair, 1
  %none = icmp uge i32 %status, 3
  br i1 %none, label %store, label %exit

store:
  store i32 0, i32* %ptr
  br label %exit

exit:
  ret <4 x i32> %mask
}
377
; VFTCISB with the lowest useful class selector (1) and no processing of the
; result: the i32 member of the returned pair is handed back as-is.
define i32 @test_vftcisb(<4 x float> %a) {
; CHECK-LABEL: test_vftcisb:
; CHECK: vftcisb {{%v[0-9]+}}, %v24, 1
; CHECK: ipm %r2
; CHECK: srl %r2, 28
; CHECK: br %r14
  %pair = call { <4 x i32>, i32 } @llvm.s390.vftcisb(<4 x float> %a, i32 1)
  %cc = extractvalue { <4 x i32>, i32 } %pair, 1
  ret i32 %cc
}
389
; VFTCISB with the highest useful class selector (4094), returning 1 if all
; elements have the right class (CC == 0).
define i32 @test_vftcisb_all_bool(<4 x float> %a) {
; CHECK-LABEL: test_vftcisb_all_bool:
; CHECK: vftcisb {{%v[0-9]+}}, %v24, 4094
; CHECK: lhi %r2, 0
; CHECK: lochie %r2, 1
; CHECK: br %r14
  %pair = call { <4 x i32>, i32 } @llvm.s390.vftcisb(<4 x float> %a, i32 4094)
  %cc = extractvalue { <4 x i32>, i32 } %pair, 1
  %all = icmp eq i32 %cc, 0
  %bool = zext i1 %all to i32
  ret i32 %bool
}
404
; VFISB with a rounding mode not usable via standard intrinsics: the two
; immediates (0, 4) must be passed through to the instruction unchanged.
define <4 x float> @test_vfisb_0_4(<4 x float> %a) {
; CHECK-LABEL: test_vfisb_0_4:
; CHECK: vfisb %v24, %v24, 0, 4
; CHECK: br %r14
  %rounded = call <4 x float> @llvm.s390.vfisb(<4 x float> %a, i32 0, i32 4)
  ret <4 x float> %rounded
}
413
; VFISB with IEEE-inexact exception suppressed: the two immediates (4, 0)
; must be passed through to the instruction unchanged.
define <4 x float> @test_vfisb_4_0(<4 x float> %a) {
; CHECK-LABEL: test_vfisb_4_0:
; CHECK: vfisb %v24, %v24, 4, 0
; CHECK: br %r14
  %rounded = call <4 x float> @llvm.s390.vfisb(<4 x float> %a, i32 4, i32 0)
  ret <4 x float> %rounded
}
422
; VFMAXDB: the i32 operand (4) should appear as the instruction's final
; immediate.
define <2 x double> @test_vfmaxdb(<2 x double> %a, <2 x double> %b) {
; CHECK-LABEL: test_vfmaxdb:
; CHECK: vfmaxdb %v24, %v24, %v26, 4
; CHECK: br %r14
  %max = call <2 x double> @llvm.s390.vfmaxdb(<2 x double> %a,
                                              <2 x double> %b, i32 4)
  ret <2 x double> %max
}
431
; VFMINDB: the i32 operand (4) should appear as the instruction's final
; immediate.
define <2 x double> @test_vfmindb(<2 x double> %a, <2 x double> %b) {
; CHECK-LABEL: test_vfmindb:
; CHECK: vfmindb %v24, %v24, %v26, 4
; CHECK: br %r14
  %min = call <2 x double> @llvm.s390.vfmindb(<2 x double> %a,
                                              <2 x double> %b, i32 4)
  ret <2 x double> %min
}
440
; VFMAXSB: the i32 operand (4) should appear as the instruction's final
; immediate.
define <4 x float> @test_vfmaxsb(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: test_vfmaxsb:
; CHECK: vfmaxsb %v24, %v24, %v26, 4
; CHECK: br %r14
  %max = call <4 x float> @llvm.s390.vfmaxsb(<4 x float> %a,
                                             <4 x float> %b, i32 4)
  ret <4 x float> %max
}
449
; VFMINSB: the i32 operand (4) should appear as the instruction's final
; immediate.
define <4 x float> @test_vfminsb(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: test_vfminsb:
; CHECK: vfminsb %v24, %v24, %v26, 4
; CHECK: br %r14
  %min = call <4 x float> @llvm.s390.vfminsb(<4 x float> %a,
                                             <4 x float> %b, i32 4)
  ret <4 x float> %min
}
458
459