1; RUN: opt -mtriple=thumbv8.1m.main-none-none-eabi -hardware-loops %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MAIN
2; RUN: opt -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+fullfp16 -hardware-loops %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP
3; RUN: opt -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+fp-armv8,+fullfp16 -hardware-loops %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP64
4; RUN: opt -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -hardware-loops %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MVE
5; RUN: opt -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -hardware-loops %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MVEFP
6; RUN: opt -mtriple=thumbv8.1m.main-none-none-eabi -hardware-loops -disable-arm-loloops=true %s -S -o - | FileCheck %s --check-prefix=DISABLED
7
8; DISABLED-NOT: call i32 @llvm.loop.decrement
9
; A counted while-loop whose body makes a real call to the external function
; @bar. The directives below require that no hardware-loop intrinsics are
; emitted for this function.
; CHECK-LABEL: skip_call
; CHECK-NOT: call i32 @llvm.start.loop.iterations
; CHECK-NOT: call i32 @llvm.loop.decrement

define i32 @skip_call(i32 %n) {
entry:
  ; Guard: skip the loop entirely when the trip count is zero.
  %n.is.zero = icmp eq i32 %n, 0
  br i1 %n.is.zero, label %while.end, label %while.body.preheader

while.body.preheader:
  br label %while.body

while.body:
  %iv = phi i32 [ %iv.next, %while.body ], [ 0, %while.body.preheader ]
  %sum = phi i32 [ %sum.next, %while.body ], [ 0, %while.body.preheader ]
  ; The call to @bar is the instruction of interest in this loop body.
  %ret = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() #2
  %sum.next = add nsw i32 %ret, %sum
  %iv.next = add nuw i32 %iv, 1
  %done = icmp eq i32 %iv.next, %n
  br i1 %done, label %while.end.loopexit, label %while.body

while.end.loopexit:
  br label %while.end

while.end:
  %sum.lcssa = phi i32 [ 0, %entry ], [ %sum.next, %while.end.loopexit ]
  ret i32 %sum.lcssa
}
38
; Accumulates over two i32 arrays with the Arm-specific @llvm.arm.smlad
; intrinsic. The induction variable steps by 2 from 0 until it reaches 100,
; and the directives below expect a hardware loop set up with 50 iterations.
; CHECK-LABEL: test_target_specific
; CHECK: [[X:%[^ ]+]] = call i32 @llvm.start.loop.iterations.i32(i32 50)
; CHECK: [[COUNT:%[^ ]+]] = phi i32 [ [[X]], %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[COUNT]], i32 1)
; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
; CHECK: br i1 [[CMP]], label %loop, label %exit

define i32 @test_target_specific(i32* %a, i32* %b) {
entry:
  br label %loop
loop:
  %sum = phi i32 [ 0, %entry ], [ %mac, %loop ]
  %idx = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
  %ptr.a = getelementptr i32, i32* %a, i32 %idx
  %ptr.b = getelementptr i32, i32* %b, i32 %idx
  %val.a = load i32, i32* %ptr.a
  %val.b = load i32, i32* %ptr.b
  %mac = call i32 @llvm.arm.smlad(i32 %val.a, i32 %val.b, i32 %sum)
  ; Step of 2 against a bound of 100.
  %idx.next = add nuw i32 %idx, 2
  %not.done = icmp ne i32 %idx.next, 100
  br i1 %not.done, label %loop, label %exit
exit:
  ret i32 %mac
}
63
; Element-wise half-precision fabs from %a into %b over 100 iterations.
; Per the directives below, a hardware loop is expected for the +fullfp16 and
; +mve.fp runs, and must not appear for the base and +mve runs.
; CHECK-LABEL: test_fabs_f16
; CHECK-MAIN-NOT: call i32 @llvm.start.loop.iterations
; CHECK-MVE-NOT:  call i32 @llvm.start.loop.iterations
; CHECK-FP:       call i32 @llvm.start.loop.iterations.i32(i32 100)
; CHECK-MVEFP:    call i32 @llvm.start.loop.iterations.i32(i32 100)
define void @test_fabs_f16(half* %a, half* %b) {
entry:
  br label %loop
loop:
  %idx = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
  %src = getelementptr half, half* %a, i32 %idx
  %val = load half, half* %src
  %mag = call half @llvm.fabs.f16(half %val)
  %dst = getelementptr half, half* %b, i32 %idx
  store half %mag, half* %dst
  %idx.next = add nuw i32 %idx, 1
  %not.done = icmp ne i32 %idx.next, 100
  br i1 %not.done, label %loop, label %exit
exit:
  ret void
}
85
; Sums the single-precision fabs of 100 elements loaded from %a.
; Per the directives below, a hardware loop is expected for the +fullfp16 and
; +mve.fp runs, and must not appear for the base and +mve runs.
; CHECK-LABEL: test_fabs
; CHECK-MAIN-NOT: call i32 @llvm.start.loop.iterations
; CHECK-MVE-NOT:  call i32 @llvm.start.loop.iterations
; CHECK-FP:       call i32 @llvm.start.loop.iterations.i32(i32 100)
; CHECK-MVEFP:    call i32 @llvm.start.loop.iterations.i32(i32 100)

define float @test_fabs(float* %a) {
entry:
  br label %loop
loop:
  %sum = phi float [ 0.0, %entry ], [ %sum.next, %loop ]
  %idx = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
  %src = getelementptr float, float* %a, i32 %idx
  %val = load float, float* %src
  %mag = call float @llvm.fabs.f32(float %val)
  %sum.next = fadd float %mag, %sum
  %idx.next = add nuw i32 %idx, 1
  %not.done = icmp ne i32 %idx.next, 100
  br i1 %not.done, label %loop, label %exit
exit:
  ret float %sum.next
}
108
; Element-wise double-precision fabs from %a into %b over 100 iterations.
; Per the directives below, only the +fp-armv8,+fullfp16 run is expected to
; form a hardware loop; all other feature sets must not.
; CHECK-LABEL: test_fabs_64
; CHECK-MAIN-NOT:   call i32 @llvm.start.loop.iterations
; CHECK-MVE-NOT:    call i32 @llvm.start.loop.iterations
; CHECK-FP-NOT:     call i32 @llvm.start.loop.iterations.i32(i32 100)
; CHECK-FP64:       call i32 @llvm.start.loop.iterations.i32(i32 100)
; CHECK-MVEFP-NOT:  call i32 @llvm.start.loop.iterations.i32(i32 100)
define void @test_fabs_64(double* %a, double* %b) {
entry:
  br label %loop
loop:
  %idx = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
  %src = getelementptr double, double* %a, i32 %idx
  %val = load double, double* %src
  %mag = call double @llvm.fabs.f64(double %val)
  %dst = getelementptr double, double* %b, i32 %idx
  store double %mag, double* %dst
  %idx.next = add nuw i32 %idx, 1
  %not.done = icmp ne i32 %idx.next, 100
  br i1 %not.done, label %loop, label %exit
exit:
  ret void
}
131
; Sums the <4 x float> fabs of 100 vectors loaded from %a.
; Per the directives below, the +mve.fp run must produce the full hardware-loop
; sequence (start, counter phi, decrement, compare, branch), while the +mve run
; must not start a hardware loop.
; CHECK-LABEL: test_fabs_vec
; CHECK-MVE-NOT: call i32 @llvm.start.loop.iterations
; CHECK-MVEFP: [[X:%[^ ]+]] = call i32 @llvm.start.loop.iterations.i32(i32 100)
; CHECK-MVEFP: [[COUNT:%[^ ]+]] = phi i32 [ [[X]], %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
; CHECK-MVEFP: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[COUNT]], i32 1)
; CHECK-MVEFP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
; CHECK-MVEFP: br i1 [[CMP]], label %loop, label %exit
define <4 x float> @test_fabs_vec(<4 x float>* %a) {
entry:
  br label %loop
loop:
  %sum = phi <4 x float> [ zeroinitializer, %entry ], [ %sum.next, %loop ]
  %idx = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
  %src = getelementptr <4 x float>, <4 x float>* %a, i32 %idx
  %val = load <4 x float>, <4 x float>* %src
  %mag = call <4 x float> @llvm.fabs.v4f32(<4 x float> %val)
  %sum.next = fadd <4 x float> %mag, %sum
  %idx.next = add nuw i32 %idx, 1
  %not.done = icmp ne i32 %idx.next, 100
  br i1 %not.done, label %loop, label %exit
exit:
  ret <4 x float> %sum.next
}
155
; Sums @llvm.log.f32 of 100 floats loaded from %a. The directives below
; require that no hardware-loop intrinsics appear for any feature set using
; the common prefix.
; CHECK-LABEL: test_log
; CHECK-NOT: call i32 @llvm.start.loop.iterations
; CHECK-NOT: llvm.loop.decrement
define float @test_log(float* %a) {
entry:
  br label %loop
loop:
  %sum = phi float [ 0.0, %entry ], [ %sum.next, %loop ]
  %idx = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
  %src = getelementptr float, float* %a, i32 %idx
  %val = load float, float* %src
  %ln = call float @llvm.log.f32(float %val)
  %sum.next = fadd float %ln, %sum
  %idx.next = add nuw i32 %idx, 1
  %not.done = icmp ne i32 %idx.next, 100
  br i1 %not.done, label %loop, label %exit
exit:
  ret float %sum.next
}
175
; Element-wise half-precision sqrt from %a into %b over 100 iterations.
; Per the directives below, a hardware loop is expected for the +fullfp16,
; +mve.fp and +fp-armv8,+fullfp16 runs, and must not appear for the base and
; +mve runs.
; CHECK-LABEL: test_sqrt_16
; CHECK-MAIN-NOT: call i32 @llvm.start.loop.iterations
; CHECK-MVE-NOT:  call i32 @llvm.start.loop.iterations
; CHECK-FP:       call i32 @llvm.start.loop.iterations.i32(i32 100)
; CHECK-MVEFP:    call i32 @llvm.start.loop.iterations.i32(i32 100)
; CHECK-FP64:     call i32 @llvm.start.loop.iterations.i32(i32 100)
define void @test_sqrt_16(half* %a, half* %b) {
entry:
  br label %loop
loop:
  %idx = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
  %src = getelementptr half, half* %a, i32 %idx
  %val = load half, half* %src
  %root = call half @llvm.sqrt.f16(half %val)
  %dst = getelementptr half, half* %b, i32 %idx
  store half %root, half* %dst
  %idx.next = add nuw i32 %idx, 1
  %not.done = icmp ne i32 %idx.next, 100
  br i1 %not.done, label %loop, label %exit
exit:
  ret void
}
; Element-wise single-precision sqrt from %a into %b over 100 iterations.
; Per the directives below, the +fullfp16 run must start a hardware loop, the
; +mve.fp run must produce the full hardware-loop sequence, and the base and
; +mve runs must not start one.
; CHECK-LABEL: test_sqrt
; CHECK-MAIN-NOT: call i32 @llvm.start.loop.iterations
; CHECK-MVE-NOT: call i32 @llvm.start.loop.iterations
; CHECK-FP: call i32 @llvm.start.loop.iterations
; CHECK-MVEFP: [[X:%[^ ]+]] = call i32 @llvm.start.loop.iterations.i32(i32 100)
; CHECK-MVEFP: [[COUNT:%[^ ]+]] = phi i32 [ [[X]], %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
; CHECK-MVEFP: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[COUNT]], i32 1)
; CHECK-MVEFP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
; CHECK-MVEFP: br i1 [[CMP]], label %loop, label %exit
define void @test_sqrt(float* %a, float* %b) {
entry:
  br label %loop
loop:
  %idx = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
  %src = getelementptr float, float* %a, i32 %idx
  %val = load float, float* %src
  %root = call float @llvm.sqrt.f32(float %val)
  %dst = getelementptr float, float* %b, i32 %idx
  store float %root, float* %dst
  %idx.next = add nuw i32 %idx, 1
  %not.done = icmp ne i32 %idx.next, 100
  br i1 %not.done, label %loop, label %exit
exit:
  ret void
}
223
; Element-wise double-precision sqrt from %a into %b over 100 iterations.
; Per the directives below, only the +fp-armv8,+fullfp16 run is expected to
; form a hardware loop; all other feature sets must not.
; CHECK-LABEL: test_sqrt_64
; CHECK-MAIN-NOT:   call i32 @llvm.start.loop.iterations
; CHECK-MVE-NOT:    call i32 @llvm.start.loop.iterations
; CHECK-FP-NOT:     call i32 @llvm.start.loop.iterations.i32(i32 100)
; CHECK-MVEFP-NOT:  call i32 @llvm.start.loop.iterations.i32(i32 100)
; CHECK-FP64:       call i32 @llvm.start.loop.iterations.i32(i32 100)
define void @test_sqrt_64(double* %a, double* %b) {
entry:
  br label %loop
loop:
  %idx = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
  %src = getelementptr double, double* %a, i32 %idx
  %val = load double, double* %src
  %root = call double @llvm.sqrt.f64(double %val)
  %dst = getelementptr double, double* %b, i32 %idx
  store double %root, double* %dst
  %idx.next = add nuw i32 %idx, 1
  %not.done = icmp ne i32 %idx.next, 100
  br i1 %not.done, label %loop, label %exit
exit:
  ret void
}
246
; Element-wise <4 x float> sqrt from %a into %b over 100 iterations.
; Per the directives below, a hardware loop is expected for the +fullfp16 and
; +mve.fp runs, and must not appear for the base and +mve runs.
; CHECK-LABEL: test_sqrt_vec
; CHECK-MAIN-NOT: call i32 @llvm.start.loop.iterations
; CHECK-MVE-NOT:  call i32 @llvm.start.loop.iterations
; CHECK-FP:       call i32 @llvm.start.loop.iterations.i32(i32 100)
; CHECK-MVEFP:    call i32 @llvm.start.loop.iterations.i32(i32 100)
define void @test_sqrt_vec(<4 x float>* %a, <4 x float>* %b) {
entry:
  br label %loop
loop:
  %idx = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
  %src = getelementptr <4 x float>, <4 x float>* %a, i32 %idx
  %val = load <4 x float>, <4 x float>* %src
  %root = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %val)
  %dst = getelementptr <4 x float>, <4 x float>* %b, i32 %idx
  store <4 x float> %root, <4 x float>* %dst
  %idx.next = add nuw i32 %idx, 1
  %not.done = icmp ne i32 %idx.next, 100
  br i1 %not.done, label %loop, label %exit
exit:
  ret i32 0
}
268
; Loop over two i32 arrays using @llvm.sadd.with.overflow.i32; only the sum
; field of the result is extracted. The directive below expects a hardware
; loop to be started for every run using the common prefix.
; CHECK-LABEL: test_overflow
; CHECK: call i32 @llvm.start.loop.iterations
define i32 @test_overflow(i32* %a, i32* %b) {
entry:
  br label %loop
loop:
  ; This phi carries the previous sum but has no other user in the body.
  %prev = phi i32 [ 0, %entry ], [ %sum, %loop ]
  %idx = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
  %ptr.a = getelementptr i32, i32* %a, i32 %idx
  %ptr.b = getelementptr i32, i32* %b, i32 %idx
  %val.a = load i32, i32* %ptr.a
  %val.b = load i32, i32* %ptr.b
  %pair = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %val.a, i32 %val.b)
  %sum = extractvalue {i32, i1} %pair, 0
  %idx.next = add nuw i32 %idx, 1
  %not.done = icmp ne i32 %idx.next, 100
  br i1 %not.done, label %loop, label %exit
exit:
  ret i32 %sum
}
289
; Loop over two i32 arrays using the saturating add @llvm.sadd.sat.i32. The
; directive below expects a hardware loop with 100 iterations.
; TODO: We should be able to generate a qadd/sub
; CHECK-LABEL: test_sat
; CHECK: call i32 @llvm.start.loop.iterations.i32(i32 100)
define i32 @test_sat(i32* %a, i32* %b) {
entry:
  br label %loop
loop:
  ; This phi carries the previous result but has no other user in the body.
  %prev = phi i32 [ 0, %entry ], [ %sat, %loop ]
  %idx = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
  %ptr.a = getelementptr i32, i32* %a, i32 %idx
  %ptr.b = getelementptr i32, i32* %b, i32 %idx
  %val.a = load i32, i32* %ptr.a
  %val.b = load i32, i32* %ptr.b
  %sat = call i32 @llvm.sadd.sat.i32(i32 %val.a, i32 %val.b)
  %idx.next = add nuw i32 %idx, 1
  %not.done = icmp ne i32 %idx.next, 100
  br i1 %not.done, label %loop, label %exit
exit:
  ret i32 %sat
}
310
; Masked load / add / masked store of <4 x i32> vectors over 100 iterations.
; Per the directives below, the +mve run must produce the full hardware-loop
; sequence and the +mve.fp run must start a hardware loop; the runs without
; MVE have no positive directive and must not start one.
; CHECK-LABEL: test_masked_i32
; CHECK-NOT: call i32 @llvm.start.loop.iterations
; CHECK-MVEFP: call i32 @llvm.start.loop.iterations
; CHECK-MVE: [[X:%[^ ]+]] = call i32 @llvm.start.loop.iterations.i32(i32 100)
; CHECK-MVE: [[COUNT:%[^ ]+]] = phi i32 [ [[X]], %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
; CHECK-MVE: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[COUNT]], i32 1)
; CHECK-MVE: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
; CHECK-MVE: br i1 [[CMP]], label %loop, label %exit
define arm_aapcs_vfpcc void @test_masked_i32(<4 x i1> %mask, <4 x i32>* %a, <4 x i32>* %b, <4 x i32>* %c, <4 x i32> %passthru) {
entry:
  br label %loop
loop:
  %idx = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
  %ptr.a = getelementptr <4 x i32>, <4 x i32>* %a, i32 %idx
  %ptr.b = getelementptr <4 x i32>, <4 x i32>* %b, i32 %idx
  %ptr.c = getelementptr <4 x i32>, <4 x i32>* %c, i32 %idx
  %vec.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %ptr.a, i32 4, <4 x i1> %mask, <4 x i32> %passthru)
  %vec.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %ptr.b, i32 4, <4 x i1> %mask, <4 x i32> %passthru)
  %sum = add <4 x i32> %vec.a, %vec.b
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %sum, <4 x i32>* %ptr.c, i32 4, <4 x i1> %mask)
  %idx.next = add nuw i32 %idx, 1
  %not.done = icmp ne i32 %idx.next, 100
  br i1 %not.done, label %loop, label %exit
exit:
  ret void
}
337
; Masked load / fadd / masked store of <4 x float> vectors over 100
; iterations. Per the directives below, the +mve run must produce the full
; hardware-loop sequence and the +mve.fp run must start a hardware loop; the
; runs without MVE have no positive directive and must not start one.
; CHECK-LABEL: test_masked_f32
; CHECK-NOT: call i32 @llvm.start.loop.iterations
; CHECK-MVEFP: call i32 @llvm.start.loop.iterations
; CHECK-MVE: [[X:%[^ ]+]] = call i32 @llvm.start.loop.iterations.i32(i32 100)
; CHECK-MVE: [[COUNT:%[^ ]+]] = phi i32 [ [[X]], %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
; CHECK-MVE: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[COUNT]], i32 1)
; CHECK-MVE: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
; CHECK-MVE: br i1 [[CMP]], label %loop, label %exit
define arm_aapcs_vfpcc void @test_masked_f32(<4 x i1> %mask, <4 x float>* %a, <4 x float>* %b, <4 x float>* %c, <4 x float> %passthru) {
entry:
  br label %loop
loop:
  %idx = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
  %ptr.a = getelementptr <4 x float>, <4 x float>* %a, i32 %idx
  %ptr.b = getelementptr <4 x float>, <4 x float>* %b, i32 %idx
  %ptr.c = getelementptr <4 x float>, <4 x float>* %c, i32 %idx
  %vec.a = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %ptr.a, i32 4, <4 x i1> %mask, <4 x float> %passthru)
  %vec.b = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %ptr.b, i32 4, <4 x i1> %mask, <4 x float> %passthru)
  %sum = fadd <4 x float> %vec.a, %vec.b
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %sum, <4 x float>* %ptr.c, i32 4, <4 x i1> %mask)
  %idx.next = add nuw i32 %idx, 1
  %not.done = icmp ne i32 %idx.next, 100
  br i1 %not.done, label %loop, label %exit
exit:
  ret void
}
364
; Masked gather / fadd / masked scatter through loop-invariant vectors of
; pointers, repeated for 100 iterations. Per the directives below, the +mve
; run must produce the full hardware-loop sequence and the +mve.fp run must
; start a hardware loop; the runs without MVE have no positive directive and
; must not start one.
; CHECK-LABEL: test_gather_scatter
; CHECK-NOT: call i32 @llvm.start.loop.iterations
; CHECK-MVEFP: call i32 @llvm.start.loop.iterations
; CHECK-MVE: [[X:%[^ ]+]] = call i32 @llvm.start.loop.iterations.i32(i32 100)
; CHECK-MVE: [[COUNT:%[^ ]+]] = phi i32 [ [[X]], %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
; CHECK-MVE: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[COUNT]], i32 1)
; CHECK-MVE: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
; CHECK-MVE: br i1 [[CMP]], label %loop, label %exit
define arm_aapcs_vfpcc void @test_gather_scatter(<4 x i1> %mask, <4 x float*> %a, <4 x float*> %b, <4 x float*> %c, <4 x float> %passthru) {
entry:
  br label %loop
loop:
  %idx = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
  %vec.a = call <4 x float> @llvm.masked.gather.v4f32.p0v4f32(<4 x float*> %a, i32 4, <4 x i1> %mask, <4 x float> %passthru)
  %vec.b = call <4 x float> @llvm.masked.gather.v4f32.p0v4f32(<4 x float*> %b, i32 4, <4 x i1> %mask, <4 x float> %passthru)
  %sum = fadd <4 x float> %vec.a, %vec.b
  call void @llvm.masked.scatter.v4f32.p0v4f32(<4 x float> %sum, <4 x float*> %c, i32 4, <4 x i1> %mask)
  %idx.next = add nuw i32 %idx, 1
  %not.done = icmp ne i32 %idx.next, 100
  br i1 %not.done, label %loop, label %exit
exit:
  ret void
}
388
; External, non-intrinsic callee used by @skip_call.
declare i32 @bar(...) local_unnamed_addr #1

; Arm-specific intrinsic.
declare i32 @llvm.arm.smlad(i32, i32, i32)

; Integer arithmetic intrinsics.
declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
declare i32 @llvm.sadd.sat.i32(i32, i32)

; Floating-point math intrinsics, scalar then vector.
declare half @llvm.fabs.f16(half)
declare float @llvm.fabs.f32(float)
declare double @llvm.fabs.f64(double)
declare half @llvm.sqrt.f16(half)
declare float @llvm.sqrt.f32(float)
declare double @llvm.sqrt.f64(double)
declare float @llvm.log.f32(float)
declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)

; Masked memory intrinsics.
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)
declare <4 x float> @llvm.masked.gather.v4f32.p0v4f32(<4 x float*>, i32, <4 x i1>, <4 x float>)
declare void @llvm.masked.scatter.v4f32.p0v4f32(<4 x float>, <4 x float*>, i32, <4 x i1>)
408