1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs -tail-predication=enabled -o - %s | FileCheck %s
3
; test_memcpy: loop that executes %n times (skipped when %n <= 0). Each
; iteration performs an llvm.memcpy of %m bytes between 4-byte-aligned
; pointers, then advances both %x and %y by %m i32 elements.
; The length is a runtime value, and the assertions below expect the copy to
; remain a `bl __aeabi_memcpy4` library call inside a plain subs/bne loop.
4define void @test_memcpy(i32* nocapture %x, i32* nocapture readonly %y, i32 %n, i32 %m) {
5; CHECK-LABEL: test_memcpy:
6; CHECK:       @ %bb.0: @ %entry
7; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, lr}
8; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, lr}
9; CHECK-NEXT:    .pad #4
10; CHECK-NEXT:    sub sp, #4
11; CHECK-NEXT:    cmp r2, #1
12; CHECK-NEXT:    blt .LBB0_3
13; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
14; CHECK-NEXT:    mov r8, r3
15; CHECK-NEXT:    mov r5, r2
16; CHECK-NEXT:    mov r9, r1
17; CHECK-NEXT:    mov r7, r0
18; CHECK-NEXT:    lsls r4, r3, #2
19; CHECK-NEXT:    movs r6, #0
20; CHECK-NEXT:  .LBB0_2: @ %for.body
21; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
22; CHECK-NEXT:    adds r0, r7, r6
23; CHECK-NEXT:    add.w r1, r9, r6
24; CHECK-NEXT:    mov r2, r8
25; CHECK-NEXT:    bl __aeabi_memcpy4
26; CHECK-NEXT:    add r6, r4
27; CHECK-NEXT:    subs r5, #1
28; CHECK-NEXT:    bne .LBB0_2
29; CHECK-NEXT:  .LBB0_3: @ %for.cond.cleanup
30; CHECK-NEXT:    add sp, #4
31; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
32entry:
33  %cmp8 = icmp sgt i32 %n, 0
34  br i1 %cmp8, label %for.body, label %for.cond.cleanup
35
36for.cond.cleanup:                                 ; preds = %for.body, %entry
37  ret void
38
39for.body:                                         ; preds = %entry, %for.body
40  %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
41  %x.addr.010 = phi i32* [ %add.ptr, %for.body ], [ %x, %entry ]
42  %y.addr.09 = phi i32* [ %add.ptr1, %for.body ], [ %y, %entry ]
43  %0 = bitcast i32* %x.addr.010 to i8*
44  %1 = bitcast i32* %y.addr.09 to i8*
  ; Copy %m bytes. The pointer step below is %m i32 elements (4 * %m bytes),
  ; which is the `lsls r4, r3, #2` stride in the expected output.
45  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %0, i8* align 4 %1, i32 %m, i1 false)
46  %add.ptr = getelementptr inbounds i32, i32* %x.addr.010, i32 %m
47  %add.ptr1 = getelementptr inbounds i32, i32* %y.addr.09, i32 %m
48  %inc = add nuw nsw i32 %i.011, 1
49  %exitcond.not = icmp eq i32 %inc, %n
50  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
51}
52
; test_memset: loop that executes %n times (skipped when %n <= 0). Each
; iteration performs an llvm.memset of %m zero bytes at a 4-byte-aligned
; pointer, then advances %x by %m i32 elements.
; The assertions below expect the zero fill to be emitted as a
; `bl __aeabi_memclr4` call (pointer in r0, length in r1) inside a subs/bne
; loop.
53define void @test_memset(i32* nocapture %x, i32 %n, i32 %m) {
54; CHECK-LABEL: test_memset:
55; CHECK:       @ %bb.0: @ %entry
56; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
57; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
58; CHECK-NEXT:    .pad #4
59; CHECK-NEXT:    sub sp, #4
60; CHECK-NEXT:    cmp r1, #1
61; CHECK-NEXT:    blt .LBB1_3
62; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
63; CHECK-NEXT:    mov r4, r2
64; CHECK-NEXT:    mov r5, r1
65; CHECK-NEXT:    mov r6, r0
66; CHECK-NEXT:    lsls r7, r2, #2
67; CHECK-NEXT:  .LBB1_2: @ %for.body
68; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
69; CHECK-NEXT:    mov r0, r6
70; CHECK-NEXT:    mov r1, r4
71; CHECK-NEXT:    bl __aeabi_memclr4
72; CHECK-NEXT:    add r6, r7
73; CHECK-NEXT:    subs r5, #1
74; CHECK-NEXT:    bne .LBB1_2
75; CHECK-NEXT:  .LBB1_3: @ %for.cond.cleanup
76; CHECK-NEXT:    add sp, #4
77; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
78entry:
79  %cmp5 = icmp sgt i32 %n, 0
80  br i1 %cmp5, label %for.body, label %for.cond.cleanup
81
82for.cond.cleanup:                                 ; preds = %for.body, %entry
83  ret void
84
85for.body:                                         ; preds = %entry, %for.body
86  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
87  %x.addr.06 = phi i32* [ %add.ptr, %for.body ], [ %x, %entry ]
88  %0 = bitcast i32* %x.addr.06 to i8*
  ; Zero %m bytes. The pointer step below is %m i32 elements (4 * %m bytes),
  ; which is the `lsls r7, r2, #2` stride in the expected output.
89  tail call void @llvm.memset.p0i8.i32(i8* align 4 %0, i8 0, i32 %m, i1 false)
90  %add.ptr = getelementptr inbounds i32, i32* %x.addr.06, i32 %m
91  %inc = add nuw nsw i32 %i.07, 1
92  %exitcond.not = icmp eq i32 %inc, %n
93  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
94}
95
; test_memmove: loop that executes %n times (skipped when %n <= 0). Each
; iteration performs an llvm.memmove of %m bytes between 4-byte-aligned
; pointers, then advances both %x and %y by %m i32 elements.
; The assertions below expect the move to remain a `bl __aeabi_memmove4`
; library call inside a plain subs/bne loop.
96define void @test_memmove(i32* nocapture %x, i32* nocapture readonly %y, i32 %n, i32 %m) {
97; CHECK-LABEL: test_memmove:
98; CHECK:       @ %bb.0: @ %entry
99; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, lr}
100; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, lr}
101; CHECK-NEXT:    .pad #4
102; CHECK-NEXT:    sub sp, #4
103; CHECK-NEXT:    cmp r2, #1
104; CHECK-NEXT:    blt .LBB2_3
105; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
106; CHECK-NEXT:    mov r8, r3
107; CHECK-NEXT:    mov r5, r2
108; CHECK-NEXT:    mov r9, r1
109; CHECK-NEXT:    mov r7, r0
110; CHECK-NEXT:    lsls r4, r3, #2
111; CHECK-NEXT:    movs r6, #0
112; CHECK-NEXT:  .LBB2_2: @ %for.body
113; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
114; CHECK-NEXT:    adds r0, r7, r6
115; CHECK-NEXT:    add.w r1, r9, r6
116; CHECK-NEXT:    mov r2, r8
117; CHECK-NEXT:    bl __aeabi_memmove4
118; CHECK-NEXT:    add r6, r4
119; CHECK-NEXT:    subs r5, #1
120; CHECK-NEXT:    bne .LBB2_2
121; CHECK-NEXT:  .LBB2_3: @ %for.cond.cleanup
122; CHECK-NEXT:    add sp, #4
123; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
124entry:
125  %cmp8 = icmp sgt i32 %n, 0
126  br i1 %cmp8, label %for.body, label %for.cond.cleanup
127
128for.cond.cleanup:                                 ; preds = %for.body, %entry
129  ret void
130
131for.body:                                         ; preds = %entry, %for.body
132  %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
133  %x.addr.010 = phi i32* [ %add.ptr, %for.body ], [ %x, %entry ]
134  %y.addr.09 = phi i32* [ %add.ptr1, %for.body ], [ %y, %entry ]
135  %0 = bitcast i32* %x.addr.010 to i8*
136  %1 = bitcast i32* %y.addr.09 to i8*
  ; Move %m bytes. The pointer step below is %m i32 elements (4 * %m bytes),
  ; which is the `lsls r4, r3, #2` stride in the expected output.
137  tail call void @llvm.memmove.p0i8.p0i8.i32(i8* align 4 %0, i8* align 4 %1, i32 %m, i1 false)
138  %add.ptr = getelementptr inbounds i32, i32* %x.addr.010, i32 %m
139  %add.ptr1 = getelementptr inbounds i32, i32* %y.addr.09, i32 %m
140  %inc = add nuw nsw i32 %i.011, 1
141  %exitcond.not = icmp eq i32 %inc, %n
142  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
143}
144
145
; test_memcpy16: loop that executes %n times (skipped when %n <= 0). Each
; iteration performs an llvm.memcpy with a constant length of 16 bytes, then
; advances both pointers by 16 i32 elements (64 bytes).
; With a small constant length, the assertions below expect no library call:
; the copy is expanded inline to ldm/ldr + stm/str (four words), and the
; surrounding loop is emitted with `dls`/`le`.
146define void @test_memcpy16(i32* nocapture %x, i32* nocapture readonly %y, i32 %n) {
147; CHECK-LABEL: test_memcpy16:
148; CHECK:       @ %bb.0: @ %entry
149; CHECK-NEXT:    .save {r4, lr}
150; CHECK-NEXT:    push {r4, lr}
151; CHECK-NEXT:    cmp r2, #1
152; CHECK-NEXT:    it lt
153; CHECK-NEXT:    poplt {r4, pc}
154; CHECK-NEXT:  .LBB3_1: @ %for.body.preheader
155; CHECK-NEXT:    dls lr, r2
156; CHECK-NEXT:  .LBB3_2: @ %for.body
157; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
158; CHECK-NEXT:    ldm.w r1, {r2, r3, r12}
159; CHECK-NEXT:    ldr r4, [r1, #12]
160; CHECK-NEXT:    adds r1, #64
161; CHECK-NEXT:    stm.w r0, {r2, r3, r12}
162; CHECK-NEXT:    str r4, [r0, #12]
163; CHECK-NEXT:    adds r0, #64
164; CHECK-NEXT:    le lr, .LBB3_2
165; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
166; CHECK-NEXT:    pop {r4, pc}
167entry:
168  %cmp6 = icmp sgt i32 %n, 0
169  br i1 %cmp6, label %for.body, label %for.cond.cleanup
170
171for.cond.cleanup:                                 ; preds = %for.body, %entry
172  ret void
173
174for.body:                                         ; preds = %entry, %for.body
175  %i.09 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
176  %x.addr.08 = phi i32* [ %add.ptr, %for.body ], [ %x, %entry ]
177  %y.addr.07 = phi i32* [ %add.ptr1, %for.body ], [ %y, %entry ]
178  %0 = bitcast i32* %x.addr.08 to i8*
179  %1 = bitcast i32* %y.addr.07 to i8*
  ; Copy 16 bytes. The pointer step below is 16 i32 elements, hence the
  ; `adds rN, #64` in the expected output.
180  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* nonnull align 4 dereferenceable(16) %0, i8* nonnull align 4 dereferenceable(16) %1, i32 16, i1 false)
181  %add.ptr = getelementptr inbounds i32, i32* %x.addr.08, i32 16
182  %add.ptr1 = getelementptr inbounds i32, i32* %y.addr.07, i32 16
183  %inc = add nuw nsw i32 %i.09, 1
184  %exitcond.not = icmp eq i32 %inc, %n
185  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
186}
187
; test_memset16: loop that executes %n times (skipped when %n <= 0). Each
; iteration performs an llvm.memset of 16 zero bytes, then advances %x by
; 16 i32 elements (64 bytes).
; With a small constant length, the assertions below expect no library call:
; the fill is expanded inline to two `strd` stores of a zeroed register, and
; the surrounding loop is emitted with `dls`/`le`.
188define void @test_memset16(i32* nocapture %x, i32 %n) {
189; CHECK-LABEL: test_memset16:
190; CHECK:       @ %bb.0: @ %entry
191; CHECK-NEXT:    .save {r7, lr}
192; CHECK-NEXT:    push {r7, lr}
193; CHECK-NEXT:    cmp r1, #1
194; CHECK-NEXT:    it lt
195; CHECK-NEXT:    poplt {r7, pc}
196; CHECK-NEXT:  .LBB4_1: @ %for.body.preheader
197; CHECK-NEXT:    dls lr, r1
198; CHECK-NEXT:    movs r1, #0
199; CHECK-NEXT:  .LBB4_2: @ %for.body
200; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
201; CHECK-NEXT:    strd r1, r1, [r0]
202; CHECK-NEXT:    strd r1, r1, [r0, #8]
203; CHECK-NEXT:    adds r0, #64
204; CHECK-NEXT:    le lr, .LBB4_2
205; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
206; CHECK-NEXT:    pop {r7, pc}
207entry:
208  %cmp4 = icmp sgt i32 %n, 0
209  br i1 %cmp4, label %for.body, label %for.cond.cleanup
210
211for.cond.cleanup:                                 ; preds = %for.body, %entry
212  ret void
213
214for.body:                                         ; preds = %entry, %for.body
215  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
216  %x.addr.05 = phi i32* [ %add.ptr, %for.body ], [ %x, %entry ]
217  %0 = bitcast i32* %x.addr.05 to i8*
  ; Zero 16 bytes. The pointer step below is 16 i32 elements, hence the
  ; `adds r0, #64` in the expected output.
218  tail call void @llvm.memset.p0i8.i32(i8* nonnull align 4 dereferenceable(16) %0, i8 0, i32 16, i1 false)
219  %add.ptr = getelementptr inbounds i32, i32* %x.addr.05, i32 16
220  %inc = add nuw nsw i32 %i.06, 1
221  %exitcond.not = icmp eq i32 %inc, %n
222  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
223}
224
; test_memmove16: loop that executes %n times (skipped when %n <= 0). Each
; iteration performs an llvm.memmove with a constant length of 16 bytes, then
; advances both pointers by 16 i32 elements (64 bytes).
; With a small constant length, the assertions below expect no library call:
; all four words are loaded (ldm/ldr) before any is stored (stm/str), and the
; surrounding loop is emitted with `dls`/`le`.
225define void @test_memmove16(i32* nocapture %x, i32* nocapture readonly %y, i32 %n) {
226; CHECK-LABEL: test_memmove16:
227; CHECK:       @ %bb.0: @ %entry
228; CHECK-NEXT:    .save {r4, lr}
229; CHECK-NEXT:    push {r4, lr}
230; CHECK-NEXT:    cmp r2, #1
231; CHECK-NEXT:    it lt
232; CHECK-NEXT:    poplt {r4, pc}
233; CHECK-NEXT:  .LBB5_1: @ %for.body.preheader
234; CHECK-NEXT:    dls lr, r2
235; CHECK-NEXT:  .LBB5_2: @ %for.body
236; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
237; CHECK-NEXT:    ldm.w r1, {r2, r3, r12}
238; CHECK-NEXT:    ldr r4, [r1, #12]
239; CHECK-NEXT:    adds r1, #64
240; CHECK-NEXT:    stm.w r0, {r2, r3, r12}
241; CHECK-NEXT:    str r4, [r0, #12]
242; CHECK-NEXT:    adds r0, #64
243; CHECK-NEXT:    le lr, .LBB5_2
244; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
245; CHECK-NEXT:    pop {r4, pc}
246entry:
247  %cmp6 = icmp sgt i32 %n, 0
248  br i1 %cmp6, label %for.body, label %for.cond.cleanup
249
250for.cond.cleanup:                                 ; preds = %for.body, %entry
251  ret void
252
253for.body:                                         ; preds = %entry, %for.body
254  %i.09 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
255  %x.addr.08 = phi i32* [ %add.ptr, %for.body ], [ %x, %entry ]
256  %y.addr.07 = phi i32* [ %add.ptr1, %for.body ], [ %y, %entry ]
257  %0 = bitcast i32* %x.addr.08 to i8*
258  %1 = bitcast i32* %y.addr.07 to i8*
  ; Move 16 bytes. The pointer step below is 16 i32 elements, hence the
  ; `adds rN, #64` in the expected output.
259  tail call void @llvm.memmove.p0i8.p0i8.i32(i8* nonnull align 4 dereferenceable(16) %0, i8* nonnull align 4 dereferenceable(16) %1, i32 16, i1 false)
260  %add.ptr = getelementptr inbounds i32, i32* %x.addr.08, i32 16
261  %add.ptr1 = getelementptr inbounds i32, i32* %y.addr.07, i32 16
262  %inc = add nuw nsw i32 %i.09, 1
263  %exitcond.not = icmp eq i32 %inc, %n
264  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
265}
266
267
; Declarations of the memory intrinsics called by the test functions in this
; file. Each takes i8* pointer(s), an i32 length and an immediate i1 flag;
; memset additionally takes the i8 fill value.
268declare void @llvm.memcpy.p0i8.p0i8.i32(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i32, i1 immarg)
269declare void @llvm.memset.p0i8.i32(i8* nocapture writeonly, i8, i32, i1 immarg)
270declare void @llvm.memmove.p0i8.p0i8.i32(i8* nocapture writeonly, i8* nocapture readonly, i32, i1 immarg)
271