1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -O1 -mcpu=cortex-a15 -mtriple=armv7-linux-gnueabi -verify-machineinstrs < %s  | FileCheck %s
3
4; The generated code for this test uses a vld1.32 instruction
5; to write lane 1 of a D register that already holds the value
6; of <2 x float> %B. Since the D register is already defined, it
7; would be incorrect to overwrite it entirely (with a vmov.f64)
8; before the vld1.32 instruction. The test checks that no
9; vmov.f64 is generated.
10
; @t1: loads a single float from %A and inserts it into element 1 of %B,
; returning the updated <2 x float>.
;
; The assertions below use CHECK-NEXT for every instruction, so the expected
; output is exactly: a vmov building d16 from r2/r3, the vld1.32 lane load into
; d16[1], a vmov returning d16 in r0/r1, and the return. Any additional
; instruction between them (such as a vmov.f64 writing d16) fails the test.
11define <2 x float> @t1(float* %A, <2 x float> %B) {
12; CHECK-LABEL: t1:
13; CHECK:       @ %bb.0:
14; CHECK-NEXT:    vmov d16, r2, r3
15; CHECK-NEXT:    vld1.32 {d16[1]}, [r0:32]
16; CHECK-NEXT:    vmov r0, r1, d16
17; CHECK-NEXT:    bx lr
; Scalar load (4-byte aligned) whose only use is the lane insert below.
18  %tmp2 = load float, float* %A, align 4
; Replace element 1 of %B with the loaded value; element 0 of %B is kept.
19  %tmp3 = insertelement <2 x float> %B, float %tmp2, i32 1
20  ret <2 x float> %tmp3
21}
22
23; The code generated by this test uses a vld1.32 lane-load instruction.
24; We check that a dependency-breaking vmov* instruction is
25; generated immediately before it.
26
; @t2: a single-block loop that copies one <4 x i8> element per iteration from
; %in to %out, both indexed by the incremented counter %newcount.
;
; The assertions below expect a `vmov.f64 d16, #5.000000e-01` directly before
; the `vld1.32 {d16[0]}` lane load inside the loop body; every instruction is
; matched with CHECK-NEXT, so the order is pinned exactly.
27define void @t2(<4 x i8> *%in, <4 x i8> *%out, i32 %n) {
28; CHECK-LABEL: t2:
29; CHECK:       @ %bb.0: @ %entry
30; CHECK-NEXT:    add r0, r0, #4
31; CHECK-NEXT:    add r1, r1, #4
32; CHECK-NEXT:  .LBB1_1: @ %loop
33; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
34; CHECK-NEXT:    vmov.f64 d16, #5.000000e-01
35; CHECK-NEXT:    vld1.32 {d16[0]}, [r0:32]
36; CHECK-NEXT:    vmovl.u8 q8, d16
37; CHECK-NEXT:    vuzp.8 d16, d18
38; CHECK-NEXT:    vst1.32 {d16[0]}, [r1:32]!
39; CHECK-NEXT:    add r0, r0, #4
40; CHECK-NEXT:    subs r2, r2, #1
41; CHECK-NEXT:    beq .LBB1_1
42; CHECK-NEXT:  @ %bb.2: @ %ret
43; CHECK-NEXT:    bx lr
44entry:
45  br label %loop
46loop:
; Counter starts at 0 on entry and carries %newcount around the back edge.
47  %oldcount = phi i32 [0, %entry], [%newcount, %loop]
48  %newcount = add i32 %oldcount, 1
; Both pointers are indexed by the already-incremented counter, so the first
; iteration accesses element 1 (not element 0) of %in and %out.
49  %p1 = getelementptr <4 x i8>, <4 x i8> *%in, i32 %newcount
50  %p2 = getelementptr <4 x i8>, <4 x i8> *%out, i32 %newcount
; The <4 x i8> load is the operation matched as the vld1.32 lane load above.
51  %tmp1 = load <4 x i8> , <4 x i8> *%p1, align 4
52  store <4 x i8> %tmp1, <4 x i8> *%p2
; NOTE(review): the back edge is taken when %newcount == %n (icmp eq with the
; true target %loop), which corresponds to the `beq .LBB1_1` in the CHECK lines.
53  %cmp = icmp eq i32 %newcount, %n
54  br i1 %cmp, label %loop, label %ret
55ret:
56  ret void
57}
58
59; With minsize, code size overrides performance, so no extra vmov.f64 here.
60
61; TODO: This (and above) could use a splat load to remove the false
62;       dependence with no extra instruction.
63
; @t2_minsize: the same copy loop as @t2 (identical IR body), but the function
; carries the `minsize` attribute.
;
; The assertions below pin the loop body with CHECK-NEXT and contain no
; vmov.f64: the `vld1.32 {d16[0]}` lane load is the first instruction after the
; loop header.
64define void @t2_minsize(<4 x i8> *%in, <4 x i8> *%out, i32 %n) minsize {
65; CHECK-LABEL: t2_minsize:
66; CHECK:       @ %bb.0: @ %entry
67; CHECK-NEXT:    add r0, r0, #4
68; CHECK-NEXT:    add r1, r1, #4
69; CHECK-NEXT:  .LBB2_1: @ %loop
70; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
71; CHECK-NEXT:    vld1.32 {d16[0]}, [r0:32]
72; CHECK-NEXT:    vmovl.u8 q8, d16
73; CHECK-NEXT:    vuzp.8 d16, d18
74; CHECK-NEXT:    vst1.32 {d16[0]}, [r1:32]!
75; CHECK-NEXT:    add r0, r0, #4
76; CHECK-NEXT:    subs r2, r2, #1
77; CHECK-NEXT:    beq .LBB2_1
78; CHECK-NEXT:  @ %bb.2: @ %ret
79; CHECK-NEXT:    bx lr
80entry:
81  br label %loop
82loop:
; Counter starts at 0 on entry and carries %newcount around the back edge.
83  %oldcount = phi i32 [0, %entry], [%newcount, %loop]
84  %newcount = add i32 %oldcount, 1
; Both pointers are indexed by the already-incremented counter, so the first
; iteration accesses element 1 (not element 0) of %in and %out.
85  %p1 = getelementptr <4 x i8>, <4 x i8> *%in, i32 %newcount
86  %p2 = getelementptr <4 x i8>, <4 x i8> *%out, i32 %newcount
; The <4 x i8> load is the operation matched as the vld1.32 lane load above.
87  %tmp1 = load <4 x i8> , <4 x i8> *%p1, align 4
88  store <4 x i8> %tmp1, <4 x i8> *%p2
; NOTE(review): the back edge is taken when %newcount == %n (icmp eq with the
; true target %loop), which corresponds to the `beq .LBB2_1` in the CHECK lines.
89  %cmp = icmp eq i32 %newcount, %n
90  br i1 %cmp, label %loop, label %ret
91ret:
92  ret void
93}
94