; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O1 -mcpu=cortex-a15 -mtriple=armv7-linux-gnueabi -verify-machineinstrs < %s | FileCheck %s

; The generated code for this test uses a vld1.32 instruction
; to write the lane 1 of a D register containing the value of
; <2 x float> %B. Since the D register is defined, it would
; be incorrect to fully write it (with a vmov.f64) before the
; vld1.32 instruction. The test checks that a vmov.f64 was not
; generated.

define <2 x float> @t1(float* %A, <2 x float> %B) {
; CHECK-LABEL: t1:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmov d16, r2, r3
; CHECK-NEXT:    vld1.32 {d16[1]}, [r0:32]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    bx lr
  %tmp2 = load float, float* %A, align 4
  %tmp3 = insertelement <2 x float> %B, float %tmp2, i32 1
  ret <2 x float> %tmp3
}

; The code generated by this test uses a vld1.32 instruction.
; We check that a dependency breaking vmov* instruction was
; generated (here the "vmov.f64 d16, #5.000000e-01" immediate
; write before the vld1.32 lane load, which fully defines d16
; and so breaks the false dependence on its previous value).

define void @t2(<4 x i8> *%in, <4 x i8> *%out, i32 %n) {
; CHECK-LABEL: t2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    add r0, r0, #4
; CHECK-NEXT:    add r1, r1, #4
; CHECK-NEXT:  .LBB1_1: @ %loop
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmov.f64 d16, #5.000000e-01
; CHECK-NEXT:    vld1.32 {d16[0]}, [r0:32]
; CHECK-NEXT:    vmovl.u8 q8, d16
; CHECK-NEXT:    vuzp.8 d16, d18
; CHECK-NEXT:    vst1.32 {d16[0]}, [r1:32]!
; CHECK-NEXT:    add r0, r0, #4
; CHECK-NEXT:    subs r2, r2, #1
; CHECK-NEXT:    beq .LBB1_1
; CHECK-NEXT:  @ %bb.2: @ %ret
; CHECK-NEXT:    bx lr
entry:
  br label %loop
loop:
  %oldcount = phi i32 [0, %entry], [%newcount, %loop]
  %newcount = add i32 %oldcount, 1
  %p1 = getelementptr <4 x i8>, <4 x i8> *%in, i32 %newcount
  %p2 = getelementptr <4 x i8>, <4 x i8> *%out, i32 %newcount
  %tmp1 = load <4 x i8> , <4 x i8> *%p1, align 4
  store <4 x i8> %tmp1, <4 x i8> *%p2
  %cmp = icmp eq i32 %newcount, %n
  br i1 %cmp, label %loop, label %ret
ret:
  ret void
}

; If minimizing size, that overrides perf, so no extra vmov.f64 here
; (the checks below are identical to t2 except that the
; dependency-breaking vmov.f64 is absent from the loop body).

; TODO: This (and above) could use a splat load to remove the false
; dependence with no extra instruction.

define void @t2_minsize(<4 x i8> *%in, <4 x i8> *%out, i32 %n) minsize {
; CHECK-LABEL: t2_minsize:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    add r0, r0, #4
; CHECK-NEXT:    add r1, r1, #4
; CHECK-NEXT:  .LBB2_1: @ %loop
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vld1.32 {d16[0]}, [r0:32]
; CHECK-NEXT:    vmovl.u8 q8, d16
; CHECK-NEXT:    vuzp.8 d16, d18
; CHECK-NEXT:    vst1.32 {d16[0]}, [r1:32]!
; CHECK-NEXT:    add r0, r0, #4
; CHECK-NEXT:    subs r2, r2, #1
; CHECK-NEXT:    beq .LBB2_1
; CHECK-NEXT:  @ %bb.2: @ %ret
; CHECK-NEXT:    bx lr
entry:
  br label %loop
loop:
  %oldcount = phi i32 [0, %entry], [%newcount, %loop]
  %newcount = add i32 %oldcount, 1
  %p1 = getelementptr <4 x i8>, <4 x i8> *%in, i32 %newcount
  %p2 = getelementptr <4 x i8>, <4 x i8> *%out, i32 %newcount
  %tmp1 = load <4 x i8> , <4 x i8> *%p1, align 4
  store <4 x i8> %tmp1, <4 x i8> *%p2
  %cmp = icmp eq i32 %newcount, %n
  br i1 %cmp, label %loop, label %ret
ret:
  ret void
}