; Regression test for AArch64 store-pair (STP) formation in the machine
; scheduler: verifies that the scheduler's load/store clustering mutation
; pairs adjacent STRX/STRW/STRD/STRS/STUR accesses, skips volatile stores,
; and handles stores with differing predecessors (plus the -force-fast-cluster
; path, which must NOT cluster across intervening memory dependencies).
; REQUIRES: asserts
; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -verify-misched -debug-only=machine-scheduler -aarch64-enable-stp-suppress=false -o - 2>&1 > /dev/null | FileCheck %s
; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -force-fast-cluster -verify-misched -debug-only=machine-scheduler -aarch64-enable-stp-suppress=false -o - 2>&1 > /dev/null | FileCheck %s --check-prefix=CHECK-FAST

; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: stp_i64_scale:%bb.0
; CHECK:Cluster ld/st SU(3) - SU(4)
; CHECK:Cluster ld/st SU(2) - SU(5)
; CHECK:SU(4): STRXui %1:gpr64, %0:gpr64common, 1
; CHECK:SU(3): STRXui %1:gpr64, %0:gpr64common, 2
; CHECK:SU(2): STRXui %1:gpr64, %0:gpr64common, 3
; CHECK:SU(5): STRXui %1:gpr64, %0:gpr64common, 4
define i64 @stp_i64_scale(i64* nocapture %P, i64 %v) {
entry:
  %arrayidx = getelementptr inbounds i64, i64* %P, i64 3
  store i64 %v, i64* %arrayidx
  %arrayidx1 = getelementptr inbounds i64, i64* %P, i64 2
  store i64 %v, i64* %arrayidx1
  %arrayidx2 = getelementptr inbounds i64, i64* %P, i64 1
  store i64 %v, i64* %arrayidx2
  %arrayidx3 = getelementptr inbounds i64, i64* %P, i64 4
  store i64 %v, i64* %arrayidx3
  ret i64 %v
}

; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: stp_i32_scale:%bb.0
; CHECK:Cluster ld/st SU(3) - SU(4)
; CHECK:Cluster ld/st SU(2) - SU(5)
; CHECK:SU(4): STRWui %1:gpr32, %0:gpr64common, 1
; CHECK:SU(3): STRWui %1:gpr32, %0:gpr64common, 2
; CHECK:SU(2): STRWui %1:gpr32, %0:gpr64common, 3
; CHECK:SU(5): STRWui %1:gpr32, %0:gpr64common, 4
define i32 @stp_i32_scale(i32* nocapture %P, i32 %v) {
entry:
  %arrayidx = getelementptr inbounds i32, i32* %P, i32 3
  store i32 %v, i32* %arrayidx
  %arrayidx1 = getelementptr inbounds i32, i32* %P, i32 2
  store i32 %v, i32* %arrayidx1
  %arrayidx2 = getelementptr inbounds i32, i32* %P, i32 1
  store i32 %v, i32* %arrayidx2
  %arrayidx3 = getelementptr inbounds i32, i32* %P, i32 4
  store i32 %v, i32* %arrayidx3
  ret i32 %v
}

; CHECK:********** MI Scheduling **********
; CHECK-LABEL:stp_i64_unscale:%bb.0 entry
; CHECK:Cluster ld/st SU(2) - SU(5)
; CHECK:Cluster ld/st SU(3) - SU(4)
; CHECK:SU(2): STURXi %1:gpr64, %0:gpr64common, -24
; CHECK:SU(3): STURXi %1:gpr64, %0:gpr64common, -8
; CHECK:SU(4): STURXi %1:gpr64, %0:gpr64common, -16
; CHECK:SU(5): STURXi %1:gpr64, %0:gpr64common, -32
define void @stp_i64_unscale(i64* nocapture %P, i64 %v) #0 {
entry:
  %arrayidx = getelementptr inbounds i64, i64* %P, i64 -3
  store i64 %v, i64* %arrayidx
  %arrayidx1 = getelementptr inbounds i64, i64* %P, i64 -1
  store i64 %v, i64* %arrayidx1
  %arrayidx2 = getelementptr inbounds i64, i64* %P, i64 -2
  store i64 %v, i64* %arrayidx2
  %arrayidx3 = getelementptr inbounds i64, i64* %P, i64 -4
  store i64 %v, i64* %arrayidx3
  ret void
}

; CHECK:********** MI Scheduling **********
; CHECK-LABEL:stp_i32_unscale:%bb.0 entry
; CHECK:Cluster ld/st SU(2) - SU(5)
; CHECK:Cluster ld/st SU(3) - SU(4)
; CHECK:SU(2): STURWi %1:gpr32, %0:gpr64common, -12
; CHECK:SU(3): STURWi %1:gpr32, %0:gpr64common, -4
; CHECK:SU(4): STURWi %1:gpr32, %0:gpr64common, -8
; CHECK:SU(5): STURWi %1:gpr32, %0:gpr64common, -16
define void @stp_i32_unscale(i32* nocapture %P, i32 %v) #0 {
entry:
  %arrayidx = getelementptr inbounds i32, i32* %P, i32 -3
  store i32 %v, i32* %arrayidx
  %arrayidx1 = getelementptr inbounds i32, i32* %P, i32 -1
  store i32 %v, i32* %arrayidx1
  %arrayidx2 = getelementptr inbounds i32, i32* %P, i32 -2
  store i32 %v, i32* %arrayidx2
  %arrayidx3 = getelementptr inbounds i32, i32* %P, i32 -4
  store i32 %v, i32* %arrayidx3
  ret void
}

; CHECK:********** MI Scheduling **********
; CHECK-LABEL:stp_double:%bb.0
; CHECK:Cluster ld/st SU(3) - SU(4)
; CHECK:Cluster ld/st SU(2) - SU(5)
; CHECK:SU(3): STRDui %1:fpr64, %0:gpr64common, 1
; CHECK:SU(4): STRDui %1:fpr64, %0:gpr64common, 2
; CHECK:SU(2): STRDui %1:fpr64, %0:gpr64common, 3
; CHECK:SU(5): STRDui %1:fpr64, %0:gpr64common, 4
define void @stp_double(double* nocapture %P, double %v) {
entry:
  %arrayidx = getelementptr inbounds double, double* %P, i64 3
  store double %v, double* %arrayidx
  %arrayidx1 = getelementptr inbounds double, double* %P, i64 1
  store double %v, double* %arrayidx1
  %arrayidx2 = getelementptr inbounds double, double* %P, i64 2
  store double %v, double* %arrayidx2
  %arrayidx3 = getelementptr inbounds double, double* %P, i64 4
  store double %v, double* %arrayidx3
  ret void
}

; CHECK:********** MI Scheduling **********
; CHECK-LABEL:stp_float:%bb.0
; CHECK:Cluster ld/st SU(3) - SU(4)
; CHECK:Cluster ld/st SU(2) - SU(5)
; CHECK:SU(3): STRSui %1:fpr32, %0:gpr64common, 1
; CHECK:SU(4): STRSui %1:fpr32, %0:gpr64common, 2
; CHECK:SU(2): STRSui %1:fpr32, %0:gpr64common, 3
; CHECK:SU(5): STRSui %1:fpr32, %0:gpr64common, 4
define void @stp_float(float* nocapture %P, float %v) {
entry:
  %arrayidx = getelementptr inbounds float, float* %P, i64 3
  store float %v, float* %arrayidx
  %arrayidx1 = getelementptr inbounds float, float* %P, i64 1
  store float %v, float* %arrayidx1
  %arrayidx2 = getelementptr inbounds float, float* %P, i64 2
  store float %v, float* %arrayidx2
  %arrayidx3 = getelementptr inbounds float, float* %P, i64 4
  store float %v, float* %arrayidx3
  ret void
}

; Volatile stores must never be clustered into pairs.
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: stp_volatile:%bb.0
; CHECK-NOT: Cluster ld/st
; CHECK:SU(2): STRXui %1:gpr64, %0:gpr64common, 3 :: (volatile
; CHECK:SU(3): STRXui %1:gpr64, %0:gpr64common, 2 :: (volatile
; CHECK:SU(4): STRXui %1:gpr64, %0:gpr64common, 1 :: (volatile
; CHECK:SU(5): STRXui %1:gpr64, %0:gpr64common, 4 :: (volatile
define i64 @stp_volatile(i64* nocapture %P, i64 %v) {
entry:
  %arrayidx = getelementptr inbounds i64, i64* %P, i64 3
  store volatile i64 %v, i64* %arrayidx
  %arrayidx1 = getelementptr inbounds i64, i64* %P, i64 2
  store volatile i64 %v, i64* %arrayidx1
  %arrayidx2 = getelementptr inbounds i64, i64* %P, i64 1
  store volatile i64 %v, i64* %arrayidx2
  %arrayidx3 = getelementptr inbounds i64, i64* %P, i64 4
  store volatile i64 %v, i64* %arrayidx3
  ret i64 %v
}

; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: stp_i64_with_ld:%bb.0
; CHECK:Cluster ld/st SU(5) - SU(10)
; CHECK:Cluster ld/st SU(15) - SU(20)
; CHECK:SU(5): STRXui %7:gpr64, %0:gpr64common, 0 ::
; CHECK:SU(10): STRXui %12:gpr64, %0:gpr64common, 1 ::
; CHECK:SU(15): STRXui %17:gpr64, %0:gpr64common, 2 ::
; CHECK:SU(20): STRXui %22:gpr64, %0:gpr64common, 3 ::
define void @stp_i64_with_ld(i64* noalias nocapture %a, i64* noalias nocapture readnone %b, i64* noalias nocapture readnone %c) {
entry:
  %arrayidx = getelementptr inbounds i64, i64* %a, i64 8
  %0 = load i64, i64* %arrayidx, align 8
  %arrayidx3 = getelementptr inbounds i64, i64* %a, i64 16
  %1 = load i64, i64* %arrayidx3, align 8
  %mul = mul nsw i64 %1, %0
  %2 = load i64, i64* %a, align 8
  %add6 = add nsw i64 %2, %mul
  store i64 %add6, i64* %a, align 8
  %arrayidx.1 = getelementptr inbounds i64, i64* %a, i64 9
  %3 = load i64, i64* %arrayidx.1, align 8
  %arrayidx3.1 = getelementptr inbounds i64, i64* %a, i64 17
  %4 = load i64, i64* %arrayidx3.1, align 8
  %mul.1 = mul nsw i64 %4, %3
  %arrayidx5.1 = getelementptr inbounds i64, i64* %a, i64 1
  %5 = load i64, i64* %arrayidx5.1, align 8
  %add6.1 = add nsw i64 %5, %mul.1
  store i64 %add6.1, i64* %arrayidx5.1, align 8
  %arrayidx.2 = getelementptr inbounds i64, i64* %a, i64 10
  %6 = load i64, i64* %arrayidx.2, align 8
  %arrayidx3.2 = getelementptr inbounds i64, i64* %a, i64 18
  %7 = load i64, i64* %arrayidx3.2, align 8
  %mul.2 = mul nsw i64 %7, %6
  %arrayidx5.2 = getelementptr inbounds i64, i64* %a, i64 2
  %8 = load i64, i64* %arrayidx5.2, align 8
  %add6.2 = add nsw i64 %8, %mul.2
  store i64 %add6.2, i64* %arrayidx5.2, align 8
  %arrayidx.3 = getelementptr inbounds i64, i64* %a, i64 11
  %9 = load i64, i64* %arrayidx.3, align 8
  %arrayidx3.3 = getelementptr inbounds i64, i64* %a, i64 19
  %10 = load i64, i64* %arrayidx3.3, align 8
  %mul.3 = mul nsw i64 %10, %9
  %arrayidx5.3 = getelementptr inbounds i64, i64* %a, i64 3
  %11 = load i64, i64* %arrayidx5.3, align 8
  %add6.3 = add nsw i64 %11, %mul.3
  store i64 %add6.3, i64* %arrayidx5.3, align 8
  ret void
}

; Verify that the SU(2) and SU(4) are the preds of SU(3)
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: stp_missing_preds_edges:%bb.0
; CHECK:Cluster ld/st SU(3) - SU(5)
; CHECK: Copy Pred SU(4)
; CHECK: Copy Pred SU(2)
; CHECK:SU(2): %0:gpr64common = COPY $x0
; CHECK:SU(3): STRWui %1:gpr32, %0:gpr64common, 0
; CHECK:SU(4): %3:gpr32common = nsw ADDWri %2:gpr32common, 5, 0
; CHECK:SU(5): STRWui %3:gpr32common, %0:gpr64common, 1
define void @stp_missing_preds_edges(i32* %p, i32 %m, i32 %n) {
entry:
  store i32 %m, i32* %p, align 4
  %add = add nsw i32 %n, 5
  %arrayidx1 = getelementptr inbounds i32, i32* %p, i64 1
  store i32 %add, i32* %arrayidx1, align 4
  ret void
}

; Verify that the SU(4) and SU(7) can be clustered even with
; different preds
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: cluster_with_different_preds:%bb.0
; CHECK:Cluster ld/st SU(4) - SU(7)
; CHECK:SU(3): STRWui %2:gpr32, %0:gpr64common, 0 ::
; CHECK:SU(4): %3:gpr32 = LDRWui %1:gpr64common, 0 ::
; CHECK:Predecessors:
; CHECK: SU(3): Ord Latency=1 Memory
; CHECK:SU(6): STRBBui %4:gpr32, %1:gpr64common, 4 ::
; CHECK:SU(7): %5:gpr32 = LDRWui %1:gpr64common, 1 ::
; CHECK:Predecessors:
; CHECK:SU(6): Ord Latency=1 Memory
; CHECK-FAST: cluster_with_different_preds:%bb.0
; CHECK-FAST-NOT: Cluster ld/st
; CHECK-FAST:SU(3): STRWui %2:gpr32, %0:gpr64common, 0 ::
; CHECK-FAST:SU(4): %3:gpr32 = LDRWui %1:gpr64common, 0 ::
define i32 @cluster_with_different_preds(i32* %p, i32* %q) {
entry:
  store i32 3, i32* %p, align 4
  %0 = load i32, i32* %q, align 4
  %add.ptr = getelementptr inbounds i32, i32* %q, i64 1
  %1 = bitcast i32* %add.ptr to i8*
  store i8 5, i8* %1, align 1
  %2 = load i32, i32* %add.ptr, align 4
  %add = add nsw i32 %2, %0
  ret i32 %add
}