; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp %s -o - | FileCheck %s

; The vctp32 here is unpredicated, so it can be rematerialized and the loop
; converted into a tail-predicated low-overhead loop (dlstp/letp).
define void @remat_vctp(i32* %arg, i32* %arg1, i32* %arg2, i32* %arg3, i32* %arg4, i16 zeroext %arg5) {
; CHECK-LABEL: remat_vctp:
; CHECK:       @ %bb.0: @ %bb
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    ldrd r5, r12, [sp, #80]
; CHECK-NEXT:    vmvn.i32 q0, #0x80000000
; CHECK-NEXT:    vmov.i32 q1, #0x3f
; CHECK-NEXT:    vmov.i32 q2, #0x1
; CHECK-NEXT:    dlstp.32 lr, r12
; CHECK-NEXT:  .LBB0_1: @ %bb6
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q4, [r1], #16
; CHECK-NEXT:    vabs.s32 q5, q4
; CHECK-NEXT:    vcls.s32 q3, q5
; CHECK-NEXT:    vshl.u32 q5, q5, q3
; CHECK-NEXT:    vadd.i32 q3, q3, q2
; CHECK-NEXT:    vshr.u32 q6, q5, #24
; CHECK-NEXT:    vand q6, q6, q1
; CHECK-NEXT:    vldrw.u32 q7, [r5, q6, uxtw #2]
; CHECK-NEXT:    vqrdmulh.s32 q6, q7, q5
; CHECK-NEXT:    vqsub.s32 q6, q0, q6
; CHECK-NEXT:    vqrdmulh.s32 q6, q7, q6
; CHECK-NEXT:    vqshl.s32 q6, q6, #1
; CHECK-NEXT:    vqrdmulh.s32 q5, q6, q5
; CHECK-NEXT:    vqsub.s32 q5, q0, q5
; CHECK-NEXT:    vqrdmulh.s32 q5, q6, q5
; CHECK-NEXT:    vqshl.s32 q5, q5, #1
; CHECK-NEXT:    vpt.s32 lt, q4, zr
; CHECK-NEXT:    vnegt.s32 q5, q5
; CHECK-NEXT:    vldrw.u32 q4, [r0], #16
; CHECK-NEXT:    vqrdmulh.s32 q4, q4, q5
; CHECK-NEXT:    vstrw.32 q4, [r2], #16
; CHECK-NEXT:    vstrw.32 q3, [r3], #16
; CHECK-NEXT:    letp lr, .LBB0_1
; CHECK-NEXT:  @ %bb.2: @ %bb44
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    pop {r4, r5, r7, pc}
bb:
  %i = zext i16 %arg5 to i32
  br label %bb6

bb6:                                              ; preds = %bb6, %bb
  %i7 = phi i32* [ %arg3, %bb ], [ %i38, %bb6 ]
  %i8 = phi i32 [ %i, %bb ], [ %i42, %bb6 ]
  %i9 = phi i32* [ %arg2, %bb ], [ %i41, %bb6 ]
  %i10 = phi i32* [ %arg1, %bb ], [ %i40, %bb6 ]
  %i11 = phi i32* [ %arg, %bb ], [ %i39, %bb6 ]
  %i12 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i8)
  %i13 = bitcast i32* %i11 to <4 x i32>*
  %i14 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %i13, i32 4, <4 x i1> %i12, <4 x i32> zeroinitializer)
  %i15 = bitcast i32* %i10 to <4 x i32>*
  %i16 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %i15, i32 4, <4 x i1> %i12, <4 x i32> zeroinitializer)
  %i17 = icmp slt <4 x i32> %i16, zeroinitializer
  %i18 = sub <4 x i32> zeroinitializer, %i16
  %i19 = select <4 x i1> %i17, <4 x i32> %i18, <4 x i32> %i16
  %i20 = tail call <4 x i32> @llvm.arm.mve.vcls.v4i32(<4 x i32> %i19)
  %i21 = shl <4 x i32> %i19, %i20
  %i22 = add <4 x i32> %i20, <i32 1, i32 1, i32 1, i32 1>
  %i23 = lshr <4 x i32> %i21, <i32 24, i32 24, i32 24, i32 24>
  %i24 = and <4 x i32> %i23, <i32 63, i32 63, i32 63, i32 63>
  %i25 = tail call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* %arg4, <4 x i32> %i24, i32 32, i32 2, i32 0)
  %i26 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %i25, <4 x i32> %i21)
  %i27 = tail call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>, <4 x i32> %i26)
  %i28 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %i25, <4 x i32> %i27)
  %i29 = tail call <4 x i32> @llvm.arm.mve.vqshl.imm.v4i32(<4 x i32> %i28, i32 1, i32 0)
  %i30 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %i29, <4 x i32> %i21)
  %i31 = tail call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>, <4 x i32> %i30)
  %i32 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %i29, <4 x i32> %i31)
  %i33 = tail call <4 x i32> @llvm.arm.mve.vqshl.imm.v4i32(<4 x i32> %i32, i32 1, i32 0)
  %i34 = tail call <4 x i32> @llvm.arm.mve.neg.predicated.v4i32.v4i1(<4 x i32> %i33, <4 x i1> %i17, <4 x i32> %i33)
  %i35 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %i14, <4 x i32> %i34)
  %i36 = bitcast i32* %i9 to <4 x i32>*
  %i37 = bitcast i32* %i7 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %i35, <4 x i32>* %i36, i32 4, <4 x i1> %i12)
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %i22, <4 x i32>* %i37, i32 4, <4 x i1> %i12)
  %i38 = getelementptr inbounds i32, i32* %i7, i32 4
  %i39 = getelementptr inbounds i32, i32* %i11, i32 4
  %i40 = getelementptr inbounds i32, i32* %i10, i32 4
  %i41 = getelementptr inbounds i32, i32* %i9, i32 4
  %i42 = add nsw i32 %i8, -4
  %i43 = icmp sgt i32 %i8, 4
  br i1 %i43, label %bb6, label %bb44

bb44:                                             ; preds = %bb6
  ret void
}

; Here the vctp32 feeding the masked operations is and'ed with a second vctp32,
; i.e. it is itself predicated, so it must not be rematerialized: the loop keeps
; explicit vctp/vpst predication inside a plain dls/le low-overhead loop.
define void @dont_remat_predicated_vctp(i32* %arg, i32* %arg1, i32* %arg2, i32* %arg3, i32* %arg4, i16 zeroext %arg5, i32 %conv.mask) {
; CHECK-LABEL: dont_remat_predicated_vctp:
; CHECK:       @ %bb.0: @ %bb
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    sub sp, #8
; CHECK-NEXT:    ldrd r6, r12, [sp, #88]
; CHECK-NEXT:    movs r4, #4
; CHECK-NEXT:    cmp.w r12, #4
; CHECK-NEXT:    vmvn.i32 q0, #0x80000000
; CHECK-NEXT:    csel r5, r12, r4, lt
; CHECK-NEXT:    vmov.i32 q1, #0x3f
; CHECK-NEXT:    sub.w r5, r12, r5
; CHECK-NEXT:    vmov.i32 q2, #0x1
; CHECK-NEXT:    add.w lr, r5, #3
; CHECK-NEXT:    movs r5, #1
; CHECK-NEXT:    add.w lr, r5, lr, lsr #2
; CHECK-NEXT:    dls lr, lr
; CHECK-NEXT:  .LBB1_1: @ %bb6
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vctp.32 r12
; CHECK-NEXT:    sub.w r12, r12, #4
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vctpt.32 r4
; CHECK-NEXT:    vstr p0, [sp, #4] @ 4-byte Spill
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrwt.u32 q4, [r1], #16
; CHECK-NEXT:    vabs.s32 q5, q4
; CHECK-NEXT:    vcls.s32 q3, q5
; CHECK-NEXT:    vshl.u32 q5, q5, q3
; CHECK-NEXT:    vadd.i32 q3, q3, q2
; CHECK-NEXT:    vshr.u32 q6, q5, #24
; CHECK-NEXT:    vand q6, q6, q1
; CHECK-NEXT:    vldrw.u32 q7, [r6, q6, uxtw #2]
; CHECK-NEXT:    vqrdmulh.s32 q6, q7, q5
; CHECK-NEXT:    vqsub.s32 q6, q0, q6
; CHECK-NEXT:    vqrdmulh.s32 q6, q7, q6
; CHECK-NEXT:    vqshl.s32 q6, q6, #1
; CHECK-NEXT:    vqrdmulh.s32 q5, q6, q5
; CHECK-NEXT:    vqsub.s32 q5, q0, q5
; CHECK-NEXT:    vqrdmulh.s32 q5, q6, q5
; CHECK-NEXT:    vqshl.s32 q5, q5, #1
; CHECK-NEXT:    vpt.s32 lt, q4, zr
; CHECK-NEXT:    vnegt.s32 q5, q5
; CHECK-NEXT:    vldr p0, [sp, #4] @ 4-byte Reload
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrwt.u32 q4, [r0], #16
; CHECK-NEXT:    vqrdmulh.s32 q4, q4, q5
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vstrwt.32 q4, [r2], #16
; CHECK-NEXT:    vstrwt.32 q3, [r3], #16
; CHECK-NEXT:    le lr, .LBB1_1
; CHECK-NEXT:  @ %bb.2: @ %bb44
; CHECK-NEXT:    add sp, #8
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    pop {r4, r5, r6, pc}
bb:
  %i = zext i16 %arg5 to i32
  br label %bb6

bb6:                                              ; preds = %bb6, %bb
  %i7 = phi i32* [ %arg3, %bb ], [ %i38, %bb6 ]
  %i8 = phi i32 [ %i, %bb ], [ %i42, %bb6 ]
  %i9 = phi i32* [ %arg2, %bb ], [ %i41, %bb6 ]
  %i10 = phi i32* [ %arg1, %bb ], [ %i40, %bb6 ]
  %i11 = phi i32* [ %arg, %bb ], [ %i39, %bb6 ]
  %i12 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 4)
  %mask = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i8)
  %pred = and <4 x i1> %i12, %mask
  %i13 = bitcast i32* %i11 to <4 x i32>*
  %i14 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %i13, i32 4, <4 x i1> %pred, <4 x i32> zeroinitializer)
  %i15 = bitcast i32* %i10 to <4 x i32>*
  %i16 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %i15, i32 4, <4 x i1> %pred, <4 x i32> zeroinitializer)
  %i17 = icmp slt <4 x i32> %i16, zeroinitializer
  %i18 = sub <4 x i32> zeroinitializer, %i16
  %i19 = select <4 x i1> %i17, <4 x i32> %i18, <4 x i32> %i16
  %i20 = tail call <4 x i32> @llvm.arm.mve.vcls.v4i32(<4 x i32> %i19)
  %i21 = shl <4 x i32> %i19, %i20
  %i22 = add <4 x i32> %i20, <i32 1, i32 1, i32 1, i32 1>
  %i23 = lshr <4 x i32> %i21, <i32 24, i32 24, i32 24, i32 24>
  %i24 = and <4 x i32> %i23, <i32 63, i32 63, i32 63, i32 63>
  %i25 = tail call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* %arg4, <4 x i32> %i24, i32 32, i32 2, i32 0)
  %i26 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %i25, <4 x i32> %i21)
  %i27 = tail call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>, <4 x i32> %i26)
  %i28 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %i25, <4 x i32> %i27)
  %i29 = tail call <4 x i32> @llvm.arm.mve.vqshl.imm.v4i32(<4 x i32> %i28, i32 1, i32 0)
  %i30 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %i29, <4 x i32> %i21)
  %i31 = tail call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>, <4 x i32> %i30)
  %i32 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %i29, <4 x i32> %i31)
  %i33 = tail call <4 x i32> @llvm.arm.mve.vqshl.imm.v4i32(<4 x i32> %i32, i32 1, i32 0)
  %i34 = tail call <4 x i32> @llvm.arm.mve.neg.predicated.v4i32.v4i1(<4 x i32> %i33, <4 x i1> %i17, <4 x i32> %i33)
  %i35 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %i14, <4 x i32> %i34)
  %i36 = bitcast i32* %i9 to <4 x i32>*
  %i37 = bitcast i32* %i7 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %i35, <4 x i32>* %i36, i32 4, <4 x i1> %pred)
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %i22, <4 x i32>* %i37, i32 4, <4 x i1> %pred)
  %i38 = getelementptr inbounds i32, i32* %i7, i32 4
  %i39 = getelementptr inbounds i32, i32* %i11, i32 4
  %i40 = getelementptr inbounds i32, i32* %i10, i32 4
  %i41 = getelementptr inbounds i32, i32* %i9, i32 4
  %i42 = add nsw i32 %i8, -4
  %i43 = icmp sgt i32 %i8, 4
  br i1 %i43, label %bb6, label %bb44

bb44:                                             ; preds = %bb6
  ret void
}

declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
declare <4 x i1> @llvm.arm.mve.vctp32(i32)
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
declare <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32>, <4 x i32>)
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
declare <4 x i32> @llvm.arm.mve.vcls.v4i32(<4 x i32>)
declare <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32*, <4 x i32>, i32, i32, i32)
declare <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32>, <4 x i32>)
declare <4 x i32> @llvm.arm.mve.vqshl.imm.v4i32(<4 x i32>, i32, i32)
declare <4 x i32> @llvm.arm.mve.neg.predicated.v4i32.v4i1(<4 x i32>, <4 x i1>, <4 x i32>)