; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -verify-machineinstrs -o - %s | FileCheck %s
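
; VSHLC shifts the whole 128-bit vector left by 1-32 bits: the bits shifted
; out of the top of the vector are returned in the carry GPR, and the bottom
; of the vector is filled from that GPR's previous value. Each test loads the
; carry word from %b, applies the intrinsic, and stores the updated carry
; back, so codegen is expected to reduce to a single ldr/vshlc/str sequence.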

define arm_aapcs_vfpcc <16 x i8> @test_vshlcq_s8(<16 x i8> %a, i32* nocapture %b) {
; CHECK-LABEL: test_vshlcq_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r1, [r0]
; CHECK-NEXT:    vshlc q0, r1, #18
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %b, align 4
  %1 = tail call { i32, <16 x i8> } @llvm.arm.mve.vshlc.v16i8(<16 x i8> %a, i32 %0, i32 18)
  %2 = extractvalue { i32, <16 x i8> } %1, 0
  store i32 %2, i32* %b, align 4
  %3 = extractvalue { i32, <16 x i8> } %1, 1
  ret <16 x i8> %3
}

define arm_aapcs_vfpcc <8 x i16> @test_vshlcq_s16(<8 x i16> %a, i32* nocapture %b) {
; CHECK-LABEL: test_vshlcq_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r1, [r0]
; CHECK-NEXT:    vshlc q0, r1, #16
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %b, align 4
  %1 = tail call { i32, <8 x i16> } @llvm.arm.mve.vshlc.v8i16(<8 x i16> %a, i32 %0, i32 16)
  %2 = extractvalue { i32, <8 x i16> } %1, 0
  store i32 %2, i32* %b, align 4
  %3 = extractvalue { i32, <8 x i16> } %1, 1
  ret <8 x i16> %3
}

define arm_aapcs_vfpcc <4 x i32> @test_vshlcq_s32(<4 x i32> %a, i32* nocapture %b) {
; CHECK-LABEL: test_vshlcq_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r1, [r0]
; CHECK-NEXT:    vshlc q0, r1, #4
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %b, align 4
  %1 = tail call { i32, <4 x i32> } @llvm.arm.mve.vshlc.v4i32(<4 x i32> %a, i32 %0, i32 4)
  %2 = extractvalue { i32, <4 x i32> } %1, 0
  store i32 %2, i32* %b, align 4
  %3 = extractvalue { i32, <4 x i32> } %1, 1
  ret <4 x i32> %3
}

define arm_aapcs_vfpcc <16 x i8> @test_vshlcq_u8(<16 x i8> %a, i32* nocapture %b) {
; CHECK-LABEL: test_vshlcq_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r1, [r0]
; CHECK-NEXT:    vshlc q0, r1, #17
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %b, align 4
  %1 = tail call { i32, <16 x i8> } @llvm.arm.mve.vshlc.v16i8(<16 x i8> %a, i32 %0, i32 17)
  %2 = extractvalue { i32, <16 x i8> } %1, 0
  store i32 %2, i32* %b, align 4
  %3 = extractvalue { i32, <16 x i8> } %1, 1
  ret <16 x i8> %3
}

define arm_aapcs_vfpcc <8 x i16> @test_vshlcq_u16(<8 x i16> %a, i32* nocapture %b) {
; CHECK-LABEL: test_vshlcq_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r1, [r0]
; CHECK-NEXT:    vshlc q0, r1, #17
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %b, align 4
  %1 = tail call { i32, <8 x i16> } @llvm.arm.mve.vshlc.v8i16(<8 x i16> %a, i32 %0, i32 17)
  %2 = extractvalue { i32, <8 x i16> } %1, 0
  store i32 %2, i32* %b, align 4
  %3 = extractvalue { i32, <8 x i16> } %1, 1
  ret <8 x i16> %3
}

define arm_aapcs_vfpcc <4 x i32> @test_vshlcq_u32(<4 x i32> %a, i32* nocapture %b) {
; CHECK-LABEL: test_vshlcq_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r1, [r0]
; CHECK-NEXT:    vshlc q0, r1, #20
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %b, align 4
  %1 = tail call { i32, <4 x i32> } @llvm.arm.mve.vshlc.v4i32(<4 x i32> %a, i32 %0, i32 20)
  %2 = extractvalue { i32, <4 x i32> } %1, 0
  store i32 %2, i32* %b, align 4
  %3 = extractvalue { i32, <4 x i32> } %1, 1
  ret <4 x i32> %3
}
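
; Predicated versions: the i16 predicate %p is transferred to P0 with VMSR,
; and VPST opens a one-instruction VPT block so that VSHLCT executes under
; that predicate mask.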
define arm_aapcs_vfpcc <16 x i8> @test_vshlcq_m_s8(<16 x i8> %a, i32* nocapture %b, i16 zeroext %p) {
; CHECK-LABEL: test_vshlcq_m_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    ldr r1, [r0]
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vshlct q0, r1, #29
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %b, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
  %3 = tail call { i32, <16 x i8> } @llvm.arm.mve.vshlc.predicated.v16i8.v16i1(<16 x i8> %a, i32 %0, i32 29, <16 x i1> %2)
  %4 = extractvalue { i32, <16 x i8> } %3, 0
  store i32 %4, i32* %b, align 4
  %5 = extractvalue { i32, <16 x i8> } %3, 1
  ret <16 x i8> %5
}

define arm_aapcs_vfpcc <8 x i16> @test_vshlcq_m_s16(<8 x i16> %a, i32* nocapture %b, i16 zeroext %p) {
; CHECK-LABEL: test_vshlcq_m_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    ldr r1, [r0]
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vshlct q0, r1, #17
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %b, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
  %3 = tail call { i32, <8 x i16> } @llvm.arm.mve.vshlc.predicated.v8i16.v8i1(<8 x i16> %a, i32 %0, i32 17, <8 x i1> %2)
  %4 = extractvalue { i32, <8 x i16> } %3, 0
  store i32 %4, i32* %b, align 4
  %5 = extractvalue { i32, <8 x i16> } %3, 1
  ret <8 x i16> %5
}

define arm_aapcs_vfpcc <4 x i32> @test_vshlcq_m_s32(<4 x i32> %a, i32* nocapture %b, i16 zeroext %p) {
; CHECK-LABEL: test_vshlcq_m_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    ldr r1, [r0]
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vshlct q0, r1, #9
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %b, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
  %3 = tail call { i32, <4 x i32> } @llvm.arm.mve.vshlc.predicated.v4i32.v4i1(<4 x i32> %a, i32 %0, i32 9, <4 x i1> %2)
  %4 = extractvalue { i32, <4 x i32> } %3, 0
  store i32 %4, i32* %b, align 4
  %5 = extractvalue { i32, <4 x i32> } %3, 1
  ret <4 x i32> %5
}

define arm_aapcs_vfpcc <16 x i8> @test_vshlcq_m_u8(<16 x i8> %a, i32* nocapture %b, i16 zeroext %p) {
; CHECK-LABEL: test_vshlcq_m_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    ldr r1, [r0]
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vshlct q0, r1, #21
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %b, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
  %3 = tail call { i32, <16 x i8> } @llvm.arm.mve.vshlc.predicated.v16i8.v16i1(<16 x i8> %a, i32 %0, i32 21, <16 x i1> %2)
  %4 = extractvalue { i32, <16 x i8> } %3, 0
  store i32 %4, i32* %b, align 4
  %5 = extractvalue { i32, <16 x i8> } %3, 1
  ret <16 x i8> %5
}

define arm_aapcs_vfpcc <8 x i16> @test_vshlcq_m_u16(<8 x i16> %a, i32* nocapture %b, i16 zeroext %p) {
; CHECK-LABEL: test_vshlcq_m_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    ldr r1, [r0]
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vshlct q0, r1, #24
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %b, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
  %3 = tail call { i32, <8 x i16> } @llvm.arm.mve.vshlc.predicated.v8i16.v8i1(<8 x i16> %a, i32 %0, i32 24, <8 x i1> %2)
  %4 = extractvalue { i32, <8 x i16> } %3, 0
  store i32 %4, i32* %b, align 4
  %5 = extractvalue { i32, <8 x i16> } %3, 1
  ret <8 x i16> %5
}

define arm_aapcs_vfpcc <4 x i32> @test_vshlcq_m_u32(<4 x i32> %a, i32* nocapture %b, i16 zeroext %p) {
; CHECK-LABEL: test_vshlcq_m_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    ldr r1, [r0]
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vshlct q0, r1, #26
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %b, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
  %3 = tail call { i32, <4 x i32> } @llvm.arm.mve.vshlc.predicated.v4i32.v4i1(<4 x i32> %a, i32 %0, i32 26, <4 x i1> %2)
  %4 = extractvalue { i32, <4 x i32> } %3, 0
  store i32 %4, i32* %b, align 4
  %5 = extractvalue { i32, <4 x i32> } %3, 1
  ret <4 x i32> %5
}

declare { i32, <16 x i8> } @llvm.arm.mve.vshlc.v16i8(<16 x i8>, i32, i32)
declare { i32, <8 x i16> } @llvm.arm.mve.vshlc.v8i16(<8 x i16>, i32, i32)
declare { i32, <4 x i32> } @llvm.arm.mve.vshlc.v4i32(<4 x i32>, i32, i32)
declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32)
declare { i32, <16 x i8> } @llvm.arm.mve.vshlc.predicated.v16i8.v16i1(<16 x i8>, i32, i32, <16 x i1>)
declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
declare { i32, <8 x i16> } @llvm.arm.mve.vshlc.predicated.v8i16.v8i1(<8 x i16>, i32, i32, <8 x i1>)
declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
declare { i32, <4 x i32> } @llvm.arm.mve.vshlc.predicated.v4i32.v4i1(<4 x i32>, i32, i32, <4 x i1>)