; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
; NB: this tests vcnt, vclz, and vcls

; --- vcnt: population count (llvm.ctpop) should select vcnt.8 ---

define <8 x i8> @vcnt8(<8 x i8>* %A) nounwind {
;CHECK-LABEL: vcnt8:
;CHECK: vcnt.8 {{d[0-9]+}}, {{d[0-9]+}}
	%tmp1 = load <8 x i8>, <8 x i8>* %A
	%tmp2 = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %tmp1)
	ret <8 x i8> %tmp2
}

define <16 x i8> @vcntQ8(<16 x i8>* %A) nounwind {
;CHECK-LABEL: vcntQ8:
;CHECK: vcnt.8 {{q[0-9]+}}, {{q[0-9]+}}
	%tmp1 = load <16 x i8>, <16 x i8>* %A
	%tmp2 = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %tmp1)
	ret <16 x i8> %tmp2
}

declare <8 x i8> @llvm.ctpop.v8i8(<8 x i8>) nounwind readnone
declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>) nounwind readnone

; --- vclz: count leading zeros (llvm.ctlz, is_zero_undef = 0) ---
; NOTE(review): the 64-bit cases check only the label (no single vclz.i64
; pattern is asserted) — intentional in the original test, kept as-is.

define <8 x i8> @vclz8(<8 x i8>* %A) nounwind {
;CHECK-LABEL: vclz8:
;CHECK: vclz.i8 {{d[0-9]+}}, {{d[0-9]+}}
	%tmp1 = load <8 x i8>, <8 x i8>* %A
	%tmp2 = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %tmp1, i1 0)
	ret <8 x i8> %tmp2
}

define <4 x i16> @vclz16(<4 x i16>* %A) nounwind {
;CHECK-LABEL: vclz16:
;CHECK: vclz.i16 {{d[0-9]+}}, {{d[0-9]+}}
	%tmp1 = load <4 x i16>, <4 x i16>* %A
	%tmp2 = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %tmp1, i1 0)
	ret <4 x i16> %tmp2
}

define <2 x i32> @vclz32(<2 x i32>* %A) nounwind {
;CHECK-LABEL: vclz32:
;CHECK: vclz.i32 {{d[0-9]+}}, {{d[0-9]+}}
	%tmp1 = load <2 x i32>, <2 x i32>* %A
	%tmp2 = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %tmp1, i1 0)
	ret <2 x i32> %tmp2
}

define <1 x i64> @vclz64(<1 x i64>* %A) nounwind {
;CHECK-LABEL: vclz64:
	%tmp1 = load <1 x i64>, <1 x i64>* %A
	%tmp2 = call <1 x i64> @llvm.ctlz.v1i64(<1 x i64> %tmp1, i1 0)
	ret <1 x i64> %tmp2
}

define <16 x i8> @vclzQ8(<16 x i8>* %A) nounwind {
;CHECK-LABEL: vclzQ8:
;CHECK: vclz.i8 {{q[0-9]+}}, {{q[0-9]+}}
	%tmp1 = load <16 x i8>, <16 x i8>* %A
	%tmp2 = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %tmp1, i1 0)
	ret <16 x i8> %tmp2
}

define <8 x i16> @vclzQ16(<8 x i16>* %A) nounwind {
;CHECK-LABEL: vclzQ16:
;CHECK: vclz.i16 {{q[0-9]+}}, {{q[0-9]+}}
	%tmp1 = load <8 x i16>, <8 x i16>* %A
	%tmp2 = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %tmp1, i1 0)
	ret <8 x i16> %tmp2
}

define <4 x i32> @vclzQ32(<4 x i32>* %A) nounwind {
;CHECK-LABEL: vclzQ32:
;CHECK: vclz.i32 {{q[0-9]+}}, {{q[0-9]+}}
	%tmp1 = load <4 x i32>, <4 x i32>* %A
	%tmp2 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %tmp1, i1 0)
	ret <4 x i32> %tmp2
}

define <2 x i64> @vclzQ64(<2 x i64>* %A) nounwind {
;CHECK-LABEL: vclzQ64:
	%tmp1 = load <2 x i64>, <2 x i64>* %A
	%tmp2 = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %tmp1, i1 0)
	ret <2 x i64> %tmp2
}

; --- vclz "b" variants: llvm.ctlz with is_zero_undef = 1 must still select vclz ---

define <8 x i8> @vclz8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: vclz8b:
;CHECK: vclz.i8 {{d[0-9]+}}, {{d[0-9]+}}
	%tmp1 = load <8 x i8>, <8 x i8>* %A
	%tmp2 = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %tmp1, i1 1)
	ret <8 x i8> %tmp2
}

define <4 x i16> @vclz16b(<4 x i16>* %A) nounwind {
;CHECK-LABEL: vclz16b:
;CHECK: vclz.i16 {{d[0-9]+}}, {{d[0-9]+}}
	%tmp1 = load <4 x i16>, <4 x i16>* %A
	%tmp2 = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %tmp1, i1 1)
	ret <4 x i16> %tmp2
}

define <2 x i32> @vclz32b(<2 x i32>* %A) nounwind {
;CHECK-LABEL: vclz32b:
;CHECK: vclz.i32 {{d[0-9]+}}, {{d[0-9]+}}
	%tmp1 = load <2 x i32>, <2 x i32>* %A
	%tmp2 = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %tmp1, i1 1)
	ret <2 x i32> %tmp2
}

define <1 x i64> @vclz64b(<1 x i64>* %A) nounwind {
;CHECK-LABEL: vclz64b:
	%tmp1 = load <1 x i64>, <1 x i64>* %A
	%tmp2 = call <1 x i64> @llvm.ctlz.v1i64(<1 x i64> %tmp1, i1 1)
	ret <1 x i64> %tmp2
}

define <16 x i8> @vclzQ8b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: vclzQ8b:
;CHECK: vclz.i8 {{q[0-9]+}}, {{q[0-9]+}}
	%tmp1 = load <16 x i8>, <16 x i8>* %A
	%tmp2 = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %tmp1, i1 1)
	ret <16 x i8> %tmp2
}

define <8 x i16> @vclzQ16b(<8 x i16>* %A) nounwind {
;CHECK-LABEL: vclzQ16b:
;CHECK: vclz.i16 {{q[0-9]+}}, {{q[0-9]+}}
	%tmp1 = load <8 x i16>, <8 x i16>* %A
	%tmp2 = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %tmp1, i1 1)
	ret <8 x i16> %tmp2
}

define <4 x i32> @vclzQ32b(<4 x i32>* %A) nounwind {
;CHECK-LABEL: vclzQ32b:
;CHECK: vclz.i32 {{q[0-9]+}}, {{q[0-9]+}}
	%tmp1 = load <4 x i32>, <4 x i32>* %A
	%tmp2 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %tmp1, i1 1)
	ret <4 x i32> %tmp2
}

define <2 x i64> @vclzQ64b(<2 x i64>* %A) nounwind {
;CHECK-LABEL: vclzQ64b:
	%tmp1 = load <2 x i64>, <2 x i64>* %A
	%tmp2 = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %tmp1, i1 1)
	ret <2 x i64> %tmp2
}

declare <8 x i8> @llvm.ctlz.v8i8(<8 x i8>, i1) nounwind readnone
declare <4 x i16> @llvm.ctlz.v4i16(<4 x i16>, i1) nounwind readnone
declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
declare <1 x i64> @llvm.ctlz.v1i64(<1 x i64>, i1) nounwind readnone

declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8>, i1) nounwind readnone
declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1) nounwind readnone
declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone
declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) nounwind readnone

; --- vcls: count leading sign bits (llvm.arm.neon.vcls) ---

define <8 x i8> @vclss8(<8 x i8>* %A) nounwind {
;CHECK-LABEL: vclss8:
;CHECK: vcls.s8
	%tmp1 = load <8 x i8>, <8 x i8>* %A
	%tmp2 = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %tmp1)
	ret <8 x i8> %tmp2
}

define <4 x i16> @vclss16(<4 x i16>* %A) nounwind {
;CHECK-LABEL: vclss16:
;CHECK: vcls.s16
	%tmp1 = load <4 x i16>, <4 x i16>* %A
	%tmp2 = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %tmp1)
	ret <4 x i16> %tmp2
}

define <2 x i32> @vclss32(<2 x i32>* %A) nounwind {
;CHECK-LABEL: vclss32:
;CHECK: vcls.s32
	%tmp1 = load <2 x i32>, <2 x i32>* %A
	%tmp2 = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %tmp1)
	ret <2 x i32> %tmp2
}

define <16 x i8> @vclsQs8(<16 x i8>* %A) nounwind {
;CHECK-LABEL: vclsQs8:
;CHECK: vcls.s8
	%tmp1 = load <16 x i8>, <16 x i8>* %A
	%tmp2 = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %tmp1)
	ret <16 x i8> %tmp2
}

define <8 x i16> @vclsQs16(<8 x i16>* %A) nounwind {
;CHECK-LABEL: vclsQs16:
;CHECK: vcls.s16
	%tmp1 = load <8 x i16>, <8 x i16>* %A
	%tmp2 = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %tmp1)
	ret <8 x i16> %tmp2
}

define <4 x i32> @vclsQs32(<4 x i32>* %A) nounwind {
;CHECK-LABEL: vclsQs32:
;CHECK: vcls.s32
	%tmp1 = load <4 x i32>, <4 x i32>* %A
	%tmp2 = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %tmp1)
	ret <4 x i32> %tmp2
}

declare <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32>) nounwind readnone

declare <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32>) nounwind readnone