; Test AArch64 NEON dot-product (udot/sdot) instruction selection, both for the
; direct intrinsics (plain, lane, and laneq forms) and for scalar/vector
; reduction patterns that should be recognized and lowered to dot instructions.
; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+dotprod < %s | FileCheck %s
; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=cortex-a65 < %s | FileCheck %s
; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=cortex-a65ae < %s | FileCheck %s
; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=neoverse-e1 < %s | FileCheck %s
; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=neoverse-n1 < %s | FileCheck %s
; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=neoverse-n2 < %s | FileCheck %s

declare <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>)
declare <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>)
declare <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>)
declare <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>)

; Direct intrinsic calls should select the plain (non-indexed) dot instructions.

define <2 x i32> @test_vdot_u32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #0 {
entry:
; CHECK-LABEL: test_vdot_u32:
; CHECK: udot v0.2s, v1.8b, v2.8b
  %vdot1.i = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #2
  ret <2 x i32> %vdot1.i
}

define <4 x i32> @test_vdotq_u32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #0 {
entry:
; CHECK-LABEL: test_vdotq_u32:
; CHECK: udot v0.4s, v1.16b, v2.16b
  %vdot1.i = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #2
  ret <4 x i32> %vdot1.i
}

define <2 x i32> @test_vdot_s32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #0 {
entry:
; CHECK-LABEL: test_vdot_s32:
; CHECK: sdot v0.2s, v1.8b, v2.8b
  %vdot1.i = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #2
  ret <2 x i32> %vdot1.i
}

define <4 x i32> @test_vdotq_s32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #0 {
entry:
; CHECK-LABEL: test_vdotq_s32:
; CHECK: sdot v0.4s, v1.16b, v2.16b
  %vdot1.i = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #2
  ret <4 x i32> %vdot1.i
}

; A dot intrinsic whose third operand is a lane splat (bitcast to <N x i32>,
; shufflevector broadcast of one element, bitcast back to bytes) should select
; the indexed forms, e.g. udot v0.2s, v1.8b, v2.4b[1].

define <2 x i32> @test_vdot_lane_u32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) {
entry:
; CHECK-LABEL: test_vdot_lane_u32:
; CHECK: udot v0.2s, v1.8b, v2.4b[1]
  %.cast = bitcast <8 x i8> %c to <2 x i32>
  %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
  %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
  %vdot1.i = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %.cast5) #2
  ret <2 x i32> %vdot1.i
}

define <4 x i32> @test_vdotq_lane_u32(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c) {
entry:
; CHECK-LABEL: test_vdotq_lane_u32:
; CHECK: udot v0.4s, v1.16b, v2.4b[1]
  %.cast = bitcast <8 x i8> %c to <2 x i32>
  %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
  %vdot1.i = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %.cast3) #2
  ret <4 x i32> %vdot1.i
}

define <2 x i32> @test_vdot_laneq_u32(<2 x i32> %a, <8 x i8> %b, <16 x i8> %c) {
entry:
; CHECK-LABEL: test_vdot_laneq_u32:
; CHECK: udot v0.2s, v1.8b, v2.4b[1]
  %.cast = bitcast <16 x i8> %c to <4 x i32>
  %shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
  %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
  %vdot1.i = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %.cast5) #2
  ret <2 x i32> %vdot1.i
}

define <4 x i32> @test_vdotq_laneq_u32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) {
entry:
; CHECK-LABEL: test_vdotq_laneq_u32:
; CHECK: udot v0.4s, v1.16b, v2.4b[1]
  %.cast = bitcast <16 x i8> %c to <4 x i32>
  %shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
  %vdot1.i = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %.cast3) #2
  ret <4 x i32> %vdot1.i
}

define <2 x i32> @test_vdot_lane_s32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) {
entry:
; CHECK-LABEL: test_vdot_lane_s32:
; CHECK: sdot v0.2s, v1.8b, v2.4b[1]
  %.cast = bitcast <8 x i8> %c to <2 x i32>
  %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
  %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
  %vdot1.i = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %.cast5) #2
  ret <2 x i32> %vdot1.i
}

define <4 x i32> @test_vdotq_lane_s32(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c) {
entry:
; CHECK-LABEL: test_vdotq_lane_s32:
; CHECK: sdot v0.4s, v1.16b, v2.4b[1]
  %.cast = bitcast <8 x i8> %c to <2 x i32>
  %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
  %vdot1.i = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %.cast3) #2
  ret <4 x i32> %vdot1.i
}

define <2 x i32> @test_vdot_laneq_s32(<2 x i32> %a, <8 x i8> %b, <16 x i8> %c) {
entry:
; CHECK-LABEL: test_vdot_laneq_s32:
; CHECK: sdot v0.2s, v1.8b, v2.4b[1]
  %.cast = bitcast <16 x i8> %c to <4 x i32>
  %shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
  %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
  %vdot1.i = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %.cast5) #2
  ret <2 x i32> %vdot1.i
}

define <4 x i32> @test_vdotq_laneq_s32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) {
entry:
; CHECK-LABEL: test_vdotq_laneq_s32:
; CHECK: sdot v0.4s, v1.16b, v2.4b[1]
  %.cast = bitcast <16 x i8> %c to <4 x i32>
  %shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
  %vdot1.i = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %.cast3) #2
  ret <4 x i32> %vdot1.i
}

; Scalarized 4-element dot products (load/extend/multiply/accumulate of four
; consecutive bytes) should still be matched to a dot instruction.

define fastcc void @test_sdot_v4i8(i8* noalias nocapture %0, i8* noalias nocapture readonly %1, i8* noalias nocapture readonly %2) {
entry:
; CHECK-LABEL: test_sdot_v4i8:
; CHECK: sdot {{v[0-9]+}}.2s, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %3 = bitcast i8* %0 to i32*
  %4 = load i8, i8* %1, align 1
  %5 = sext i8 %4 to i32
  %6 = load i8, i8* %2, align 1
  %7 = sext i8 %6 to i32
  %8 = mul nsw i32 %7, %5
  %9 = getelementptr inbounds i8, i8* %1, i64 1
  %10 = load i8, i8* %9, align 1
  %11 = sext i8 %10 to i32
  %12 = getelementptr inbounds i8, i8* %2, i64 1
  %13 = load i8, i8* %12, align 1
  %14 = sext i8 %13 to i32
  %15 = mul nsw i32 %14, %11
  %16 = add nsw i32 %15, %8
  %17 = getelementptr inbounds i8, i8* %1, i64 2
  %18 = load i8, i8* %17, align 1
  %19 = sext i8 %18 to i32
  %20 = getelementptr inbounds i8, i8* %2, i64 2
  %21 = load i8, i8* %20, align 1
  %22 = sext i8 %21 to i32
  %23 = mul nsw i32 %22, %19
  %24 = add nsw i32 %23, %16
  %25 = getelementptr inbounds i8, i8* %1, i64 3
  %26 = load i8, i8* %25, align 1
  %27 = sext i8 %26 to i32
  %28 = getelementptr inbounds i8, i8* %2, i64 3
  %29 = load i8, i8* %28, align 1
  %30 = sext i8 %29 to i32
  %31 = mul nsw i32 %30, %27
  %32 = add nsw i32 %31, %24
  store i32 %32, i32* %3, align 64
  ret void
}

define fastcc void @test_udot_v4i8(i8* noalias nocapture %0, i8* noalias nocapture readonly %1, i8* noalias nocapture readonly %2) {
entry:
; CHECK-LABEL: test_udot_v4i8:
; CHECK: udot {{v[0-9]+}}.2s, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %3 = bitcast i8* %0 to i32*
  %4 = load i8, i8* %1, align 1
  %5 = zext i8 %4 to i32
  %6 = load i8, i8* %2, align 1
  %7 = zext i8 %6 to i32
  %8 = mul nsw i32 %7, %5
  %9 = getelementptr inbounds i8, i8* %1, i64 1
  %10 = load i8, i8* %9, align 1
  %11 = zext i8 %10 to i32
  %12 = getelementptr inbounds i8, i8* %2, i64 1
  %13 = load i8, i8* %12, align 1
  %14 = zext i8 %13 to i32
  %15 = mul nsw i32 %14, %11
  %16 = add nsw i32 %15, %8
  %17 = getelementptr inbounds i8, i8* %1, i64 2
  %18 = load i8, i8* %17, align 1
  %19 = zext i8 %18 to i32
  %20 = getelementptr inbounds i8, i8* %2, i64 2
  %21 = load i8, i8* %20, align 1
  %22 = zext i8 %21 to i32
  %23 = mul nsw i32 %22, %19
  %24 = add nsw i32 %23, %16
  %25 = getelementptr inbounds i8, i8* %1, i64 3
  %26 = load i8, i8* %25, align 1
  %27 = zext i8 %26 to i32
  %28 = getelementptr inbounds i8, i8* %2, i64 3
  %29 = load i8, i8* %28, align 1
  %30 = zext i8 %29 to i32
  %31 = mul nsw i32 %30, %27
  %32 = add nsw i32 %31, %24
  store i32 %32, i32* %3, align 64
  ret void
}

; Vector extend + multiply + reduce-add patterns should lower to dot
; instructions rather than widening multiplies plus an add reduction.

declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)

define i32 @test_udot_v8i8(i8* nocapture readonly %a, i8* nocapture readonly %b) {
entry:
; CHECK-LABEL: test_udot_v8i8:
; CHECK: udot {{v[0-9]+}}.2s, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %0 = bitcast i8* %a to <8 x i8>*
  %1 = load <8 x i8>, <8 x i8>* %0
  %2 = zext <8 x i8> %1 to <8 x i32>
  %3 = bitcast i8* %b to <8 x i8>*
  %4 = load <8 x i8>, <8 x i8>* %3
  %5 = zext <8 x i8> %4 to <8 x i32>
  %6 = mul nuw nsw <8 x i32> %5, %2
  %7 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %6)
  ret i32 %7
}

define i32 @test_sdot_v8i8(i8* nocapture readonly %a, i8* nocapture readonly %b) {
entry:
; CHECK-LABEL: test_sdot_v8i8:
; CHECK: sdot {{v[0-9]+}}.2s, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %0 = bitcast i8* %a to <8 x i8>*
  %1 = load <8 x i8>, <8 x i8>* %0
  %2 = sext <8 x i8> %1 to <8 x i32>
  %3 = bitcast i8* %b to <8 x i8>*
  %4 = load <8 x i8>, <8 x i8>* %3
  %5 = sext <8 x i8> %4 to <8 x i32>
  %6 = mul nsw <8 x i32> %5, %2
  %7 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %6)
  ret i32 %7
}

declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)

define i32 @test_udot_v16i8(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %sum) {
entry:
; CHECK-LABEL: test_udot_v16i8:
; CHECK: udot {{v[0-9]+}}.4s, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
  %0 = bitcast i8* %a to <16 x i8>*
  %1 = load <16 x i8>, <16 x i8>* %0
  %2 = zext <16 x i8> %1 to <16 x i32>
  %3 = bitcast i8* %b to <16 x i8>*
  %4 = load <16 x i8>, <16 x i8>* %3
  %5 = zext <16 x i8> %4 to <16 x i32>
  %6 = mul nuw nsw <16 x i32> %5, %2
  %7 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %6)
  %op.extra = add i32 %7, %sum
  ret i32 %op.extra
}

; A plain extend + reduce-add (no multiply) should use a dot against a
; splat-of-1 vector.

define i32 @test_udot_v16i8_2(i8* nocapture readonly %a1) {
; CHECK-LABEL: test_udot_v16i8_2:
; CHECK: movi {{v[0-9]+}}.16b, #1
; CHECK: movi {{v[0-9]+}}.2d, #0000000000000000
; CHECK: udot {{v[0-9]+}}.4s, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
; CHECK: addv s0, {{v[0-9]+}}.4s
entry:
  %0 = bitcast i8* %a1 to <16 x i8>*
  %1 = load <16 x i8>, <16 x i8>* %0
  %2 = zext <16 x i8> %1 to <16 x i32>
  %3 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2)
  ret i32 %3
}

define i32 @test_sdot_v16i8(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %sum) {
entry:
; CHECK-LABEL: test_sdot_v16i8:
; CHECK: sdot {{v[0-9]+}}.4s, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
  %0 = bitcast i8* %a to <16 x i8>*
  %1 = load <16 x i8>, <16 x i8>* %0
  %2 = sext <16 x i8> %1 to <16 x i32>
  %3 = bitcast i8* %b to <16 x i8>*
  %4 = load <16 x i8>, <16 x i8>* %3
  %5 = sext <16 x i8> %4 to <16 x i32>
  %6 = mul nsw <16 x i32> %5, %2
  %7 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %6)
  %op.extra = add nsw i32 %7, %sum
  ret i32 %op.extra
}

define i32 @test_sdot_v16i8_2(i8* nocapture readonly %a1) {
; CHECK-LABEL: test_sdot_v16i8_2:
; CHECK: movi {{v[0-9]+}}.16b, #1
; CHECK: movi {{v[0-9]+}}.2d, #0000000000000000
; CHECK: sdot {{v[0-9]+}}.4s, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
; CHECK: addv s0, {{v[0-9]+}}.4s
entry:
  %0 = bitcast i8* %a1 to <16 x i8>*
  %1 = load <16 x i8>, <16 x i8>* %0
  %2 = sext <16 x i8> %1 to <16 x i32>
  %3 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2)
  ret i32 %3
}