; RUN: llc -mattr=+neon < %s | FileCheck %s

; Code-generation test for the ARM NEON saturating doubling multiply family
; on a Thumb-2 ELF target.  Each function calls one llvm.arm.neon.* intrinsic
; and the FileCheck directives verify that the matching NEON mnemonic shows up
; in the generated assembly:
;
;   llvm.arm.neon.vqdmulh   -> vqdmulh.s16 / vqdmulh.s32
;   llvm.arm.neon.vqrdmulh  -> vqrdmulh.s16 / vqrdmulh.s32
;   llvm.arm.neon.vqdmull   -> vqdmull.s16 / vqdmull.s32
;   llvm.arm.neon.vqdmlal   -> vqdmlal.s16 / vqdmlal.s32
;   llvm.arm.neon.vqdmlsl   -> vqdmlsl.s16 / vqdmlsl.s32
;
; The test_*_lanes* functions pass a shufflevector splat of lane 1 as the last
; multiplicand and use the arm_aapcs_vfpcc calling convention so the register
; assignment is fixed; their directives expect the lane-indexed operand form
; (for example "d2[1]") with exact registers.

target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
target triple = "thumbv7-elf"

; ---------------------------------------------------------------------------
; vqdmulh: 64-bit and 128-bit vectors, operands loaded from memory.
; ---------------------------------------------------------------------------

define <4 x i16> @vqdmulhs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vqdmulhs16:
;CHECK: vqdmulh.s16
  %tmp1 = load <4 x i16>* %A
  %tmp2 = load <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}

define <2 x i32> @vqdmulhs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vqdmulhs32:
;CHECK: vqdmulh.s32
  %tmp1 = load <2 x i32>* %A
  %tmp2 = load <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}

define <8 x i16> @vqdmulhQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vqdmulhQs16:
;CHECK: vqdmulh.s16
  %tmp1 = load <8 x i16>* %A
  %tmp2 = load <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i16> %tmp3
}

define <4 x i32> @vqdmulhQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vqdmulhQs32:
;CHECK: vqdmulh.s32
  %tmp1 = load <4 x i32>* %A
  %tmp2 = load <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i32> %tmp3
}

; vqdmulh with the second operand splatted from lane 1 of a 64-bit vector.

define arm_aapcs_vfpcc <8 x i16> @test_vqdmulhQ_lanes16(<8 x i16> %arg0_int16x8_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
entry:
; CHECK: test_vqdmulhQ_lanes16:
; CHECK: vqdmulh.s16 q0, q0, d2[1]
  %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ; <<8 x i16>> [#uses=1]
  %1 = tail call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %arg0_int16x8_t, <8 x i16> %0) ; <<8 x i16>> [#uses=1]
  ret <8 x i16> %1
}

define arm_aapcs_vfpcc <4 x i32> @test_vqdmulhQ_lanes32(<4 x i32> %arg0_int32x4_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
entry:
; CHECK: test_vqdmulhQ_lanes32:
; CHECK: vqdmulh.s32 q0, q0, d2[1]
  %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i32>> [#uses=1]
  %1 = tail call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %0) ; <<4 x i32>> [#uses=1]
  ret <4 x i32> %1
}

define arm_aapcs_vfpcc <4 x i16> @test_vqdmulh_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
entry:
; CHECK: test_vqdmulh_lanes16:
; CHECK: vqdmulh.s16 d0, d0, d1[1]
  %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
  %1 = tail call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i16>> [#uses=1]
  ret <4 x i16> %1
}

define arm_aapcs_vfpcc <2 x i32> @test_vqdmulh_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
entry:
; CHECK: test_vqdmulh_lanes32:
; CHECK: vqdmulh.s32 d0, d0, d1[1]
  %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
  %1 = tail call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i32>> [#uses=1]
  ret <2 x i32> %1
}

declare <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

declare <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

; ---------------------------------------------------------------------------
; vqrdmulh: same shapes as the vqdmulh tests above.
; ---------------------------------------------------------------------------

define <4 x i16> @vqrdmulhs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vqrdmulhs16:
;CHECK: vqrdmulh.s16
  %tmp1 = load <4 x i16>* %A
  %tmp2 = load <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}

define <2 x i32> @vqrdmulhs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vqrdmulhs32:
;CHECK: vqrdmulh.s32
  %tmp1 = load <2 x i32>* %A
  %tmp2 = load <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}

define <8 x i16> @vqrdmulhQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vqrdmulhQs16:
;CHECK: vqrdmulh.s16
  %tmp1 = load <8 x i16>* %A
  %tmp2 = load <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i16> %tmp3
}

define <4 x i32> @vqrdmulhQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vqrdmulhQs32:
;CHECK: vqrdmulh.s32
  %tmp1 = load <4 x i32>* %A
  %tmp2 = load <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i32> %tmp3
}

; vqrdmulh with the second operand splatted from lane 1 of a 64-bit vector.

define arm_aapcs_vfpcc <8 x i16> @test_vqRdmulhQ_lanes16(<8 x i16> %arg0_int16x8_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
entry:
; CHECK: test_vqRdmulhQ_lanes16:
; CHECK: vqrdmulh.s16 q0, q0, d2[1]
  %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ; <<8 x i16>> [#uses=1]
  %1 = tail call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %arg0_int16x8_t, <8 x i16> %0) ; <<8 x i16>> [#uses=1]
  ret <8 x i16> %1
}

define arm_aapcs_vfpcc <4 x i32> @test_vqRdmulhQ_lanes32(<4 x i32> %arg0_int32x4_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
entry:
; CHECK: test_vqRdmulhQ_lanes32:
; CHECK: vqrdmulh.s32 q0, q0, d2[1]
  %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i32>> [#uses=1]
  %1 = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %0) ; <<4 x i32>> [#uses=1]
  ret <4 x i32> %1
}

define arm_aapcs_vfpcc <4 x i16> @test_vqRdmulh_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
entry:
; CHECK: test_vqRdmulh_lanes16:
; CHECK: vqrdmulh.s16 d0, d0, d1[1]
  %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
  %1 = tail call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i16>> [#uses=1]
  ret <4 x i16> %1
}

define arm_aapcs_vfpcc <2 x i32> @test_vqRdmulh_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
entry:
; CHECK: test_vqRdmulh_lanes32:
; CHECK: vqrdmulh.s32 d0, d0, d1[1]
  %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
  %1 = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i32>> [#uses=1]
  ret <2 x i32> %1
}

declare <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

declare <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

; ---------------------------------------------------------------------------
; vqdmull: two 64-bit inputs, result elements twice as wide (128-bit result).
; ---------------------------------------------------------------------------

define <4 x i32> @vqdmulls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vqdmulls16:
;CHECK: vqdmull.s16
  %tmp1 = load <4 x i16>* %A
  %tmp2 = load <4 x i16>* %B
  %tmp3 = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i32> %tmp3
}

define <2 x i64> @vqdmulls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vqdmulls32:
;CHECK: vqdmull.s32
  %tmp1 = load <2 x i32>* %A
  %tmp2 = load <2 x i32>* %B
  %tmp3 = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i64> %tmp3
}

; vqdmull with the second operand splatted from lane 1.

define arm_aapcs_vfpcc <4 x i32> @test_vqdmull_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
entry:
; CHECK: test_vqdmull_lanes16:
; CHECK: vqdmull.s16 q0, d0, d1[1]
  %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
  %1 = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
  ret <4 x i32> %1
}

define arm_aapcs_vfpcc <2 x i64> @test_vqdmull_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
entry:
; CHECK: test_vqdmull_lanes32:
; CHECK: vqdmull.s32 q0, d0, d1[1]
  %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
  %1 = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %1
}

declare <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone

; ---------------------------------------------------------------------------
; vqdmlal: one 128-bit operand plus two 64-bit operands, 128-bit result.
; ---------------------------------------------------------------------------

define <4 x i32> @vqdmlals16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
;CHECK: vqdmlals16:
;CHECK: vqdmlal.s16
  %tmp1 = load <4 x i32>* %A
  %tmp2 = load <4 x i16>* %B
  %tmp3 = load <4 x i16>* %C
  %tmp4 = call <4 x i32> @llvm.arm.neon.vqdmlal.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
  ret <4 x i32> %tmp4
}

define <2 x i64> @vqdmlals32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
;CHECK: vqdmlals32:
;CHECK: vqdmlal.s32
  %tmp1 = load <2 x i64>* %A
  %tmp2 = load <2 x i32>* %B
  %tmp3 = load <2 x i32>* %C
  %tmp4 = call <2 x i64> @llvm.arm.neon.vqdmlal.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
  ret <2 x i64> %tmp4
}

; vqdmlal with the last operand splatted from lane 1.

define arm_aapcs_vfpcc <4 x i32> @test_vqdmlal_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
entry:
; CHECK: test_vqdmlal_lanes16:
; CHECK: vqdmlal.s16 q0, d2, d3[1]
  %0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
  %1 = tail call <4 x i32> @llvm.arm.neon.vqdmlal.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
  ret <4 x i32> %1
}

define arm_aapcs_vfpcc <2 x i64> @test_vqdmlal_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
entry:
; CHECK: test_vqdmlal_lanes32:
; CHECK: vqdmlal.s32 q0, d2, d3[1]
  %0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
  %1 = tail call <2 x i64> @llvm.arm.neon.vqdmlal.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %1
}

declare <4 x i32> @llvm.arm.neon.vqdmlal.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqdmlal.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone

; ---------------------------------------------------------------------------
; vqdmlsl: same operand shapes as the vqdmlal tests above.
; ---------------------------------------------------------------------------

define <4 x i32> @vqdmlsls16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
;CHECK: vqdmlsls16:
;CHECK: vqdmlsl.s16
  %tmp1 = load <4 x i32>* %A
  %tmp2 = load <4 x i16>* %B
  %tmp3 = load <4 x i16>* %C
  %tmp4 = call <4 x i32> @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
  ret <4 x i32> %tmp4
}

define <2 x i64> @vqdmlsls32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
;CHECK: vqdmlsls32:
;CHECK: vqdmlsl.s32
  %tmp1 = load <2 x i64>* %A
  %tmp2 = load <2 x i32>* %B
  %tmp3 = load <2 x i32>* %C
  %tmp4 = call <2 x i64> @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
  ret <2 x i64> %tmp4
}

; vqdmlsl with the last operand splatted from lane 1.

define arm_aapcs_vfpcc <4 x i32> @test_vqdmlsl_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
entry:
; CHECK: test_vqdmlsl_lanes16:
; CHECK: vqdmlsl.s16 q0, d2, d3[1]
  %0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
  %1 = tail call <4 x i32> @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
  ret <4 x i32> %1
}

define arm_aapcs_vfpcc <2 x i64> @test_vqdmlsl_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
entry:
; CHECK: test_vqdmlsl_lanes32:
; CHECK: vqdmlsl.s32 q0, d2, d3[1]
  %0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
  %1 = tail call <2 x i64> @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %1
}

declare <4 x i32> @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone