; RUN: llc < %s -verify-machineinstrs -march=arm64 -aarch64-neon-syntax=generic | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-V8a
; RUN: llc < %s -verify-machineinstrs -march=arm64 -mattr=+v8.1a -aarch64-neon-syntax=generic | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-V81a
; RUN: llc < %s -verify-machineinstrs -march=arm64 -mattr=+v8.1a -aarch64-neon-syntax=apple | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-V81a-apple

; Code generation tests for the rounding doubling multiply accumulate /
; subtract instructions (sqrdmlah / sqrdmlsh).
;
; Every test builds the same IR shape: a call to the sqrdmulh intrinsic whose
; result feeds a saturating add (sqadd) or saturating subtract (sqsub).
;   * Without +v8.1a the expectation is that a separate sqrdmulh is emitted.
;   * With +v8.1a the expectation is a single fused sqrdmlah / sqrdmlsh, checked
;     once in generic NEON syntax and once in Apple NEON syntax.
;
; The plain CHECK prefix is enabled on every FileCheck invocation so that the
; per-function label directives are active and each instruction check is
; confined to the function it belongs to.

declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>)
declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>)
declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>)
declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
declare i32 @llvm.aarch64.neon.sqrdmulh.i32(i32, i32)
declare i16 @llvm.aarch64.neon.sqrdmulh.i16(i16, i16)

declare <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16>, <4 x i16>)
declare <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16>, <8 x i16>)
declare <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32>, <2 x i32>)
declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32)
declare i16 @llvm.aarch64.neon.sqadd.i16(i16, i16)

declare <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16>, <4 x i16>)
declare <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16>, <8 x i16>)
declare <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32>, <2 x i32>)
declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32)
declare i16 @llvm.aarch64.neon.sqsub.i16(i16, i16)

;-----------------------------------------------------------------------------
; RDMA Vector
; test for SIMDThreeSameVectorSQRDMLxHTiedHS

define <4 x i16> @test_sqrdmlah_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
; CHECK-LABEL: test_sqrdmlah_v4i16:
  %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs)
  %retval = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %prod)
; CHECK-V8a: sqrdmulh v1.4h, v1.4h, v2.4h
; CHECK-V81a: sqrdmlah v0.4h, v1.4h, v2.4h
; CHECK-V81a-apple: sqrdmlah.4h v0, v1, v2
  ret <4 x i16> %retval
}

define <8 x i16> @test_sqrdmlah_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
; CHECK-LABEL: test_sqrdmlah_v8i16:
  %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
  %retval = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %prod)
; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.8h
; CHECK-V81a: sqrdmlah v0.8h, v1.8h, v2.8h
; CHECK-V81a-apple: sqrdmlah.8h v0, v1, v2
  ret <8 x i16> %retval
}

define <2 x i32> @test_sqrdmlah_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
; CHECK-LABEL: test_sqrdmlah_v2i32:
  %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
  %retval = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %prod)
; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.2s
; CHECK-V81a: sqrdmlah v0.2s, v1.2s, v2.2s
; CHECK-V81a-apple: sqrdmlah.2s v0, v1, v2
  ret <2 x i32> %retval
}

define <4 x i32> @test_sqrdmlah_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
; CHECK-LABEL: test_sqrdmlah_v4i32:
  %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
  %retval = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.4s
; CHECK-V81a: sqrdmlah v0.4s, v1.4s, v2.4s
; CHECK-V81a-apple: sqrdmlah.4s v0, v1, v2
  ret <4 x i32> %retval
}

define <4 x i16> @test_sqrdmlsh_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
; CHECK-LABEL: test_sqrdmlsh_v4i16:
  %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs)
  %retval = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %prod)
; CHECK-V8a: sqrdmulh v1.4h, v1.4h, v2.4h
; CHECK-V81a: sqrdmlsh v0.4h, v1.4h, v2.4h
; CHECK-V81a-apple: sqrdmlsh.4h v0, v1, v2
  ret <4 x i16> %retval
}

define <8 x i16> @test_sqrdmlsh_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
; CHECK-LABEL: test_sqrdmlsh_v8i16:
  %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
  %retval = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %prod)
; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.8h
; CHECK-V81a: sqrdmlsh v0.8h, v1.8h, v2.8h
; CHECK-V81a-apple: sqrdmlsh.8h v0, v1, v2
  ret <8 x i16> %retval
}

define <2 x i32> @test_sqrdmlsh_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
; CHECK-LABEL: test_sqrdmlsh_v2i32:
  %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
  %retval = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %prod)
; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.2s
; CHECK-V81a: sqrdmlsh v0.2s, v1.2s, v2.2s
; CHECK-V81a-apple: sqrdmlsh.2s v0, v1, v2
  ret <2 x i32> %retval
}

define <4 x i32> @test_sqrdmlsh_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
; CHECK-LABEL: test_sqrdmlsh_v4i32:
  %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
  %retval = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %prod)
; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.4s
; CHECK-V81a: sqrdmlsh v0.4s, v1.4s, v2.4s
; CHECK-V81a-apple: sqrdmlsh.4s v0, v1, v2
  ret <4 x i32> %retval
}

;-----------------------------------------------------------------------------
; RDMA Vector, by element
; tests for vXiYY_indexed in SIMDIndexedSQRDMLxHSDTied

define <4 x i16> @test_sqrdmlah_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) {
; CHECK-LABEL: test_sqrdmlah_lane_s16:
entry:
  ; Splat lane 3 of %v so the multiply can use the by-element form.
  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
  %retval = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %prod)
; CHECK-V8a: sqrdmulh v1.4h, v1.4h, v2.h[3]
; CHECK-V81a: sqrdmlah v0.4h, v1.4h, v2.h[3]
; CHECK-V81a-apple: sqrdmlah.4h v0, v1, v2[3]
  ret <4 x i16> %retval
}

define <8 x i16> @test_sqrdmlahq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) {
; CHECK-LABEL: test_sqrdmlahq_lane_s16:
entry:
  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
  %retval = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %prod)
; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.h[2]
; CHECK-V81a: sqrdmlah v0.8h, v1.8h, v2.h[2]
; CHECK-V81a-apple: sqrdmlah.8h v0, v1, v2[2]
  ret <8 x i16> %retval
}

define <2 x i32> @test_sqrdmlah_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) {
; CHECK-LABEL: test_sqrdmlah_lane_s32:
entry:
  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
  %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
  %retval = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %prod)
; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.s[1]
; CHECK-V81a: sqrdmlah v0.2s, v1.2s, v2.s[1]
; CHECK-V81a-apple: sqrdmlah.2s v0, v1, v2[1]
  ret <2 x i32> %retval
}

define <4 x i32> @test_sqrdmlahq_lane_s32(<4 x i32> %acc, <4 x i32> %x, <4 x i32> %v) {
; CHECK-LABEL: test_sqrdmlahq_lane_s32:
entry:
  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
  %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
  %retval = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.s[0]
; CHECK-V81a: sqrdmlah v0.4s, v1.4s, v2.s[0]
; CHECK-V81a-apple: sqrdmlah.4s v0, v1, v2[0]
  ret <4 x i32> %retval
}

define <4 x i16> @test_sqrdmlsh_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) {
; CHECK-LABEL: test_sqrdmlsh_lane_s16:
entry:
  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
  %retval = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %prod)
; CHECK-V8a: sqrdmulh v1.4h, v1.4h, v2.h[3]
; CHECK-V81a: sqrdmlsh v0.4h, v1.4h, v2.h[3]
; CHECK-V81a-apple: sqrdmlsh.4h v0, v1, v2[3]
  ret <4 x i16> %retval
}

define <8 x i16> @test_sqrdmlshq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) {
; CHECK-LABEL: test_sqrdmlshq_lane_s16:
entry:
  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
  %retval = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %prod)
; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.h[2]
; CHECK-V81a: sqrdmlsh v0.8h, v1.8h, v2.h[2]
; CHECK-V81a-apple: sqrdmlsh.8h v0, v1, v2[2]
  ret <8 x i16> %retval
}

define <2 x i32> @test_sqrdmlsh_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) {
; CHECK-LABEL: test_sqrdmlsh_lane_s32:
entry:
  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
  %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
  %retval = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %prod)
; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.s[1]
; CHECK-V81a: sqrdmlsh v0.2s, v1.2s, v2.s[1]
; CHECK-V81a-apple: sqrdmlsh.2s v0, v1, v2[1]
  ret <2 x i32> %retval
}

define <4 x i32> @test_sqrdmlshq_lane_s32(<4 x i32> %acc, <4 x i32> %x, <4 x i32> %v) {
; CHECK-LABEL: test_sqrdmlshq_lane_s32:
entry:
  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
  %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
  %retval = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %prod)
; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.s[0]
; CHECK-V81a: sqrdmlsh v0.4s, v1.4s, v2.s[0]
; CHECK-V81a-apple: sqrdmlsh.4s v0, v1, v2[0]
  ret <4 x i32> %retval
}

;-----------------------------------------------------------------------------
; RDMA Vector, by element, extracted
; i16 tests are for vXi16_indexed in SIMDIndexedSQRDMLxHSDTied, with IR in ACLE style
; i32 tests are for "def : Pat" in SIMDIndexedSQRDMLxHSDTied

define i16 @test_sqrdmlah_extracted_lane_s16(i16 %acc, <4 x i16> %x, <4 x i16> %v) {
; CHECK-LABEL: test_sqrdmlah_extracted_lane_s16:
entry:
  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
  ; The scalar accumulator is placed in lane 0 of a vector and only lane 0 of
  ; the saturating add result is returned.
  %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
  %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod)
  %retval = extractelement <4 x i16> %retval_vec, i64 0
; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, v0.4h, v1.h[1]
; CHECK-V81a: sqrdmlah {{v[2-9]+}}.4h, v0.4h, v1.h[1]
; CHECK-V81a-apple: sqrdmlah.4h {{v[2-9]+}}, v0, v1[1]
  ret i16 %retval
}

define i16 @test_sqrdmlahq_extracted_lane_s16(i16 %acc, <8 x i16> %x, <8 x i16> %v) {
; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s16:
entry:
  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
  %acc_vec = insertelement <8 x i16> undef, i16 %acc, i64 0
  %retval_vec = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod)
  %retval = extractelement <8 x i16> %retval_vec, i64 0
; CHECK-V8a: sqrdmulh {{v[0-9]+}}.8h, v0.8h, v1.h[1]
; CHECK-V81a: sqrdmlah {{v[2-9]+}}.8h, v0.8h, v1.h[1]
; CHECK-V81a-apple: sqrdmlah.8h {{v[2-9]+}}, v0, v1[1]
  ret i16 %retval
}

define i32 @test_sqrdmlah_extracted_lane_s32(i32 %acc, <2 x i32> %x, <2 x i32> %v) {
; CHECK-LABEL: test_sqrdmlah_extracted_lane_s32:
entry:
  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
  %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
  ; Here the product lane is extracted first and accumulated with the scalar
  ; form of the saturating add.
  %extract = extractelement <2 x i32> %prod, i64 0
  %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %extract)
; CHECK-V8a: sqrdmulh v0.2s, v0.2s, v1.s[0]
; CHECK-V81a: sqrdmlah v2.2s, v0.2s, v1.s[0]
; CHECK-V81a-apple: sqrdmlah.2s v2, v0, v1[0]
  ret i32 %retval
}

define i32 @test_sqrdmlahq_extracted_lane_s32(i32 %acc, <4 x i32> %x, <4 x i32> %v) {
; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s32:
entry:
  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
  %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
  %extract = extractelement <4 x i32> %prod, i64 0
  %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %extract)
; CHECK-V8a: sqrdmulh v0.4s, v0.4s, v1.s[0]
; CHECK-V81a: sqrdmlah v2.4s, v0.4s, v1.s[0]
; CHECK-V81a-apple: sqrdmlah.4s v2, v0, v1[0]
  ret i32 %retval
}

define i16 @test_sqrdmlsh_extracted_lane_s16(i16 %acc, <4 x i16> %x, <4 x i16> %v) {
; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s16:
entry:
  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
  %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
  %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod)
  %retval = extractelement <4 x i16> %retval_vec, i64 0
; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, v0.4h, v1.h[1]
; CHECK-V81a: sqrdmlsh {{v[2-9]+}}.4h, v0.4h, v1.h[1]
; CHECK-V81a-apple: sqrdmlsh.4h {{v[2-9]+}}, v0, v1[1]
  ret i16 %retval
}

define i16 @test_sqrdmlshq_extracted_lane_s16(i16 %acc, <8 x i16> %x, <8 x i16> %v) {
; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s16:
entry:
  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
  %acc_vec = insertelement <8 x i16> undef, i16 %acc, i64 0
  %retval_vec = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod)
  %retval = extractelement <8 x i16> %retval_vec, i64 0
; CHECK-V8a: sqrdmulh {{v[0-9]+}}.8h, v0.8h, v1.h[1]
; CHECK-V81a: sqrdmlsh {{v[2-9]+}}.8h, v0.8h, v1.h[1]
; CHECK-V81a-apple: sqrdmlsh.8h {{v[2-9]+}}, v0, v1[1]
  ret i16 %retval
}

define i32 @test_sqrdmlsh_extracted_lane_s32(i32 %acc, <2 x i32> %x, <2 x i32> %v) {
; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s32:
entry:
  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
  %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
  %extract = extractelement <2 x i32> %prod, i64 0
  %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %extract)
; CHECK-V8a: sqrdmulh v0.2s, v0.2s, v1.s[0]
; CHECK-V81a: sqrdmlsh v2.2s, v0.2s, v1.s[0]
; CHECK-V81a-apple: sqrdmlsh.2s v2, v0, v1[0]
  ret i32 %retval
}

define i32 @test_sqrdmlshq_extracted_lane_s32(i32 %acc, <4 x i32> %x, <4 x i32> %v) {
; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s32:
entry:
  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
  %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
  %extract = extractelement <4 x i32> %prod, i64 0
  %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %extract)
; CHECK-V8a: sqrdmulh v0.4s, v0.4s, v1.s[0]
; CHECK-V81a: sqrdmlsh v2.4s, v0.4s, v1.s[0]
; CHECK-V81a-apple: sqrdmlsh.4s v2, v0, v1[0]
  ret i32 %retval
}

;-----------------------------------------------------------------------------
; RDMA Scalar
; test for "def : Pat" near SIMDThreeScalarHSTied in AArch64InstrInfo.td

define i16 @test_sqrdmlah_v1i16(i16 %acc, i16 %x, i16 %y) {
; CHECK-LABEL: test_sqrdmlah_v1i16:
  ; All three scalar operands are placed in lane 0 of otherwise undefined
  ; vectors; only lane 0 of the result is returned.
  %x_vec = insertelement <4 x i16> undef, i16 %x, i64 0
  %y_vec = insertelement <4 x i16> undef, i16 %y, i64 0
  %prod_vec = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x_vec, <4 x i16> %y_vec)
  %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
  %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod_vec)
  %retval = extractelement <4 x i16> %retval_vec, i64 0
; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
; CHECK-V81a: sqrdmlah {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
; CHECK-V81a-apple: sqrdmlah.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
  ret i16 %retval
}

define i32 @test_sqrdmlah_v1i32(i32 %acc, i32 %x, i32 %y) {
; CHECK-LABEL: test_sqrdmlah_v1i32:
  %x_vec = insertelement <4 x i32> undef, i32 %x, i64 0
  %y_vec = insertelement <4 x i32> undef, i32 %y, i64 0
  %prod_vec = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x_vec, <4 x i32> %y_vec)
  %acc_vec = insertelement <4 x i32> undef, i32 %acc, i64 0
  %retval_vec = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc_vec, <4 x i32> %prod_vec)
  %retval = extractelement <4 x i32> %retval_vec, i64 0
; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK-V81a: sqrdmlah {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK-V81a-apple: sqrdmlah.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
  ret i32 %retval
}

define i16 @test_sqrdmlsh_v1i16(i16 %acc, i16 %x, i16 %y) {
; CHECK-LABEL: test_sqrdmlsh_v1i16:
  %x_vec = insertelement <4 x i16> undef, i16 %x, i64 0
  %y_vec = insertelement <4 x i16> undef, i16 %y, i64 0
  %prod_vec = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x_vec, <4 x i16> %y_vec)
  %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
  %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod_vec)
  %retval = extractelement <4 x i16> %retval_vec, i64 0
; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
; CHECK-V81a: sqrdmlsh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
; CHECK-V81a-apple: sqrdmlsh.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
  ret i16 %retval
}

define i32 @test_sqrdmlsh_v1i32(i32 %acc, i32 %x, i32 %y) {
; CHECK-LABEL: test_sqrdmlsh_v1i32:
  %x_vec = insertelement <4 x i32> undef, i32 %x, i64 0
  %y_vec = insertelement <4 x i32> undef, i32 %y, i64 0
  %prod_vec = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x_vec, <4 x i32> %y_vec)
  %acc_vec = insertelement <4 x i32> undef, i32 %acc, i64 0
  %retval_vec = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc_vec, <4 x i32> %prod_vec)
  %retval = extractelement <4 x i32> %retval_vec, i64 0
; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK-V81a: sqrdmlsh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK-V81a-apple: sqrdmlsh.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
  ret i32 %retval
}

define i32 @test_sqrdmlah_i32(i32 %acc, i32 %mhs, i32 %rhs) {
; CHECK-LABEL: test_sqrdmlah_i32:
  %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %rhs)
  %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %prod)
; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
; CHECK-V81a: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
; CHECK-V81a-apple: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
  ret i32 %retval
}

define i32 @test_sqrdmlsh_i32(i32 %acc, i32 %mhs, i32 %rhs) {
; CHECK-LABEL: test_sqrdmlsh_i32:
  %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %rhs)
  %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %prod)
; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
; CHECK-V81a: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
; CHECK-V81a-apple: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
  ret i32 %retval
}

;-----------------------------------------------------------------------------
; RDMA Scalar, by element
; i16 tests are performed via tests in above chapter, with IR in ACLE style
; i32 tests are for i32_indexed in SIMDIndexedSQRDMLxHSDTied

define i16 @test_sqrdmlah_extract_i16(i16 %acc, i16 %x, <4 x i16> %y_vec) {
; CHECK-LABEL: test_sqrdmlah_extract_i16:
  %shuffle = shufflevector <4 x i16> %y_vec, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %x_vec = insertelement <4 x i16> undef, i16 %x, i64 0
  %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x_vec, <4 x i16> %shuffle)
  %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
  %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod)
  %retval = extractelement <4 x i16> %retval_vec, i32 0
; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, v0.h[1]
; CHECK-V81a: sqrdmlah {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, v0.h[1]
; CHECK-V81a-apple: sqrdmlah.4h {{v[0-9]+}}, {{v[0-9]+}}, v0[1]
  ret i16 %retval
}

define i32 @test_sqrdmlah_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) {
; CHECK-LABEL: test_sqrdmlah_extract_i32:
  %extract = extractelement <4 x i32> %rhs, i32 3
  %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %extract)
  %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %prod)
; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
; CHECK-V81a: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
; CHECK-V81a-apple: sqrdmlah.s {{s[0-9]+}}, {{s[0-9]+}}, v0[3]
  ret i32 %retval
}

define i16 @test_sqrdmlshq_extract_i16(i16 %acc, i16 %x, <8 x i16> %y_vec) {
; CHECK-LABEL: test_sqrdmlshq_extract_i16:
  %shuffle = shufflevector <8 x i16> %y_vec, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %x_vec = insertelement <8 x i16> undef, i16 %x, i64 0
  %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x_vec, <8 x i16> %shuffle)
  %acc_vec = insertelement <8 x i16> undef, i16 %acc, i64 0
  %retval_vec = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod)
  %retval = extractelement <8 x i16> %retval_vec, i32 0
; CHECK-V8a: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, v0.h[1]
; CHECK-V81a: sqrdmlsh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, v0.h[1]
; CHECK-V81a-apple: sqrdmlsh.8h {{v[0-9]+}}, {{v[0-9]+}}, v0[1]
  ret i16 %retval
}

define i32 @test_sqrdmlsh_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) {
; CHECK-LABEL: test_sqrdmlsh_extract_i32:
  %extract = extractelement <4 x i32> %rhs, i32 3
  %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %extract)
  %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %prod)
; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
; CHECK-V81a: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
; CHECK-V81a-apple: sqrdmlsh.s {{s[0-9]+}}, {{s[0-9]+}}, v0[3]
  ret i32 %retval
}