; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=arm -mattr=+neon | FileCheck %s

;Check the (default) alignment.
define void @vst1lanei8(i8* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vst1lanei8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vst1.8 {d16[3]}, [r0]
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %B
  %tmp2 = extractelement <8 x i8> %tmp1, i32 3
  store i8 %tmp2, i8* %A, align 8
  ret void
}

;Check for a post-increment updating store.
define void @vst1lanei8_update(i8** %ptr, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vst1lanei8_update:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vst1.8 {d16[3]}, [r2]!
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    mov pc, lr
  %A = load i8*, i8** %ptr
  %tmp1 = load <8 x i8>, <8 x i8>* %B
  %tmp2 = extractelement <8 x i8> %tmp1, i32 3
  store i8 %tmp2, i8* %A, align 8
  %tmp3 = getelementptr i8, i8* %A, i32 1
  store i8* %tmp3, i8** %ptr
  ret void
}

;Check the alignment value. Max for this instruction is 16 bits:
define void @vst1lanei16(i16* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: vst1lanei16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vst1.16 {d16[2]}, [r0:16]
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i16>, <4 x i16>* %B
  %tmp2 = extractelement <4 x i16> %tmp1, i32 2
  store i16 %tmp2, i16* %A, align 8
  ret void
}

;Check the alignment value. Max for this instruction is 32 bits:
define void @vst1lanei32(i32* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: vst1lanei32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vst1.32 {d16[1]}, [r0:32]
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <2 x i32>, <2 x i32>* %B
  %tmp2 = extractelement <2 x i32> %tmp1, i32 1
  store i32 %tmp2, i32* %A, align 8
  ret void
}

define void @vst1lanef(float* %A, <2 x float>* %B) nounwind {
; CHECK-LABEL: vst1lanef:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vst1.32 {d16[1]}, [r0:32]
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <2 x float>, <2 x float>* %B
  %tmp2 = extractelement <2 x float> %tmp1, i32 1
  store float %tmp2, float* %A
  ret void
}

; // Can use scalar load. No need to use vectors.
; // CHE-CK: vst1.8 {d17[1]}, [r0]
define void @vst1laneQi8(i8* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: vst1laneQi8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vst1.8 {d17[1]}, [r0]
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <16 x i8>, <16 x i8>* %B
  %tmp2 = extractelement <16 x i8> %tmp1, i32 9
  store i8 %tmp2, i8* %A, align 8
  ret void
}

define void @vst1laneQi16(i16* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vst1laneQi16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vst1.16 {d17[1]}, [r0:16]
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i16>, <8 x i16>* %B
  %tmp2 = extractelement <8 x i16> %tmp1, i32 5
  store i16 %tmp2, i16* %A, align 8
  ret void
}

; // Can use scalar load.
; // CHE-CK: vst1.32 {d17[1]}, [r0:32]
define void @vst1laneQi32(i32* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: vst1laneQi32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    ldr r1, [r1, #12]
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i32>, <4 x i32>* %B
  %tmp2 = extractelement <4 x i32> %tmp1, i32 3
  store i32 %tmp2, i32* %A, align 8
  ret void
}

;Check for a post-increment updating store.
; // Can use scalar load. No need to use vectors.
; // CHE-CK: vst1.32 {d17[1]}, [r1:32]!
define void @vst1laneQi32_update(i32** %ptr, <4 x i32>* %B) nounwind {
; CHECK-LABEL: vst1laneQi32_update:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    ldr r1, [r1, #12]
; CHECK-NEXT:    str r1, [r2], #4
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    mov pc, lr
  %A = load i32*, i32** %ptr
  %tmp1 = load <4 x i32>, <4 x i32>* %B
  %tmp2 = extractelement <4 x i32> %tmp1, i32 3
  store i32 %tmp2, i32* %A, align 8
  %tmp3 = getelementptr i32, i32* %A, i32 1
  store i32* %tmp3, i32** %ptr
  ret void
}

; // Can use scalar load. No need to use vectors.
; // CHE-CK: vst1.32 {d17[1]}, [r0]
define void @vst1laneQf(float* %A, <4 x float>* %B) nounwind {
; CHECK-LABEL: vst1laneQf:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    ldr r1, [r1, #12]
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x float>, <4 x float>* %B
  %tmp2 = extractelement <4 x float> %tmp1, i32 3
  store float %tmp2, float* %A
  ret void
}

;Check the alignment value. Max for this instruction is 16 bits:
define void @vst2lanei8(i8* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vst2lanei8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vorr d17, d16, d16
; CHECK-NEXT:    vst2.8 {d16[1], d17[1]}, [r0:16]
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %B
  call void @llvm.arm.neon.vst2lane.p0i8.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 4)
  ret void
}

;Check the alignment value. Max for this instruction is 32 bits:
define void @vst2lanei16(i16* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: vst2lanei16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vorr d17, d16, d16
; CHECK-NEXT:    vst2.16 {d16[1], d17[1]}, [r0:32]
; CHECK-NEXT:    mov pc, lr
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <4 x i16>, <4 x i16>* %B
  call void @llvm.arm.neon.vst2lane.p0i8.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
  ret void
}

;Check for a post-increment updating store with register increment.
define void @vst2lanei16_update(i16** %ptr, <4 x i16>* %B, i32 %inc) nounwind {
; CHECK-LABEL: vst2lanei16_update:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    lsl r1, r2, #1
; CHECK-NEXT:    ldr r3, [r0]
; CHECK-NEXT:    vorr d17, d16, d16
; CHECK-NEXT:    vst2.16 {d16[1], d17[1]}, [r3], r1
; CHECK-NEXT:    str r3, [r0]
; CHECK-NEXT:    mov pc, lr
  %A = load i16*, i16** %ptr
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <4 x i16>, <4 x i16>* %B
  call void @llvm.arm.neon.vst2lane.p0i8.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 2)
  %tmp2 = getelementptr i16, i16* %A, i32 %inc
  store i16* %tmp2, i16** %ptr
  ret void
}

define void @vst2lanei32(i32* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: vst2lanei32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vorr d17, d16, d16
; CHECK-NEXT:    vst2.32 {d16[1], d17[1]}, [r0]
; CHECK-NEXT:    mov pc, lr
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <2 x i32>, <2 x i32>* %B
  call void @llvm.arm.neon.vst2lane.p0i8.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
  ret void
}

define void @vst2lanef(float* %A, <2 x float>* %B) nounwind {
; CHECK-LABEL: vst2lanef:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vorr d17, d16, d16
; CHECK-NEXT:    vst2.32 {d16[1], d17[1]}, [r0]
; CHECK-NEXT:    mov pc, lr
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <2 x float>, <2 x float>* %B
  call void @llvm.arm.neon.vst2lane.p0i8.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
  ret void
}

;Check the (default) alignment.
define void @vst2laneQi16(i16* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vst2laneQi16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vorr q9, q8, q8
; CHECK-NEXT:    vst2.16 {d17[1], d19[1]}, [r0]
; CHECK-NEXT:    mov pc, lr
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <8 x i16>, <8 x i16>* %B
  call void @llvm.arm.neon.vst2lane.p0i8.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 5, i32 1)
  ret void
}

;Check the alignment value. Max for this instruction is 64 bits:
define void @vst2laneQi32(i32* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: vst2laneQi32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vorr q9, q8, q8
; CHECK-NEXT:    vst2.32 {d17[0], d19[0]}, [r0:64]
; CHECK-NEXT:    mov pc, lr
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <4 x i32>, <4 x i32>* %B
  call void @llvm.arm.neon.vst2lane.p0i8.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 16)
  ret void
}

define void @vst2laneQf(float* %A, <4 x float>* %B) nounwind {
; CHECK-LABEL: vst2laneQf:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vorr q9, q8, q8
; CHECK-NEXT:    vst2.32 {d17[1], d19[1]}, [r0]
; CHECK-NEXT:    mov pc, lr
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <4 x float>, <4 x float>* %B
  call void @llvm.arm.neon.vst2lane.p0i8.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, i32 3, i32 1)
  ret void
}

declare void @llvm.arm.neon.vst2lane.p0i8.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind
declare void @llvm.arm.neon.vst2lane.p0i8.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32) nounwind
declare void @llvm.arm.neon.vst2lane.p0i8.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind
declare void @llvm.arm.neon.vst2lane.p0i8.v2f32(i8*, <2 x float>, <2 x float>, i32, i32) nounwind

declare void @llvm.arm.neon.vst2lane.p0i8.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32) nounwind
declare void @llvm.arm.neon.vst2lane.p0i8.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32) nounwind
declare void @llvm.arm.neon.vst2lane.p0i8.v4f32(i8*, <4 x float>, <4 x float>, i32, i32) nounwind

define void @vst3lanei8(i8* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vst3lanei8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vorr d17, d16, d16
; CHECK-NEXT:    vorr d18, d16, d16
; CHECK-NEXT:    vst3.8 {d16[1], d17[1], d18[1]}, [r0]
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %B
  call void @llvm.arm.neon.vst3lane.p0i8.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1)
  ret void
}

;Check the (default) alignment value. VST3 does not support alignment.
define void @vst3lanei16(i16* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: vst3lanei16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vorr d17, d16, d16
; CHECK-NEXT:    vorr d18, d16, d16
; CHECK-NEXT:    vst3.16 {d16[1], d17[1], d18[1]}, [r0]
; CHECK-NEXT:    mov pc, lr
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <4 x i16>, <4 x i16>* %B
  call void @llvm.arm.neon.vst3lane.p0i8.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
  ret void
}

define void @vst3lanei32(i32* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: vst3lanei32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vorr d17, d16, d16
; CHECK-NEXT:    vorr d18, d16, d16
; CHECK-NEXT:    vst3.32 {d16[1], d17[1], d18[1]}, [r0]
; CHECK-NEXT:    mov pc, lr
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <2 x i32>, <2 x i32>* %B
  call void @llvm.arm.neon.vst3lane.p0i8.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
  ret void
}

define void @vst3lanef(float* %A, <2 x float>* %B) nounwind {
; CHECK-LABEL: vst3lanef:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vorr d17, d16, d16
; CHECK-NEXT:    vorr d18, d16, d16
; CHECK-NEXT:    vst3.32 {d16[1], d17[1], d18[1]}, [r0]
; CHECK-NEXT:    mov pc, lr
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <2 x float>, <2 x float>* %B
  call void @llvm.arm.neon.vst3lane.p0i8.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
  ret void
}

define void @vst3laneQi16(i16* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vst3laneQi16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vorr q9, q8, q8
; CHECK-NEXT:    vorr q10, q8, q8
; CHECK-NEXT:    vst3.16 {d17[2], d19[2], d21[2]}, [r0]
; CHECK-NEXT:    mov pc, lr
;Check the (default) alignment value. VST3 does not support alignment.
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <8 x i16>, <8 x i16>* %B
  call void @llvm.arm.neon.vst3lane.p0i8.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 6, i32 8)
  ret void
}

define void @vst3laneQi32(i32* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: vst3laneQi32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vorr q9, q8, q8
; CHECK-NEXT:    vorr q10, q8, q8
; CHECK-NEXT:    vst3.32 {d16[0], d18[0], d20[0]}, [r0]
; CHECK-NEXT:    mov pc, lr
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <4 x i32>, <4 x i32>* %B
  call void @llvm.arm.neon.vst3lane.p0i8.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 0, i32 1)
  ret void
}

;Check for a post-increment updating store.
define void @vst3laneQi32_update(i32** %ptr, <4 x i32>* %B) nounwind {
; CHECK-LABEL: vst3laneQi32_update:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vorr q9, q8, q8
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vorr q10, q8, q8
; CHECK-NEXT:    vst3.32 {d16[0], d18[0], d20[0]}, [r2]!
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    mov pc, lr
  %A = load i32*, i32** %ptr
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <4 x i32>, <4 x i32>* %B
  call void @llvm.arm.neon.vst3lane.p0i8.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 0, i32 1)
  %tmp2 = getelementptr i32, i32* %A, i32 3
  store i32* %tmp2, i32** %ptr
  ret void
}

define void @vst3laneQf(float* %A, <4 x float>* %B) nounwind {
; CHECK-LABEL: vst3laneQf:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vorr q9, q8, q8
; CHECK-NEXT:    vorr q10, q8, q8
; CHECK-NEXT:    vst3.32 {d16[1], d18[1], d20[1]}, [r0]
; CHECK-NEXT:    mov pc, lr
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <4 x float>, <4 x float>* %B
  call void @llvm.arm.neon.vst3lane.p0i8.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
  ret void
}

declare void @llvm.arm.neon.vst3lane.p0i8.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind
declare void @llvm.arm.neon.vst3lane.p0i8.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind
declare void @llvm.arm.neon.vst3lane.p0i8.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind
declare void @llvm.arm.neon.vst3lane.p0i8.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind

declare void @llvm.arm.neon.vst3lane.p0i8.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind
declare void @llvm.arm.neon.vst3lane.p0i8.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind
declare void @llvm.arm.neon.vst3lane.p0i8.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind


;Check the alignment value. Max for this instruction is 32 bits:
define void @vst4lanei8(i8* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vst4lanei8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vorr d17, d16, d16
; CHECK-NEXT:    vorr d18, d16, d16
; CHECK-NEXT:    vorr d19, d16, d16
; CHECK-NEXT:    vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0:32]
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %B
  call void @llvm.arm.neon.vst4lane.p0i8.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
  ret void
}

;Check for a post-increment updating store.
define void @vst4lanei8_update(i8** %ptr, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vst4lanei8_update:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vorr d17, d16, d16
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vorr d18, d16, d16
; CHECK-NEXT:    vorr d19, d16, d16
; CHECK-NEXT:    vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r2:32]!
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    mov pc, lr
  %A = load i8*, i8** %ptr
  %tmp1 = load <8 x i8>, <8 x i8>* %B
  call void @llvm.arm.neon.vst4lane.p0i8.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
  %tmp2 = getelementptr i8, i8* %A, i32 4
  store i8* %tmp2, i8** %ptr
  ret void
}

define void @vst4lanei16(i16* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: vst4lanei16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vorr d17, d16, d16
; CHECK-NEXT:    vorr d18, d16, d16
; CHECK-NEXT:    vorr d19, d16, d16
; CHECK-NEXT:    vst4.16 {d16[1], d17[1], d18[1], d19[1]}, [r0]
; CHECK-NEXT:    mov pc, lr
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <4 x i16>, <4 x i16>* %B
  call void @llvm.arm.neon.vst4lane.p0i8.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 1)
  ret void
}

;Check the alignment value. Max for this instruction is 128 bits:
define void @vst4lanei32(i32* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: vst4lanei32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vorr d17, d16, d16
; CHECK-NEXT:    vorr d18, d16, d16
; CHECK-NEXT:    vorr d19, d16, d16
; CHECK-NEXT:    vst4.32 {d16[1], d17[1], d18[1], d19[1]}, [r0:128]
; CHECK-NEXT:    mov pc, lr
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <2 x i32>, <2 x i32>* %B
  call void @llvm.arm.neon.vst4lane.p0i8.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 16)
  ret void
}

define void @vst4lanef(float* %A, <2 x float>* %B) nounwind {
; CHECK-LABEL: vst4lanef:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vorr d17, d16, d16
; CHECK-NEXT:    vorr d18, d16, d16
; CHECK-NEXT:    vorr d19, d16, d16
; CHECK-NEXT:    vst4.32 {d16[1], d17[1], d18[1], d19[1]}, [r0]
; CHECK-NEXT:    mov pc, lr
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <2 x float>, <2 x float>* %B
  call void @llvm.arm.neon.vst4lane.p0i8.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
  ret void
}

;Check the alignment value. Max for this instruction is 64 bits:
define void @vst4laneQi16(i16* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vst4laneQi16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vorr q9, q8, q8
; CHECK-NEXT:    vorr q10, q8, q8
; CHECK-NEXT:    vorr q11, q8, q8
; CHECK-NEXT:    vst4.16 {d17[3], d19[3], d21[3], d23[3]}, [r0:64]
; CHECK-NEXT:    mov pc, lr
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <8 x i16>, <8 x i16>* %B
  call void @llvm.arm.neon.vst4lane.p0i8.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 7, i32 16)
  ret void
}

;Check the (default) alignment.
define void @vst4laneQi32(i32* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: vst4laneQi32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vorr q9, q8, q8
; CHECK-NEXT:    vorr q10, q8, q8
; CHECK-NEXT:    vorr q11, q8, q8
; CHECK-NEXT:    vst4.32 {d17[0], d19[0], d21[0], d23[0]}, [r0]
; CHECK-NEXT:    mov pc, lr
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <4 x i32>, <4 x i32>* %B
  call void @llvm.arm.neon.vst4lane.p0i8.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 1)
  ret void
}

define void @vst4laneQf(float* %A, <4 x float>* %B) nounwind {
; CHECK-LABEL: vst4laneQf:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vorr q9, q8, q8
; CHECK-NEXT:    vorr q10, q8, q8
; CHECK-NEXT:    vorr q11, q8, q8
; CHECK-NEXT:    vst4.32 {d16[1], d18[1], d20[1], d22[1]}, [r0]
; CHECK-NEXT:    mov pc, lr
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <4 x float>, <4 x float>* %B
  call void @llvm.arm.neon.vst4lane.p0i8.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
  ret void
}

; Make sure this doesn't crash; PR10258
define <8 x i16> @variable_insertelement(<8 x i16> %a, i16 %b, i32 %c) nounwind readnone {
; CHECK-LABEL: variable_insertelement:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    push {r11, lr}
; CHECK-NEXT:    mov r11, sp
; CHECK-NEXT:    sub sp, sp, #24
; CHECK-NEXT:    bic sp, sp, #15
; CHECK-NEXT:    ldr lr, [r11, #12]
; CHECK-NEXT:    vmov d17, r2, r3
; CHECK-NEXT:    vmov d16, r0, r1
; CHECK-NEXT:    mov r1, sp
; CHECK-NEXT:    and r0, lr, #7
; CHECK-NEXT:    mov r2, r1
; CHECK-NEXT:    ldrh r12, [r11, #8]
; CHECK-NEXT:    lsl r0, r0, #1
; CHECK-NEXT:    vst1.64 {d16, d17}, [r2:128], r0
; CHECK-NEXT:    strh r12, [r2]
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1:128]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov sp, r11
; CHECK-NEXT:    pop {r11, lr}
; CHECK-NEXT:    mov pc, lr
  %r = insertelement <8 x i16> %a, i16 %b, i32 %c
  ret <8 x i16> %r
}

declare void @llvm.arm.neon.vst4lane.p0i8.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind
declare void @llvm.arm.neon.vst4lane.p0i8.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind
declare void @llvm.arm.neon.vst4lane.p0i8.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind
declare void @llvm.arm.neon.vst4lane.p0i8.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind

declare void @llvm.arm.neon.vst4lane.p0i8.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind
declare void @llvm.arm.neon.vst4lane.p0i8.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind
declare void @llvm.arm.neon.vst4lane.p0i8.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind