; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=arm-eabi -float-abi=soft -mattr=+neon | FileCheck %s --check-prefixes=CHECK,DEFAULT
; RUN: llc < %s -mtriple=arm-eabi -float-abi=soft -mattr=+neon -regalloc=basic | FileCheck %s --check-prefixes=CHECK,BASIC

;Check the (default) alignment value.
define <8 x i8> @vld1lanei8(i8* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vld1lanei8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vld1.8 {d16[3]}, [r0]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %B
  %tmp2 = load i8, i8* %A, align 8
  %tmp3 = insertelement <8 x i8> %tmp1, i8 %tmp2, i32 3
  ret <8 x i8> %tmp3
}

;Check the alignment value. Max for this instruction is 16 bits:
define <4 x i16> @vld1lanei16(i16* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: vld1lanei16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vld1.16 {d16[2]}, [r0:16]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i16>, <4 x i16>* %B
  %tmp2 = load i16, i16* %A, align 8
  %tmp3 = insertelement <4 x i16> %tmp1, i16 %tmp2, i32 2
  ret <4 x i16> %tmp3
}

;Check the alignment value. Max for this instruction is 32 bits:
define <2 x i32> @vld1lanei32(i32* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: vld1lanei32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vld1.32 {d16[1]}, [r0:32]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <2 x i32>, <2 x i32>* %B
  %tmp2 = load i32, i32* %A, align 8
  %tmp3 = insertelement <2 x i32> %tmp1, i32 %tmp2, i32 1
  ret <2 x i32> %tmp3
}

;Check the alignment value. Legal values are none or :32.
define <2 x i32> @vld1lanei32a32(i32* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: vld1lanei32a32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vld1.32 {d16[1]}, [r0:32]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <2 x i32>, <2 x i32>* %B
  %tmp2 = load i32, i32* %A, align 4
  %tmp3 = insertelement <2 x i32> %tmp1, i32 %tmp2, i32 1
  ret <2 x i32> %tmp3
}

define <2 x float> @vld1lanef(float* %A, <2 x float>* %B) nounwind {
; CHECK-LABEL: vld1lanef:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vld1.32 {d16[1]}, [r0:32]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <2 x float>, <2 x float>* %B
  %tmp2 = load float, float* %A, align 4
  %tmp3 = insertelement <2 x float> %tmp1, float %tmp2, i32 1
  ret <2 x float> %tmp3
}

define <16 x i8> @vld1laneQi8(i8* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: vld1laneQi8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.8 {d17[1]}, [r0]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <16 x i8>, <16 x i8>* %B
  %tmp2 = load i8, i8* %A, align 8
  %tmp3 = insertelement <16 x i8> %tmp1, i8 %tmp2, i32 9
  ret <16 x i8> %tmp3
}

define <8 x i16> @vld1laneQi16(i16* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vld1laneQi16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.16 {d17[1]}, [r0:16]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i16>, <8 x i16>* %B
  %tmp2 = load i16, i16* %A, align 8
  %tmp3 = insertelement <8 x i16> %tmp1, i16 %tmp2, i32 5
  ret <8 x i16> %tmp3
}

define <4 x i32> @vld1laneQi32(i32* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: vld1laneQi32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.32 {d17[1]}, [r0:32]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i32>, <4 x i32>* %B
  %tmp2 = load i32, i32* %A, align 8
  %tmp3 = insertelement <4 x i32> %tmp1, i32 %tmp2, i32 3
  ret <4 x i32> %tmp3
}

define <4 x float> @vld1laneQf(float* %A, <4 x float>* %B) nounwind {
; CHECK-LABEL: vld1laneQf:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.32 {d16[0]}, [r0:32]
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x float>, <4 x float>* %B
  %tmp2 = load float, float* %A
  %tmp3 = insertelement <4 x float> %tmp1, float %tmp2, i32 0
  ret <4 x float> %tmp3
}

%struct.__neon_int8x8x2_t = type { <8 x i8>, <8 x i8> }
%struct.__neon_int16x4x2_t = type { <4 x i16>, <4 x i16> }
%struct.__neon_int32x2x2_t = type { <2 x i32>, <2 x i32> }
%struct.__neon_float32x2x2_t = type { <2 x float>, <2 x float> }

%struct.__neon_int16x8x2_t = type { <8 x i16>, <8 x i16> }
%struct.__neon_int32x4x2_t = type { <4 x i32>, <4 x i32> }
%struct.__neon_float32x4x2_t = type { <4 x float>, <4 x float> }

;Check the alignment value. Max for this instruction is 16 bits:
define <8 x i8> @vld2lanei8(i8* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vld2lanei8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vorr d17, d16, d16
; CHECK-NEXT:    vld2.8 {d16[1], d17[1]}, [r0:16]
; CHECK-NEXT:    vadd.i8 d16, d16, d17
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %B
  %tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 4)
  %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 1
  %tmp5 = add <8 x i8> %tmp3, %tmp4
  ret <8 x i8> %tmp5
}

;Check the alignment value. Max for this instruction is 32 bits:
define <4 x i16> @vld2lanei16(i16* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: vld2lanei16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vorr d17, d16, d16
; CHECK-NEXT:    vld2.16 {d16[1], d17[1]}, [r0:32]
; CHECK-NEXT:    vadd.i16 d16, d16, d17
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <4 x i16>, <4 x i16>* %B
  %tmp2 = call %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 1
  %tmp5 = add <4 x i16> %tmp3, %tmp4
  ret <4 x i16> %tmp5
}

define <2 x i32> @vld2lanei32(i32* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: vld2lanei32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vorr d17, d16, d16
; CHECK-NEXT:    vld2.32 {d16[1], d17[1]}, [r0]
; CHECK-NEXT:    vadd.i32 d16, d16, d17
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <2 x i32>, <2 x i32>* %B
  %tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32.p0i8(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1
  %tmp5 = add <2 x i32> %tmp3, %tmp4
  ret <2 x i32> %tmp5
}

;Check for a post-increment updating load.
define <2 x i32> @vld2lanei32_update(i32** %ptr, <2 x i32>* %B) nounwind {
; DEFAULT-LABEL: vld2lanei32_update:
; DEFAULT:       @ %bb.0:
; DEFAULT-NEXT:    vldr d16, [r1]
; DEFAULT-NEXT:    ldr r3, [r0]
; DEFAULT-NEXT:    vorr d17, d16, d16
; DEFAULT-NEXT:    vld2.32 {d16[1], d17[1]}, [r3]!
; DEFAULT-NEXT:    vadd.i32 d16, d16, d17
; DEFAULT-NEXT:    str r3, [r0]
; DEFAULT-NEXT:    vmov r2, r1, d16
; DEFAULT-NEXT:    mov r0, r2
; DEFAULT-NEXT:    mov pc, lr
;
; BASIC-LABEL: vld2lanei32_update:
; BASIC:       @ %bb.0:
; BASIC-NEXT:    mov r2, r1
; BASIC-NEXT:    mov r1, r0
; BASIC-NEXT:    vldr d16, [r2]
; BASIC-NEXT:    ldr r0, [r0]
; BASIC-NEXT:    vorr d17, d16, d16
; BASIC-NEXT:    vld2.32 {d16[1], d17[1]}, [r0]!
; BASIC-NEXT:    vadd.i32 d16, d16, d17
; BASIC-NEXT:    str r0, [r1]
; BASIC-NEXT:    vmov r2, r3, d16
; BASIC-NEXT:    mov r0, r2
; BASIC-NEXT:    mov r1, r3
; BASIC-NEXT:    mov pc, lr
  %A = load i32*, i32** %ptr
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <2 x i32>, <2 x i32>* %B
  %tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32.p0i8(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1
  %tmp5 = add <2 x i32> %tmp3, %tmp4
  %tmp6 = getelementptr i32, i32* %A, i32 2
  store i32* %tmp6, i32** %ptr
  ret <2 x i32> %tmp5
}

define <2 x i32> @vld2lanei32_odd_update(i32** %ptr, <2 x i32>* %B) nounwind {
; DEFAULT-LABEL: vld2lanei32_odd_update:
; DEFAULT:       @ %bb.0:
; DEFAULT-NEXT:    vldr d16, [r1]
; DEFAULT-NEXT:    mov r1, #12
; DEFAULT-NEXT:    ldr r3, [r0]
; DEFAULT-NEXT:    vorr d17, d16, d16
; DEFAULT-NEXT:    vld2.32 {d16[1], d17[1]}, [r3], r1
; DEFAULT-NEXT:    vadd.i32 d16, d16, d17
; DEFAULT-NEXT:    str r3, [r0]
; DEFAULT-NEXT:    vmov r2, r1, d16
; DEFAULT-NEXT:    mov r0, r2
; DEFAULT-NEXT:    mov pc, lr
;
; BASIC-LABEL: vld2lanei32_odd_update:
; BASIC:       @ %bb.0:
; BASIC-NEXT:    mov r2, r1
; BASIC-NEXT:    mov r1, r0
; BASIC-NEXT:    vldr d16, [r2]
; BASIC-NEXT:    mov r2, #12
; BASIC-NEXT:    ldr r0, [r0]
; BASIC-NEXT:    vorr d17, d16, d16
; BASIC-NEXT:    vld2.32 {d16[1], d17[1]}, [r0], r2
; BASIC-NEXT:    vadd.i32 d16, d16, d17
; BASIC-NEXT:    str r0, [r1]
; BASIC-NEXT:    vmov r2, r3, d16
; BASIC-NEXT:    mov r0, r2
; BASIC-NEXT:    mov r1, r3
; BASIC-NEXT:    mov pc, lr
  %A = load i32*, i32** %ptr
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <2 x i32>, <2 x i32>* %B
  %tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32.p0i8(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1
  %tmp5 = add <2 x i32> %tmp3, %tmp4
  %tmp6 = getelementptr i32, i32* %A, i32 3
  store i32* %tmp6, i32** %ptr
  ret <2 x i32> %tmp5
}

define <2 x float> @vld2lanef(float* %A, <2 x float>* %B) nounwind {
; CHECK-LABEL: vld2lanef:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vorr d17, d16, d16
; CHECK-NEXT:    vld2.32 {d16[1], d17[1]}, [r0]
; CHECK-NEXT:    vadd.f32 d16, d16, d17
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <2 x float>, <2 x float>* %B
  %tmp2 = call %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32.p0i8(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 1
  %tmp5 = fadd <2 x float> %tmp3, %tmp4
  ret <2 x float> %tmp5
}

;Check the (default) alignment.
define <8 x i16> @vld2laneQi16(i16* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vld2laneQi16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vorr q9, q8, q8
; CHECK-NEXT:    vld2.16 {d17[1], d19[1]}, [r0]
; CHECK-NEXT:    vadd.i16 q8, q8, q9
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <8 x i16>, <8 x i16>* %B
  %tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16.p0i8(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 5, i32 1)
  %tmp3 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 1
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

;Check the alignment value. Max for this instruction is 64 bits:
define <4 x i32> @vld2laneQi32(i32* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: vld2laneQi32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vorr q9, q8, q8
; CHECK-NEXT:    vld2.32 {d17[0], d19[0]}, [r0:64]
; CHECK-NEXT:    vadd.i32 q8, q8, q9
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <4 x i32>, <4 x i32>* %B
  %tmp2 = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32.p0i8(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 16)
  %tmp3 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 1
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <4 x float> @vld2laneQf(float* %A, <4 x float>* %B) nounwind {
; CHECK-LABEL: vld2laneQf:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vorr q9, q8, q8
; CHECK-NEXT:    vld2.32 {d16[1], d18[1]}, [r0]
; CHECK-NEXT:    vadd.f32 q8, q8, q9
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <4 x float>, <4 x float>* %B
  %tmp2 = call %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32.p0i8(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 1
  %tmp5 = fadd <4 x float> %tmp3, %tmp4
  ret <4 x float> %tmp5
}

declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
declare %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16.p0i8(i8*, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32.p0i8(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32.p0i8(i8*, <2 x float>, <2 x float>, i32, i32) nounwind readonly

declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16.p0i8(i8*, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32.p0i8(i8*, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32.p0i8(i8*, <4 x float>, <4 x float>, i32, i32) nounwind readonly

%struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
%struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> }
%struct.__neon_int32x2x3_t = type { <2 x i32>, <2 x i32>, <2 x i32> }
%struct.__neon_float32x2x3_t = type { <2 x float>, <2 x float>, <2 x float> }

%struct.__neon_int16x8x3_t = type { <8 x i16>, <8 x i16>, <8 x i16> }
%struct.__neon_int32x4x3_t = type { <4 x i32>, <4 x i32>, <4 x i32> }
%struct.__neon_float32x4x3_t = type { <4 x float>, <4 x float>, <4 x float> }

define <8 x i8> @vld3lanei8(i8* %A, <8 x i8>* %B) nounwind {
; DEFAULT-LABEL: vld3lanei8:
; DEFAULT:       @ %bb.0:
; DEFAULT-NEXT:    vldr d16, [r1]
; DEFAULT-NEXT:    vorr d17, d16, d16
; DEFAULT-NEXT:    vorr d18, d16, d16
; DEFAULT-NEXT:    vld3.8 {d16[1], d17[1], d18[1]}, [r0]
; DEFAULT-NEXT:    vadd.i8 d20, d16, d17
; DEFAULT-NEXT:    vadd.i8 d16, d18, d20
; DEFAULT-NEXT:    vmov r0, r1, d16
; DEFAULT-NEXT:    mov pc, lr
;
; BASIC-LABEL: vld3lanei8:
; BASIC:       @ %bb.0:
; BASIC-NEXT:    vldr d18, [r1]
; BASIC-NEXT:    vorr d19, d18, d18
; BASIC-NEXT:    vorr d20, d18, d18
; BASIC-NEXT:    vld3.8 {d18[1], d19[1], d20[1]}, [r0]
; BASIC-NEXT:    vadd.i8 d16, d18, d19
; BASIC-NEXT:    vadd.i8 d16, d20, d16
; BASIC-NEXT:    vmov r0, r1, d16
; BASIC-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %B
  %tmp2 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8.p0i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 2
  %tmp6 = add <8 x i8> %tmp3, %tmp4
  %tmp7 = add <8 x i8> %tmp5, %tmp6
  ret <8 x i8> %tmp7
}

;Check the (default) alignment value. VLD3 does not support alignment.
define <4 x i16> @vld3lanei16(i16* %A, <4 x i16>* %B) nounwind {
; DEFAULT-LABEL: vld3lanei16:
; DEFAULT:       @ %bb.0:
; DEFAULT-NEXT:    vldr d16, [r1]
; DEFAULT-NEXT:    vorr d17, d16, d16
; DEFAULT-NEXT:    vorr d18, d16, d16
; DEFAULT-NEXT:    vld3.16 {d16[1], d17[1], d18[1]}, [r0]
; DEFAULT-NEXT:    vadd.i16 d20, d16, d17
; DEFAULT-NEXT:    vadd.i16 d16, d18, d20
; DEFAULT-NEXT:    vmov r0, r1, d16
; DEFAULT-NEXT:    mov pc, lr
;
; BASIC-LABEL: vld3lanei16:
; BASIC:       @ %bb.0:
; BASIC-NEXT:    vldr d18, [r1]
; BASIC-NEXT:    vorr d19, d18, d18
; BASIC-NEXT:    vorr d20, d18, d18
; BASIC-NEXT:    vld3.16 {d18[1], d19[1], d20[1]}, [r0]
; BASIC-NEXT:    vadd.i16 d16, d18, d19
; BASIC-NEXT:    vadd.i16 d16, d20, d16
; BASIC-NEXT:    vmov r0, r1, d16
; BASIC-NEXT:    mov pc, lr
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <4 x i16>, <4 x i16>* %B
  %tmp2 = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 2
  %tmp6 = add <4 x i16> %tmp3, %tmp4
  %tmp7 = add <4 x i16> %tmp5, %tmp6
  ret <4 x i16> %tmp7
}

define <2 x i32> @vld3lanei32(i32* %A, <2 x i32>* %B) nounwind {
; DEFAULT-LABEL: vld3lanei32:
; DEFAULT:       @ %bb.0:
; DEFAULT-NEXT:    vldr d16, [r1]
; DEFAULT-NEXT:    vorr d17, d16, d16
; DEFAULT-NEXT:    vorr d18, d16, d16
; DEFAULT-NEXT:    vld3.32 {d16[1], d17[1], d18[1]}, [r0]
; DEFAULT-NEXT:    vadd.i32 d20, d16, d17
; DEFAULT-NEXT:    vadd.i32 d16, d18, d20
; DEFAULT-NEXT:    vmov r0, r1, d16
; DEFAULT-NEXT:    mov pc, lr
;
; BASIC-LABEL: vld3lanei32:
; BASIC:       @ %bb.0:
; BASIC-NEXT:    vldr d18, [r1]
; BASIC-NEXT:    vorr d19, d18, d18
; BASIC-NEXT:    vorr d20, d18, d18
; BASIC-NEXT:    vld3.32 {d18[1], d19[1], d20[1]}, [r0]
; BASIC-NEXT:    vadd.i32 d16, d18, d19
; BASIC-NEXT:    vadd.i32 d16, d20, d16
; BASIC-NEXT:    vmov r0, r1, d16
; BASIC-NEXT:    mov pc, lr
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <2 x i32>, <2 x i32>* %B
  %tmp2 = call %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32.p0i8(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 2
  %tmp6 = add <2 x i32> %tmp3, %tmp4
  %tmp7 = add <2 x i32> %tmp5, %tmp6
  ret <2 x i32> %tmp7
}

define <2 x float> @vld3lanef(float* %A, <2 x float>* %B) nounwind {
; DEFAULT-LABEL: vld3lanef:
; DEFAULT:       @ %bb.0:
; DEFAULT-NEXT:    vldr d16, [r1]
; DEFAULT-NEXT:    vorr d17, d16, d16
; DEFAULT-NEXT:    vorr d18, d16, d16
; DEFAULT-NEXT:    vld3.32 {d16[1], d17[1], d18[1]}, [r0]
; DEFAULT-NEXT:    vadd.f32 d20, d16, d17
; DEFAULT-NEXT:    vadd.f32 d16, d18, d20
; DEFAULT-NEXT:    vmov r0, r1, d16
; DEFAULT-NEXT:    mov pc, lr
;
; BASIC-LABEL: vld3lanef:
; BASIC:       @ %bb.0:
; BASIC-NEXT:    vldr d18, [r1]
; BASIC-NEXT:    vorr d19, d18, d18
; BASIC-NEXT:    vorr d20, d18, d18
; BASIC-NEXT:    vld3.32 {d18[1], d19[1], d20[1]}, [r0]
; BASIC-NEXT:    vadd.f32 d16, d18, d19
; BASIC-NEXT:    vadd.f32 d16, d20, d16
; BASIC-NEXT:    vmov r0, r1, d16
; BASIC-NEXT:    mov pc, lr
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <2 x float>, <2 x float>* %B
  %tmp2 = call %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32.p0i8(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 2
  %tmp6 = fadd <2 x float> %tmp3, %tmp4
  %tmp7 = fadd <2 x float> %tmp5, %tmp6
  ret <2 x float> %tmp7
}

;Check the (default) alignment value. VLD3 does not support alignment.
define <8 x i16> @vld3laneQi16(i16* %A, <8 x i16>* %B) nounwind {
; DEFAULT-LABEL: vld3laneQi16:
; DEFAULT:       @ %bb.0:
; DEFAULT-NEXT:    vld1.64 {d16, d17}, [r1]
; DEFAULT-NEXT:    vorr q9, q8, q8
; DEFAULT-NEXT:    vorr q10, q8, q8
; DEFAULT-NEXT:    vld3.16 {d16[1], d18[1], d20[1]}, [r0]
; DEFAULT-NEXT:    vadd.i16 q12, q8, q9
; DEFAULT-NEXT:    vadd.i16 q8, q10, q12
; DEFAULT-NEXT:    vmov r0, r1, d16
; DEFAULT-NEXT:    vmov r2, r3, d17
; DEFAULT-NEXT:    mov pc, lr
;
; BASIC-LABEL: vld3laneQi16:
; BASIC:       @ %bb.0:
; BASIC-NEXT:    vld1.64 {d18, d19}, [r1]
; BASIC-NEXT:    vorr q10, q9, q9
; BASIC-NEXT:    vorr q11, q9, q9
; BASIC-NEXT:    vld3.16 {d18[1], d20[1], d22[1]}, [r0]
; BASIC-NEXT:    vadd.i16 q8, q9, q10
; BASIC-NEXT:    vadd.i16 q8, q11, q8
; BASIC-NEXT:    vmov r0, r1, d16
; BASIC-NEXT:    vmov r2, r3, d17
; BASIC-NEXT:    mov pc, lr
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <8 x i16>, <8 x i16>* %B
  %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 2
  %tmp6 = add <8 x i16> %tmp3, %tmp4
  %tmp7 = add <8 x i16> %tmp5, %tmp6
  ret <8 x i16> %tmp7
}

;Check for a post-increment updating load with register increment.
define <8 x i16> @vld3laneQi16_update(i16** %ptr, <8 x i16>* %B, i32 %inc) nounwind {
; DEFAULT-LABEL: vld3laneQi16_update:
; DEFAULT:       @ %bb.0:
; DEFAULT-NEXT:    .save {r11, lr}
; DEFAULT-NEXT:    push {r11, lr}
; DEFAULT-NEXT:    vld1.64 {d16, d17}, [r1]
; DEFAULT-NEXT:    lsl r1, r2, #1
; DEFAULT-NEXT:    vorr q9, q8, q8
; DEFAULT-NEXT:    ldr lr, [r0]
; DEFAULT-NEXT:    vorr q10, q8, q8
; DEFAULT-NEXT:    vld3.16 {d16[1], d18[1], d20[1]}, [lr], r1
; DEFAULT-NEXT:    vadd.i16 q12, q8, q9
; DEFAULT-NEXT:    vadd.i16 q8, q10, q12
; DEFAULT-NEXT:    str lr, [r0]
; DEFAULT-NEXT:    vmov r12, r1, d16
; DEFAULT-NEXT:    vmov r2, r3, d17
; DEFAULT-NEXT:    mov r0, r12
; DEFAULT-NEXT:    pop {r11, lr}
; DEFAULT-NEXT:    mov pc, lr
;
; BASIC-LABEL: vld3laneQi16_update:
; BASIC:       @ %bb.0:
; BASIC-NEXT:    .save {r11, lr}
; BASIC-NEXT:    push {r11, lr}
; BASIC-NEXT:    vld1.64 {d18, d19}, [r1]
; BASIC-NEXT:    mov r3, r0
; BASIC-NEXT:    vorr q10, q9, q9
; BASIC-NEXT:    lsl r1, r2, #1
; BASIC-NEXT:    ldr r0, [r0]
; BASIC-NEXT:    vorr q11, q9, q9
; BASIC-NEXT:    vld3.16 {d18[1], d20[1], d22[1]}, [r0], r1
; BASIC-NEXT:    vadd.i16 q8, q9, q10
; BASIC-NEXT:    vadd.i16 q8, q11, q8
; BASIC-NEXT:    str r0, [r3]
; BASIC-NEXT:    vmov r1, lr, d16
; BASIC-NEXT:    vmov r2, r12, d17
; BASIC-NEXT:    mov r0, r1
; BASIC-NEXT:    mov r1, lr
; BASIC-NEXT:    mov r3, r12
; BASIC-NEXT:    pop {r11, lr}
; BASIC-NEXT:    mov pc, lr
  %A = load i16*, i16** %ptr
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <8 x i16>, <8 x i16>* %B
  %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 2
  %tmp6 = add <8 x i16> %tmp3, %tmp4
  %tmp7 = add <8 x i16> %tmp5, %tmp6
  %tmp8 = getelementptr i16, i16* %A, i32 %inc
  store i16* %tmp8, i16** %ptr
  ret <8 x i16> %tmp7
}

define <4 x i32> @vld3laneQi32(i32* %A, <4 x i32>* %B) nounwind {
; DEFAULT-LABEL: vld3laneQi32:
; DEFAULT:       @ %bb.0:
; DEFAULT-NEXT:    vld1.64 {d16, d17}, [r1]
; DEFAULT-NEXT:    vorr q9, q8, q8
; DEFAULT-NEXT:    vorr q10, q8, q8
; DEFAULT-NEXT:    vld3.32 {d17[1], d19[1], d21[1]}, [r0]
; DEFAULT-NEXT:    vadd.i32 q12, q8, q9
; DEFAULT-NEXT:    vadd.i32 q8, q10, q12
; DEFAULT-NEXT:    vmov r0, r1, d16
; DEFAULT-NEXT:    vmov r2, r3, d17
; DEFAULT-NEXT:    mov pc, lr
;
; BASIC-LABEL: vld3laneQi32:
; BASIC:       @ %bb.0:
; BASIC-NEXT:    vld1.64 {d18, d19}, [r1]
; BASIC-NEXT:    vorr q10, q9, q9
; BASIC-NEXT:    vorr q11, q9, q9
; BASIC-NEXT:    vld3.32 {d19[1], d21[1], d23[1]}, [r0]
; BASIC-NEXT:    vadd.i32 q8, q9, q10
; BASIC-NEXT:    vadd.i32 q8, q11, q8
; BASIC-NEXT:    vmov r0, r1, d16
; BASIC-NEXT:    vmov r2, r3, d17
; BASIC-NEXT:    mov pc, lr
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <4 x i32>, <4 x i32>* %B
  %tmp2 = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32.p0i8(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 3, i32 1)
  %tmp3 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 2
  %tmp6 = add <4 x i32> %tmp3, %tmp4
  %tmp7 = add <4 x i32> %tmp5, %tmp6
  ret <4 x i32> %tmp7
}

define <4 x float> @vld3laneQf(float* %A, <4 x float>* %B) nounwind {
; DEFAULT-LABEL: vld3laneQf:
; DEFAULT:       @ %bb.0:
; DEFAULT-NEXT:    vld1.64 {d16, d17}, [r1]
; DEFAULT-NEXT:    vorr q9, q8, q8
; DEFAULT-NEXT:    vorr q10, q8, q8
; DEFAULT-NEXT:    vld3.32 {d16[1], d18[1], d20[1]}, [r0]
; DEFAULT-NEXT:    vadd.f32 q12, q8, q9
; DEFAULT-NEXT:    vadd.f32 q8, q10, q12
; DEFAULT-NEXT:    vmov r0, r1, d16
; DEFAULT-NEXT:    vmov r2, r3, d17
; DEFAULT-NEXT:    mov pc, lr
;
; BASIC-LABEL: vld3laneQf:
; BASIC:       @ %bb.0:
; BASIC-NEXT:    vld1.64 {d18, d19}, [r1]
; BASIC-NEXT:    vorr q10, q9, q9
; BASIC-NEXT:    vorr q11, q9, q9
; BASIC-NEXT:    vld3.32 {d18[1], d20[1], d22[1]}, [r0]
; BASIC-NEXT:    vadd.f32 q8, q9, q10
; BASIC-NEXT:    vadd.f32 q8, q11, q8
; BASIC-NEXT:    vmov r0, r1, d16
; BASIC-NEXT:    vmov r2, r3, d17
; BASIC-NEXT:    mov pc, lr
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <4 x float>, <4 x float>* %B
  %tmp2 = call %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32.p0i8(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 2
  %tmp6 = fadd <4 x float> %tmp3, %tmp4
  %tmp7 = fadd <4 x float> %tmp5, %tmp6
  ret <4 x float> %tmp7
}

declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8.p0i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16.p0i8(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32.p0i8(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32.p0i8(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly

declare %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16.p0i8(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32.p0i8(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32.p0i8(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly

%struct.__neon_int8x8x4_t = type { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }
%struct.__neon_int16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }
%struct.__neon_int32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }
%struct.__neon_float32x2x4_t = type { <2 x float>, <2 x float>, <2 x float>, <2 x float> }

%struct.__neon_int16x8x4_t = type { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }
%struct.__neon_int32x4x4_t = type { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }
%struct.__neon_float32x4x4_t = type { <4 x float>, <4 x float>, <4 x float>, <4 x float> }

;Check the alignment value. Max for this instruction is 32 bits:
define <8 x i8> @vld4lanei8(i8* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vld4lanei8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vorr d17, d16, d16
; CHECK-NEXT:    vorr d18, d16, d16
; CHECK-NEXT:    vorr d19, d16, d16
; CHECK-NEXT:    vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0:32]
; CHECK-NEXT:    vadd.i8 d16, d16, d17
; CHECK-NEXT:    vadd.i8 d20, d18, d19
; CHECK-NEXT:    vadd.i8 d16, d16, d20
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %B
  %tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 3
  %tmp7 = add <8 x i8> %tmp3, %tmp4
  %tmp8 = add <8 x i8> %tmp5, %tmp6
  %tmp9 = add <8 x i8> %tmp7, %tmp8
  ret <8 x i8> %tmp9
}

;Check for a post-increment updating load.
define <8 x i8> @vld4lanei8_update(i8** %ptr, <8 x i8>* %B) nounwind {
; DEFAULT-LABEL: vld4lanei8_update:
; DEFAULT:       @ %bb.0:
; DEFAULT-NEXT:    vldr d16, [r1]
; DEFAULT-NEXT:    vorr d17, d16, d16
; DEFAULT-NEXT:    ldr r3, [r0]
; DEFAULT-NEXT:    vorr d18, d16, d16
; DEFAULT-NEXT:    vorr d19, d16, d16
; DEFAULT-NEXT:    vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r3:32]!
; DEFAULT-NEXT:    vadd.i8 d16, d16, d17
; DEFAULT-NEXT:    vadd.i8 d20, d18, d19
; DEFAULT-NEXT:    str r3, [r0]
; DEFAULT-NEXT:    vadd.i8 d16, d16, d20
; DEFAULT-NEXT:    vmov r2, r1, d16
; DEFAULT-NEXT:    mov r0, r2
; DEFAULT-NEXT:    mov pc, lr
;
; BASIC-LABEL: vld4lanei8_update:
; BASIC:       @ %bb.0:
; BASIC-NEXT:    vldr d16, [r1]
; BASIC-NEXT:    mov r3, r0
; BASIC-NEXT:    vorr d17, d16, d16
; BASIC-NEXT:    ldr r0, [r0]
; BASIC-NEXT:    vorr d18, d16, d16
; BASIC-NEXT:    vorr d19, d16, d16
; BASIC-NEXT:    vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0:32]!
; BASIC-NEXT:    vadd.i8 d16, d16, d17
; BASIC-NEXT:    vadd.i8 d20, d18, d19
; BASIC-NEXT:    str r0, [r3]
; BASIC-NEXT:    vadd.i8 d16, d16, d20
; BASIC-NEXT:    vmov r1, r2, d16
; BASIC-NEXT:    mov r0, r1
; BASIC-NEXT:    mov r1, r2
; BASIC-NEXT:    mov pc, lr
  %A = load i8*, i8** %ptr
  %tmp1 = load <8 x i8>, <8 x i8>* %B
  %tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 3
  %tmp7 = add <8 x i8> %tmp3, %tmp4
  %tmp8 = add <8 x i8> %tmp5, %tmp6
  %tmp9 = add <8 x i8> %tmp7, %tmp8
  %tmp10 = getelementptr i8, i8* %A, i32 4
  store i8* %tmp10, i8** %ptr
  ret <8 x i8> %tmp9
}

;Check that a power-of-two alignment smaller than the total size of the memory
;being loaded is ignored.
define <4 x i16> @vld4lanei16(i16* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: vld4lanei16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vorr d17, d16, d16
; CHECK-NEXT:    vorr d18, d16, d16
; CHECK-NEXT:    vorr d19, d16, d16
; CHECK-NEXT:    vld4.16 {d16[1], d17[1], d18[1], d19[1]}, [r0]
; CHECK-NEXT:    vadd.i16 d16, d16, d17
; CHECK-NEXT:    vadd.i16 d20, d18, d19
; CHECK-NEXT:    vadd.i16 d16, d16, d20
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <4 x i16>, <4 x i16>* %B
  %tmp2 = call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 4)
  %tmp3 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 3
  %tmp7 = add <4 x i16> %tmp3, %tmp4
  %tmp8 = add <4 x i16> %tmp5, %tmp6
  %tmp9 = add <4 x i16> %tmp7, %tmp8
  ret <4 x i16> %tmp9
}

;Check the alignment value. An 8-byte alignment is allowed here even though
;it is smaller than the total size of the memory being loaded.
define <2 x i32> @vld4lanei32(i32* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: vld4lanei32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vorr d17, d16, d16
; CHECK-NEXT:    vorr d18, d16, d16
; CHECK-NEXT:    vorr d19, d16, d16
; CHECK-NEXT:    vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [r0:64]
; CHECK-NEXT:    vadd.i32 d16, d16, d17
; CHECK-NEXT:    vadd.i32 d20, d18, d19
; CHECK-NEXT:    vadd.i32 d16, d16, d20
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <2 x i32>, <2 x i32>* %B
  %tmp2 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32.p0i8(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 3
  %tmp7 = add <2 x i32> %tmp3, %tmp4
  %tmp8 = add <2 x i32> %tmp5, %tmp6
  %tmp9 = add <2 x i32> %tmp7, %tmp8
  ret <2 x i32> %tmp9
}

define <2 x float> @vld4lanef(float* %A, <2 x float>* %B) nounwind {
; CHECK-LABEL: vld4lanef:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vorr d17, d16, d16
; CHECK-NEXT:    vorr d18, d16, d16
; CHECK-NEXT:    vorr d19, d16, d16
; CHECK-NEXT:    vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [r0]
; CHECK-NEXT:    vadd.f32 d16, d16, d17
; CHECK-NEXT:    vadd.f32 d20, d18, d19
; CHECK-NEXT:    vadd.f32 d16, d16, d20
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <2 x float>, <2 x float>* %B
  %tmp2 = call %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32.p0i8(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 3
  %tmp7 = fadd <2 x float> %tmp3, %tmp4
  %tmp8 = fadd <2 x float> %tmp5, %tmp6
  %tmp9 = fadd <2 x float> %tmp7, %tmp8
  ret <2 x float> %tmp9
}

;Check the alignment value. Max for this instruction is 64 bits:
define <8 x i16> @vld4laneQi16(i16* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vld4laneQi16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vorr q9, q8, q8
; CHECK-NEXT:    vorr q10, q8, q8
; CHECK-NEXT:    vorr q11, q8, q8
; CHECK-NEXT:    vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r0:64]
; CHECK-NEXT:    vadd.i16 q8, q8, q9
; CHECK-NEXT:    vadd.i16 q12, q10, q11
; CHECK-NEXT:    vadd.i16 q8, q8, q12
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <8 x i16>, <8 x i16>* %B
  %tmp2 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16.p0i8(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 16)
  %tmp3 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 3
  %tmp7 = add <8 x i16> %tmp3, %tmp4
  %tmp8 = add <8 x i16> %tmp5, %tmp6
  %tmp9 = add <8 x i16> %tmp7, %tmp8
  ret <8 x i16> %tmp9
}

;Check the (default) alignment.
define <4 x i32> @vld4laneQi32(i32* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: vld4laneQi32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vorr q9, q8, q8
; CHECK-NEXT:    vorr q10, q8, q8
; CHECK-NEXT:    vorr q11, q8, q8
; CHECK-NEXT:    vld4.32 {d17[0], d19[0], d21[0], d23[0]}, [r0]
; CHECK-NEXT:    vadd.i32 q8, q8, q9
; CHECK-NEXT:    vadd.i32 q12, q10, q11
; CHECK-NEXT:    vadd.i32 q8, q8, q12
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <4 x i32>, <4 x i32>* %B
  %tmp2 = call %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32.p0i8(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 1)
  %tmp3 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 3
  %tmp7 = add <4 x i32> %tmp3, %tmp4
  %tmp8 = add <4 x i32> %tmp5, %tmp6
  %tmp9 = add <4 x i32> %tmp7, %tmp8
  ret <4 x i32> %tmp9
}

define <4 x float> @vld4laneQf(float* %A, <4 x float>* %B) nounwind {
; CHECK-LABEL: vld4laneQf:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vorr q9, q8, q8
; CHECK-NEXT:    vorr q10, q8, q8
; CHECK-NEXT:    vorr q11, q8, q8
; CHECK-NEXT:    vld4.32 {d16[1], d18[1], d20[1], d22[1]}, [r0]
; CHECK-NEXT:    vadd.f32 q8, q8, q9
; CHECK-NEXT:    vadd.f32 q12, q10, q11
; CHECK-NEXT:    vadd.f32 q8, q8, q12
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <4 x float>, <4 x float>* %B
  %tmp2 = call %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32.p0i8(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 3
  %tmp7 = fadd <4 x float> %tmp3, %tmp4
  %tmp8 = fadd <4 x float> %tmp5, %tmp6
  %tmp9 = fadd <4 x float> %tmp7, %tmp8
  ret <4 x float> %tmp9
}

declare %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8.p0i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16.p0i8(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32.p0i8(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32.p0i8(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly

declare %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16.p0i8(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32.p0i8(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32.p0i8(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly

; Radar 8776599: If one of the operands to a QQQQ REG_SEQUENCE is a register
; in the QPR_VFP2 regclass, it needs to be copied to a QPR regclass because
; we don't currently have a QQQQ_VFP2 super-regclass. (The "0" for the low
; part of %ins67 is supposed to be loaded by a VLDRS instruction in this test.)
define <8 x i16> @test_qqqq_regsequence_subreg([6 x i64] %b) nounwind {
; DEFAULT-LABEL: test_qqqq_regsequence_subreg:
; DEFAULT:       @ %bb.0:
; DEFAULT-NEXT:    add r0, sp, #24
; DEFAULT-NEXT:    vld1.32 {d21[0]}, [r0:32]
; DEFAULT-NEXT:    add r0, sp, #28
; DEFAULT-NEXT:    vmov.i32 d20, #0x0
; DEFAULT-NEXT:    vld1.32 {d21[1]}, [r0:32]
; DEFAULT-NEXT:    vld3.16 {d16[1], d18[1], d20[1]}, [r0]
; DEFAULT-NEXT:    vadd.i16 q12, q8, q9
; DEFAULT-NEXT:    vadd.i16 q8, q10, q12
; DEFAULT-NEXT:    vmov r0, r1, d16
; DEFAULT-NEXT:    vmov r2, r3, d17
; DEFAULT-NEXT:    mov pc, lr
;
; BASIC-LABEL: test_qqqq_regsequence_subreg:
; BASIC:       @ %bb.0:
; BASIC-NEXT:    add r0, sp, #24
; BASIC-NEXT:    vld1.32 {d23[0]}, [r0:32]
; BASIC-NEXT:    add r0, sp, #28
; BASIC-NEXT:    vmov.i32 d22, #0x0
; BASIC-NEXT:    vld1.32 {d23[1]}, [r0:32]
; BASIC-NEXT:    vld3.16 {d18[1], d20[1], d22[1]}, [r0]
; BASIC-NEXT:    vadd.i16 q8, q9, q10
; BASIC-NEXT:    vadd.i16 q8, q11, q8
; BASIC-NEXT:    vmov r0, r1, d16
; BASIC-NEXT:    vmov r2, r3, d17
; BASIC-NEXT:    mov pc, lr
  %tmp63 = extractvalue [6 x i64] %b, 5
  %tmp64 = zext i64 %tmp63 to i128
  %tmp65 = shl i128 %tmp64, 64
  %ins67 = or i128 %tmp65, 0
  %tmp78 = bitcast i128 %ins67 to <8 x i16>
  %vld3_lane = tail call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> %tmp78, i32 1, i32 2)
  %tmp3 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 0
  %tmp4 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 1
  %tmp5 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 2
  %tmp6 = add <8 x i16> %tmp3, %tmp4
  %tmp7 = add <8 x i16> %tmp5, %tmp6
  ret <8 x i16> %tmp7
}

declare void @llvm.trap() nounwind