; RUN: llc -mtriple=arm-eabi -float-abi=soft -mattr=+neon %s -o - | FileCheck %s

; Codegen tests for NEON loads that replicate a loaded element to all lanes.
; Each vld1 test below loads one scalar, inserts it into lane 0 of an undef
; vector and splats it with a zeroinitializer shuffle mask; the expected
; output is a single vld1 with the all-lanes register syntax (d16[]).

; Byte element splat into a 64-bit vector.
define <8 x i8> @vld1dupi8(i8* %A) nounwind {
;CHECK-LABEL: vld1dupi8:
;Check the (default) alignment value.
;CHECK: vld1.8 {d16[]}, [{{r[0-9]+|lr}}]
  %tmp1 = load i8, i8* %A, align 8
  %tmp2 = insertelement <8 x i8> undef, i8 %tmp1, i32 0
  %tmp3 = shufflevector <8 x i8> %tmp2, <8 x i8> undef, <8 x i32> zeroinitializer
  ret <8 x i8> %tmp3
}

; The pointer read from %a is advanced by %b before the element is loaded,
; and the advanced pointer is stored back to %a.
define <8 x i8> @vld1dupi8_preinc(i8** noalias nocapture %a, i32 %b) nounwind {
entry:
;CHECK-LABEL: vld1dupi8_preinc:
;CHECK: vld1.8 {d16[]}, [{{r[0-9]+|lr}}]
  %0 = load i8*, i8** %a, align 4
  %add.ptr = getelementptr inbounds i8, i8* %0, i32 %b
  %1 = load i8, i8* %add.ptr, align 1
  %2 = insertelement <8 x i8> undef, i8 %1, i32 0
  %lane = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
  store i8* %add.ptr, i8** %a, align 4
  ret <8 x i8> %lane
}

; The element is loaded first, then the pointer is advanced by 1 byte and
; stored back; the expected instruction uses the "!" writeback form.
define <8 x i8> @vld1dupi8_postinc_fixed(i8** noalias nocapture %a) nounwind {
entry:
;CHECK-LABEL: vld1dupi8_postinc_fixed:
;CHECK: vld1.8 {d16[]}, [{{r[0-9]+|lr}}]!
  %0 = load i8*, i8** %a, align 4
  %1 = load i8, i8* %0, align 1
  %2 = insertelement <8 x i8> undef, i8 %1, i32 0
  %lane = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
  %add.ptr = getelementptr inbounds i8, i8* %0, i32 1
  store i8* %add.ptr, i8** %a, align 4
  ret <8 x i8> %lane
}

; Same as the fixed post-increment case, but the pointer is advanced by the
; run-time value %n; the expected instruction takes r1 as the increment.
define <8 x i8> @vld1dupi8_postinc_register(i8** noalias nocapture %a, i32 %n) nounwind {
entry:
;CHECK-LABEL: vld1dupi8_postinc_register:
;CHECK: vld1.8 {d16[]}, [{{r[0-9]+|lr}}], r1
  %0 = load i8*, i8** %a, align 4
  %1 = load i8, i8* %0, align 1
  %2 = insertelement <8 x i8> undef, i8 %1, i32 0
  %lane = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
  %add.ptr = getelementptr inbounds i8, i8* %0, i32 %n
  store i8* %add.ptr, i8** %a, align 4
  ret <8 x i8> %lane
}

; 128-bit variant of vld1dupi8_preinc: the result is <16 x i8> and the
; expected register list names two D registers.
define <16 x i8> @vld1dupqi8_preinc(i8** noalias nocapture %a, i32 %b) nounwind {
entry:
;CHECK-LABEL: vld1dupqi8_preinc:
;CHECK: vld1.8 {d16[], d17[]}, [{{r[0-9]+|lr}}]
  %0 = load i8*, i8** %a, align 4
  %add.ptr = getelementptr inbounds i8, i8* %0, i32 %b
  %1 = load i8, i8* %add.ptr, align 1
  %2 = insertelement <16 x i8> undef, i8 %1, i32 0
  %lane = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
  store i8* %add.ptr, i8** %a, align 4
  ret <16 x i8> %lane
}

; 128-bit variant of vld1dupi8_postinc_fixed.
define <16 x i8> @vld1dupqi8_postinc_fixed(i8** noalias nocapture %a) nounwind {
entry:
;CHECK-LABEL: vld1dupqi8_postinc_fixed:
;CHECK: vld1.8 {d16[], d17[]}, [{{r[0-9]+|lr}}]!
  %0 = load i8*, i8** %a, align 4
  %1 = load i8, i8* %0, align 1
  %2 = insertelement <16 x i8> undef, i8 %1, i32 0
  %lane = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
  %add.ptr = getelementptr inbounds i8, i8* %0, i32 1
  store i8* %add.ptr, i8** %a, align 4
  ret <16 x i8> %lane
}

; 128-bit variant of vld1dupi8_postinc_register.
define <16 x i8> @vld1dupqi8_postinc_register(i8** noalias nocapture %a, i32 %n) nounwind {
entry:
;CHECK-LABEL: vld1dupqi8_postinc_register:
;CHECK: vld1.8 {d16[], d17[]}, [{{r[0-9]+|lr}}], r1
  %0 = load i8*, i8** %a, align 4
  %1 = load i8, i8* %0, align 1
  %2 = insertelement <16 x i8> undef, i8 %1, i32 0
  %lane = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
  %add.ptr = getelementptr inbounds i8, i8* %0, i32 %n
  store i8* %add.ptr, i8** %a, align 4
  ret <16 x i8> %lane
}

define <4 x i16> @vld1dupi16(i16* %A) nounwind {
;CHECK-LABEL: vld1dupi16:
;Check the alignment value. Max for this instruction is 16 bits:
;CHECK: vld1.16 {d16[]}, [{{r[0-9]+|lr}}:16]
  %tmp1 = load i16, i16* %A, align 8
  %tmp2 = insertelement <4 x i16> undef, i16 %tmp1, i32 0
  %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> undef, <4 x i32> zeroinitializer
  ret <4 x i16> %tmp3
}

; The i16 load is only 1-byte aligned, so the expected address operand has
; no ":16" alignment qualifier.
define <4 x i16> @vld1dupi16_misaligned(i16* %A) nounwind {
;CHECK-LABEL: vld1dupi16_misaligned:
;CHECK: vld1.16 {d16[]}, [{{r[0-9]+|lr}}]
  %tmp1 = load i16, i16* %A, align 1
  %tmp2 = insertelement <4 x i16> undef, i16 %tmp1, i32 0
  %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> undef, <4 x i32> zeroinitializer
  ret <4 x i16> %tmp3
}

; This sort of looks like a vld1dup, but there's an extension in the way.
define <4 x i16> @load_i16_dup_zext(i8* %A) nounwind {
; The loaded i8 is zero-extended to i16 before the splat, so the expected
; code is a scalar byte load followed by a vdup from the core register.
;CHECK-LABEL: load_i16_dup_zext:
;CHECK: ldrb r0, [{{r[0-9]+|lr}}]
;CHECK-NEXT: vdup.16 d16, r0
  %tmp1 = load i8, i8* %A, align 1
  %tmp2 = zext i8 %tmp1 to i16
  %tmp3 = insertelement <4 x i16> undef, i16 %tmp2, i32 0
  %tmp4 = shufflevector <4 x i16> %tmp3, <4 x i16> undef, <4 x i32> zeroinitializer
  ret <4 x i16> %tmp4
}

; This sort of looks like a vld1dup, but there's an extension in the way.
define <4 x i16> @load_i16_dup_sext(i8* %A) nounwind {
;CHECK-LABEL: load_i16_dup_sext:
;CHECK: ldrsb r0, [{{r[0-9]+|lr}}]
;CHECK-NEXT: vdup.16 d16, r0
  %tmp1 = load i8, i8* %A, align 1
  %tmp2 = sext i8 %tmp1 to i16
  %tmp3 = insertelement <4 x i16> undef, i16 %tmp2, i32 0
  %tmp4 = shufflevector <4 x i16> %tmp3, <4 x i16> undef, <4 x i32> zeroinitializer
  ret <4 x i16> %tmp4
}

; This sort of looks like a vld1dup, but there's an extension in the way.
define <8 x i16> @load_i16_dupq_zext(i8* %A) nounwind {
;CHECK-LABEL: load_i16_dupq_zext:
;CHECK: ldrb r0, [{{r[0-9]+|lr}}]
;CHECK-NEXT: vdup.16 q8, r0
  %tmp1 = load i8, i8* %A, align 1
  %tmp2 = zext i8 %tmp1 to i16
  %tmp3 = insertelement <8 x i16> undef, i16 %tmp2, i32 0
  %tmp4 = shufflevector <8 x i16> %tmp3, <8 x i16> undef, <8 x i32> zeroinitializer
  ret <8 x i16> %tmp4
}

define <2 x i32> @vld1dupi32(i32* %A) nounwind {
;CHECK-LABEL: vld1dupi32:
;Check the alignment value. Max for this instruction is 32 bits:
;CHECK: vld1.32 {d16[]}, [{{r[0-9]+|lr}}:32]
  %tmp1 = load i32, i32* %A, align 8
  %tmp2 = insertelement <2 x i32> undef, i32 %tmp1, i32 0
  %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> undef, <2 x i32> zeroinitializer
  ret <2 x i32> %tmp3
}

; This sort of looks like a vld1dup, but there's an extension in the way.
define <4 x i32> @load_i32_dup_zext(i8* %A) nounwind {
; The loaded i8 is zero-extended to i32 before the splat, so the expected
; code is a scalar byte load followed by a vdup from the core register.
;CHECK-LABEL: load_i32_dup_zext:
;CHECK: ldrb r0, [{{r[0-9]+|lr}}]
;CHECK-NEXT: vdup.32 q8, r0
  %tmp1 = load i8, i8* %A, align 1
  %tmp2 = zext i8 %tmp1 to i32
  %tmp3 = insertelement <4 x i32> undef, i32 %tmp2, i32 0
  %tmp4 = shufflevector <4 x i32> %tmp3, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %tmp4
}

; This sort of looks like a vld1dup, but there's an extension in the way.
define <4 x i32> @load_i32_dup_sext(i8* %A) nounwind {
;CHECK-LABEL: load_i32_dup_sext:
;CHECK: ldrsb r0, [{{r[0-9]+|lr}}]
;CHECK-NEXT: vdup.32 q8, r0
  %tmp1 = load i8, i8* %A, align 1
  %tmp2 = sext i8 %tmp1 to i32
  %tmp3 = insertelement <4 x i32> undef, i32 %tmp2, i32 0
  %tmp4 = shufflevector <4 x i32> %tmp3, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %tmp4
}

; Float element splat into a 64-bit vector; the load carries no explicit
; alignment and the expected address operand is qualified with ":32".
define <2 x float> @vld1dupf(float* %A) nounwind {
;CHECK-LABEL: vld1dupf:
;CHECK: vld1.32 {d16[]}, [{{r[0-9]+|lr}}:32]
  %tmp0 = load float, float* %A
  %tmp1 = insertelement <2 x float> undef, float %tmp0, i32 0
  %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> zeroinitializer
  ret <2 x float> %tmp2
}

define <16 x i8> @vld1dupQi8(i8* %A) nounwind {
;CHECK-LABEL: vld1dupQi8:
;Check the (default) alignment value.
;CHECK: vld1.8 {d16[], d17[]}, [{{r[0-9]+|lr}}]
  %tmp1 = load i8, i8* %A, align 8
  %tmp2 = insertelement <16 x i8> undef, i8 %tmp1, i32 0
  %tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <16 x i32> zeroinitializer
  ret <16 x i8> %tmp3
}

; 128-bit variant of vld1dupf: the expected register list names two D registers.
define <4 x float> @vld1dupQf(float* %A) nounwind {
;CHECK-LABEL: vld1dupQf:
;CHECK: vld1.32 {d16[], d17[]}, [{{r[0-9]+|lr}}:32]
  %tmp0 = load float, float* %A
  %tmp1 = insertelement <4 x float> undef, float %tmp0, i32 0
  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %tmp2
}

; Two-vector aggregates returned by the vld2lane intrinsics declared below.
%struct.__neon_int8x8x2_t = type { <8 x i8>, <8 x i8> }
%struct.__neon_int4x16x2_t = type { <4 x i16>, <4 x i16> }
%struct.__neon_int2x32x2_t = type { <2 x i32>, <2 x i32> }

; The vld2 tests call a vld2lane intrinsic on lane 0 with undef input vectors
; and then splat lane 0 of each result; the expected output is a single vld2
; with the all-lanes register syntax.
define <8 x i8> @vld2dupi8(i8* %A) nounwind {
;CHECK-LABEL: vld2dupi8:
;Check the (default) alignment value.
;CHECK: vld2.8 {d16[], d17[]}, [{{r[0-9]+|lr}}]
  %tmp0 = tail call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %A, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
  %tmp1 = extractvalue %struct.__neon_int8x8x2_t %tmp0, 0
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
  %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp0, 1
  %tmp4 = shufflevector <8 x i8> %tmp3, <8 x i8> undef, <8 x i32> zeroinitializer
  %tmp5 = add <8 x i8> %tmp2, %tmp4
  ret <8 x i8> %tmp5
}

; The pointer read from %a is advanced by %b before the vld2lane call, the
; advanced pointer is stored back, and both splats are written to %agg.result.
define void @vld2dupi8_preinc(%struct.__neon_int8x8x2_t* noalias nocapture sret %agg.result, i8** noalias nocapture %a, i32 %b) nounwind {
;CHECK-LABEL: vld2dupi8_preinc:
;CHECK: vld2.8 {d16[], d17[]}, [r2]
entry:
  %0 = load i8*, i8** %a, align 4
  %add.ptr = getelementptr inbounds i8, i8* %0, i32 %b
  %vld_dup = tail call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %add.ptr, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
  %1 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 0
  %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
  %2 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 1
  %lane1 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
  store i8* %add.ptr, i8** %a, align 4
  %r8 = getelementptr inbounds %struct.__neon_int8x8x2_t, %struct.__neon_int8x8x2_t* %agg.result, i32 0, i32 0
  store <8 x i8> %lane, <8 x i8>* %r8, align 8
  %r11 = getelementptr inbounds %struct.__neon_int8x8x2_t, %struct.__neon_int8x8x2_t* %agg.result, i32 0, i32 1
  store <8 x i8> %lane1, <8 x i8>* %r11, align 8
  ret void
}

; The pointer is advanced by 2 bytes after the vld2lane call; the expected
; instruction uses the "!" writeback form.
define void @vld2dupi8_postinc_fixed(%struct.__neon_int8x8x2_t* noalias nocapture sret %agg.result, i8** noalias nocapture %a) nounwind {
entry:
;CHECK-LABEL: vld2dupi8_postinc_fixed:
;CHECK: vld2.8 {d16[], d17[]}, [r2]!
  %0 = load i8*, i8** %a, align 4
  %vld_dup = tail call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %0, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
  %1 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 0
  %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
  %2 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 1
  %lane1 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
  %add.ptr = getelementptr inbounds i8, i8* %0, i32 2
  store i8* %add.ptr, i8** %a, align 4
  %r7 = getelementptr inbounds %struct.__neon_int8x8x2_t, %struct.__neon_int8x8x2_t* %agg.result, i32 0, i32 0
  store <8 x i8> %lane, <8 x i8>* %r7, align 8
  %r10 = getelementptr inbounds %struct.__neon_int8x8x2_t, %struct.__neon_int8x8x2_t* %agg.result, i32 0, i32 1
  store <8 x i8> %lane1, <8 x i8>* %r10, align 8
  ret void
}

; The pointer is advanced by the run-time value %n after the vld2lane call;
; the expected instruction takes r2 as the increment.
define void @vld2dupi8_postinc_variable(%struct.__neon_int8x8x2_t* noalias nocapture sret %agg.result, i8** noalias nocapture %a, i32 %n) nounwind {
entry:
;CHECK-LABEL: vld2dupi8_postinc_variable:
;CHECK: vld2.8 {d16[], d17[]}, [r3], r2
  %0 = load i8*, i8** %a, align 4
  %vld_dup = tail call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %0, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
  %1 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 0
  %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
  %2 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 1
  %lane1 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
  %add.ptr = getelementptr inbounds i8, i8* %0, i32 %n
  store i8* %add.ptr, i8** %a, align 4
  %r7 = getelementptr inbounds %struct.__neon_int8x8x2_t, %struct.__neon_int8x8x2_t* %agg.result, i32 0, i32 0
  store <8 x i8> %lane, <8 x i8>* %r7, align 8
  %r10 = getelementptr inbounds %struct.__neon_int8x8x2_t, %struct.__neon_int8x8x2_t* %agg.result, i32 0, i32 1
  store <8 x i8> %lane1, <8 x i8>* %r10, align 8
  ret void
}

define <4 x i16> @vld2dupi16(i8* %A) nounwind {
;CHECK-LABEL: vld2dupi16:
;Check that a power-of-two alignment smaller than the total size of the memory
;being loaded is ignored.
;CHECK: vld2.16 {d16[], d17[]}, [{{r[0-9]+|lr}}]
  %tmp0 = tail call %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* %A, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
  %tmp1 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 0
  %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
  %tmp3 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 1
  %tmp4 = shufflevector <4 x i16> %tmp3, <4 x i16> undef, <4 x i32> zeroinitializer
  %tmp5 = add <4 x i16> %tmp2, %tmp4
  ret <4 x i16> %tmp5
}

;Check for a post-increment updating load.
define <4 x i16> @vld2dupi16_update(i16** %ptr) nounwind {
;CHECK-LABEL: vld2dupi16_update:
;CHECK: vld2.16 {d16[], d17[]}, [{{r[0-9]+|lr}}]!
  %A = load i16*, i16** %ptr
  %A2 = bitcast i16* %A to i8*
  %tmp0 = tail call %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* %A2, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
  %tmp1 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 0
  %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
  %tmp3 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 1
  %tmp4 = shufflevector <4 x i16> %tmp3, <4 x i16> undef, <4 x i32> zeroinitializer
  %tmp5 = add <4 x i16> %tmp2, %tmp4
  %tmp6 = getelementptr i16, i16* %A, i32 2
  store i16* %tmp6, i16** %ptr
  ret <4 x i16> %tmp5
}

; The pointer is advanced by 3 i16 elements (6 bytes) rather than the 2
; elements used in vld2dupi16_update, so the increment is expected to be
; materialized in a register (mov #6) and used as a register post-increment.
define <4 x i16> @vld2dupi16_odd_update(i16** %ptr) nounwind {
;CHECK-LABEL: vld2dupi16_odd_update:
;CHECK: mov [[INC:r[0-9]+]], #6
;CHECK: vld2.16 {d16[], d17[]}, [{{r[0-9]+|lr}}], [[INC]]
  %A = load i16*, i16** %ptr
  %A2 = bitcast i16* %A to i8*
  %tmp0 = tail call %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* %A2, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
  %tmp1 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 0
  %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
  %tmp3 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 1
  %tmp4 = shufflevector <4 x i16> %tmp3, <4 x i16> undef, <4 x i32> zeroinitializer
  %tmp5 = add <4 x i16> %tmp2, %tmp4
  %tmp6 = getelementptr i16, i16* %A, i32 3
  store i16* %tmp6, i16** %ptr
  ret <4 x i16> %tmp5
}

define <2 x i32> @vld2dupi32(i8* %A) nounwind {
;CHECK-LABEL: vld2dupi32:
;Check the alignment value. Max for this instruction is 64 bits:
;CHECK: vld2.32 {d16[], d17[]}, [{{r[0-9]+|lr}}:64]
  %tmp0 = tail call %struct.__neon_int2x32x2_t @llvm.arm.neon.vld2lane.v2i32.p0i8(i8* %A, <2 x i32> undef, <2 x i32> undef, i32 0, i32 16)
  %tmp1 = extractvalue %struct.__neon_int2x32x2_t %tmp0, 0
  %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
  %tmp3 = extractvalue %struct.__neon_int2x32x2_t %tmp0, 1
  %tmp4 = shufflevector <2 x i32> %tmp3, <2 x i32> undef, <2 x i32> zeroinitializer
  %tmp5 = add <2 x i32> %tmp2, %tmp4
  ret <2 x i32> %tmp5
}

declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
declare %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16.p0i8(i8*, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int2x32x2_t @llvm.arm.neon.vld2lane.v2i32.p0i8(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly

; Three-vector aggregates returned by the vld3lane intrinsics.
%struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
%struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> }

;Check for a post-increment updating load with register increment.
define <8 x i8> @vld3dupi8_update(i8** %ptr, i32 %inc) nounwind {
; Lane 0 of each of the three vld3lane results is splatted, and the pointer
; is advanced by the run-time value %inc after the load.
;CHECK-LABEL: vld3dupi8_update:
;CHECK: vld3.8 {d16[], d17[], d18[]}, [{{r[0-9]+|lr}}], r1
  %A = load i8*, i8** %ptr
  %tmp0 = tail call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8.p0i8(i8* %A, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 8)
  %tmp1 = extractvalue %struct.__neon_int8x8x3_t %tmp0, 0
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
  %tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp0, 1
  %tmp4 = shufflevector <8 x i8> %tmp3, <8 x i8> undef, <8 x i32> zeroinitializer
  %tmp5 = extractvalue %struct.__neon_int8x8x3_t %tmp0, 2
  %tmp6 = shufflevector <8 x i8> %tmp5, <8 x i8> undef, <8 x i32> zeroinitializer
  %tmp7 = add <8 x i8> %tmp2, %tmp4
  %tmp8 = add <8 x i8> %tmp7, %tmp6
  %tmp9 = getelementptr i8, i8* %A, i32 %inc
  store i8* %tmp9, i8** %ptr
  ret <8 x i8> %tmp8
}

define <4 x i16> @vld3dupi16(i8* %A) nounwind {
;CHECK-LABEL: vld3dupi16:
;Check the (default) alignment value. VLD3 does not support alignment.
;CHECK: vld3.16 {d16[], d17[], d18[]}, [{{r[0-9]+|lr}}]
  %tmp0 = tail call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* %A, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 8)
  %tmp1 = extractvalue %struct.__neon_int16x4x3_t %tmp0, 0
  %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
  %tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp0, 1
  %tmp4 = shufflevector <4 x i16> %tmp3, <4 x i16> undef, <4 x i32> zeroinitializer
  %tmp5 = extractvalue %struct.__neon_int16x4x3_t %tmp0, 2
  %tmp6 = shufflevector <4 x i16> %tmp5, <4 x i16> undef, <4 x i32> zeroinitializer
  %tmp7 = add <4 x i16> %tmp2, %tmp4
  %tmp8 = add <4 x i16> %tmp7, %tmp6
  ret <4 x i16> %tmp8
}

declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8.p0i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16.p0i8(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly

; Four-vector aggregates returned by the vld4lane intrinsics declared below.
%struct.__neon_int16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }
%struct.__neon_int32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }

;Check for a post-increment updating load.
define <4 x i16> @vld4dupi16_update(i16** %ptr) nounwind {
;CHECK-LABEL: vld4dupi16_update:
;CHECK: vld4.16 {d16[], d17[], d18[], d19[]}, [{{r[0-9]+|lr}}]!
  %A = load i16*, i16** %ptr
  %A2 = bitcast i16* %A to i8*
  %tmp0 = tail call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* %A2, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 1)
  %tmp1 = extractvalue %struct.__neon_int16x4x4_t %tmp0, 0
  %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
  %tmp3 = extractvalue %struct.__neon_int16x4x4_t %tmp0, 1
  %tmp4 = shufflevector <4 x i16> %tmp3, <4 x i16> undef, <4 x i32> zeroinitializer
  %tmp5 = extractvalue %struct.__neon_int16x4x4_t %tmp0, 2
  %tmp6 = shufflevector <4 x i16> %tmp5, <4 x i16> undef, <4 x i32> zeroinitializer
  %tmp7 = extractvalue %struct.__neon_int16x4x4_t %tmp0, 3
  %tmp8 = shufflevector <4 x i16> %tmp7, <4 x i16> undef, <4 x i32> zeroinitializer
  %tmp9 = add <4 x i16> %tmp2, %tmp4
  %tmp10 = add <4 x i16> %tmp6, %tmp8
  %tmp11 = add <4 x i16> %tmp9, %tmp10
  %tmp12 = getelementptr i16, i16* %A, i32 4
  store i16* %tmp12, i16** %ptr
  ret <4 x i16> %tmp11
}

define <2 x i32> @vld4dupi32(i8* %A) nounwind {
;CHECK-LABEL: vld4dupi32:
;Check the alignment value. An 8-byte alignment is allowed here even though
;it is smaller than the total size of the memory being loaded.
;CHECK: vld4.32 {d16[], d17[], d18[], d19[]}, [{{r[0-9]+|lr}}:64]
  %tmp0 = tail call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32.p0i8(i8* %A, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 8)
  %tmp1 = extractvalue %struct.__neon_int32x2x4_t %tmp0, 0
  %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
  %tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp0, 1
  %tmp4 = shufflevector <2 x i32> %tmp3, <2 x i32> undef, <2 x i32> zeroinitializer
  %tmp5 = extractvalue %struct.__neon_int32x2x4_t %tmp0, 2
  %tmp6 = shufflevector <2 x i32> %tmp5, <2 x i32> undef, <2 x i32> zeroinitializer
  %tmp7 = extractvalue %struct.__neon_int32x2x4_t %tmp0, 3
  %tmp8 = shufflevector <2 x i32> %tmp7, <2 x i32> undef, <2 x i32> zeroinitializer
  %tmp9 = add <2 x i32> %tmp2, %tmp4
  %tmp10 = add <2 x i32> %tmp6, %tmp8
  %tmp11 = add <2 x i32> %tmp9, %tmp10
  ret <2 x i32> %tmp11
}

declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16.p0i8(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32.p0i8(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly