; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s

; Two-way interleaving stores.
;
; Every function below has the same shape:
;   * load one vector from %src (element index 0) and a second one from the
;     slot directly after it (element index 1),
;   * interleave the two lane by lane with a shufflevector
;     (result = a0, b0, a1, b1, ...),
;   * store the double-width result to %dst.
;
; The expected assembly is machine generated (see the NOTE on the first line);
; regenerate it with the script rather than editing it by hand.

; i32 elements

; 2 x <2 x i32> -> <4 x i32>. The assertions contain no vst20/vst21 here; the
; result is assembled with scalar loads and lane moves, then stored with vstrw.
define void @vst2_v2i32(<2 x i32> *%src, <4 x i32> *%dst) {
; CHECK-LABEL: vst2_v2i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrd r2, r12, [r0]
; CHECK-NEXT:    ldrd r3, r0, [r0, #8]
; CHECK-NEXT:    vmov.32 q0[0], r2
; CHECK-NEXT:    vmov.32 q1[0], r3
; CHECK-NEXT:    vmov.32 q0[2], r12
; CHECK-NEXT:    vmov.f64 d4, d1
; CHECK-NEXT:    vmov.32 q1[2], r0
; CHECK-NEXT:    vmov.f32 s9, s3
; CHECK-NEXT:    vmov.f32 s2, s4
; CHECK-NEXT:    vmov.f32 s3, s5
; CHECK-NEXT:    vmov.f32 s10, s6
; CHECK-NEXT:    vmov.f32 s1, s2
; CHECK-NEXT:    vmov.f32 s11, s7
; CHECK-NEXT:    vmov.f32 s2, s8
; CHECK-NEXT:    vmov.f32 s3, s10
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %p0 = getelementptr <2 x i32>, <2 x i32>* %src, i32 0
  %v0 = load <2 x i32>, <2 x i32>* %p0, align 4
  %p1 = getelementptr <2 x i32>, <2 x i32>* %src, i32 1
  %v1 = load <2 x i32>, <2 x i32>* %p1, align 4
  %zip = shufflevector <2 x i32> %v0, <2 x i32> %v1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  store <4 x i32> %zip, <4 x i32>* %dst
  ret void
}

; 2 x <4 x i32> -> <8 x i32>. The assertions expect one vst20.32/vst21.32 pair.
define void @vst2_v4i32(<4 x i32> *%src, <8 x i32> *%dst) {
; CHECK-LABEL: vst2_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vst20.32 {q0, q1}, [r1]
; CHECK-NEXT:    vst21.32 {q0, q1}, [r1]
; CHECK-NEXT:    bx lr
entry:
  %p0 = getelementptr <4 x i32>, <4 x i32>* %src, i32 0
  %v0 = load <4 x i32>, <4 x i32>* %p0, align 4
  %p1 = getelementptr <4 x i32>, <4 x i32>* %src, i32 1
  %v1 = load <4 x i32>, <4 x i32>* %p1, align 4
  %zip = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  store <8 x i32> %zip, <8 x i32>* %dst
  ret void
}

; 2 x <8 x i32> -> <16 x i32>. The assertions expect two vst20.32/vst21.32
; pairs, the first vst21 writing back to r1.
define void @vst2_v8i32(<8 x i32> *%src, <16 x i32> *%dst) {
; CHECK-LABEL: vst2_v8i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
; CHECK-NEXT:    vst20.32 {q0, q1}, [r1]
; CHECK-NEXT:    vst21.32 {q0, q1}, [r1]!
; CHECK-NEXT:    vst20.32 {q2, q3}, [r1]
; CHECK-NEXT:    vst21.32 {q2, q3}, [r1]
; CHECK-NEXT:    bx lr
entry:
  %p0 = getelementptr <8 x i32>, <8 x i32>* %src, i32 0
  %v0 = load <8 x i32>, <8 x i32>* %p0, align 4
  %p1 = getelementptr <8 x i32>, <8 x i32>* %src, i32 1
  %v1 = load <8 x i32>, <8 x i32>* %p1, align 4
  %zip = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  store <16 x i32> %zip, <16 x i32>* %dst
  ret void
}

; 2 x <16 x i32> -> <32 x i32>. The assertions expect four vst20.32/vst21.32
; pairs and a save/restore of d8-d15.
define void @vst2_v16i32(<16 x i32> *%src, <32 x i32> *%dst) {
; CHECK-LABEL: vst2_v16i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vldrw.u32 q7, [r0, #64]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #112]
; CHECK-NEXT:    vldrw.u32 q3, [r0, #96]
; CHECK-NEXT:    vldrw.u32 q5, [r0, #80]
; CHECK-NEXT:    vldrw.u32 q6, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q4, [r0, #16]
; CHECK-NEXT:    vst20.32 {q6, q7}, [r1]
; CHECK-NEXT:    add.w r0, r1, #96
; CHECK-NEXT:    add.w r2, r1, #64
; CHECK-NEXT:    vst21.32 {q6, q7}, [r1]!
; CHECK-NEXT:    vst20.32 {q4, q5}, [r1]
; CHECK-NEXT:    vst21.32 {q4, q5}, [r1]
; CHECK-NEXT:    vst20.32 {q2, q3}, [r2]
; CHECK-NEXT:    vst21.32 {q2, q3}, [r2]
; CHECK-NEXT:    vst20.32 {q0, q1}, [r0]
; CHECK-NEXT:    vst21.32 {q0, q1}, [r0]
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    bx lr
entry:
  %p0 = getelementptr <16 x i32>, <16 x i32>* %src, i32 0
  %v0 = load <16 x i32>, <16 x i32>* %p0, align 4
  %p1 = getelementptr <16 x i32>, <16 x i32>* %src, i32 1
  %v1 = load <16 x i32>, <16 x i32>* %p1, align 4
  %zip = shufflevector <16 x i32> %v0, <16 x i32> %v1, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  store <32 x i32> %zip, <32 x i32>* %dst
  ret void
}

; i16 elements

; 2 x <2 x i16> -> <4 x i16>. No vst20/vst21 in the assertions: four ldrh,
; four lane inserts and a narrowing vstrh.32.
define void @vst2_v2i16(<2 x i16> *%src, <4 x i16> *%dst) {
; CHECK-LABEL: vst2_v2i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrh r3, [r0]
; CHECK-NEXT:    ldrh r2, [r0, #4]
; CHECK-NEXT:    vmov.32 q0[0], r3
; CHECK-NEXT:    ldrh.w r12, [r0, #6]
; CHECK-NEXT:    ldrh r0, [r0, #2]
; CHECK-NEXT:    vmov.32 q0[1], r2
; CHECK-NEXT:    vmov.32 q0[2], r0
; CHECK-NEXT:    vmov.32 q0[3], r12
; CHECK-NEXT:    vstrh.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %p0 = getelementptr <2 x i16>, <2 x i16>* %src, i32 0
  %v0 = load <2 x i16>, <2 x i16>* %p0, align 4
  %p1 = getelementptr <2 x i16>, <2 x i16>* %src, i32 1
  %v1 = load <2 x i16>, <2 x i16>* %p1, align 4
  %zip = shufflevector <2 x i16> %v0, <2 x i16> %v1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  store <4 x i16> %zip, <4 x i16>* %dst
  ret void
}

; 2 x <4 x i16> -> <8 x i16>. No vst20/vst21 in the assertions: two widening
; loads combined with vmovnt.i32.
define void @vst2_v4i16(<4 x i16> *%src, <8 x i16> *%dst) {
; CHECK-LABEL: vst2_v4i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u32 q0, [r0, #8]
; CHECK-NEXT:    vldrh.u32 q1, [r0]
; CHECK-NEXT:    vmovnt.i32 q1, q0
; CHECK-NEXT:    vstrw.32 q1, [r1]
; CHECK-NEXT:    bx lr
entry:
  %p0 = getelementptr <4 x i16>, <4 x i16>* %src, i32 0
  %v0 = load <4 x i16>, <4 x i16>* %p0, align 4
  %p1 = getelementptr <4 x i16>, <4 x i16>* %src, i32 1
  %v1 = load <4 x i16>, <4 x i16>* %p1, align 4
  %zip = shufflevector <4 x i16> %v0, <4 x i16> %v1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  store <8 x i16> %zip, <8 x i16>* %dst
  ret void
}

; 2 x <8 x i16> -> <16 x i16>. The assertions expect one vst20.16/vst21.16 pair.
define void @vst2_v8i16(<8 x i16> *%src, <16 x i16> *%dst) {
; CHECK-LABEL: vst2_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vst20.16 {q0, q1}, [r1]
; CHECK-NEXT:    vst21.16 {q0, q1}, [r1]
; CHECK-NEXT:    bx lr
entry:
  %p0 = getelementptr <8 x i16>, <8 x i16>* %src, i32 0
  %v0 = load <8 x i16>, <8 x i16>* %p0, align 4
  %p1 = getelementptr <8 x i16>, <8 x i16>* %src, i32 1
  %v1 = load <8 x i16>, <8 x i16>* %p1, align 4
  %zip = shufflevector <8 x i16> %v0, <8 x i16> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  store <16 x i16> %zip, <16 x i16>* %dst
  ret void
}

; 2 x <16 x i16> -> <32 x i16>. The assertions expect two vst20.16/vst21.16
; pairs, the first vst21 writing back to r1.
define void @vst2_v16i16(<16 x i16> *%src, <32 x i16> *%dst) {
; CHECK-LABEL: vst2_v16i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
; CHECK-NEXT:    vst20.16 {q0, q1}, [r1]
; CHECK-NEXT:    vst21.16 {q0, q1}, [r1]!
; CHECK-NEXT:    vst20.16 {q2, q3}, [r1]
; CHECK-NEXT:    vst21.16 {q2, q3}, [r1]
; CHECK-NEXT:    bx lr
entry:
  %p0 = getelementptr <16 x i16>, <16 x i16>* %src, i32 0
  %v0 = load <16 x i16>, <16 x i16>* %p0, align 4
  %p1 = getelementptr <16 x i16>, <16 x i16>* %src, i32 1
  %v1 = load <16 x i16>, <16 x i16>* %p1, align 4
  %zip = shufflevector <16 x i16> %v0, <16 x i16> %v1, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  store <32 x i16> %zip, <32 x i16>* %dst
  ret void
}

; i8 elements

; 2 x <2 x i8> -> <4 x i8>. No vst20/vst21 in the assertions: four ldrb,
; four lane inserts and a narrowing vstrb.32.
define void @vst2_v2i8(<2 x i8> *%src, <4 x i8> *%dst) {
; CHECK-LABEL: vst2_v2i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrb r2, [r0]
; CHECK-NEXT:    ldrb r3, [r0, #2]
; CHECK-NEXT:    vmov.32 q0[0], r2
; CHECK-NEXT:    ldrb.w r12, [r0, #1]
; CHECK-NEXT:    vmov.32 q0[1], r3
; CHECK-NEXT:    ldrb r0, [r0, #3]
; CHECK-NEXT:    vmov.32 q0[2], r12
; CHECK-NEXT:    vmov.32 q0[3], r0
; CHECK-NEXT:    vstrb.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %p0 = getelementptr <2 x i8>, <2 x i8>* %src, i32 0
  %v0 = load <2 x i8>, <2 x i8>* %p0, align 4
  %p1 = getelementptr <2 x i8>, <2 x i8>* %src, i32 1
  %v1 = load <2 x i8>, <2 x i8>* %p1, align 4
  %zip = shufflevector <2 x i8> %v0, <2 x i8> %v1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  store <4 x i8> %zip, <4 x i8>* %dst
  ret void
}

; 2 x <4 x i8> -> <8 x i8>. No vst20/vst21 in the assertions: two widening
; loads combined with vmovnt.i32, stored with a narrowing vstrb.16.
define void @vst2_v4i8(<4 x i8> *%src, <8 x i8> *%dst) {
; CHECK-LABEL: vst2_v4i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u32 q0, [r0, #4]
; CHECK-NEXT:    vldrb.u32 q1, [r0]
; CHECK-NEXT:    vmovnt.i32 q1, q0
; CHECK-NEXT:    vstrb.16 q1, [r1]
; CHECK-NEXT:    bx lr
entry:
  %p0 = getelementptr <4 x i8>, <4 x i8>* %src, i32 0
  %v0 = load <4 x i8>, <4 x i8>* %p0, align 4
  %p1 = getelementptr <4 x i8>, <4 x i8>* %src, i32 1
  %v1 = load <4 x i8>, <4 x i8>* %p1, align 4
  %zip = shufflevector <4 x i8> %v0, <4 x i8> %v1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  store <8 x i8> %zip, <8 x i8>* %dst
  ret void
}

; 2 x <8 x i8> -> <16 x i8>. No vst20/vst21 in the assertions: two widening
; loads combined with vmovnt.i16.
define void @vst2_v8i8(<8 x i8> *%src, <16 x i8> *%dst) {
; CHECK-LABEL: vst2_v8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q0, [r0, #8]
; CHECK-NEXT:    vldrb.u16 q1, [r0]
; CHECK-NEXT:    vmovnt.i16 q1, q0
; CHECK-NEXT:    vstrw.32 q1, [r1]
; CHECK-NEXT:    bx lr
entry:
  %p0 = getelementptr <8 x i8>, <8 x i8>* %src, i32 0
  %v0 = load <8 x i8>, <8 x i8>* %p0, align 4
  %p1 = getelementptr <8 x i8>, <8 x i8>* %src, i32 1
  %v1 = load <8 x i8>, <8 x i8>* %p1, align 4
  %zip = shufflevector <8 x i8> %v0, <8 x i8> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  store <16 x i8> %zip, <16 x i8>* %dst
  ret void
}

; 2 x <16 x i8> -> <32 x i8>. The assertions expect one vst20.8/vst21.8 pair.
define void @vst2_v16i8(<16 x i8> *%src, <32 x i8> *%dst) {
; CHECK-LABEL: vst2_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vst20.8 {q0, q1}, [r1]
; CHECK-NEXT:    vst21.8 {q0, q1}, [r1]
; CHECK-NEXT:    bx lr
entry:
  %p0 = getelementptr <16 x i8>, <16 x i8>* %src, i32 0
  %v0 = load <16 x i8>, <16 x i8>* %p0, align 4
  %p1 = getelementptr <16 x i8>, <16 x i8>* %src, i32 1
  %v1 = load <16 x i8>, <16 x i8>* %p1, align 4
  %zip = shufflevector <16 x i8> %v0, <16 x i8> %v1, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  store <32 x i8> %zip, <32 x i8>* %dst
  ret void
}

; i64 elements

; 2 x <2 x i64> -> <4 x i64>. No vst20/vst21 in the assertions: register
; moves followed by two plain vector stores.
define void @vst2_v2i64(<2 x i64> *%src, <4 x i64> *%dst) {
; CHECK-LABEL: vst2_v2i64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmov.f64 d4, d2
; CHECK-NEXT:    vmov.f32 s9, s5
; CHECK-NEXT:    vmov.f32 s10, s0
; CHECK-NEXT:    vmov.f32 s11, s1
; CHECK-NEXT:    vmov.f32 s0, s6
; CHECK-NEXT:    vstrb.8 q2, [r1], #16
; CHECK-NEXT:    vmov.f32 s1, s7
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %p0 = getelementptr <2 x i64>, <2 x i64>* %src, i32 0
  %v0 = load <2 x i64>, <2 x i64>* %p0, align 4
  %p1 = getelementptr <2 x i64>, <2 x i64>* %src, i32 1
  %v1 = load <2 x i64>, <2 x i64>* %p1, align 4
  %zip = shufflevector <2 x i64> %v0, <2 x i64> %v1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  store <4 x i64> %zip, <4 x i64>* %dst
  ret void
}

; 2 x <4 x i64> -> <8 x i64>. No vst20/vst21 in the assertions: register
; moves followed by four plain vector stores, with d8-d11 saved and restored.
define void @vst2_v4i64(<4 x i64> *%src, <8 x i64> *%dst) {
; CHECK-LABEL: vst2_v4i64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
; CHECK-NEXT:    vmov.f64 d6, d1
; CHECK-NEXT:    vmov.f64 d10, d3
; CHECK-NEXT:    vmov.f32 s13, s3
; CHECK-NEXT:    vmov.f32 s21, s7
; CHECK-NEXT:    vmov.f32 s2, s16
; CHECK-NEXT:    vmov.f32 s6, s8
; CHECK-NEXT:    vmov.f32 s14, s18
; CHECK-NEXT:    vmov.f32 s22, s10
; CHECK-NEXT:    vmov.f32 s3, s17
; CHECK-NEXT:    vmov.f32 s7, s9
; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
; CHECK-NEXT:    vmov.f32 s15, s19
; CHECK-NEXT:    vstrb.8 q1, [r1], #48
; CHECK-NEXT:    vmov.f32 s23, s11
; CHECK-NEXT:    vstrw.32 q3, [r1]
; CHECK-NEXT:    vstrw.32 q5, [r1, #-32]
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    bx lr
entry:
  %p0 = getelementptr <4 x i64>, <4 x i64>* %src, i32 0
  %v0 = load <4 x i64>, <4 x i64>* %p0, align 4
  %p1 = getelementptr <4 x i64>, <4 x i64>* %src, i32 1
  %v1 = load <4 x i64>, <4 x i64>* %p1, align 4
  %zip = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  store <8 x i64> %zip, <8 x i64>* %dst
  ret void
}

; f32 elements

; 2 x <2 x float> -> <4 x float>. No vst20/vst21 in the assertions: four
; scalar vldr, two register moves and one vstrw.
define void @vst2_v2f32(<2 x float> *%src, <4 x float> *%dst) {
; CHECK-LABEL: vst2_v2f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldr s0, [r0]
; CHECK-NEXT:    vldr s4, [r0, #4]
; CHECK-NEXT:    vldr s1, [r0, #8]
; CHECK-NEXT:    vldr s5, [r0, #12]
; CHECK-NEXT:    vmov.f32 s2, s4
; CHECK-NEXT:    vmov.f32 s3, s5
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %p0 = getelementptr <2 x float>, <2 x float>* %src, i32 0
  %v0 = load <2 x float>, <2 x float>* %p0, align 4
  %p1 = getelementptr <2 x float>, <2 x float>* %src, i32 1
  %v1 = load <2 x float>, <2 x float>* %p1, align 4
  %zip = shufflevector <2 x float> %v0, <2 x float> %v1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  store <4 x float> %zip, <4 x float>* %dst
  ret void
}

; 2 x <4 x float> -> <8 x float>. The assertions expect one vst20.32/vst21.32
; pair.
define void @vst2_v4f32(<4 x float> *%src, <8 x float> *%dst) {
; CHECK-LABEL: vst2_v4f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vst20.32 {q0, q1}, [r1]
; CHECK-NEXT:    vst21.32 {q0, q1}, [r1]
; CHECK-NEXT:    bx lr
entry:
  %p0 = getelementptr <4 x float>, <4 x float>* %src, i32 0
  %v0 = load <4 x float>, <4 x float>* %p0, align 4
  %p1 = getelementptr <4 x float>, <4 x float>* %src, i32 1
  %v1 = load <4 x float>, <4 x float>* %p1, align 4
  %zip = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  store <8 x float> %zip, <8 x float>* %dst
  ret void
}

; 2 x <8 x float> -> <16 x float>. The assertions expect two
; vst20.32/vst21.32 pairs, the first vst21 writing back to r1.
define void @vst2_v8f32(<8 x float> *%src, <16 x float> *%dst) {
; CHECK-LABEL: vst2_v8f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
; CHECK-NEXT:    vst20.32 {q0, q1}, [r1]
; CHECK-NEXT:    vst21.32 {q0, q1}, [r1]!
; CHECK-NEXT:    vst20.32 {q2, q3}, [r1]
; CHECK-NEXT:    vst21.32 {q2, q3}, [r1]
; CHECK-NEXT:    bx lr
entry:
  %p0 = getelementptr <8 x float>, <8 x float>* %src, i32 0
  %v0 = load <8 x float>, <8 x float>* %p0, align 4
  %p1 = getelementptr <8 x float>, <8 x float>* %src, i32 1
  %v1 = load <8 x float>, <8 x float>* %p1, align 4
  %zip = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  store <16 x float> %zip, <16 x float>* %dst
  ret void
}

; 2 x <16 x float> -> <32 x float>. The assertions expect four
; vst20.32/vst21.32 pairs and a save/restore of d8-d15.
define void @vst2_v16f32(<16 x float> *%src, <32 x float> *%dst) {
; CHECK-LABEL: vst2_v16f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vldrw.u32 q7, [r0, #64]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #112]
; CHECK-NEXT:    vldrw.u32 q3, [r0, #96]
; CHECK-NEXT:    vldrw.u32 q5, [r0, #80]
; CHECK-NEXT:    vldrw.u32 q6, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q4, [r0, #16]
; CHECK-NEXT:    vst20.32 {q6, q7}, [r1]
; CHECK-NEXT:    add.w r0, r1, #96
; CHECK-NEXT:    add.w r2, r1, #64
; CHECK-NEXT:    vst21.32 {q6, q7}, [r1]!
; CHECK-NEXT:    vst20.32 {q4, q5}, [r1]
; CHECK-NEXT:    vst21.32 {q4, q5}, [r1]
; CHECK-NEXT:    vst20.32 {q2, q3}, [r2]
; CHECK-NEXT:    vst21.32 {q2, q3}, [r2]
; CHECK-NEXT:    vst20.32 {q0, q1}, [r0]
; CHECK-NEXT:    vst21.32 {q0, q1}, [r0]
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    bx lr
entry:
  %p0 = getelementptr <16 x float>, <16 x float>* %src, i32 0
  %v0 = load <16 x float>, <16 x float>* %p0, align 4
  %p1 = getelementptr <16 x float>, <16 x float>* %src, i32 1
  %v1 = load <16 x float>, <16 x float>* %p1, align 4
  %zip = shufflevector <16 x float> %v0, <16 x float> %v1, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  store <32 x float> %zip, <32 x float>* %dst
  ret void
}

; f16 elements

; 2 x <2 x half> -> <4 x half>. No vst20/vst21 in the assertions: the result
; is built lane by lane through core registers and stored with strd.
define void @vst2_v2f16(<2 x half> *%src, <4 x half> *%dst) {
; CHECK-LABEL: vst2_v2f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrd r2, r0, [r0]
; CHECK-NEXT:    vmov.32 q0[0], r2
; CHECK-NEXT:    vmov.32 q1[0], r0
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    vmovx.f16 s0, s0
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov.16 q2[0], r2
; CHECK-NEXT:    vmov.16 q2[1], r0
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    vmovx.f16 s0, s4
; CHECK-NEXT:    vmov.16 q2[2], r0
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    vmov.16 q2[3], r0
; CHECK-NEXT:    vmov r2, s9
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    strd r0, r2, [r1]
; CHECK-NEXT:    bx lr
entry:
  %p0 = getelementptr <2 x half>, <2 x half>* %src, i32 0
  %v0 = load <2 x half>, <2 x half>* %p0, align 4
  %p1 = getelementptr <2 x half>, <2 x half>* %src, i32 1
  %v1 = load <2 x half>, <2 x half>* %p1, align 4
  %zip = shufflevector <2 x half> %v0, <2 x half> %v1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  store <4 x half> %zip, <4 x half>* %dst
  ret void
}

; 2 x <4 x half> -> <8 x half>. No vst20/vst21 in the assertions: the result
; is built lane by lane through core registers and stored with one vstrw.
define void @vst2_v4f16(<4 x half> *%src, <8 x half> *%dst) {
; CHECK-LABEL: vst2_v4f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldm.w r0, {r2, r3, r12}
; CHECK-NEXT:    vmov.32 q0[0], r12
; CHECK-NEXT:    ldr r0, [r0, #12]
; CHECK-NEXT:    vmov.32 q2[0], r2
; CHECK-NEXT:    vmov.32 q2[1], r3
; CHECK-NEXT:    vmov.32 q0[1], r0
; CHECK-NEXT:    vmov r2, s8
; CHECK-NEXT:    vmovx.f16 s12, s8
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    vmov.16 q1[0], r2
; CHECK-NEXT:    vmov.16 q1[1], r0
; CHECK-NEXT:    vmov r0, s12
; CHECK-NEXT:    vmovx.f16 s12, s0
; CHECK-NEXT:    vmov.16 q1[2], r0
; CHECK-NEXT:    vmov r0, s12
; CHECK-NEXT:    vmovx.f16 s8, s9
; CHECK-NEXT:    vmov.16 q1[3], r0
; CHECK-NEXT:    vmov r0, s9
; CHECK-NEXT:    vmov.16 q1[4], r0
; CHECK-NEXT:    vmov r0, s1
; CHECK-NEXT:    vmov.16 q1[5], r0
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmovx.f16 s0, s1
; CHECK-NEXT:    vmov.16 q1[6], r0
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    vmov.16 q1[7], r0
; CHECK-NEXT:    vstrw.32 q1, [r1]
; CHECK-NEXT:    bx lr
entry:
  %p0 = getelementptr <4 x half>, <4 x half>* %src, i32 0
  %v0 = load <4 x half>, <4 x half>* %p0, align 4
  %p1 = getelementptr <4 x half>, <4 x half>* %src, i32 1
  %v1 = load <4 x half>, <4 x half>* %p1, align 4
  %zip = shufflevector <4 x half> %v0, <4 x half> %v1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  store <8 x half> %zip, <8 x half>* %dst
  ret void
}

; 2 x <8 x half> -> <16 x half>. The assertions expect one vst20.16/vst21.16
; pair.
define void @vst2_v8f16(<8 x half> *%src, <16 x half> *%dst) {
; CHECK-LABEL: vst2_v8f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vst20.16 {q0, q1}, [r1]
; CHECK-NEXT:    vst21.16 {q0, q1}, [r1]
; CHECK-NEXT:    bx lr
entry:
  %p0 = getelementptr <8 x half>, <8 x half>* %src, i32 0
  %v0 = load <8 x half>, <8 x half>* %p0, align 4
  %p1 = getelementptr <8 x half>, <8 x half>* %src, i32 1
  %v1 = load <8 x half>, <8 x half>* %p1, align 4
  %zip = shufflevector <8 x half> %v0, <8 x half> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  store <16 x half> %zip, <16 x half>* %dst
  ret void
}

; 2 x <16 x half> -> <32 x half>. The assertions expect two
; vst20.16/vst21.16 pairs, the first vst21 writing back to r1.
define void @vst2_v16f16(<16 x half> *%src, <32 x half> *%dst) {
; CHECK-LABEL: vst2_v16f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q2, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vst20.16 {q2, q3}, [r1]
; CHECK-NEXT:    vst21.16 {q2, q3}, [r1]!
; CHECK-NEXT:    vst20.16 {q0, q1}, [r1]
; CHECK-NEXT:    vst21.16 {q0, q1}, [r1]
; CHECK-NEXT:    bx lr
entry:
  %p0 = getelementptr <16 x half>, <16 x half>* %src, i32 0
  %v0 = load <16 x half>, <16 x half>* %p0, align 4
  %p1 = getelementptr <16 x half>, <16 x half>* %src, i32 1
  %v1 = load <16 x half>, <16 x half>* %p1, align 4
  %zip = shufflevector <16 x half> %v0, <16 x half> %v1, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  store <32 x half> %zip, <32 x half>* %dst
  ret void
}

; f64 elements

; 2 x <2 x double> -> <4 x double>. No vst20/vst21 in the assertions: three
; d-register moves followed by two plain vector stores.
define void @vst2_v2f64(<2 x double> *%src, <4 x double> *%dst) {
; CHECK-LABEL: vst2_v2f64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmov.f64 d4, d3
; CHECK-NEXT:    vmov.f64 d5, d1
; CHECK-NEXT:    vmov.f64 d3, d0
; CHECK-NEXT:    vstrw.32 q2, [r1, #16]
; CHECK-NEXT:    vstrw.32 q1, [r1]
; CHECK-NEXT:    bx lr
entry:
  %p0 = getelementptr <2 x double>, <2 x double>* %src, i32 0
  %v0 = load <2 x double>, <2 x double>* %p0, align 4
  %p1 = getelementptr <2 x double>, <2 x double>* %src, i32 1
  %v1 = load <2 x double>, <2 x double>* %p1, align 4
  %zip = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  store <4 x double> %zip, <4 x double>* %dst
  ret void
}

; 2 x <4 x double> -> <8 x double>. No vst20/vst21 in the assertions:
; d-register moves followed by four plain vector stores, with d8-d9 saved and
; restored.
define void @vst2_v4f64(<4 x double> *%src, <8 x double> *%dst) {
; CHECK-LABEL: vst2_v4f64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vldrw.u32 q2, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
; CHECK-NEXT:    vmov.f64 d8, d4
; CHECK-NEXT:    vmov.f64 d9, d0
; CHECK-NEXT:    vmov.f64 d0, d5
; CHECK-NEXT:    vstrw.32 q4, [r1]
; CHECK-NEXT:    vmov.f64 d4, d6
; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
; CHECK-NEXT:    vmov.f64 d5, d2
; CHECK-NEXT:    vmov.f64 d2, d7
; CHECK-NEXT:    vstrw.32 q2, [r1, #32]
; CHECK-NEXT:    vstrw.32 q1, [r1, #48]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %p0 = getelementptr <4 x double>, <4 x double>* %src, i32 0
  %v0 = load <4 x double>, <4 x double>* %p0, align 4
  %p1 = getelementptr <4 x double>, <4 x double>* %src, i32 1
  %v1 = load <4 x double>, <4 x double>* %p1, align 4
  %zip = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  store <8 x double> %zip, <8 x double>* %dst
  ret void
}