; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s

; i32

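; Each test below loads an interleaved vector from %src, splits it into three
; stride-3 slices with shufflevector (lane offsets 0, 1 and 2), sums the three
; slices and stores the result to %dst. This is the access pattern of a
; vld3-style deinterleaving load; MVE provides VLD2x and VLD4x structured
; loads but, as far as I know, no VLD3, so the sequences checked below are
; built from ordinary contiguous loads plus register-to-register moves.
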
define void @vld3_v2i32(<6 x i32> *%src, <2 x i32> *%dst) {
; CHECK-LABEL: vld3_v2i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: ldrd r12, r3, [r0, #16]
; CHECK-NEXT: vmov r0, s1
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: vmov.f64 d2, d0
; CHECK-NEXT: vmov.f32 s6, s3
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: vmov r2, s6
; CHECK-NEXT: add r2, r12
; CHECK-NEXT: add r2, r3
; CHECK-NEXT: strd r0, r2, [r1]
; CHECK-NEXT: bx lr
entry:
  %l1 = load <6 x i32>, <6 x i32>* %src, align 4
  %s1 = shufflevector <6 x i32> %l1, <6 x i32> undef, <2 x i32> <i32 0, i32 3>
  %s2 = shufflevector <6 x i32> %l1, <6 x i32> undef, <2 x i32> <i32 1, i32 4>
  %s3 = shufflevector <6 x i32> %l1, <6 x i32> undef, <2 x i32> <i32 2, i32 5>
  %a1 = add <2 x i32> %s1, %s2
  %a = add <2 x i32> %a1, %s3
  store <2 x i32> %a, <2 x i32> *%dst
  ret void
}

define void @vld3_v4i32(<12 x i32> *%src, <4 x i32> *%dst) {
; CHECK-LABEL: vld3_v4i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEXT: vpush {d8, d9, d10, d11}
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
; CHECK-NEXT: vmov.f32 s12, s5
; CHECK-NEXT: vmov.f32 s13, s0
; CHECK-NEXT: vmov r0, s10
; CHECK-NEXT: vdup.32 q4, r0
; CHECK-NEXT: vmov.f32 s14, s3
; CHECK-NEXT: vmov.f32 s15, s19
; CHECK-NEXT: vmov.f64 d8, d2
; CHECK-NEXT: vmov.f32 s17, s7
; CHECK-NEXT: vmov r0, s9
; CHECK-NEXT: vmov.f32 s18, s2
; CHECK-NEXT: vdup.32 q5, r0
; CHECK-NEXT: vmov.f32 s0, s6
; CHECK-NEXT: vmov.f32 s19, s23
; CHECK-NEXT: vmov.f32 s10, s8
; CHECK-NEXT: vadd.i32 q3, q4, q3
; CHECK-NEXT: vmov.f32 s2, s8
; CHECK-NEXT: vmov.f32 s3, s11
; CHECK-NEXT: vadd.i32 q0, q3, q0
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: bx lr
entry:
  %l1 = load <12 x i32>, <12 x i32>* %src, align 4
  %s1 = shufflevector <12 x i32> %l1, <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  %s2 = shufflevector <12 x i32> %l1, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
  %s3 = shufflevector <12 x i32> %l1, <12 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
  %a1 = add <4 x i32> %s1, %s2
  %a = add <4 x i32> %a1, %s3
  store <4 x i32> %a, <4 x i32> *%dst
  ret void
}

define void @vld3_v8i32(<24 x i32> *%src, <8 x i32> *%dst) {
; CHECK-LABEL: vld3_v8i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
; CHECK-NEXT: vldrw.u32 q2, [r0, #80]
; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
; CHECK-NEXT: vmov.f32 s12, s5
; CHECK-NEXT: vmov.f32 s13, s0
; CHECK-NEXT: vmov r2, s10
; CHECK-NEXT: vdup.32 q4, r2
; CHECK-NEXT: vmov.f32 s14, s3
; CHECK-NEXT: vmov.f32 s15, s19
; CHECK-NEXT: vmov.f64 d8, d2
; CHECK-NEXT: vmov.f32 s17, s7
; CHECK-NEXT: vmov r2, s9
; CHECK-NEXT: vmov.f32 s18, s2
; CHECK-NEXT: vdup.32 q5, r2
; CHECK-NEXT: vmov.f32 s0, s6
; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
; CHECK-NEXT: vmov.f32 s19, s23
; CHECK-NEXT: vmov.f32 s10, s8
; CHECK-NEXT: vadd.i32 q3, q4, q3
; CHECK-NEXT: vmov.f32 s2, s8
; CHECK-NEXT: vmov.f32 s3, s11
; CHECK-NEXT: vldrw.u32 q2, [r0]
; CHECK-NEXT: vadd.i32 q0, q3, q0
; CHECK-NEXT: vldrw.u32 q3, [r0, #32]
; CHECK-NEXT: vmov.f32 s16, s9
; CHECK-NEXT: vstrw.32 q0, [r1, #16]
; CHECK-NEXT: vmov.f32 s17, s4
; CHECK-NEXT: vmov r0, s14
; CHECK-NEXT: vdup.32 q5, r0
; CHECK-NEXT: vmov.f32 s18, s7
; CHECK-NEXT: vmov.f32 s19, s23
; CHECK-NEXT: vmov.f64 d10, d4
; CHECK-NEXT: vmov.f32 s21, s11
; CHECK-NEXT: vmov r0, s13
; CHECK-NEXT: vmov.f32 s22, s6
; CHECK-NEXT: vdup.32 q6, r0
; CHECK-NEXT: vmov.f32 s4, s10
; CHECK-NEXT: vmov.f32 s23, s27
; CHECK-NEXT: vmov.f32 s14, s12
; CHECK-NEXT: vadd.i32 q4, q5, q4
; CHECK-NEXT: vmov.f32 s6, s12
; CHECK-NEXT: vmov.f32 s7, s15
; CHECK-NEXT: vadd.i32 q1, q4, q1
; CHECK-NEXT: vstrw.32 q1, [r1]
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: bx lr
entry:
  %l1 = load <24 x i32>, <24 x i32>* %src, align 4
  %s1 = shufflevector <24 x i32> %l1, <24 x i32> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
  %s2 = shufflevector <24 x i32> %l1, <24 x i32> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
  %s3 = shufflevector <24 x i32> %l1, <24 x i32> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
  %a1 = add <8 x i32> %s1, %s2
  %a = add <8 x i32> %a1, %s3
  store <8 x i32> %a, <8 x i32> *%dst
  ret void
}

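; The <48 x i32> case below needs more live q registers than are available,
; so one intermediate result is expected to be spilled to the stack and
; reloaded (see the 16-byte Spill/Reload annotations in the checks).
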
define void @vld3_v16i32(<48 x i32> *%src, <16 x i32> *%dst) {
; CHECK-LABEL: vld3_v16i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: .pad #16
; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
; CHECK-NEXT: vldrw.u32 q2, [r0, #80]
; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
; CHECK-NEXT: vmov.f32 s12, s5
; CHECK-NEXT: vmov.f32 s13, s0
; CHECK-NEXT: vmov r2, s10
; CHECK-NEXT: vdup.32 q4, r2
; CHECK-NEXT: vmov.f32 s14, s3
; CHECK-NEXT: vmov.f32 s15, s19
; CHECK-NEXT: vmov.f64 d8, d2
; CHECK-NEXT: vmov.f32 s17, s7
; CHECK-NEXT: vmov r2, s9
; CHECK-NEXT: vmov.f32 s18, s2
; CHECK-NEXT: vdup.32 q5, r2
; CHECK-NEXT: vmov.f32 s0, s6
; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
; CHECK-NEXT: vmov.f32 s19, s23
; CHECK-NEXT: vmov.f32 s10, s8
; CHECK-NEXT: vadd.i32 q3, q4, q3
; CHECK-NEXT: vmov.f32 s2, s8
; CHECK-NEXT: vmov.f32 s3, s11
; CHECK-NEXT: vldrw.u32 q2, [r0]
; CHECK-NEXT: vadd.i32 q0, q3, q0
; CHECK-NEXT: vldrw.u32 q3, [r0, #32]
; CHECK-NEXT: vmov.f32 s16, s9
; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
; CHECK-NEXT: vmov.f32 s17, s4
; CHECK-NEXT: vldrw.u32 q0, [r0, #128]
; CHECK-NEXT: vmov r2, s14
; CHECK-NEXT: vdup.32 q5, r2
; CHECK-NEXT: vmov.f32 s18, s7
; CHECK-NEXT: vmov.f32 s19, s23
; CHECK-NEXT: vmov.f64 d10, d4
; CHECK-NEXT: vmov.f32 s21, s11
; CHECK-NEXT: vmov r2, s13
; CHECK-NEXT: vmov.f32 s22, s6
; CHECK-NEXT: vdup.32 q6, r2
; CHECK-NEXT: vmov.f32 s4, s10
; CHECK-NEXT: vldrw.u32 q2, [r0, #160]
; CHECK-NEXT: vmov.f32 s23, s27
; CHECK-NEXT: vmov.f32 s14, s12
; CHECK-NEXT: vadd.i32 q4, q5, q4
; CHECK-NEXT: vmov.f32 s6, s12
; CHECK-NEXT: vmov.f32 s7, s15
; CHECK-NEXT: vldrw.u32 q3, [r0, #144]
; CHECK-NEXT: vadd.i32 q1, q4, q1
; CHECK-NEXT: vldrw.u32 q4, [r0, #176]
; CHECK-NEXT: vmov.f32 s20, s13
; CHECK-NEXT: vmov.f32 s21, s8
; CHECK-NEXT: vmov r2, s18
; CHECK-NEXT: vdup.32 q6, r2
; CHECK-NEXT: vmov.f32 s22, s11
; CHECK-NEXT: vmov.f32 s23, s27
; CHECK-NEXT: vmov.f64 d12, d6
; CHECK-NEXT: vmov.f32 s25, s15
; CHECK-NEXT: vmov r2, s17
; CHECK-NEXT: vmov.f32 s26, s10
; CHECK-NEXT: vdup.32 q7, r2
; CHECK-NEXT: vmov.f32 s8, s14
; CHECK-NEXT: vmov.f32 s27, s31
; CHECK-NEXT: vmov.f32 s18, s16
; CHECK-NEXT: vadd.i32 q5, q6, q5
; CHECK-NEXT: vmov.f32 s10, s16
; CHECK-NEXT: vmov.f32 s11, s19
; CHECK-NEXT: vldrw.u32 q4, [r0, #96]
; CHECK-NEXT: vadd.i32 q2, q5, q2
; CHECK-NEXT: vldrw.u32 q5, [r0, #112]
; CHECK-NEXT: vmov.f32 s24, s17
; CHECK-NEXT: vstrw.32 q2, [r1, #48]
; CHECK-NEXT: vmov.f32 s25, s20
; CHECK-NEXT: vstrw.32 q1, [r1]
; CHECK-NEXT: vmov.f64 d6, d8
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: vmov.f32 s13, s19
; CHECK-NEXT: vdup.32 q7, r0
; CHECK-NEXT: vmov.f32 s26, s23
; CHECK-NEXT: vmov r0, s1
; CHECK-NEXT: vmov.f32 s27, s31
; CHECK-NEXT: vdup.32 q7, r0
; CHECK-NEXT: vmov.f32 s14, s22
; CHECK-NEXT: vmov.f32 s20, s18
; CHECK-NEXT: vmov.f32 s15, s31
; CHECK-NEXT: vmov.f32 s2, s0
; CHECK-NEXT: vadd.i32 q6, q3, q6
; CHECK-NEXT: vmov.f32 s22, s0
; CHECK-NEXT: vmov.f32 s23, s3
; CHECK-NEXT: vadd.i32 q0, q6, q5
; CHECK-NEXT: vstrw.32 q0, [r1, #32]
; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
; CHECK-NEXT: vstrw.32 q0, [r1, #16]
; CHECK-NEXT: add sp, #16
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: bx lr
entry:
  %l1 = load <48 x i32>, <48 x i32>* %src, align 4
  %s1 = shufflevector <48 x i32> %l1, <48 x i32> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
  %s2 = shufflevector <48 x i32> %l1, <48 x i32> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
  %s3 = shufflevector <48 x i32> %l1, <48 x i32> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
  %a1 = add <16 x i32> %s1, %s2
  %a = add <16 x i32> %a1, %s3
  store <16 x i32> %a, <16 x i32> *%dst
  ret void
}

; i16

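; For elements narrower than 32 bits the small cases use widening loads
; (vldrh.u32, vldrb.u16) and truncating stores (vstrh.32, vstrb.32), while
; the full 128-bit cases are assembled lane by lane with vmov.16 / vmov.8.
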
define void @vld3_v2i16(<6 x i16> *%src, <2 x i16> *%dst) {
; CHECK-LABEL: vld3_v2i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: vldrh.u32 q0, [r0]
; CHECK-NEXT: ldr r2, [r0, #8]
; CHECK-NEXT: mov r3, sp
; CHECK-NEXT: str r2, [sp]
; CHECK-NEXT: vmov.f64 d2, d0
; CHECK-NEXT: vmov.f32 s6, s3
; CHECK-NEXT: vmov.f32 s8, s1
; CHECK-NEXT: vmov.f64 d6, d1
; CHECK-NEXT: vmov r0, s6
; CHECK-NEXT: vldrh.u32 q1, [r3]
; CHECK-NEXT: vmov.f32 s10, s4
; CHECK-NEXT: vmov.f32 s14, s5
; CHECK-NEXT: vmov r2, s10
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: vmov r2, s14
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: strh r0, [r1, #2]
; CHECK-NEXT: vmov r0, s8
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: vmov r2, s12
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: strh r0, [r1]
; CHECK-NEXT: add sp, #8
; CHECK-NEXT: bx lr
entry:
  %l1 = load <6 x i16>, <6 x i16>* %src, align 4
  %s1 = shufflevector <6 x i16> %l1, <6 x i16> undef, <2 x i32> <i32 0, i32 3>
  %s2 = shufflevector <6 x i16> %l1, <6 x i16> undef, <2 x i32> <i32 1, i32 4>
  %s3 = shufflevector <6 x i16> %l1, <6 x i16> undef, <2 x i32> <i32 2, i32 5>
  %a1 = add <2 x i16> %s1, %s2
  %a = add <2 x i16> %a1, %s3
  store <2 x i16> %a, <2 x i16> *%dst
  ret void
}

define void @vld3_v4i16(<12 x i16> *%src, <4 x i16> *%dst) {
; CHECK-LABEL: vld3_v4i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vldrh.u32 q3, [r0, #16]
; CHECK-NEXT: vmov.u16 r2, q0[0]
; CHECK-NEXT: vmov r0, s14
; CHECK-NEXT: vmov.32 q1[0], r2
; CHECK-NEXT: vmov.u16 r2, q0[3]
; CHECK-NEXT: vmov.32 q1[1], r2
; CHECK-NEXT: vmov.u16 r2, q0[6]
; CHECK-NEXT: vmov.32 q1[2], r2
; CHECK-NEXT: vmov.u16 r2, q0[1]
; CHECK-NEXT: vmov.32 q2[0], r2
; CHECK-NEXT: vmov.u16 r2, q0[4]
; CHECK-NEXT: vmov.32 q2[1], r2
; CHECK-NEXT: vmov.u16 r2, q0[7]
; CHECK-NEXT: vmov.32 q2[2], r2
; CHECK-NEXT: vmov.32 q2[3], r0
; CHECK-NEXT: vmov r0, s13
; CHECK-NEXT: vmov.32 q1[3], r0
; CHECK-NEXT: vmov.u16 r0, q0[2]
; CHECK-NEXT: vadd.i32 q1, q1, q2
; CHECK-NEXT: vmov.32 q2[0], r0
; CHECK-NEXT: vmov.u16 r0, q0[5]
; CHECK-NEXT: vmov.32 q2[1], r0
; CHECK-NEXT: vmov r0, s12
; CHECK-NEXT: vmov.32 q2[2], r0
; CHECK-NEXT: vmov r0, s15
; CHECK-NEXT: vmov.32 q2[3], r0
; CHECK-NEXT: vadd.i32 q0, q1, q2
; CHECK-NEXT: vstrh.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %l1 = load <12 x i16>, <12 x i16>* %src, align 4
  %s1 = shufflevector <12 x i16> %l1, <12 x i16> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  %s2 = shufflevector <12 x i16> %l1, <12 x i16> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
  %s3 = shufflevector <12 x i16> %l1, <12 x i16> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
  %a1 = add <4 x i16> %s1, %s2
  %a = add <4 x i16> %a1, %s3
  store <4 x i16> %a, <4 x i16> *%dst
  ret void
}

define void @vld3_v8i16(<24 x i16> *%src, <8 x i16> *%dst) {
; CHECK-LABEL: vld3_v8i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vldrw.u32 q3, [r0, #32]
; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
; CHECK-NEXT: vmov.u16 r2, q1[2]
; CHECK-NEXT: vmov.u16 r0, q3[4]
; CHECK-NEXT: vmov.16 q4[0], r2
; CHECK-NEXT: vmov.u16 r2, q1[5]
; CHECK-NEXT: vmov.16 q4[1], r2
; CHECK-NEXT: vmov.u16 r2, q2[0]
; CHECK-NEXT: vmov.16 q5[6], r0
; CHECK-NEXT: vmov.u16 r0, q3[7]
; CHECK-NEXT: vmov.16 q5[7], r0
; CHECK-NEXT: vmov.16 q4[2], r2
; CHECK-NEXT: vmov.u16 r2, q2[3]
; CHECK-NEXT: vmov.f32 s22, s12
; CHECK-NEXT: vmov.16 q4[3], r2
; CHECK-NEXT: vmov q6, q5
; CHECK-NEXT: vmov.f32 s18, s11
; CHECK-NEXT: vmov r2, s16
; CHECK-NEXT: vmovnb.i32 q6, q4
; CHECK-NEXT: vmov.32 q0[0], r2
; CHECK-NEXT: vmov r2, s17
; CHECK-NEXT: vmov.32 q0[1], r2
; CHECK-NEXT: vmov r0, s26
; CHECK-NEXT: vmov.32 q0[2], r0
; CHECK-NEXT: vmov r0, s23
; CHECK-NEXT: vmov.32 q0[3], r0
; CHECK-NEXT: vmov.u16 r0, q1[0]
; CHECK-NEXT: vmov.16 q4[0], r0
; CHECK-NEXT: vmov.u16 r0, q1[3]
; CHECK-NEXT: vmov.16 q4[1], r0
; CHECK-NEXT: vmov.u16 r0, q1[6]
; CHECK-NEXT: vmov.16 q4[2], r0
; CHECK-NEXT: vmov.u16 r0, q2[1]
; CHECK-NEXT: vmov.16 q4[3], r0
; CHECK-NEXT: vmov.u16 r0, q2[4]
; CHECK-NEXT: vmov.16 q4[4], r0
; CHECK-NEXT: vmov.u16 r0, q3[2]
; CHECK-NEXT: vmov.16 q5[6], r0
; CHECK-NEXT: vmov.u16 r0, q3[5]
; CHECK-NEXT: vmov.16 q5[7], r0
; CHECK-NEXT: vmov.u16 r0, q2[7]
; CHECK-NEXT: vmov.16 q4[5], r0
; CHECK-NEXT: vmov.u16 r0, q1[1]
; CHECK-NEXT: vmov.f32 s19, s23
; CHECK-NEXT: vmov.16 q5[0], r0
; CHECK-NEXT: vmov.u16 r0, q1[4]
; CHECK-NEXT: vmov.16 q5[1], r0
; CHECK-NEXT: vmov.u16 r0, q1[7]
; CHECK-NEXT: vmov.16 q5[2], r0
; CHECK-NEXT: vmov.u16 r0, q2[2]
; CHECK-NEXT: vmov.16 q5[3], r0
; CHECK-NEXT: vmov.u16 r0, q2[5]
; CHECK-NEXT: vmov.16 q5[4], r0
; CHECK-NEXT: vmov.u16 r0, q3[0]
; CHECK-NEXT: vmov.16 q1[5], r0
; CHECK-NEXT: vmov.u16 r0, q3[3]
; CHECK-NEXT: vmov.16 q1[6], r0
; CHECK-NEXT: vmov.u16 r0, q3[6]
; CHECK-NEXT: vmov.16 q1[7], r0
; CHECK-NEXT: vmov r0, s20
; CHECK-NEXT: vmov q2, q1
; CHECK-NEXT: vmov.32 q3[0], r0
; CHECK-NEXT: vmov r0, s21
; CHECK-NEXT: vmovnb.i32 q2, q5
; CHECK-NEXT: vmov.32 q3[1], r0
; CHECK-NEXT: vmov r0, s10
; CHECK-NEXT: vmov.32 q3[2], r0
; CHECK-NEXT: vmov r0, s7
; CHECK-NEXT: vmov.32 q3[3], r0
; CHECK-NEXT: vadd.i16 q1, q4, q3
; CHECK-NEXT: vadd.i16 q0, q1, q0
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: bx lr
entry:
  %l1 = load <24 x i16>, <24 x i16>* %src, align 4
  %s1 = shufflevector <24 x i16> %l1, <24 x i16> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
  %s2 = shufflevector <24 x i16> %l1, <24 x i16> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
  %s3 = shufflevector <24 x i16> %l1, <24 x i16> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
  %a1 = add <8 x i16> %s1, %s2
  %a = add <8 x i16> %a1, %s3
  store <8 x i16> %a, <8 x i16> *%dst
  ret void
}

define void @vld3_v16i16(<48 x i16> *%src, <16 x i16> *%dst) {
; CHECK-LABEL: vld3_v16i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
; CHECK-NEXT: vldrw.u32 q2, [r0, #64]
; CHECK-NEXT: vldrw.u32 q3, [r0, #80]
; CHECK-NEXT: vmov.u16 r2, q1[2]
; CHECK-NEXT: vmov.16 q4[0], r2
; CHECK-NEXT: vmov.u16 r2, q1[5]
; CHECK-NEXT: vmov.16 q4[1], r2
; CHECK-NEXT: vmov.u16 r2, q2[0]
; CHECK-NEXT: vmov.16 q4[2], r2
; CHECK-NEXT: vmov.u16 r2, q2[3]
; CHECK-NEXT: vmov.16 q4[3], r2
; CHECK-NEXT: vmov.f32 s18, s11
; CHECK-NEXT: vmov r2, s16
; CHECK-NEXT: vmov.32 q0[0], r2
; CHECK-NEXT: vmov r2, s17
; CHECK-NEXT: vmov.32 q0[1], r2
; CHECK-NEXT: vmov.u16 r2, q3[4]
; CHECK-NEXT: vmov.16 q5[6], r2
; CHECK-NEXT: vmov.u16 r2, q3[7]
; CHECK-NEXT: vmov.16 q5[7], r2
; CHECK-NEXT: vmov.f32 s22, s12
; CHECK-NEXT: vmov q6, q5
; CHECK-NEXT: vmovnb.i32 q6, q4
; CHECK-NEXT: vmov r2, s26
; CHECK-NEXT: vmov.32 q0[2], r2
; CHECK-NEXT: vmov r2, s23
; CHECK-NEXT: vmov.32 q0[3], r2
; CHECK-NEXT: vmov.u16 r2, q1[0]
; CHECK-NEXT: vmov.16 q4[0], r2
; CHECK-NEXT: vmov.u16 r2, q1[3]
; CHECK-NEXT: vmov.16 q4[1], r2
; CHECK-NEXT: vmov.u16 r2, q1[6]
; CHECK-NEXT: vmov.16 q4[2], r2
; CHECK-NEXT: vmov.u16 r2, q2[1]
; CHECK-NEXT: vmov.16 q4[3], r2
; CHECK-NEXT: vmov.u16 r2, q2[4]
; CHECK-NEXT: vmov.16 q4[4], r2
; CHECK-NEXT: vmov.u16 r2, q3[2]
; CHECK-NEXT: vmov.16 q5[6], r2
; CHECK-NEXT: vmov.u16 r2, q3[5]
; CHECK-NEXT: vmov.16 q5[7], r2
; CHECK-NEXT: vmov.u16 r2, q2[7]
; CHECK-NEXT: vmov.16 q4[5], r2
; CHECK-NEXT: vmov.u16 r2, q1[1]
; CHECK-NEXT: vmov.f32 s19, s23
; CHECK-NEXT: vmov.16 q5[0], r2
; CHECK-NEXT: vmov.u16 r2, q1[4]
; CHECK-NEXT: vmov.16 q5[1], r2
; CHECK-NEXT: vmov.u16 r2, q1[7]
; CHECK-NEXT: vmov.16 q5[2], r2
; CHECK-NEXT: vmov.u16 r2, q2[2]
; CHECK-NEXT: vmov.16 q5[3], r2
; CHECK-NEXT: vmov.u16 r2, q2[5]
; CHECK-NEXT: vmov.16 q5[4], r2
; CHECK-NEXT: vmov.u16 r2, q3[0]
; CHECK-NEXT: vmov.16 q1[5], r2
; CHECK-NEXT: vmov.u16 r2, q3[3]
; CHECK-NEXT: vmov.16 q1[6], r2
; CHECK-NEXT: vmov.u16 r2, q3[6]
; CHECK-NEXT: vmov.16 q1[7], r2
; CHECK-NEXT: vmov r2, s20
; CHECK-NEXT: vmov q2, q1
; CHECK-NEXT: vmov.32 q3[0], r2
; CHECK-NEXT: vmov r2, s21
; CHECK-NEXT: vmovnb.i32 q2, q5
; CHECK-NEXT: vmov.32 q3[1], r2
; CHECK-NEXT: vmov r2, s10
; CHECK-NEXT: vmov.32 q3[2], r2
; CHECK-NEXT: vmov r2, s7
; CHECK-NEXT: vmov.32 q3[3], r2
; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
; CHECK-NEXT: vadd.i16 q1, q4, q3
; CHECK-NEXT: vldrw.u32 q3, [r0, #32]
; CHECK-NEXT: vadd.i16 q0, q1, q0
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vmov.u16 r0, q3[4]
; CHECK-NEXT: vstrw.32 q0, [r1, #16]
; CHECK-NEXT: vmov.u16 r2, q1[2]
; CHECK-NEXT: vmov.16 q6[6], r0
; CHECK-NEXT: vmov.16 q5[0], r2
; CHECK-NEXT: vmov.u16 r2, q1[5]
; CHECK-NEXT: vmov.16 q5[1], r2
; CHECK-NEXT: vmov.u16 r2, q2[0]
; CHECK-NEXT: vmov.u16 r0, q3[7]
; CHECK-NEXT: vmov.16 q5[2], r2
; CHECK-NEXT: vmov.16 q6[7], r0
; CHECK-NEXT: vmov.u16 r2, q2[3]
; CHECK-NEXT: vmov.16 q5[3], r2
; CHECK-NEXT: vmov.f32 s26, s12
; CHECK-NEXT: vmov.f32 s22, s11
; CHECK-NEXT: vmov q7, q6
; CHECK-NEXT: vmov r0, s20
; CHECK-NEXT: vmovnb.i32 q7, q5
; CHECK-NEXT: vmov.32 q4[0], r0
; CHECK-NEXT: vmov r0, s21
; CHECK-NEXT: vmov.32 q4[1], r0
; CHECK-NEXT: vmov r0, s30
; CHECK-NEXT: vmov.32 q4[2], r0
; CHECK-NEXT: vmov r0, s27
; CHECK-NEXT: vmov.32 q4[3], r0
; CHECK-NEXT: vmov.u16 r0, q1[0]
; CHECK-NEXT: vmov.16 q5[0], r0
; CHECK-NEXT: vmov.u16 r0, q1[3]
; CHECK-NEXT: vmov.16 q5[1], r0
; CHECK-NEXT: vmov.u16 r0, q1[6]
; CHECK-NEXT: vmov.16 q5[2], r0
; CHECK-NEXT: vmov.u16 r0, q2[1]
; CHECK-NEXT: vmov.16 q5[3], r0
; CHECK-NEXT: vmov.u16 r0, q2[4]
; CHECK-NEXT: vmov.16 q5[4], r0
; CHECK-NEXT: vmov.u16 r0, q3[2]
; CHECK-NEXT: vmov.16 q6[6], r0
; CHECK-NEXT: vmov.u16 r0, q3[5]
; CHECK-NEXT: vmov.16 q6[7], r0
; CHECK-NEXT: vmov.u16 r0, q2[7]
; CHECK-NEXT: vmov.16 q5[5], r0
; CHECK-NEXT: vmov.u16 r0, q1[1]
; CHECK-NEXT: vmov.f32 s23, s27
; CHECK-NEXT: vmov.16 q6[0], r0
; CHECK-NEXT: vmov.u16 r0, q1[4]
; CHECK-NEXT: vmov.16 q6[1], r0
; CHECK-NEXT: vmov.u16 r0, q1[7]
; CHECK-NEXT: vmov.16 q6[2], r0
; CHECK-NEXT: vmov.u16 r0, q2[2]
; CHECK-NEXT: vmov.16 q6[3], r0
; CHECK-NEXT: vmov.u16 r0, q2[5]
; CHECK-NEXT: vmov.16 q6[4], r0
; CHECK-NEXT: vmov r0, s24
; CHECK-NEXT: vmov.32 q1[0], r0
; CHECK-NEXT: vmov r0, s25
; CHECK-NEXT: vmov.32 q1[1], r0
; CHECK-NEXT: vmov.u16 r0, q3[0]
; CHECK-NEXT: vmov.16 q2[5], r0
; CHECK-NEXT: vmov.u16 r0, q3[3]
; CHECK-NEXT: vmov.16 q2[6], r0
; CHECK-NEXT: vmov.u16 r0, q3[6]
; CHECK-NEXT: vmov.16 q2[7], r0
; CHECK-NEXT: vmov q3, q2
; CHECK-NEXT: vmovnb.i32 q3, q6
; CHECK-NEXT: vmov r0, s14
; CHECK-NEXT: vmov.32 q1[2], r0
; CHECK-NEXT: vmov r0, s11
; CHECK-NEXT: vmov.32 q1[3], r0
; CHECK-NEXT: vadd.i16 q1, q5, q1
; CHECK-NEXT: vadd.i16 q1, q1, q4
; CHECK-NEXT: vstrw.32 q1, [r1]
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: bx lr
entry:
  %l1 = load <48 x i16>, <48 x i16>* %src, align 4
  %s1 = shufflevector <48 x i16> %l1, <48 x i16> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
  %s2 = shufflevector <48 x i16> %l1, <48 x i16> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
  %s3 = shufflevector <48 x i16> %l1, <48 x i16> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
  %a1 = add <16 x i16> %s1, %s2
  %a = add <16 x i16> %a1, %s3
  store <16 x i16> %a, <16 x i16> *%dst
  ret void
}

; i8

define void @vld3_v2i8(<6 x i8> *%src, <2 x i8> *%dst) {
; CHECK-LABEL: vld3_v2i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: ldrd r2, r0, [r0]
; CHECK-NEXT: strd r2, r0, [sp]
; CHECK-NEXT: mov r0, sp
; CHECK-NEXT: vldrb.u16 q0, [r0]
; CHECK-NEXT: vmov.u16 r0, q0[4]
; CHECK-NEXT: vmov.u16 r2, q0[3]
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: vmov.u16 r2, q0[5]
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: strb r0, [r1, #1]
; CHECK-NEXT: vmov.u16 r0, q0[1]
; CHECK-NEXT: vmov.u16 r2, q0[0]
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: vmov.u16 r2, q0[2]
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: strb r0, [r1]
; CHECK-NEXT: add sp, #8
; CHECK-NEXT: bx lr
entry:
  %l1 = load <6 x i8>, <6 x i8>* %src, align 4
  %s1 = shufflevector <6 x i8> %l1, <6 x i8> undef, <2 x i32> <i32 0, i32 3>
  %s2 = shufflevector <6 x i8> %l1, <6 x i8> undef, <2 x i32> <i32 1, i32 4>
  %s3 = shufflevector <6 x i8> %l1, <6 x i8> undef, <2 x i32> <i32 2, i32 5>
  %a1 = add <2 x i8> %s1, %s2
  %a = add <2 x i8> %a1, %s3
  store <2 x i8> %a, <2 x i8> *%dst
  ret void
}

define void @vld3_v4i8(<12 x i8> *%src, <4 x i8> *%dst) {
; CHECK-LABEL: vld3_v4i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: vldrb.u16 q2, [r0]
; CHECK-NEXT: ldr r3, [r0, #8]
; CHECK-NEXT: mov r2, sp
; CHECK-NEXT: str r3, [sp]
; CHECK-NEXT: vmov.u16 r0, q2[2]
; CHECK-NEXT: vmov.32 q0[0], r0
; CHECK-NEXT: vmov.u16 r0, q2[5]
; CHECK-NEXT: vmov.32 q0[1], r0
; CHECK-NEXT: vmov.u16 r0, q2[0]
; CHECK-NEXT: vmov.32 q1[0], r0
; CHECK-NEXT: vmov.u16 r0, q2[3]
; CHECK-NEXT: vmov.32 q1[1], r0
; CHECK-NEXT: vmov.u16 r0, q2[6]
; CHECK-NEXT: vmov.32 q1[2], r0
; CHECK-NEXT: vmov.u16 r0, q2[1]
; CHECK-NEXT: vmov.32 q3[0], r0
; CHECK-NEXT: vmov.u16 r0, q2[4]
; CHECK-NEXT: vmov.32 q3[1], r0
; CHECK-NEXT: vmov.u16 r0, q2[7]
; CHECK-NEXT: vldrb.u16 q2, [r2]
; CHECK-NEXT: vmov.32 q3[2], r0
; CHECK-NEXT: vmov.u16 r0, q2[2]
; CHECK-NEXT: vmov.32 q3[3], r0
; CHECK-NEXT: vmov.u16 r0, q2[1]
; CHECK-NEXT: vmov.32 q1[3], r0
; CHECK-NEXT: vmov.u16 r0, q2[0]
; CHECK-NEXT: vmov.32 q0[2], r0
; CHECK-NEXT: vmov.u16 r0, q2[3]
; CHECK-NEXT: vadd.i32 q1, q1, q3
; CHECK-NEXT: vmov.32 q0[3], r0
; CHECK-NEXT: vadd.i32 q0, q1, q0
; CHECK-NEXT: vstrb.32 q0, [r1]
; CHECK-NEXT: add sp, #8
; CHECK-NEXT: bx lr
entry:
  %l1 = load <12 x i8>, <12 x i8>* %src, align 4
  %s1 = shufflevector <12 x i8> %l1, <12 x i8> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  %s2 = shufflevector <12 x i8> %l1, <12 x i8> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
  %s3 = shufflevector <12 x i8> %l1, <12 x i8> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
  %a1 = add <4 x i8> %s1, %s2
  %a = add <4 x i8> %a1, %s3
  store <4 x i8> %a, <4 x i8> *%dst
  ret void
}

define void @vld3_v8i8(<24 x i8> *%src, <8 x i8> *%dst) {
; CHECK-LABEL: vld3_v8i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vldrb.u16 q1, [r0, #16]
; CHECK-NEXT: vmov.u8 r2, q0[0]
; CHECK-NEXT: vmov.u16 r0, q1[2]
; CHECK-NEXT: vmov.16 q2[0], r2
; CHECK-NEXT: vmov.u8 r2, q0[3]
; CHECK-NEXT: vmov.16 q2[1], r2
; CHECK-NEXT: vmov.u8 r2, q0[6]
; CHECK-NEXT: vmov.16 q2[2], r2
; CHECK-NEXT: vmov.u8 r2, q0[9]
; CHECK-NEXT: vmov.16 q2[3], r2
; CHECK-NEXT: vmov.u8 r2, q0[12]
; CHECK-NEXT: vmov.16 q2[4], r2
; CHECK-NEXT: vmov.u8 r2, q0[15]
; CHECK-NEXT: vmov.16 q2[5], r2
; CHECK-NEXT: vmov.16 q2[6], r0
; CHECK-NEXT: vmov.u8 r0, q0[1]
; CHECK-NEXT: vmov.16 q3[0], r0
; CHECK-NEXT: vmov.u8 r0, q0[4]
; CHECK-NEXT: vmov.16 q3[1], r0
; CHECK-NEXT: vmov.u8 r0, q0[7]
; CHECK-NEXT: vmov.16 q3[2], r0
; CHECK-NEXT: vmov.u8 r0, q0[10]
; CHECK-NEXT: vmov.16 q3[3], r0
; CHECK-NEXT: vmov.u8 r0, q0[13]
; CHECK-NEXT: vmov.16 q3[4], r0
; CHECK-NEXT: vmov.u16 r0, q1[0]
; CHECK-NEXT: vmov.16 q3[5], r0
; CHECK-NEXT: vmov.u16 r0, q1[3]
; CHECK-NEXT: vmov.16 q3[6], r0
; CHECK-NEXT: vmov.u16 r0, q1[6]
; CHECK-NEXT: vmov.16 q3[7], r0
; CHECK-NEXT: vmov.u16 r0, q1[5]
; CHECK-NEXT: vmov.16 q2[7], r0
; CHECK-NEXT: vmov.u8 r0, q0[2]
; CHECK-NEXT: vadd.i16 q2, q2, q3
; CHECK-NEXT: vmov.16 q3[0], r0
; CHECK-NEXT: vmov.u8 r0, q0[5]
; CHECK-NEXT: vmov.16 q3[1], r0
; CHECK-NEXT: vmov.u8 r0, q0[8]
; CHECK-NEXT: vmov.16 q3[2], r0
; CHECK-NEXT: vmov.u8 r0, q0[11]
; CHECK-NEXT: vmov.16 q3[3], r0
; CHECK-NEXT: vmov.u8 r0, q0[14]
; CHECK-NEXT: vmov.16 q3[4], r0
; CHECK-NEXT: vmov.u16 r0, q1[1]
; CHECK-NEXT: vmov.16 q3[5], r0
; CHECK-NEXT: vmov.u16 r0, q1[4]
; CHECK-NEXT: vmov.16 q3[6], r0
; CHECK-NEXT: vmov.u16 r0, q1[7]
; CHECK-NEXT: vmov.16 q3[7], r0
; CHECK-NEXT: vadd.i16 q0, q2, q3
; CHECK-NEXT: vstrb.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %l1 = load <24 x i8>, <24 x i8>* %src, align 4
  %s1 = shufflevector <24 x i8> %l1, <24 x i8> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
  %s2 = shufflevector <24 x i8> %l1, <24 x i8> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
  %s3 = shufflevector <24 x i8> %l1, <24 x i8> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
  %a1 = add <8 x i8> %s1, %s2
  %a = add <8 x i8> %a1, %s3
  store <8 x i8> %a, <8 x i8> *%dst
  ret void
}

define void @vld3_v16i8(<48 x i8> *%src, <16 x i8> *%dst) {
; CHECK-LABEL: vld3_v16i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEXT: vpush {d8, d9, d10, d11}
; CHECK-NEXT: vldrw.u32 q2, [r0]
; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
; CHECK-NEXT: vmov.u8 r2, q2[0]
; CHECK-NEXT: vmov.8 q1[0], r2
; CHECK-NEXT: vmov.u8 r2, q2[3]
; CHECK-NEXT: vmov.8 q1[1], r2
; CHECK-NEXT: vmov.u8 r2, q2[6]
; CHECK-NEXT: vmov.8 q1[2], r2
; CHECK-NEXT: vmov.u8 r2, q2[9]
; CHECK-NEXT: vmov.8 q1[3], r2
; CHECK-NEXT: vmov.u8 r2, q2[12]
; CHECK-NEXT: vmov.8 q1[4], r2
; CHECK-NEXT: vmov.u8 r2, q2[15]
; CHECK-NEXT: vmov.8 q1[5], r2
; CHECK-NEXT: vmov.u8 r2, q0[2]
; CHECK-NEXT: vmov.8 q1[6], r2
; CHECK-NEXT: vmov.u8 r2, q0[5]
; CHECK-NEXT: vmov.8 q1[7], r2
; CHECK-NEXT: vmov r2, s4
; CHECK-NEXT: vmov.32 q3[0], r2
; CHECK-NEXT: vmov r2, s5
; CHECK-NEXT: vmov.32 q3[1], r2
; CHECK-NEXT: vmov.u8 r2, q0[8]
; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
; CHECK-NEXT: vmov.8 q4[8], r2
; CHECK-NEXT: vmov.u8 r2, q0[11]
; CHECK-NEXT: vmov.8 q4[9], r2
; CHECK-NEXT: vmov.u8 r2, q0[14]
; CHECK-NEXT: vmov.8 q4[10], r2
; CHECK-NEXT: vmov.u8 r0, q1[1]
; CHECK-NEXT: vmov.8 q4[11], r0
; CHECK-NEXT: vmov r0, s18
; CHECK-NEXT: vmov.32 q3[2], r0
; CHECK-NEXT: vmov.u8 r0, q2[1]
; CHECK-NEXT: vmov.8 q5[0], r0
; CHECK-NEXT: vmov.u8 r0, q2[4]
; CHECK-NEXT: vmov.8 q5[1], r0
; CHECK-NEXT: vmov.u8 r0, q2[7]
; CHECK-NEXT: vmov.8 q5[2], r0
; CHECK-NEXT: vmov.u8 r0, q2[10]
; CHECK-NEXT: vmov.8 q5[3], r0
; CHECK-NEXT: vmov.u8 r0, q2[13]
; CHECK-NEXT: vmov.8 q5[4], r0
; CHECK-NEXT: vmov.u8 r0, q0[0]
; CHECK-NEXT: vmov.8 q5[5], r0
; CHECK-NEXT: vmov.u8 r0, q0[3]
; CHECK-NEXT: vmov.8 q5[6], r0
; CHECK-NEXT: vmov.u8 r0, q0[6]
; CHECK-NEXT: vmov.8 q5[7], r0
; CHECK-NEXT: vmov r0, s20
; CHECK-NEXT: vmov.32 q4[0], r0
; CHECK-NEXT: vmov r0, s21
; CHECK-NEXT: vmov.32 q4[1], r0
; CHECK-NEXT: vmov.u8 r0, q0[9]
; CHECK-NEXT: vmov.8 q5[8], r0
; CHECK-NEXT: vmov.u8 r0, q0[12]
; CHECK-NEXT: vmov.8 q5[9], r0
; CHECK-NEXT: vmov.u8 r0, q0[15]
; CHECK-NEXT: vmov.8 q5[10], r0
; CHECK-NEXT: vmov.u8 r0, q1[2]
; CHECK-NEXT: vmov.8 q5[11], r0
; CHECK-NEXT: vmov r0, s22
; CHECK-NEXT: vmov.32 q4[2], r0
; CHECK-NEXT: vmov.u8 r0, q1[5]
; CHECK-NEXT: vmov.8 q5[12], r0
; CHECK-NEXT: vmov.u8 r0, q1[8]
; CHECK-NEXT: vmov.8 q5[13], r0
; CHECK-NEXT: vmov.u8 r0, q1[11]
; CHECK-NEXT: vmov.8 q5[14], r0
; CHECK-NEXT: vmov.u8 r0, q1[14]
; CHECK-NEXT: vmov.8 q5[15], r0
; CHECK-NEXT: vmov r0, s23
; CHECK-NEXT: vmov.32 q4[3], r0
; CHECK-NEXT: vmov.u8 r0, q1[4]
; CHECK-NEXT: vmov.8 q5[12], r0
; CHECK-NEXT: vmov.u8 r0, q1[7]
; CHECK-NEXT: vmov.8 q5[13], r0
; CHECK-NEXT: vmov.u8 r0, q1[10]
; CHECK-NEXT: vmov.8 q5[14], r0
; CHECK-NEXT: vmov.u8 r0, q1[13]
; CHECK-NEXT: vmov.8 q5[15], r0
; CHECK-NEXT: vmov r0, s23
; CHECK-NEXT: vmov.32 q3[3], r0
; CHECK-NEXT: vmov.u8 r0, q2[2]
; CHECK-NEXT: vadd.i8 q3, q3, q4
; CHECK-NEXT: vmov.8 q4[0], r0
; CHECK-NEXT: vmov.u8 r0, q2[5]
; CHECK-NEXT: vmov.8 q4[1], r0
; CHECK-NEXT: vmov.u8 r0, q2[8]
; CHECK-NEXT: vmov.8 q4[2], r0
; CHECK-NEXT: vmov.u8 r0, q2[11]
; CHECK-NEXT: vmov.8 q4[3], r0
; CHECK-NEXT: vmov.u8 r0, q2[14]
; CHECK-NEXT: vmov.8 q4[4], r0
; CHECK-NEXT: vmov.u8 r0, q0[1]
; CHECK-NEXT: vmov.8 q4[5], r0
; CHECK-NEXT: vmov.u8 r0, q0[4]
; CHECK-NEXT: vmov.8 q4[6], r0
; CHECK-NEXT: vmov.u8 r0, q0[7]
; CHECK-NEXT: vmov.8 q4[7], r0
; CHECK-NEXT: vmov r0, s16
; CHECK-NEXT: vmov.32 q2[0], r0
; CHECK-NEXT: vmov r0, s17
; CHECK-NEXT: vmov.32 q2[1], r0
; CHECK-NEXT: vmov.u8 r0, q0[10]
; CHECK-NEXT: vmov.8 q4[8], r0
; CHECK-NEXT: vmov.u8 r0, q0[13]
; CHECK-NEXT: vmov.8 q4[9], r0
; CHECK-NEXT: vmov.u8 r0, q1[0]
; CHECK-NEXT: vmov.8 q4[10], r0
; CHECK-NEXT: vmov.u8 r0, q1[3]
; CHECK-NEXT: vmov.8 q4[11], r0
; CHECK-NEXT: vmov r0, s18
; CHECK-NEXT: vmov.32 q2[2], r0
; CHECK-NEXT: vmov.u8 r0, q1[6]
; CHECK-NEXT: vmov.8 q0[12], r0
; CHECK-NEXT: vmov.u8 r0, q1[9]
; CHECK-NEXT: vmov.8 q0[13], r0
; CHECK-NEXT: vmov.u8 r0, q1[12]
; CHECK-NEXT: vmov.8 q0[14], r0
; CHECK-NEXT: vmov.u8 r0, q1[15]
; CHECK-NEXT: vmov.8 q0[15], r0
; CHECK-NEXT: vmov r0, s3
; CHECK-NEXT: vmov.32 q2[3], r0
; CHECK-NEXT: vadd.i8 q0, q3, q2
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: bx lr
entry:
  %l1 = load <48 x i8>, <48 x i8>* %src, align 4
  %s1 = shufflevector <48 x i8> %l1, <48 x i8> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
  %s2 = shufflevector <48 x i8> %l1, <48 x i8> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
  %s3 = shufflevector <48 x i8> %l1, <48 x i8> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
  %a1 = add <16 x i8> %s1, %s2
  %a = add <16 x i8> %a1, %s3
  store <16 x i8> %a, <16 x i8> *%dst
  ret void
}

; i64

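; MVE integer VADD only supports 8-, 16- and 32-bit elements (to the best of
; my knowledge), so the i64 additions below are carried out in core registers
; with adds/adcs pairs and the results are reassembled with vmov.32.
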
define void @vld3_v2i64(<6 x i64> *%src, <2 x i64> *%dst) {
; CHECK-LABEL: vld3_v2i64:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
; CHECK-NEXT: vldrw.u32 q4, [r0, #32]
; CHECK-NEXT: vmov.f64 d6, d3
; CHECK-NEXT: vmov.f32 s13, s7
; CHECK-NEXT: vmov.f32 s14, s16
; CHECK-NEXT: vmov.f32 s6, s10
; CHECK-NEXT: vmov.f32 s7, s11
; CHECK-NEXT: vmov.f32 s15, s17
; CHECK-NEXT: vmov r3, s14
; CHECK-NEXT: vmov r0, s6
; CHECK-NEXT: vmov.f64 d0, d4
; CHECK-NEXT: vmov.f32 s1, s9
; CHECK-NEXT: vmov.f32 s2, s18
; CHECK-NEXT: vmov.f32 s3, s19
; CHECK-NEXT: vmov r12, s15
; CHECK-NEXT: vmov r2, s7
; CHECK-NEXT: vmov r4, s4
; CHECK-NEXT: adds.w lr, r0, r3
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: vmov r3, s3
; CHECK-NEXT: adc.w r2, r2, r12
; CHECK-NEXT: adds.w lr, lr, r0
; CHECK-NEXT: vmov r0, s12
; CHECK-NEXT: adc.w r12, r2, r3
; CHECK-NEXT: vmov r3, s13
; CHECK-NEXT: vmov r2, s5
; CHECK-NEXT: adds r0, r0, r4
; CHECK-NEXT: vmov r4, s0
; CHECK-NEXT: adcs r2, r3
; CHECK-NEXT: vmov r3, s1
; CHECK-NEXT: adds r0, r0, r4
; CHECK-NEXT: vmov.32 q0[0], r0
; CHECK-NEXT: adcs r2, r3
; CHECK-NEXT: vmov.32 q0[1], r2
; CHECK-NEXT: vmov.32 q0[2], lr
; CHECK-NEXT: vmov.32 q0[3], r12
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: pop {r4, pc}
entry:
  %l1 = load <6 x i64>, <6 x i64>* %src, align 4
  %s1 = shufflevector <6 x i64> %l1, <6 x i64> undef, <2 x i32> <i32 0, i32 3>
  %s2 = shufflevector <6 x i64> %l1, <6 x i64> undef, <2 x i32> <i32 1, i32 4>
  %s3 = shufflevector <6 x i64> %l1, <6 x i64> undef, <2 x i32> <i32 2, i32 5>
  %a1 = add <2 x i64> %s1, %s2
  %a = add <2 x i64> %a1, %s3
  store <2 x i64> %a, <2 x i64> *%dst
  ret void
}

define void @vld3_v4i64(<12 x i64> *%src, <4 x i64> *%dst) {
; CHECK-LABEL: vld3_v4i64:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: .pad #24
; CHECK-NEXT: sub sp, #24
; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
; CHECK-NEXT: vldrw.u32 q5, [r0, #48]
; CHECK-NEXT: vldrw.u32 q6, [r0, #64]
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vmov.f64 d4, d0
; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q4, [r0, #32]
; CHECK-NEXT: vmov.f32 s9, s1
; CHECK-NEXT: vldrw.u32 q0, [r0, #80]
; CHECK-NEXT: vmov.f64 d14, d11
; CHECK-NEXT: vmov.f32 s29, s23
; CHECK-NEXT: vmov.f32 s30, s0
; CHECK-NEXT: vmov.f32 s22, s26
; CHECK-NEXT: vmov.f32 s23, s27
; CHECK-NEXT: vmov.f32 s31, s1
; CHECK-NEXT: vmov r3, s30
; CHECK-NEXT: vmov r0, s22
; CHECK-NEXT: vmov.f64 d6, d3
; CHECK-NEXT: vmov.f32 s13, s7
; CHECK-NEXT: vmov.f32 s10, s18
; CHECK-NEXT: vmov.f32 s14, s16
; CHECK-NEXT: vmov.f32 s11, s19
; CHECK-NEXT: vmov.f32 s15, s17
; CHECK-NEXT: vmov.f64 d8, d12
; CHECK-NEXT: vmov.f32 s17, s25
; CHECK-NEXT: vmov.f32 s18, s2
; CHECK-NEXT: vmov.f32 s19, s3
; CHECK-NEXT: vmov r12, s31
; CHECK-NEXT: vmov r2, s23
; CHECK-NEXT: adds.w lr, r0, r3
; CHECK-NEXT: vmov r0, s18
; CHECK-NEXT: vmov r4, s20
; CHECK-NEXT: vmov r3, s19
; CHECK-NEXT: adc.w r2, r2, r12
; CHECK-NEXT: adds.w lr, lr, r0
; CHECK-NEXT: vmov r0, s28
; CHECK-NEXT: adc.w r12, r2, r3
; CHECK-NEXT: vmov r3, s29
; CHECK-NEXT: vmov r2, s21
; CHECK-NEXT: adds r0, r0, r4
; CHECK-NEXT: vmov r4, s16
; CHECK-NEXT: adcs r2, r3
; CHECK-NEXT: vmov r3, s17
; CHECK-NEXT: adds r0, r0, r4
; CHECK-NEXT: vmov.32 q0[0], r0
; CHECK-NEXT: vmov r0, s15
; CHECK-NEXT: adcs r2, r3
; CHECK-NEXT: vmov r3, s14
; CHECK-NEXT: vmov.32 q0[1], r2
; CHECK-NEXT: vmov.32 q0[2], lr
; CHECK-NEXT: vmov.32 q0[3], r12
; CHECK-NEXT: vstrw.32 q0, [r1, #16]
; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
; CHECK-NEXT: vmov.f32 s6, s2
; CHECK-NEXT: vmov.f32 s7, s3
; CHECK-NEXT: vmov r4, s6
; CHECK-NEXT: vmov r2, s7
; CHECK-NEXT: adds r3, r3, r4
; CHECK-NEXT: vmov r4, s10
; CHECK-NEXT: adcs r0, r2
; CHECK-NEXT: vmov r2, s11
; CHECK-NEXT: adds.w lr, r3, r4
; CHECK-NEXT: vmov r3, s4
; CHECK-NEXT: vmov r4, s5
; CHECK-NEXT: adc.w r12, r0, r2
; CHECK-NEXT: vmov r0, s12
; CHECK-NEXT: vmov r2, s13
; CHECK-NEXT: adds r0, r0, r3
; CHECK-NEXT: vmov r3, s9
; CHECK-NEXT: adcs r2, r4
; CHECK-NEXT: vmov r4, s8
; CHECK-NEXT: adds r0, r0, r4
; CHECK-NEXT: adcs r2, r3
; CHECK-NEXT: vmov.32 q0[0], r0
; CHECK-NEXT: vmov.32 q0[1], r2
; CHECK-NEXT: vmov.32 q0[2], lr
; CHECK-NEXT: vmov.32 q0[3], r12
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: add sp, #24
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: pop {r4, pc}
entry:
  %l1 = load <12 x i64>, <12 x i64>* %src, align 4
  %s1 = shufflevector <12 x i64> %l1, <12 x i64> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  %s2 = shufflevector <12 x i64> %l1, <12 x i64> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
  %s3 = shufflevector <12 x i64> %l1, <12 x i64> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
  %a1 = add <4 x i64> %s1, %s2
  %a = add <4 x i64> %a1, %s3
  store <4 x i64> %a, <4 x i64> *%dst
  ret void
}

; f32

define void @vld3_v2f32(<6 x float> *%src, <2 x float> *%dst) {
; CHECK-LABEL: vld3_v2f32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q2, [r0]
; CHECK-NEXT: vldr s1, [r0, #16]
; CHECK-NEXT: vldr s5, [r0, #20]
; CHECK-NEXT: vmov.f64 d6, d4
; CHECK-NEXT: vmov.f32 s13, s11
; CHECK-NEXT: vmov.f32 s0, s9
; CHECK-NEXT: vadd.f32 q0, q3, q0
; CHECK-NEXT: vmov.f32 s4, s10
; CHECK-NEXT: vadd.f32 q0, q0, q1
; CHECK-NEXT: vstmia r1, {s0, s1}
; CHECK-NEXT: bx lr
entry:
  %l1 = load <6 x float>, <6 x float>* %src, align 4
  %s1 = shufflevector <6 x float> %l1, <6 x float> undef, <2 x i32> <i32 0, i32 3>
  %s2 = shufflevector <6 x float> %l1, <6 x float> undef, <2 x i32> <i32 1, i32 4>
  %s3 = shufflevector <6 x float> %l1, <6 x float> undef, <2 x i32> <i32 2, i32 5>
  %a1 = fadd <2 x float> %s1, %s2
  %a = fadd <2 x float> %a1, %s3
  store <2 x float> %a, <2 x float> *%dst
  ret void
}

define void @vld3_v4f32(<12 x float> *%src, <4 x float> *%dst) {
; CHECK-LABEL: vld3_v4f32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
; CHECK-NEXT: vmov.f32 s12, s5
; CHECK-NEXT: vmov.f64 d8, d2
; CHECK-NEXT: vmov.f32 s13, s0
; CHECK-NEXT: vmov.f32 s17, s7
; CHECK-NEXT: vmov.f32 s14, s3
; CHECK-NEXT: vmov.f32 s18, s2
; CHECK-NEXT: vmov.f32 s0, s6
; CHECK-NEXT: vmov.f32 s15, s10
; CHECK-NEXT: vmov.f32 s19, s9
; CHECK-NEXT: vmov.f32 s10, s8
; CHECK-NEXT: vadd.f32 q3, q4, q3
; CHECK-NEXT: vmov.f32 s2, s8
; CHECK-NEXT: vmov.f32 s3, s11
; CHECK-NEXT: vadd.f32 q0, q3, q0
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: bx lr
entry:
  %l1 = load <12 x float>, <12 x float>* %src, align 4
  %s1 = shufflevector <12 x float> %l1, <12 x float> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  %s2 = shufflevector <12 x float> %l1, <12 x float> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
  %s3 = shufflevector <12 x float> %l1, <12 x float> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
  %a1 = fadd <4 x float> %s1, %s2
  %a = fadd <4 x float> %a1, %s3
  store <4 x float> %a, <4 x float> *%dst
  ret void
}

define void @vld3_v8f32(<24 x float> *%src, <8 x float> *%dst) {
; CHECK-LABEL: vld3_v8f32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEXT: vpush {d8, d9, d10, d11}
; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
; CHECK-NEXT: vldrw.u32 q2, [r0, #80]
; CHECK-NEXT: vmov.f32 s12, s5
; CHECK-NEXT: vmov.f64 d8, d2
; CHECK-NEXT: vmov.f32 s13, s0
; CHECK-NEXT: vmov.f32 s17, s7
; CHECK-NEXT: vmov.f32 s14, s3
; CHECK-NEXT: vmov.f32 s18, s2
; CHECK-NEXT: vmov.f32 s0, s6
; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
; CHECK-NEXT: vmov.f32 s15, s10
; CHECK-NEXT: vmov.f32 s19, s9
; CHECK-NEXT: vmov.f32 s10, s8
; CHECK-NEXT: vadd.f32 q3, q4, q3
; CHECK-NEXT: vmov.f32 s2, s8
; CHECK-NEXT: vmov.f32 s3, s11
; CHECK-NEXT: vldrw.u32 q2, [r0]
; CHECK-NEXT: vadd.f32 q0, q3, q0
; CHECK-NEXT: vldrw.u32 q3, [r0, #32]
; CHECK-NEXT: vmov.f32 s16, s9
; CHECK-NEXT: vstrw.32 q0, [r1, #16]
; CHECK-NEXT: vmov.f64 d10, d4
; CHECK-NEXT: vmov.f32 s17, s4
; CHECK-NEXT: vmov.f32 s21, s11
; CHECK-NEXT: vmov.f32 s18, s7
; CHECK-NEXT: vmov.f32 s22, s6
; CHECK-NEXT: vmov.f32 s4, s10
; CHECK-NEXT: vmov.f32 s19, s14
; CHECK-NEXT: vmov.f32 s23, s13
; CHECK-NEXT: vmov.f32 s14, s12
; CHECK-NEXT: vadd.f32 q4, q5, q4
; CHECK-NEXT: vmov.f32 s6, s12
; CHECK-NEXT: vmov.f32 s7, s15
; CHECK-NEXT: vadd.f32 q1, q4, q1
; CHECK-NEXT: vstrw.32 q1, [r1]
; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: bx lr
entry:
  %l1 = load <24 x float>, <24 x float>* %src, align 4
  %s1 = shufflevector <24 x float> %l1, <24 x float> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
  %s2 = shufflevector <24 x float> %l1, <24 x float> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
  %s3 = shufflevector <24 x float> %l1, <24 x float> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
  %a1 = fadd <8 x float> %s1, %s2
  %a = fadd <8 x float> %a1, %s3
  store <8 x float> %a, <8 x float> *%dst
  ret void
}

define void @vld3_v16f32(<48 x float> *%src, <16 x float> *%dst) {
; CHECK-LABEL: vld3_v16f32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
; CHECK-NEXT: vldrw.u32 q2, [r0, #80]
; CHECK-NEXT: vmov.f32 s12, s5
; CHECK-NEXT: vmov.f64 d8, d2
; CHECK-NEXT: vmov.f32 s13, s0
; CHECK-NEXT: vmov.f32 s17, s7
; CHECK-NEXT: vmov.f32 s14, s3
; CHECK-NEXT: vmov.f32 s18, s2
; CHECK-NEXT: vmov.f32 s0, s6
; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
; CHECK-NEXT: vmov.f32 s15, s10
; CHECK-NEXT: vmov.f32 s19, s9
; CHECK-NEXT: vmov.f32 s10, s8
; CHECK-NEXT: vadd.f32 q3, q4, q3
; CHECK-NEXT: vmov.f32 s2, s8
; CHECK-NEXT: vmov.f32 s3, s11
; CHECK-NEXT: vldrw.u32 q2, [r0]
; CHECK-NEXT: vadd.f32 q0, q3, q0
; CHECK-NEXT: vldrw.u32 q3, [r0, #32]
; CHECK-NEXT: vmov.f32 s16, s9
; CHECK-NEXT: vmov.f64 d10, d4
; CHECK-NEXT: vmov.f32 s17, s4
; CHECK-NEXT: vmov.f32 s21, s11
; CHECK-NEXT: vmov.f32 s18, s7
; CHECK-NEXT: vmov.f32 s22, s6
; CHECK-NEXT: vmov.f32 s4, s10
; CHECK-NEXT: vldrw.u32 q2, [r0, #160]
; CHECK-NEXT: vmov.f32 s19, s14
; CHECK-NEXT: vmov.f32 s23, s13
; CHECK-NEXT: vmov.f32 s14, s12
; CHECK-NEXT: vadd.f32 q4, q5, q4
; CHECK-NEXT: vmov.f32 s6, s12
; CHECK-NEXT: vmov.f32 s7, s15
; CHECK-NEXT: vldrw.u32 q3, [r0, #144]
; CHECK-NEXT: vadd.f32 q1, q4, q1
; CHECK-NEXT: vldrw.u32 q4, [r0, #176]
; CHECK-NEXT: vmov.f32 s20, s13
; CHECK-NEXT: vmov.f64 d12, d6
; CHECK-NEXT: vmov.f32 s21, s8
; CHECK-NEXT: vmov.f32 s25, s15
; CHECK-NEXT: vmov.f32 s22, s11
; CHECK-NEXT: vmov.f32 s26, s10
; CHECK-NEXT: vmov.f32 s8, s14
; CHECK-NEXT: vldrw.u32 q3, [r0, #112]
; CHECK-NEXT: vmov.f32 s23, s18
; CHECK-NEXT: vmov.f32 s27, s17
; CHECK-NEXT: vmov.f32 s18, s16
; CHECK-NEXT: vadd.f32 q5, q6, q5
; CHECK-NEXT: vmov.f32 s10, s16
; CHECK-NEXT: vmov.f32 s11, s19
; CHECK-NEXT: vldrw.u32 q4, [r0, #96]
; CHECK-NEXT: vadd.f32 q2, q5, q2
; CHECK-NEXT: vldrw.u32 q5, [r0, #128]
; CHECK-NEXT: vmov.f32 s24, s17
; CHECK-NEXT: vstrw.32 q2, [r1, #48]
; CHECK-NEXT: vmov.f64 d14, d8
; CHECK-NEXT: vstrw.32 q0, [r1, #16]
; CHECK-NEXT: vstrw.32 q1, [r1]
; CHECK-NEXT: vmov.f32 s25, s12
; CHECK-NEXT: vmov.f32 s29, s19
; CHECK-NEXT: vmov.f32 s26, s15
; CHECK-NEXT: vmov.f32 s30, s14
; CHECK-NEXT: vmov.f32 s12, s18
; CHECK-NEXT: vmov.f32 s27, s22
; CHECK-NEXT: vmov.f32 s31, s21
; CHECK-NEXT: vmov.f32 s22, s20
; CHECK-NEXT: vadd.f32 q6, q7, q6
; CHECK-NEXT: vmov.f32 s14, s20
; CHECK-NEXT: vmov.f32 s15, s23
; CHECK-NEXT: vadd.f32 q3, q6, q3
; CHECK-NEXT: vstrw.32 q3, [r1, #32]
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: bx lr
entry:
  %l1 = load <48 x float>, <48 x float>* %src, align 4
  %s1 = shufflevector <48 x float> %l1, <48 x float> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
  %s2 = shufflevector <48 x float> %l1, <48 x float> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
  %s3 = shufflevector <48 x float> %l1, <48 x float> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
  %a1 = fadd <16 x float> %s1, %s2
  %a = fadd <16 x float> %a1, %s3
  store <16 x float> %a, <16 x float> *%dst
  ret void
}

; f16

define void @vld3_v2f16(<6 x half> *%src, <2 x half> *%dst) {
; CHECK-LABEL: vld3_v2f16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldrd r2, r3, [r0]
; CHECK-NEXT: ldr r0, [r0, #8]
; CHECK-NEXT: vmov.32 q0[0], r2
; CHECK-NEXT: vmov.32 q0[1], r3
; CHECK-NEXT: vmov.32 q0[2], r0
; CHECK-NEXT: vmovx.f16 s4, s0
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vmov r0, s4
; CHECK-NEXT: vmovx.f16 s8, s1
; CHECK-NEXT: vmov.16 q1[0], r0
; CHECK-NEXT: vmov r0, s8
; CHECK-NEXT: vmov.16 q1[1], r2
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: vmov.16 q2[0], r2
; CHECK-NEXT: vmov r2, s1
; CHECK-NEXT: vmov.16 q2[1], r0
; CHECK-NEXT: vadd.f16 q1, q2, q1
; CHECK-NEXT: vmovx.f16 s8, s2
; CHECK-NEXT: vmov r0, s8
; CHECK-NEXT: vmov.16 q0[0], r2
; CHECK-NEXT: vmov.16 q0[1], r0
; CHECK-NEXT: vadd.f16 q0, q1, q0
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: str r0, [r1]
; CHECK-NEXT: bx lr
entry:
  %l1 = load <6 x half>, <6 x half>* %src, align 4
  %s1 = shufflevector <6 x half> %l1, <6 x half> undef, <2 x i32> <i32 0, i32 3>
  %s2 = shufflevector <6 x half> %l1, <6 x half> undef, <2 x i32> <i32 1, i32 4>
  %s3 = shufflevector <6 x half> %l1, <6 x half> undef, <2 x i32> <i32 2, i32 5>
  %a1 = fadd <2 x half> %s1, %s2
  %a = fadd <2 x half> %a1, %s3
  store <2 x half> %a, <2 x half> *%dst
  ret void
}

define void @vld3_v4f16(<12 x half> *%src, <4 x half> *%dst) {
; CHECK-LABEL: vld3_v4f16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8}
; CHECK-NEXT: vpush {d8}
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vmovx.f16 s4, s0
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vmov r3, s4
; CHECK-NEXT: vmovx.f16 s4, s3
; CHECK-NEXT: vmov.16 q2[0], r3
; CHECK-NEXT: vmovx.f16 s12, s1
; CHECK-NEXT: vmov.16 q2[1], r2
; CHECK-NEXT: vmov r2, s4
; CHECK-NEXT: vmov.16 q2[2], r2
; CHECK-NEXT: ldrd r2, r0, [r0, #16]
; CHECK-NEXT: vmov.32 q1[0], r2
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: vmov.32 q1[1], r0
; CHECK-NEXT: vmovx.f16 s0, s2
; CHECK-NEXT: vmov r0, s5
; CHECK-NEXT: vmovx.f16 s16, s4
; CHECK-NEXT: vmov.16 q2[3], r0
; CHECK-NEXT: vmov r0, s12
; CHECK-NEXT: vmov.16 q3[0], r2
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: vmov.16 q3[1], r0
; CHECK-NEXT: vmov r0, s3
; CHECK-NEXT: vmov.16 q3[2], r0
; CHECK-NEXT: vmov r0, s16
; CHECK-NEXT: vmov.16 q3[3], r0
; CHECK-NEXT: vmov r0, s1
; CHECK-NEXT: vmov.16 q0[0], r0
; CHECK-NEXT: vmov r0, s4
; CHECK-NEXT: vmov.16 q0[1], r2
; CHECK-NEXT: vmovx.f16 s4, s5
; CHECK-NEXT: vmov.16 q0[2], r0
; CHECK-NEXT: vmov r0, s4
; CHECK-NEXT: vadd.f16 q2, q3, q2
; CHECK-NEXT: vmov.16 q0[3], r0
; CHECK-NEXT: vadd.f16 q0, q2, q0
; CHECK-NEXT: vmov r2, s1
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: strd r0, r2, [r1]
; CHECK-NEXT: vpop {d8}
; CHECK-NEXT: bx lr
entry:
  %l1 = load <12 x half>, <12 x half>* %src, align 4
  %s1 = shufflevector <12 x half> %l1, <12 x half> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  %s2 = shufflevector <12 x half> %l1, <12 x half> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
  %s3 = shufflevector <12 x half> %l1, <12 x half> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
  %a1 = fadd <4 x half> %s1, %s2
  %a = fadd <4 x half> %a1, %s3
  store <4 x half> %a, <4 x half> *%dst
  ret void
}

define void @vld3_v8f16(<24 x half> *%src, <8 x half> *%dst) {
; CHECK-LABEL: vld3_v8f16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
; CHECK-NEXT: vldrw.u32 q4, [r0, #32]
; CHECK-NEXT: vmovx.f16 s0, s6
; CHECK-NEXT: vmov r3, s5
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: vmov.16 q0[0], r3
; CHECK-NEXT: vmov.16 q0[1], r2
; CHECK-NEXT: vmov r2, s8
; CHECK-NEXT: vmovx.f16 s12, s9
; CHECK-NEXT: vmov.16 q0[2], r2
; CHECK-NEXT: vmov r2, s12
; CHECK-NEXT: vmovx.f16 s12, s19
; CHECK-NEXT: vmov.16 q0[3], r2
; CHECK-NEXT: vmov r3, s18
; CHECK-NEXT: vmov.f32 s2, s11
; CHECK-NEXT: vmovx.f16 s20, s16
; CHECK-NEXT: vmov r0, s12
; CHECK-NEXT: vmov.16 q3[6], r3
; CHECK-NEXT: vmov.16 q3[7], r0
; CHECK-NEXT: vmov r0, s20
; CHECK-NEXT: vmov.f32 s14, s16
; CHECK-NEXT: vmovx.f16 s24, s8
; CHECK-NEXT: vmov r4, s0
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vmov.16 q5[4], r2
; CHECK-NEXT: vmov r2, s17
; CHECK-NEXT: vmov.16 q5[5], r0
; CHECK-NEXT: vmov r0, s19
; CHECK-NEXT: vmov lr, s22
; CHECK-NEXT: vmovx.f16 s20, s17
; CHECK-NEXT: vmov r3, s20
; CHECK-NEXT: vmov.16 q5[6], r3
; CHECK-NEXT: vmov.16 q5[7], r0
; CHECK-NEXT: vmov r0, s16
; CHECK-NEXT: vmov r12, s23
; CHECK-NEXT: vmovx.f16 s20, s10
; CHECK-NEXT: vmov r3, s20
; CHECK-NEXT: vmov.16 q5[4], r3
; CHECK-NEXT: vmov.16 q5[5], r0
; CHECK-NEXT: vmov r3, s22
; CHECK-NEXT: vmovx.f16 s20, s18
; CHECK-NEXT: vmov r0, s20
; CHECK-NEXT: vmov.16 q4[6], r2
; CHECK-NEXT: vmov.16 q4[7], r0
; CHECK-NEXT: vmovx.f16 s20, s5
; CHECK-NEXT: vmov r0, s4
; CHECK-NEXT: vmov r2, s20
; CHECK-NEXT: vmov.16 q5[0], r0
; CHECK-NEXT: vmov.16 q5[1], r2
; CHECK-NEXT: vmov r0, s7
; CHECK-NEXT: vmov.16 q5[2], r0
; CHECK-NEXT: vmov r0, s24
; CHECK-NEXT: vmov.16 q5[3], r0
; CHECK-NEXT: vmov r0, s10
; CHECK-NEXT: vmovx.f16 s24, s11
; CHECK-NEXT: vmov.16 q5[4], r0
; CHECK-NEXT: vmov r0, s24
; CHECK-NEXT: vmovx.f16 s24, s4
; CHECK-NEXT: vmov r2, s24
; CHECK-NEXT: vmov.16 q5[5], r0
; CHECK-NEXT: vmov r0, s6
; CHECK-NEXT: vmov.16 q6[0], r2
; CHECK-NEXT: vmovx.f16 s4, s7
; CHECK-NEXT: vmov.16 q6[1], r0
; CHECK-NEXT: vmov r0, s4
; CHECK-NEXT: vmov.32 q1[0], r4
; CHECK-NEXT: vmov.16 q6[2], r0
; CHECK-NEXT: vmov r0, s9
; CHECK-NEXT: vmov.16 q6[3], r0
; CHECK-NEXT: vmov r4, s1
; CHECK-NEXT: vmov r2, s24
; CHECK-NEXT: vmov.32 q1[1], r4
; CHECK-NEXT: vmov r0, s25
; CHECK-NEXT: vmov.32 q0[0], r2
; CHECK-NEXT: vmov.32 q0[1], r0
; CHECK-NEXT: vmov.32 q1[2], lr
; CHECK-NEXT: vmov.32 q0[2], r3
; CHECK-NEXT: vmov r4, s15
; CHECK-NEXT: vmov.f32 s23, s19
; CHECK-NEXT: vmov.32 q0[3], r12
; CHECK-NEXT: vmov.32 q1[3], r4
; CHECK-NEXT: vadd.f16 q0, q5, q0
; CHECK-NEXT: vadd.f16 q0, q0, q1
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: pop {r4, pc}
entry:
  %l1 = load <24 x half>, <24 x half>* %src, align 4
  %s1 = shufflevector <24 x half> %l1, <24 x half> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
  %s2 = shufflevector <24 x half> %l1, <24 x half> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
  %s3 = shufflevector <24 x half> %l1, <24 x half> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
  %a1 = fadd <8 x half> %s1, %s2
  %a = fadd <8 x half> %a1, %s3
  store <8 x half> %a, <8 x half> *%dst
  ret void
}

define void @vld3_v16f16(<48 x half> *%src, <16 x half> *%dst) {
; CHECK-LABEL: vld3_v16f16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r7, lr}
; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vldrw.u32 q4, [r0, #80]
; CHECK-NEXT: vldrw.u32 q2, [r0, #48]
; CHECK-NEXT: vldrw.u32 q3, [r0, #64]
; CHECK-NEXT: vmovx.f16 s0, s19
; CHECK-NEXT: vmovx.f16 s4, s16
; CHECK-NEXT: vmov r2, s18
; CHECK-NEXT: vmovx.f16 s20, s13
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: vmov.16 q0[6], r2
; CHECK-NEXT: vmov r12, s4
; CHECK-NEXT: vmovx.f16 s4, s10
; CHECK-NEXT: vmov r2, s9
; CHECK-NEXT: vmov.16 q0[7], r3
; CHECK-NEXT: vmov r3, s4
; CHECK-NEXT: vmov.16 q1[0], r2
; CHECK-NEXT: vmov.16 q1[1], r3
; CHECK-NEXT: vmov r2, s12
; CHECK-NEXT: vmov.16 q1[2], r2
; CHECK-NEXT: vmov r2, s20
; CHECK-NEXT: vmov.16 q1[3], r2
; CHECK-NEXT: vmov r3, s16
; CHECK-NEXT: vmov.f32 s6, s15
; CHECK-NEXT: vmovx.f16 s24, s12
; CHECK-NEXT: vmov.f32 s2, s16
; CHECK-NEXT: vmovx.f16 s16, s18
; CHECK-NEXT: vmov r4, s16
; CHECK-NEXT: vmov r5, s4
; CHECK-NEXT: vmov r2, s6
; CHECK-NEXT: vmov.16 q5[4], r2
; CHECK-NEXT: vmov.16 q5[5], r12
; CHECK-NEXT: vmov lr, s22
; CHECK-NEXT: vmovx.f16 s20, s14
; CHECK-NEXT: vmov r2, s20
; CHECK-NEXT: vmov.16 q5[4], r2
; CHECK-NEXT: vmov r2, s19
; CHECK-NEXT: vmov.16 q5[5], r3
; CHECK-NEXT: vmov r12, s22
; CHECK-NEXT: vmovx.f16 s20, s17
; CHECK-NEXT: vmov r3, s20
; CHECK-NEXT: vmov.16 q5[6], r3
; CHECK-NEXT: vmov r3, s17
; CHECK-NEXT: vmov.16 q5[7], r2
; CHECK-NEXT: vmov.16 q4[6], r3
; CHECK-NEXT: vmov r2, s23
; CHECK-NEXT: vmov.16 q4[7], r4
; CHECK-NEXT: vmovx.f16 s20, s9
; CHECK-NEXT: vmov r4, s8
; CHECK-NEXT: vmov r3, s20
; CHECK-NEXT: vmov.16 q5[0], r4
; CHECK-NEXT: vmov.16 q5[1], r3
; CHECK-NEXT: vmov r3, s11
; CHECK-NEXT: vmov.16 q5[2], r3
; CHECK-NEXT: vmov r3, s24
; CHECK-NEXT: vmov.16 q5[3], r3
; CHECK-NEXT: vmov r3, s14
; CHECK-NEXT: vmovx.f16 s24, s15
; CHECK-NEXT: vmov.16 q5[4], r3
; CHECK-NEXT: vmov r3, s24
; CHECK-NEXT: vmovx.f16 s24, s8
; CHECK-NEXT: vmov.16 q5[5], r3
; CHECK-NEXT: vmov r3, s24
; CHECK-NEXT: vmov r4, s10
; CHECK-NEXT: vmov.16 q6[0], r3
; CHECK-NEXT: vmovx.f16 s8, s11
; CHECK-NEXT: vmov.16 q6[1], r4
; CHECK-NEXT: vmov r3, s8
; CHECK-NEXT: vmov.32 q2[0], r5
; CHECK-NEXT: vmov.16 q6[2], r3
; CHECK-NEXT: vmov r3, s13
; CHECK-NEXT: vmov.16 q6[3], r3
; CHECK-NEXT: vmov r5, s5
; CHECK-NEXT: vmov r3, s24
; CHECK-NEXT: vmov.32 q2[1], r5
; CHECK-NEXT: vmov r5, s3
; CHECK-NEXT: vmov.32 q0[0], r3
; CHECK-NEXT: vmov r4, s25
; CHECK-NEXT: vmov.32 q2[2], lr
; CHECK-NEXT: vmov.32 q0[1], r4
; CHECK-NEXT: vmov.f32 s23, s19
; CHECK-NEXT: vmov.32 q0[2], r12
; CHECK-NEXT: vldrw.u32 q4, [r0, #32]
; CHECK-NEXT: vmov.32 q0[3], r2
; CHECK-NEXT: vmov.32 q2[3], r5
; CHECK-NEXT: vadd.f16 q0, q5, q0
; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
; CHECK-NEXT: vadd.f16 q0, q0, q2
; CHECK-NEXT: vldrw.u32 q2, [r0]
; CHECK-NEXT: vstrw.32 q0, [r1, #16]
; CHECK-NEXT: vmovx.f16 s0, s19
; CHECK-NEXT: vmov r0, s18
; CHECK-NEXT: vmovx.f16 s12, s16
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: vmov.16 q0[6], r0
; CHECK-NEXT: vmov.16 q0[7], r2
; CHECK-NEXT: vmov r0, s12
; CHECK-NEXT: vmovx.f16 s12, s10
; CHECK-NEXT: vmov r2, s9
; CHECK-NEXT: vmov r3, s12
; CHECK-NEXT: vmov.16 q3[0], r2
; CHECK-NEXT: vmov.16 q3[1], r3
; CHECK-NEXT: vmov r2, s4
; CHECK-NEXT: vmovx.f16 s20, s5
; CHECK-NEXT: vmov.16 q3[2], r2
; CHECK-NEXT: vmov r2, s20
; CHECK-NEXT: vmovx.f16 s24, s4
; CHECK-NEXT: vmov.16 q3[3], r2
; CHECK-NEXT: vmov r3, s16
; CHECK-NEXT: vmov.f32 s14, s7
; CHECK-NEXT: vmov.f32 s2, s16
; CHECK-NEXT: vmovx.f16 s16, s18
; CHECK-NEXT: vmov r4, s16
; CHECK-NEXT: vmov r2, s14
; CHECK-NEXT: vmov.16 q5[4], r2
; CHECK-NEXT: vmov.16 q5[5], r0
; CHECK-NEXT: vmov r2, s22
; CHECK-NEXT: vmovx.f16 s20, s6
; CHECK-NEXT: vmov r0, s20
; CHECK-NEXT: vmov.16 q5[4], r0
; CHECK-NEXT: vmov r0, s12
; CHECK-NEXT: vmov.16 q5[5], r3
; CHECK-NEXT: vmov r3, s19
; CHECK-NEXT: vmov r12, s22
; CHECK-NEXT: vmovx.f16 s20, s17
; CHECK-NEXT: vmov r5, s20
; CHECK-NEXT: vmov.16 q5[6], r5
; CHECK-NEXT: vmov r5, s17
; CHECK-NEXT: vmov.16 q5[7], r3
; CHECK-NEXT: vmov.16 q4[6], r5
; CHECK-NEXT: vmov r3, s23
; CHECK-NEXT: vmov.16 q4[7], r4
; CHECK-NEXT: vmovx.f16 s20, s9
; CHECK-NEXT: vmov r4, s8
; CHECK-NEXT: vmov r5, s20
; CHECK-NEXT: vmov.16 q5[0], r4
; CHECK-NEXT: vmov.16 q5[1], r5
; CHECK-NEXT: vmov r5, s11
; CHECK-NEXT: vmov.16 q5[2], r5
; CHECK-NEXT: vmov r5, s24
; CHECK-NEXT: vmov.16 q5[3], r5
; CHECK-NEXT: vmov r5, s6
; CHECK-NEXT: vmovx.f16 s24, s7
; CHECK-NEXT: vmov.16 q5[4], r5
; CHECK-NEXT: vmov r5, s24
; CHECK-NEXT: vmovx.f16 s24, s8
; CHECK-NEXT: vmov.16 q5[5], r5
; CHECK-NEXT: vmov r5, s24
; CHECK-NEXT: vmov r4, s10
; CHECK-NEXT: vmov.16 q6[0], r5
; CHECK-NEXT: vmovx.f16 s8, s11
; CHECK-NEXT: vmov.16 q6[1], r4
; CHECK-NEXT: vmov r5, s8
; CHECK-NEXT: vmov.16 q6[2], r5
; CHECK-NEXT: vmov r5, s5
; CHECK-NEXT: vmov.16 q6[3], r5
; CHECK-NEXT: vmov.32 q1[0], r0
; CHECK-NEXT: vmov r0, s13
; CHECK-NEXT: vmov r5, s24
; CHECK-NEXT: vmov.32 q1[1], r0
; CHECK-NEXT: vmov r0, s3
; CHECK-NEXT: vmov.32 q0[0], r5
; CHECK-NEXT: vmov r4, s25
; CHECK-NEXT: vmov.32 q1[2], r2
; CHECK-NEXT: vmov.32 q0[1], r4
; CHECK-NEXT: vmov.f32 s23, s19
; CHECK-NEXT: vmov.32 q0[2], r12
; CHECK-NEXT: vmov.32 q1[3], r0
; CHECK-NEXT: vmov.32 q0[3], r3
; CHECK-NEXT: vadd.f16 q0, q5, q0
; CHECK-NEXT: vadd.f16 q0, q0, q1
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: pop {r4, r5, r7, pc}
entry:
  %l1 = load <48 x half>, <48 x half>* %src, align 4
  %s1 = shufflevector <48 x half> %l1, <48 x half> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
  %s2 = shufflevector <48 x half> %l1, <48 x half> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
  %s3 = shufflevector <48 x half> %l1, <48 x half> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
  %a1 = fadd <16 x half> %s1, %s2
  %a = fadd <16 x half> %a1, %s3
  store <16 x half> %a, <16 x half> *%dst
  ret void
}

; f64

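; The f64 cases deinterleave at the d-register level and use scalar vadd.f64,
; since there is no packed 64-bit float add in MVE (as far as I can tell).
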
1625; CHECK-NEXT: vmov r4, s25 1626; CHECK-NEXT: vmov.32 q1[2], r2 1627; CHECK-NEXT: vmov.32 q0[1], r4 1628; CHECK-NEXT: vmov.f32 s23, s19 1629; CHECK-NEXT: vmov.32 q0[2], r12 1630; CHECK-NEXT: vmov.32 q1[3], r0 1631; CHECK-NEXT: vmov.32 q0[3], r3 1632; CHECK-NEXT: vadd.f16 q0, q5, q0 1633; CHECK-NEXT: vadd.f16 q0, q0, q1 1634; CHECK-NEXT: vstrw.32 q0, [r1] 1635; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} 1636; CHECK-NEXT: pop {r4, r5, r7, pc} 1637entry: 1638 %l1 = load <48 x half>, <48 x half>* %src, align 4 1639 %s1 = shufflevector <48 x half> %l1, <48 x half> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45> 1640 %s2 = shufflevector <48 x half> %l1, <48 x half> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46> 1641 %s3 = shufflevector <48 x half> %l1, <48 x half> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47> 1642 %a1 = fadd <16 x half> %s1, %s2 1643 %a = fadd <16 x half> %a1, %s3 1644 store <16 x half> %a, <16 x half> *%dst 1645 ret void 1646} 1647 1648; f64 1649 1650define void @vld3_v2f64(<6 x double> *%src, <2 x double> *%dst) { 1651; CHECK-LABEL: vld3_v2f64: 1652; CHECK: @ %bb.0: @ %entry 1653; CHECK-NEXT: vldrw.u32 q0, [r0, #32] 1654; CHECK-NEXT: vldrw.u32 q1, [r0, #16] 1655; CHECK-NEXT: vldrw.u32 q3, [r0] 1656; CHECK-NEXT: vadd.f64 d4, d3, d0 1657; CHECK-NEXT: vadd.f64 d5, d6, d7 1658; CHECK-NEXT: vadd.f64 d1, d4, d1 1659; CHECK-NEXT: vadd.f64 d0, d5, d2 1660; CHECK-NEXT: vstrw.32 q0, [r1] 1661; CHECK-NEXT: bx lr 1662entry: 1663 %l1 = load <6 x double>, <6 x double>* %src, align 4 1664 %s1 = shufflevector <6 x double> %l1, <6 x double> undef, <2 x i32> <i32 0, i32 3> 1665 %s2 = shufflevector <6 x double> %l1, <6 x double> undef, <2 x i32> <i32 1, i32 4> 1666 %s3 = shufflevector <6 x double> %l1, <6 x double> undef, <2 x i32> <i32 2, i32 5> 1667 %a1 = fadd <2 x double> %s1, %s2 1668 %a = fadd <2 x double> %a1, %s3 1669 store <2 x double> %a, <2 x double> *%dst 1670 ret void 1671} 1672 1673define void @vld3_v4f64(<12 x double> *%src, <4 x double> *%dst) { 1674; CHECK-LABEL: vld3_v4f64: 1675; CHECK: @ %bb.0: @ %entry 1676; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} 1677; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} 1678; CHECK-NEXT: vldrw.u32 q3, [r0, #48] 1679; CHECK-NEXT: vldrw.u32 q1, [r0, #80] 1680; CHECK-NEXT: vldrw.u32 q0, [r0, #64] 1681; CHECK-NEXT: vldrw.u32 q4, [r0, #16] 1682; CHECK-NEXT: vadd.f64 d5, d6, d7 1683; CHECK-NEXT: vldrw.u32 q3, [r0, #32] 1684; CHECK-NEXT: vldrw.u32 q6, [r0] 1685; CHECK-NEXT: vadd.f64 d4, d1, d2 1686; CHECK-NEXT: vadd.f64 d10, d9, d6 1687; CHECK-NEXT: vadd.f64 d11, d12, d13 1688; CHECK-NEXT: vadd.f64 d3, d4, d3 1689; CHECK-NEXT: vadd.f64 d2, d5, d0 1690; CHECK-NEXT: vadd.f64 d1, d10, d7 1691; CHECK-NEXT: vstrw.32 q1, [r1, #16] 1692; CHECK-NEXT: vadd.f64 d0, d11, d8 1693; CHECK-NEXT: vstrw.32 q0, [r1] 1694; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} 1695; CHECK-NEXT: bx lr 1696entry: 1697 %l1 = load <12 x double>, <12 x double>* %src, align 4 1698 %s1 = shufflevector <12 x double> %l1, <12 x double> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9> 1699 %s2 = shufflevector <12 x double> %l1, <12 x double> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10> 1700 %s3 = shufflevector <12 x double> %l1, <12 x double> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11> 1701 
%a1 = fadd <4 x double> %s1, %s2 1702 %a = fadd <4 x double> %a1, %s3 1703 store <4 x double> %a, <4 x double> *%dst 1704 ret void 1705} 1706