; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedldst %s -o - | FileCheck %s

; Tests lowering of llvm.masked.scatter with all-true masks on MVE.
; v4i32/v4f32 scatters of pointer vectors select the VSTRW.32 [Q] form;
; the other widths are expanded to scalar stores (see the CHECK lines,
; which are autogenerated and must match llc output exactly).

; i32

; Expand (scalarised: one str per lane, per the CHECK lines below)
define arm_aapcs_vfpcc void @ptr_v2i32(<2 x i32> %v, <2 x i32*>* %offptr) {
; CHECK-LABEL: ptr_v2i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    ldrd r1, r0, [r0]
; CHECK-NEXT:    str r2, [r1]
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <2 x i32*>, <2 x i32*>* %offptr, align 4
  call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %v, <2 x i32*> %offs, i32 4, <2 x i1> <i1 true, i1 true>)
  ret void
}

; VSTRW.32 Qd, [offs, 0]
define arm_aapcs_vfpcc void @ptr_v4i32(<4 x i32> %v, <4 x i32*>* %offptr) {
; CHECK-LABEL: ptr_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vstrw.32 q0, [q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i32*>, <4 x i32*>* %offptr, align 4
  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %v, <4 x i32*> %offs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand
define arm_aapcs_vfpcc void @ptr_v8i32(<8 x i32> %v, <8 x i32*>* %offptr) {
; CHECK-LABEL: ptr_v8i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q3, [r0]
; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
; CHECK-NEXT:    vmov r1, s0
; CHECK-NEXT:    vmov r0, s12
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    vmov r0, s13
; CHECK-NEXT:    vmov r1, s1
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    vmov r0, s14
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    vmov r0, s15
; CHECK-NEXT:    vmov r1, s3
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov r1, s4
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    vmov r0, s9
; CHECK-NEXT:    vmov r1, s5
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    vmov r1, s6
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    vmov r0, s11
; CHECK-NEXT:    vmov r1, s7
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i32*>, <8 x i32*>* %offptr, align 4
  call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %v, <8 x i32*> %offs, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand
define arm_aapcs_vfpcc void @ptr_v16i32(<16 x i32> %v, <16 x i32*>* %offptr) {
; CHECK-LABEL: ptr_v16i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vldrw.u32 q7, [r0]
; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q5, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q6, [r0, #16]
; CHECK-NEXT:    vmov r0, s28
; CHECK-NEXT:    vmov r1, s0
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    vmov r0, s29
; CHECK-NEXT:    vmov r1, s1
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    vmov r0, s30
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    vmov r0, s31
; CHECK-NEXT:    vmov r1, s3
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    vmov r0, s24
; CHECK-NEXT:    vmov r1, s4
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    vmov r0, s25
; CHECK-NEXT:    vmov r1, s5
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    vmov r0, s26
; CHECK-NEXT:    vmov r1, s6
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    vmov r0, s27
; CHECK-NEXT:    vmov r1, s7
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    vmov r0, s20
; CHECK-NEXT:    vmov r1, s8
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    vmov r0, s21
; CHECK-NEXT:    vmov r1, s9
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    vmov r0, s22
; CHECK-NEXT:    vmov r1, s10
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    vmov r0, s23
; CHECK-NEXT:    vmov r1, s11
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    vmov r0, s16
; CHECK-NEXT:    vmov r1, s12
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    vmov r0, s17
; CHECK-NEXT:    vmov r1, s13
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    vmov r0, s18
; CHECK-NEXT:    vmov r1, s14
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    vmov r0, s19
; CHECK-NEXT:    vmov r1, s15
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    bx lr
entry:
  %offs = load <16 x i32*>, <16 x i32*>* %offptr, align 4
  call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %v, <16 x i32*> %offs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; f32

; Expand
define arm_aapcs_vfpcc void @ptr_v2f32(<2 x float> %v, <2 x float*>* %offptr) {
; CHECK-LABEL: ptr_v2f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrd r1, r0, [r0]
; CHECK-NEXT:    vstr s0, [r1]
; CHECK-NEXT:    vstr s1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <2 x float*>, <2 x float*>* %offptr, align 4
  call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> %v, <2 x float*> %offs, i32 4, <2 x i1> <i1 true, i1 true>)
  ret void
}

; VSTRW.32 Qd, [offs, 0]
define arm_aapcs_vfpcc void @ptr_v4f32(<4 x float> %v, <4 x float*>* %offptr) {
; CHECK-LABEL: ptr_v4f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vstrw.32 q0, [q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x float*>, <4 x float*>* %offptr, align 4
  call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %v, <4 x float*> %offs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand
define arm_aapcs_vfpcc void @ptr_v8f32(<8 x float> %v, <8 x float*>* %offptr) {
; CHECK-LABEL: ptr_v8f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
; CHECK-NEXT:    vmov r12, s11
; CHECK-NEXT:    vmov lr, s10
; CHECK-NEXT:    vmov r3, s9
; CHECK-NEXT:    vmov r1, s8
; CHECK-NEXT:    vldrw.u32 q2, [r0]
; CHECK-NEXT:    vmov r5, s8
; CHECK-NEXT:    vmov r0, s11
; CHECK-NEXT:    vmov r2, s10
; CHECK-NEXT:    vmov r4, s9
; CHECK-NEXT:    vstr s0, [r5]
; CHECK-NEXT:    vstr s1, [r4]
; CHECK-NEXT:    vstr s2, [r2]
; CHECK-NEXT:    vstr s3, [r0]
; CHECK-NEXT:    vstr s4, [r1]
; CHECK-NEXT:    vstr s5, [r3]
; CHECK-NEXT:    vstr s6, [lr]
; CHECK-NEXT:    vstr s7, [r12]
; CHECK-NEXT:    pop {r4, r5, r7, pc}
entry:
  %offs = load <8 x float*>, <8 x float*>* %offptr, align 4
  call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> %v, <8 x float*> %offs, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; i16

; Expand.
define arm_aapcs_vfpcc void @ptr_i16(<8 x i16> %v, <8 x i16*>* %offptr) {
; CHECK-LABEL: ptr_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q2, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vmov.u16 r1, q0[0]
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s9
; CHECK-NEXT:    vmov.u16 r1, q0[1]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    vmov.u16 r1, q0[2]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s11
; CHECK-NEXT:    vmov.u16 r1, q0[3]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov.u16 r1, q0[4]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s5
; CHECK-NEXT:    vmov.u16 r1, q0[5]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmov.u16 r1, q0[6]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s7
; CHECK-NEXT:    vmov.u16 r1, q0[7]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i16*>, <8 x i16*>* %offptr, align 4
  call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %v, <8 x i16*> %offs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand
define arm_aapcs_vfpcc void @ptr_v2i16_trunc(<2 x i32> %v, <2 x i16*>* %offptr) {
; CHECK-LABEL: ptr_v2i16_trunc:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    ldrd r1, r0, [r0]
; CHECK-NEXT:    strh r2, [r1]
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <2 x i16*>, <2 x i16*>* %offptr, align 4
  %ext = trunc <2 x i32> %v to <2 x i16>
  call void @llvm.masked.scatter.v2i16.v2p0i16(<2 x i16> %ext, <2 x i16*> %offs, i32 2, <2 x i1> <i1 true, i1 true>)
  ret void
}

; Expand
define arm_aapcs_vfpcc void @ptr_v4i16_trunc(<4 x i32> %v, <4 x i16*>* %offptr) {
; CHECK-LABEL: ptr_v4i16_trunc:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vmov r1, s0
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s5
; CHECK-NEXT:    vmov r1, s1
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s7
; CHECK-NEXT:    vmov r1, s3
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16*>, <4 x i16*>* %offptr, align 4
  %ext = trunc <4 x i32> %v to <4 x i16>
  call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %ext, <4 x i16*> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand
define arm_aapcs_vfpcc void @ptr_v8i16_trunc(<8 x i32> %v, <8 x i16*>* %offptr) {
; CHECK-LABEL: ptr_v8i16_trunc:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q3, [r0]
; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
; CHECK-NEXT:    vmov r1, s0
; CHECK-NEXT:    vmov r0, s12
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s13
; CHECK-NEXT:    vmov r1, s1
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s14
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s15
; CHECK-NEXT:    vmov r1, s3
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov r1, s4
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s9
; CHECK-NEXT:    vmov r1, s5
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    vmov r1, s6
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s11
; CHECK-NEXT:    vmov r1, s7
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i16*>, <8 x i16*>* %offptr, align 4
  %ext = trunc <8 x i32> %v to <8 x i16>
  call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %ext, <8 x i16*> %offs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; f16

; Expand.
define arm_aapcs_vfpcc void @ptr_f16(<8 x half> %v, <8 x half*>* %offptr) {
; CHECK-LABEL: ptr_f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q2, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vmovx.f16 s12, s0
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vstr.16 s0, [r0]
; CHECK-NEXT:    vmov r0, s9
; CHECK-NEXT:    vstr.16 s12, [r0]
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    vstr.16 s1, [r0]
; CHECK-NEXT:    vmov r0, s11
; CHECK-NEXT:    vmovx.f16 s8, s1
; CHECK-NEXT:    vmovx.f16 s0, s3
; CHECK-NEXT:    vstr.16 s8, [r0]
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vstr.16 s2, [r0]
; CHECK-NEXT:    vmov r0, s5
; CHECK-NEXT:    vmovx.f16 s8, s2
; CHECK-NEXT:    vstr.16 s8, [r0]
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vstr.16 s3, [r0]
; CHECK-NEXT:    vmov r0, s7
; CHECK-NEXT:    vstr.16 s0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x half*>, <8 x half*>* %offptr, align 4
  call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %v, <8 x half*> %offs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; i8

; Expand.
define arm_aapcs_vfpcc void @ptr_i8(<16 x i8> %v, <16 x i8*>* %offptr) {
; CHECK-LABEL: ptr_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vldrw.u32 q4, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
; CHECK-NEXT:    vmov r0, s16
; CHECK-NEXT:    vmov.u8 r1, q0[0]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s17
; CHECK-NEXT:    vmov.u8 r1, q0[1]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s18
; CHECK-NEXT:    vmov.u8 r1, q0[2]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s19
; CHECK-NEXT:    vmov.u8 r1, q0[3]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s12
; CHECK-NEXT:    vmov.u8 r1, q0[4]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s13
; CHECK-NEXT:    vmov.u8 r1, q0[5]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s14
; CHECK-NEXT:    vmov.u8 r1, q0[6]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s15
; CHECK-NEXT:    vmov.u8 r1, q0[7]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov.u8 r1, q0[8]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s9
; CHECK-NEXT:    vmov.u8 r1, q0[9]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    vmov.u8 r1, q0[10]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s11
; CHECK-NEXT:    vmov.u8 r1, q0[11]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov.u8 r1, q0[12]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s5
; CHECK-NEXT:    vmov.u8 r1, q0[13]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmov.u8 r1, q0[14]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s7
; CHECK-NEXT:    vmov.u8 r1, q0[15]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %offs = load <16 x i8*>, <16 x i8*>* %offptr, align 4
  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %v, <16 x i8*> %offs, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand
define arm_aapcs_vfpcc void @ptr_v8i8_trunc16(<8 x i16> %v, <8 x i8*>* %offptr) {
; CHECK-LABEL: ptr_v8i8_trunc16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q2, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vmov.u16 r1, q0[0]
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s9
; CHECK-NEXT:    vmov.u16 r1, q0[1]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    vmov.u16 r1, q0[2]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s11
; CHECK-NEXT:    vmov.u16 r1, q0[3]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov.u16 r1, q0[4]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s5
; CHECK-NEXT:    vmov.u16 r1, q0[5]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmov.u16 r1, q0[6]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s7
; CHECK-NEXT:    vmov.u16 r1, q0[7]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i8*>, <8 x i8*>* %offptr, align 4
  %ext = trunc <8 x i16> %v to <8 x i8>
  call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> %ext, <8 x i8*> %offs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand
define arm_aapcs_vfpcc void @ptr_v4i8_trunc32(<4 x i32> %v, <4 x i8*>* %offptr) {
; CHECK-LABEL: ptr_v4i8_trunc32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vmov r1, s0
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s5
; CHECK-NEXT:    vmov r1, s1
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s7
; CHECK-NEXT:    vmov r1, s3
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8*>, <4 x i8*>* %offptr, align 4
  %ext = trunc <4 x i32> %v to <4 x i8>
  call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %ext, <4 x i8*> %offs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand
define arm_aapcs_vfpcc void @ptr_v8i8_trunc32(<8 x i32> %v, <8 x i8*>* %offptr) {
; CHECK-LABEL: ptr_v8i8_trunc32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q3, [r0]
; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
; CHECK-NEXT:    vmov r1, s0
; CHECK-NEXT:    vmov r0, s12
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s13
; CHECK-NEXT:    vmov r1, s1
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s14
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s15
; CHECK-NEXT:    vmov r1, s3
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov r1, s4
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s9
; CHECK-NEXT:    vmov r1, s5
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    vmov r1, s6
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s11
; CHECK-NEXT:    vmov r1, s7
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i8*>, <8 x i8*>* %offptr, align 4
  %ext = trunc <8 x i32> %v to <8 x i8>
  call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> %ext, <8 x i8*> %offs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; loops
; The loop tests below use a non-trivial (icmp-derived) mask, so the scatter
; selects the predicated VSTRWT.32 [Q] form inside a VPT block.

define void @foo_ptr_p_int32_t(i32* %dest, i32** %src, i32 %n) {
; CHECK-LABEL: foo_ptr_p_int32_t:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    bic r3, r2, #15
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:  .LBB16_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    subs r2, #4
; CHECK-NEXT:    vptt.i32 ne, q0, zr
; CHECK-NEXT:    vldrwt.u32 q1, [r0], #16
; CHECK-NEXT:    vstrwt.32 q1, [q0]
; CHECK-NEXT:    bne .LBB16_1
; CHECK-NEXT:  @ %bb.2: @ %for.end
; CHECK-NEXT:    bx lr
entry:
  %and = and i32 %n, -16
  %cmp11 = icmp sgt i32 %and, 0
  br i1 %cmp11, label %vector.body, label %for.end

vector.body:                                      ; preds = %entry, %vector.body
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i32*, i32** %src, i32 %index
  %1 = bitcast i32** %0 to <4 x i32*>*
  %wide.load = load <4 x i32*>, <4 x i32*>* %1, align 4
  %2 = icmp ne <4 x i32*> %wide.load, zeroinitializer
  %3 = getelementptr inbounds i32, i32* %dest, i32 %index
  %4 = bitcast i32* %3 to <4 x i32>*
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.v4p0i32(<4 x i32>* %4, i32 4, <4 x i1> %2, <4 x i32> undef)
  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %wide.masked.load, <4 x i32*> %wide.load, i32 4, <4 x i1> %2)
  %index.next = add i32 %index, 4
  %5 = icmp eq i32 %index.next, %n
  br i1 %5, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body, %entry
  ret void
}

define void @foo_ptr_p_float(float* %dest, float** %src, i32 %n) {
; CHECK-LABEL: foo_ptr_p_float:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    bic r3, r2, #15
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:  .LBB17_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    subs r2, #4
; CHECK-NEXT:    vptt.i32 ne, q0, zr
; CHECK-NEXT:    vldrwt.u32 q1, [r0], #16
; CHECK-NEXT:    vstrwt.32 q1, [q0]
; CHECK-NEXT:    bne .LBB17_1
; CHECK-NEXT:  @ %bb.2: @ %for.end
; CHECK-NEXT:    bx lr
entry:
  %and = and i32 %n, -16
  %cmp11 = icmp sgt i32 %and, 0
  br i1 %cmp11, label %vector.body, label %for.end

vector.body:                                      ; preds = %entry, %vector.body
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float*, float** %src, i32 %index
  %1 = bitcast float** %0 to <4 x float*>*
  %wide.load = load <4 x float*>, <4 x float*>* %1, align 4
  %2 = icmp ne <4 x float*> %wide.load, zeroinitializer
  %3 = getelementptr inbounds float, float* %dest, i32 %index
  %4 = bitcast float* %3 to <4 x i32>*
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.v4p0i32(<4 x i32>* %4, i32 4, <4 x i1> %2, <4 x i32> undef)
  %5 = bitcast <4 x float*> %wide.load to <4 x i32*>
  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %wide.masked.load, <4 x i32*> %5, i32 4, <4 x i1> %2)
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, %n
  br i1 %6, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body, %entry
  ret void
}

; VSTRW.32 Qd, [P, 4] (the constant GEP offset is folded into a vector add
; of #0x10 = 4 * sizeof(i32) before the [Q]-form store)
define arm_aapcs_vfpcc void @qi4(<4 x i32> %v, <4 x i32*> %p) {
; CHECK-LABEL: qi4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i32 q2, #0x10
; CHECK-NEXT:    vadd.i32 q1, q1, q2
; CHECK-NEXT:    vstrw.32 q0, [q1]
; CHECK-NEXT:    bx lr
entry:
  %g = getelementptr inbounds i32, <4 x i32*> %p, i32 4
  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %v, <4 x i32*> %g, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}

declare void @llvm.masked.scatter.v2i16.v2p0i16(<2 x i16>, <2 x i16*>, i32, <2 x i1>)
declare void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32>, <2 x i32*>, i32, <2 x i1>)
declare void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float>, <2 x float*>, i32, <2 x i1>)
declare void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8>, <4 x i8*>, i32, <4 x i1>)
declare void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16>, <4 x i16*>, i32, <4 x i1>)
declare void @llvm.masked.scatter.v4f16.v4p0f16(<4 x half>, <4 x half*>, i32, <4 x i1>)
declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>)
declare void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float>, <4 x float*>, i32, <4 x i1>)
declare void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8>, <8 x i8*>, i32, <8 x i1>)
declare void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16>, <8 x i16*>, i32, <8 x i1>)
declare void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half>, <8 x half*>, i32, <8 x i1>)
declare void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32>, <8 x i32*>, i32, <8 x i1>)
declare void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float>, <8 x float*>, i32, <8 x i1>)
declare void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8>, <16 x i8*>, i32, <16 x i1>)
declare void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32>, <16 x i32*>, i32, <16 x i1>)
declare <4 x i32> @llvm.masked.load.v4i32.v4p0i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)