; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
; RUN: llc -mtriple=thumbebv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK

define void @foo_int8_int32(<4 x i8>* %dest, <4 x i32>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_int8_int32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    vstrb.32 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <4 x i32>, <4 x i32>* %src, align 4
  %0 = trunc <4 x i32> %wide.load to <4 x i8>
  store <4 x i8> %0, <4 x i8>* %dest, align 1
  ret void
}

define void @foo_int16_int32(<4 x i16>* %dest, <4 x i32>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_int16_int32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    vstrh.32 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <4 x i32>, <4 x i32>* %src, align 4
  %0 = trunc <4 x i32> %wide.load to <4 x i16>
  store <4 x i16> %0, <4 x i16>* %dest, align 2
  ret void
}

define void @foo_int8_int16(<8 x i8>* %dest, <8 x i16>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_int8_int16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r1]
; CHECK-NEXT:    vstrb.16 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <8 x i16>, <8 x i16>* %src, align 2
  %0 = trunc <8 x i16> %wide.load to <8 x i8>
  store <8 x i8> %0, <8 x i8>* %dest, align 1
  ret void
}


define void @foo_int8_int32_double(<16 x i8>* %dest, <16 x i32>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_int8_int32_double:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
; CHECK-NEXT:    vldrw.u32 q2, [r1, #32]
; CHECK-NEXT:    vldrw.u32 q3, [r1, #48]
; CHECK-NEXT:    vstrb.32 q1, [r0, #4]
; CHECK-NEXT:    vstrb.32 q0, [r0]
; CHECK-NEXT:    vstrb.32 q3, [r0, #12]
; CHECK-NEXT:    vstrb.32 q2, [r0, #8]
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <16 x i32>, <16 x i32>* %src, align 4
  %0 = trunc <16 x i32> %wide.load to <16 x i8>
  store <16 x i8> %0, <16 x i8>* %dest, align 1
  ret void
}

define void @foo_int16_int32_double(<8 x i16>* %dest, <8 x i32>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_int16_int32_double:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
; CHECK-NEXT:    vstrh.32 q1, [r0, #8]
; CHECK-NEXT:    vstrh.32 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <8 x i32>, <8 x i32>* %src, align 4
  %0 = trunc <8 x i32> %wide.load to <8 x i16>
  store <8 x i16> %0, <8 x i16>* %dest, align 2
  ret void
}

define void @foo_int8_int16_double(<16 x i8>* %dest, <16 x i16>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_int8_int16_double:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r1]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #16]
; CHECK-NEXT:    vstrb.16 q1, [r0, #8]
; CHECK-NEXT:    vstrb.16 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <16 x i16>, <16 x i16>* %src, align 2
  %0 = trunc <16 x i16> %wide.load to <16 x i8>
  store <16 x i8> %0, <16 x i8>* %dest, align 1
  ret void
}


define void @foo_int32_int8(<4 x i32>* %dest, <4 x i8>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_int32_int8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.s32 q0, [r1]
; CHECK-NEXT:    vstrw.32 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <4 x i8>, <4 x i8>* %src, align 1
  %0 = sext <4 x i8> %wide.load to <4 x i32>
  store <4 x i32> %0, <4 x i32>* %dest, align 4
  ret void
}

define void @foo_int16_int8(<8 x i16>* %dest, <8 x i8>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_int16_int8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.s16 q0, [r1]
; CHECK-NEXT:    vstrh.16 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <8 x i8>, <8 x i8>* %src, align 1
  %0 = sext <8 x i8> %wide.load to <8 x i16>
  store <8 x i16> %0, <8 x i16>* %dest, align 2
  ret void
}

define void @foo_int32_int16(<4 x i32>* %dest, <4 x i16>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_int32_int16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q0, [r1]
; CHECK-NEXT:    vstrw.32 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <4 x i16>, <4 x i16>* %src, align 2
  %0 = sext <4 x i16> %wide.load to <4 x i32>
  store <4 x i32> %0, <4 x i32>* %dest, align 4
  ret void
}

define void @foo_int32_int8_double(<16 x i32>* %dest, <16 x i8>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_int32_int8_double:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.s32 q0, [r1]
; CHECK-NEXT:    vldrb.s32 q1, [r1, #4]
; CHECK-NEXT:    vldrb.s32 q2, [r1, #8]
; CHECK-NEXT:    vldrb.s32 q3, [r1, #12]
; CHECK-NEXT:    vstrw.32 q1, [r0, #16]
; CHECK-NEXT:    vstrw.32 q0, [r0]
; CHECK-NEXT:    vstrw.32 q3, [r0, #48]
; CHECK-NEXT:    vstrw.32 q2, [r0, #32]
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <16 x i8>, <16 x i8>* %src, align 1
  %0 = sext <16 x i8> %wide.load to <16 x i32>
  store <16 x i32> %0, <16 x i32>* %dest, align 4
  ret void
}

define void @foo_int16_int8_double(<16 x i16>* %dest, <16 x i8>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_int16_int8_double:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.s16 q0, [r1]
; CHECK-NEXT:    vldrb.s16 q1, [r1, #8]
; CHECK-NEXT:    vstrh.16 q1, [r0, #16]
; CHECK-NEXT:    vstrh.16 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <16 x i8>, <16 x i8>* %src, align 1
  %0 = sext <16 x i8> %wide.load to <16 x i16>
  store <16 x i16> %0, <16 x i16>* %dest, align 2
  ret void
}

define void @foo_int32_int16_double(<8 x i32>* %dest, <8 x i16>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_int32_int16_double:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q0, [r1]
; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
; CHECK-NEXT:    vstrw.32 q1, [r0, #16]
; CHECK-NEXT:    vstrw.32 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <8 x i16>, <8 x i16>* %src, align 2
  %0 = sext <8 x i16> %wide.load to <8 x i32>
  store <8 x i32> %0, <8 x i32>* %dest, align 4
  ret void
}


define void @foo_uint32_uint8(<4 x i32>* %dest, <4 x i8>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_uint32_uint8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u32 q0, [r1]
; CHECK-NEXT:    vstrw.32 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <4 x i8>, <4 x i8>* %src, align 1
  %0 = zext <4 x i8> %wide.load to <4 x i32>
  store <4 x i32> %0, <4 x i32>* %dest, align 4
  ret void
}

define void @foo_uint16_uint8(<8 x i16>* %dest, <8 x i8>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_uint16_uint8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q0, [r1]
; CHECK-NEXT:    vstrh.16 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <8 x i8>, <8 x i8>* %src, align 1
  %0 = zext <8 x i8> %wide.load to <8 x i16>
  store <8 x i16> %0, <8 x i16>* %dest, align 2
  ret void
}

define void @foo_uint32_uint16(<4 x i32>* %dest, <4 x i16>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_uint32_uint16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u32 q0, [r1]
; CHECK-NEXT:    vstrw.32 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <4 x i16>, <4 x i16>* %src, align 2
  %0 = zext <4 x i16> %wide.load to <4 x i32>
  store <4 x i32> %0, <4 x i32>* %dest, align 4
  ret void
}


define void @foo_uint32_uint8_double(<16 x i32>* %dest, <16 x i8>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_uint32_uint8_double:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u32 q0, [r1]
; CHECK-NEXT:    vldrb.u32 q1, [r1, #4]
; CHECK-NEXT:    vldrb.u32 q2, [r1, #8]
; CHECK-NEXT:    vldrb.u32 q3, [r1, #12]
; CHECK-NEXT:    vstrw.32 q1, [r0, #16]
; CHECK-NEXT:    vstrw.32 q0, [r0]
; CHECK-NEXT:    vstrw.32 q3, [r0, #48]
; CHECK-NEXT:    vstrw.32 q2, [r0, #32]
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <16 x i8>, <16 x i8>* %src, align 1
  %0 = zext <16 x i8> %wide.load to <16 x i32>
  store <16 x i32> %0, <16 x i32>* %dest, align 4
  ret void
}

define void @foo_uint16_uint8_double(<16 x i16>* %dest, <16 x i8>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_uint16_uint8_double:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q0, [r1]
; CHECK-NEXT:    vldrb.u16 q1, [r1, #8]
; CHECK-NEXT:    vstrh.16 q1, [r0, #16]
; CHECK-NEXT:    vstrh.16 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <16 x i8>, <16 x i8>* %src, align 1
  %0 = zext <16 x i8> %wide.load to <16 x i16>
  store <16 x i16> %0, <16 x i16>* %dest, align 2
  ret void
}

define void @foo_uint32_uint16_double(<8 x i32>* %dest, <8 x i16>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_uint32_uint16_double:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u32 q0, [r1]
; CHECK-NEXT:    vldrh.u32 q1, [r1, #8]
; CHECK-NEXT:    vstrw.32 q1, [r0, #16]
; CHECK-NEXT:    vstrw.32 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <8 x i16>, <8 x i16>* %src, align 2
  %0 = zext <8 x i16> %wide.load to <8 x i32>
  store <8 x i32> %0, <8 x i32>* %dest, align 4
  ret void
}


define void @foo_int32_int8_both(<16 x i32>* %dest, <16 x i8>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_int32_int8_both:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.s16 q1, [r1, #8]
; CHECK-NEXT:    vmov.u16 r2, q1[4]
; CHECK-NEXT:    vmov.32 q0[0], r2
; CHECK-NEXT:    vmov.u16 r2, q1[5]
; CHECK-NEXT:    vmov.32 q0[1], r2
; CHECK-NEXT:    vmov.u16 r2, q1[6]
; CHECK-NEXT:    vmov.32 q0[2], r2
; CHECK-NEXT:    vmov.u16 r2, q1[7]
; CHECK-NEXT:    vmov.32 q0[3], r2
; CHECK-NEXT:    vmovlb.u16 q2, q0
; CHECK-NEXT:    vldrb.s16 q0, [r1]
; CHECK-NEXT:    vmov.u16 r1, q1[0]
; CHECK-NEXT:    vstrw.32 q2, [r0, #48]
; CHECK-NEXT:    vmov.32 q2[0], r1
; CHECK-NEXT:    vmov.u16 r1, q1[1]
; CHECK-NEXT:    vmov.32 q2[1], r1
; CHECK-NEXT:    vmov.u16 r1, q1[2]
; CHECK-NEXT:    vmov.32 q2[2], r1
; CHECK-NEXT:    vmov.u16 r1, q1[3]
; CHECK-NEXT:    vmov.32 q2[3], r1
; CHECK-NEXT:    vmov.u16 r1, q0[4]
; CHECK-NEXT:    vmovlb.u16 q1, q2
; CHECK-NEXT:    vstrw.32 q1, [r0, #32]
; CHECK-NEXT:    vmov.32 q1[0], r1
; CHECK-NEXT:    vmov.u16 r1, q0[5]
; CHECK-NEXT:    vmov.32 q1[1], r1
; CHECK-NEXT:    vmov.u16 r1, q0[6]
; CHECK-NEXT:    vmov.32 q1[2], r1
; CHECK-NEXT:    vmov.u16 r1, q0[7]
; CHECK-NEXT:    vmov.32 q1[3], r1
; CHECK-NEXT:    vmov.u16 r1, q0[0]
; CHECK-NEXT:    vmovlb.u16 q1, q1
; CHECK-NEXT:    vstrw.32 q1, [r0, #16]
; CHECK-NEXT:    vmov.32 q1[0], r1
; CHECK-NEXT:    vmov.u16 r1, q0[1]
; CHECK-NEXT:    vmov.32 q1[1], r1
; CHECK-NEXT:    vmov.u16 r1, q0[2]
; CHECK-NEXT:    vmov.32 q1[2], r1
; CHECK-NEXT:    vmov.u16 r1, q0[3]
; CHECK-NEXT:    vmov.32 q1[3], r1
; CHECK-NEXT:    vmovlb.u16 q0, q1
; CHECK-NEXT:    vstrw.32 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <16 x i8>, <16 x i8>* %src, align 1
  %0 = sext <16 x i8> %wide.load to <16 x i16>
  %1 = zext <16 x i16> %0 to <16 x i32>
  store <16 x i32> %1, <16 x i32>* %dest, align 4
  ret void
}

define <8 x i16>* @foo_uint32_uint16_double_offset(<8 x i32>* %dest, <8 x i16>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_uint32_uint16_double_offset:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q0, [r1, #16]!
; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
; CHECK-NEXT:    vstrw.32 q0, [r0]
; CHECK-NEXT:    vstrw.32 q1, [r0, #16]
; CHECK-NEXT:    mov r0, r1
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds <8 x i16>, <8 x i16>* %src, i32 1
  %wide.load = load <8 x i16>, <8 x i16>* %z, align 2
  %0 = sext <8 x i16> %wide.load to <8 x i32>
  store <8 x i32> %0, <8 x i32>* %dest, align 4
  ret <8 x i16>* %z
}

define <16 x i16>* @foo_uint32_uint16_quad_offset(<16 x i32>* %dest, <16 x i16>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_uint32_uint16_quad_offset:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q0, [r1, #32]!
; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
; CHECK-NEXT:    vldrh.s32 q2, [r1, #16]
; CHECK-NEXT:    vldrh.s32 q3, [r1, #24]
; CHECK-NEXT:    vstrw.32 q0, [r0]
; CHECK-NEXT:    vstrw.32 q2, [r0, #32]
; CHECK-NEXT:    vstrw.32 q1, [r0, #16]
; CHECK-NEXT:    vstrw.32 q3, [r0, #48]
; CHECK-NEXT:    mov r0, r1
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds <16 x i16>, <16 x i16>* %src, i32 1
  %wide.load = load <16 x i16>, <16 x i16>* %z, align 2
  %0 = sext <16 x i16> %wide.load to <16 x i32>
  store <16 x i32> %0, <16 x i32>* %dest, align 4
  ret <16 x i16>* %z
}


define void @foo_int16_int32_align1(<4 x i16>* %dest, <4 x i32>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_int16_int32_align1:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .pad #8
; CHECK-NEXT:    sub sp, #8
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    mov r1, sp
; CHECK-NEXT:    vstrh.32 q0, [r1]
; CHECK-NEXT:    ldrd r1, r2, [sp]
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    str r2, [r0, #4]
; CHECK-NEXT:    add sp, #8
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <4 x i32>, <4 x i32>* %src, align 4
  %0 = trunc <4 x i32> %wide.load to <4 x i16>
  store <4 x i16> %0, <4 x i16>* %dest, align 1
  ret void
}

define void @foo_int32_int16_align1(<4 x i32>* %dest, <4 x i16>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_int32_int16_align1:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .pad #8
; CHECK-NEXT:    sub sp, #8
; CHECK-NEXT:    ldr r2, [r1]
; CHECK-NEXT:    ldr r1, [r1, #4]
; CHECK-NEXT:    strd r2, r1, [sp]
; CHECK-NEXT:    mov r1, sp
; CHECK-NEXT:    vldrh.s32 q0, [r1]
; CHECK-NEXT:    vstrw.32 q0, [r0]
; CHECK-NEXT:    add sp, #8
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <4 x i16>, <4 x i16>* %src, align 1
  %0 = sext <4 x i16> %wide.load to <4 x i32>
  store <4 x i32> %0, <4 x i32>* %dest, align 4
  ret void
}

define void @foo_uint32_uint16_align1(<4 x i32>* %dest, <4 x i16>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_uint32_uint16_align1:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .pad #8
; CHECK-NEXT:    sub sp, #8
; CHECK-NEXT:    ldr r2, [r1]
; CHECK-NEXT:    ldr r1, [r1, #4]
; CHECK-NEXT:    strd r2, r1, [sp]
; CHECK-NEXT:    mov r1, sp
; CHECK-NEXT:    vldrh.u32 q0, [r1]
; CHECK-NEXT:    vstrw.32 q0, [r0]
; CHECK-NEXT:    add sp, #8
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <4 x i16>, <4 x i16>* %src, align 1
  %0 = zext <4 x i16> %wide.load to <4 x i32>
  store <4 x i32> %0, <4 x i32>* %dest, align 4
  ret void
}