; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE
; RUN: llc -mtriple=thumbebv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE

define void @load_load_add_store(<4 x i32> *%src1, <4 x i32> *%src2) {
; CHECK-LABEL: load_load_add_store:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vadd.i32 q0, q1, q0
; CHECK-NEXT:    vstrw.32 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <4 x i32>, <4 x i32>* %src1, align 4
  %l2 = load <4 x i32>, <4 x i32>* %src2, align 4
  %a = add <4 x i32> %l1, %l2
  store <4 x i32> %a, <4 x i32>* %src1, align 4
  ret void
}

define void @load_load_add_store_align1(<4 x i32> *%src1, <4 x i32> *%src2) {
; CHECK-LE-LABEL: load_load_add_store_align1:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vldrb.u8 q0, [r1]
; CHECK-LE-NEXT:    vldrb.u8 q1, [r0]
; CHECK-LE-NEXT:    vadd.i32 q0, q1, q0
; CHECK-LE-NEXT:    vstrb.8 q0, [r0]
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: load_load_add_store_align1:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vldrb.u8 q0, [r1]
; CHECK-BE-NEXT:    vldrb.u8 q1, [r0]
; CHECK-BE-NEXT:    vrev32.8 q0, q0
; CHECK-BE-NEXT:    vrev32.8 q1, q1
; CHECK-BE-NEXT:    vadd.i32 q0, q1, q0
; CHECK-BE-NEXT:    vrev32.8 q0, q0
; CHECK-BE-NEXT:    vstrb.8 q0, [r0]
; CHECK-BE-NEXT:    bx lr
entry:
  %l1 = load <4 x i32>, <4 x i32>* %src1, align 1
  %l2 = load <4 x i32>, <4 x i32>* %src2, align 1
  %a = add <4 x i32> %l1, %l2
  store <4 x i32> %a, <4 x i32>* %src1, align 1
  ret void
}

define arm_aapcs_vfpcc void @load_arg_add_store(<4 x i32> *%src1, <4 x i32> %src2) {
; CHECK-LE-LABEL: load_arg_add_store:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vldrw.u32 q1, [r0]
; CHECK-LE-NEXT:    vadd.i32 q0, q1, q0
; CHECK-LE-NEXT:    vstrw.32 q0, [r0]
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: load_arg_add_store:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vrev64.32 q1, q0
; CHECK-BE-NEXT:    vldrw.u32 q0, [r0]
; CHECK-BE-NEXT:    vadd.i32 q0, q0, q1
; CHECK-BE-NEXT:    vstrw.32 q0, [r0]
; CHECK-BE-NEXT:    bx lr
entry:
  %l1 = load <4 x i32>, <4 x i32>* %src1, align 4
  %a = add <4 x i32> %l1, %src2
  store <4 x i32> %a, <4 x i32>* %src1, align 4
  ret void
}

define <4 x i32> @add_soft(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-LE-LABEL: add_soft:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vmov d1, r2, r3
; CHECK-LE-NEXT:    vmov d0, r0, r1
; CHECK-LE-NEXT:    mov r0, sp
; CHECK-LE-NEXT:    vldrw.u32 q1, [r0]
; CHECK-LE-NEXT:    vadd.i32 q0, q0, q1
; CHECK-LE-NEXT:    vmov r0, r1, d0
; CHECK-LE-NEXT:    vmov r2, r3, d1
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: add_soft:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vmov d1, r3, r2
; CHECK-BE-NEXT:    vmov d0, r1, r0
; CHECK-BE-NEXT:    mov r0, sp
; CHECK-BE-NEXT:    vrev64.32 q1, q0
; CHECK-BE-NEXT:    vldrw.u32 q0, [r0]
; CHECK-BE-NEXT:    vadd.i32 q0, q1, q0
; CHECK-BE-NEXT:    vrev64.32 q1, q0
; CHECK-BE-NEXT:    vmov r1, r0, d2
; CHECK-BE-NEXT:    vmov r3, r2, d3
; CHECK-BE-NEXT:    bx lr
entry:
  %0 = add <4 x i32> %src1, %src2
  ret <4 x i32> %0
}

define arm_aapcs_vfpcc <4 x i32> @add_hard(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-LE-LABEL: add_hard:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vadd.i32 q0, q0, q1
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: add_hard:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vrev64.32 q2, q1
; CHECK-BE-NEXT:    vrev64.32 q1, q0
; CHECK-BE-NEXT:    vadd.i32 q1, q1, q2
; CHECK-BE-NEXT:    vrev64.32 q0, q1
; CHECK-BE-NEXT:    bx lr
entry:
  %0 = add <4 x i32> %src1, %src2
  ret <4 x i32> %0
}

define <4 x i32> @call_soft(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-LE-LABEL: call_soft:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    .save {r7, lr}
; CHECK-LE-NEXT:    push {r7, lr}
; CHECK-LE-NEXT:    .pad #16
; CHECK-LE-NEXT:    sub sp, #16
; CHECK-LE-NEXT:    add.w r12, sp, #24
; CHECK-LE-NEXT:    vldrw.u32 q0, [r12]
; CHECK-LE-NEXT:    vstrw.32 q0, [sp]
; CHECK-LE-NEXT:    vmov d1, r2, r3
; CHECK-LE-NEXT:    vmov d0, r0, r1
; CHECK-LE-NEXT:    vshr.u32 q0, q0, #1
; CHECK-LE-NEXT:    vmov r0, r1, d0
; CHECK-LE-NEXT:    vmov r2, r3, d1
; CHECK-LE-NEXT:    bl add_soft
; CHECK-LE-NEXT:    vmov d1, r2, r3
; CHECK-LE-NEXT:    vmov d0, r0, r1
; CHECK-LE-NEXT:    vshr.u32 q0, q0, #1
; CHECK-LE-NEXT:    vmov r0, r1, d0
; CHECK-LE-NEXT:    vmov r2, r3, d1
; CHECK-LE-NEXT:    add sp, #16
; CHECK-LE-NEXT:    pop {r7, pc}
;
; CHECK-BE-LABEL: call_soft:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    .save {r7, lr}
; CHECK-BE-NEXT:    push {r7, lr}
; CHECK-BE-NEXT:    .pad #16
; CHECK-BE-NEXT:    sub sp, #16
; CHECK-BE-NEXT:    add.w r12, sp, #24
; CHECK-BE-NEXT:    vldrw.u32 q0, [r12]
; CHECK-BE-NEXT:    vstrw.32 q0, [sp]
; CHECK-BE-NEXT:    vmov d1, r3, r2
; CHECK-BE-NEXT:    vmov d0, r1, r0
; CHECK-BE-NEXT:    vrev64.32 q1, q0
; CHECK-BE-NEXT:    vshr.u32 q0, q1, #1
; CHECK-BE-NEXT:    vrev64.32 q1, q0
; CHECK-BE-NEXT:    vmov r1, r0, d2
; CHECK-BE-NEXT:    vmov r3, r2, d3
; CHECK-BE-NEXT:    bl add_soft
; CHECK-BE-NEXT:    vmov d1, r3, r2
; CHECK-BE-NEXT:    vmov d0, r1, r0
; CHECK-BE-NEXT:    vrev64.32 q1, q0
; CHECK-BE-NEXT:    vshr.u32 q0, q1, #1
; CHECK-BE-NEXT:    vrev64.32 q1, q0
; CHECK-BE-NEXT:    vmov r1, r0, d2
; CHECK-BE-NEXT:    vmov r3, r2, d3
; CHECK-BE-NEXT:    add sp, #16
; CHECK-BE-NEXT:    pop {r7, pc}
entry:
  %0 = lshr <4 x i32> %src1, <i32 1, i32 1, i32 1, i32 1>
  %1 = call <4 x i32> @add_soft(<4 x i32> %0, <4 x i32> %src2)
  %2 = lshr <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %2
}

define arm_aapcs_vfpcc <4 x i32> @call_hard(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-LE-LABEL: call_hard:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    .save {r7, lr}
; CHECK-LE-NEXT:    push {r7, lr}
; CHECK-LE-NEXT:    vshr.u32 q0, q0, #1
; CHECK-LE-NEXT:    bl add_hard
; CHECK-LE-NEXT:    vshr.u32 q0, q0, #1
; CHECK-LE-NEXT:    pop {r7, pc}
;
; CHECK-BE-LABEL: call_hard:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    .save {r7, lr}
; CHECK-BE-NEXT:    push {r7, lr}
; CHECK-BE-NEXT:    vrev64.32 q2, q0
; CHECK-BE-NEXT:    vshr.u32 q2, q2, #1
; CHECK-BE-NEXT:    vrev64.32 q0, q2
; CHECK-BE-NEXT:    bl add_hard
; CHECK-BE-NEXT:    vrev64.32 q1, q0
; CHECK-BE-NEXT:    vshr.u32 q1, q1, #1
; CHECK-BE-NEXT:    vrev64.32 q0, q1
; CHECK-BE-NEXT:    pop {r7, pc}
entry:
  %0 = lshr <4 x i32> %src1, <i32 1, i32 1, i32 1, i32 1>
  %1 = call arm_aapcs_vfpcc <4 x i32> @add_hard(<4 x i32> %0, <4 x i32> %src2)
  %2 = lshr <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %2
}

define arm_aapcs_vfpcc <16 x i8> @and_v4i32(<4 x i32> %src) {
; CHECK-LE-LABEL: and_v4i32:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vmov.i32 q1, #0x1
; CHECK-LE-NEXT:    vand q0, q0, q1
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: and_v4i32:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vrev64.32 q1, q0
; CHECK-BE-NEXT:    vmov.i32 q0, #0x1
; CHECK-BE-NEXT:    vand q1, q1, q0
; CHECK-BE-NEXT:    vrev64.32 q0, q1
; CHECK-BE-NEXT:    bx lr
entry:
  %s1 = and <4 x i32> %src, <i32 1, i32 1, i32 1, i32 1>
  %r = bitcast <4 x i32> %s1 to <16 x i8>
  ret <16 x i8> %r
}

; Should be the same as and_v4i32 for LE
define arm_aapcs_vfpcc <16 x i8> @and_v16i8_le(<4 x i32> %src) {
; CHECK-LE-LABEL: and_v16i8_le:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vmov.i32 q1, #0x1
; CHECK-LE-NEXT:    vand q0, q0, q1
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: and_v16i8_le:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vrev64.8 q1, q0
; CHECK-BE-NEXT:    vmov.i32 q0, #0x1
; CHECK-BE-NEXT:    vrev32.8 q0, q0
; CHECK-BE-NEXT:    vand q1, q1, q0
; CHECK-BE-NEXT:    vrev64.8 q0, q1
; CHECK-BE-NEXT:    bx lr
entry:
  %0 = bitcast <4 x i32> %src to <16 x i8>
  %r = and <16 x i8> %0, <i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0>
  ret <16 x i8> %r
}

; Should be the same (or at least equivalent) as and_v4i32 for BE
define arm_aapcs_vfpcc <16 x i8> @and_v16i8_be(<4 x i32> %src) {
; CHECK-LE-LABEL: and_v16i8_be:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vmov.i32 q1, #0x1000000
; CHECK-LE-NEXT:    vand q0, q0, q1
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: and_v16i8_be:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vrev64.8 q1, q0
; CHECK-BE-NEXT:    vmov.i32 q0, #0x1000000
; CHECK-BE-NEXT:    vrev32.8 q0, q0
; CHECK-BE-NEXT:    vand q1, q1, q0
; CHECK-BE-NEXT:    vrev64.8 q0, q1
; CHECK-BE-NEXT:    bx lr
entry:
  %0 = bitcast <4 x i32> %src to <16 x i8>
  %r = and <16 x i8> %0, <i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1>
  ret <16 x i8> %r
}

; FIXME: This looks wrong
define arm_aapcs_vfpcc <4 x i32> @test(i32* %data) {
; CHECK-LE-LABEL: test:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vldrw.u32 q1, [r0, #32]
; CHECK-LE-NEXT:    vmov.i32 q0, #0x1
; CHECK-LE-NEXT:    vadd.i32 q1, q1, q0
; CHECK-LE-NEXT:    @APP
; CHECK-LE-NEXT:    vmullb.s32 q0, q1, q1
; CHECK-LE-NEXT:    @NO_APP
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: test:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vldrw.u32 q1, [r0, #32]
; CHECK-BE-NEXT:    vmov.i32 q0, #0x1
; CHECK-BE-NEXT:    vadd.i32 q0, q1, q0
; CHECK-BE-NEXT:    vrev32.8 q0, q0
; CHECK-BE-NEXT:    @APP
; CHECK-BE-NEXT:    vmullb.s32 q1, q0, q0
; CHECK-BE-NEXT:    @NO_APP
; CHECK-BE-NEXT:    vrev64.8 q0, q1
; CHECK-BE-NEXT:    bx lr
entry:
  %add.ptr = getelementptr inbounds i32, i32* %data, i32 8
  %0 = bitcast i32* %add.ptr to <4 x i32>*
  %1 = load <4 x i32>, <4 x i32>* %0, align 4
  %2 = add <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
  %3 = tail call <4 x i32> asm sideeffect " VMULLB.s32 $0, $1, $1", "=&w,w"(<4 x i32> %2) #2
  ret <4 x i32> %3
}

; Test case demonstrating that 'bitcast' reinterprets the memory format of a
; vector, as if stored and then loaded. So if it has to go between two
; operations treating a register as having different lane sizes, then in
; big-endian mode, it has to emit a vrev32.16, which is equivalent to the
; effect that vstrw.32 followed by vldrh.16 would have.
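; For example, bitcasting <4 x i32> <i32 1, i32 1, i32 1, i32 1> to <8 x i16>
; yields <i16 1, i16 0, ...> on a little-endian target (the low half-word of
; each word comes first in memory) but <i16 0, i16 1, ...> on a big-endian
; one, matching the bytes a store/load pair would produce.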
define arm_aapcs_vfpcc void @test_bitcast(<4 x i32>* readonly %in, <8 x i16>* %out) {
; CHECK-LE-LABEL: test_bitcast:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vldrw.u32 q0, [r0]
; CHECK-LE-NEXT:    vmul.i32 q0, q0, q0
; CHECK-LE-NEXT:    vmul.i16 q0, q0, q0
; CHECK-LE-NEXT:    vstrw.32 q0, [r1]
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: test_bitcast:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vldrw.u32 q0, [r0]
; CHECK-BE-NEXT:    vmul.i32 q0, q0, q0
; CHECK-BE-NEXT:    vrev32.16 q0, q0
; CHECK-BE-NEXT:    vmul.i16 q0, q0, q0
; CHECK-BE-NEXT:    vstrh.16 q0, [r1]
; CHECK-BE-NEXT:    bx lr
entry:
  %vin = load <4 x i32>, <4 x i32>* %in, align 8
  %vdbl = mul <4 x i32> %vin, %vin
  %cast = bitcast <4 x i32> %vdbl to <8 x i16>
  %cdbl = mul <8 x i16> %cast, %cast
  store <8 x i16> %cdbl, <8 x i16>* %out, align 8
  ret void
}

; Similar test case but using the arm.mve.vreinterpretq intrinsic instead,
; which is defined to reinterpret the in-register format, so it generates no
; instruction in either endianness.
define arm_aapcs_vfpcc void @test_vreinterpretq(<4 x i32>* readonly %in, <8 x i16>* %out) {
; CHECK-LE-LABEL: test_vreinterpretq:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vldrw.u32 q0, [r0]
; CHECK-LE-NEXT:    vmul.i32 q0, q0, q0
; CHECK-LE-NEXT:    vmul.i16 q0, q0, q0
; CHECK-LE-NEXT:    vstrw.32 q0, [r1]
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: test_vreinterpretq:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vldrw.u32 q0, [r0]
; CHECK-BE-NEXT:    vmul.i32 q0, q0, q0
; CHECK-BE-NEXT:    vmul.i16 q0, q0, q0
; CHECK-BE-NEXT:    vstrh.16 q0, [r1]
; CHECK-BE-NEXT:    bx lr
entry:
  %vin = load <4 x i32>, <4 x i32>* %in, align 8
  %vdbl = mul <4 x i32> %vin, %vin
  %cast = call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v4i32(<4 x i32> %vdbl)
  %cdbl = mul <8 x i16> %cast, %cast
  store <8 x i16> %cdbl, <8 x i16>* %out, align 8
  ret void
}

declare <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v4i32(<4 x i32>)
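; The intrinsic is overloaded on both its return and argument types; the
; '.v8i16.v4i32' suffix names the result type followed by the operand type.
; Other lane-size combinations would use correspondingly mangled declarations,
; e.g. (illustrative only, not exercised by the checks above):
;   declare <16 x i8> @llvm.arm.mve.vreinterpretq.v16i8.v4i32(<4 x i32>)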