; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK-LE
; RUN: llc -mtriple=thumbebv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK-BE

; Tests for MVE vector load/store selection at decreasing alignments.
; The CHECK lines show the element width chosen for the memory access:
; align 4 -> vldrw/vstrw, align 2 -> vldrh/vstrh (LE), align 1 -> vldrb/vstrb.
; Big-endian output additionally contains vrev instructions around the
; accesses (presumably to fix up lane order between memory and the AAPCS
; register layout - confirm against the MVE BE lowering rules).

; <4 x i32> load with natural (4-byte) alignment: vldrw on both endians.
define arm_aapcs_vfpcc <4 x i32> @load_4xi32_a4(<4 x i32>* %vp) {
; CHECK-LE-LABEL: load_4xi32_a4:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vldrw.u32 q0, [r0]
; CHECK-LE-NEXT:    vshr.u32 q0, q0, #1
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: load_4xi32_a4:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vldrw.u32 q0, [r0]
; CHECK-BE-NEXT:    vshr.u32 q1, q0, #1
; CHECK-BE-NEXT:    vrev64.32 q0, q1
; CHECK-BE-NEXT:    bx lr
entry:
  %0 = load <4 x i32>, <4 x i32>* %vp, align 4
  %1 = lshr <4 x i32> %0, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %1
}

; Underaligned (2-byte) <4 x i32> load: LE narrows to vldrh, BE uses a byte
; load plus vrev32.8.
define arm_aapcs_vfpcc <4 x i32> @load_4xi32_a2(<4 x i32>* %vp) {
; CHECK-LE-LABEL: load_4xi32_a2:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vldrh.u16 q0, [r0]
; CHECK-LE-NEXT:    vshr.u32 q0, q0, #1
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: load_4xi32_a2:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vldrb.u8 q0, [r0]
; CHECK-BE-NEXT:    vrev32.8 q0, q0
; CHECK-BE-NEXT:    vshr.u32 q1, q0, #1
; CHECK-BE-NEXT:    vrev64.32 q0, q1
; CHECK-BE-NEXT:    bx lr
entry:
  %0 = load <4 x i32>, <4 x i32>* %vp, align 2
  %1 = lshr <4 x i32> %0, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %1
}

; Byte-aligned <4 x i32> load: both endians fall back to vldrb.
define arm_aapcs_vfpcc <4 x i32> @load_4xi32_a1(<4 x i32>* %vp) {
; CHECK-LE-LABEL: load_4xi32_a1:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vldrb.u8 q0, [r0]
; CHECK-LE-NEXT:    vshr.u32 q0, q0, #1
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: load_4xi32_a1:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vldrb.u8 q0, [r0]
; CHECK-BE-NEXT:    vrev32.8 q0, q0
; CHECK-BE-NEXT:    vshr.u32 q1, q0, #1
; CHECK-BE-NEXT:    vrev64.32 q0, q1
; CHECK-BE-NEXT:    bx lr
entry:
  %0 = load <4 x i32>, <4 x i32>* %vp, align 1
  %1 = lshr <4 x i32> %0, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %1
}

; Mirror tests for stores at the same three alignments.
define arm_aapcs_vfpcc void @store_4xi32_a4(<4 x i32>* %vp, <4 x i32> %val) {
; CHECK-LE-LABEL: store_4xi32_a4:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vshr.u32 q0, q0, #1
; CHECK-LE-NEXT:    vstrw.32 q0, [r0]
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: store_4xi32_a4:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vrev64.32 q1, q0
; CHECK-BE-NEXT:    vshr.u32 q0, q1, #1
; CHECK-BE-NEXT:    vstrw.32 q0, [r0]
; CHECK-BE-NEXT:    bx lr
entry:
  %0 = lshr <4 x i32> %val, <i32 1, i32 1, i32 1, i32 1>
  store <4 x i32> %0, <4 x i32>* %vp, align 4
  ret void
}

define arm_aapcs_vfpcc void @store_4xi32_a2(<4 x i32>* %vp, <4 x i32> %val) {
; CHECK-LE-LABEL: store_4xi32_a2:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vshr.u32 q0, q0, #1
; CHECK-LE-NEXT:    vstrh.16 q0, [r0]
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: store_4xi32_a2:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vrev64.32 q1, q0
; CHECK-BE-NEXT:    vshr.u32 q0, q1, #1
; CHECK-BE-NEXT:    vrev32.8 q0, q0
; CHECK-BE-NEXT:    vstrb.8 q0, [r0]
; CHECK-BE-NEXT:    bx lr
entry:
  %0 = lshr <4 x i32> %val, <i32 1, i32 1, i32 1, i32 1>
  store <4 x i32> %0, <4 x i32>* %vp, align 2
  ret void
}

define arm_aapcs_vfpcc void @store_4xi32_a1(<4 x i32>* %vp, <4 x i32> %val) {
; CHECK-LE-LABEL: store_4xi32_a1:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vshr.u32 q0, q0, #1
; CHECK-LE-NEXT:    vstrb.8 q0, [r0]
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: store_4xi32_a1:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vrev64.32 q1, q0
; CHECK-BE-NEXT:    vshr.u32 q0, q1, #1
; CHECK-BE-NEXT:    vrev32.8 q0, q0
; CHECK-BE-NEXT:    vstrb.8 q0, [r0]
; CHECK-BE-NEXT:    bx lr
entry:
  %0 = lshr <4 x i32> %val, <i32 1, i32 1, i32 1, i32 1>
  store <4 x i32> %0, <4 x i32>* %vp, align 1
  ret void
}

; Immediate-offset addressing: +/-508 (i32 index +/-127) folds into the LE
; vldrw addressing mode; the BE byte-load path materializes the address with
; a separate add/sub instead.
define arm_aapcs_vfpcc <4 x i32> @load_4xi32_a4_offset_pos(i32* %ip) {
; CHECK-LE-LABEL: load_4xi32_a4_offset_pos:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vldrw.u32 q0, [r0, #508]
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: load_4xi32_a4_offset_pos:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    add.w r0, r0, #508
; CHECK-BE-NEXT:    vldrb.u8 q1, [r0]
; CHECK-BE-NEXT:    vrev64.8 q0, q1
; CHECK-BE-NEXT:    bx lr
entry:
  %ipoffset = getelementptr inbounds i32, i32* %ip, i32 127
  %vp = bitcast i32* %ipoffset to <4 x i32>*
  %0 = load <4 x i32>, <4 x i32>* %vp, align 4
  ret <4 x i32> %0
}

define arm_aapcs_vfpcc <4 x i32> @load_4xi32_a4_offset_neg(i32* %ip) {
; CHECK-LE-LABEL: load_4xi32_a4_offset_neg:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vldrw.u32 q0, [r0, #-508]
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: load_4xi32_a4_offset_neg:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    sub.w r0, r0, #508
; CHECK-BE-NEXT:    vldrb.u8 q1, [r0]
; CHECK-BE-NEXT:    vrev64.8 q0, q1
; CHECK-BE-NEXT:    bx lr
entry:
  %ipoffset = getelementptr inbounds i32, i32* %ip, i32 -127
  %vp = bitcast i32* %ipoffset to <4 x i32>*
  %0 = load <4 x i32>, <4 x i32>* %vp, align 4
  ret <4 x i32> %0
}

; Stack-slot round trips at an offset of 16 bytes from sp, for i32/i16/i8
; element types. The LE path uses [sp, #16] addressing directly.
define arm_aapcs_vfpcc <4 x i32> @loadstore_4xi32_stack_off16() {
; CHECK-LE-LABEL: loadstore_4xi32_stack_off16:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    .pad #40
; CHECK-LE-NEXT:    sub sp, #40
; CHECK-LE-NEXT:    vmov.i32 q0, #0x1
; CHECK-LE-NEXT:    mov r0, sp
; CHECK-LE-NEXT:    vstrw.32 q0, [r0]
; CHECK-LE-NEXT:    movs r0, #3
; CHECK-LE-NEXT:    vstrw.32 q0, [sp, #16]
; CHECK-LE-NEXT:    str r0, [sp, #16]
; CHECK-LE-NEXT:    vldrw.u32 q0, [sp, #16]
; CHECK-LE-NEXT:    add sp, #40
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: loadstore_4xi32_stack_off16:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    .pad #40
; CHECK-BE-NEXT:    sub sp, #40
; CHECK-BE-NEXT:    vmov.i32 q0, #0x1
; CHECK-BE-NEXT:    mov r0, sp
; CHECK-BE-NEXT:    vstrw.32 q0, [r0]
; CHECK-BE-NEXT:    movs r0, #3
; CHECK-BE-NEXT:    vstrw.32 q0, [sp, #16]
; CHECK-BE-NEXT:    str r0, [sp, #16]
; CHECK-BE-NEXT:    vldrb.u8 q1, [sp, #16]
; CHECK-BE-NEXT:    vrev64.8 q0, q1
; CHECK-BE-NEXT:    add sp, #40
; CHECK-BE-NEXT:    bx lr
entry:
  %c = alloca [1 x [5 x [2 x i32]]], align 4
  %0 = bitcast [1 x [5 x [2 x i32]]]* %c to i8*
  %arrayidx5 = getelementptr inbounds [1 x [5 x [2 x i32]]], [1 x [5 x [2 x i32]]]* %c, i32 0, i32 0, i32 0, i32 0
  %1 = bitcast [1 x [5 x [2 x i32]]]* %c to <4 x i32>*
  store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32>* %1, align 4
  %arrayidx5.2 = getelementptr inbounds [1 x [5 x [2 x i32]]], [1 x [5 x [2 x i32]]]* %c, i32 0, i32 0, i32 2, i32 0
  %2 = bitcast i32* %arrayidx5.2 to <4 x i32>*
  store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32>* %2, align 4
  store i32 3, i32* %arrayidx5.2, align 4
  %3 = load <4 x i32>, <4 x i32>* %2, align 4
  ret <4 x i32> %3
}

define arm_aapcs_vfpcc <8 x i16> @loadstore_8xi16_stack_off16() {
; CHECK-LE-LABEL: loadstore_8xi16_stack_off16:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    .pad #40
; CHECK-LE-NEXT:    sub sp, #40
; CHECK-LE-NEXT:    vmov.i16 q0, #0x1
; CHECK-LE-NEXT:    mov r0, sp
; CHECK-LE-NEXT:    vstrh.16 q0, [r0]
; CHECK-LE-NEXT:    movs r0, #3
; CHECK-LE-NEXT:    vstrh.16 q0, [sp, #16]
; CHECK-LE-NEXT:    strh.w r0, [sp, #16]
; CHECK-LE-NEXT:    vldrh.u16 q0, [sp, #16]
; CHECK-LE-NEXT:    add sp, #40
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: loadstore_8xi16_stack_off16:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    .pad #40
; CHECK-BE-NEXT:    sub sp, #40
; CHECK-BE-NEXT:    vmov.i16 q0, #0x1
; CHECK-BE-NEXT:    mov r0, sp
; CHECK-BE-NEXT:    vstrh.16 q0, [r0]
; CHECK-BE-NEXT:    movs r0, #3
; CHECK-BE-NEXT:    vstrh.16 q0, [sp, #16]
; CHECK-BE-NEXT:    strh.w r0, [sp, #16]
; CHECK-BE-NEXT:    vldrb.u8 q1, [sp, #16]
; CHECK-BE-NEXT:    vrev64.8 q0, q1
; CHECK-BE-NEXT:    add sp, #40
; CHECK-BE-NEXT:    bx lr
entry:
  %c = alloca [1 x [10 x [2 x i16]]], align 2
  %0 = bitcast [1 x [10 x [2 x i16]]]* %c to i8*
  %arrayidx5 = getelementptr inbounds [1 x [10 x [2 x i16]]], [1 x [10 x [2 x i16]]]* %c, i32 0, i32 0, i32 0, i32 0
  %1 = bitcast [1 x [10 x [2 x i16]]]* %c to <8 x i16>*
  store <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, <8 x i16>* %1, align 2
  %arrayidx5.2 = getelementptr inbounds [1 x [10 x [2 x i16]]], [1 x [10 x [2 x i16]]]* %c, i32 0, i32 0, i32 4, i32 0
  %2 = bitcast i16* %arrayidx5.2 to <8 x i16>*
  store <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, <8 x i16>* %2, align 2
  store i16 3, i16* %arrayidx5.2, align 2
  %3 = load <8 x i16>, <8 x i16>* %2, align 2
  ret <8 x i16> %3
}

define arm_aapcs_vfpcc <16 x i8> @loadstore_16xi8_stack_off16() {
; CHECK-LE-LABEL: loadstore_16xi8_stack_off16:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    .pad #40
; CHECK-LE-NEXT:    sub sp, #40
; CHECK-LE-NEXT:    vmov.i8 q0, #0x1
; CHECK-LE-NEXT:    mov r0, sp
; CHECK-LE-NEXT:    vstrb.8 q0, [r0]
; CHECK-LE-NEXT:    movs r0, #3
; CHECK-LE-NEXT:    vstrb.8 q0, [sp, #16]
; CHECK-LE-NEXT:    strb.w r0, [sp, #16]
; CHECK-LE-NEXT:    vldrb.u8 q0, [sp, #16]
; CHECK-LE-NEXT:    add sp, #40
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: loadstore_16xi8_stack_off16:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    .pad #40
; CHECK-BE-NEXT:    sub sp, #40
; CHECK-BE-NEXT:    vmov.i8 q0, #0x1
; CHECK-BE-NEXT:    mov r0, sp
; CHECK-BE-NEXT:    vstrb.8 q0, [r0]
; CHECK-BE-NEXT:    movs r0, #3
; CHECK-BE-NEXT:    vstrb.8 q0, [sp, #16]
; CHECK-BE-NEXT:    strb.w r0, [sp, #16]
; CHECK-BE-NEXT:    vldrb.u8 q1, [sp, #16]
; CHECK-BE-NEXT:    vrev64.8 q0, q1
; CHECK-BE-NEXT:    add sp, #40
; CHECK-BE-NEXT:    bx lr
entry:
  %c = alloca [1 x [20 x [2 x i8]]], align 1
  %0 = bitcast [1 x [20 x [2 x i8]]]* %c to i8*
  %arrayidx5 = getelementptr inbounds [1 x [20 x [2 x i8]]], [1 x [20 x [2 x i8]]]* %c, i32 0, i32 0, i32 0, i32 0
  %1 = bitcast [1 x [20 x [2 x i8]]]* %c to <16 x i8>*
  store <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, <16 x i8>* %1, align 1
  %arrayidx5.2 = getelementptr inbounds [1 x [20 x [2 x i8]]], [1 x [20 x [2 x i8]]]* %c, i32 0, i32 0, i32 8, i32 0
  %2 = bitcast i8* %arrayidx5.2 to <16 x i8>*
  store <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, <16 x i8>* %2, align 1
  store i8 3, i8* %arrayidx5.2, align 1
  %3 = load <16 x i8>, <16 x i8>* %2, align 1
  ret <16 x i8> %3
}