; RUN: llc -mtriple=aarch64 -lower-interleaved-accesses=true < %s | FileCheck %s -check-prefix=NEON
; RUN: llc -mtriple=aarch64 -lower-interleaved-accesses=true -mattr=-neon < %s | FileCheck %s -check-prefix=NONEON

; Every function below is compiled twice: once with the default feature set,
; where the wide load/store plus shuffles are expected to become a single
; ldN/stN, and once with -mattr=-neon, where no ldN/stN may be emitted.

; Factor 2 load: even and odd lanes of a <16 x i8> load are both used.
; NEON-LABEL: load_factor2:
; NEON: ld2 { v0.8b, v1.8b }, [x0]
; NONEON-LABEL: load_factor2:
; NONEON-NOT: ld2
define <8 x i8> @load_factor2(<16 x i8>* %ptr) {
  %wide.vec = load <16 x i8>, <16 x i8>* %ptr, align 4
  %strided.v0 = shufflevector <16 x i8> %wide.vec, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %strided.v1 = shufflevector <16 x i8> %wide.vec, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %add = add nsw <8 x i8> %strided.v0, %strided.v1
  ret <8 x i8> %add
}

; Factor 3 load: only the stride-3 sequences starting at index 2 and index 1
; are extracted; the sequence starting at index 0 has no user.
; NEON-LABEL: load_factor3:
; NEON: ld3 { v0.4s, v1.4s, v2.4s }, [x0]
; NONEON-LABEL: load_factor3:
; NONEON-NOT: ld3
define <4 x i32> @load_factor3(i32* %ptr) {
  %base = bitcast i32* %ptr to <12 x i32>*
  %wide.vec = load <12 x i32>, <12 x i32>* %base, align 4
  %strided.v2 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
  %strided.v1 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
  %add = add nsw <4 x i32> %strided.v2, %strided.v1
  ret <4 x i32> %add
}

; Factor 4 load: only the stride-4 sequences starting at index 0 and index 2
; are extracted.
; NEON-LABEL: load_factor4:
; NEON: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
; NONEON-LABEL: load_factor4:
; NONEON-NOT: ld4
define <4 x i32> @load_factor4(i32* %ptr) {
  %base = bitcast i32* %ptr to <16 x i32>*
  %wide.vec = load <16 x i32>, <16 x i32>* %base, align 4
  %strided.v0 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %strided.v2 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  %add = add nsw <4 x i32> %strided.v0, %strided.v2
  ret <4 x i32> %add
}

; Factor 2 store: %v0 and %v1 are interleaved lane by lane, then stored.
; NEON-LABEL: store_factor2:
; NEON: st2 { v0.8b, v1.8b }, [x0]
; NONEON-LABEL: store_factor2:
; NONEON-NOT: st2
define void @store_factor2(<16 x i8>* %ptr, <8 x i8> %v0, <8 x i8> %v1) {
  %interleaved.vec = shufflevector <8 x i8> %v0, <8 x i8> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  store <16 x i8> %interleaved.vec, <16 x i8>* %ptr, align 4
  ret void
}

; Factor 3 store: the three inputs are first concatenated into two <8 x i32>
; values (%v2 padded with undef lanes) and then interleaved by one shuffle.
; NEON-LABEL: store_factor3:
; NEON: st3 { v0.4s, v1.4s, v2.4s }, [x0]
; NONEON-LABEL: store_factor3:
; NONEON-NOT: st3
define void @store_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
  %base = bitcast i32* %ptr to <12 x i32>*
  %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %v2_u = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_u, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
  store <12 x i32> %interleaved.vec, <12 x i32>* %base, align 4
  ret void
}

; Factor 4 store: the four inputs are concatenated pairwise and then
; interleaved by one shuffle.
; NEON-LABEL: store_factor4:
; NEON: st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
; NONEON-LABEL: store_factor4:
; NONEON-NOT: st4
define void @store_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
  %base = bitcast i32* %ptr to <16 x i32>*
  %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %v2_v3 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_v3, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
  store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4
  ret void
}

; The following cases test that interleaved accesses of pointer vectors can be
; matched to ldN/stN instructions.

; Only the even lanes of the <4 x i32*> load are used.
; NEON-LABEL: load_ptrvec_factor2:
; NEON: ld2 { v0.2d, v1.2d }, [x0]
; NONEON-LABEL: load_ptrvec_factor2:
; NONEON-NOT: ld2
define <2 x i32*> @load_ptrvec_factor2(i32** %ptr) {
  %base = bitcast i32** %ptr to <4 x i32*>*
  %wide.vec = load <4 x i32*>, <4 x i32*>* %base, align 4
  %strided.v0 = shufflevector <4 x i32*> %wide.vec, <4 x i32*> undef, <2 x i32> <i32 0, i32 2>
  ret <2 x i32*> %strided.v0
}

; The two extracted pointer vectors are written to %ptr1 and %ptr2, with the
; stores placed between the shuffles.
; NEON-LABEL: load_ptrvec_factor3:
; NEON: ld3 { v0.2d, v1.2d, v2.2d }, [x0]
; NONEON-LABEL: load_ptrvec_factor3:
; NONEON-NOT: ld3
define void @load_ptrvec_factor3(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr2) {
  %base = bitcast i32** %ptr to <6 x i32*>*
  %wide.vec = load <6 x i32*>, <6 x i32*>* %base, align 4
  %strided.v2 = shufflevector <6 x i32*> %wide.vec, <6 x i32*> undef, <2 x i32> <i32 2, i32 5>
  store <2 x i32*> %strided.v2, <2 x i32*>* %ptr1
  %strided.v1 = shufflevector <6 x i32*> %wide.vec, <6 x i32*> undef, <2 x i32> <i32 1, i32 4>
  store <2 x i32*> %strided.v1, <2 x i32*>* %ptr2
  ret void
}

; Same as above for factor 4, with both shuffles ahead of both stores.
; NEON-LABEL: load_ptrvec_factor4:
; NEON: ld4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0]
; NONEON-LABEL: load_ptrvec_factor4:
; NONEON-NOT: ld4
define void @load_ptrvec_factor4(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr2) {
  %base = bitcast i32** %ptr to <8 x i32*>*
  %wide.vec = load <8 x i32*>, <8 x i32*>* %base, align 4
  %strided.v1 = shufflevector <8 x i32*> %wide.vec, <8 x i32*> undef, <2 x i32> <i32 1, i32 5>
  %strided.v3 = shufflevector <8 x i32*> %wide.vec, <8 x i32*> undef, <2 x i32> <i32 3, i32 7>
  store <2 x i32*> %strided.v1, <2 x i32*>* %ptr1
  store <2 x i32*> %strided.v3, <2 x i32*>* %ptr2
  ret void
}

; NEON-LABEL: store_ptrvec_factor2:
; NEON: st2 { v0.2d, v1.2d }, [x0]
; NONEON-LABEL: store_ptrvec_factor2:
; NONEON-NOT: st2
define void @store_ptrvec_factor2(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1) {
  %base = bitcast i32** %ptr to <4 x i32*>*
  %interleaved.vec = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  store <4 x i32*> %interleaved.vec, <4 x i32*>* %base, align 4
  ret void
}

; NEON-LABEL: store_ptrvec_factor3:
; NEON: st3 { v0.2d, v1.2d, v2.2d }, [x0]
; NONEON-LABEL: store_ptrvec_factor3:
; NONEON-NOT: st3
define void @store_ptrvec_factor3(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2) {
  %base = bitcast i32** %ptr to <6 x i32*>*
  %v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %v2_u = shufflevector <2 x i32*> %v2, <2 x i32*> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %interleaved.vec = shufflevector <4 x i32*> %v0_v1, <4 x i32*> %v2_u, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
  store <6 x i32*> %interleaved.vec, <6 x i32*>* %base, align 4
  ret void
}

; %ptr is declared as i32** here to match the other pointer-vector tests; the
; bitcast target type is unchanged.
; NEON-LABEL: store_ptrvec_factor4:
; NEON: st4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0]
; NONEON-LABEL: store_ptrvec_factor4:
; NONEON-NOT: st4
define void @store_ptrvec_factor4(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2, <2 x i32*> %v3) {
  %base = bitcast i32** %ptr to <8 x i32*>*
  %v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %v2_v3 = shufflevector <2 x i32*> %v2, <2 x i32*> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %interleaved.vec = shufflevector <4 x i32*> %v0_v1, <4 x i32*> %v2_v3, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
  store <8 x i32*> %interleaved.vec, <8 x i32*>* %base, align 4
  ret void
}

; The following cases check that shuffle masks with undef indices can be
; matched into ldN/stN instructions.
; Both strided masks leave lanes 0 and 2 undef.
; NEON-LABEL: load_undef_mask_factor2:
; NEON: ld2 { v0.4s, v1.4s }, [x0]
; NONEON-LABEL: load_undef_mask_factor2:
; NONEON-NOT: ld2
define <4 x i32> @load_undef_mask_factor2(i32* %ptr) {
  %base = bitcast i32* %ptr to <8 x i32>*
  %wide.vec = load <8 x i32>, <8 x i32>* %base, align 4
  %strided.v0 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 6>
  %strided.v1 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 7>
  %add = add nsw <4 x i32> %strided.v0, %strided.v1
  ret <4 x i32> %add
}

; The first mask defines only its leading lane; the second is fully defined.
; NEON-LABEL: load_undef_mask_factor3:
; NEON: ld3 { v0.4s, v1.4s, v2.4s }, [x0]
; NONEON-LABEL: load_undef_mask_factor3:
; NONEON-NOT: ld3
define <4 x i32> @load_undef_mask_factor3(i32* %ptr) {
  %base = bitcast i32* %ptr to <12 x i32>*
  %wide.vec = load <12 x i32>, <12 x i32>* %base, align 4
  %strided.v2 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
  %strided.v1 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
  %add = add nsw <4 x i32> %strided.v2, %strided.v1
  ret <4 x i32> %add
}

; Both masks define only their first two lanes.
; NEON-LABEL: load_undef_mask_factor4:
; NEON: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
; NONEON-LABEL: load_undef_mask_factor4:
; NONEON-NOT: ld4
define <4 x i32> @load_undef_mask_factor4(i32* %ptr) {
  %base = bitcast i32* %ptr to <16 x i32>*
  %wide.vec = load <16 x i32>, <16 x i32>* %base, align 4
  %strided.v0 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 undef, i32 undef>
  %strided.v2 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 undef, i32 undef>
  %add = add nsw <4 x i32> %strided.v0, %strided.v2
  ret <4 x i32> %add
}

; The first four lanes of the interleave mask are undef.
; NEON-LABEL: store_undef_mask_factor2:
; NEON: st2 { v0.4s, v1.4s }, [x0]
; NONEON-LABEL: store_undef_mask_factor2:
; NONEON-NOT: st2
define void @store_undef_mask_factor2(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1) {
  %base = bitcast i32* %ptr to <8 x i32>*
  %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 6, i32 3, i32 7>
  store <8 x i32> %interleaved.vec, <8 x i32>* %base, align 4
  ret void
}

; Lanes 2 and 4 of the interleave mask are undef.
; NEON-LABEL: store_undef_mask_factor3:
; NEON: st3 { v0.4s, v1.4s, v2.4s }, [x0]
; NONEON-LABEL: store_undef_mask_factor3:
; NONEON-NOT: st3
define void @store_undef_mask_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
  %base = bitcast i32* %ptr to <12 x i32>*
  %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %v2_u = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_u, <12 x i32> <i32 0, i32 4, i32 undef, i32 1, i32 undef, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
  store <12 x i32> %interleaved.vec, <12 x i32>* %base, align 4
  ret void
}

; Lanes 3 and 4 of the interleave mask are undef.
; NEON-LABEL: store_undef_mask_factor4:
; NEON: st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
; NONEON-LABEL: store_undef_mask_factor4:
; NONEON-NOT: st4
define void @store_undef_mask_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
  %base = bitcast i32* %ptr to <16 x i32>*
  %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %v2_v3 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_v3, <16 x i32> <i32 0, i32 4, i32 8, i32 undef, i32 undef, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
  store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4
  ret void
}

; Check that we do something sane with illegal types.

; The <3 x float> cases pin the exact instruction sequence in the entry block
; for both configurations instead of expecting ldN/stN.
; NEON-LABEL: load_illegal_factor2:
; NEON: BB#0:
; NEON-NEXT: ldr q[[V:[0-9]+]], [x0]
; NEON-NEXT: uzp1 v0.4s, v[[V]].4s, v{{.*}}.4s
; NEON-NEXT: ret
; NONEON-LABEL: load_illegal_factor2:
; NONEON: BB#0:
; NONEON-NEXT: ldr s0, [x0]
; NONEON-NEXT: ldr s1, [x0, #8]
; NONEON-NEXT: ret
define <3 x float> @load_illegal_factor2(<3 x float>* %p) nounwind {
  %tmp1 = load <3 x float>, <3 x float>* %p, align 16
  %tmp2 = shufflevector <3 x float> %tmp1, <3 x float> undef, <3 x i32> <i32 0, i32 2, i32 undef>
  ret <3 x float> %tmp2
}

; NEON-LABEL: store_illegal_factor2:
; NEON: BB#0:
; NEON-NEXT: uzp1 v0.4s, v0.4s, v{{.*}}.4s
; NEON-NEXT: st1 { v0.d }[0], [x0]
; NEON-NEXT: ret
; NONEON-LABEL: store_illegal_factor2:
; NONEON: BB#0:
; NONEON-NEXT: fmov w[[ELT2:[0-9]+]], s2
; NONEON-NEXT: fmov w[[RES:[0-9]+]], s0
; NONEON-NEXT: bfi x[[RES]], x[[ELT2]], #32, #32
; NONEON-NEXT: str x[[RES]], [x0]
; NONEON-NEXT: ret
define void @store_illegal_factor2(<3 x float>* %p, <3 x float> %v) nounwind {
  %tmp1 = shufflevector <3 x float> %v, <3 x float> undef, <3 x i32> <i32 0, i32 2, i32 undef>
  store <3 x float> %tmp1, <3 x float>* %p, align 16
  ret void
}

; The wide load has an extractelement user in addition to the strided shuffle
; (the shuffle result %2 itself is unused). With NEON the ld2 is still expected,
; and element 2 of the wide vector is then read from lane 1 of v0.
; NEON-LABEL: load_factor2_with_extract_user:
; NEON: ld2 { v0.4s, v1.4s }, [x0]
; NEON: mov w0, v0.s[1]
; NONEON-LABEL: load_factor2_with_extract_user:
; NONEON-NOT: ld2
define i32 @load_factor2_with_extract_user(<8 x i32>* %a) {
  %1 = load <8 x i32>, <8 x i32>* %a, align 8
  %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %3 = extractelement <8 x i32> %1, i32 2
  ret i32 %3
}