; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true -runtime-memory-check-threshold=24 < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"

; Check vectorization on an interleaved load group of factor 2 and an interleaved
; store group of factor 2.

; int AB[1024];
; int CD[1024];
; void test_array_load2_store2(int C, int D) {
;   for (int i = 0; i < 1024; i+=2) {
;     int A = AB[i];
;     int B = AB[i+1];
;     CD[i] = A + C;
;     CD[i+1] = B * D;
;   }
; }

; CHECK-LABEL: @test_array_load2_store2(
; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK: add nsw <4 x i32>
; CHECK: mul nsw <4 x i32>
; CHECK: %interleaved.vec = shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; CHECK: store <8 x i32> %interleaved.vec, <8 x i32>* %{{.*}}, align 4

@AB = common global [1024 x i32] zeroinitializer, align 4
@CD = common global [1024 x i32] zeroinitializer, align 4

define void @test_array_load2_store2(i32 %C, i32 %D) {
entry:
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %indvars.iv
  %tmp = load i32, i32* %arrayidx0, align 4
  %tmp1 = or i64 %indvars.iv, 1
  %arrayidx1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %tmp1
  %tmp2 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %tmp, %C
  %mul = mul nsw i32 %tmp2, %D
  %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %indvars.iv
  store i32 %add, i32* %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %tmp1
  store i32 %mul, i32* %arrayidx3, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
  %cmp = icmp slt i64 %indvars.iv.next, 1024
  br i1 %cmp, label %for.body, label %for.end

for.end:                                          ; preds = %for.body
  ret void
}

; int A[3072];
; struct ST S[1024];
; void test_struct_array_load3_store3() {
;   int *ptr = A;
;   for (int i = 0; i < 1024; i++) {
;     int X1 = *ptr++;
;     int X2 = *ptr++;
;     int X3 = *ptr++;
;     S[i].x = X1 + 1;
;     S[i].y = X2 + 2;
;     S[i].z = X3 + 3;
;   }
; }

; CHECK-LABEL: @test_struct_array_load3_store3(
; CHECK: %wide.vec = load <12 x i32>, <12 x i32>* {{.*}}, align 4
; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
; CHECK: add nsw <4 x i32> {{.*}}, <i32 1, i32 1, i32 1, i32 1>
; CHECK: add nsw <4 x i32> {{.*}}, <i32 2, i32 2, i32 2, i32 2>
; CHECK: add nsw <4 x i32> {{.*}}, <i32 3, i32 3, i32 3, i32 3>
; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK: %interleaved.vec = shufflevector <8 x i32> {{.*}}, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
; CHECK: store <12 x i32> %interleaved.vec, <12 x i32>* {{.*}}, align 4

%struct.ST3 = type { i32, i32, i32 }
@A = common global [3072 x i32] zeroinitializer, align 4
@S = common global [1024 x %struct.ST3] zeroinitializer, align 4

define void @test_struct_array_load3_store3() {
entry:
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %ptr.016 = phi i32* [ getelementptr inbounds ([3072 x i32], [3072 x i32]* @A, i64 0, i64 0), %entry ], [ %incdec.ptr2, %for.body ]
  %incdec.ptr = getelementptr inbounds i32, i32* %ptr.016, i64 1
  %tmp = load i32, i32* %ptr.016, align 4
  %incdec.ptr1 = getelementptr inbounds i32, i32* %ptr.016, i64 2
  %tmp1 = load i32, i32* %incdec.ptr, align 4
  %incdec.ptr2 = getelementptr inbounds i32, i32* %ptr.016, i64 3
  %tmp2 = load i32, i32* %incdec.ptr1, align 4
  %add = add nsw i32 %tmp, 1
  %x = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 0
  store i32 %add, i32* %x, align 4
  %add3 = add nsw i32 %tmp1, 2
  %y = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 1
  store i32 %add3, i32* %y, align 4
  %add6 = add nsw i32 %tmp2, 3
  %z = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 2
  store i32 %add6, i32* %z, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 1024
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret void
}

; Check vectorization on an interleaved load group of factor 4.
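;
; A rough sketch of the expected shape (based on the CHECK lines below, with
; illustrative value names rather than the exact IR the vectorizer emits):
; each vector iteration loads four consecutive ST4 structs with one wide load
; and de-interleaves the four fields with strided shuffles before feeding the
; add/sub reduction chain:
;
;   %wide.vec = load <16 x i32> ...                          ; x0 y0 z0 w0 x1 y1 z1 w1 ...
;   %x.vec = shufflevector %wide.vec, undef, <0, 4, 8, 12>   ; x0 x1 x2 x3
;   %y.vec = shufflevector %wide.vec, undef, <1, 5, 9, 13>   ; y0 y1 y2 y3
;   %z.vec = shufflevector %wide.vec, undef, <2, 6, 10, 14>  ; z0 z1 z2 z3
;   %w.vec = shufflevector %wide.vec, undef, <3, 7, 11, 15>  ; w0 w1 w2 w3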

; struct ST4{
;   int x;
;   int y;
;   int z;
;   int w;
; };
; int test_struct_load4(struct ST4 *S) {
;   int r = 0;
;   for (int i = 0; i < 1024; i++) {
;     r += S[i].x;
;     r -= S[i].y;
;     r += S[i].z;
;     r -= S[i].w;
;   }
;   return r;
; }

; CHECK-LABEL: @test_struct_load4(
; CHECK: %wide.vec = load <16 x i32>, <16 x i32>* {{.*}}, align 4
; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
; CHECK: add nsw <4 x i32>
; CHECK: sub <4 x i32>
; CHECK: add nsw <4 x i32>
; CHECK: sub <4 x i32>

%struct.ST4 = type { i32, i32, i32, i32 }

define i32 @test_struct_load4(%struct.ST4* nocapture readonly %S) {
entry:
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %r.022 = phi i32 [ 0, %entry ], [ %sub8, %for.body ]
  %x = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 0
  %tmp = load i32, i32* %x, align 4
  %add = add nsw i32 %tmp, %r.022
  %y = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 1
  %tmp1 = load i32, i32* %y, align 4
  %sub = sub i32 %add, %tmp1
  %z = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 2
  %tmp2 = load i32, i32* %z, align 4
  %add5 = add nsw i32 %sub, %tmp2
  %w = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 3
  %tmp3 = load i32, i32* %w, align 4
  %sub8 = sub i32 %add5, %tmp3
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 1024
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret i32 %sub8
}

; Check vectorization on an interleaved store group of factor 4.
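;
; A rough sketch of the expected store side (illustrative value names, per
; the CHECK lines below): the four <4 x i32> results are first concatenated
; pairwise and then interleaved into a single wide store, so lane i of each
; result ends up in B[i]:
;
;   %xy = shufflevector %x.vec, %y.vec, <0, 1, 2, 3, 4, 5, 6, 7>   ; concat
;   %zw = shufflevector %z.vec, %w.vec, <0, 1, 2, 3, 4, 5, 6, 7>   ; concat
;   %interleaved.vec = shufflevector %xy, %zw,
;       <0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15>
;   store <16 x i32> %interleaved.vec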

; void test_struct_store4(int *A, struct ST4 *B) {
;   int *ptr = A;
;   for (int i = 0; i < 1024; i++) {
;     int X = *ptr++;
;     B[i].x = X + 1;
;     B[i].y = X * 2;
;     B[i].z = X + 3;
;     B[i].w = X + 4;
;   }
; }

; CHECK-LABEL: @test_struct_store4(
; CHECK: %[[LD:.*]] = load <4 x i32>, <4 x i32>*
; CHECK: add nsw <4 x i32> %[[LD]], <i32 1, i32 1, i32 1, i32 1>
; CHECK: shl nsw <4 x i32> %[[LD]], <i32 1, i32 1, i32 1, i32 1>
; CHECK: add nsw <4 x i32> %[[LD]], <i32 3, i32 3, i32 3, i32 3>
; CHECK: add nsw <4 x i32> %[[LD]], <i32 4, i32 4, i32 4, i32 4>
; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK: %interleaved.vec = shufflevector <8 x i32> {{.*}}, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
; CHECK: store <16 x i32> %interleaved.vec, <16 x i32>* {{.*}}, align 4

define void @test_struct_store4(i32* noalias nocapture readonly %A, %struct.ST4* noalias nocapture %B) {
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %ptr.024 = phi i32* [ %A, %entry ], [ %incdec.ptr, %for.body ]
  %incdec.ptr = getelementptr inbounds i32, i32* %ptr.024, i64 1
  %tmp = load i32, i32* %ptr.024, align 4
  %add = add nsw i32 %tmp, 1
  %x = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 0
  store i32 %add, i32* %x, align 4
  %mul = shl nsw i32 %tmp, 1
  %y = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 1
  store i32 %mul, i32* %y, align 4
  %add3 = add nsw i32 %tmp, 3
  %z = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 2
  store i32 %add3, i32* %z, align 4
  %add6 = add nsw i32 %tmp, 4
  %w = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 3
  store i32 %add6, i32* %w, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 1024
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; Check vectorization on a reverse interleaved load group of factor 2 and
; a reverse interleaved store group of factor 2.
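;
; Because the loop runs backwards, each de-interleaved member vector is
; expected to be reversed with a <3, 2, 1, 0> shuffle before the arithmetic,
; and the results reversed again before being re-interleaved for the wide
; store (see the paired shuffles in the CHECK lines below). Illustrative
; sketch for one member:
;
;   %v   = shufflevector %wide.vec, undef, <0, 2, 4, 6>   ; x fields, memory order
;   %rev = shufflevector %v, undef, <3, 2, 1, 0>          ; x fields, loop order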

; struct ST2 {
;   int x;
;   int y;
; };
;
; void test_reversed_load2_store2(struct ST2 *A, struct ST2 *B) {
;   for (int i = 1023; i >= 0; i--) {
;     int a = A[i].x + i;  // interleaved load of index 0
;     int b = A[i].y - i;  // interleaved load of index 1
;     B[i].x = a;          // interleaved store of index 0
;     B[i].y = b;          // interleaved store of index 1
;   }
; }

; CHECK-LABEL: @test_reversed_load2_store2(
; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* {{.*}}, align 4
; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK: add nsw <4 x i32>
; CHECK: sub nsw <4 x i32>
; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK: %interleaved.vec = shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; CHECK: store <8 x i32> %interleaved.vec, <8 x i32>* %{{.*}}, align 4

%struct.ST2 = type { i32, i32 }

define void @test_reversed_load2_store2(%struct.ST2* noalias nocapture readonly %A, %struct.ST2* noalias nocapture %B) {
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ]
  %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0
  %tmp = load i32, i32* %x, align 4
  %tmp1 = trunc i64 %indvars.iv to i32
  %add = add nsw i32 %tmp, %tmp1
  %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1
  %tmp2 = load i32, i32* %y, align 4
  %sub = sub nsw i32 %tmp2, %tmp1
  %x5 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 0
  store i32 %add, i32* %x5, align 4
  %y8 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 1
  store i32 %sub, i32* %y8, align 4
  %indvars.iv.next = add nsw i64 %indvars.iv, -1
  %cmp = icmp sgt i64 %indvars.iv, 0
  br i1 %cmp, label %for.body, label %for.cond.cleanup
}

; Check vectorization on an interleaved load group of factor 2 with 1 gap
; (missing the load of odd elements). Because the vectorized loop would
; speculatively access memory out-of-bounds, we must execute at least one
; iteration of the scalar loop.
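;
; The scalar loop runs 1024 / 2 = 512 iterations. To guarantee at least one
; scalar iteration after the vector loop, one vector iteration's worth of
; work (VF = 4 scalar iterations) is peeled off, so the vector trip count
; checked below is 512 - 4 = 508 and the middle block always branches to the
; scalar loop (br i1 false).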

; void even_load_static_tc(int *A, int *B) {
;   for (unsigned i = 0; i < 1024; i+=2)
;     B[i/2] = A[i] * 2;
; }

; CHECK-LABEL: @even_load_static_tc(
; CHECK: vector.body:
; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
; CHECK: %strided.vec = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK: icmp eq i64 %index.next, 508
; CHECK: middle.block:
; CHECK: br i1 false, label %for.cond.cleanup, label %scalar.ph

define void @even_load_static_tc(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %tmp = load i32, i32* %arrayidx, align 4
  %mul = shl nsw i32 %tmp, 1
  %tmp1 = lshr exact i64 %indvars.iv, 1
  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %tmp1
  store i32 %mul, i32* %arrayidx2, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
  %cmp = icmp ult i64 %indvars.iv.next, 1024
  br i1 %cmp, label %for.body, label %for.cond.cleanup
}

; Check vectorization on an interleaved load group of factor 2 with 1 gap
; (missing the load of odd elements). Because the vectorized loop would
; speculatively access memory out-of-bounds, we must execute at least one
; iteration of the scalar loop.

; void even_load_dynamic_tc(int *A, int *B, unsigned N) {
;   for (unsigned i = 0; i < N; i+=2)
;     B[i/2] = A[i] * 2;
; }

; CHECK-LABEL: @even_load_dynamic_tc(
; CHECK: min.iters.checked:
; CHECK: %n.mod.vf = and i64 %[[N:[a-zA-Z0-9]+]], 3
; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
; CHECK: %n.vec = sub i64 %[[N]], %[[R]]
; CHECK: vector.body:
; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
; CHECK: %strided.vec = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK: icmp eq i64 %index.next, %n.vec
; CHECK: middle.block:
; CHECK: br i1 false, label %for.cond.cleanup, label %scalar.ph

define void @even_load_dynamic_tc(i32* noalias nocapture readonly %A, i32* noalias nocapture %B, i64 %N) {
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %tmp = load i32, i32* %arrayidx, align 4
  %mul = shl nsw i32 %tmp, 1
  %tmp1 = lshr exact i64 %indvars.iv, 1
  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %tmp1
  store i32 %mul, i32* %arrayidx2, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
  %cmp = icmp ult i64 %indvars.iv.next, %N
  br i1 %cmp, label %for.body, label %for.cond.cleanup
}

; Check vectorization on a reverse interleaved load group of factor 2 with 1
; gap and a reverse interleaved store group of factor 2. The interleaved load
; group should be removed since it has a gap and is reverse.

; struct pair {
;   int x;
;   int y;
; };
;
; void load_gap_reverse(struct pair *P1, struct pair *P2, int X) {
;   for (int i = 1023; i >= 0; i--) {
;     int a = X + i;
;     int b = P2[i].y - i;
;     P1[i].x = a;
;     P2[i].y = b;
;   }
; }

; CHECK-LABEL: @load_gap_reverse(
; CHECK-NOT: %wide.vec = load <8 x i64>, <8 x i64>* %{{.*}}, align 8
; CHECK-NOT: %strided.vec = shufflevector <8 x i64> %wide.vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>

%pair = type { i64, i64 }
define void @load_gap_reverse(%pair* noalias nocapture readonly %P1, %pair* noalias nocapture readonly %P2, i64 %X) {
entry:
  br label %for.body

for.body:
  %i = phi i64 [ 1023, %entry ], [ %i.next, %for.body ]
  %0 = add nsw i64 %X, %i
  %1 = getelementptr inbounds %pair, %pair* %P1, i64 %i, i32 0
  %2 = getelementptr inbounds %pair, %pair* %P2, i64 %i, i32 1
  %3 = load i64, i64* %2, align 8
  %4 = sub nsw i64 %3, %i
  store i64 %0, i64* %1, align 8
  store i64 %4, i64* %2, align 8
  %i.next = add nsw i64 %i, -1
  %cond = icmp sgt i64 %i, 0
  br i1 %cond, label %for.body, label %for.exit

for.exit:
  ret void
}

; Check vectorization on interleaved access groups identified from mixed
; loads/stores.
; void mixed_load2_store2(int *A, int *B) {
;   for (unsigned i = 0; i < 1024; i+=2) {
;     B[i] = A[i] * A[i+1];
;     B[i+1] = A[i] + A[i+1];
;   }
; }

; CHECK-LABEL: @mixed_load2_store2(
; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* {{.*}}, align 4
; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK: %interleaved.vec = shufflevector <4 x i32> %{{.*}}, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; CHECK: store <8 x i32> %interleaved.vec

define void @mixed_load2_store2(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %tmp = load i32, i32* %arrayidx, align 4
  %tmp1 = or i64 %indvars.iv, 1
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %tmp1
  %tmp2 = load i32, i32* %arrayidx2, align 4
  %mul = mul nsw i32 %tmp2, %tmp
  %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  store i32 %mul, i32* %arrayidx4, align 4
  %tmp3 = load i32, i32* %arrayidx, align 4
  %tmp4 = load i32, i32* %arrayidx2, align 4
  %add10 = add nsw i32 %tmp4, %tmp3
  %arrayidx13 = getelementptr inbounds i32, i32* %B, i64 %tmp1
  store i32 %add10, i32* %arrayidx13, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
  %cmp = icmp ult i64 %indvars.iv.next, 1024
  br i1 %cmp, label %for.body, label %for.cond.cleanup
}

; Check vectorization on interleaved access groups identified from mixed
; loads/stores.
; void mixed_load3_store3(int *A) {
;   for (unsigned i = 0; i < 1024; i++) {
;     *A++ += i;
;     *A++ += i;
;     *A++ += i;
;   }
; }

; CHECK-LABEL: @mixed_load3_store3(
; CHECK: %wide.vec = load <12 x i32>, <12 x i32>* {{.*}}, align 4
; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
; CHECK: %interleaved.vec = shufflevector <8 x i32> %{{.*}}, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
; CHECK: store <12 x i32> %interleaved.vec, <12 x i32>* %{{.*}}, align 4

define void @mixed_load3_store3(i32* nocapture %A) {
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  %i.013 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %A.addr.012 = phi i32* [ %A, %entry ], [ %incdec.ptr3, %for.body ]
  %incdec.ptr = getelementptr inbounds i32, i32* %A.addr.012, i64 1
  %tmp = load i32, i32* %A.addr.012, align 4
  %add = add i32 %tmp, %i.013
  store i32 %add, i32* %A.addr.012, align 4
  %incdec.ptr1 = getelementptr inbounds i32, i32* %A.addr.012, i64 2
  %tmp1 = load i32, i32* %incdec.ptr, align 4
  %add2 = add i32 %tmp1, %i.013
  store i32 %add2, i32* %incdec.ptr, align 4
  %incdec.ptr3 = getelementptr inbounds i32, i32* %A.addr.012, i64 3
  %tmp2 = load i32, i32* %incdec.ptr1, align 4
  %add4 = add i32 %tmp2, %i.013
  store i32 %add4, i32* %incdec.ptr1, align 4
  %inc = add nuw nsw i32 %i.013, 1
  %exitcond = icmp eq i32 %inc, 1024
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; Check vectorization on an interleaved access group whose members have
; different types.
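;
; Even though the members have different types, the whole group is expected
; to be loaded as a single <8 x i32>; the even lanes feed the integer
; reduction directly, and the odd lanes are bitcast to <4 x float> for the
; floating-point reduction (illustrative sketch, see the CHECK lines below):
;
;   %a.vec  = shufflevector %wide.vec, undef, <0, 2, 4, 6>   ; the int members
;   %b.bits = shufflevector %wide.vec, undef, <1, 3, 5, 7>
;   %b.vec  = bitcast <4 x i32> %b.bits to <4 x float>       ; the float members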

; struct IntFloat {
;   int a;
;   float b;
; };
;
; int SA;
; float SB;
;
; void int_float_struct(struct IntFloat *A) {
;   int SumA;
;   float SumB;
;   for (unsigned i = 0; i < 1024; i++) {
;     SumA += A[i].a;
;     SumB += A[i].b;
;   }
;   SA = SumA;
;   SB = SumB;
; }

; CHECK-LABEL: @int_float_struct(
; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
; CHECK: %[[V0:.*]] = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK: %[[V1:.*]] = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK: bitcast <4 x i32> %[[V1]] to <4 x float>
; CHECK: add nsw <4 x i32>
; CHECK: fadd fast <4 x float>

%struct.IntFloat = type { i32, float }

@SA = common global i32 0, align 4
@SB = common global float 0.000000e+00, align 4

define void @int_float_struct(%struct.IntFloat* nocapture readonly %A) #0 {
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  store i32 %add, i32* @SA, align 4
  store float %add3, float* @SB, align 4
  ret void

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %SumB.014 = phi float [ undef, %entry ], [ %add3, %for.body ]
  %SumA.013 = phi i32 [ undef, %entry ], [ %add, %for.body ]
  %a = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 0
  %tmp = load i32, i32* %a, align 4
  %add = add nsw i32 %tmp, %SumA.013
  %b = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 1
  %tmp1 = load float, float* %b, align 4
  %add3 = fadd fast float %SumB.014, %tmp1
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 1024
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; Check vectorization of interleaved access groups in the presence of
; dependences (PR27626). The following tests check that we don't reorder
; dependent loads and stores when generating code for interleaved access
; groups. Stores should be scalarized because the required code motion would
; break dependences, and the remaining interleaved load groups should have
; gaps.

; PR27626_0: Ensure a strided store is not moved after a dependent (zero
; distance) strided load.
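;
; A rough sketch of the expected codegen (illustrative names, per the CHECK
; lines below): the load of p[i].x is still emitted as a wide <8 x i32> load
; with a gap (only the even lanes are used), while the dependent stores are
; scalarized, so each extracted lane is stored to p[i].y individually:
;
;   %l  = load <8 x i32>, <8 x i32>* ...
;   %e0 = extractelement <8 x i32> %l, i32 0
;   store i32 %e0, ...
;   %e1 = extractelement <8 x i32> %l, i32 2
;   store i32 %e1, ...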

; void PR27626_0(struct pair *p, int z, int n) {
;   for (int i = 0; i < n; i++) {
;     p[i].x = z;
;     p[i].y = p[i].x;
;   }
; }

; CHECK-LABEL: @PR27626_0(
; CHECK: min.iters.checked:
; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3
; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
; CHECK: %n.vec = sub i64 %[[N]], %[[R]]
; CHECK: vector.body:
; CHECK: %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
; CHECK: %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
; CHECK: store i32 %[[X1]], {{.*}}
; CHECK: %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
; CHECK: store i32 %[[X2]], {{.*}}
; CHECK: %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
; CHECK: store i32 %[[X3]], {{.*}}
; CHECK: %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
; CHECK: store i32 %[[X4]], {{.*}}

%pair.i32 = type { i32, i32 }
define void @PR27626_0(%pair.i32 *%p, i32 %z, i64 %n) {
entry:
  br label %for.body

for.body:
  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
  store i32 %z, i32* %p_i.x, align 4
  %0 = load i32, i32* %p_i.x, align 4
  store i32 %0, i32 *%p_i.y, align 4
  %i.next = add nuw nsw i64 %i, 1
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  ret void
}

; PR27626_1: Ensure a strided load is not moved before a dependent (zero
; distance) strided store.

; void PR27626_1(struct pair *p, int n) {
;   int s = 0;
;   for (int i = 0; i < n; i++) {
;     p[i].y = p[i].x;
;     s += p[i].y;
;   }
; }

; CHECK-LABEL: @PR27626_1(
; CHECK: min.iters.checked:
; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3
; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
; CHECK: %n.vec = sub i64 %[[N]], %[[R]]
; CHECK: vector.body:
; CHECK: %[[Phi:.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ {{.*}}, %vector.body ]
; CHECK: %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
; CHECK: %[[X1:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 0
; CHECK: store i32 %[[X1:.+]], {{.*}}
; CHECK: %[[X2:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 2
; CHECK: store i32 %[[X2:.+]], {{.*}}
; CHECK: %[[X3:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 4
; CHECK: store i32 %[[X3:.+]], {{.*}}
; CHECK: %[[X4:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 6
; CHECK: store i32 %[[X4:.+]], {{.*}}
; CHECK: %[[L2:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
; CHECK: %[[S1:.+]] = shufflevector <8 x i32> %[[L2]], <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK: add nsw <4 x i32> %[[S1]], %[[Phi]]

define i32 @PR27626_1(%pair.i32 *%p, i64 %n) {
entry:
  br label %for.body

for.body:
  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
  %s = phi i32 [ %2, %for.body ], [ 0, %entry ]
  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
  %0 = load i32, i32* %p_i.x, align 4
  store i32 %0, i32* %p_i.y, align 4
  %1 = load i32, i32* %p_i.y, align 4
  %2 = add nsw i32 %1, %s
  %i.next = add nuw nsw i64 %i, 1
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  %3 = phi i32 [ %2, %for.body ]
  ret i32 %3
}

; PR27626_2: Ensure a strided store is not moved after a dependent (negative
; distance) strided load.

; void PR27626_2(struct pair *p, int z, int n) {
;   for (int i = 0; i < n; i++) {
;     p[i].x = z;
;     p[i].y = p[i - 1].x;
;   }
; }

; CHECK-LABEL: @PR27626_2(
; CHECK: min.iters.checked:
; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3
; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
; CHECK: %n.vec = sub i64 %[[N]], %[[R]]
; CHECK: vector.body:
; CHECK: %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
; CHECK: %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
; CHECK: store i32 %[[X1]], {{.*}}
; CHECK: %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
; CHECK: store i32 %[[X2]], {{.*}}
; CHECK: %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
; CHECK: store i32 %[[X3]], {{.*}}
; CHECK: %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
; CHECK: store i32 %[[X4]], {{.*}}

define void @PR27626_2(%pair.i32 *%p, i64 %n, i32 %z) {
entry:
  br label %for.body

for.body:
  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
  %i_minus_1 = add nuw nsw i64 %i, -1
  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
  %p_i_minus_1.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_minus_1, i32 0
  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
  store i32 %z, i32* %p_i.x, align 4
  %0 = load i32, i32* %p_i_minus_1.x, align 4
  store i32 %0, i32 *%p_i.y, align 4
  %i.next = add nuw nsw i64 %i, 1
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  ret void
}

; PR27626_3: Ensure a strided load is not moved before a dependent (negative
; distance) strided store.

; void PR27626_3(struct pair *p, int z, int n) {
;   int s = 0;
;   for (int i = 0; i < n; i++) {
;     p[i + 1].y = p[i].x;
;     s += p[i].y;
;   }
; }

; CHECK-LABEL: @PR27626_3(
; CHECK: min.iters.checked:
; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3
; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
; CHECK: %n.vec = sub i64 %[[N]], %[[R]]
; CHECK: vector.body:
; CHECK: %[[Phi:.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ {{.*}}, %vector.body ]
; CHECK: %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
; CHECK: %[[X1:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 0
; CHECK: store i32 %[[X1:.+]], {{.*}}
; CHECK: %[[X2:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 2
; CHECK: store i32 %[[X2:.+]], {{.*}}
; CHECK: %[[X3:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 4
; CHECK: store i32 %[[X3:.+]], {{.*}}
; CHECK: %[[X4:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 6
; CHECK: store i32 %[[X4:.+]], {{.*}}
; CHECK: %[[L2:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
; CHECK: %[[S1:.+]] = shufflevector <8 x i32> %[[L2]], <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK: add nsw <4 x i32> %[[S1]], %[[Phi]]

define i32 @PR27626_3(%pair.i32 *%p, i64 %n, i32 %z) {
entry:
  br label %for.body

for.body:
  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
  %s = phi i32 [ %2, %for.body ], [ 0, %entry ]
  %i_plus_1 = add nuw nsw i64 %i, 1
  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
  %p_i_plus_1.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_plus_1, i32 1
  %0 = load i32, i32* %p_i.x, align 4
  store i32 %0, i32* %p_i_plus_1.y, align 4
  %1 = load i32, i32* %p_i.y, align 4
  %2 = add nsw i32 %1, %s
  %i.next = add nuw nsw i64 %i, 1
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  %3 = phi i32 [ %2, %for.body ]
  ret i32 %3
}

; PR27626_4: Ensure we form an interleaved group for strided stores in the
; presence of a write-after-write dependence. We create a group for
; (2) and (3) while excluding (1).
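;
; A rough sketch of the expected codegen (illustrative, per the CHECK lines
; below): the excluded store of %x (1) is emitted as four individual scalar
; stores, while %y (2) and %z (3) are splatted once in the preheader and
; interleaved into a single wide store per vector iteration:
;
;   %y.splat = shufflevector (insertelement undef, %y, 0), undef, zeroinitializer
;   %z.splat = shufflevector (insertelement undef, %z, 0), undef, zeroinitializer
;   %vec = shufflevector %y.splat, %z.splat, <0, 4, 1, 5, 2, 6, 3, 7>
;   store <8 x i32> %vec, ...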

; void PR27626_4(int *a, int x, int y, int z, int n) {
;   for (int i = 0; i < n; i += 2) {
;     a[i] = x;      // (1)
;     a[i] = y;      // (2)
;     a[i + 1] = z;  // (3)
;   }
; }

; CHECK-LABEL: @PR27626_4(
; CHECK: vector.ph:
; CHECK: %[[INS_Y:.+]] = insertelement <4 x i32> undef, i32 %y, i32 0
; CHECK: %[[SPLAT_Y:.+]] = shufflevector <4 x i32> %[[INS_Y]], <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK: %[[INS_Z:.+]] = insertelement <4 x i32> undef, i32 %z, i32 0
; CHECK: %[[SPLAT_Z:.+]] = shufflevector <4 x i32> %[[INS_Z]], <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK: vector.body:
; CHECK: store i32 %x, {{.*}}
; CHECK: store i32 %x, {{.*}}
; CHECK: store i32 %x, {{.*}}
; CHECK: store i32 %x, {{.*}}
; CHECK: %[[VEC:.+]] = shufflevector <4 x i32> %[[SPLAT_Y]], <4 x i32> %[[SPLAT_Z]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; CHECK: store <8 x i32> %[[VEC]], {{.*}}

define void @PR27626_4(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) {
entry:
  br label %for.body

for.body:
  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
  %i_plus_1 = add i64 %i, 1
  %a_i = getelementptr inbounds i32, i32* %a, i64 %i
  %a_i_plus_1 = getelementptr inbounds i32, i32* %a, i64 %i_plus_1
  store i32 %x, i32* %a_i, align 4
  store i32 %y, i32* %a_i, align 4
  store i32 %z, i32* %a_i_plus_1, align 4
  %i.next = add nuw nsw i64 %i, 2
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  ret void
}

; PR27626_5: Ensure we do not form an interleaved group for strided stores in
; the presence of a write-after-write dependence.

; void PR27626_5(int *a, int x, int y, int z, int n) {
;   for (int i = 3; i < n; i += 2) {
;     a[i - 1] = x;
;     a[i - 3] = y;
;     a[i] = z;
;   }
; }

; CHECK-LABEL: @PR27626_5(
; CHECK: vector.body:
; CHECK: store i32 %x, {{.*}}
; CHECK: store i32 %x, {{.*}}
; CHECK: store i32 %x, {{.*}}
; CHECK: store i32 %x, {{.*}}
; CHECK: store i32 %y, {{.*}}
; CHECK: store i32 %y, {{.*}}
; CHECK: store i32 %y, {{.*}}
; CHECK: store i32 %y, {{.*}}
; CHECK: store i32 %z, {{.*}}
; CHECK: store i32 %z, {{.*}}
; CHECK: store i32 %z, {{.*}}
; CHECK: store i32 %z, {{.*}}

define void @PR27626_5(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) {
entry:
  br label %for.body

for.body:
  %i = phi i64 [ %i.next, %for.body ], [ 3, %entry ]
  %i_minus_1 = sub i64 %i, 1
  %i_minus_3 = sub i64 %i_minus_1, 2
  %a_i = getelementptr inbounds i32, i32* %a, i64 %i
  %a_i_minus_1 = getelementptr inbounds i32, i32* %a, i64 %i_minus_1
  %a_i_minus_3 = getelementptr inbounds i32, i32* %a, i64 %i_minus_3
  store i32 %x, i32* %a_i_minus_1, align 4
  store i32 %y, i32* %a_i_minus_3, align 4
  store i32 %z, i32* %a_i, align 4
  %i.next = add nuw nsw i64 %i, 2
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  ret void
}

attributes #0 = { "unsafe-fp-math"="true" }