; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -early-cse -earlycse-debug-hash | FileCheck %s
; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -basic-aa -early-cse-memssa | FileCheck %s
; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -passes=early-cse | FileCheck %s
; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -aa-pipeline=basic-aa -passes='early-cse<memssa>' | FileCheck %s

define <4 x i32> @test_cse(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that @llvm.aarch64.neon.ld2 is optimized away by Early CSE.
; CHECK-LABEL: @test_cse
; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  %4 = bitcast <16 x i8> %2 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
  %5 = bitcast i32* %a to i8*
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_cse2(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that the first @llvm.aarch64.neon.st2 is optimized away by Early CSE.
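; The second st2 below fully overwrites the first at the same address, so the
; first store is dead; EarlyCSE also folds the redundant bitcast round-trips,
; which is why the surviving call is checked with the original extracts as
; its operands.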
; CHECK-LABEL: @test_cse2
; CHECK-NOT: call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %s.coerce.fca.0.extract, <4 x i32> %s.coerce.fca.1.extract, i8* %0)
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  %4 = bitcast <16 x i8> %2 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
  %5 = bitcast i32* %a to i8*
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_cse3(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) #0 {
entry:
; Check that the second @llvm.aarch64.neon.ld2 is optimized away by Early CSE.
; CHECK-LABEL: @test_cse3
; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %0)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %1 = bitcast i32* %a to i8*
  %vld22 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %1)
  %vld22.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld22, 0
  %vld22.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld22, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld22.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}


define <4 x i32> @test_nocse(i32* %a, i32* %b, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that the store prevents @llvm.aarch64.neon.ld2 from being optimized
; away by Early CSE.
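; The intervening store to %b may alias %a, so the values written by the st2
; cannot be forwarded to the ld2.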
; CHECK-LABEL: @test_nocse
; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  %4 = bitcast <16 x i8> %2 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
  store i32 0, i32* %b, align 4
  %5 = bitcast i32* %a to i8*
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_nocse2(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that @llvm.aarch64.neon.ld3 is not optimized away by Early CSE due
; to mismatch between st2 and ld3.
; CHECK-LABEL: @test_nocse2
; CHECK: call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  %4 = bitcast <16 x i8> %2 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
  %5 = bitcast i32* %a to i8*
  %vld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8* %5)
  %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
  %vld3.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 2
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld3.fca.0.extract, <4 x i32> %vld3.fca.2.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_nocse3(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that @llvm.aarch64.neon.st3 is not optimized away by Early CSE due to
; mismatch between st2 and st3.
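; The st2 overwrites only part of the memory written by the st3, so the st3
; is not dead even though both stores use the same address.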
; CHECK-LABEL: @test_nocse3
; CHECK: call void @llvm.aarch64.neon.st3.v4i32.p0i8
; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0i8
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  %4 = bitcast <16 x i8> %2 to <4 x i32>
  call void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32> %4, <4 x i32> %3, <4 x i32> %3, i8* %0)
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
  %5 = bitcast i32* %a to i8*
  %vld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8* %5)
  %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
  %vld3.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld3.fca.0.extract, <4 x i32> %vld3.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

; Function Attrs: nounwind
declare void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32>, <4 x i32>, i8* nocapture)

; Function Attrs: nounwind
declare void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32>, <4 x i32>, <4 x i32>, i8* nocapture)

; Function Attrs: nounwind readonly
declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8*)

; Function Attrs: nounwind readonly
declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8*)

define internal fastcc <4 x i32> @vaddq_s32(<4 x i32> %__p0, <4 x i32> %__p1) {
entry:
  %add = add <4 x i32> %__p0, %__p1
  ret <4 x i32> %add
}

attributes #0 = { nounwind }