; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -basic-aa -slp-vectorizer -S | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

@b = common global [4 x i32] zeroinitializer, align 16
@c = common global [4 x i32] zeroinitializer, align 16
@d = common global [4 x i32] zeroinitializer, align 16
@e = common global [4 x i32] zeroinitializer, align 16
@a = common global [4 x i32] zeroinitializer, align 16
@fb = common global [4 x float] zeroinitializer, align 16
@fc = common global [4 x float] zeroinitializer, align 16
@fa = common global [4 x float] zeroinitializer, align 16
@fd = common global [4 x float] zeroinitializer, align 16

; Function Attrs: nounwind uwtable
define void @addsub() #0 {
; CHECK-LABEL: @addsub(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @b to <4 x i32>*), align 4
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @c to <4 x i32>*), align 4
; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[TMP0]], [[TMP1]]
; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @d to <4 x i32>*), align 4
; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @e to <4 x i32>*), align 4
; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP4]]
; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <4 x i32> [[TMP2]], [[TMP5]]
; CHECK-NEXT:    [[TMP7:%.*]] = sub nsw <4 x i32> [[TMP2]], [[TMP5]]
; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
; CHECK-NEXT:    store <4 x i32> [[TMP8]], <4 x i32>* bitcast ([4 x i32]* @a to <4 x i32>*), align 4
; CHECK-NEXT:    ret void
;
entry:
  %0 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i32 0, i64 0), align 4
  %1 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @c, i32 0, i64 0), align 4
  %add = add nsw i32 %0, %1
  %2 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @d, i32 0, i64 0), align 4
  %3 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @e, i32 0, i64 0), align 4
  %add1 = add nsw i32 %2, %3
  %add2 = add nsw i32 %add, %add1
  store i32 %add2, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i32 0, i64 0), align 4
  %4 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i32 0, i64 1), align 4
  %5 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @c, i32 0, i64 1), align 4
  %add3 = add nsw i32 %4, %5
  %6 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @d, i32 0, i64 1), align 4
  %7 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @e, i32 0, i64 1), align 4
  %add4 = add nsw i32 %6, %7
  %sub = sub nsw i32 %add3, %add4
  store i32 %sub, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i32 0, i64 1), align 4
  %8 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i32 0, i64 2), align 4
  %9 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @c, i32 0, i64 2), align 4
  %add5 = add nsw i32 %8, %9
  %10 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @d, i32 0, i64 2), align 4
  %11 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @e, i32 0, i64 2), align 4
  %add6 = add nsw i32 %10, %11
  %add7 = add nsw i32 %add5, %add6
  store i32 %add7, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i32 0, i64 2), align 4
  %12 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i32 0, i64 3), align 4
  %13 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @c, i32 0, i64 3), align 4
  %add8 = add nsw i32 %12, %13
  %14 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @d, i32 0, i64 3), align 4
  %15 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @e, i32 0, i64 3), align 4
  %add9 = add nsw i32 %14, %15
  %sub10 = sub nsw i32 %add8, %add9
  store i32 %sub10, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i32 0, i64 3), align 4
  ret void
}

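; The scalar code above corresponds to roughly the following C source (a
; hypothetical reconstruction, not part of the original test; even lanes add,
; odd lanes subtract, which is why the checks blend the vector add and sub
; with the mask <0, 5, 2, 7>):
;   a[0] = (b[0] + c[0]) + (d[0] + e[0]);
;   a[1] = (b[1] + c[1]) - (d[1] + e[1]);
;   a[2] = (b[2] + c[2]) + (d[2] + e[2]);
;   a[3] = (b[3] + c[3]) - (d[3] + e[3]);
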
; Function Attrs: nounwind uwtable
define void @subadd() #0 {
; CHECK-LABEL: @subadd(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @b to <4 x i32>*), align 4
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @c to <4 x i32>*), align 4
; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[TMP0]], [[TMP1]]
; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @d to <4 x i32>*), align 4
; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @e to <4 x i32>*), align 4
; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP4]]
; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <4 x i32> [[TMP2]], [[TMP5]]
; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> [[TMP2]], [[TMP5]]
; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
; CHECK-NEXT:    store <4 x i32> [[TMP8]], <4 x i32>* bitcast ([4 x i32]* @a to <4 x i32>*), align 4
; CHECK-NEXT:    ret void
;
entry:
  %0 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i32 0, i64 0), align 4
  %1 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @c, i32 0, i64 0), align 4
  %add = add nsw i32 %0, %1
  %2 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @d, i32 0, i64 0), align 4
  %3 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @e, i32 0, i64 0), align 4
  %add1 = add nsw i32 %2, %3
  %sub = sub nsw i32 %add, %add1
  store i32 %sub, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i32 0, i64 0), align 4
  %4 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i32 0, i64 1), align 4
  %5 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @c, i32 0, i64 1), align 4
  %add2 = add nsw i32 %4, %5
  %6 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @d, i32 0, i64 1), align 4
  %7 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @e, i32 0, i64 1), align 4
  %add3 = add nsw i32 %6, %7
  %add4 = add nsw i32 %add2, %add3
  store i32 %add4, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i32 0, i64 1), align 4
  %8 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i32 0, i64 2), align 4
  %9 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @c, i32 0, i64 2), align 4
  %add5 = add nsw i32 %8, %9
  %10 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @d, i32 0, i64 2), align 4
  %11 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @e, i32 0, i64 2), align 4
  %add6 = add nsw i32 %10, %11
  %sub7 = sub nsw i32 %add5, %add6
  store i32 %sub7, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i32 0, i64 2), align 4
  %12 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i32 0, i64 3), align 4
  %13 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @c, i32 0, i64 3), align 4
  %add8 = add nsw i32 %12, %13
  %14 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @d, i32 0, i64 3), align 4
  %15 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @e, i32 0, i64 3), align 4
  %add9 = add nsw i32 %14, %15
  %add10 = add nsw i32 %add8, %add9
  store i32 %add10, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i32 0, i64 3), align 4
  ret void
}

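; @subadd is the mirrored pattern (again a hypothetical C reconstruction):
; even lanes subtract and odd lanes add, so the vector sub feeds lanes 0 and 2
; and the vector add feeds lanes 1 and 3 of the same <0, 5, 2, 7> blend:
;   a[0] = (b[0] + c[0]) - (d[0] + e[0]);
;   a[1] = (b[1] + c[1]) + (d[1] + e[1]);
;   a[2] = (b[2] + c[2]) - (d[2] + e[2]);
;   a[3] = (b[3] + c[3]) + (d[3] + e[3]);
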
; Function Attrs: nounwind uwtable
define void @faddfsub() #0 {
; CHECK-LABEL: @faddfsub(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([4 x float]* @fb to <4 x float>*), align 4
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([4 x float]* @fc to <4 x float>*), align 4
; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[TMP0]], [[TMP1]]
; CHECK-NEXT:    [[TMP3:%.*]] = fsub <4 x float> [[TMP0]], [[TMP1]]
; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
; CHECK-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast ([4 x float]* @fa to <4 x float>*), align 4
; CHECK-NEXT:    ret void
;
entry:
  %0 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4
  %1 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 0), align 4
  %add = fadd float %0, %1
  store float %add, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 0), align 4
  %2 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 1), align 4
  %3 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 1), align 4
  %sub = fsub float %2, %3
  store float %sub, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 1), align 4
  %4 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 2), align 4
  %5 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 2), align 4
  %add1 = fadd float %4, %5
  store float %add1, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 2), align 4
  %6 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 3), align 4
  %7 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 3), align 4
  %sub2 = fsub float %6, %7
  store float %sub2, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 3), align 4
  ret void
}

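; @faddfsub is the floating-point analogue of @addsub (a hypothetical C
; reconstruction; alternating add/sub across the four lanes):
;   fa[0] = fb[0] + fc[0];
;   fa[1] = fb[1] - fc[1];
;   fa[2] = fb[2] + fc[2];
;   fa[3] = fb[3] - fc[3];
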
; Function Attrs: nounwind uwtable
define void @fsubfadd() #0 {
; CHECK-LABEL: @fsubfadd(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([4 x float]* @fb to <4 x float>*), align 4
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([4 x float]* @fc to <4 x float>*), align 4
; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x float> [[TMP0]], [[TMP1]]
; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP0]], [[TMP1]]
; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
; CHECK-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast ([4 x float]* @fa to <4 x float>*), align 4
; CHECK-NEXT:    ret void
;
entry:
  %0 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4
  %1 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 0), align 4
  %sub = fsub float %0, %1
  store float %sub, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 0), align 4
  %2 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 1), align 4
  %3 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 1), align 4
  %add = fadd float %2, %3
  store float %add, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 1), align 4
  %4 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 2), align 4
  %5 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 2), align 4
  %sub1 = fsub float %4, %5
  store float %sub1, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 2), align 4
  %6 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 3), align 4
  %7 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 3), align 4
  %add2 = fadd float %6, %7
  store float %add2, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 3), align 4
  ret void
}

; Function Attrs: nounwind uwtable
define void @faddfsub_select() #0 {
; CHECK-LABEL: @faddfsub_select(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([4 x float]* @fb to <4 x float>*), align 4
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([4 x float]* @fc to <4 x float>*), align 4
; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[TMP0]], [[TMP1]]
; CHECK-NEXT:    [[TMP3:%.*]] = fsub <4 x float> [[TMP0]], [[TMP1]]
; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
; CHECK-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast ([4 x float]* @fa to <4 x float>*), align 4
; CHECK-NEXT:    ret void
;
entry:
  %0 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4
  %1 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 0), align 4
  %add = fadd float %0, %1
  store float %add, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 0), align 4
  %2 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 1), align 4
  %3 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 1), align 4
  %add1 = fadd float %2, %3
  store float %add1, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 1), align 4
  %4 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 2), align 4
  %5 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 2), align 4
  %add2 = fadd float %4, %5
  store float %add2, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 2), align 4
  %6 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 3), align 4
  %7 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 3), align 4
  %sub = fsub float %6, %7
  store float %sub, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 3), align 4
  ret void
}

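; In @faddfsub_select only lane 3 subtracts (a hypothetical C reconstruction:
; fa[i] = fb[i] + fc[i] for i = 0..2, and fa[3] = fb[3] - fc[3]), so the
; blend mask is <0, 1, 2, 7>: lanes 0-2 come from the fadd and lane 3 from
; the fsub.
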
; Check vectorization of the following code for the float data type:
; fc[0] = fb[0]+fa[0]; //swapped fb and fa
; fc[1] = fa[1]-fb[1];
; fc[2] = fa[2]+fb[2];
; fc[3] = fa[3]-fb[3];

define void @reorder_alt() #0 {
; CHECK-LABEL: @reorder_alt(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([4 x float]* @fa to <4 x float>*), align 4
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([4 x float]* @fb to <4 x float>*), align 4
; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
; CHECK-NEXT:    store <4 x float> [[TMP5]], <4 x float>* bitcast ([4 x float]* @fc to <4 x float>*), align 4
; CHECK-NEXT:    ret void
;
  %1 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4
  %2 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 0), align 4
  %3 = fadd float %1, %2
  store float %3, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 0), align 4
  %4 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 1), align 4
  %5 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 1), align 4
  %6 = fsub float %4, %5
  store float %6, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 1), align 4
  %7 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 2), align 4
  %8 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 2), align 4
  %9 = fadd float %7, %8
  store float %9, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 2), align 4
  %10 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 3), align 4
  %11 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 3), align 4
  %12 = fsub float %10, %11
  store float %12, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 3), align 4
  ret void
}

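; Note that despite the swapped operands in lane 0 (fb[0]+fa[0]), the checks
; above show a single <4 x float> fadd of fa and fb: because fadd is
; commutative, the vectorizer is free to commute that lane's operands to
; match the other lanes.
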
; Check vectorization of the following code for the float data type:
; fc[0] = fa[0]+(fb[0]-fd[0]);
; fc[1] = fa[1]-(fb[1]+fd[1]);
; fc[2] = fa[2]+(fb[2]-fd[2]);
; fc[3] = fa[3]-(fd[3]+fb[3]); //swapped fd and fb

define void @reorder_alt_subTree() #0 {
; CHECK-LABEL: @reorder_alt_subTree(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([4 x float]* @fa to <4 x float>*), align 4
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([4 x float]* @fd to <4 x float>*), align 4
; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast ([4 x float]* @fb to <4 x float>*), align 4
; CHECK-NEXT:    [[TMP4:%.*]] = fsub <4 x float> [[TMP3]], [[TMP2]]
; CHECK-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP3]], [[TMP2]]
; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
; CHECK-NEXT:    [[TMP7:%.*]] = fadd <4 x float> [[TMP1]], [[TMP6]]
; CHECK-NEXT:    [[TMP8:%.*]] = fsub <4 x float> [[TMP1]], [[TMP6]]
; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> [[TMP8]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
; CHECK-NEXT:    store <4 x float> [[TMP9]], <4 x float>* bitcast ([4 x float]* @fc to <4 x float>*), align 4
; CHECK-NEXT:    ret void
;
  %1 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 0), align 4
  %2 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4
  %3 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fd, i32 0, i64 0), align 4
  %4 = fsub float %2, %3
  %5 = fadd float %1, %4
  store float %5, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 0), align 4
  %6 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 1), align 4
  %7 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 1), align 4
  %8 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fd, i32 0, i64 1), align 4
  %9 = fadd float %7, %8
  %10 = fsub float %6, %9
  store float %10, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 1), align 4
  %11 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 2), align 4
  %12 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 2), align 4
  %13 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fd, i32 0, i64 2), align 4
  %14 = fsub float %12, %13
  %15 = fadd float %11, %14
  store float %15, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 2), align 4
  %16 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 3), align 4
  %17 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fd, i32 0, i64 3), align 4
  %18 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 3), align 4
  %19 = fadd float %17, %18
  %20 = fsub float %16, %19
  store float %20, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 3), align 4
  ret void
}

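; Here the alternating pattern is recognized twice, as the checks above show:
; once in the (fb, fd) subtree and once in the outer operation against fa,
; each resolved with its own <0, 5, 2, 7> blend.
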
; Check vectorization of the following code for the double data type:
; c[0] = (a[0]+b[0])-d[0];
; c[1] = d[1]+(a[1]+b[1]); //swapped d[1] and (a[1]+b[1])

define void @reorder_alt_rightsubTree(double* nocapture %c, double* noalias nocapture readonly %a, double* noalias nocapture readonly %b, double* noalias nocapture readonly %d) {
; CHECK-LABEL: @reorder_alt_rightsubTree(
; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds double, double* [[D:%.*]], i64 1
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[D]] to <2 x double>*
; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1
; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[A]] to <2 x double>*
; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 8
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1
; CHECK-NEXT:    [[TMP8:%.*]] = bitcast double* [[B]] to <2 x double>*
; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x double>, <2 x double>* [[TMP8]], align 8
; CHECK-NEXT:    [[TMP10:%.*]] = fadd <2 x double> [[TMP6]], [[TMP9]]
; CHECK-NEXT:    [[TMP11:%.*]] = fsub <2 x double> [[TMP10]], [[TMP3]]
; CHECK-NEXT:    [[TMP12:%.*]] = fadd <2 x double> [[TMP10]], [[TMP3]]
; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 1
; CHECK-NEXT:    [[TMP15:%.*]] = bitcast double* [[C]] to <2 x double>*
; CHECK-NEXT:    store <2 x double> [[TMP13]], <2 x double>* [[TMP15]], align 8
; CHECK-NEXT:    ret void
;
  %1 = load double, double* %a
  %2 = load double, double* %b
  %3 = fadd double %1, %2
  %4 = load double, double* %d
  %5 = fsub double %3, %4
  store double %5, double* %c
  %6 = getelementptr inbounds double, double* %d, i64 1
  %7 = load double, double* %6
  %8 = getelementptr inbounds double, double* %a, i64 1
  %9 = load double, double* %8
  %10 = getelementptr inbounds double, double* %b, i64 1
  %11 = load double, double* %10
  %12 = fadd double %9, %11
  %13 = fadd double %7, %12
  %14 = getelementptr inbounds double, double* %c, i64 1
  store double %13, double* %14
  ret void
}

; Don't vectorize the following code for the float data type, as fsub is not
; commutative:
; fc[0] = fb[0]+fa[0];
; fc[1] = fa[1]-fb[1];
; fc[2] = fa[2]+fb[2];
; fc[3] = fb[3]-fa[3];
; In the above code we can swap the operands of the 1st operation, as fadd is
; commutative, but not those of the 2nd or 4th, as fsub is not commutative.

define void @no_vec_shuff_reorder() #0 {
; CHECK-LABEL: @no_vec_shuff_reorder(
; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4
; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 0), align 4
; CHECK-NEXT:    [[TMP3:%.*]] = fadd float [[TMP1]], [[TMP2]]
; CHECK-NEXT:    store float [[TMP3]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 0), align 4
; CHECK-NEXT:    [[TMP4:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 1), align 4
; CHECK-NEXT:    [[TMP5:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 1), align 4
; CHECK-NEXT:    [[TMP6:%.*]] = fsub float [[TMP4]], [[TMP5]]
; CHECK-NEXT:    store float [[TMP6]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 1), align 4
; CHECK-NEXT:    [[TMP7:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 2), align 4
; CHECK-NEXT:    [[TMP8:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 2), align 4
; CHECK-NEXT:    [[TMP9:%.*]] = fadd float [[TMP7]], [[TMP8]]
; CHECK-NEXT:    store float [[TMP9]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 2), align 4
; CHECK-NEXT:    [[TMP10:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 3), align 4
; CHECK-NEXT:    [[TMP11:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 3), align 4
; CHECK-NEXT:    [[TMP12:%.*]] = fsub float [[TMP10]], [[TMP11]]
; CHECK-NEXT:    store float [[TMP12]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 3), align 4
; CHECK-NEXT:    ret void
;
  %1 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4
  %2 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 0), align 4
  %3 = fadd float %1, %2
  store float %3, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 0), align 4
  %4 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 1), align 4
  %5 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 1), align 4
  %6 = fsub float %4, %5
  store float %6, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 1), align 4
  %7 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 2), align 4
  %8 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 2), align 4
  %9 = fadd float %7, %8
  store float %9, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 2), align 4
  %10 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 3), align 4
  %11 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 3), align 4
  %12 = fsub float %10, %11
  store float %12, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 3), align 4
  ret void
}

attributes #0 = { nounwind }