; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx | FileCheck %s
;
; This file tests the look-ahead operand reordering heuristic.
;
;
; This checks that operand reordering will reorder the operands of the adds
; by taking into consideration the instructions beyond the immediate
; predecessors.
;
;  A[0]  B[0]  C[0]  D[0]   C[1]  D[1]  A[1]  B[1]
;    \   /      \   /         \   /      \   /
;      -          -             -          -
;       \        /               \        /
;           +                        +
;           |                        |
;          S[0]                     S[1]

define void @lookahead_basic(double* %array) {
; CHECK-LABEL: @lookahead_basic(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[IDX0:%.*]] = getelementptr inbounds double, double* [[ARRAY:%.*]], i64 0
; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 1
; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 2
; CHECK-NEXT:    [[IDX3:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 3
; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 4
; CHECK-NEXT:    [[IDX5:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 5
; CHECK-NEXT:    [[IDX6:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 6
; CHECK-NEXT:    [[IDX7:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 7
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[IDX2]] to <2 x double>*
; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* [[IDX4]] to <2 x double>*
; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[IDX6]] to <2 x double>*
; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8
; CHECK-NEXT:    [[TMP8:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]]
; CHECK-NEXT:    [[TMP9:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]]
; CHECK-NEXT:    [[TMP10:%.*]] = fadd fast <2 x double> [[TMP8]], [[TMP9]]
; CHECK-NEXT:    [[TMP11:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
; CHECK-NEXT:    store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8
; CHECK-NEXT:    ret void
;
entry:
  %idx0 = getelementptr inbounds double, double* %array, i64 0
  %idx1 = getelementptr inbounds double, double* %array, i64 1
  %idx2 = getelementptr inbounds double, double* %array, i64 2
  %idx3 = getelementptr inbounds double, double* %array, i64 3
  %idx4 = getelementptr inbounds double, double* %array, i64 4
  %idx5 = getelementptr inbounds double, double* %array, i64 5
  %idx6 = getelementptr inbounds double, double* %array, i64 6
  %idx7 = getelementptr inbounds double, double* %array, i64 7

  %A_0 = load double, double *%idx0, align 8
  %A_1 = load double, double *%idx1, align 8
  %B_0 = load double, double *%idx2, align 8
  %B_1 = load double, double *%idx3, align 8
  %C_0 = load double, double *%idx4, align 8
  %C_1 = load double, double *%idx5, align 8
  %D_0 = load double, double *%idx6, align 8
  %D_1 = load double, double *%idx7, align 8

  %subAB_0 = fsub fast double %A_0, %B_0
  %subCD_0 = fsub fast double %C_0, %D_0

  %subAB_1 = fsub fast double %A_1, %B_1
  %subCD_1 = fsub fast double %C_1, %D_1

  %addABCD_0 = fadd fast double %subAB_0, %subCD_0
  %addCDAB_1 = fadd fast double %subCD_1, %subAB_1

  store double %addABCD_0, double *%idx0, align 8
  store double %addCDAB_1, double *%idx1, align 8
  ret void
}
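;
; Roughly equivalent scalar C for the kernel above (an illustrative sketch
; derived from the IR; not part of the original test):
;
;   void lookahead_basic(double *a) {
;     double A0 = a[0], A1 = a[1], B0 = a[2], B1 = a[3];
;     double C0 = a[4], C1 = a[5], D0 = a[6], D1 = a[7];
;     a[0] = (A0 - B0) + (C0 - D0); /* S[0] */
;     a[1] = (C1 - D1) + (A1 - B1); /* S[1] */
;   }
;
; The operands of the two scalar adds arrive in opposite orders, so the
; vectorizer must look through the fsubs to pair (A-B) with (A-B) and
; (C-D) with (C-D).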


; Check whether the look-ahead operand reordering heuristic will avoid
; bundling the alt opcodes. The vectorized code should have no shuffles.
;
;  A[0]  B[0]  A[0]  B[0]   A[1]  B[1]  A[1]  B[1]
;    \   /      \   /         \   /      \   /
;      +          -             -          +
;       \        /               \        /
;           +                        +
;           |                        |
;          S[0]                     S[1]

define void @lookahead_alt1(double* %array) {
; CHECK-LABEL: @lookahead_alt1(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[IDX0:%.*]] = getelementptr inbounds double, double* [[ARRAY:%.*]], i64 0
; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 1
; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 2
; CHECK-NEXT:    [[IDX3:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 3
; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 4
; CHECK-NEXT:    [[IDX5:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 5
; CHECK-NEXT:    [[IDX6:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 6
; CHECK-NEXT:    [[IDX7:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 7
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[IDX2]] to <2 x double>*
; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
; CHECK-NEXT:    [[TMP4:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]]
; CHECK-NEXT:    [[TMP5:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]]
; CHECK-NEXT:    [[TMP6:%.*]] = fadd fast <2 x double> [[TMP5]], [[TMP4]]
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
; CHECK-NEXT:    store <2 x double> [[TMP6]], <2 x double>* [[TMP7]], align 8
; CHECK-NEXT:    ret void
;
entry:
  %idx0 = getelementptr inbounds double, double* %array, i64 0
  %idx1 = getelementptr inbounds double, double* %array, i64 1
  %idx2 = getelementptr inbounds double, double* %array, i64 2
  %idx3 = getelementptr inbounds double, double* %array, i64 3
  %idx4 = getelementptr inbounds double, double* %array, i64 4
  %idx5 = getelementptr inbounds double, double* %array, i64 5
  %idx6 = getelementptr inbounds double, double* %array, i64 6
  %idx7 = getelementptr inbounds double, double* %array, i64 7

  %A_0 = load double, double *%idx0, align 8
  %A_1 = load double, double *%idx1, align 8
  %B_0 = load double, double *%idx2, align 8
  %B_1 = load double, double *%idx3, align 8

  %addAB_0_L = fadd fast double %A_0, %B_0
  %subAB_0_R = fsub fast double %A_0, %B_0

  %subAB_1_L = fsub fast double %A_1, %B_1
  %addAB_1_R = fadd fast double %A_1, %B_1

  %addABCD_0 = fadd fast double %addAB_0_L, %subAB_0_R
  %addCDAB_1 = fadd fast double %subAB_1_L, %addAB_1_R

  store double %addABCD_0, double *%idx0, align 8
  store double %addCDAB_1, double *%idx1, align 8
  ret void
}
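;
; Roughly equivalent scalar C for the kernel above (an illustrative sketch;
; not part of the original test):
;
;   void lookahead_alt1(double *a) {
;     double A0 = a[0], A1 = a[1], B0 = a[2], B1 = a[3];
;     a[0] = (A0 + B0) + (A0 - B0);
;     a[1] = (A1 - B1) + (A1 + B1);
;   }
;
; Commuting the operands of the lane-1 add lets the fadd/fsub pairs line up
; across lanes, so no alt-opcode shuffle is needed.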


; This code should get vectorized all the way to the loads with shuffles for
; the alt opcodes.
;
;  A[0]  B[0]  C[0]  D[0]   C[1]  D[1]  A[1]  B[1]
;    \   /      \   /         \   /      \   /
;      +          -             +          -
;       \        /               \        /
;           +                        +
;           |                        |
;          S[0]                     S[1]
;
define void @lookahead_alt2(double* %array) {
; CHECK-LABEL: @lookahead_alt2(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[IDX0:%.*]] = getelementptr inbounds double, double* [[ARRAY:%.*]], i64 0
; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 1
; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 2
; CHECK-NEXT:    [[IDX3:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 3
; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 4
; CHECK-NEXT:    [[IDX5:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 5
; CHECK-NEXT:    [[IDX6:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 6
; CHECK-NEXT:    [[IDX7:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 7
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[IDX2]] to <2 x double>*
; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* [[IDX4]] to <2 x double>*
; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[IDX6]] to <2 x double>*
; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8
; CHECK-NEXT:    [[TMP8:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]]
; CHECK-NEXT:    [[TMP9:%.*]] = fadd fast <2 x double> [[TMP5]], [[TMP7]]
; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP9]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT:    [[TMP11:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]]
; CHECK-NEXT:    [[TMP12:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]]
; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT:    [[TMP14:%.*]] = fadd fast <2 x double> [[TMP13]], [[TMP10]]
; CHECK-NEXT:    [[TMP15:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
; CHECK-NEXT:    store <2 x double> [[TMP14]], <2 x double>* [[TMP15]], align 8
; CHECK-NEXT:    ret void
;
entry:
  %idx0 = getelementptr inbounds double, double* %array, i64 0
  %idx1 = getelementptr inbounds double, double* %array, i64 1
  %idx2 = getelementptr inbounds double, double* %array, i64 2
  %idx3 = getelementptr inbounds double, double* %array, i64 3
  %idx4 = getelementptr inbounds double, double* %array, i64 4
  %idx5 = getelementptr inbounds double, double* %array, i64 5
  %idx6 = getelementptr inbounds double, double* %array, i64 6
  %idx7 = getelementptr inbounds double, double* %array, i64 7

  %A_0 = load double, double *%idx0, align 8
  %A_1 = load double, double *%idx1, align 8
  %B_0 = load double, double *%idx2, align 8
  %B_1 = load double, double *%idx3, align 8
  %C_0 = load double, double *%idx4, align 8
  %C_1 = load double, double *%idx5, align 8
  %D_0 = load double, double *%idx6, align 8
  %D_1 = load double, double *%idx7, align 8

  %addAB_0 = fadd fast double %A_0, %B_0
  %subCD_0 = fsub fast double %C_0, %D_0

  %addCD_1 = fadd fast double %C_1, %D_1
  %subAB_1 = fsub fast double %A_1, %B_1

  %addABCD_0 = fadd fast double %addAB_0, %subCD_0
  %addCDAB_1 = fadd fast double %addCD_1, %subAB_1

  store double %addABCD_0, double *%idx0, align 8
  store double %addCDAB_1, double *%idx1, align 8
  ret void
}
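;
; Roughly equivalent scalar C for the kernel above (an illustrative sketch;
; not part of the original test):
;
;   void lookahead_alt2(double *a) {
;     double A0 = a[0], A1 = a[1], B0 = a[2], B1 = a[3];
;     double C0 = a[4], C1 = a[5], D0 = a[6], D1 = a[7];
;     a[0] = (A0 + B0) + (C0 - D0);
;     a[1] = (C1 + D1) + (A1 - B1);
;   }
;
; Each lane mixes an fadd and an fsub over different arrays, so no operand
; reordering can pair matching opcodes; the alt-opcode shuffles remain.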


;
;  A[0]  B[0]  C[0]  D[0]   A[1]  B[2]  A[2]  B[1]
;    \   /      \   /      /  \   /      \   /
;      -          -       U     -          -
;       \        /               \        /
;           +                        +
;           |                        |
;          S[0]                     S[1]
;
; SLP should reorder the operands of the RHS add, taking into consideration
; the cost of external uses. Reordering is more profitable here because A[1]
; has an external use.

define void @lookahead_external_uses(double* %A, double *%B, double *%C, double *%D, double *%S, double *%Ext1, double *%Ext2) {
; CHECK-LABEL: @lookahead_external_uses(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[IDXB0:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 0
; CHECK-NEXT:    [[IDXC0:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 0
; CHECK-NEXT:    [[IDXD0:%.*]] = getelementptr inbounds double, double* [[D:%.*]], i64 0
; CHECK-NEXT:    [[IDXA1:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1
; CHECK-NEXT:    [[IDXB2:%.*]] = getelementptr inbounds double, double* [[B]], i64 2
; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double*> undef, double* [[A]], i32 0
; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double*> [[TMP0]], double* [[A]], i32 1
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr double, <2 x double*> [[TMP1]], <2 x i64> <i64 0, i64 2>
; CHECK-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
; CHECK-NEXT:    [[C0:%.*]] = load double, double* [[IDXC0]], align 8
; CHECK-NEXT:    [[D0:%.*]] = load double, double* [[IDXD0]], align 8
; CHECK-NEXT:    [[A1:%.*]] = load double, double* [[IDXA1]], align 8
; CHECK-NEXT:    [[B2:%.*]] = load double, double* [[IDXB2]], align 8
; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> [[TMP2]], i32 8, <2 x i1> <i1 true, i1 true>, <2 x double> undef)
; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double*> [[TMP2]], i32 0
; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[IDXB0]] to <2 x double>*
; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 8
; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> undef, double [[C0]], i32 0
; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[A1]], i32 1
; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x double> undef, double [[D0]], i32 0
; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x double> [[TMP9]], double [[B2]], i32 1
; CHECK-NEXT:    [[TMP11:%.*]] = fsub fast <2 x double> [[TMP8]], [[TMP10]]
; CHECK-NEXT:    [[TMP12:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP6]]
; CHECK-NEXT:    [[TMP13:%.*]] = fadd fast <2 x double> [[TMP12]], [[TMP11]]
; CHECK-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0
; CHECK-NEXT:    [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1
; CHECK-NEXT:    [[TMP14:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
; CHECK-NEXT:    store <2 x double> [[TMP13]], <2 x double>* [[TMP14]], align 8
; CHECK-NEXT:    store double [[A1]], double* [[EXT1:%.*]], align 8
; CHECK-NEXT:    ret void
;
entry:
  %IdxA0 = getelementptr inbounds double, double* %A, i64 0
  %IdxB0 = getelementptr inbounds double, double* %B, i64 0
  %IdxC0 = getelementptr inbounds double, double* %C, i64 0
  %IdxD0 = getelementptr inbounds double, double* %D, i64 0

  %IdxA1 = getelementptr inbounds double, double* %A, i64 1
  %IdxB2 = getelementptr inbounds double, double* %B, i64 2
  %IdxA2 = getelementptr inbounds double, double* %A, i64 2
  %IdxB1 = getelementptr inbounds double, double* %B, i64 1

  %A0 = load double, double *%IdxA0, align 8
  %B0 = load double, double *%IdxB0, align 8
  %C0 = load double, double *%IdxC0, align 8
  %D0 = load double, double *%IdxD0, align 8

  %A1 = load double, double *%IdxA1, align 8
  %B2 = load double, double *%IdxB2, align 8
  %A2 = load double, double *%IdxA2, align 8
  %B1 = load double, double *%IdxB1, align 8

  %subA0B0 = fsub fast double %A0, %B0
  %subC0D0 = fsub fast double %C0, %D0

  %subA1B2 = fsub fast double %A1, %B2
  %subA2B1 = fsub fast double %A2, %B1

  %add0 = fadd fast double %subA0B0, %subC0D0
  %add1 = fadd fast double %subA1B2, %subA2B1

  %IdxS0 = getelementptr inbounds double, double* %S, i64 0
  %IdxS1 = getelementptr inbounds double, double* %S, i64 1

  store double %add0, double *%IdxS0, align 8
  store double %add1, double *%IdxS1, align 8

  ; External use
  store double %A1, double *%Ext1, align 8
  ret void
}
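;
; Roughly equivalent scalar C for the kernel above (an illustrative sketch;
; not part of the original test):
;
;   void lookahead_external_uses(double *A, double *B, double *C, double *D,
;                                double *S, double *Ext1) {
;     S[0] = (A[0] - B[0]) + (C[0] - D[0]);
;     S[1] = (A[1] - B[2]) + (A[2] - B[1]);
;     *Ext1 = A[1]; /* external use keeps A[1] live as a scalar anyway */
;   }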

;  A[0]  B[0]  C[0]  D[0]   A[1]  B[2]  A[2]  B[1]
;    \   /      \   /      /  \   /      \   /  \
;      -          -   U1,U2,U3  -          -   U4,U5
;       \        /               \        /
;           +                        +
;           |                        |
;          S[0]                     S[1]
;
;
; If we limit the users budget for the look-ahead heuristic to 2, then the
; look-ahead heuristic has no way of choosing B[1] (with 2 external users)
; over A[1] (with 3 external users).
; The result is that the operands of the Add are not reordered and the loads
; from A get vectorized instead of the loads from B.
;
define void @lookahead_limit_users_budget(double* %A, double *%B, double *%C, double *%D, double *%S, double *%Ext1, double *%Ext2, double *%Ext3, double *%Ext4, double *%Ext5) {
; CHECK-LABEL: @lookahead_limit_users_budget(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[IDXA0:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 0
; CHECK-NEXT:    [[IDXB0:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 0
; CHECK-NEXT:    [[IDXC0:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 0
; CHECK-NEXT:    [[IDXD0:%.*]] = getelementptr inbounds double, double* [[D:%.*]], i64 0
; CHECK-NEXT:    [[IDXA1:%.*]] = getelementptr inbounds double, double* [[A]], i64 1
; CHECK-NEXT:    [[IDXB2:%.*]] = getelementptr inbounds double, double* [[B]], i64 2
; CHECK-NEXT:    [[IDXA2:%.*]] = getelementptr inbounds double, double* [[A]], i64 2
; CHECK-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
; CHECK-NEXT:    [[A0:%.*]] = load double, double* [[IDXA0]], align 8
; CHECK-NEXT:    [[B0:%.*]] = load double, double* [[IDXB0]], align 8
; CHECK-NEXT:    [[C0:%.*]] = load double, double* [[IDXC0]], align 8
; CHECK-NEXT:    [[D0:%.*]] = load double, double* [[IDXD0]], align 8
; CHECK-NEXT:    [[A1:%.*]] = load double, double* [[IDXA1]], align 8
; CHECK-NEXT:    [[B2:%.*]] = load double, double* [[IDXB2]], align 8
; CHECK-NEXT:    [[A2:%.*]] = load double, double* [[IDXA2]], align 8
; CHECK-NEXT:    [[B1:%.*]] = load double, double* [[IDXB1]], align 8
; CHECK-NEXT:    [[SUBA0B0:%.*]] = fsub fast double [[A0]], [[B0]]
; CHECK-NEXT:    [[SUBC0D0:%.*]] = fsub fast double [[C0]], [[D0]]
; CHECK-NEXT:    [[SUBA1B2:%.*]] = fsub fast double [[A1]], [[B2]]
; CHECK-NEXT:    [[SUBA2B1:%.*]] = fsub fast double [[A2]], [[B1]]
; CHECK-NEXT:    [[ADD0:%.*]] = fadd fast double [[SUBA0B0]], [[SUBC0D0]]
; CHECK-NEXT:    [[ADD1:%.*]] = fadd fast double [[SUBA1B2]], [[SUBA2B1]]
; CHECK-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0
; CHECK-NEXT:    [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1
; CHECK-NEXT:    store double [[ADD0]], double* [[IDXS0]], align 8
; CHECK-NEXT:    store double [[ADD1]], double* [[IDXS1]], align 8
; CHECK-NEXT:    store double [[A1]], double* [[EXT1:%.*]], align 8
; CHECK-NEXT:    store double [[A1]], double* [[EXT2:%.*]], align 8
; CHECK-NEXT:    store double [[A1]], double* [[EXT3:%.*]], align 8
; CHECK-NEXT:    store double [[B1]], double* [[EXT4:%.*]], align 8
; CHECK-NEXT:    store double [[B1]], double* [[EXT5:%.*]], align 8
; CHECK-NEXT:    ret void
;
entry:
  %IdxA0 = getelementptr inbounds double, double* %A, i64 0
  %IdxB0 = getelementptr inbounds double, double* %B, i64 0
  %IdxC0 = getelementptr inbounds double, double* %C, i64 0
  %IdxD0 = getelementptr inbounds double, double* %D, i64 0

  %IdxA1 = getelementptr inbounds double, double* %A, i64 1
  %IdxB2 = getelementptr inbounds double, double* %B, i64 2
  %IdxA2 = getelementptr inbounds double, double* %A, i64 2
  %IdxB1 = getelementptr inbounds double, double* %B, i64 1

  %A0 = load double, double *%IdxA0, align 8
  %B0 = load double, double *%IdxB0, align 8
  %C0 = load double, double *%IdxC0, align 8
  %D0 = load double, double *%IdxD0, align 8

  %A1 = load double, double *%IdxA1, align 8
  %B2 = load double, double *%IdxB2, align 8
  %A2 = load double, double *%IdxA2, align 8
  %B1 = load double, double *%IdxB1, align 8

  %subA0B0 = fsub fast double %A0, %B0
  %subC0D0 = fsub fast double %C0, %D0

  %subA1B2 = fsub fast double %A1, %B2
  %subA2B1 = fsub fast double %A2, %B1

  %add0 = fadd fast double %subA0B0, %subC0D0
  %add1 = fadd fast double %subA1B2, %subA2B1

  %IdxS0 = getelementptr inbounds double, double* %S, i64 0
  %IdxS1 = getelementptr inbounds double, double* %S, i64 1

  store double %add0, double *%IdxS0, align 8
  store double %add1, double *%IdxS1, align 8

  ; External uses of A1
  store double %A1, double *%Ext1, align 8
  store double %A1, double *%Ext2, align 8
  store double %A1, double *%Ext3, align 8

  ; External uses of B1
  store double %B1, double *%Ext4, align 8
  store double %B1, double *%Ext5, align 8

  ret void
}
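;
; Roughly equivalent scalar C for the kernel above (an illustrative sketch;
; not part of the original test):
;
;   void lookahead_limit_users_budget(double *A, double *B, double *C,
;                                     double *D, double *S, double *E1,
;                                     double *E2, double *E3, double *E4,
;                                     double *E5) {
;     S[0] = (A[0] - B[0]) + (C[0] - D[0]);
;     S[1] = (A[1] - B[2]) + (A[2] - B[1]);
;     *E1 = *E2 = *E3 = A[1]; /* three external uses of A[1] */
;     *E4 = *E5 = B[1];       /* two external uses of B[1] */
;   }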

; This checks that the lookahead code does not crash when instructions with
; the same opcodes have different numbers of operands (in this case the
; calls).

%Class = type { i8 }
declare double @_ZN1i2ayEv(%Class*)
declare double @_ZN1i2axEv()

define void @lookahead_crash(double* %A, double *%S, %Class *%Arg0) {
; CHECK-LABEL: @lookahead_crash(
; CHECK-NEXT:    [[IDXA0:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 0
; CHECK-NEXT:    [[IDXA1:%.*]] = getelementptr inbounds double, double* [[A]], i64 1
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[IDXA0]] to <2 x double>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
; CHECK-NEXT:    [[C0:%.*]] = call double @_ZN1i2ayEv(%Class* [[ARG0:%.*]])
; CHECK-NEXT:    [[C1:%.*]] = call double @_ZN1i2axEv()
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> undef, double [[C0]], i32 0
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[C1]], i32 1
; CHECK-NEXT:    [[TMP5:%.*]] = fadd fast <2 x double> [[TMP2]], [[TMP4]]
; CHECK-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0
; CHECK-NEXT:    [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
; CHECK-NEXT:    store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8
; CHECK-NEXT:    ret void
;
  %IdxA0 = getelementptr inbounds double, double* %A, i64 0
  %IdxA1 = getelementptr inbounds double, double* %A, i64 1

  %A0 = load double, double *%IdxA0, align 8
  %A1 = load double, double *%IdxA1, align 8

  %C0 = call double @_ZN1i2ayEv(%Class *%Arg0)
  %C1 = call double @_ZN1i2axEv()

  %add0 = fadd fast double %A0, %C0
  %add1 = fadd fast double %A1, %C1

  %IdxS0 = getelementptr inbounds double, double* %S, i64 0
  %IdxS1 = getelementptr inbounds double, double* %S, i64 1
  store double %add0, double *%IdxS0, align 8
  store double %add1, double *%IdxS1, align 8
  ret void
}
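;
; Roughly equivalent scalar C for the kernel above (an illustrative sketch;
; ay() and ax() stand for the two mangled callees, which take different
; numbers of arguments):
;
;   struct Class;
;   double ay(struct Class *);
;   double ax(void);
;   void lookahead_crash(double *A, double *S, struct Class *Arg0) {
;     S[0] = A[0] + ay(Arg0); /* call with one operand */
;     S[1] = A[1] + ax();     /* call with zero operands */
;   }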

; This checks that we choose to group consecutive extracts from the same
; vectors.
define void @ChecksExtractScores(double* %storeArray, double* %array, <2 x double> *%vecPtr1, <2 x double>* %vecPtr2) {
; CHECK-LABEL: @ChecksExtractScores(
; CHECK-NEXT:    [[IDX0:%.*]] = getelementptr inbounds double, double* [[ARRAY:%.*]], i64 0
; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 1
; CHECK-NEXT:    [[LOADA0:%.*]] = load double, double* [[IDX0]], align 4
; CHECK-NEXT:    [[LOADA1:%.*]] = load double, double* [[IDX1]], align 4
; CHECK-NEXT:    [[LOADVEC:%.*]] = load <2 x double>, <2 x double>* [[VECPTR1:%.*]], align 4
; CHECK-NEXT:    [[LOADVEC2:%.*]] = load <2 x double>, <2 x double>* [[VECPTR2:%.*]], align 4
; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> undef, double [[LOADA0]], i32 0
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[LOADA0]], i32 1
; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[LOADVEC]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> undef, double [[LOADA1]], i32 0
; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[LOADA1]], i32 1
; CHECK-NEXT:    [[TMP6:%.*]] = fmul <2 x double> [[LOADVEC2]], [[TMP5]]
; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP3]], [[TMP6]]
; CHECK-NEXT:    [[SIDX0:%.*]] = getelementptr inbounds double, double* [[STOREARRAY:%.*]], i64 0
; CHECK-NEXT:    [[SIDX1:%.*]] = getelementptr inbounds double, double* [[STOREARRAY]], i64 1
; CHECK-NEXT:    [[TMP8:%.*]] = bitcast double* [[SIDX0]] to <2 x double>*
; CHECK-NEXT:    store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8
; CHECK-NEXT:    ret void
;
  %idx0 = getelementptr inbounds double, double* %array, i64 0
  %idx1 = getelementptr inbounds double, double* %array, i64 1
  %loadA0 = load double, double* %idx0, align 4
  %loadA1 = load double, double* %idx1, align 4

  %loadVec = load <2 x double>, <2 x double>* %vecPtr1, align 4
  %extrA0 = extractelement <2 x double> %loadVec, i32 0
  %extrA1 = extractelement <2 x double> %loadVec, i32 1
  %loadVec2 = load <2 x double>, <2 x double>* %vecPtr2, align 4
  %extrB0 = extractelement <2 x double> %loadVec2, i32 0
  %extrB1 = extractelement <2 x double> %loadVec2, i32 1

  %mul0 = fmul double %extrA0, %loadA0
  %mul1 = fmul double %extrA1, %loadA0
  %mul3 = fmul double %extrB0, %loadA1
  %mul4 = fmul double %extrB1, %loadA1
  %add0 = fadd double %mul0, %mul3
  %add1 = fadd double %mul1, %mul4

  %sidx0 = getelementptr inbounds double, double* %storeArray, i64 0
  %sidx1 = getelementptr inbounds double, double* %storeArray, i64 1
  store double %add0, double *%sidx0, align 8
  store double %add1, double *%sidx1, align 8
  ret void
}
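;
; Roughly equivalent scalar C for the kernel above (an illustrative sketch;
; v1 and v2 stand for the two <2 x double> inputs):
;
;   void ChecksExtractScores(double *out, double *a,
;                            const double *v1, const double *v2) {
;     out[0] = v1[0] * a[0] + v2[0] * a[1];
;     out[1] = v1[1] * a[0] + v2[1] * a[1];
;   }
;
; Grouping the lane-0 and lane-1 extracts of the same source vector lets the
; vectorized code use the loaded <2 x double> values whole, with no
; shufflevector.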


define i1 @ExtractIdxNotConstantInt1(float %a, float %b, float %c, <4 x float> %vec, i64 %idx2) {
; CHECK-LABEL: @ExtractIdxNotConstantInt1(
; CHECK-NEXT:    [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 undef
; CHECK-NEXT:    [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
; CHECK-NEXT:    [[FM:%.*]] = fmul float [[A:%.*]], [[SUB14_I167]]
; CHECK-NEXT:    [[SUB25_I168:%.*]] = fsub float [[FM]], [[B:%.*]]
; CHECK-NEXT:    [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]]
; CHECK-NEXT:    [[ADD36_I173:%.*]] = fadd float [[SUB25_I168]], 1.000000e+01
; CHECK-NEXT:    [[MUL72_I179:%.*]] = fmul float [[C:%.*]], [[VECEXT_I276_I169]]
; CHECK-NEXT:    [[ADD78_I180:%.*]] = fsub float [[MUL72_I179]], 3.000000e+01
; CHECK-NEXT:    [[ADD79_I181:%.*]] = fadd float 2.000000e+00, [[ADD78_I180]]
; CHECK-NEXT:    [[MUL123_I184:%.*]] = fmul float [[ADD36_I173]], [[ADD79_I181]]
; CHECK-NEXT:    [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
; CHECK-NEXT:    ret i1 [[CMP_I185]]
;
  %vecext.i291.i166 = extractelement <4 x float> %vec, i64 undef
  %sub14.i167 = fsub float undef, %vecext.i291.i166
  %fm = fmul float %a, %sub14.i167
  %sub25.i168 = fsub float %fm, %b
  %vecext.i276.i169 = extractelement <4 x float> %vec, i64 %idx2
  %add36.i173 = fadd float %sub25.i168, 10.0
  %mul72.i179 = fmul float %c, %vecext.i276.i169
  %add78.i180 = fsub float %mul72.i179, 30.0
  %add79.i181 = fadd float 2.0, %add78.i180
  %mul123.i184 = fmul float %add36.i173, %add79.i181
  %cmp.i185 = fcmp ogt float %mul123.i184, 0.000000e+00
  ret i1 %cmp.i185
}


define i1 @ExtractIdxNotConstantInt2(float %a, float %b, float %c, <4 x float> %vec, i64 %idx2) {
; CHECK-LABEL: @ExtractIdxNotConstantInt2(
; CHECK-NEXT:    [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 1
; CHECK-NEXT:    [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
; CHECK-NEXT:    [[FM:%.*]] = fmul float [[A:%.*]], [[SUB14_I167]]
; CHECK-NEXT:    [[SUB25_I168:%.*]] = fsub float [[FM]], [[B:%.*]]
; CHECK-NEXT:    [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]]
; CHECK-NEXT:    [[ADD36_I173:%.*]] = fadd float [[SUB25_I168]], 1.000000e+01
; CHECK-NEXT:    [[MUL72_I179:%.*]] = fmul float [[C:%.*]], [[VECEXT_I276_I169]]
; CHECK-NEXT:    [[ADD78_I180:%.*]] = fsub float [[MUL72_I179]], 3.000000e+01
; CHECK-NEXT:    [[ADD79_I181:%.*]] = fadd float 2.000000e+00, [[ADD78_I180]]
; CHECK-NEXT:    [[MUL123_I184:%.*]] = fmul float [[ADD36_I173]], [[ADD79_I181]]
; CHECK-NEXT:    [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
; CHECK-NEXT:    ret i1 [[CMP_I185]]
;
  %vecext.i291.i166 = extractelement <4 x float> %vec, i64 1
  %sub14.i167 = fsub float undef, %vecext.i291.i166
  %fm = fmul float %a, %sub14.i167
  %sub25.i168 = fsub float %fm, %b
  %vecext.i276.i169 = extractelement <4 x float> %vec, i64 %idx2
  %add36.i173 = fadd float %sub25.i168, 10.0
  %mul72.i179 = fmul float %c, %vecext.i276.i169
  %add78.i180 = fsub float %mul72.i179, 30.0
  %add79.i181 = fadd float 2.0, %add78.i180
  %mul123.i184 = fmul float %add36.i173, %add79.i181
  %cmp.i185 = fcmp ogt float %mul123.i184, 0.000000e+00
  ret i1 %cmp.i185
}


define i1 @foo(float %a, float %b, float %c, <4 x float> %vec, i64 %idx2) {
; CHECK-LABEL: @foo(
; CHECK-NEXT:    [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 0
; CHECK-NEXT:    [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
; CHECK-NEXT:    [[FM:%.*]] = fmul float [[A:%.*]], [[SUB14_I167]]
; CHECK-NEXT:    [[SUB25_I168:%.*]] = fsub float [[FM]], [[B:%.*]]
; CHECK-NEXT:    [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 1
; CHECK-NEXT:    [[ADD36_I173:%.*]] = fadd float [[SUB25_I168]], 1.000000e+01
; CHECK-NEXT:    [[MUL72_I179:%.*]] = fmul float [[C:%.*]], [[VECEXT_I276_I169]]
; CHECK-NEXT:    [[ADD78_I180:%.*]] = fsub float [[MUL72_I179]], 3.000000e+01
; CHECK-NEXT:    [[ADD79_I181:%.*]] = fadd float 2.000000e+00, [[ADD78_I180]]
; CHECK-NEXT:    [[MUL123_I184:%.*]] = fmul float [[ADD36_I173]], [[ADD79_I181]]
; CHECK-NEXT:    [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
; CHECK-NEXT:    ret i1 [[CMP_I185]]
;
  %vecext.i291.i166 = extractelement <4 x float> %vec, i64 0
  %sub14.i167 = fsub float undef, %vecext.i291.i166
  %fm = fmul float %a, %sub14.i167
  %sub25.i168 = fsub float %fm, %b
  %vecext.i276.i169 = extractelement <4 x float> %vec, i64 1
  %add36.i173 = fadd float %sub25.i168, 10.0
  %mul72.i179 = fmul float %c, %vecext.i276.i169
  %add78.i180 = fsub float %mul72.i179, 30.0
  %add79.i181 = fadd float 2.0, %add78.i180
  %mul123.i184 = fmul float %add36.i173, %add79.i181
  %cmp.i185 = fcmp ogt float %mul123.i184, 0.000000e+00
  ret i1 %cmp.i185
}

; Same as @ChecksExtractScores, but the extractelement vector operands do not
; match.
define void @ChecksExtractScores_different_vectors(double* %storeArray, double* %array, <2 x double> *%vecPtr1, <2 x double>* %vecPtr2, <2 x double>* %vecPtr3, <2 x double>* %vecPtr4) {
; CHECK-LABEL: @ChecksExtractScores_different_vectors(
; CHECK-NEXT:    [[IDX0:%.*]] = getelementptr inbounds double, double* [[ARRAY:%.*]], i64 0
; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 1
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4
; CHECK-NEXT:    [[LOADVEC:%.*]] = load <2 x double>, <2 x double>* [[VECPTR1:%.*]], align 4
; CHECK-NEXT:    [[LOADVEC2:%.*]] = load <2 x double>, <2 x double>* [[VECPTR2:%.*]], align 4
; CHECK-NEXT:    [[EXTRA0:%.*]] = extractelement <2 x double> [[LOADVEC]], i32 0
; CHECK-NEXT:    [[EXTRA1:%.*]] = extractelement <2 x double> [[LOADVEC2]], i32 1
; CHECK-NEXT:    [[LOADVEC3:%.*]] = load <2 x double>, <2 x double>* [[VECPTR3:%.*]], align 4
; CHECK-NEXT:    [[LOADVEC4:%.*]] = load <2 x double>, <2 x double>* [[VECPTR4:%.*]], align 4
; CHECK-NEXT:    [[EXTRB0:%.*]] = extractelement <2 x double> [[LOADVEC3]], i32 0
; CHECK-NEXT:    [[EXTRB1:%.*]] = extractelement <2 x double> [[LOADVEC4]], i32 1
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> undef, double [[EXTRB0]], i32 0
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[EXTRA1]], i32 1
; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> undef, double [[TMP5]], i32 0
; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[TMP6]], double [[TMP7]], i32 1
; CHECK-NEXT:    [[TMP9:%.*]] = fmul <2 x double> [[TMP4]], [[TMP8]]
; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x double> undef, double [[EXTRA0]], i32 0
; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x double> [[TMP10]], double [[EXTRB1]], i32 1
; CHECK-NEXT:    [[TMP12:%.*]] = fmul <2 x double> [[TMP11]], [[TMP2]]
; CHECK-NEXT:    [[TMP13:%.*]] = fadd <2 x double> [[TMP12]], [[TMP9]]
; CHECK-NEXT:    [[SIDX0:%.*]] = getelementptr inbounds double, double* [[STOREARRAY:%.*]], i64 0
; CHECK-NEXT:    [[SIDX1:%.*]] = getelementptr inbounds double, double* [[STOREARRAY]], i64 1
; CHECK-NEXT:    [[TMP14:%.*]] = bitcast double* [[SIDX0]] to <2 x double>*
; CHECK-NEXT:    store <2 x double> [[TMP13]], <2 x double>* [[TMP14]], align 8
; CHECK-NEXT:    ret void
;
  %idx0 = getelementptr inbounds double, double* %array, i64 0
  %idx1 = getelementptr inbounds double, double* %array, i64 1
  %loadA0 = load double, double* %idx0, align 4
  %loadA1 = load double, double* %idx1, align 4

  %loadVec = load <2 x double>, <2 x double>* %vecPtr1, align 4
  %loadVec2 = load <2 x double>, <2 x double>* %vecPtr2, align 4
  %extrA0 = extractelement <2 x double> %loadVec, i32 0
  %extrA1 = extractelement <2 x double> %loadVec2, i32 1
  %loadVec3 = load <2 x double>, <2 x double>* %vecPtr3, align 4
  %loadVec4 = load <2 x double>, <2 x double>* %vecPtr4, align 4
  %extrB0 = extractelement <2 x double> %loadVec3, i32 0
  %extrB1 = extractelement <2 x double> %loadVec4, i32 1

  %mul0 = fmul double %extrA0, %loadA0
  %mul1 = fmul double %extrA1, %loadA0
  %mul3 = fmul double %extrB0, %loadA1
  %mul4 = fmul double %extrB1, %loadA1
  %add0 = fadd double %mul0, %mul3
  %add1 = fadd double %mul1, %mul4

  %sidx0 = getelementptr inbounds double, double* %storeArray, i64 0
  %sidx1 = getelementptr inbounds double, double* %storeArray, i64 1
  store double %add0, double *%sidx0, align 8
  store double %add1, double *%sidx1, align 8
  ret void
}