; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -S -slp-vectorizer -instcombine -pass-remarks-output=%t | FileCheck %s
; RUN: cat %t | FileCheck -check-prefix=REMARK %s
; RUN: opt < %s -S -aa-pipeline=basic-aa -passes='slp-vectorizer,instcombine' -pass-remarks-output=%t | FileCheck %s
; RUN: cat %t | FileCheck -check-prefix=REMARK %s

target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-gnu"

; REMARK-LABEL: Function: gather_multiple_use
; REMARK:       Args:
; REMARK-NEXT:    - String: 'Vectorized horizontal reduction with cost '
; REMARK-NEXT:    - Cost: '-7'
;
; REMARK-NOT: Function: gather_load

; The four identical scalar chains below feed a horizontal add reduction of
; the function arguments; SLP vectorization is profitable here, as the
; remark's negative cost of -7 reports.
define internal i32 @gather_multiple_use(i32 %a, i32 %b, i32 %c, i32 %d) {
; CHECK-LABEL: @gather_multiple_use(
; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[C:%.*]], i32 0
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[A:%.*]], i32 1
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[B:%.*]], i32 2
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[D:%.*]], i32 3
; CHECK-NEXT:    [[TMP5:%.*]] = lshr <4 x i32> [[TMP4]], <i32 15, i32 15, i32 15, i32 15>
; CHECK-NEXT:    [[TMP6:%.*]] = and <4 x i32> [[TMP5]], <i32 65537, i32 65537, i32 65537, i32 65537>
; CHECK-NEXT:    [[TMP7:%.*]] = mul nuw <4 x i32> [[TMP6]], <i32 65535, i32 65535, i32 65535, i32 65535>
; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP7]], [[TMP4]]
; CHECK-NEXT:    [[TMP9:%.*]] = xor <4 x i32> [[TMP8]], [[TMP7]]
; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP9]])
; CHECK-NEXT:    ret i32 [[TMP10]]
;
  %tmp00 = lshr i32 %a, 15
  %tmp01 = and i32 %tmp00, 65537
  %tmp02 = mul nuw i32 %tmp01, 65535
  %tmp03 = add i32 %tmp02, %a
  %tmp04 = xor i32 %tmp03, %tmp02
  %tmp05 = lshr i32 %c, 15
  %tmp06 = and i32 %tmp05, 65537
  %tmp07 = mul nuw i32 %tmp06, 65535
  %tmp08 = add i32 %tmp07, %c
  %tmp09 = xor i32 %tmp08, %tmp07
  %tmp10 = lshr i32 %b, 15
  %tmp11 = and i32 %tmp10, 65537
  %tmp12 = mul nuw i32 %tmp11, 65535
  %tmp13 = add i32 %tmp12, %b
  %tmp14 = xor i32 %tmp13, %tmp12
  %tmp15 = lshr i32 %d, 15
  %tmp16 = and i32 %tmp15, 65537
  %tmp17 = mul nuw i32 %tmp16, 65535
  %tmp18 = add i32 %tmp17, %d
  %tmp19 = xor i32 %tmp18, %tmp17
  %tmp20 = add i32 %tmp09, %tmp04
  %tmp21 = add i32 %tmp20, %tmp14
  %tmp22 = add i32 %tmp21, %tmp19
  ret i32 %tmp22
}

@data = global [6 x [258 x i8]] zeroinitializer, align 1
; The i8 loads below read from non-consecutive addresses in @data (stride of
; 259 bytes), so the function is left scalar; the REMARK-NOT above verifies
; that no vectorization remark is emitted for it.
define void @gather_load(i16* noalias %ptr) {
; CHECK-LABEL: @gather_load(
; CHECK-NEXT:    [[ARRAYIDX182:%.*]] = getelementptr inbounds i16, i16* [[PTR:%.*]], i64 1
; CHECK-NEXT:    [[ARRAYIDX183:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i64 2
; CHECK-NEXT:    [[ARRAYIDX184:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i64 3
; CHECK-NEXT:    [[ARRAYIDX185:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i64 4
; CHECK-NEXT:    [[L0:%.*]] = load i8, i8* getelementptr inbounds ([6 x [258 x i8]], [6 x [258 x i8]]* @data, i64 0, i64 1, i64 0), align 1
; CHECK-NEXT:    [[CONV150:%.*]] = zext i8 [[L0]] to i16
; CHECK-NEXT:    [[ADD152:%.*]] = add nuw nsw i16 [[CONV150]], 10
; CHECK-NEXT:    [[L1:%.*]] = load i8, i8* getelementptr inbounds ([6 x [258 x i8]], [6 x [258 x i8]]* @data, i64 0, i64 2, i64 1), align 1
; CHECK-NEXT:    [[CONV156:%.*]] = zext i8 [[L1]] to i16
; CHECK-NEXT:    [[ADD158:%.*]] = add nuw nsw i16 [[CONV156]], 20
; CHECK-NEXT:    [[L2:%.*]] = load i8, i8* getelementptr inbounds ([6 x [258 x i8]], [6 x [258 x i8]]* @data, i64 0, i64 3, i64 2), align 1
; CHECK-NEXT:    [[CONV162:%.*]] = zext i8 [[L2]] to i16
; CHECK-NEXT:    [[ADD164:%.*]] = add nuw nsw i16 [[CONV162]], 30
; CHECK-NEXT:    [[L3:%.*]] = load i8, i8* getelementptr inbounds ([6 x [258 x i8]], [6 x [258 x i8]]* @data, i64 0, i64 4, i64 3), align 1
; CHECK-NEXT:    [[CONV168:%.*]] = zext i8 [[L3]] to i16
; CHECK-NEXT:    [[ADD170:%.*]] = add nuw nsw i16 [[CONV168]], 40
; CHECK-NEXT:    store i16 [[ADD152]], i16* [[ARRAYIDX182]], align 2
; CHECK-NEXT:    store i16 [[ADD158]], i16* [[ARRAYIDX183]], align 2
; CHECK-NEXT:    store i16 [[ADD164]], i16* [[ARRAYIDX184]], align 2
; CHECK-NEXT:    store i16 [[ADD170]], i16* [[ARRAYIDX185]], align 2
; CHECK-NEXT:    ret void
;
  %arrayidx182 = getelementptr inbounds i16, i16* %ptr, i64 1
  %arrayidx183 = getelementptr inbounds i16, i16* %ptr, i64 2
  %arrayidx184 = getelementptr inbounds i16, i16* %ptr, i64 3
  %arrayidx185 = getelementptr inbounds i16, i16* %ptr, i64 4
  %arrayidx149 = getelementptr inbounds [6 x [258 x i8]], [6 x [258 x i8]]* @data, i64 0, i64 1, i64 0
  %l0 = load i8, i8* %arrayidx149, align 1
  %conv150 = zext i8 %l0 to i16
  %add152 = add i16 10, %conv150
  %arrayidx155 = getelementptr inbounds [6 x [258 x i8]], [6 x [258 x i8]]* @data, i64 0, i64 2, i64 1
  %l1 = load i8, i8* %arrayidx155, align 1
  %conv156 = zext i8 %l1 to i16
  %add158 = add i16 20, %conv156
  %arrayidx161 = getelementptr inbounds [6 x [258 x i8]], [6 x [258 x i8]]* @data, i64 0, i64 3, i64 2
  %l2 = load i8, i8* %arrayidx161, align 1
  %conv162 = zext i8 %l2 to i16
  %add164 = add i16 30, %conv162
  %arrayidx167 = getelementptr inbounds [6 x [258 x i8]], [6 x [258 x i8]]* @data, i64 0, i64 4, i64 3
  %l3 = load i8, i8* %arrayidx167, align 1
  %conv168 = zext i8 %l3 to i16
  %add170 = add i16 40, %conv168
  store i16 %add152, i16* %arrayidx182, align 2
  store i16 %add158, i16* %arrayidx183, align 2
  store i16 %add164, i16* %arrayidx184, align 2
  store i16 %add170, i16* %arrayidx185, align 2
  ret void
}