; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: opt < %s -loop-reduce -mcpu=btver2 -S | FileCheck %s --check-prefix=JAG
; RUN: opt < %s -loop-reduce -mcpu=bdver2 -S | FileCheck %s --check-prefix=BUL
; RUN: opt < %s -loop-reduce -mcpu=haswell -S | FileCheck %s --check-prefix=HSW

; RUN: llc < %s | FileCheck %s --check-prefix=BASE
; RUN: llc < %s -mattr=macrofusion | FileCheck %s --check-prefix=FUSE
; RUN: llc < %s -mattr=branchfusion | FileCheck %s --check-prefix=FUSE

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"

; PR35681 - https://bugs.llvm.org/show_bug.cgi?id=35681
; If a CPU can macro-fuse a compare and branch, then we discount that
; cost in LSR and avoid generating large offsets in each memory access.
; This reduces code size and may improve decode throughput.
;
; Shapes pinned by the assertions below, grouped by check prefix:
;   * JAG (btver2) and BASE (llc with no -mattr): the induction variable runs
;     from -524288 up to 0 with an exit test against zero, and every memory
;     access carries a large constant offset.
;   * BUL (bdver2), HSW (haswell) and FUSE (macrofusion or branchfusion): the
;     induction variable runs from 0 and is compared against 65536, and the
;     memory accesses carry no constant offset.

; Loop under test: for each pair of doubles at %index, store the element-wise
; result of (y > x ? y : x) back into x; %index advances by 2 until it
; reaches 65536.
define void @maxArray(double* noalias nocapture %x, double* noalias nocapture readonly %y) {
; JAG-LABEL: @maxArray(
; JAG-NEXT:  entry:
; JAG-NEXT:    [[Y1:%.*]] = bitcast double* [[Y:%.*]] to i8*
; JAG-NEXT:    [[X3:%.*]] = bitcast double* [[X:%.*]] to i8*
; JAG-NEXT:    br label [[VECTOR_BODY:%.*]]
; JAG:       vector.body:
; JAG-NEXT:    [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], [[VECTOR_BODY]] ], [ -524288, [[ENTRY:%.*]] ]
; JAG-NEXT:    [[UGLYGEP7:%.*]] = getelementptr i8, i8* [[X3]], i64 [[LSR_IV]]
; JAG-NEXT:    [[UGLYGEP78:%.*]] = bitcast i8* [[UGLYGEP7]] to <2 x double>*
; JAG-NEXT:    [[SCEVGEP9:%.*]] = getelementptr <2 x double>, <2 x double>* [[UGLYGEP78]], i64 32768
; JAG-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, i8* [[Y1]], i64 [[LSR_IV]]
; JAG-NEXT:    [[UGLYGEP2:%.*]] = bitcast i8* [[UGLYGEP]] to <2 x double>*
; JAG-NEXT:    [[SCEVGEP:%.*]] = getelementptr <2 x double>, <2 x double>* [[UGLYGEP2]], i64 32768
; JAG-NEXT:    [[XVAL:%.*]] = load <2 x double>, <2 x double>* [[SCEVGEP9]], align 8
; JAG-NEXT:    [[YVAL:%.*]] = load <2 x double>, <2 x double>* [[SCEVGEP]], align 8
; JAG-NEXT:    [[CMP:%.*]] = fcmp ogt <2 x double> [[YVAL]], [[XVAL]]
; JAG-NEXT:    [[MAX:%.*]] = select <2 x i1> [[CMP]], <2 x double> [[YVAL]], <2 x double> [[XVAL]]
; JAG-NEXT:    [[UGLYGEP4:%.*]] = getelementptr i8, i8* [[X3]], i64 [[LSR_IV]]
; JAG-NEXT:    [[UGLYGEP45:%.*]] = bitcast i8* [[UGLYGEP4]] to <2 x double>*
; JAG-NEXT:    [[SCEVGEP6:%.*]] = getelementptr <2 x double>, <2 x double>* [[UGLYGEP45]], i64 32768
; JAG-NEXT:    store <2 x double> [[MAX]], <2 x double>* [[SCEVGEP6]], align 8
; JAG-NEXT:    [[LSR_IV_NEXT]] = add nsw i64 [[LSR_IV]], 16
; JAG-NEXT:    [[DONE:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0
; JAG-NEXT:    br i1 [[DONE]], label [[EXIT:%.*]], label [[VECTOR_BODY]]
; JAG:       exit:
; JAG-NEXT:    ret void
;
; BUL-LABEL: @maxArray(
; BUL-NEXT:  entry:
; BUL-NEXT:    br label [[VECTOR_BODY:%.*]]
; BUL:       vector.body:
; BUL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; BUL-NEXT:    [[SCEVGEP4:%.*]] = getelementptr double, double* [[X:%.*]], i64 [[INDEX]]
; BUL-NEXT:    [[SCEVGEP45:%.*]] = bitcast double* [[SCEVGEP4]] to <2 x double>*
; BUL-NEXT:    [[SCEVGEP:%.*]] = getelementptr double, double* [[Y:%.*]], i64 [[INDEX]]
; BUL-NEXT:    [[SCEVGEP1:%.*]] = bitcast double* [[SCEVGEP]] to <2 x double>*
; BUL-NEXT:    [[XVAL:%.*]] = load <2 x double>, <2 x double>* [[SCEVGEP45]], align 8
; BUL-NEXT:    [[YVAL:%.*]] = load <2 x double>, <2 x double>* [[SCEVGEP1]], align 8
; BUL-NEXT:    [[CMP:%.*]] = fcmp ogt <2 x double> [[YVAL]], [[XVAL]]
; BUL-NEXT:    [[MAX:%.*]] = select <2 x i1> [[CMP]], <2 x double> [[YVAL]], <2 x double> [[XVAL]]
; BUL-NEXT:    [[SCEVGEP2:%.*]] = getelementptr double, double* [[X]], i64 [[INDEX]]
; BUL-NEXT:    [[SCEVGEP23:%.*]] = bitcast double* [[SCEVGEP2]] to <2 x double>*
; BUL-NEXT:    store <2 x double> [[MAX]], <2 x double>* [[SCEVGEP23]], align 8
; BUL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
; BUL-NEXT:    [[DONE:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536
; BUL-NEXT:    br i1 [[DONE]], label [[EXIT:%.*]], label [[VECTOR_BODY]]
; BUL:       exit:
; BUL-NEXT:    ret void
;
; HSW-LABEL: @maxArray(
; HSW-NEXT:  entry:
; HSW-NEXT:    br label [[VECTOR_BODY:%.*]]
; HSW:       vector.body:
; HSW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; HSW-NEXT:    [[SCEVGEP4:%.*]] = getelementptr double, double* [[X:%.*]], i64 [[INDEX]]
; HSW-NEXT:    [[SCEVGEP45:%.*]] = bitcast double* [[SCEVGEP4]] to <2 x double>*
; HSW-NEXT:    [[SCEVGEP:%.*]] = getelementptr double, double* [[Y:%.*]], i64 [[INDEX]]
; HSW-NEXT:    [[SCEVGEP1:%.*]] = bitcast double* [[SCEVGEP]] to <2 x double>*
; HSW-NEXT:    [[XVAL:%.*]] = load <2 x double>, <2 x double>* [[SCEVGEP45]], align 8
; HSW-NEXT:    [[YVAL:%.*]] = load <2 x double>, <2 x double>* [[SCEVGEP1]], align 8
; HSW-NEXT:    [[CMP:%.*]] = fcmp ogt <2 x double> [[YVAL]], [[XVAL]]
; HSW-NEXT:    [[MAX:%.*]] = select <2 x i1> [[CMP]], <2 x double> [[YVAL]], <2 x double> [[XVAL]]
; HSW-NEXT:    [[SCEVGEP2:%.*]] = getelementptr double, double* [[X]], i64 [[INDEX]]
; HSW-NEXT:    [[SCEVGEP23:%.*]] = bitcast double* [[SCEVGEP2]] to <2 x double>*
; HSW-NEXT:    store <2 x double> [[MAX]], <2 x double>* [[SCEVGEP23]], align 8
; HSW-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
; HSW-NEXT:    [[DONE:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536
; HSW-NEXT:    br i1 [[DONE]], label [[EXIT:%.*]], label [[VECTOR_BODY]]
; HSW:       exit:
; HSW-NEXT:    ret void
;
; BASE-LABEL: maxArray:
; BASE:       # %bb.0: # %entry
; BASE-NEXT:    movq $-524288, %rax # imm = 0xFFF80000
; BASE-NEXT:    .p2align 4, 0x90
; BASE-NEXT:  .LBB0_1: # %vector.body
; BASE-NEXT:    # =>This Inner Loop Header: Depth=1
; BASE-NEXT:    movupd 524288(%rdi,%rax), %xmm0
; BASE-NEXT:    movupd 524288(%rsi,%rax), %xmm1
; BASE-NEXT:    maxpd %xmm0, %xmm1
; BASE-NEXT:    movupd %xmm1, 524288(%rdi,%rax)
; BASE-NEXT:    addq $16, %rax
; BASE-NEXT:    jne .LBB0_1
; BASE-NEXT:  # %bb.2: # %exit
; BASE-NEXT:    retq
; FUSE-LABEL: maxArray:
; FUSE:       # %bb.0: # %entry
; FUSE-NEXT:    xorl %eax, %eax
; FUSE-NEXT:    .p2align 4, 0x90
; FUSE-NEXT:  .LBB0_1: # %vector.body
; FUSE-NEXT:    # =>This Inner Loop Header: Depth=1
; FUSE-NEXT:    movupd (%rdi,%rax,8), %xmm0
; FUSE-NEXT:    movupd (%rsi,%rax,8), %xmm1
; FUSE-NEXT:    maxpd %xmm0, %xmm1
; FUSE-NEXT:    movupd %xmm1, (%rdi,%rax,8)
; FUSE-NEXT:    addq $2, %rax
; FUSE-NEXT:    cmpq $65536, %rax # imm = 0x10000
; FUSE-NEXT:    jne .LBB0_1
; FUSE-NEXT:  # %bb.2: # %exit
; FUSE-NEXT:    retq
entry:
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %gepx = getelementptr inbounds double, double* %x, i64 %index
  %gepy = getelementptr inbounds double, double* %y, i64 %index
  %xptr = bitcast double* %gepx to <2 x double>*
  %yptr = bitcast double* %gepy to <2 x double>*
  %xval = load <2 x double>, <2 x double>* %xptr, align 8
  %yval = load <2 x double>, <2 x double>* %yptr, align 8
  %cmp = fcmp ogt <2 x double> %yval, %xval
  %max = select <2 x i1> %cmp, <2 x double> %yval, <2 x double> %xval
  ; The store goes through a second bitcast of %gepx rather than reusing %xptr.
  %xptr_again = bitcast double* %gepx to <2 x double>*
  store <2 x double> %max, <2 x double>* %xptr_again, align 8
  %index.next = add i64 %index, 2
  %done = icmp eq i64 %index.next, 65536
  br i1 %done, label %exit, label %vector.body

exit:
  ret void
}