1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: opt < %s -loop-reduce -mcpu=btver2  -S | FileCheck %s --check-prefix=JAG
3; RUN: opt < %s -loop-reduce -mcpu=bdver2  -S | FileCheck %s --check-prefix=BUL
4; RUN: opt < %s -loop-reduce -mcpu=haswell -S | FileCheck %s --check-prefix=HSW
5
6; RUN: llc < %s                     | FileCheck %s --check-prefix=BASE
7; RUN: llc < %s -mattr=macrofusion  | FileCheck %s --check-prefix=FUSE
8; RUN: llc < %s -mattr=branchfusion | FileCheck %s --check-prefix=FUSE
9
10target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
11target triple = "x86_64-unknown-unknown"
12
13; PR35681 - https://bugs.llvm.org/show_bug.cgi?id=35681
14; If a CPU can macro-fuse a compare and branch, then we discount that
15; cost in LSR and avoid generating large offsets in each memory access.
16; This reduces code size and may improve decode throughput.
17
; @maxArray - lane-wise maximum of the doubles at %x and %y, written back to %x.
; The loop advances %index from 0 in steps of 2 until it equals 65536. Each
; iteration loads one <2 x double> from %x and one from %y (both align 8),
; keeps the %y lane wherever it compares ogt against the %x lane, and stores
; the selected vector to the same %x address.
;
; The assertion groups that follow pin how the loop is addressed:
;   - the JAG and BASE groups expect a counter that starts at -524288, steps
;     by 16 and exits at zero, with a constant 524288-byte displacement
;     (32768 x <2 x double> in the IR form) on every memory access;
;   - the BUL, HSW and FUSE groups expect a counter that starts at 0, steps
;     by 2 and is compared against 65536, with no extra displacement.
; These assertion lines are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script instead of editing by hand.
18define void @maxArray(double* noalias nocapture %x, double* noalias nocapture readonly %y) {
19; JAG-LABEL: @maxArray(
20; JAG-NEXT:  entry:
21; JAG-NEXT:    [[Y1:%.*]] = bitcast double* [[Y:%.*]] to i8*
22; JAG-NEXT:    [[X3:%.*]] = bitcast double* [[X:%.*]] to i8*
23; JAG-NEXT:    br label [[VECTOR_BODY:%.*]]
24; JAG:       vector.body:
25; JAG-NEXT:    [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], [[VECTOR_BODY]] ], [ -524288, [[ENTRY:%.*]] ]
26; JAG-NEXT:    [[UGLYGEP7:%.*]] = getelementptr i8, i8* [[X3]], i64 [[LSR_IV]]
27; JAG-NEXT:    [[UGLYGEP78:%.*]] = bitcast i8* [[UGLYGEP7]] to <2 x double>*
28; JAG-NEXT:    [[SCEVGEP9:%.*]] = getelementptr <2 x double>, <2 x double>* [[UGLYGEP78]], i64 32768
29; JAG-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, i8* [[Y1]], i64 [[LSR_IV]]
30; JAG-NEXT:    [[UGLYGEP2:%.*]] = bitcast i8* [[UGLYGEP]] to <2 x double>*
31; JAG-NEXT:    [[SCEVGEP:%.*]] = getelementptr <2 x double>, <2 x double>* [[UGLYGEP2]], i64 32768
32; JAG-NEXT:    [[XVAL:%.*]] = load <2 x double>, <2 x double>* [[SCEVGEP9]], align 8
33; JAG-NEXT:    [[YVAL:%.*]] = load <2 x double>, <2 x double>* [[SCEVGEP]], align 8
34; JAG-NEXT:    [[CMP:%.*]] = fcmp ogt <2 x double> [[YVAL]], [[XVAL]]
35; JAG-NEXT:    [[MAX:%.*]] = select <2 x i1> [[CMP]], <2 x double> [[YVAL]], <2 x double> [[XVAL]]
36; JAG-NEXT:    [[UGLYGEP4:%.*]] = getelementptr i8, i8* [[X3]], i64 [[LSR_IV]]
37; JAG-NEXT:    [[UGLYGEP45:%.*]] = bitcast i8* [[UGLYGEP4]] to <2 x double>*
38; JAG-NEXT:    [[SCEVGEP6:%.*]] = getelementptr <2 x double>, <2 x double>* [[UGLYGEP45]], i64 32768
39; JAG-NEXT:    store <2 x double> [[MAX]], <2 x double>* [[SCEVGEP6]], align 8
40; JAG-NEXT:    [[LSR_IV_NEXT]] = add nsw i64 [[LSR_IV]], 16
41; JAG-NEXT:    [[DONE:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0
42; JAG-NEXT:    br i1 [[DONE]], label [[EXIT:%.*]], label [[VECTOR_BODY]]
43; JAG:       exit:
44; JAG-NEXT:    ret void
45;
46; BUL-LABEL: @maxArray(
47; BUL-NEXT:  entry:
48; BUL-NEXT:    br label [[VECTOR_BODY:%.*]]
49; BUL:       vector.body:
50; BUL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
51; BUL-NEXT:    [[SCEVGEP4:%.*]] = getelementptr double, double* [[X:%.*]], i64 [[INDEX]]
52; BUL-NEXT:    [[SCEVGEP45:%.*]] = bitcast double* [[SCEVGEP4]] to <2 x double>*
53; BUL-NEXT:    [[SCEVGEP:%.*]] = getelementptr double, double* [[Y:%.*]], i64 [[INDEX]]
54; BUL-NEXT:    [[SCEVGEP1:%.*]] = bitcast double* [[SCEVGEP]] to <2 x double>*
55; BUL-NEXT:    [[XVAL:%.*]] = load <2 x double>, <2 x double>* [[SCEVGEP45]], align 8
56; BUL-NEXT:    [[YVAL:%.*]] = load <2 x double>, <2 x double>* [[SCEVGEP1]], align 8
57; BUL-NEXT:    [[CMP:%.*]] = fcmp ogt <2 x double> [[YVAL]], [[XVAL]]
58; BUL-NEXT:    [[MAX:%.*]] = select <2 x i1> [[CMP]], <2 x double> [[YVAL]], <2 x double> [[XVAL]]
59; BUL-NEXT:    [[SCEVGEP2:%.*]] = getelementptr double, double* [[X]], i64 [[INDEX]]
60; BUL-NEXT:    [[SCEVGEP23:%.*]] = bitcast double* [[SCEVGEP2]] to <2 x double>*
61; BUL-NEXT:    store <2 x double> [[MAX]], <2 x double>* [[SCEVGEP23]], align 8
62; BUL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
63; BUL-NEXT:    [[DONE:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536
64; BUL-NEXT:    br i1 [[DONE]], label [[EXIT:%.*]], label [[VECTOR_BODY]]
65; BUL:       exit:
66; BUL-NEXT:    ret void
67;
68; HSW-LABEL: @maxArray(
69; HSW-NEXT:  entry:
70; HSW-NEXT:    br label [[VECTOR_BODY:%.*]]
71; HSW:       vector.body:
72; HSW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
73; HSW-NEXT:    [[SCEVGEP4:%.*]] = getelementptr double, double* [[X:%.*]], i64 [[INDEX]]
74; HSW-NEXT:    [[SCEVGEP45:%.*]] = bitcast double* [[SCEVGEP4]] to <2 x double>*
75; HSW-NEXT:    [[SCEVGEP:%.*]] = getelementptr double, double* [[Y:%.*]], i64 [[INDEX]]
76; HSW-NEXT:    [[SCEVGEP1:%.*]] = bitcast double* [[SCEVGEP]] to <2 x double>*
77; HSW-NEXT:    [[XVAL:%.*]] = load <2 x double>, <2 x double>* [[SCEVGEP45]], align 8
78; HSW-NEXT:    [[YVAL:%.*]] = load <2 x double>, <2 x double>* [[SCEVGEP1]], align 8
79; HSW-NEXT:    [[CMP:%.*]] = fcmp ogt <2 x double> [[YVAL]], [[XVAL]]
80; HSW-NEXT:    [[MAX:%.*]] = select <2 x i1> [[CMP]], <2 x double> [[YVAL]], <2 x double> [[XVAL]]
81; HSW-NEXT:    [[SCEVGEP2:%.*]] = getelementptr double, double* [[X]], i64 [[INDEX]]
82; HSW-NEXT:    [[SCEVGEP23:%.*]] = bitcast double* [[SCEVGEP2]] to <2 x double>*
83; HSW-NEXT:    store <2 x double> [[MAX]], <2 x double>* [[SCEVGEP23]], align 8
84; HSW-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
85; HSW-NEXT:    [[DONE:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536
86; HSW-NEXT:    br i1 [[DONE]], label [[EXIT:%.*]], label [[VECTOR_BODY]]
87; HSW:       exit:
88; HSW-NEXT:    ret void
89;
90; BASE-LABEL: maxArray:
91; BASE:       # %bb.0: # %entry
92; BASE-NEXT:    movq $-524288, %rax # imm = 0xFFF80000
93; BASE-NEXT:    .p2align 4, 0x90
94; BASE-NEXT:  .LBB0_1: # %vector.body
95; BASE-NEXT:    # =>This Inner Loop Header: Depth=1
96; BASE-NEXT:    movupd 524288(%rdi,%rax), %xmm0
97; BASE-NEXT:    movupd 524288(%rsi,%rax), %xmm1
98; BASE-NEXT:    maxpd %xmm0, %xmm1
99; BASE-NEXT:    movupd %xmm1, 524288(%rdi,%rax)
100; BASE-NEXT:    addq $16, %rax
101; BASE-NEXT:    jne .LBB0_1
102; BASE-NEXT:  # %bb.2: # %exit
103; BASE-NEXT:    retq
104; FUSE-LABEL: maxArray:
105; FUSE:       # %bb.0: # %entry
106; FUSE-NEXT:    xorl %eax, %eax
107; FUSE-NEXT:    .p2align 4, 0x90
108; FUSE-NEXT:  .LBB0_1: # %vector.body
109; FUSE-NEXT:    # =>This Inner Loop Header: Depth=1
110; FUSE-NEXT:    movupd (%rdi,%rax,8), %xmm0
111; FUSE-NEXT:    movupd (%rsi,%rax,8), %xmm1
112; FUSE-NEXT:    maxpd %xmm0, %xmm1
113; FUSE-NEXT:    movupd %xmm1, (%rdi,%rax,8)
114; FUSE-NEXT:    addq $2, %rax
115; FUSE-NEXT:    cmpq $65536, %rax # imm = 0x10000
116; FUSE-NEXT:    jne .LBB0_1
117; FUSE-NEXT:  # %bb.2: # %exit
118; FUSE-NEXT:    retq
119entry:
120  br label %vector.body
121
122vector.body:
  ; %index is the element offset into both arrays; it is 0 on entry from
  ; %entry and takes %index.next on the back edge.
123  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  ; Address the same element position in %x and %y, then view each address
  ; as a pointer to a two-double vector.
124  %gepx = getelementptr inbounds double, double* %x, i64 %index
125  %gepy = getelementptr inbounds double, double* %y, i64 %index
126  %xptr = bitcast double* %gepx to <2 x double>*
127  %yptr = bitcast double* %gepy to <2 x double>*
128  %xval = load <2 x double>, <2 x double>* %xptr, align 8
129  %yval = load <2 x double>, <2 x double>* %yptr, align 8
  ; Per lane: take %yval where it is ordered-greater-than %xval, else %xval.
130  %cmp = fcmp ogt <2 x double> %yval, %xval
131  %max = select <2 x i1> %cmp, <2 x double> %yval, <2 x double> %xval
  ; A second cast of the same %gepx address serves as the store destination,
  ; so the result overwrites the %x elements that were just loaded.
132  %xptr_again = bitcast double* %gepx to <2 x double>*
133  store <2 x double> %max, <2 x double>* %xptr_again, align 8
  ; Step by the two doubles just processed and leave the loop once the
  ; incremented index equals 65536.
134  %index.next = add i64 %index, 2
135  %done = icmp eq i64 %index.next, 65536
136  br i1 %done, label %exit, label %vector.body
137
138exit:
139  ret void
140}
141
142