1; RUN: opt < %s  -O3 -mcpu=knl -S | FileCheck %s -check-prefix=AVX512
2
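3; NOTE: no RUN line in this file enables the AVX1 prefix, so FileCheck never evaluates the directive below.
;AVX1-NOT: llvm.masked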
4
; Module target description: little-endian data layout with ELF-style name
; mangling ("e-m:e"), and an x86-64 triple.
; NOTE(review): the triple spells "pc_linux" with an underscore rather than the
; usual "pc-linux" -- confirm this is intentional before changing it, since the
; RUN line's results depend on how the target is recognized.
5target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
6target triple = "x86_64-pc_linux"
7
8; The source code (SIZE corresponds to the constant 4096 used as the loop bound in the IR below):
9;
10;void foo1(float * __restrict__ in, float * __restrict__ out, int * __restrict__ trigger, int * __restrict__ index) {
11;
12;  for (int i=0; i < SIZE; ++i) {
13;    if (trigger[i] > 0) {
14;      out[i] = in[index[i]] + (float) 0.5;
15;    }
16;  }
17;}
18
19;AVX512-LABEL: @foo1
20;AVX512: llvm.masked.load.v16i32
21;AVX512: llvm.masked.gather.v16f32
22;AVX512: llvm.masked.store.v16f32
23;AVX512: ret void
24
25; @foo1: scalar IR for the C loop above. This definition carries no function attributes.
; All four pointer parameters are noalias. Every local (including the loop
; counter) lives in an alloca and is reloaded on each use. The CHECK lines
; preceding this function expect opt's output to contain a masked load of
; <16 x i32>, a masked gather of <16 x float> and a masked store of <16 x float>.
26define void @foo1(float* noalias %in, float* noalias %out, i32* noalias %trigger, i32* noalias %index) {
27entry:
  ; One stack slot per parameter, plus one for the loop counter i.
28  %in.addr = alloca float*, align 8
29  %out.addr = alloca float*, align 8
30  %trigger.addr = alloca i32*, align 8
31  %index.addr = alloca i32*, align 8
32  %i = alloca i32, align 4
  ; Spill the incoming arguments and initialise i to 0.
33  store float* %in, float** %in.addr, align 8
34  store float* %out, float** %out.addr, align 8
35  store i32* %trigger, i32** %trigger.addr, align 8
36  store i32* %index, i32** %index.addr, align 8
37  store i32 0, i32* %i, align 4
38  br label %for.cond
39
40for.cond:                                         ; preds = %for.inc, %entry
  ; Loop test: continue while i < 4096 (signed compare).
41  %0 = load i32, i32* %i, align 4
42  %cmp = icmp slt i32 %0, 4096
43  br i1 %cmp, label %for.body, label %for.end
44
45for.body:                                         ; preds = %for.cond
  ; Load trigger[i]; the guarded work in if.then runs only when it is > 0.
46  %1 = load i32, i32* %i, align 4
47  %idxprom = sext i32 %1 to i64
48  %2 = load i32*, i32** %trigger.addr, align 8
49  %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
50  %3 = load i32, i32* %arrayidx, align 4
51  %cmp1 = icmp sgt i32 %3, 0
52  br i1 %cmp1, label %if.then, label %if.end
53
54if.then:                                          ; preds = %for.body
  ; %6 = index[i]
55  %4 = load i32, i32* %i, align 4
56  %idxprom2 = sext i32 %4 to i64
57  %5 = load i32*, i32** %index.addr, align 8
58  %arrayidx3 = getelementptr inbounds i32, i32* %5, i64 %idxprom2
59  %6 = load i32, i32* %arrayidx3, align 4
  ; %8 = in[index[i]] -- the indirect (data-dependent) read.
60  %idxprom4 = sext i32 %6 to i64
61  %7 = load float*, float** %in.addr, align 8
62  %arrayidx5 = getelementptr inbounds float, float* %7, i64 %idxprom4
63  %8 = load float, float* %arrayidx5, align 4
64  %add = fadd float %8, 5.000000e-01
  ; out[i] = in[index[i]] + 0.5
65  %9 = load i32, i32* %i, align 4
66  %idxprom6 = sext i32 %9 to i64
67  %10 = load float*, float** %out.addr, align 8
68  %arrayidx7 = getelementptr inbounds float, float* %10, i64 %idxprom6
69  store float %add, float* %arrayidx7, align 4
70  br label %if.end
71
72if.end:                                           ; preds = %if.then, %for.body
73  br label %for.inc
74
75for.inc:                                          ; preds = %if.end
  ; i = i + 1 (flagged nsw), then back to the loop test.
76  %11 = load i32, i32* %i, align 4
77  %inc = add nsw i32 %11, 1
78  store i32 %inc, i32* %i, align 4
79  br label %for.cond
80
81for.end:                                          ; preds = %for.cond
82  ret void
83}
84
85; The source code (the IR for @foo2 below additionally takes a fourth, unused %index argument)
86;void foo2 (In * __restrict__ in, float * __restrict__ out, int * __restrict__ trigger) {
87;
88;  for (int i=0; i<SIZE; ++i) {
89;    if (trigger[i] > 0) {
90;      out[i] = in[i].b + (float) 0.5;
91;    }
92;  }
93;}
94
; Element type of the %in arrays passed to @foo2 and @foo3: two floats.
; Both functions read field index 1, which the C comments call `b`.
95%struct.In = type { float, float }
96
97;AVX512-LABEL: @foo2
98;AVX512: getelementptr inbounds %struct.In, %struct.In* %in, <16 x i64> %{{.*}}, i32 1
99;AVX512: llvm.masked.gather.v16f32
100;AVX512: llvm.masked.store.v16f32
101;AVX512: ret void
; @foo2: scalar IR for the C loop above. For each i in [0, 4096) with
; trigger[i] > 0 it stores in[i].b + 0.5 to out[i]. Reading field 1 of
; consecutive two-float structs is a strided access; the CHECK lines preceding
; this function expect a vector GEP on %in, a masked gather of <16 x float>
; and a masked store of <16 x float>.
; %index is spilled to a stack slot in the entry block but never loaded back.
; NOTE(review): the definition references attribute group #0, but no
; `attributes #0 = {...}` line is visible in this view -- confirm it is
; defined elsewhere or intentionally empty.
102define void @foo2(%struct.In* noalias %in, float* noalias %out, i32* noalias %trigger, i32* noalias %index) #0 {
103entry:
  ; One stack slot per parameter, plus one for the loop counter i.
104  %in.addr = alloca %struct.In*, align 8
105  %out.addr = alloca float*, align 8
106  %trigger.addr = alloca i32*, align 8
107  %index.addr = alloca i32*, align 8
108  %i = alloca i32, align 4
  ; Spill the incoming arguments and initialise i to 0.
109  store %struct.In* %in, %struct.In** %in.addr, align 8
110  store float* %out, float** %out.addr, align 8
111  store i32* %trigger, i32** %trigger.addr, align 8
112  store i32* %index, i32** %index.addr, align 8
113  store i32 0, i32* %i, align 4
114  br label %for.cond
115
116for.cond:                                         ; preds = %for.inc, %entry
  ; Loop test: continue while i < 4096 (signed compare).
117  %0 = load i32, i32* %i, align 4
118  %cmp = icmp slt i32 %0, 4096
119  br i1 %cmp, label %for.body, label %for.end
120
121for.body:                                         ; preds = %for.cond
  ; Load trigger[i]; the guarded work in if.then runs only when it is > 0.
122  %1 = load i32, i32* %i, align 4
123  %idxprom = sext i32 %1 to i64
124  %2 = load i32*, i32** %trigger.addr, align 8
125  %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
126  %3 = load i32, i32* %arrayidx, align 4
127  %cmp1 = icmp sgt i32 %3, 0
128  br i1 %cmp1, label %if.then, label %if.end
129
130if.then:                                          ; preds = %for.body
  ; %b = &in[i].b (element i, then field index 1); %6 = in[i].b
131  %4 = load i32, i32* %i, align 4
132  %idxprom2 = sext i32 %4 to i64
133  %5 = load %struct.In*, %struct.In** %in.addr, align 8
134  %arrayidx3 = getelementptr inbounds %struct.In, %struct.In* %5, i64 %idxprom2
135  %b = getelementptr inbounds %struct.In, %struct.In* %arrayidx3, i32 0, i32 1
136  %6 = load float, float* %b, align 4
137  %add = fadd float %6, 5.000000e-01
  ; out[i] = in[i].b + 0.5
138  %7 = load i32, i32* %i, align 4
139  %idxprom4 = sext i32 %7 to i64
140  %8 = load float*, float** %out.addr, align 8
141  %arrayidx5 = getelementptr inbounds float, float* %8, i64 %idxprom4
142  store float %add, float* %arrayidx5, align 4
143  br label %if.end
144
145if.end:                                           ; preds = %if.then, %for.body
146  br label %for.inc
147
148for.inc:                                          ; preds = %if.end
  ; i = i + 1 (flagged nsw), then back to the loop test.
149  %9 = load i32, i32* %i, align 4
150  %inc = add nsw i32 %9, 1
151  store i32 %inc, i32* %i, align 4
152  br label %for.cond
153
154for.end:                                          ; preds = %for.cond
155  ret void
156}
157
158; The source code
159;struct Out {
160;  float a;
161;  float b;
162;};
163;void foo3 (In * __restrict__ in, Out * __restrict__ out, int * __restrict__ trigger) {
164;
165;  for (int i=0; i<SIZE; ++i) {
166;    if (trigger[i] > 0) {
167;      out[i].b = in[i].b + (float) 0.5;
168;    }
169;  }
170;}
171
172;AVX512-LABEL: @foo3
173;AVX512: getelementptr inbounds %struct.In, %struct.In* %in, <16 x i64> %{{.*}}, i32 1
174;AVX512: llvm.masked.gather.v16f32
175;AVX512: fadd <16 x float>
176;AVX512: getelementptr inbounds %struct.Out, %struct.Out* %out, <16 x i64> %{{.*}}, i32 1
177;AVX512: llvm.masked.scatter.v16f32
178;AVX512: ret void
179
; Element type of the %out array passed to @foo3: two floats (`a`, `b` in the
; C comment above). @foo3 writes field index 1.
180%struct.Out = type { float, float }
181
; @foo3: scalar IR for the C loop above. For each i in [0, 4096) with
; trigger[i] > 0 it stores in[i].b + 0.5 to out[i].b. Both the read and the
; write touch field 1 of consecutive two-float structs; the CHECK lines
; preceding this function expect a masked gather, a <16 x float> fadd and a
; masked scatter in opt's output.
182define void @foo3(%struct.In* noalias %in, %struct.Out* noalias %out, i32* noalias %trigger) {
183entry:
  ; One stack slot per parameter, plus one for the loop counter i.
184  %in.addr = alloca %struct.In*, align 8
185  %out.addr = alloca %struct.Out*, align 8
186  %trigger.addr = alloca i32*, align 8
187  %i = alloca i32, align 4
  ; Spill the incoming arguments and initialise i to 0.
188  store %struct.In* %in, %struct.In** %in.addr, align 8
189  store %struct.Out* %out, %struct.Out** %out.addr, align 8
190  store i32* %trigger, i32** %trigger.addr, align 8
191  store i32 0, i32* %i, align 4
192  br label %for.cond
193
194for.cond:                                         ; preds = %for.inc, %entry
  ; Loop test: continue while i < 4096 (signed compare).
195  %0 = load i32, i32* %i, align 4
196  %cmp = icmp slt i32 %0, 4096
197  br i1 %cmp, label %for.body, label %for.end
198
199for.body:                                         ; preds = %for.cond
  ; Load trigger[i]; the guarded work in if.then runs only when it is > 0.
200  %1 = load i32, i32* %i, align 4
201  %idxprom = sext i32 %1 to i64
202  %2 = load i32*, i32** %trigger.addr, align 8
203  %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
204  %3 = load i32, i32* %arrayidx, align 4
205  %cmp1 = icmp sgt i32 %3, 0
206  br i1 %cmp1, label %if.then, label %if.end
207
208if.then:                                          ; preds = %for.body
  ; %b = &in[i].b (element i, then field index 1); %6 = in[i].b
209  %4 = load i32, i32* %i, align 4
210  %idxprom2 = sext i32 %4 to i64
211  %5 = load %struct.In*, %struct.In** %in.addr, align 8
212  %arrayidx3 = getelementptr inbounds %struct.In, %struct.In* %5, i64 %idxprom2
213  %b = getelementptr inbounds %struct.In, %struct.In* %arrayidx3, i32 0, i32 1
214  %6 = load float, float* %b, align 4
215  %add = fadd float %6, 5.000000e-01
  ; %b6 = &out[i].b; out[i].b = in[i].b + 0.5
216  %7 = load i32, i32* %i, align 4
217  %idxprom4 = sext i32 %7 to i64
218  %8 = load %struct.Out*, %struct.Out** %out.addr, align 8
219  %arrayidx5 = getelementptr inbounds %struct.Out, %struct.Out* %8, i64 %idxprom4
220  %b6 = getelementptr inbounds %struct.Out, %struct.Out* %arrayidx5, i32 0, i32 1
221  store float %add, float* %b6, align 4
222  br label %if.end
223
224if.end:                                           ; preds = %if.then, %for.body
225  br label %for.inc
226
227for.inc:                                          ; preds = %if.end
  ; i = i + 1 (flagged nsw), then back to the loop test.
228  %9 = load i32, i32* %i, align 4
229  %inc = add nsw i32 %9, 1
230  store i32 %inc, i32* %i, align 4
231  br label %for.cond
232
233for.end:                                          ; preds = %for.cond
234  ret void
235}
; Declaration of the masked-scatter intrinsic for 16 floats. Operands, in
; order: the <16 x float> values, the <16 x float*> destinations, an i32
; alignment and the <16 x i1> lane mask.
; No function visible here calls it directly; the CHECK lines for @foo3 look
; for a scatter call in opt's output instead.
236declare void @llvm.masked.scatter.v16f32(<16 x float>, <16 x float*>, i32, <16 x i1>)
237