1; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
2; RUN: -disable-output < %s | \
3; RUN: FileCheck -check-prefix=CODE %s
4
5; RUN: opt %loadPolly -polly-codegen-ppcg \
6; RUN: -S < %s | \
7; RUN: FileCheck -check-prefix=IR %s
8
9; RUN: opt %loadPolly -polly-codegen-ppcg \
10; RUN: -disable-output -polly-acc-dump-kernel-ir < %s | \
11; RUN: FileCheck -check-prefix=KERNEL %s
12
13; REQUIRES: pollyacc,nvptx
14
15target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
16
17; KERNEL: define ptx_kernel void @kernel_0(i8* %MemRef_A, float %MemRef_b)
18
19; CODE: Code
20; CODE-NEXT: ====
21; CODE-NEXT: # host
22; CODE-NEXT: {
23; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(float), cudaMemcpyHostToDevice));
24; CODE-NEXT:   {
25; CODE-NEXT:     dim3 k0_dimBlock(32);
26; CODE-NEXT:     dim3 k0_dimGrid(32);
27; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, MemRef_b);
28; CODE-NEXT:     cudaCheckKernel();
29; CODE-NEXT:   }
30
31; CODE:   cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(float), cudaMemcpyDeviceToHost));
32; CODE-NEXT: }
33
34; CODE: # kernel0
35; CODE-NEXT: Stmt_bb2(32 * b0 + t0);
36
37;    void foo(float A[], float b) {
38;      for (long i = 0; i < 1024; i++)
39;        A[i] += b;
40;    }
41;
42target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
43
; Kernel under test: for (i = 0; i < 1024; i++) A[i] += b, with float elements.
; NOTE(review): value/block names (%A, %b, bb2) are matched by the CHECK lines
; above (MemRef_A, MemRef_b, Stmt_bb2) — do not rename.
define void @float(float* %A, float %b) {
bb:
  br label %bb1

bb1:                                              ; preds = %bb5, %bb
  ; Canonical induction variable, counting 0 .. 1023.
  %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
  %exitcond = icmp ne i64 %i.0, 1024
  br i1 %exitcond, label %bb2, label %bb7

bb2:                                              ; preds = %bb1
  ; Loop body: A[i] = A[i] + b (this becomes Stmt_bb2 in the generated kernel).
  %tmp = getelementptr inbounds float, float* %A, i64 %i.0
  %tmp3 = load float, float* %tmp, align 4
  %tmp4 = fadd float %tmp3, %b
  store float %tmp4, float* %tmp, align 4
  br label %bb5

bb5:                                              ; preds = %bb2
  %tmp6 = add nuw nsw i64 %i.0, 1
  br label %bb1

bb7:                                              ; preds = %bb1
  ret void
}
67
68; KERNEL: define ptx_kernel void @kernel_0(i8* %MemRef_A, double %MemRef_b)
69; KERNEL-NEXT: entry:
70; KERNEL-NEXT:   %b.s2a = alloca double
71; KERNEL-NEXT:   store double %MemRef_b, double* %b.s2a
72
73; CODE: Code
74; CODE-NEXT: ====
75; CODE-NEXT: # host
76; CODE-NEXT: {
77; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(double), cudaMemcpyHostToDevice));
78; CODE-NEXT:   {
79; CODE-NEXT:     dim3 k0_dimBlock(32);
80; CODE-NEXT:     dim3 k0_dimGrid(32);
81; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, MemRef_b);
82; CODE-NEXT:     cudaCheckKernel();
83; CODE-NEXT:   }
84
85; CODE:   cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(double), cudaMemcpyDeviceToHost));
86; CODE-NEXT: }
87
88; CODE: # kernel0
89; CODE-NEXT: Stmt_bb2(32 * b0 + t0);
90
91;    void foo(double A[], double b) {
92;      for (long i = 0; i < 1024; i++)
93;        A[i] += b;
94;    }
95;
96target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
97
; Same loop as @float but with double elements: A[i] += b for i = 0 .. 1023.
; The KERNEL checks above expect %MemRef_b to arrive as a double kernel argument.
define void @double(double* %A, double %b) {
bb:
  br label %bb1

bb1:                                              ; preds = %bb5, %bb
  ; Canonical induction variable, counting 0 .. 1023.
  %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
  %exitcond = icmp ne i64 %i.0, 1024
  br i1 %exitcond, label %bb2, label %bb7

bb2:                                              ; preds = %bb1
  ; Loop body: A[i] = A[i] + b.
  %tmp = getelementptr inbounds double, double* %A, i64 %i.0
  %tmp3 = load double, double* %tmp, align 4
  %tmp4 = fadd double %tmp3, %b
  store double %tmp4, double* %tmp, align 4
  br label %bb5

bb5:                                              ; preds = %bb2
  %tmp6 = add nuw nsw i64 %i.0, 1
  br label %bb1

bb7:                                              ; preds = %bb1
  ret void
}
121
122; CODE: Code
123; CODE-NEXT: ====
124; CODE-NEXT: # host
125; CODE-NEXT: {
126; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i1), cudaMemcpyHostToDevice));
127; CODE-NEXT:   {
128; CODE-NEXT:     dim3 k0_dimBlock(32);
129; CODE-NEXT:     dim3 k0_dimGrid(32);
130; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A);
131; CODE-NEXT:     cudaCheckKernel();
132; CODE-NEXT:   }
133
134; CODE:   cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i1), cudaMemcpyDeviceToHost));
135; CODE-NEXT: }
136
137; CODE: # kernel0
138; CODE-NEXT: Stmt_bb2(32 * b0 + t0);
139
140;    void foo(i1 A[], i1 b) {
141;      for (long i = 0; i < 1024; i++)
142;        A[i] += b;
143;    }
144;
145target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
146
; Same loop with a 1-bit integer element type (non-byte-sized; the CODE checks
; above expect host code with sizeof(i1) copies for this array).
define void @i1(i1* %A, i1 %b) {
bb:
  br label %bb1

bb1:                                              ; preds = %bb5, %bb
  ; Canonical induction variable, counting 0 .. 1023.
  %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
  %exitcond = icmp ne i64 %i.0, 1024
  br i1 %exitcond, label %bb2, label %bb7

bb2:                                              ; preds = %bb1
  ; Loop body: A[i] = A[i] + b (integer add on i1).
  %tmp = getelementptr inbounds i1, i1* %A, i64 %i.0
  %tmp3 = load i1, i1* %tmp, align 4
  %tmp4 = add i1 %tmp3, %b
  store i1 %tmp4, i1* %tmp, align 4
  br label %bb5

bb5:                                              ; preds = %bb2
  %tmp6 = add nuw nsw i64 %i.0, 1
  br label %bb1

bb7:                                              ; preds = %bb1
  ret void
}
170
171; CODE: Code
172; CODE-NEXT: ====
173; CODE-NEXT: # host
174; CODE-NEXT: {
175; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i3), cudaMemcpyHostToDevice));
176; CODE-NEXT:   {
177; CODE-NEXT:     dim3 k0_dimBlock(32);
178; CODE-NEXT:     dim3 k0_dimGrid(32);
179; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A);
180; CODE-NEXT:     cudaCheckKernel();
181; CODE-NEXT:   }
182
183; CODE:   cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i3), cudaMemcpyDeviceToHost));
184; CODE-NEXT: }
185
186; CODE: # kernel0
187; CODE-NEXT: Stmt_bb2(32 * b0 + t0);
188
189;    void foo(i3 A[], i3 b) {
190;      for (long i = 0; i < 1024; i++)
191;        A[i] += b;
192;    }
193;
194target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
195
; Same loop with an odd 3-bit integer element type, exercising code generation
; for non-power-of-two integer widths (sizeof(i3) in the CODE checks above).
define void @i3(i3* %A, i3 %b) {
bb:
  br label %bb1

bb1:                                              ; preds = %bb5, %bb
  ; Canonical induction variable, counting 0 .. 1023.
  %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
  %exitcond = icmp ne i64 %i.0, 1024
  br i1 %exitcond, label %bb2, label %bb7

bb2:                                              ; preds = %bb1
  ; Loop body: A[i] = A[i] + b (integer add on i3).
  %tmp = getelementptr inbounds i3, i3* %A, i64 %i.0
  %tmp3 = load i3, i3* %tmp, align 4
  %tmp4 = add i3 %tmp3, %b
  store i3 %tmp4, i3* %tmp, align 4
  br label %bb5

bb5:                                              ; preds = %bb2
  %tmp6 = add nuw nsw i64 %i.0, 1
  br label %bb1

bb7:                                              ; preds = %bb1
  ret void
}
219
220; CODE: Code
221; CODE-NEXT: ====
222; CODE-NEXT: # host
223; CODE-NEXT: {
224; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i8), cudaMemcpyHostToDevice));
225; CODE-NEXT:   {
226; CODE-NEXT:     dim3 k0_dimBlock(32);
227; CODE-NEXT:     dim3 k0_dimGrid(32);
228; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A);
229; CODE-NEXT:     cudaCheckKernel();
230; CODE-NEXT:   }
231
232; CODE:   cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i8), cudaMemcpyDeviceToHost));
233; CODE-NEXT: }
234
235; CODE: # kernel0
236; CODE-NEXT: Stmt_bb2(32 * b0 + t0);
237
;    void foo(i8 A[], i8 b) {
239;      for (long i = 0; i < 1024; i++)
240;        A[i] += b;
241;    }
242;
243target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
244
; Same loop with i8 elements. This is the function the IR-prefix checks below
; key on (IR-LABEL: @i8): they verify how the device pointer and the scalar %b
; are stored into the polly_launch_0 parameter array.
define void @i8(i8* %A, i8 %b) {
bb:
  br label %bb1

bb1:                                              ; preds = %bb5, %bb
  ; Canonical induction variable, counting 0 .. 1023.
  %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
  %exitcond = icmp ne i64 %i.0, 1024
  br i1 %exitcond, label %bb2, label %bb7

bb2:                                              ; preds = %bb1
  ; Loop body: A[i] = A[i] + b (integer add on i8).
  %tmp = getelementptr inbounds i8, i8* %A, i64 %i.0
  %tmp3 = load i8, i8* %tmp, align 4
  %tmp4 = add i8 %tmp3, %b
  store i8 %tmp4, i8* %tmp, align 4
  br label %bb5

bb5:                                              ; preds = %bb2
  %tmp6 = add nuw nsw i64 %i.0, 1
  br label %bb1

bb7:                                              ; preds = %bb1
  ret void
}
268
269; IR-LABEL: @i8
270
271; IR: [[REGA:%.+]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_A)
272; IR-NEXT: [[REGB:%.+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0
; IR-NEXT: store i8* [[REGA]], i8** %polly_launch_0_param_0
274; IR-NEXT: [[REGC:%.+]] = bitcast i8** %polly_launch_0_param_0 to i8*
275; IR-NEXT: store i8* [[REGC]], i8** [[REGB]]
276; IR-NEXT: store i8 %b, i8* %polly_launch_0_param_1
277; IR-NEXT: [[REGD:%.+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 1
278; IR-NEXT: store i8* %polly_launch_0_param_1, i8** [[REGD]]
279
280; CODE: Code
281; CODE-NEXT: ====
282; CODE-NEXT: # host
283; CODE-NEXT: {
284; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i32), cudaMemcpyHostToDevice));
285; CODE-NEXT:   {
286; CODE-NEXT:     dim3 k0_dimBlock(32);
287; CODE-NEXT:     dim3 k0_dimGrid(32);
288; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A);
289; CODE-NEXT:     cudaCheckKernel();
290; CODE-NEXT:   }
291
292; CODE:   cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i32), cudaMemcpyDeviceToHost));
293; CODE-NEXT: }
294
295; CODE: # kernel0
296; CODE-NEXT: Stmt_bb2(32 * b0 + t0);
297
298;    void foo(i32 A[], i32 b) {
299;      for (long i = 0; i < 1024; i++)
300;        A[i] += b;
301;    }
302;
303target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
304
; Same loop with i32 elements: A[i] += b for i = 0 .. 1023.
define void @i32(i32* %A, i32 %b) {
bb:
  br label %bb1

bb1:                                              ; preds = %bb5, %bb
  ; Canonical induction variable, counting 0 .. 1023.
  %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
  %exitcond = icmp ne i64 %i.0, 1024
  br i1 %exitcond, label %bb2, label %bb7

bb2:                                              ; preds = %bb1
  ; Loop body: A[i] = A[i] + b (integer add on i32).
  %tmp = getelementptr inbounds i32, i32* %A, i64 %i.0
  %tmp3 = load i32, i32* %tmp, align 4
  %tmp4 = add i32 %tmp3, %b
  store i32 %tmp4, i32* %tmp, align 4
  br label %bb5

bb5:                                              ; preds = %bb2
  %tmp6 = add nuw nsw i64 %i.0, 1
  br label %bb1

bb7:                                              ; preds = %bb1
  ret void
}
328
329; CODE: Code
330; CODE-NEXT: ====
331; CODE-NEXT: # host
332; CODE-NEXT: {
333; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i60), cudaMemcpyHostToDevice));
334; CODE-NEXT:   {
335; CODE-NEXT:     dim3 k0_dimBlock(32);
336; CODE-NEXT:     dim3 k0_dimGrid(32);
337; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A);
338; CODE-NEXT:     cudaCheckKernel();
339; CODE-NEXT:   }
340
341; CODE:   cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i60), cudaMemcpyDeviceToHost));
342; CODE-NEXT: }
343
344; CODE: # kernel0
345; CODE-NEXT: Stmt_bb2(32 * b0 + t0);
346
347;    void foo(i60 A[], i60 b) {
348;      for (long i = 0; i < 1024; i++)
349;        A[i] += b;
350;    }
351;
352target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
353
; Same loop with an unusual 60-bit integer element type, exercising code
; generation for wide non-power-of-two integer widths (sizeof(i60) above).
define void @i60(i60* %A, i60 %b) {
bb:
  br label %bb1

bb1:                                              ; preds = %bb5, %bb
  ; Canonical induction variable, counting 0 .. 1023.
  %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
  %exitcond = icmp ne i64 %i.0, 1024
  br i1 %exitcond, label %bb2, label %bb7

bb2:                                              ; preds = %bb1
  ; Loop body: A[i] = A[i] + b (integer add on i60).
  %tmp = getelementptr inbounds i60, i60* %A, i64 %i.0
  %tmp3 = load i60, i60* %tmp, align 4
  %tmp4 = add i60 %tmp3, %b
  store i60 %tmp4, i60* %tmp, align 4
  br label %bb5

bb5:                                              ; preds = %bb2
  %tmp6 = add nuw nsw i64 %i.0, 1
  br label %bb1

bb7:                                              ; preds = %bb1
  ret void
}
377
378; CODE: Code
379; CODE-NEXT: ====
380; CODE-NEXT: # host
381; CODE-NEXT: {
382; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i64), cudaMemcpyHostToDevice));
383; CODE-NEXT:   {
384; CODE-NEXT:     dim3 k0_dimBlock(32);
385; CODE-NEXT:     dim3 k0_dimGrid(32);
386; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A);
387; CODE-NEXT:     cudaCheckKernel();
388; CODE-NEXT:   }
389
390; CODE:   cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i64), cudaMemcpyDeviceToHost));
391; CODE-NEXT: }
392
393; CODE: # kernel0
394; CODE-NEXT: Stmt_bb2(32 * b0 + t0);
395
396;    void foo(i64 A[], i64 b) {
397;      for (long i = 0; i < 1024; i++)
398;        A[i] += b;
399;    }
400;
401target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
402
; Same loop with i64 elements: A[i] += b for i = 0 .. 1023.
define void @i64(i64* %A, i64 %b) {
bb:
  br label %bb1

bb1:                                              ; preds = %bb5, %bb
  ; Canonical induction variable, counting 0 .. 1023.
  %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
  %exitcond = icmp ne i64 %i.0, 1024
  br i1 %exitcond, label %bb2, label %bb7

bb2:                                              ; preds = %bb1
  ; Loop body: A[i] = A[i] + b (integer add on i64).
  %tmp = getelementptr inbounds i64, i64* %A, i64 %i.0
  %tmp3 = load i64, i64* %tmp, align 4
  %tmp4 = add i64 %tmp3, %b
  store i64 %tmp4, i64* %tmp, align 4
  br label %bb5

bb5:                                              ; preds = %bb2
  %tmp6 = add nuw nsw i64 %i.0, 1
  br label %bb1

bb7:                                              ; preds = %bb1
  ret void
}
426