1; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
2; RUN: -disable-output < %s | \
3; RUN: FileCheck -check-prefix=CODE %s
4
5; REQUIRES: pollyacc
6
7target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
8target triple = "x86_64-unknown-linux-gnu"
9
10; This test case took at some point forever to schedule, as the isl scheduler
11; seems to have problems if domain constraints appear in the dependences
12; provided to the scheduler.
13
14;   /* D := alpha*A*B*C + beta*D */
15;   for (i = 0; i < _PB_NI; i++)
16;     for (j = 0; j < _PB_NJ; j++)
17;       {
18;   tmp[i][j] = 0;
19;   for (k = 0; k < _PB_NK; ++k)
20;     tmp[i][j] += alpha * A[i][k] * B[k][j];
21;       }
22;   for (i = 0; i < _PB_NI; i++)
23;     for (j = 0; j < _PB_NL; j++)
24;       {
25;   D[i][j] *= beta;
26;   for (k = 0; k < _PB_NJ; ++k)
27;     D[i][j] += tmp[i][k] * C[k][j];
28;       }
29
30; CODE:        cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (4096) * (4096) * sizeof(float), cudaMemcpyHostToDevice));
31; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (4096) * (4096) * sizeof(float), cudaMemcpyHostToDevice));
32; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_D, MemRef_D, (4096) * (4096) * sizeof(float), cudaMemcpyHostToDevice));
33; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_C, MemRef_C, (4096) * (4096) * sizeof(float), cudaMemcpyHostToDevice));
34; CODE-NEXT:   {
35; CODE-NEXT:     dim3 k0_dimBlock(16, 32);
36; CODE-NEXT:     dim3 k0_dimGrid(128, 128);
37; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_tmp, dev_MemRef_A, MemRef_alpha, dev_MemRef_B);
38; CODE-NEXT:     cudaCheckKernel();
39; CODE-NEXT:   }
40
41; CODE:   {
42; CODE-NEXT:     dim3 k1_dimBlock(16, 32);
43; CODE-NEXT:     dim3 k1_dimGrid(128, 128);
44; CODE-NEXT:     kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef_tmp, dev_MemRef_D, MemRef_beta, dev_MemRef_C);
45; CODE-NEXT:     cudaCheckKernel();
46; CODE-NEXT:   }
47
48; CODE:   cudaCheckReturn(cudaMemcpy(MemRef_tmp, dev_MemRef_tmp, (4096) * (4096) * sizeof(float), cudaMemcpyDeviceToHost));
49; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(MemRef_D, dev_MemRef_D, (4096) * (4096) * sizeof(float), cudaMemcpyDeviceToHost));
50
51; CODE: # kernel0
52; CODE-NEXT: for (int c2 = 0; c2 <= 127; c2 += 1)
53; CODE-NEXT:   for (int c4 = 0; c4 <= 1; c4 += 1) {
54; CODE-NEXT:     if (c2 == 0)
55; CODE-NEXT:       Stmt_for_body6(32 * b0 + t0, 32 * b1 + t1 + 16 * c4);
56; CODE-NEXT:     for (int c5 = 0; c5 <= 31; c5 += 1)
57; CODE-NEXT:       Stmt_for_body11(32 * b0 + t0, 32 * b1 + t1 + 16 * c4, 32 * c2 + c5);
58; CODE-NEXT:   }
59
60; CODE: # kernel1
61; CODE-NEXT: for (int c2 = 0; c2 <= 127; c2 += 1)
62; CODE-NEXT:   for (int c4 = 0; c4 <= 1; c4 += 1) {
63; CODE-NEXT:     if (c2 == 0)
64; CODE-NEXT:       Stmt_for_body36(32 * b0 + t0, 32 * b1 + t1 + 16 * c4);
65; CODE-NEXT:     for (int c5 = 0; c5 <= 31; c5 += 1)
66; CODE-NEXT:       Stmt_for_body44(32 * b0 + t0, 32 * b1 + t1 + 16 * c4, 32 * c2 + c5);
67; CODE-NEXT:   }
68
69
70
71; Function Attrs: argmemonly nounwind
72declare void @llvm.lifetime.start(i64, i8* nocapture) #0
73
74; Function Attrs: nounwind uwtable
75define internal void @kernel_2mm(i32 %ni, i32 %nj, i32 %nk, i32 %nl, float %alpha, float %beta, [4096 x float]* %tmp, [4096 x float]* %A, [4096 x float]* %B, [4096 x float]* %C, [4096 x float]* %D) #1 {
76entry:
77  br label %entry.split
78
79entry.split:                                      ; preds = %entry
80  br label %for.cond4.preheader
81
82for.cond4.preheader:                              ; preds = %entry.split, %for.inc28
83  %indvars.iv19 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next20, %for.inc28 ]
84  br label %for.body6
85
86for.cond31.preheader:                             ; preds = %for.inc28
87  br label %for.cond34.preheader
88
89for.body6:                                        ; preds = %for.cond4.preheader, %for.inc25
90  %indvars.iv16 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next17, %for.inc25 ]
91  %arrayidx8 = getelementptr inbounds [4096 x float], [4096 x float]* %tmp, i64 %indvars.iv19, i64 %indvars.iv16
92  store float 0.000000e+00, float* %arrayidx8, align 4, !tbaa !1
93  br label %for.body11
94
95for.body11:                                       ; preds = %for.body6, %for.body11
96  %indvars.iv13 = phi i64 [ 0, %for.body6 ], [ %indvars.iv.next14, %for.body11 ]
97  %arrayidx15 = getelementptr inbounds [4096 x float], [4096 x float]* %A, i64 %indvars.iv19, i64 %indvars.iv13
98  %tmp22 = load float, float* %arrayidx15, align 4, !tbaa !1
99  %mul = fmul float %tmp22, %alpha
100  %arrayidx19 = getelementptr inbounds [4096 x float], [4096 x float]* %B, i64 %indvars.iv13, i64 %indvars.iv16
101  %tmp23 = load float, float* %arrayidx19, align 4, !tbaa !1
102  %mul20 = fmul float %mul, %tmp23
103  %arrayidx24 = getelementptr inbounds [4096 x float], [4096 x float]* %tmp, i64 %indvars.iv19, i64 %indvars.iv16
104  %tmp24 = load float, float* %arrayidx24, align 4, !tbaa !1
105  %add = fadd float %tmp24, %mul20
106  store float %add, float* %arrayidx24, align 4, !tbaa !1
107  %indvars.iv.next14 = add nuw nsw i64 %indvars.iv13, 1
108  %exitcond15 = icmp ne i64 %indvars.iv.next14, 4096
109  br i1 %exitcond15, label %for.body11, label %for.inc25
110
111for.inc25:                                        ; preds = %for.body11
112  %indvars.iv.next17 = add nuw nsw i64 %indvars.iv16, 1
113  %exitcond18 = icmp ne i64 %indvars.iv.next17, 4096
114  br i1 %exitcond18, label %for.body6, label %for.inc28
115
116for.inc28:                                        ; preds = %for.inc25
117  %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1
118  %exitcond21 = icmp ne i64 %indvars.iv.next20, 4096
119  br i1 %exitcond21, label %for.cond4.preheader, label %for.cond31.preheader
120
121for.cond34.preheader:                             ; preds = %for.cond31.preheader, %for.inc65
122  %indvars.iv10 = phi i64 [ 0, %for.cond31.preheader ], [ %indvars.iv.next11, %for.inc65 ]
123  br label %for.body36
124
125for.body36:                                       ; preds = %for.cond34.preheader, %for.inc62
126  %indvars.iv7 = phi i64 [ 0, %for.cond34.preheader ], [ %indvars.iv.next8, %for.inc62 ]
127  %arrayidx40 = getelementptr inbounds [4096 x float], [4096 x float]* %D, i64 %indvars.iv10, i64 %indvars.iv7
128  %tmp25 = load float, float* %arrayidx40, align 4, !tbaa !1
129  %mul41 = fmul float %tmp25, %beta
130  store float %mul41, float* %arrayidx40, align 4, !tbaa !1
131  br label %for.body44
132
133for.body44:                                       ; preds = %for.body36, %for.body44
134  %indvars.iv = phi i64 [ 0, %for.body36 ], [ %indvars.iv.next, %for.body44 ]
135  %arrayidx48 = getelementptr inbounds [4096 x float], [4096 x float]* %tmp, i64 %indvars.iv10, i64 %indvars.iv
136  %tmp26 = load float, float* %arrayidx48, align 4, !tbaa !1
137  %arrayidx52 = getelementptr inbounds [4096 x float], [4096 x float]* %C, i64 %indvars.iv, i64 %indvars.iv7
138  %tmp27 = load float, float* %arrayidx52, align 4, !tbaa !1
139  %mul53 = fmul float %tmp26, %tmp27
140  %arrayidx57 = getelementptr inbounds [4096 x float], [4096 x float]* %D, i64 %indvars.iv10, i64 %indvars.iv7
141  %tmp28 = load float, float* %arrayidx57, align 4, !tbaa !1
142  %add58 = fadd float %tmp28, %mul53
143  store float %add58, float* %arrayidx57, align 4, !tbaa !1
144  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
145  %exitcond = icmp ne i64 %indvars.iv.next, 4096
146  br i1 %exitcond, label %for.body44, label %for.inc62
147
148for.inc62:                                        ; preds = %for.body44
149  %indvars.iv.next8 = add nuw nsw i64 %indvars.iv7, 1
150  %exitcond9 = icmp ne i64 %indvars.iv.next8, 4096
151  br i1 %exitcond9, label %for.body36, label %for.inc65
152
153for.inc65:                                        ; preds = %for.inc62
154  %indvars.iv.next11 = add nuw nsw i64 %indvars.iv10, 1
155  %exitcond12 = icmp ne i64 %indvars.iv.next11, 4096
156  br i1 %exitcond12, label %for.cond34.preheader, label %for.end67
157
158for.end67:                                        ; preds = %for.inc65
159  ret void
160}
161
162; Function Attrs: argmemonly nounwind
163declare void @llvm.lifetime.end(i64, i8* nocapture) #0
164
165attributes #0 = { argmemonly nounwind }
166attributes #1 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
167
168!llvm.ident = !{!0}
169
170!0 = !{!"clang version 3.9.0 (trunk 275267) (llvm/trunk 275268)"}
171!1 = !{!2, !2, i64 0}
172!2 = !{!"float", !3, i64 0}
173!3 = !{!"omnipotent char", !4, i64 0}
174!4 = !{!"Simple C/C++ TBAA"}
175