1; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ 2; RUN: -disable-output < %s | \ 3; RUN: FileCheck -check-prefix=CODE %s 4 5; REQUIRES: pollyacc 6 7target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" 8target triple = "x86_64-unknown-linux-gnu" 9 10; This test case took at some point forever to schedule, as the isl scheduler 11; seems to have problems if domain constraints appear in the dependences 12; provided to the scheduler. 13 14; /* D := alpha*A*B*C + beta*D */ 15; for (i = 0; i < _PB_NI; i++) 16; for (j = 0; j < _PB_NJ; j++) 17; { 18; tmp[i][j] = 0; 19; for (k = 0; k < _PB_NK; ++k) 20; tmp[i][j] += alpha * A[i][k] * B[k][j]; 21; } 22; for (i = 0; i < _PB_NI; i++) 23; for (j = 0; j < _PB_NL; j++) 24; { 25; D[i][j] *= beta; 26; for (k = 0; k < _PB_NJ; ++k) 27; D[i][j] += tmp[i][k] * C[k][j]; 28; } 29 30; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (4096) * (4096) * sizeof(float), cudaMemcpyHostToDevice)); 31; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (4096) * (4096) * sizeof(float), cudaMemcpyHostToDevice)); 32; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_D, MemRef_D, (4096) * (4096) * sizeof(float), cudaMemcpyHostToDevice)); 33; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_C, MemRef_C, (4096) * (4096) * sizeof(float), cudaMemcpyHostToDevice)); 34; CODE-NEXT: { 35; CODE-NEXT: dim3 k0_dimBlock(16, 32); 36; CODE-NEXT: dim3 k0_dimGrid(128, 128); 37; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_tmp, dev_MemRef_A, MemRef_alpha, dev_MemRef_B); 38; CODE-NEXT: cudaCheckKernel(); 39; CODE-NEXT: } 40 41; CODE: { 42; CODE-NEXT: dim3 k1_dimBlock(16, 32); 43; CODE-NEXT: dim3 k1_dimGrid(128, 128); 44; CODE-NEXT: kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef_tmp, dev_MemRef_D, MemRef_beta, dev_MemRef_C); 45; CODE-NEXT: cudaCheckKernel(); 46; CODE-NEXT: } 47 48; CODE: cudaCheckReturn(cudaMemcpy(MemRef_tmp, dev_MemRef_tmp, (4096) * (4096) * sizeof(float), cudaMemcpyDeviceToHost)); 49; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_D, dev_MemRef_D, (4096) * (4096) * sizeof(float), cudaMemcpyDeviceToHost)); 50 51; CODE: # kernel0 52; CODE-NEXT: for (int c2 = 0; c2 <= 127; c2 += 1) 53; CODE-NEXT: for (int c4 = 0; c4 <= 1; c4 += 1) { 54; CODE-NEXT: if (c2 == 0) 55; CODE-NEXT: Stmt_for_body6(32 * b0 + t0, 32 * b1 + t1 + 16 * c4); 56; CODE-NEXT: for (int c5 = 0; c5 <= 31; c5 += 1) 57; CODE-NEXT: Stmt_for_body11(32 * b0 + t0, 32 * b1 + t1 + 16 * c4, 32 * c2 + c5); 58; CODE-NEXT: } 59 60; CODE: # kernel1 61; CODE-NEXT: for (int c2 = 0; c2 <= 127; c2 += 1) 62; CODE-NEXT: for (int c4 = 0; c4 <= 1; c4 += 1) { 63; CODE-NEXT: if (c2 == 0) 64; CODE-NEXT: Stmt_for_body36(32 * b0 + t0, 32 * b1 + t1 + 16 * c4); 65; CODE-NEXT: for (int c5 = 0; c5 <= 31; c5 += 1) 66; CODE-NEXT: Stmt_for_body44(32 * b0 + t0, 32 * b1 + t1 + 16 * c4, 32 * c2 + c5); 67; CODE-NEXT: } 68 69 70 71; Function Attrs: argmemonly nounwind 72declare void @llvm.lifetime.start(i64, i8* nocapture) #0 73 74; Function Attrs: nounwind uwtable 75define internal void @kernel_2mm(i32 %ni, i32 %nj, i32 %nk, i32 %nl, float %alpha, float %beta, [4096 x float]* %tmp, [4096 x float]* %A, [4096 x float]* %B, [4096 x float]* %C, [4096 x float]* %D) #1 { 76entry: 77 br label %entry.split 78 79entry.split: ; preds = %entry 80 br label %for.cond4.preheader 81 82for.cond4.preheader: ; preds = %entry.split, %for.inc28 83 %indvars.iv19 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next20, %for.inc28 ] 84 br label %for.body6 85 86for.cond31.preheader: ; preds = %for.inc28 87 br label %for.cond34.preheader 88 89for.body6: ; preds = %for.cond4.preheader, %for.inc25 90 %indvars.iv16 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next17, %for.inc25 ] 91 %arrayidx8 = getelementptr inbounds [4096 x float], [4096 x float]* %tmp, i64 %indvars.iv19, i64 %indvars.iv16 92 store float 0.000000e+00, float* %arrayidx8, align 4, !tbaa !1 93 br label %for.body11 94 95for.body11: ; preds = %for.body6, %for.body11 96 %indvars.iv13 = phi i64 [ 0, %for.body6 ], [ %indvars.iv.next14, %for.body11 ] 97 %arrayidx15 = getelementptr inbounds [4096 x float], [4096 x float]* %A, i64 %indvars.iv19, i64 %indvars.iv13 98 %tmp22 = load float, float* %arrayidx15, align 4, !tbaa !1 99 %mul = fmul float %tmp22, %alpha 100 %arrayidx19 = getelementptr inbounds [4096 x float], [4096 x float]* %B, i64 %indvars.iv13, i64 %indvars.iv16 101 %tmp23 = load float, float* %arrayidx19, align 4, !tbaa !1 102 %mul20 = fmul float %mul, %tmp23 103 %arrayidx24 = getelementptr inbounds [4096 x float], [4096 x float]* %tmp, i64 %indvars.iv19, i64 %indvars.iv16 104 %tmp24 = load float, float* %arrayidx24, align 4, !tbaa !1 105 %add = fadd float %tmp24, %mul20 106 store float %add, float* %arrayidx24, align 4, !tbaa !1 107 %indvars.iv.next14 = add nuw nsw i64 %indvars.iv13, 1 108 %exitcond15 = icmp ne i64 %indvars.iv.next14, 4096 109 br i1 %exitcond15, label %for.body11, label %for.inc25 110 111for.inc25: ; preds = %for.body11 112 %indvars.iv.next17 = add nuw nsw i64 %indvars.iv16, 1 113 %exitcond18 = icmp ne i64 %indvars.iv.next17, 4096 114 br i1 %exitcond18, label %for.body6, label %for.inc28 115 116for.inc28: ; preds = %for.inc25 117 %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1 118 %exitcond21 = icmp ne i64 %indvars.iv.next20, 4096 119 br i1 %exitcond21, label %for.cond4.preheader, label %for.cond31.preheader 120 121for.cond34.preheader: ; preds = %for.cond31.preheader, %for.inc65 122 %indvars.iv10 = phi i64 [ 0, %for.cond31.preheader ], [ %indvars.iv.next11, %for.inc65 ] 123 br label %for.body36 124 125for.body36: ; preds = %for.cond34.preheader, %for.inc62 126 %indvars.iv7 = phi i64 [ 0, %for.cond34.preheader ], [ %indvars.iv.next8, %for.inc62 ] 127 %arrayidx40 = getelementptr inbounds [4096 x float], [4096 x float]* %D, i64 %indvars.iv10, i64 %indvars.iv7 128 %tmp25 = load float, float* %arrayidx40, align 4, !tbaa !1 129 %mul41 = fmul float %tmp25, %beta 130 store float %mul41, float* %arrayidx40, align 4, !tbaa !1 131 br label %for.body44 132 133for.body44: ; preds = %for.body36, %for.body44 134 %indvars.iv = phi i64 [ 0, %for.body36 ], [ %indvars.iv.next, %for.body44 ] 135 %arrayidx48 = getelementptr inbounds [4096 x float], [4096 x float]* %tmp, i64 %indvars.iv10, i64 %indvars.iv 136 %tmp26 = load float, float* %arrayidx48, align 4, !tbaa !1 137 %arrayidx52 = getelementptr inbounds [4096 x float], [4096 x float]* %C, i64 %indvars.iv, i64 %indvars.iv7 138 %tmp27 = load float, float* %arrayidx52, align 4, !tbaa !1 139 %mul53 = fmul float %tmp26, %tmp27 140 %arrayidx57 = getelementptr inbounds [4096 x float], [4096 x float]* %D, i64 %indvars.iv10, i64 %indvars.iv7 141 %tmp28 = load float, float* %arrayidx57, align 4, !tbaa !1 142 %add58 = fadd float %tmp28, %mul53 143 store float %add58, float* %arrayidx57, align 4, !tbaa !1 144 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 145 %exitcond = icmp ne i64 %indvars.iv.next, 4096 146 br i1 %exitcond, label %for.body44, label %for.inc62 147 148for.inc62: ; preds = %for.body44 149 %indvars.iv.next8 = add nuw nsw i64 %indvars.iv7, 1 150 %exitcond9 = icmp ne i64 %indvars.iv.next8, 4096 151 br i1 %exitcond9, label %for.body36, label %for.inc65 152 153for.inc65: ; preds = %for.inc62 154 %indvars.iv.next11 = add nuw nsw i64 %indvars.iv10, 1 155 %exitcond12 = icmp ne i64 %indvars.iv.next11, 4096 156 br i1 %exitcond12, label %for.cond34.preheader, label %for.end67 157 158for.end67: ; preds = %for.inc65 159 ret void 160} 161 162; Function Attrs: argmemonly nounwind 163declare void @llvm.lifetime.end(i64, i8* nocapture) #0 164 165attributes #0 = { argmemonly nounwind } 166attributes #1 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 167 168!llvm.ident = !{!0} 169 170!0 = !{!"clang version 3.9.0 (trunk 275267) (llvm/trunk 275268)"} 171!1 = !{!2, !2, i64 0} 172!2 = !{!"float", !3, i64 0} 173!3 = !{!"omnipotent char", !4, i64 0} 174!4 = !{!"Simple C/C++ TBAA"} 175