; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
; RUN: -disable-output < %s | \
; RUN: FileCheck -check-prefix=CODE %s

; RUN: opt %loadPolly -polly-codegen-ppcg \
; RUN: -S < %s | \
; RUN: FileCheck -check-prefix=IR %s

; RUN: opt %loadPolly -polly-codegen-ppcg \
; RUN: -disable-output -polly-acc-dump-kernel-ir < %s | \
; RUN: FileCheck -check-prefix=KERNEL %s

; REQUIRES: pollyacc,nvptx

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

; KERNEL: define ptx_kernel void @kernel_0(i8* %MemRef_A, float %MemRef_b)

; CODE: Code
; CODE-NEXT: ====
; CODE-NEXT: # host
; CODE-NEXT: {
; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(float), cudaMemcpyHostToDevice));
; CODE-NEXT:   {
; CODE-NEXT:     dim3 k0_dimBlock(32);
; CODE-NEXT:     dim3 k0_dimGrid(32);
; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, MemRef_b);
; CODE-NEXT:     cudaCheckKernel();
; CODE-NEXT:   }

; CODE:   cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(float), cudaMemcpyDeviceToHost));
; CODE-NEXT: }

; CODE: # kernel0
; CODE-NEXT: Stmt_bb2(32 * b0 + t0);

; void foo(float A[], float b) {
;   for (long i = 0; i < 1024; i++)
;     A[i] += b;
; }
;
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

define void @float(float* %A, float %b) {
bb:
  br label %bb1

bb1:                                              ; preds = %bb5, %bb
  %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
  %exitcond = icmp ne i64 %i.0, 1024
  br i1 %exitcond, label %bb2, label %bb7

bb2:                                              ; preds = %bb1
  %tmp = getelementptr inbounds float, float* %A, i64 %i.0
  %tmp3 = load float, float* %tmp, align 4
  %tmp4 = fadd float %tmp3, %b
  store float %tmp4, float* %tmp, align 4
  br label %bb5

bb5:                                              ; preds = %bb2
  %tmp6 = add nuw nsw i64 %i.0, 1
  br label %bb1

bb7:                                              ; preds = %bb1
  ret void
}
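
; For the double variant, the KERNEL checks below also verify that the
; scalar argument is passed to the kernel by value and spilled into a
; stack slot (%b.s2a) inside the generated kernel.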

; KERNEL: define ptx_kernel void @kernel_0(i8* %MemRef_A, double %MemRef_b)
; KERNEL-NEXT: entry:
; KERNEL-NEXT:   %b.s2a = alloca double
; KERNEL-NEXT:   store double %MemRef_b, double* %b.s2a

; CODE: Code
; CODE-NEXT: ====
; CODE-NEXT: # host
; CODE-NEXT: {
; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(double), cudaMemcpyHostToDevice));
; CODE-NEXT:   {
; CODE-NEXT:     dim3 k0_dimBlock(32);
; CODE-NEXT:     dim3 k0_dimGrid(32);
; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, MemRef_b);
; CODE-NEXT:     cudaCheckKernel();
; CODE-NEXT:   }

; CODE:   cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(double), cudaMemcpyDeviceToHost));
; CODE-NEXT: }

; CODE: # kernel0
; CODE-NEXT: Stmt_bb2(32 * b0 + t0);

; void foo(double A[], double b) {
;   for (long i = 0; i < 1024; i++)
;     A[i] += b;
; }
;
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

define void @double(double* %A, double %b) {
bb:
  br label %bb1

bb1:                                              ; preds = %bb5, %bb
  %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
  %exitcond = icmp ne i64 %i.0, 1024
  br i1 %exitcond, label %bb2, label %bb7

bb2:                                              ; preds = %bb1
  %tmp = getelementptr inbounds double, double* %A, i64 %i.0
  %tmp3 = load double, double* %tmp, align 4
  %tmp4 = fadd double %tmp3, %b
  store double %tmp4, double* %tmp, align 4
  br label %bb5

bb5:                                              ; preds = %bb2
  %tmp6 = add nuw nsw i64 %i.0, 1
  br label %bb1

bb7:                                              ; preds = %bb1
  ret void
}

; CODE: Code
; CODE-NEXT: ====
; CODE-NEXT: # host
; CODE-NEXT: {
; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i1), cudaMemcpyHostToDevice));
; CODE-NEXT:   {
; CODE-NEXT:     dim3 k0_dimBlock(32);
; CODE-NEXT:     dim3 k0_dimGrid(32);
; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A);
; CODE-NEXT:     cudaCheckKernel();
; CODE-NEXT:   }

; CODE:   cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i1), cudaMemcpyDeviceToHost));
; CODE-NEXT: }

; CODE: # kernel0
; CODE-NEXT: Stmt_bb2(32 * b0 + t0);

; void foo(i1 A[], i1 b) {
;   for (long i = 0; i < 1024; i++)
;     A[i] += b;
; }
;
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

define void @i1(i1* %A, i1 %b) {
bb:
  br label %bb1

bb1:                                              ; preds = %bb5, %bb
  %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
  %exitcond = icmp ne i64 %i.0, 1024
  br i1 %exitcond, label %bb2, label %bb7

bb2:                                              ; preds = %bb1
  %tmp = getelementptr inbounds i1, i1* %A, i64 %i.0
  %tmp3 = load i1, i1* %tmp, align 4
  %tmp4 = add i1 %tmp3, %b
  store i1 %tmp4, i1* %tmp, align 4
  br label %bb5

bb5:                                              ; preds = %bb2
  %tmp6 = add nuw nsw i64 %i.0, 1
  br label %bb1

bb7:                                              ; preds = %bb1
  ret void
}

; CODE: Code
; CODE-NEXT: ====
; CODE-NEXT: # host
; CODE-NEXT: {
; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i3), cudaMemcpyHostToDevice));
; CODE-NEXT:   {
; CODE-NEXT:     dim3 k0_dimBlock(32);
; CODE-NEXT:     dim3 k0_dimGrid(32);
; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A);
; CODE-NEXT:     cudaCheckKernel();
; CODE-NEXT:   }

; CODE:   cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i3), cudaMemcpyDeviceToHost));
; CODE-NEXT: }

; CODE: # kernel0
; CODE-NEXT: Stmt_bb2(32 * b0 + t0);

; void foo(i3 A[], i3 b) {
;   for (long i = 0; i < 1024; i++)
;     A[i] += b;
; }
;
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

define void @i3(i3* %A, i3 %b) {
bb:
  br label %bb1

bb1:                                              ; preds = %bb5, %bb
  %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
  %exitcond = icmp ne i64 %i.0, 1024
  br i1 %exitcond, label %bb2, label %bb7

bb2:                                              ; preds = %bb1
  %tmp = getelementptr inbounds i3, i3* %A, i64 %i.0
  %tmp3 = load i3, i3* %tmp, align 4
  %tmp4 = add i3 %tmp3, %b
  store i3 %tmp4, i3* %tmp, align 4
  br label %bb5

bb5:                                              ; preds = %bb2
  %tmp6 = add nuw nsw i64 %i.0, 1
  br label %bb1

bb7:                                              ; preds = %bb1
  ret void
}

; CODE: Code
; CODE-NEXT: ====
; CODE-NEXT: # host
; CODE-NEXT: {
; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i8), cudaMemcpyHostToDevice));
; CODE-NEXT:   {
; CODE-NEXT:     dim3 k0_dimBlock(32);
; CODE-NEXT:     dim3 k0_dimGrid(32);
; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A);
; CODE-NEXT:     cudaCheckKernel();
; CODE-NEXT:   }

; CODE:   cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i8), cudaMemcpyDeviceToHost));
; CODE-NEXT: }

; CODE: # kernel0
; CODE-NEXT: Stmt_bb2(32 * b0 + t0);

; void foo(i8 A[], i8 b) {
;   for (long i = 0; i < 1024; i++)
;     A[i] += b;
; }
;
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

define void @i8(i8* %A, i8 %b) {
bb:
  br label %bb1

bb1:                                              ; preds = %bb5, %bb
  %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
  %exitcond = icmp ne i64 %i.0, 1024
  br i1 %exitcond, label %bb2, label %bb7

bb2:                                              ; preds = %bb1
  %tmp = getelementptr inbounds i8, i8* %A, i64 %i.0
  %tmp3 = load i8, i8* %tmp, align 4
  %tmp4 = add i8 %tmp3, %b
  store i8 %tmp4, i8* %tmp, align 4
  br label %bb5

bb5:                                              ; preds = %bb2
  %tmp6 = add nuw nsw i64 %i.0, 1
  br label %bb1

bb7:                                              ; preds = %bb1
  ret void
}
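
; The IR checks below verify how the host code passes the kernel arguments:
; the device pointer for %A and the scalar %b are each written to a stack
; slot, and the addresses of those slots are stored into the two entries of
; %polly_launch_0_params that are handed to the kernel launch.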

; IR-LABEL: @i8

; IR: [[REGA:%.+]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_A)
; IR-NEXT: [[REGB:%.+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0
; IR-NEXT: store i8* [[REGA]], i8** %polly_launch_0_param_0
; IR-NEXT: [[REGC:%.+]] = bitcast i8** %polly_launch_0_param_0 to i8*
; IR-NEXT: store i8* [[REGC]], i8** [[REGB]]
; IR-NEXT: store i8 %b, i8* %polly_launch_0_param_1
; IR-NEXT: [[REGD:%.+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 1
; IR-NEXT: store i8* %polly_launch_0_param_1, i8** [[REGD]]

; CODE: Code
; CODE-NEXT: ====
; CODE-NEXT: # host
; CODE-NEXT: {
; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i32), cudaMemcpyHostToDevice));
; CODE-NEXT:   {
; CODE-NEXT:     dim3 k0_dimBlock(32);
; CODE-NEXT:     dim3 k0_dimGrid(32);
; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A);
; CODE-NEXT:     cudaCheckKernel();
; CODE-NEXT:   }

; CODE:   cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i32), cudaMemcpyDeviceToHost));
; CODE-NEXT: }

; CODE: # kernel0
; CODE-NEXT: Stmt_bb2(32 * b0 + t0);

; void foo(i32 A[], i32 b) {
;   for (long i = 0; i < 1024; i++)
;     A[i] += b;
; }
;
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

define void @i32(i32* %A, i32 %b) {
bb:
  br label %bb1

bb1:                                              ; preds = %bb5, %bb
  %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
  %exitcond = icmp ne i64 %i.0, 1024
  br i1 %exitcond, label %bb2, label %bb7

bb2:                                              ; preds = %bb1
  %tmp = getelementptr inbounds i32, i32* %A, i64 %i.0
  %tmp3 = load i32, i32* %tmp, align 4
  %tmp4 = add i32 %tmp3, %b
  store i32 %tmp4, i32* %tmp, align 4
  br label %bb5

bb5:                                              ; preds = %bb2
  %tmp6 = add nuw nsw i64 %i.0, 1
  br label %bb1

bb7:                                              ; preds = %bb1
  ret void
}

; CODE: Code
; CODE-NEXT: ====
; CODE-NEXT: # host
; CODE-NEXT: {
; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i60), cudaMemcpyHostToDevice));
; CODE-NEXT:   {
; CODE-NEXT:     dim3 k0_dimBlock(32);
; CODE-NEXT:     dim3 k0_dimGrid(32);
; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A);
; CODE-NEXT:     cudaCheckKernel();
; CODE-NEXT:   }

; CODE:   cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i60), cudaMemcpyDeviceToHost));
; CODE-NEXT: }

; CODE: # kernel0
; CODE-NEXT: Stmt_bb2(32 * b0 + t0);

; void foo(i60 A[], i60 b) {
;   for (long i = 0; i < 1024; i++)
;     A[i] += b;
; }
;
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

define void @i60(i60* %A, i60 %b) {
bb:
  br label %bb1

bb1:                                              ; preds = %bb5, %bb
  %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
  %exitcond = icmp ne i64 %i.0, 1024
  br i1 %exitcond, label %bb2, label %bb7

bb2:                                              ; preds = %bb1
  %tmp = getelementptr inbounds i60, i60* %A, i64 %i.0
  %tmp3 = load i60, i60* %tmp, align 4
  %tmp4 = add i60 %tmp3, %b
  store i60 %tmp4, i60* %tmp, align 4
  br label %bb5

bb5:                                              ; preds = %bb2
  %tmp6 = add nuw nsw i64 %i.0, 1
  br label %bb1

bb7:                                              ; preds = %bb1
  ret void
}

; CODE: Code
; CODE-NEXT: ====
; CODE-NEXT: # host
; CODE-NEXT: {
; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i64), cudaMemcpyHostToDevice));
; CODE-NEXT:   {
; CODE-NEXT:     dim3 k0_dimBlock(32);
; CODE-NEXT:     dim3 k0_dimGrid(32);
; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A);
; CODE-NEXT:     cudaCheckKernel();
; CODE-NEXT:   }

; CODE:   cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i64), cudaMemcpyDeviceToHost));
; CODE-NEXT: }

; CODE: # kernel0
; CODE-NEXT: Stmt_bb2(32 * b0 + t0);

; void foo(i64 A[], i64 b) {
;   for (long i = 0; i < 1024; i++)
;     A[i] += b;
; }
;
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

define void @i64(i64* %A, i64 %b) {
bb:
  br label %bb1

bb1:                                              ; preds = %bb5, %bb
  %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
  %exitcond = icmp ne i64 %i.0, 1024
  br i1 %exitcond, label %bb2, label %bb7

bb2:                                              ; preds = %bb1
  %tmp = getelementptr inbounds i64, i64* %A, i64 %i.0
  %tmp3 = load i64, i64* %tmp, align 4
  %tmp4 = add i64 %tmp3, %b
  store i64 %tmp4, i64* %tmp, align 4
  br label %bb5

bb5:                                              ; preds = %bb2
  %tmp6 = add nuw nsw i64 %i.0, 1
  br label %bb1

bb7:                                              ; preds = %bb1
  ret void
}