1 //===- ParallelLoopMapper.cpp - Utilities for mapping parallel loops to GPU =//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements utilities to generate mappings for parallel loops to
10 // GPU devices.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "mlir/Dialect/GPU/ParallelLoopMapper.h"
15 
16 #include "mlir/Dialect/GPU/GPUDialect.h"
17 #include "mlir/Dialect/GPU/Passes.h"
18 #include "mlir/Dialect/SCF/SCF.h"
19 #include "mlir/IR/AffineMap.h"
20 #include "mlir/Pass/Pass.h"
21 
22 using namespace mlir;
23 using namespace mlir::gpu;
24 using namespace mlir::scf;
25 
26 #include "mlir/Dialect/GPU/ParallelLoopMapperAttr.cpp.inc"
27 #include "mlir/Dialect/GPU/ParallelLoopMapperEnums.cpp.inc"
28 namespace mlir {
29 namespace gpu {
30 
getMappingAttrName()31 StringRef getMappingAttrName() { return "mapping"; }
32 
getParallelLoopDimMappingAttr(Processor processor,AffineMap map,AffineMap bound)33 ParallelLoopDimMapping getParallelLoopDimMappingAttr(Processor processor,
34                                                      AffineMap map,
35                                                      AffineMap bound) {
36   MLIRContext *context = map.getContext();
37   OpBuilder builder(context);
38   return ParallelLoopDimMapping::get(
39       builder.getI64IntegerAttr(static_cast<int32_t>(processor)),
40       AffineMapAttr::get(map), AffineMapAttr::get(bound), context);
41 }
42 
setMappingAttr(scf::ParallelOp ploopOp,ArrayRef<ParallelLoopDimMapping> mapping)43 LogicalResult setMappingAttr(scf::ParallelOp ploopOp,
44                              ArrayRef<ParallelLoopDimMapping> mapping) {
45   // Verify that each processor is mapped to only once.
46   llvm::DenseSet<gpu::Processor> specifiedMappings;
47   for (auto dimAttr : mapping) {
48     gpu::Processor processor = getProcessor(dimAttr);
49     if (processor != gpu::Processor::Sequential &&
50         specifiedMappings.count(processor))
51       return ploopOp.emitError(
52           "invalid mapping multiple loops to same processor");
53   }
54   ArrayRef<Attribute> mappingAsAttrs(mapping.data(), mapping.size());
55   ploopOp.setAttr(getMappingAttrName(),
56                   ArrayAttr::get(mappingAsAttrs, ploopOp.getContext()));
57   return success();
58 }
59 } // namespace gpu
60 } // namespace mlir
61 
namespace {

/// Nesting level a parallel loop is mapped at. The enumerators are ordered
/// (MapGrid < MapBlock < Sequential) and take the values 0, 1 and 2; code in
/// this file relies on that ordering when stepping from one level to the next.
enum MappingLevel {
  MapGrid,   // = 0
  MapBlock,  // = 1
  Sequential // = 2
};

/// Number of hardware ids handed out per mapping level; loop dimensions at or
/// beyond this index are mapped sequentially.
constexpr int kNumHardwareIds = 3;

} // namespace
69 
70 /// Bounded increment on MappingLevel. Increments to the next
71 /// level unless Sequential was already reached.
operator ++(MappingLevel & mappingLevel)72 MappingLevel &operator++(MappingLevel &mappingLevel) {
73   if (mappingLevel < Sequential) {
74     mappingLevel = static_cast<MappingLevel>(mappingLevel + 1);
75   }
76   return mappingLevel;
77 }
78 
79 /// Computed the hardware id to use for a given mapping level. Will
80 /// assign x,y and z hardware ids for the first 3 dimensions and use
81 /// sequential after.
82 /// TODO: Make this use x for the inner-most loop that is
83 /// distributed to map to x, the next innermost to y and the next innermost to
84 /// z.
getHardwareIdForMapping(MappingLevel level,int dimension)85 static gpu::Processor getHardwareIdForMapping(MappingLevel level,
86                                               int dimension) {
87 
88   if (dimension >= kNumHardwareIds || level == Sequential)
89     return Processor::Sequential;
90   switch (level) {
91   case MapGrid:
92     switch (dimension) {
93     case 0:
94       return Processor::BlockX;
95     case 1:
96       return Processor::BlockY;
97     case 2:
98       return Processor::BlockZ;
99     default:
100       return Processor::Sequential;
101     }
102     break;
103   case MapBlock:
104     switch (dimension) {
105     case 0:
106       return Processor::ThreadX;
107     case 1:
108       return Processor::ThreadY;
109     case 2:
110       return Processor::ThreadZ;
111     default:
112       return Processor::Sequential;
113     }
114   default:;
115   }
116   return Processor::Sequential;
117 }
118 
119 /// Add mapping information to the given parallel loop. Do not add
120 /// mapping information if the loop already has it. Also, don't
121 /// start a mapping at a nested loop.
mapParallelOp(ParallelOp parallelOp,MappingLevel mappingLevel=MapGrid)122 static void mapParallelOp(ParallelOp parallelOp,
123                           MappingLevel mappingLevel = MapGrid) {
124   // Do not try to add a mapping to already mapped loops or nested loops.
125   if (parallelOp.getAttr(getMappingAttrName()) ||
126       ((mappingLevel == MapGrid) && parallelOp->getParentOfType<ParallelOp>()))
127     return;
128 
129   MLIRContext *ctx = parallelOp.getContext();
130   Builder b(ctx);
131   SmallVector<ParallelLoopDimMapping, 4> attrs;
132   attrs.reserve(parallelOp.getNumLoops());
133   for (int i = 0, e = parallelOp.getNumLoops(); i < e; ++i) {
134     attrs.push_back(getParallelLoopDimMappingAttr(
135         getHardwareIdForMapping(mappingLevel, i), b.getDimIdentityMap(),
136         b.getDimIdentityMap()));
137   }
138   setMappingAttr(parallelOp, attrs);
139   ++mappingLevel;
140   // Parallel loop operations are immediately nested, so do not use
141   // walk but just iterate over the operations.
142   for (Operation &op : *parallelOp.getBody()) {
143     if (ParallelOp nested = dyn_cast<ParallelOp>(op))
144       mapParallelOp(nested, mappingLevel);
145   }
146 }
147 
greedilyMapParallelSCFToGPU(Region & region)148 void mlir::greedilyMapParallelSCFToGPU(Region &region) {
149   region.walk([](ParallelOp parallelOp) { mapParallelOp(parallelOp); });
150 }
151