1 //===- GPUOpsLowering.h - GPU FuncOp / ReturnOp lowering -------*- C++ -*--===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 #ifndef MLIR_CONVERSION_GPUCOMMON_GPUOPSLOWERING_H_
9 #define MLIR_CONVERSION_GPUCOMMON_GPUOPSLOWERING_H_
10 
11 #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
12 #include "mlir/Dialect/GPU/GPUDialect.h"
13 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
14 #include "mlir/Dialect/StandardOps/IR/Ops.h"
15 #include "mlir/IR/Builders.h"
16 #include "llvm/Support/FormatVariadic.h"
17 
18 namespace mlir {
19 
20 template <unsigned AllocaAddrSpace>
21 struct GPUFuncOpLowering : ConvertToLLVMPattern {
GPUFuncOpLoweringGPUFuncOpLowering22   explicit GPUFuncOpLowering(LLVMTypeConverter &typeConverter)
23       : ConvertToLLVMPattern(gpu::GPUFuncOp::getOperationName(),
24                              typeConverter.getDialect()->getContext(),
25                              typeConverter) {}
26 
27   LogicalResult
matchAndRewriteGPUFuncOpLowering28   matchAndRewrite(Operation *op, ArrayRef<Value> operands,
29                   ConversionPatternRewriter &rewriter) const override {
30     assert(operands.empty() && "func op is not expected to have operands");
31     auto gpuFuncOp = cast<gpu::GPUFuncOp>(op);
32     Location loc = gpuFuncOp.getLoc();
33 
34     SmallVector<LLVM::GlobalOp, 3> workgroupBuffers;
35     workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions());
36     for (auto en : llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) {
37       Value attribution = en.value();
38 
39       auto type = attribution.getType().dyn_cast<MemRefType>();
40       assert(type && type.hasStaticShape() && "unexpected type in attribution");
41 
42       uint64_t numElements = type.getNumElements();
43 
44       auto elementType = typeConverter->convertType(type.getElementType())
45                              .template cast<LLVM::LLVMType>();
46       auto arrayType = LLVM::LLVMType::getArrayTy(elementType, numElements);
47       std::string name = std::string(
48           llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), en.index()));
49       auto globalOp = rewriter.create<LLVM::GlobalOp>(
50           gpuFuncOp.getLoc(), arrayType, /*isConstant=*/false,
51           LLVM::Linkage::Internal, name, /*value=*/Attribute(),
52           gpu::GPUDialect::getWorkgroupAddressSpace());
53       workgroupBuffers.push_back(globalOp);
54     }
55 
56     // Rewrite the original GPU function to an LLVM function.
57     auto funcType = typeConverter->convertType(gpuFuncOp.getType())
58                         .template cast<LLVM::LLVMType>()
59                         .getPointerElementTy();
60 
61     // Remap proper input types.
62     TypeConverter::SignatureConversion signatureConversion(
63         gpuFuncOp.front().getNumArguments());
64     getTypeConverter()->convertFunctionSignature(
65         gpuFuncOp.getType(), /*isVariadic=*/false, signatureConversion);
66 
67     // Create the new function operation. Only copy those attributes that are
68     // not specific to function modeling.
69     SmallVector<NamedAttribute, 4> attributes;
70     for (const auto &attr : gpuFuncOp.getAttrs()) {
71       if (attr.first == SymbolTable::getSymbolAttrName() ||
72           attr.first == impl::getTypeAttrName() ||
73           attr.first == gpu::GPUFuncOp::getNumWorkgroupAttributionsAttrName())
74         continue;
75       attributes.push_back(attr);
76     }
77     auto llvmFuncOp = rewriter.create<LLVM::LLVMFuncOp>(
78         gpuFuncOp.getLoc(), gpuFuncOp.getName(), funcType,
79         LLVM::Linkage::External, attributes);
80 
81     {
82       // Insert operations that correspond to converted workgroup and private
83       // memory attributions to the body of the function. This must operate on
84       // the original function, before the body region is inlined in the new
85       // function to maintain the relation between block arguments and the
86       // parent operation that assigns their semantics.
87       OpBuilder::InsertionGuard guard(rewriter);
88 
89       // Rewrite workgroup memory attributions to addresses of global buffers.
90       rewriter.setInsertionPointToStart(&gpuFuncOp.front());
91       unsigned numProperArguments = gpuFuncOp.getNumArguments();
92       auto i32Type = LLVM::LLVMType::getInt32Ty(rewriter.getContext());
93 
94       Value zero = nullptr;
95       if (!workgroupBuffers.empty())
96         zero = rewriter.create<LLVM::ConstantOp>(loc, i32Type,
97                                                  rewriter.getI32IntegerAttr(0));
98       for (auto en : llvm::enumerate(workgroupBuffers)) {
99         LLVM::GlobalOp global = en.value();
100         Value address = rewriter.create<LLVM::AddressOfOp>(loc, global);
101         auto elementType = global.getType().getArrayElementType();
102         Value memory = rewriter.create<LLVM::GEPOp>(
103             loc, elementType.getPointerTo(global.addr_space()), address,
104             ArrayRef<Value>{zero, zero});
105 
106         // Build a memref descriptor pointing to the buffer to plug with the
107         // existing memref infrastructure. This may use more registers than
108         // otherwise necessary given that memref sizes are fixed, but we can try
109         // and canonicalize that away later.
110         Value attribution = gpuFuncOp.getWorkgroupAttributions()[en.index()];
111         auto type = attribution.getType().cast<MemRefType>();
112         auto descr = MemRefDescriptor::fromStaticShape(
113             rewriter, loc, *getTypeConverter(), type, memory);
114         signatureConversion.remapInput(numProperArguments + en.index(), descr);
115       }
116 
117       // Rewrite private memory attributions to alloca'ed buffers.
118       unsigned numWorkgroupAttributions =
119           gpuFuncOp.getNumWorkgroupAttributions();
120       auto int64Ty = LLVM::LLVMType::getInt64Ty(rewriter.getContext());
121       for (auto en : llvm::enumerate(gpuFuncOp.getPrivateAttributions())) {
122         Value attribution = en.value();
123         auto type = attribution.getType().cast<MemRefType>();
124         assert(type && type.hasStaticShape() &&
125                "unexpected type in attribution");
126 
127         // Explicitly drop memory space when lowering private memory
128         // attributions since NVVM models it as `alloca`s in the default
129         // memory space and does not support `alloca`s with addrspace(5).
130         auto ptrType = typeConverter->convertType(type.getElementType())
131                            .template cast<LLVM::LLVMType>()
132                            .getPointerTo(AllocaAddrSpace);
133         Value numElements = rewriter.create<LLVM::ConstantOp>(
134             gpuFuncOp.getLoc(), int64Ty,
135             rewriter.getI64IntegerAttr(type.getNumElements()));
136         Value allocated = rewriter.create<LLVM::AllocaOp>(
137             gpuFuncOp.getLoc(), ptrType, numElements, /*alignment=*/0);
138         auto descr = MemRefDescriptor::fromStaticShape(
139             rewriter, loc, *getTypeConverter(), type, allocated);
140         signatureConversion.remapInput(
141             numProperArguments + numWorkgroupAttributions + en.index(), descr);
142       }
143     }
144 
145     // Move the region to the new function, update the entry block signature.
146     rewriter.inlineRegionBefore(gpuFuncOp.getBody(), llvmFuncOp.getBody(),
147                                 llvmFuncOp.end());
148     if (failed(rewriter.convertRegionTypes(
149             &llvmFuncOp.getBody(), *typeConverter, &signatureConversion)))
150       return failure();
151 
152     rewriter.eraseOp(gpuFuncOp);
153     return success();
154   }
155 };
156 
157 struct GPUReturnOpLowering : public ConvertToLLVMPattern {
GPUReturnOpLoweringGPUReturnOpLowering158   GPUReturnOpLowering(LLVMTypeConverter &typeConverter)
159       : ConvertToLLVMPattern(gpu::ReturnOp::getOperationName(),
160                              typeConverter.getDialect()->getContext(),
161                              typeConverter) {}
162 
163   LogicalResult
matchAndRewriteGPUReturnOpLowering164   matchAndRewrite(Operation *op, ArrayRef<Value> operands,
165                   ConversionPatternRewriter &rewriter) const override {
166     rewriter.replaceOpWithNewOp<LLVM::ReturnOp>(op, operands);
167     return success();
168   }
169 };
170 
171 } // namespace mlir
172 
173 #endif // MLIR_CONVERSION_GPUCOMMON_GPUOPSLOWERING_H_
174