//===- LowerGpuOpsToNVVMOps.cpp - MLIR GPU to NVVM lowering passes --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to generate NVVMIR operations for higher-level
// GPU operations.
//
//===----------------------------------------------------------------------===//

#include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"

#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/GPU/Passes.h"
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/Support/FormatVariadic.h"

#include "../GPUCommon/GPUOpsLowering.h"
#include "../GPUCommon/IndexIntrinsicsOpLowering.h"
#include "../GPUCommon/OpToFuncCallLowering.h"
#include "../PassDetail.h"

using namespace mlir;

namespace {

struct GPUShuffleOpLowering : public ConvertToLLVMPattern {
  explicit GPUShuffleOpLowering(LLVMTypeConverter &lowering_)
      : ConvertToLLVMPattern(gpu::ShuffleOp::getOperationName(),
                             lowering_.getDialect()->getContext(), lowering_) {}

  /// Lowers a shuffle to the corresponding NVVM op.
  ///
  /// Converts the `width` argument into an activeMask (a bitmask which
  /// specifies which threads participate in the shuffle) and a maskAndClamp
  /// (specifying the highest lane which participates in the shuffle).
  ///
  ///     %one = llvm.constant(1 : i32) : !llvm.i32
  ///     %shl = llvm.shl %one, %width : !llvm.i32
  ///     %active_mask = llvm.sub %shl, %one : !llvm.i32
  ///     %mask_and_clamp = llvm.sub %width, %one : !llvm.i32
  ///     %shfl = nvvm.shfl.sync.bfly %active_mask, %value, %offset,
  ///         %mask_and_clamp : !llvm<"{ float, i1 }">
  ///     %shfl_value = llvm.extractvalue %shfl[0 : index] :
  ///         !llvm<"{ float, i1 }">
  ///     %shfl_pred = llvm.extractvalue %shfl[1 : index] :
  ///         !llvm<"{ float, i1 }">
  LogicalResult
  matchAndRewrite(Operation *op, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override {
    Location loc = op->getLoc();
    gpu::ShuffleOpAdaptor adaptor(operands);

    auto valueTy = adaptor.value().getType().cast<LLVM::LLVMType>();
    auto int32Type = LLVM::LLVMType::getInt32Ty(rewriter.getContext());
    auto predTy = LLVM::LLVMType::getInt1Ty(rewriter.getContext());
    auto resultTy =
        LLVM::LLVMType::getStructTy(rewriter.getContext(), {valueTy, predTy});

    Value one = rewriter.create<LLVM::ConstantOp>(
        loc, int32Type, rewriter.getI32IntegerAttr(1));
    // Bit mask of active lanes: `(1 << activeWidth) - 1`.
    Value activeMask = rewriter.create<LLVM::SubOp>(
        loc, int32Type,
        rewriter.create<LLVM::ShlOp>(loc, int32Type, one, adaptor.width()),
        one);
    // Clamp lane: `activeWidth - 1`.
    Value maskAndClamp =
        rewriter.create<LLVM::SubOp>(loc, int32Type, adaptor.width(), one);

    auto returnValueAndIsValidAttr = rewriter.getUnitAttr();
    Value shfl = rewriter.create<NVVM::ShflBflyOp>(
        loc, resultTy, activeMask, adaptor.value(), adaptor.offset(),
        maskAndClamp, returnValueAndIsValidAttr);
    Value shflValue = rewriter.create<LLVM::ExtractValueOp>(
        loc, valueTy, shfl, rewriter.getIndexArrayAttr(0));
    Value isActiveSrcLane = rewriter.create<LLVM::ExtractValueOp>(
        loc, predTy, shfl, rewriter.getIndexArrayAttr(1));

    rewriter.replaceOp(op, {shflValue, isActiveSrcLane});
    return success();
  }
};
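// Illustrative example, not part of the upstream file: the pattern above
// rewrites a `gpu.shuffle` op, assuming the xor shuffle mode that
// `nvvm.shfl.sync.bfly` implements. A hypothetical input such as
//
//     %shfl, %pred = gpu.shuffle %val, %offset, %width xor : f32
//
// becomes the `nvvm.shfl.sync.bfly` sequence shown in the doc comment, where
// `%pred` reports whether the source lane was within the active width.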
#include "GPUToNVVM.cpp.inc" /// A pass that replaces all occurrences of GPU device operations with their /// corresponding NVVM equivalent. /// /// This pass only handles device code and is not meant to be run on GPU host /// code. struct LowerGpuOpsToNVVMOpsPass : public ConvertGpuOpsToNVVMOpsBase { LowerGpuOpsToNVVMOpsPass() = default; LowerGpuOpsToNVVMOpsPass(unsigned indexBitwidth) { this->indexBitwidth = indexBitwidth; } void runOnOperation() override { gpu::GPUModuleOp m = getOperation(); /// Customize the bitwidth used for the device side index computations. LowerToLLVMOptions options = {/*useBarePtrCallConv =*/false, /*emitCWrappers =*/true, /*indexBitwidth =*/indexBitwidth, /*useAlignedAlloc =*/false}; /// MemRef conversion for GPU to NVVM lowering. The GPU dialect uses memory /// space 5 for private memory attributions, but NVVM represents private /// memory allocations as local `alloca`s in the default address space. This /// converter drops the private memory space to support the use case above. LLVMTypeConverter converter(m.getContext(), options); converter.addConversion([&](MemRefType type) -> Optional { if (type.getMemorySpace() != gpu::GPUDialect::getPrivateAddressSpace()) return llvm::None; return converter.convertType(MemRefType::Builder(type).setMemorySpace(0)); }); OwningRewritePatternList patterns, llvmPatterns; // Apply in-dialect lowering first. In-dialect lowering will replace ops // which need to be lowered further, which is not supported by a single // conversion pass. populateGpuRewritePatterns(m.getContext(), patterns); applyPatternsAndFoldGreedily(m, std::move(patterns)); populateStdToLLVMConversionPatterns(converter, llvmPatterns); populateGpuToNVVMConversionPatterns(converter, llvmPatterns); LLVMConversionTarget target(getContext()); configureGpuToNVVMConversionLegality(target); if (failed(applyPartialConversion(m, target, std::move(llvmPatterns)))) signalPassFailure(); } }; } // anonymous namespace void mlir::configureGpuToNVVMConversionLegality(ConversionTarget &target) { target.addIllegalOp(); target.addLegalDialect<::mlir::LLVM::LLVMDialect>(); target.addLegalDialect<::mlir::NVVM::NVVMDialect>(); target.addIllegalDialect(); target.addIllegalOp(); // TODO: Remove once we support replacing non-root ops. target.addLegalOp(); } void mlir::populateGpuToNVVMConversionPatterns( LLVMTypeConverter &converter, OwningRewritePatternList &patterns) { populateWithGenerated(converter.getDialect()->getContext(), patterns); patterns .insert, GPUIndexIntrinsicOpLowering, GPUIndexIntrinsicOpLowering, GPUIndexIntrinsicOpLowering, GPUShuffleOpLowering, GPUReturnOpLowering, // Explicitly drop memory space when lowering private memory // attributions since NVVM models it as `alloca`s in the default // memory space and does not support `alloca`s with addrspace(5). 
GPUFuncOpLowering<0>>(converter); patterns.insert>(converter, "__nv_fabsf", "__nv_fabs"); patterns.insert>(converter, "__nv_ceilf", "__nv_ceil"); patterns.insert>(converter, "__nv_cosf", "__nv_cos"); patterns.insert>(converter, "__nv_expf", "__nv_exp"); patterns.insert>(converter, "__nv_floorf", "__nv_floor"); patterns.insert>(converter, "__nv_logf", "__nv_log"); patterns.insert>(converter, "__nv_log10f", "__nv_log10"); patterns.insert>(converter, "__nv_log2f", "__nv_log2"); patterns.insert>(converter, "__nv_rsqrtf", "__nv_rsqrt"); patterns.insert>(converter, "__nv_sinf", "__nv_sin"); patterns.insert>(converter, "__nv_sqrtf", "__nv_sqrt"); patterns.insert>(converter, "__nv_tanhf", "__nv_tanh"); } std::unique_ptr> mlir::createLowerGpuOpsToNVVMOpsPass(unsigned indexBitwidth) { return std::make_unique(indexBitwidth); }