| //===- GPUOpsLowering.h - GPU FuncOp / ReturnOp lowering -------*- C++ -*--===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| #ifndef MLIR_CONVERSION_GPUCOMMON_GPUOPSLOWERING_H_ |
| #define MLIR_CONVERSION_GPUCOMMON_GPUOPSLOWERING_H_ |
| |
| #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" |
| #include "mlir/Dialect/GPU/GPUDialect.h" |
| #include "mlir/Dialect/LLVMIR/LLVMDialect.h" |
| #include "mlir/Dialect/StandardOps/IR/Ops.h" |
| #include "mlir/IR/Builders.h" |
| #include "llvm/Support/FormatVariadic.h" |
| |
| namespace mlir { |
| |
| template <unsigned AllocaAddrSpace> |
| struct GPUFuncOpLowering : ConvertToLLVMPattern { |
| explicit GPUFuncOpLowering(LLVMTypeConverter &typeConverter) |
| : ConvertToLLVMPattern(gpu::GPUFuncOp::getOperationName(), |
| typeConverter.getDialect()->getContext(), |
| typeConverter) {} |
| |
| LogicalResult |
| matchAndRewrite(Operation *op, ArrayRef<Value> operands, |
| ConversionPatternRewriter &rewriter) const override { |
| assert(operands.empty() && "func op is not expected to have operands"); |
| auto gpuFuncOp = cast<gpu::GPUFuncOp>(op); |
| Location loc = gpuFuncOp.getLoc(); |
| |
| SmallVector<LLVM::GlobalOp, 3> workgroupBuffers; |
| workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions()); |
| for (auto en : llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) { |
| Value attribution = en.value(); |
| |
| auto type = attribution.getType().dyn_cast<MemRefType>(); |
| assert(type && type.hasStaticShape() && "unexpected type in attribution"); |
| |
| uint64_t numElements = type.getNumElements(); |
| |
| auto elementType = typeConverter->convertType(type.getElementType()) |
| .template cast<LLVM::LLVMType>(); |
| auto arrayType = LLVM::LLVMType::getArrayTy(elementType, numElements); |
| std::string name = std::string( |
| llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), en.index())); |
| auto globalOp = rewriter.create<LLVM::GlobalOp>( |
| gpuFuncOp.getLoc(), arrayType, /*isConstant=*/false, |
| LLVM::Linkage::Internal, name, /*value=*/Attribute(), |
| gpu::GPUDialect::getWorkgroupAddressSpace()); |
| workgroupBuffers.push_back(globalOp); |
| } |
| |
| // Rewrite the original GPU function to an LLVM function. |
| auto funcType = typeConverter->convertType(gpuFuncOp.getType()) |
| .template cast<LLVM::LLVMType>() |
| .getPointerElementTy(); |
| |
| // Remap proper input types. |
| TypeConverter::SignatureConversion signatureConversion( |
| gpuFuncOp.front().getNumArguments()); |
| getTypeConverter()->convertFunctionSignature( |
| gpuFuncOp.getType(), /*isVariadic=*/false, signatureConversion); |
| |
| // Create the new function operation. Only copy those attributes that are |
| // not specific to function modeling. |
| SmallVector<NamedAttribute, 4> attributes; |
| for (const auto &attr : gpuFuncOp.getAttrs()) { |
| if (attr.first == SymbolTable::getSymbolAttrName() || |
| attr.first == impl::getTypeAttrName() || |
| attr.first == gpu::GPUFuncOp::getNumWorkgroupAttributionsAttrName()) |
| continue; |
| attributes.push_back(attr); |
| } |
| auto llvmFuncOp = rewriter.create<LLVM::LLVMFuncOp>( |
| gpuFuncOp.getLoc(), gpuFuncOp.getName(), funcType, |
| LLVM::Linkage::External, attributes); |
| |
| { |
| // Insert operations that correspond to converted workgroup and private |
| // memory attributions to the body of the function. This must operate on |
| // the original function, before the body region is inlined in the new |
| // function to maintain the relation between block arguments and the |
| // parent operation that assigns their semantics. |
| OpBuilder::InsertionGuard guard(rewriter); |
| |
| // Rewrite workgroup memory attributions to addresses of global buffers. |
| rewriter.setInsertionPointToStart(&gpuFuncOp.front()); |
| unsigned numProperArguments = gpuFuncOp.getNumArguments(); |
| auto i32Type = LLVM::LLVMType::getInt32Ty(rewriter.getContext()); |
| |
| Value zero = nullptr; |
| if (!workgroupBuffers.empty()) |
| zero = rewriter.create<LLVM::ConstantOp>(loc, i32Type, |
| rewriter.getI32IntegerAttr(0)); |
| for (auto en : llvm::enumerate(workgroupBuffers)) { |
| LLVM::GlobalOp global = en.value(); |
| Value address = rewriter.create<LLVM::AddressOfOp>(loc, global); |
| auto elementType = global.getType().getArrayElementType(); |
| Value memory = rewriter.create<LLVM::GEPOp>( |
| loc, elementType.getPointerTo(global.addr_space()), address, |
| ArrayRef<Value>{zero, zero}); |
| |
| // Build a memref descriptor pointing to the buffer to plug with the |
| // existing memref infrastructure. This may use more registers than |
| // otherwise necessary given that memref sizes are fixed, but we can try |
| // and canonicalize that away later. |
| Value attribution = gpuFuncOp.getWorkgroupAttributions()[en.index()]; |
| auto type = attribution.getType().cast<MemRefType>(); |
| auto descr = MemRefDescriptor::fromStaticShape( |
| rewriter, loc, *getTypeConverter(), type, memory); |
| signatureConversion.remapInput(numProperArguments + en.index(), descr); |
| } |
| |
| // Rewrite private memory attributions to alloca'ed buffers. |
| unsigned numWorkgroupAttributions = |
| gpuFuncOp.getNumWorkgroupAttributions(); |
| auto int64Ty = LLVM::LLVMType::getInt64Ty(rewriter.getContext()); |
| for (auto en : llvm::enumerate(gpuFuncOp.getPrivateAttributions())) { |
| Value attribution = en.value(); |
| auto type = attribution.getType().cast<MemRefType>(); |
| assert(type && type.hasStaticShape() && |
| "unexpected type in attribution"); |
| |
| // Explicitly drop memory space when lowering private memory |
| // attributions since NVVM models it as `alloca`s in the default |
| // memory space and does not support `alloca`s with addrspace(5). |
| auto ptrType = typeConverter->convertType(type.getElementType()) |
| .template cast<LLVM::LLVMType>() |
| .getPointerTo(AllocaAddrSpace); |
| Value numElements = rewriter.create<LLVM::ConstantOp>( |
| gpuFuncOp.getLoc(), int64Ty, |
| rewriter.getI64IntegerAttr(type.getNumElements())); |
| Value allocated = rewriter.create<LLVM::AllocaOp>( |
| gpuFuncOp.getLoc(), ptrType, numElements, /*alignment=*/0); |
| auto descr = MemRefDescriptor::fromStaticShape( |
| rewriter, loc, *getTypeConverter(), type, allocated); |
| signatureConversion.remapInput( |
| numProperArguments + numWorkgroupAttributions + en.index(), descr); |
| } |
| } |
| |
| // Move the region to the new function, update the entry block signature. |
| rewriter.inlineRegionBefore(gpuFuncOp.getBody(), llvmFuncOp.getBody(), |
| llvmFuncOp.end()); |
| if (failed(rewriter.convertRegionTypes( |
| &llvmFuncOp.getBody(), *typeConverter, &signatureConversion))) |
| return failure(); |
| |
| rewriter.eraseOp(gpuFuncOp); |
| return success(); |
| } |
| }; |
| |
| struct GPUReturnOpLowering : public ConvertToLLVMPattern { |
| GPUReturnOpLowering(LLVMTypeConverter &typeConverter) |
| : ConvertToLLVMPattern(gpu::ReturnOp::getOperationName(), |
| typeConverter.getDialect()->getContext(), |
| typeConverter) {} |
| |
| LogicalResult |
| matchAndRewrite(Operation *op, ArrayRef<Value> operands, |
| ConversionPatternRewriter &rewriter) const override { |
| rewriter.replaceOpWithNewOp<LLVM::ReturnOp>(op, operands); |
| return success(); |
| } |
| }; |
| |
| } // namespace mlir |
| |
| #endif // MLIR_CONVERSION_GPUCOMMON_GPUOPSLOWERING_H_ |