// mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h - toolchain/llvm-project - Git at Google

 //===- GPUOpsLowering.h - GPU FuncOp / ReturnOp lowering -------*- C++ -*--===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 #ifndef MLIR_CONVERSION_GPUCOMMON_GPUOPSLOWERING_H_
 #define MLIR_CONVERSION_GPUCOMMON_GPUOPSLOWERING_H_

 #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
 #include "mlir/Dialect/GPU/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/IR/Builders.h"
 #include "llvm/Support/FormatVariadic.h"

 namespace mlir {

 /// Conversion pattern that rewrites a `gpu::GPUFuncOp` into an
 /// `LLVM::LLVMFuncOp`.
 ///
 /// Every workgroup attribution is backed by an internal-linkage LLVM global
 /// array named `__wg_<function>_<index>`, and every private attribution by an
 /// `LLVM::AllocaOp` whose pointer type uses address space `AllocaAddrSpace`.
 /// Both kinds of buffers are wrapped into static-shape memref descriptors that
 /// replace the matching entry block arguments.
 template <unsigned AllocaAddrSpace>
 struct GPUFuncOpLowering : ConvertToLLVMPattern {
   explicit GPUFuncOpLowering(LLVMTypeConverter &typeConverter)
       : ConvertToLLVMPattern(gpu::GPUFuncOp::getOperationName(),
                              typeConverter.getDialect()->getContext(),
                              typeConverter) {}

   /// Performs the rewrite of `op`, which must be a `gpu::GPUFuncOp` without
   /// operands. Returns failure only if the region type conversion fails.
   LogicalResult
   matchAndRewrite(Operation *op, ArrayRef<Value> operands,
                   ConversionPatternRewriter &rewriter) const override {
     assert(operands.empty() && "func op is not expected to have operands");
     auto funcOp = cast<gpu::GPUFuncOp>(op);
     Location loc = funcOp.getLoc();
     const unsigned workgroupCount = funcOp.getNumWorkgroupAttributions();

     // Materialize one global array per workgroup attribution.
     SmallVector<LLVM::GlobalOp, 3> sharedGlobals;
     sharedGlobals.reserve(workgroupCount);
     for (auto indexed : llvm::enumerate(funcOp.getWorkgroupAttributions())) {
       Value attribution = indexed.value();
       auto memrefType = attribution.getType().dyn_cast<MemRefType>();
       assert(memrefType && memrefType.hasStaticShape() &&
              "unexpected type in attribution");

       uint64_t elementCount = memrefType.getNumElements();
       auto llvmElementType =
           typeConverter->convertType(memrefType.getElementType())
               .template cast<LLVM::LLVMType>();
       auto bufferType =
           LLVM::LLVMType::getArrayTy(llvmElementType, elementCount);
       std::string symbolName =
           llvm::formatv("__wg_{0}_{1}", funcOp.getName(), indexed.index())
               .str();
       sharedGlobals.push_back(rewriter.create<LLVM::GlobalOp>(
           loc, bufferType, /*isConstant=*/false, LLVM::Linkage::Internal,
           symbolName, /*value=*/Attribute(),
           gpu::GPUDialect::getWorkgroupAddressSpace()));
     }

     // Compute the LLVM type of the replacement function.
     auto llvmFuncType = typeConverter->convertType(funcOp.getType())
                             .template cast<LLVM::LLVMType>()
                             .getPointerElementTy();

     // Record how the proper (non-attribution) arguments are converted.
     TypeConverter::SignatureConversion argMapping(
         funcOp.front().getNumArguments());
     getTypeConverter()->convertFunctionSignature(
         funcOp.getType(), /*isVariadic=*/false, argMapping);

     // Forward every attribute except the ones that only model the function
     // itself (symbol name, function type, workgroup attribution count).
     SmallVector<NamedAttribute, 4> forwardedAttrs;
     for (const auto &namedAttr : funcOp.getAttrs()) {
       const bool isFunctionModelingAttr =
           namedAttr.first == SymbolTable::getSymbolAttrName() ||
           namedAttr.first == impl::getTypeAttrName() ||
           namedAttr.first ==
               gpu::GPUFuncOp::getNumWorkgroupAttributionsAttrName();
       if (!isFunctionModelingAttr)
         forwardedAttrs.push_back(namedAttr);
     }
     auto newFuncOp = rewriter.create<LLVM::LLVMFuncOp>(
         loc, funcOp.getName(), llvmFuncType, LLVM::Linkage::External,
         forwardedAttrs);

     {
       // The buffers standing in for the attributions are emitted at the top
       // of the *original* function body, before that body is moved into the
       // new function, so that the block arguments are still owned by the
       // operation that gives them their meaning.
       OpBuilder::InsertionGuard restoreInsertionPoint(rewriter);
       rewriter.setInsertionPointToStart(&funcOp.front());

       unsigned numProperArguments = funcOp.getNumArguments();
       auto i32Ty = LLVM::LLVMType::getInt32Ty(rewriter.getContext());

       // Workgroup attributions: take the address of the first element of the
       // corresponding global. The zero index is only created when needed.
       Value zeroIndex = nullptr;
       if (!sharedGlobals.empty())
         zeroIndex = rewriter.create<LLVM::ConstantOp>(
             loc, i32Ty, rewriter.getI32IntegerAttr(0));
       for (auto indexed : llvm::enumerate(sharedGlobals)) {
         LLVM::GlobalOp globalBuffer = indexed.value();
         Value globalAddress =
             rewriter.create<LLVM::AddressOfOp>(loc, globalBuffer);
         auto llvmElementType = globalBuffer.getType().getArrayElementType();
         auto elementPtrType =
             llvmElementType.getPointerTo(globalBuffer.addr_space());
         Value firstElement = rewriter.create<LLVM::GEPOp>(
             loc, elementPtrType, globalAddress,
             ArrayRef<Value>{zeroIndex, zeroIndex});

         // Wrap the pointer in a memref descriptor so that the regular memref
         // lowering applies. The sizes are static, so this may occupy more
         // registers than strictly required until it is canonicalized away.
         Value attribution =
             funcOp.getWorkgroupAttributions()[indexed.index()];
         auto memrefType = attribution.getType().cast<MemRefType>();
         auto descriptor = MemRefDescriptor::fromStaticShape(
             rewriter, loc, *getTypeConverter(), memrefType, firstElement);
         argMapping.remapInput(numProperArguments + indexed.index(),
                               descriptor);
       }

       // Private attributions: allocate each buffer with an alloca.
       auto i64Ty = LLVM::LLVMType::getInt64Ty(rewriter.getContext());
       for (auto indexed : llvm::enumerate(funcOp.getPrivateAttributions())) {
         Value attribution = indexed.value();
         auto memrefType = attribution.getType().cast<MemRefType>();
         assert(memrefType && memrefType.hasStaticShape() &&
                "unexpected type in attribution");

         // The memref's own memory space is deliberately ignored here; the
         // pointer uses `AllocaAddrSpace` instead, since NVVM models private
         // memory as `alloca`s in the default memory space and does not
         // support `alloca`s with addrspace(5).
         auto allocaPtrType =
             typeConverter->convertType(memrefType.getElementType())
                 .template cast<LLVM::LLVMType>()
                 .getPointerTo(AllocaAddrSpace);
         Value elementCount = rewriter.create<LLVM::ConstantOp>(
             loc, i64Ty, rewriter.getI64IntegerAttr(memrefType.getNumElements()));
         Value privateBuffer = rewriter.create<LLVM::AllocaOp>(
             loc, allocaPtrType, elementCount, /*alignment=*/0);
         auto descriptor = MemRefDescriptor::fromStaticShape(
             rewriter, loc, *getTypeConverter(), memrefType, privateBuffer);
         argMapping.remapInput(
             numProperArguments + workgroupCount + indexed.index(), descriptor);
       }
     }

     // Transfer the body to the new function and rewrite the entry block
     // signature according to the recorded argument mapping.
     rewriter.inlineRegionBefore(funcOp.getBody(), newFuncOp.getBody(),
                                 newFuncOp.end());
     if (failed(rewriter.convertRegionTypes(&newFuncOp.getBody(),
                                            *typeConverter, &argMapping)))
       return failure();

     rewriter.eraseOp(funcOp);
     return success();
   }
 };

 /// Conversion pattern that replaces a `gpu::ReturnOp` with an
 /// `LLVM::ReturnOp` taking the already-converted operands.
 struct GPUReturnOpLowering : public ConvertToLLVMPattern {
   /// Registers the pattern for `gpu::ReturnOp`. Marked `explicit`, like the
   /// GPUFuncOpLowering constructor, so that an LLVMTypeConverter is never
   /// implicitly converted into a pattern.
   explicit GPUReturnOpLowering(LLVMTypeConverter &typeConverter)
       : ConvertToLLVMPattern(gpu::ReturnOp::getOperationName(),
                              typeConverter.getDialect()->getContext(),
                              typeConverter) {}

   /// Replaces `op` with an `LLVM::ReturnOp` built from `operands`; always
   /// succeeds.
   LogicalResult
   matchAndRewrite(Operation *op, ArrayRef<Value> operands,
                   ConversionPatternRewriter &rewriter) const override {
     rewriter.replaceOpWithNewOp<LLVM::ReturnOp>(op, operands);
     return success();
   }
 };

 } // namespace mlir

 #endif // MLIR_CONVERSION_GPUCOMMON_GPUOPSLOWERING_H_
	//===- GPUOpsLowering.h - GPU FuncOp / ReturnOp lowering -------*- C++ -*--===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	#ifndef MLIR_CONVERSION_GPUCOMMON_GPUOPSLOWERING_H_
	#define MLIR_CONVERSION_GPUCOMMON_GPUOPSLOWERING_H_

	#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
	#include "mlir/Dialect/GPU/GPUDialect.h"
	#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
	#include "mlir/Dialect/StandardOps/IR/Ops.h"
	#include "mlir/IR/Builders.h"
	#include "llvm/Support/FormatVariadic.h"

	namespace mlir {

	/// Lowers a `gpu::GPUFuncOp` to an `LLVM::LLVMFuncOp`.
	///
	/// Workgroup attributions are turned into internal LLVM globals placed in the
	/// GPU dialect's workgroup address space; private attributions are turned
	/// into `LLVM::AllocaOp`s whose pointer type uses address space
	/// `AllocaAddrSpace`. Each buffer is exposed to the function body through a
	/// static-shape memref descriptor that replaces the original block argument.
	template <unsigned AllocaAddrSpace>
	struct GPUFuncOpLowering : ConvertToLLVMPattern {
	  explicit GPUFuncOpLowering(LLVMTypeConverter &typeConverter)
	      : ConvertToLLVMPattern(gpu::GPUFuncOp::getOperationName(),
	                             typeConverter.getDialect()->getContext(),
	                             typeConverter) {}

	  /// Rewrites `op` (a `gpu::GPUFuncOp`, expected to have no operands).
	  /// Returns failure if converting the types of the moved region fails.
	  LogicalResult
	  matchAndRewrite(Operation *op, ArrayRef<Value> operands,
	                  ConversionPatternRewriter &rewriter) const override {
	    assert(operands.empty() && "func op is not expected to have operands");
	    auto gpuFuncOp = cast<gpu::GPUFuncOp>(op);
	    Location loc = gpuFuncOp.getLoc();

	    // Create one global array per workgroup attribution, sized by the
	    // number of elements of the (statically shaped) memref.
	    SmallVector<LLVM::GlobalOp, 3> workgroupBuffers;
	    workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions());
	    for (auto en : llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) {
	      Value attribution = en.value();

	      auto type = attribution.getType().dyn_cast<MemRefType>();
	      assert(type && type.hasStaticShape() && "unexpected type in attribution");

	      uint64_t numElements = type.getNumElements();

	      auto elementType = typeConverter->convertType(type.getElementType())
	                             .template cast<LLVM::LLVMType>();
	      auto arrayType = LLVM::LLVMType::getArrayTy(elementType, numElements);
	      std::string name = std::string(
	          llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), en.index()));
	      auto globalOp = rewriter.create<LLVM::GlobalOp>(
	          gpuFuncOp.getLoc(), arrayType, /*isConstant=*/false,
	          LLVM::Linkage::Internal, name, /*value=*/Attribute(),
	          gpu::GPUDialect::getWorkgroupAddressSpace());
	      workgroupBuffers.push_back(globalOp);
	    }

	    // Rewrite the original GPU function to an LLVM function.
	    auto funcType = typeConverter->convertType(gpuFuncOp.getType())
	                        .template cast<LLVM::LLVMType>()
	                        .getPointerElementTy();

	    // Remap proper input types.
	    TypeConverter::SignatureConversion signatureConversion(
	        gpuFuncOp.front().getNumArguments());
	    getTypeConverter()->convertFunctionSignature(
	        gpuFuncOp.getType(), /*isVariadic=*/false, signatureConversion);

	    // Create the new function operation. Only copy those attributes that are
	    // not specific to function modeling.
	    SmallVector<NamedAttribute, 4> attributes;
	    for (const auto &attr : gpuFuncOp.getAttrs()) {
	      if (attr.first == SymbolTable::getSymbolAttrName() ||
	          attr.first == impl::getTypeAttrName() ||
	          attr.first == gpu::GPUFuncOp::getNumWorkgroupAttributionsAttrName())
	        continue;
	      attributes.push_back(attr);
	    }
	    auto llvmFuncOp = rewriter.create<LLVM::LLVMFuncOp>(
	        gpuFuncOp.getLoc(), gpuFuncOp.getName(), funcType,
	        LLVM::Linkage::External, attributes);

	    {
	      // Insert operations that correspond to converted workgroup and private
	      // memory attributions to the body of the function. This must operate on
	      // the original function, before the body region is inlined in the new
	      // function to maintain the relation between block arguments and the
	      // parent operation that assigns their semantics.
	      OpBuilder::InsertionGuard guard(rewriter);

	      // Rewrite workgroup memory attributions to addresses of global buffers.
	      rewriter.setInsertionPointToStart(&gpuFuncOp.front());
	      unsigned numProperArguments = gpuFuncOp.getNumArguments();
	      auto i32Type = LLVM::LLVMType::getInt32Ty(rewriter.getContext());

	      // The zero constant is only needed as a GEP index, so it is created
	      // only when there is at least one workgroup buffer.
	      Value zero = nullptr;
	      if (!workgroupBuffers.empty())
	        zero = rewriter.create<LLVM::ConstantOp>(loc, i32Type,
	                                                 rewriter.getI32IntegerAttr(0));
	      for (auto en : llvm::enumerate(workgroupBuffers)) {
	        LLVM::GlobalOp global = en.value();
	        Value address = rewriter.create<LLVM::AddressOfOp>(loc, global);
	        auto elementType = global.getType().getArrayElementType();
	        Value memory = rewriter.create<LLVM::GEPOp>(
	            loc, elementType.getPointerTo(global.addr_space()), address,
	            ArrayRef<Value>{zero, zero});

	        // Build a memref descriptor pointing to the buffer to plug with the
	        // existing memref infrastructure. This may use more registers than
	        // otherwise necessary given that memref sizes are fixed, but we can try
	        // and canonicalize that away later.
	        Value attribution = gpuFuncOp.getWorkgroupAttributions()[en.index()];
	        auto type = attribution.getType().cast<MemRefType>();
	        auto descr = MemRefDescriptor::fromStaticShape(
	            rewriter, loc, *getTypeConverter(), type, memory);
	        signatureConversion.remapInput(numProperArguments + en.index(), descr);
	      }

	      // Rewrite private memory attributions to alloca'ed buffers.
	      unsigned numWorkgroupAttributions =
	          gpuFuncOp.getNumWorkgroupAttributions();
	      auto int64Ty = LLVM::LLVMType::getInt64Ty(rewriter.getContext());
	      for (auto en : llvm::enumerate(gpuFuncOp.getPrivateAttributions())) {
	        Value attribution = en.value();
	        auto type = attribution.getType().cast<MemRefType>();
	        assert(type && type.hasStaticShape() &&
	               "unexpected type in attribution");

	        // Explicitly drop memory space when lowering private memory
	        // attributions since NVVM models it as `alloca`s in the default
	        // memory space and does not support `alloca`s with addrspace(5).
	        auto ptrType = typeConverter->convertType(type.getElementType())
	                           .template cast<LLVM::LLVMType>()
	                           .getPointerTo(AllocaAddrSpace);
	        Value numElements = rewriter.create<LLVM::ConstantOp>(
	            gpuFuncOp.getLoc(), int64Ty,
	            rewriter.getI64IntegerAttr(type.getNumElements()));
	        Value allocated = rewriter.create<LLVM::AllocaOp>(
	            gpuFuncOp.getLoc(), ptrType, numElements, /*alignment=*/0);
	        auto descr = MemRefDescriptor::fromStaticShape(
	            rewriter, loc, *getTypeConverter(), type, allocated);
	        // Private attributions follow the proper arguments and the workgroup
	        // attributions in the entry block argument list.
	        signatureConversion.remapInput(
	            numProperArguments + numWorkgroupAttributions + en.index(), descr);
	      }
	    }

	    // Move the region to the new function, update the entry block signature.
	    rewriter.inlineRegionBefore(gpuFuncOp.getBody(), llvmFuncOp.getBody(),
	                                llvmFuncOp.end());
	    if (failed(rewriter.convertRegionTypes(
	            &llvmFuncOp.getBody(), *typeConverter, &signatureConversion)))
	      return failure();

	    rewriter.eraseOp(gpuFuncOp);
	    return success();
	  }
	};

	/// Lowers a `gpu::ReturnOp` to an `LLVM::ReturnOp` that forwards the
	/// converted operands unchanged.
	struct GPUReturnOpLowering : public ConvertToLLVMPattern {
	  /// Registers the pattern for `gpu::ReturnOp`. `explicit` matches the
	  /// GPUFuncOpLowering constructor and rules out an implicit conversion from
	  /// LLVMTypeConverter to a pattern object.
	  explicit GPUReturnOpLowering(LLVMTypeConverter &typeConverter)
	      : ConvertToLLVMPattern(gpu::ReturnOp::getOperationName(),
	                             typeConverter.getDialect()->getContext(),
	                             typeConverter) {}

	  /// Replaces `op` in place with an `LLVM::ReturnOp` over `operands`; this
	  /// rewrite always reports success.
	  LogicalResult
	  matchAndRewrite(Operation *op, ArrayRef<Value> operands,
	                  ConversionPatternRewriter &rewriter) const override {
	    rewriter.replaceOpWithNewOp<LLVM::ReturnOp>(op, operands);
	    return success();
	  }
	};

	} // namespace mlir

	#endif // MLIR_CONVERSION_GPUCOMMON_GPUOPSLOWERING_H_