//===-- AMDGPULateCodeGenPrepare.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR *just* before instruction
/// selection.
//
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/Local.h"
#define DEBUG_TYPE "amdgpu-late-codegenprepare"
using namespace llvm;
// Scalar load widening needs to run after the load-store vectorizer, as that
// pass does not handle overlapping cases. In addition, this pass extends the
// widening to handle scalar sub-dword loads that are only naturally aligned,
// not dword aligned.
static cl::opt<bool>
WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
cl::desc("Widen sub-dword constant address space loads in "
"AMDGPULateCodeGenPrepare"),
cl::ReallyHidden, cl::init(true));
namespace {
class AMDGPULateCodeGenPrepare
: public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
Module *Mod = nullptr;
const DataLayout *DL = nullptr;
const GCNSubtarget &ST;
AssumptionCache *AC = nullptr;
UniformityInfo *UA = nullptr;
SmallVector<WeakTrackingVH, 8> DeadInsts;
public:
AMDGPULateCodeGenPrepare(Module &M, const GCNSubtarget &ST,
AssumptionCache *AC, UniformityInfo *UA)
: Mod(&M), DL(&M.getDataLayout()), ST(ST), AC(AC), UA(UA) {}
bool run(Function &F);
bool visitInstruction(Instruction &) { return false; }
// Check if the specified value is at least DWORD aligned.
bool isDWORDAligned(const Value *V) const {
KnownBits Known = computeKnownBits(V, *DL, 0, AC);
return Known.countMinTrailingZeros() >= 2;
}
bool canWidenScalarExtLoad(LoadInst &LI) const;
bool visitLoadInst(LoadInst &LI);
};
using ValueToValueMap = DenseMap<const Value *, Value *>;
class LiveRegOptimizer {
private:
Module *Mod = nullptr;
const DataLayout *DL = nullptr;
const GCNSubtarget *ST;
/// The scalar type to convert to
Type *ConvertToScalar;
/// The set of visited Instructions
SmallPtrSet<Instruction *, 4> Visited;
/// Map of Value -> Converted Value
ValueToValueMap ValMap;
/// Per-BB map of conversions from the optimal type back to the original type.
DenseMap<BasicBlock *, ValueToValueMap> BBUseValMap;
public:
/// Calculate and return the type to convert to, given a problematic \p
/// OriginalType. In some instances, we may widen the type (e.g. v2i8 -> i32).
Type *calculateConvertType(Type *OriginalType);
/// Convert the virtual register defined by \p V to a compatible vector of
/// legal type.
Value *convertToOptType(Instruction *V, BasicBlock::iterator &InstPt);
/// Convert the virtual register defined by \p V back to the original type \p
/// ConvertType, stripping away the MSBs in cases where there was an imperfect
/// fit (e.g. v2i32 -> v7i8)
Value *convertFromOptType(Type *ConvertType, Instruction *V,
BasicBlock::iterator &InstPt,
BasicBlock *InsertBlock);
/// Check for problematic PHI nodes or cross-BB values based on the value
/// defined by \p I, and coerce to legal types if necessary. For a problematic
/// PHI node, we coerce all incoming values in a single invocation.
bool optimizeLiveType(Instruction *I,
SmallVectorImpl<WeakTrackingVH> &DeadInsts);
// Whether or not the type should be replaced to avoid inefficient
// legalization code
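// For example (illustrative), a <4 x i8> is replaced because i8 is promoted
// during legalization, while a <2 x i32> is already legal and left untouched.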
bool shouldReplace(Type *ITy) {
FixedVectorType *VTy = dyn_cast<FixedVectorType>(ITy);
if (!VTy)
return false;
auto TLI = ST->getTargetLowering();
Type *EltTy = VTy->getElementType();
// If the element size is not less than the ConvertToScalar size, then we
// can't do any bit packing.
if (!EltTy->isIntegerTy() ||
EltTy->getScalarSizeInBits() > ConvertToScalar->getScalarSizeInBits())
return false;
// Only coerce illegal types
TargetLoweringBase::LegalizeKind LK =
TLI->getTypeConversion(EltTy->getContext(), EVT::getEVT(EltTy, false));
return LK.first != TargetLoweringBase::TypeLegal;
}
LiveRegOptimizer(Module *Mod, const GCNSubtarget *ST) : Mod(Mod), ST(ST) {
DL = &Mod->getDataLayout();
ConvertToScalar = Type::getInt32Ty(Mod->getContext());
}
};
} // end anonymous namespace
bool AMDGPULateCodeGenPrepare::run(Function &F) {
// "Optimize" the virtual regs that cross basic block boundaries. When
// building the SelectionDAG, vectors of illegal types that cross basic blocks
// will be scalarized and widened, with each scalar living in its
// own register. To work around this, this optimization converts the
// vectors to equivalent vectors of legal type (which are converted back
// before uses in subsequent blocks), to pack the bits into fewer physical
// registers (used in CopyToReg/CopyFromReg pairs).
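// For example (illustrative), a <4 x i8> value live across a block boundary
// can be carried as a single i32 and converted back to <4 x i8> before its
// uses in the successor block.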
LiveRegOptimizer LRO(Mod, &ST);
bool Changed = false;
bool HasScalarSubwordLoads = ST.hasScalarSubwordLoads();
for (auto &BB : reverse(F))
for (Instruction &I : make_early_inc_range(reverse(BB))) {
Changed |= !HasScalarSubwordLoads && visit(I);
Changed |= LRO.optimizeLiveType(&I, DeadInsts);
}
RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts);
return Changed;
}
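// Illustrative examples: a <3 x i8> (24 bits) converts to i32, while a
// <7 x i8> (56 bits) needs the wider <2 x i32>.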
Type *LiveRegOptimizer::calculateConvertType(Type *OriginalType) {
assert(OriginalType->getScalarSizeInBits() <=
ConvertToScalar->getScalarSizeInBits());
FixedVectorType *VTy = cast<FixedVectorType>(OriginalType);
TypeSize OriginalSize = DL->getTypeSizeInBits(VTy);
TypeSize ConvertScalarSize = DL->getTypeSizeInBits(ConvertToScalar);
unsigned ConvertEltCount =
(OriginalSize + ConvertScalarSize - 1) / ConvertScalarSize;
if (OriginalSize <= ConvertScalarSize)
return IntegerType::get(Mod->getContext(), ConvertScalarSize);
return VectorType::get(Type::getIntNTy(Mod->getContext(), ConvertScalarSize),
ConvertEltCount, false);
}
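// For an imperfect fit (illustrative), a <3 x i8> is first widened to <4 x i8>
// by a shufflevector that pads the extra lane with poison, and the result is
// then bitcast to the i32 chosen by calculateConvertType.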
Value *LiveRegOptimizer::convertToOptType(Instruction *V,
BasicBlock::iterator &InsertPt) {
FixedVectorType *VTy = cast<FixedVectorType>(V->getType());
Type *NewTy = calculateConvertType(V->getType());
TypeSize OriginalSize = DL->getTypeSizeInBits(VTy);
TypeSize NewSize = DL->getTypeSizeInBits(NewTy);
IRBuilder<> Builder(V->getParent(), InsertPt);
// If there is a bitsize match, we can fit the old vector into a new vector of
// the desired type.
if (OriginalSize == NewSize)
return Builder.CreateBitCast(V, NewTy, V->getName() + ".bc");
// If there is a bitsize mismatch, we must use a wider vector.
assert(NewSize > OriginalSize);
uint64_t ExpandedVecElementCount = NewSize / VTy->getScalarSizeInBits();
SmallVector<int, 8> ShuffleMask;
uint64_t OriginalElementCount = VTy->getElementCount().getFixedValue();
for (unsigned I = 0; I < OriginalElementCount; I++)
ShuffleMask.push_back(I);
for (uint64_t I = OriginalElementCount; I < ExpandedVecElementCount; I++)
ShuffleMask.push_back(OriginalElementCount);
Value *ExpandedVec = Builder.CreateShuffleVector(V, ShuffleMask);
return Builder.CreateBitCast(ExpandedVec, NewTy, V->getName() + ".bc");
}
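// For example (illustrative), a <7 x i8> that was carried as <2 x i32> is
// bitcast to <8 x i8> and then shuffled down to <7 x i8>, dropping the padding
// in the MSBs.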
Value *LiveRegOptimizer::convertFromOptType(Type *ConvertType, Instruction *V,
BasicBlock::iterator &InsertPt,
BasicBlock *InsertBB) {
FixedVectorType *NewVTy = cast<FixedVectorType>(ConvertType);
TypeSize OriginalSize = DL->getTypeSizeInBits(V->getType());
TypeSize NewSize = DL->getTypeSizeInBits(NewVTy);
IRBuilder<> Builder(InsertBB, InsertPt);
// If there is a bitsize match, we simply convert back to the original type.
if (OriginalSize == NewSize)
return Builder.CreateBitCast(V, NewVTy, V->getName() + ".bc");
// If there is a bitsize mismatch, then we must have used a wider value to
// hold the bits.
assert(OriginalSize > NewSize);
// For wide scalars, we can just truncate the value.
if (!V->getType()->isVectorTy()) {
Instruction *Trunc = cast<Instruction>(
Builder.CreateTrunc(V, IntegerType::get(Mod->getContext(), NewSize)));
return cast<Instruction>(Builder.CreateBitCast(Trunc, NewVTy));
}
// For wider vectors, we must strip the MSBs to convert back to the original
// type.
VectorType *ExpandedVT = VectorType::get(
Type::getIntNTy(Mod->getContext(), NewVTy->getScalarSizeInBits()),
(OriginalSize / NewVTy->getScalarSizeInBits()), false);
Instruction *Converted =
cast<Instruction>(Builder.CreateBitCast(V, ExpandedVT));
unsigned NarrowElementCount = NewVTy->getElementCount().getFixedValue();
SmallVector<int, 8> ShuffleMask(NarrowElementCount);
std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0);
return Builder.CreateShuffleVector(Converted, ShuffleMask);
}
bool LiveRegOptimizer::optimizeLiveType(
Instruction *I, SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
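// The worklist below gathers the transitive web of problematic PHI nodes plus
// the cross-block defs and uses reachable from I. The code after the loop then
// coerces the defs, rebuilds the PHIs in the converted type, and converts
// values back to their original type at each use.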
SmallVector<Instruction *, 4> Worklist;
SmallPtrSet<PHINode *, 4> PhiNodes;
SmallPtrSet<Instruction *, 4> Defs;
SmallPtrSet<Instruction *, 4> Uses;
Worklist.push_back(cast<Instruction>(I));
while (!Worklist.empty()) {
Instruction *II = Worklist.pop_back_val();
if (!Visited.insert(II).second)
continue;
if (!shouldReplace(II->getType()))
continue;
if (PHINode *Phi = dyn_cast<PHINode>(II)) {
PhiNodes.insert(Phi);
// Collect all the incoming values of problematic PHI nodes.
for (Value *V : Phi->incoming_values()) {
// Repeat the collection process for newly found PHI nodes.
if (PHINode *OpPhi = dyn_cast<PHINode>(V)) {
if (!PhiNodes.count(OpPhi) && !Visited.count(OpPhi))
Worklist.push_back(OpPhi);
continue;
}
Instruction *IncInst = dyn_cast<Instruction>(V);
// Other incoming value types (e.g. vector literals) are unhandled
if (!IncInst && !isa<ConstantAggregateZero>(V))
return false;
// Collect all other incoming values for coercion.
if (IncInst)
Defs.insert(IncInst);
}
}
// Collect all relevant uses.
for (User *V : II->users()) {
// Repeat the collection process for problematic PHI nodes.
if (PHINode *OpPhi = dyn_cast<PHINode>(V)) {
if (!PhiNodes.count(OpPhi) && !Visited.count(OpPhi))
Worklist.push_back(OpPhi);
continue;
}
Instruction *UseInst = cast<Instruction>(V);
// Collect all uses of PHI nodes and any use that crosses BB boundaries.
if (UseInst->getParent() != II->getParent() || isa<PHINode>(II)) {
Uses.insert(UseInst);
if (!Defs.count(II) && !isa<PHINode>(II)) {
Defs.insert(II);
}
}
}
}
// Coerce and track the defs.
for (Instruction *D : Defs) {
if (!ValMap.contains(D)) {
BasicBlock::iterator InsertPt = std::next(D->getIterator());
Value *ConvertVal = convertToOptType(D, InsertPt);
assert(ConvertVal);
ValMap[D] = ConvertVal;
}
}
// Construct new-typed PHI nodes.
for (PHINode *Phi : PhiNodes) {
ValMap[Phi] = PHINode::Create(calculateConvertType(Phi->getType()),
Phi->getNumIncomingValues(),
Phi->getName() + ".tc", Phi->getIterator());
}
// Connect all the PHI nodes with their new incoming values.
for (PHINode *Phi : PhiNodes) {
PHINode *NewPhi = cast<PHINode>(ValMap[Phi]);
bool MissingIncVal = false;
for (int I = 0, E = Phi->getNumIncomingValues(); I < E; I++) {
Value *IncVal = Phi->getIncomingValue(I);
if (isa<ConstantAggregateZero>(IncVal)) {
Type *NewType = calculateConvertType(Phi->getType());
NewPhi->addIncoming(ConstantInt::get(NewType, 0, false),
Phi->getIncomingBlock(I));
} else if (ValMap.contains(IncVal) && ValMap[IncVal])
NewPhi->addIncoming(ValMap[IncVal], Phi->getIncomingBlock(I));
else
MissingIncVal = true;
}
if (MissingIncVal) {
Value *DeadVal = ValMap[Phi];
// The coercion chain of the PHI is broken. Delete the Phi
// from the ValMap and any connected / user Phis.
SmallVector<Value *, 4> PHIWorklist;
SmallPtrSet<Value *, 4> VisitedPhis;
PHIWorklist.push_back(DeadVal);
while (!PHIWorklist.empty()) {
Value *NextDeadValue = PHIWorklist.pop_back_val();
VisitedPhis.insert(NextDeadValue);
auto OriginalPhi =
std::find_if(PhiNodes.begin(), PhiNodes.end(),
[this, &NextDeadValue](PHINode *CandPhi) {
return ValMap[CandPhi] == NextDeadValue;
});
// This PHI may have already been removed from maps when
// unwinding a previous Phi
if (OriginalPhi != PhiNodes.end())
ValMap.erase(*OriginalPhi);
DeadInsts.emplace_back(cast<Instruction>(NextDeadValue));
for (User *U : NextDeadValue->users()) {
if (!VisitedPhis.contains(cast<PHINode>(U)))
PHIWorklist.push_back(U);
}
}
} else {
DeadInsts.emplace_back(cast<Instruction>(Phi));
}
}
// Coerce back to the original type and replace the uses.
for (Instruction *U : Uses) {
// Replace all converted operands for a use.
for (auto [OpIdx, Op] : enumerate(U->operands())) {
if (ValMap.contains(Op) && ValMap[Op]) {
Value *NewVal = nullptr;
if (BBUseValMap.contains(U->getParent()) &&
BBUseValMap[U->getParent()].contains(ValMap[Op]))
NewVal = BBUseValMap[U->getParent()][ValMap[Op]];
else {
BasicBlock::iterator InsertPt = U->getParent()->getFirstNonPHIIt();
// We may pick up ops that were previously converted for users in
// other blocks. If there is an originally typed definition of the Op
// already in this block, simply reuse it.
if (isa<Instruction>(Op) && !isa<PHINode>(Op) &&
U->getParent() == cast<Instruction>(Op)->getParent()) {
NewVal = Op;
} else {
NewVal =
convertFromOptType(Op->getType(), cast<Instruction>(ValMap[Op]),
InsertPt, U->getParent());
BBUseValMap[U->getParent()][ValMap[Op]] = NewVal;
}
}
assert(NewVal);
U->setOperand(OpIdx, NewVal);
}
}
}
return true;
}
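// Return true if LI is a uniform, simple, naturally aligned, sub-DWORD load
// from the constant address space, making it a candidate for widening into a
// DWORD load.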
bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
unsigned AS = LI.getPointerAddressSpace();
// Skip non-constant address space.
if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
return false;
// Skip non-simple loads.
if (!LI.isSimple())
return false;
Type *Ty = LI.getType();
// Skip aggregate types.
if (Ty->isAggregateType())
return false;
unsigned TySize = DL->getTypeStoreSize(Ty);
// Only handle sub-DWORD loads.
if (TySize >= 4)
return false;
// That load must be at least naturally aligned.
if (LI.getAlign() < DL->getABITypeAlign(Ty))
return false;
// It should be uniform, i.e. a scalar load.
return UA->isUniform(&LI);
}
bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
if (!WidenLoads)
return false;
// Skip if the load is already at least DWORD aligned, as that case is
// handled in SDAG.
if (LI.getAlign() >= 4)
return false;
if (!canWidenScalarExtLoad(LI))
return false;
int64_t Offset = 0;
auto *Base =
GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, *DL);
// If that base is not DWORD aligned, it's not safe to perform the following
// transforms.
if (!isDWORDAligned(Base))
return false;
int64_t Adjust = Offset & 0x3;
if (Adjust == 0) {
// With a zero adjust, the original alignment can simply be promoted to
// DWORD alignment.
LI.setAlignment(Align(4));
return true;
}
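// Otherwise, load the containing DWORD and extract the requested sub-dword
// value from it. For example (illustrative), an i16 load at offset 6 from a
// DWORD-aligned base becomes an i32 load at offset 4, followed by a logical
// shift right by 16 bits and a truncation back to i16.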
IRBuilder<> IRB(&LI);
IRB.SetCurrentDebugLocation(LI.getDebugLoc());
unsigned LdBits = DL->getTypeStoreSizeInBits(LI.getType());
auto IntNTy = Type::getIntNTy(LI.getContext(), LdBits);
auto *NewPtr = IRB.CreateConstGEP1_64(
IRB.getInt8Ty(),
IRB.CreateAddrSpaceCast(Base, LI.getPointerOperand()->getType()),
Offset - Adjust);
LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr, Align(4));
NewLd->copyMetadata(LI);
NewLd->setMetadata(LLVMContext::MD_range, nullptr);
unsigned ShAmt = Adjust * 8;
auto *NewVal = IRB.CreateBitCast(
IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
LI.replaceAllUsesWith(NewVal);
DeadInsts.emplace_back(&LI);
return true;
}
PreservedAnalyses
AMDGPULateCodeGenPreparePass::run(Function &F, FunctionAnalysisManager &FAM) {
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
AssumptionCache &AC = FAM.getResult<AssumptionAnalysis>(F);
UniformityInfo &UI = FAM.getResult<UniformityInfoAnalysis>(F);
AMDGPULateCodeGenPrepare Impl(*F.getParent(), ST, &AC, &UI);
bool Changed = Impl.run(F);
PreservedAnalyses PA = PreservedAnalyses::none();
if (!Changed)
return PA;
PA.preserveSet<CFGAnalyses>();
return PA;
}
class AMDGPULateCodeGenPrepareLegacy : public FunctionPass {
public:
static char ID;
AMDGPULateCodeGenPrepareLegacy() : FunctionPass(ID) {}
StringRef getPassName() const override {
return "AMDGPU IR late optimizations";
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetPassConfig>();
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<UniformityInfoWrapperPass>();
AU.setPreservesAll();
}
bool runOnFunction(Function &F) override;
};
bool AMDGPULateCodeGenPrepareLegacy::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
const TargetMachine &TM = TPC.getTM<TargetMachine>();
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
AssumptionCache &AC =
getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
UniformityInfo &UI =
getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
AMDGPULateCodeGenPrepare Impl(*F.getParent(), ST, &AC, &UI);
return Impl.run(F);
}
INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepareLegacy, DEBUG_TYPE,
"AMDGPU IR late optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPULateCodeGenPrepareLegacy, DEBUG_TYPE,
"AMDGPU IR late optimizations", false, false)
char AMDGPULateCodeGenPrepareLegacy::ID = 0;
FunctionPass *llvm::createAMDGPULateCodeGenPrepareLegacyPass() {
return new AMDGPULateCodeGenPrepareLegacy();
}