//===-- AMDGPULateCodeGenPrepare.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR *just* before instruction
/// selection.
//
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/Local.h"
#define DEBUG_TYPE "amdgpu-late-codegenprepare"
using namespace llvm;
// Scalar load widening needs to run after the load-store vectorizer, as that
// pass does not handle overlapping cases. In addition, this pass extends the
// widening to handle scalar sub-dword loads that are only naturally aligned,
// not dword aligned.
static cl::opt<bool>
WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
cl::desc("Widen sub-dword constant address space loads in "
"AMDGPULateCodeGenPrepare"),
cl::ReallyHidden, cl::init(true));
namespace {
class AMDGPULateCodeGenPrepare
: public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
Module *Mod = nullptr;
const DataLayout *DL = nullptr;
const GCNSubtarget &ST;
AssumptionCache *AC = nullptr;
UniformityInfo *UA = nullptr;
SmallVector<WeakTrackingVH, 8> DeadInsts;
public:
AMDGPULateCodeGenPrepare(Module &M, const GCNSubtarget &ST,
AssumptionCache *AC, UniformityInfo *UA)
: Mod(&M), DL(&M.getDataLayout()), ST(ST), AC(AC), UA(UA) {}
bool run(Function &F);
bool visitInstruction(Instruction &) { return false; }
// Check if the specified value is at least DWORD aligned.
bool isDWORDAligned(const Value *V) const {
KnownBits Known = computeKnownBits(V, *DL, 0, AC);
return Known.countMinTrailingZeros() >= 2;
}
bool canWidenScalarExtLoad(LoadInst &LI) const;
bool visitLoadInst(LoadInst &LI);
};
using ValueToValueMap = DenseMap<const Value *, Value *>;
class LiveRegOptimizer {
private:
Module *Mod = nullptr;
const DataLayout *DL = nullptr;
const GCNSubtarget *ST;
/// The scalar type to convert to
Type *ConvertToScalar;
/// The set of visited Instructions
SmallPtrSet<Instruction *, 4> Visited;
/// Map of Value -> Converted Value
ValueToValueMap ValMap;
/// Per-BB map of conversions from the optimal type back to the original type.
DenseMap<BasicBlock *, ValueToValueMap> BBUseValMap;
public:
/// Calculate and return the type to convert to, given a problematic \p
/// OriginalType. In some instances, we may widen the type (e.g. v2i8 -> i32).
Type *calculateConvertType(Type *OriginalType);
/// Convert the virtual register defined by \p V to a compatible vector of
/// legal type.
Value *convertToOptType(Instruction *V, BasicBlock::iterator &InstPt);
/// Convert the virtual register defined by \p V back to the original type \p
/// ConvertType, stripping away the MSBs in cases where there was an imperfect
/// fit (e.g. v2i32 -> v7i8)
Value *convertFromOptType(Type *ConvertType, Instruction *V,
BasicBlock::iterator &InstPt,
BasicBlock *InsertBlock);
/// Check for problematic PHI nodes or cross-BB values based on the value
/// defined by \p I, and coerce to legal types if necessary. For a problematic
/// PHI node, we coerce all incoming values in a single invocation.
bool optimizeLiveType(Instruction *I,
SmallVectorImpl<WeakTrackingVH> &DeadInsts);
// Whether or not the type should be replaced to avoid inefficient
// legalization code
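// For example (illustrative), a <4 x i8> is replaced because i8 is promoted
// during legalization, while a <2 x i32> is already legal and left untouched.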
bool shouldReplace(Type *ITy) {
FixedVectorType *VTy = dyn_cast<FixedVectorType>(ITy);
if (!VTy)
return false;
auto TLI = ST->getTargetLowering();
Type *EltTy = VTy->getElementType();
// If the element size is not less than the ConvertToScalar size, then we
// can't do any bit packing.
if (!EltTy->isIntegerTy() ||
EltTy->getScalarSizeInBits() > ConvertToScalar->getScalarSizeInBits())
return false;
// Only coerce illegal types
TargetLoweringBase::LegalizeKind LK =
TLI->getTypeConversion(EltTy->getContext(), EVT::getEVT(EltTy, false));
return LK.first != TargetLoweringBase::TypeLegal;
}
LiveRegOptimizer(Module *Mod, const GCNSubtarget *ST) : Mod(Mod), ST(ST) {
DL = &Mod->getDataLayout();
ConvertToScalar = Type::getInt32Ty(Mod->getContext());
}
};
} // end anonymous namespace
bool AMDGPULateCodeGenPrepare::run(Function &F) {
// "Optimize" the virtual regs that cross basic block boundaries. When
// building the SelectionDAG, vectors of illegal types that cross basic blocks
// will be scalarized and widened, with each scalar living in its
// own register. To work around this, this optimization converts the
// vectors to equivalent vectors of legal type (which are converted back
// before uses in subsequent blocks), to pack the bits into fewer physical
// registers (used in CopyToReg/CopyFromReg pairs).
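// For example (illustrative), a <4 x i8> value live across a block boundary
// can be carried as a single i32 and converted back to <4 x i8> before its
// uses in the successor block.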
LiveRegOptimizer LRO(Mod, &ST);
bool Changed = false;
bool HasScalarSubwordLoads = ST.hasScalarSubwordLoads();
for (auto &BB : reverse(F))
for (Instruction &I : make_early_inc_range(reverse(BB))) {
Changed |= !HasScalarSubwordLoads && visit(I);
Changed |= LRO.optimizeLiveType(&I, DeadInsts);
}
RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts);
return Changed;
}
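// Illustrative examples: a <3 x i8> (24 bits) converts to i32, while a
// <7 x i8> (56 bits) needs the wider <2 x i32>.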
Type *LiveRegOptimizer::calculateConvertType(Type *OriginalType) {
assert(OriginalType->getScalarSizeInBits() <=
ConvertToScalar->getScalarSizeInBits());
FixedVectorType *VTy = cast<FixedVectorType>(OriginalType);
TypeSize OriginalSize = DL->getTypeSizeInBits(VTy);
TypeSize ConvertScalarSize = DL->getTypeSizeInBits(ConvertToScalar);
unsigned ConvertEltCount =
(OriginalSize + ConvertScalarSize - 1) / ConvertScalarSize;
if (OriginalSize <= ConvertScalarSize)
return IntegerType::get(Mod->getContext(), ConvertScalarSize);
return VectorType::get(Type::getIntNTy(Mod->getContext(), ConvertScalarSize),
ConvertEltCount, false);
}
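// For an imperfect fit (illustrative), a <3 x i8> is first widened to <4 x i8>
// by a shufflevector that pads the extra lane with poison, and the result is
// then bitcast to the i32 chosen by calculateConvertType.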
Value *LiveRegOptimizer::convertToOptType(Instruction *V,
BasicBlock::iterator &InsertPt) {
FixedVectorType *VTy = cast<FixedVectorType>(V->getType());
Type *NewTy = calculateConvertType(V->getType());
TypeSize OriginalSize = DL->getTypeSizeInBits(VTy);
TypeSize NewSize = DL->getTypeSizeInBits(NewTy);
IRBuilder<> Builder(V->getParent(), InsertPt);
// If there is a bitsize match, we can fit the old vector into a new vector of
// the desired type.
if (OriginalSize == NewSize)
return Builder.CreateBitCast(V, NewTy, V->getName() + ".bc");
// If there is a bitsize mismatch, we must use a wider vector.
assert(NewSize > OriginalSize);
uint64_t ExpandedVecElementCount = NewSize / VTy->getScalarSizeInBits();
SmallVector<int, 8> ShuffleMask;
uint64_t OriginalElementCount = VTy->getElementCount().getFixedValue();
for (unsigned I = 0; I < OriginalElementCount; I++)
ShuffleMask.push_back(I);
for (uint64_t I = OriginalElementCount; I < ExpandedVecElementCount; I++)
ShuffleMask.push_back(OriginalElementCount);
Value *ExpandedVec = Builder.CreateShuffleVector(V, ShuffleMask);
return Builder.CreateBitCast(ExpandedVec, NewTy, V->getName() + ".bc");
}
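// For example (illustrative), a <7 x i8> that was carried as <2 x i32> is
// bitcast to <8 x i8> and then shuffled down to <7 x i8>, dropping the padding
// in the MSBs.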
Value *LiveRegOptimizer::convertFromOptType(Type *ConvertType, Instruction *V,
BasicBlock::iterator &InsertPt,
BasicBlock *InsertBB) {
FixedVectorType *NewVTy = cast<FixedVectorType>(ConvertType);
TypeSize OriginalSize = DL->getTypeSizeInBits(V->getType());
TypeSize NewSize = DL->getTypeSizeInBits(NewVTy);
IRBuilder<> Builder(InsertBB, InsertPt);
// If there is a bitsize match, we simply convert back to the original type.
if (OriginalSize == NewSize)
return Builder.CreateBitCast(V, NewVTy, V->getName() + ".bc");
// If there is a bitsize mismatch, then we must have used a wider value to
// hold the bits.
assert(OriginalSize > NewSize);
// For wide scalars, we can just truncate the value.
if (!V->getType()->isVectorTy()) {
Instruction *Trunc = cast<Instruction>(
Builder.CreateTrunc(V, IntegerType::get(Mod->getContext(), NewSize)));
return cast<Instruction>(Builder.CreateBitCast(Trunc, NewVTy));
}
// For wider vectors, we must strip the MSBs to convert back to the original
// type.
VectorType *ExpandedVT = VectorType::get(
Type::getIntNTy(Mod->getContext(), NewVTy->getScalarSizeInBits()),
(OriginalSize / NewVTy->getScalarSizeInBits()), false);
Instruction *Converted =
cast<Instruction>(Builder.CreateBitCast(V, ExpandedVT));
unsigned NarrowElementCount = NewVTy->getElementCount().getFixedValue();
SmallVector<int, 8> ShuffleMask(NarrowElementCount);
std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0);
return Builder.CreateShuffleVector(Converted, ShuffleMask);
}
bool LiveRegOptimizer::optimizeLiveType(
Instruction *I, SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
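// The worklist below gathers the transitive web of problematic PHI nodes plus
// the cross-block defs and uses reachable from I. The code after the loop then
// coerces the defs, rebuilds the PHIs in the converted type, and converts
// values back to their original type at each use.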
SmallVector<Instruction *, 4> Worklist;
SmallPtrSet<PHINode *, 4> PhiNodes;
SmallPtrSet<Instruction *, 4> Defs;
SmallPtrSet<Instruction *, 4> Uses;
Worklist.push_back(cast<Instruction>(I));
while (!Worklist.empty()) {
Instruction *II = Worklist.pop_back_val();
if (!Visited.insert(II).second)
continue;
if (!shouldReplace(II->getType()))
continue;
if (PHINode *Phi = dyn_cast<PHINode>(II)) {
PhiNodes.insert(Phi);
// Collect all the incoming values of problematic PHI nodes.
for (Value *V : Phi->incoming_values()) {
// Repeat the collection process for newly found PHI nodes.
if (PHINode *OpPhi = dyn_cast<PHINode>(V)) {
if (!PhiNodes.count(OpPhi) && !Visited.count(OpPhi))
Worklist.push_back(OpPhi);
continue;
}
Instruction *IncInst = dyn_cast<Instruction>(V);
// Other incoming value types (e.g. vector literals) are unhandled
if (!IncInst && !isa<ConstantAggregateZero>(V))
return false;
// Collect all other incoming values for coercion.
if (IncInst)
Defs.insert(IncInst);
}
}
// Collect all relevant uses.
for (User *V : II->users()) {
// Repeat the collection process for problematic PHI nodes.
if (PHINode *OpPhi = dyn_cast<PHINode>(V)) {
if (!PhiNodes.count(OpPhi) && !Visited.count(OpPhi))
Worklist.push_back(OpPhi);
continue;
}
Instruction *UseInst = cast<Instruction>(V);
// Collect all uses of PHI nodes and any use that crosses BB boundaries.
if (UseInst->getParent() != II->getParent() || isa<PHINode>(II)) {
Uses.insert(UseInst);
if (!Defs.count(II) && !isa<PHINode>(II)) {
Defs.insert(II);
}
}
}
}
// Coerce and track the defs.
for (Instruction *D : Defs) {
if (!ValMap.contains(D)) {
BasicBlock::iterator InsertPt = std::next(D->getIterator());
Value *ConvertVal = convertToOptType(D, InsertPt);
assert(ConvertVal);
ValMap[D] = ConvertVal;
}
}
// Construct new-typed PHI nodes.
for (PHINode *Phi : PhiNodes) {
ValMap[Phi] = PHINode::Create(calculateConvertType(Phi->getType()),
Phi->getNumIncomingValues(),
Phi->getName() + ".tc", Phi->getIterator());
}
// Connect all the PHI nodes with their new incoming values.
for (PHINode *Phi : PhiNodes) {
PHINode *NewPhi = cast<PHINode>(ValMap[Phi]);
bool MissingIncVal = false;
for (int I = 0, E = Phi->getNumIncomingValues(); I < E; I++) {
Value *IncVal = Phi->getIncomingValue(I);
if (isa<ConstantAggregateZero>(IncVal)) {
Type *NewType = calculateConvertType(Phi->getType());
NewPhi->addIncoming(ConstantInt::get(NewType, 0, false),
Phi->getIncomingBlock(I));
} else if (ValMap.contains(IncVal) && ValMap[IncVal])
NewPhi->addIncoming(ValMap[IncVal], Phi->getIncomingBlock(I));
else
MissingIncVal = true;
}
if (MissingIncVal) {
Value *DeadVal = ValMap[Phi];
// The coercion chain of the PHI is broken. Delete the Phi
// from the ValMap and any connected / user Phis.
SmallVector<Value *, 4> PHIWorklist;
SmallPtrSet<Value *, 4> VisitedPhis;
PHIWorklist.push_back(DeadVal);
while (!PHIWorklist.empty()) {
Value *NextDeadValue = PHIWorklist.pop_back_val();
VisitedPhis.insert(NextDeadValue);
auto OriginalPhi =
std::find_if(PhiNodes.begin(), PhiNodes.end(),
[this, &NextDeadValue](PHINode *CandPhi) {
return ValMap[CandPhi] == NextDeadValue;
});
// This PHI may have already been removed from maps when
// unwinding a previous Phi
if (OriginalPhi != PhiNodes.end())
ValMap.erase(*OriginalPhi);
DeadInsts.emplace_back(cast<Instruction>(NextDeadValue));
for (User *U : NextDeadValue->users()) {
if (!VisitedPhis.contains(cast<PHINode>(U)))
PHIWorklist.push_back(U);
}
}
} else {
DeadInsts.emplace_back(cast<Instruction>(Phi));
}
}
// Coerce back to the original type and replace the uses.
for (Instruction *U : Uses) {
// Replace all converted operands for a use.
for (auto [OpIdx, Op] : enumerate(U->operands())) {
if (ValMap.contains(Op) && ValMap[Op]) {
Value *NewVal = nullptr;
if (BBUseValMap.contains(U->getParent()) &&
BBUseValMap[U->getParent()].contains(ValMap[Op]))
NewVal = BBUseValMap[U->getParent()][ValMap[Op]];
else {
BasicBlock::iterator InsertPt = U->getParent()->getFirstNonPHIIt();
// We may pick up ops that were previously converted for users in
// other blocks. If there is an originally typed definition of the Op
// already in this block, simply reuse it.
if (isa<Instruction>(Op) && !isa<PHINode>(Op) &&
U->getParent() == cast<Instruction>(Op)->getParent()) {
NewVal = Op;
} else {
NewVal =
convertFromOptType(Op->getType(), cast<Instruction>(ValMap[Op]),
InsertPt, U->getParent());
BBUseValMap[U->getParent()][ValMap[Op]] = NewVal;
}
}
assert(NewVal);
U->setOperand(OpIdx, NewVal);
}
}
}
return true;
}
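// Return true if LI is a uniform, simple, naturally aligned, sub-DWORD load
// from the constant address space, making it a candidate for widening into a
// DWORD load.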
bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
unsigned AS = LI.getPointerAddressSpace();
// Skip non-constant address space.
if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
return false;
// Skip non-simple loads.
if (!LI.isSimple())
return false;
Type *Ty = LI.getType();
// Skip aggregate types.
if (Ty->isAggregateType())
return false;
unsigned TySize = DL->getTypeStoreSize(Ty);
// Only handle sub-DWORD loads.
if (TySize >= 4)
return false;
// That load must be at least naturally aligned.
if (LI.getAlign() < DL->getABITypeAlign(Ty))
return false;
// It should be uniform, i.e. a scalar load.
return UA->isUniform(&LI);
}
bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
if (!WidenLoads)
return false;
// Skip if the load is already at least DWORD aligned, as that case is
// handled in SDAG.
if (LI.getAlign() >= 4)
return false;
if (!canWidenScalarExtLoad(LI))
return false;
int64_t Offset = 0;
auto *Base =
GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, *DL);
// If that base is not DWORD aligned, it's not safe to perform the following
// transforms.
if (!isDWORDAligned(Base))
return false;
int64_t Adjust = Offset & 0x3;
if (Adjust == 0) {
// With a zero adjust, the original alignment can simply be promoted to
// DWORD alignment.
LI.setAlignment(Align(4));
return true;
}
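// Otherwise, load the containing DWORD and extract the requested sub-dword
// value from it. For example (illustrative), an i16 load at offset 6 from a
// DWORD-aligned base becomes an i32 load at offset 4, followed by a logical
// shift right by 16 bits and a truncation back to i16.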
IRBuilder<> IRB(&LI);
IRB.SetCurrentDebugLocation(LI.getDebugLoc());
unsigned LdBits = DL->getTypeStoreSizeInBits(LI.getType());
auto IntNTy = Type::getIntNTy(LI.getContext(), LdBits);
auto *NewPtr = IRB.CreateConstGEP1_64(
IRB.getInt8Ty(),
IRB.CreateAddrSpaceCast(Base, LI.getPointerOperand()->getType()),
Offset - Adjust);
LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr, Align(4));
NewLd->copyMetadata(LI);
NewLd->setMetadata(LLVMContext::MD_range, nullptr);
unsigned ShAmt = Adjust * 8;
auto *NewVal = IRB.CreateBitCast(
IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
LI.replaceAllUsesWith(NewVal);
DeadInsts.emplace_back(&LI);
return true;
}
PreservedAnalyses
AMDGPULateCodeGenPreparePass::run(Function &F, FunctionAnalysisManager &FAM) {
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
AssumptionCache &AC = FAM.getResult<AssumptionAnalysis>(F);
UniformityInfo &UI = FAM.getResult<UniformityInfoAnalysis>(F);
AMDGPULateCodeGenPrepare Impl(*F.getParent(), ST, &AC, &UI);
bool Changed = Impl.run(F);
PreservedAnalyses PA = PreservedAnalyses::none();
if (!Changed)
return PA;
PA.preserveSet<CFGAnalyses>();
return PA;
}
class AMDGPULateCodeGenPrepareLegacy : public FunctionPass {
public:
static char ID;
AMDGPULateCodeGenPrepareLegacy() : FunctionPass(ID) {}
StringRef getPassName() const override {
return "AMDGPU IR late optimizations";
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetPassConfig>();
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<UniformityInfoWrapperPass>();
AU.setPreservesAll();
}
bool runOnFunction(Function &F) override;
};
bool AMDGPULateCodeGenPrepareLegacy::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
const TargetMachine &TM = TPC.getTM<TargetMachine>();
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
AssumptionCache &AC =
getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
UniformityInfo &UI =
getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
AMDGPULateCodeGenPrepare Impl(*F.getParent(), ST, &AC, &UI);
return Impl.run(F);
}
INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepareLegacy, DEBUG_TYPE,
"AMDGPU IR late optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPULateCodeGenPrepareLegacy, DEBUG_TYPE,
"AMDGPU IR late optimizations", false, false)
char AMDGPULateCodeGenPrepareLegacy::ID = 0;
FunctionPass *llvm::createAMDGPULateCodeGenPrepareLegacyPass() {
return new AMDGPULateCodeGenPrepareLegacy();
}