//===-- AMDGPUMemoryUtils.cpp - Memory related helpers for AMDGPU --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

| #include "AMDGPUMemoryUtils.h" |
| #include "AMDGPU.h" |
| #include "Utils/AMDGPUBaseInfo.h" |
| #include "llvm/ADT/SetOperations.h" |
| #include "llvm/ADT/SmallSet.h" |
| #include "llvm/Analysis/AliasAnalysis.h" |
| #include "llvm/Analysis/CallGraph.h" |
| #include "llvm/Analysis/MemorySSA.h" |
| #include "llvm/IR/DataLayout.h" |
| #include "llvm/IR/Instructions.h" |
| #include "llvm/IR/IntrinsicInst.h" |
| #include "llvm/IR/IntrinsicsAMDGPU.h" |
| #include "llvm/IR/Operator.h" |
| #include "llvm/IR/ReplaceConstant.h" |
| |
| #define DEBUG_TYPE "amdgpu-memory-utils" |
| |
| using namespace llvm; |
| |
| namespace llvm::AMDGPU { |
| |
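// Return the alignment to use for an LDS global: its explicit alignment if
// one is set, otherwise the ABI alignment of its value type.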
Align getAlign(const DataLayout &DL, const GlobalVariable *GV) {
  return DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType());
}

bool isDynamicLDS(const GlobalVariable &GV) {
  // An external, zero-size addrspace(3) variable with no initializer is
  // dynamic LDS.
  const Module *M = GV.getParent();
  const DataLayout &DL = M->getDataLayout();
  if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
    return false;
  return DL.getTypeAllocSize(GV.getValueType()) == 0;
}

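// Return true if GV is an LDS (addrspace(3)) variable that this lowering
// should process: dynamic LDS, or statically sized LDS without a meaningful
// initializer.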
bool isLDSVariableToLower(const GlobalVariable &GV) {
  if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
    return false;
  }
  if (isDynamicLDS(GV)) {
    return true;
  }
  if (GV.isConstant()) {
    // A constant undef variable can't be written to, and any load is undef,
    // so it should be eliminated by the optimizer. If it is not, the back
    // end may drop it. This pass skips over it.
    return false;
  }
  if (GV.hasInitializer() && !isa<UndefValue>(GV.getInitializer())) {
    // Initializers are not implemented for the LDS address space.
    // Leave such variables in place for consistent error reporting.
    return false;
  }
  return true;
}

bool eliminateConstantExprUsesOfLDSFromAllInstructions(Module &M) {
  // Constants are uniqued within LLVM, so a ConstantExpr referring to an LDS
  // global may have uses in multiple functions. This pass specialises LDS
  // variables with respect to the kernel that allocates them.

  // This is semantically equivalent to the following, which is not
  // implemented directly because it would be slow:
  // for (auto &F : M.functions())
  //   for (auto &BB : F)
  //     for (auto &I : BB)
  //       for (Use &Op : I.operands())
  //         if (constantExprUsesLDS(Op))
  //           replaceConstantExprInFunction(I, Op);

  SmallVector<Constant *> LDSGlobals;
  for (auto &GV : M.globals())
    if (AMDGPU::isLDSVariableToLower(GV))
      LDSGlobals.push_back(&GV);
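  // Rewrite every constant-expression use of these globals into an
  // equivalent instruction placed in the using function; returns true if
  // anything changed.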
  return convertUsersOfConstantsToInstructions(LDSGlobals);
}

void getUsesOfLDSByFunction(const CallGraph &CG, Module &M,
                            FunctionVariableMap &Kernels,
                            FunctionVariableMap &Functions) {
  // Get uses from the current function, excluding uses by called functions.
  // Two output variables are used to avoid walking the globals list twice.
  for (auto &GV : M.globals()) {
    if (!AMDGPU::isLDSVariableToLower(GV))
      continue;
    for (User *V : GV.users()) {
      if (auto *I = dyn_cast<Instruction>(V)) {
        Function *F = I->getFunction();
        if (isKernelLDS(F))
          Kernels[F].insert(&GV);
        else
          Functions[F].insert(&GV);
      }
    }
  }
}

bool isKernelLDS(const Function *F) {
  // Some weirdness here. AMDGPU::isKernelCC does not call into
  // AMDGPU::isKernel with the calling convention; it instead calls into
  // isModuleEntryFunction, which returns true for more calling conventions
  // than AMDGPU::isKernel does. There is a FIXME on AMDGPU::isKernel.
  // There is also a test checking that LDS lowering does not fire on a
  // graphics shader (amdgpu_ps), so stay with the more limited predicate.
  // LDS is in this function's name to draw attention to the distinction.
  return AMDGPU::isKernel(F->getCallingConv());
}

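// Build, for every kernel, the set of LDS variables it accesses directly and
// the set it can reach indirectly through function calls, with conservative
// handling of indirect calls.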
LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {
  FunctionVariableMap DirectMapKernel;
  FunctionVariableMap DirectMapFunction;
  getUsesOfLDSByFunction(CG, M, DirectMapKernel, DirectMapFunction);

  // Collect variables that are used by functions whose address has escaped.
  DenseSet<GlobalVariable *> VariablesReachableThroughFunctionPointer;
  for (Function &F : M.functions()) {
    if (!isKernelLDS(&F))
      if (F.hasAddressTaken(nullptr,
                            /* IgnoreCallbackUses */ false,
                            /* IgnoreAssumeLikeCalls */ false,
                            /* IgnoreLLVMUsed */ true,
                            /* IgnoreArcAttachedCall */ false)) {
        set_union(VariablesReachableThroughFunctionPointer,
                  DirectMapFunction[&F]);
      }
  }

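  // In the CallGraph, a call record whose node has a null getFunction() is
  // an edge to the calls-external node, i.e. a call whose callee cannot be
  // identified, such as an indirect call.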
  auto FunctionMakesUnknownCall = [&](const Function *F) -> bool {
    assert(!F->isDeclaration());
    for (const CallGraphNode::CallRecord &R : *CG[F]) {
      if (!R.second->getFunction())
        return true;
    }
    return false;
  };

  // Work out which variables are reachable through function calls.
  FunctionVariableMap TransitiveMapFunction = DirectMapFunction;

  // If a function makes any unknown call, assume the worst case: that it
  // can access all variables accessed by functions whose address escaped.
  for (Function &F : M.functions()) {
    if (!F.isDeclaration() && FunctionMakesUnknownCall(&F)) {
      if (!isKernelLDS(&F)) {
        set_union(TransitiveMapFunction[&F],
                  VariablesReachableThroughFunctionPointer);
      }
    }
  }

  // Direct implementation of collecting all variables reachable from each
  // function.
  for (Function &Func : M.functions()) {
    if (Func.isDeclaration() || isKernelLDS(&Func))
      continue;

    DenseSet<Function *> Seen; // Catches cycles.
    SmallVector<Function *, 4> WIP = {&Func};

    while (!WIP.empty()) {
      Function *F = WIP.pop_back_val();

      // This could be accelerated by reusing the transitive map of functions
      // that have already been computed, though that needs more care than is
      // taken here.
      set_union(TransitiveMapFunction[&Func], DirectMapFunction[F]);

      for (const CallGraphNode::CallRecord &R : *CG[F]) {
        Function *Ith = R.second->getFunction();
        if (Ith && Seen.insert(Ith).second)
          WIP.push_back(Ith);
      }
    }
  }

  // DirectMapKernel lists the variables used directly by each kernel. Now
  // find the variables that each kernel reaches through function calls.
  FunctionVariableMap IndirectMapKernel;

  for (Function &Func : M.functions()) {
    if (Func.isDeclaration() || !isKernelLDS(&Func))
      continue;

    for (const CallGraphNode::CallRecord &R : *CG[&Func]) {
      Function *Ith = R.second->getFunction();
      if (Ith) {
        set_union(IndirectMapKernel[&Func], TransitiveMapFunction[Ith]);
      } else {
        set_union(IndirectMapKernel[&Func],
                  VariablesReachableThroughFunctionPointer);
      }
    }
  }

  // Verify that we fall into one of two cases:
  //   - All variables are either absolute or direct-mapped dynamic LDS that
  //     is not lowered. This is a re-run of the pass, so there is nothing
  //     to do.
  //   - No variables are absolute.
  std::optional<bool> HasAbsoluteGVs;
  for (auto &Map : {DirectMapKernel, IndirectMapKernel}) {
    for (auto &[Fn, GVs] : Map) {
      for (auto *GV : GVs) {
        bool IsAbsolute = GV->isAbsoluteSymbolRef();
        bool IsDirectMapDynLDSGV =
            AMDGPU::isDynamicLDS(*GV) && DirectMapKernel.contains(Fn);
        if (IsDirectMapDynLDSGV)
          continue;
        if (HasAbsoluteGVs.has_value()) {
          if (*HasAbsoluteGVs != IsAbsolute) {
            report_fatal_error(
                "Module cannot mix absolute and non-absolute LDS GVs");
          }
        } else
          HasAbsoluteGVs = IsAbsolute;
      }
    }
  }

  // If we only had absolute GVs, there is nothing to do; return an empty
  // result.
  if (HasAbsoluteGVs && *HasAbsoluteGVs)
    return {FunctionVariableMap(), FunctionVariableMap()};

  return {std::move(DirectMapKernel), std::move(IndirectMapKernel)};
}

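// Remove the given function attributes from KernelRoot and from every
// function reachable from it through the call graph. If any indirect call
// is encountered, conservatively strip the attributes from every non-kernel
// function whose address may have escaped as well.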
void removeFnAttrFromReachable(CallGraph &CG, Function *KernelRoot,
                               ArrayRef<StringRef> FnAttrs) {
  for (StringRef Attr : FnAttrs)
    KernelRoot->removeFnAttr(Attr);

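  // Worklist walk over the kernel's direct and transitive callees, stripping
  // the attributes as we go.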
  SmallVector<Function *> WorkList = {CG[KernelRoot]->getFunction()};
  SmallPtrSet<Function *, 8> Visited;
  bool SeenUnknownCall = false;

  while (!WorkList.empty()) {
    Function *F = WorkList.pop_back_val();

    for (auto &CallRecord : *CG[F]) {
      if (!CallRecord.second)
        continue;

      Function *Callee = CallRecord.second->getFunction();
      if (!Callee) {
        if (!SeenUnknownCall) {
          SeenUnknownCall = true;

          // If we see any indirect calls, assume nothing about potential
          // targets.
          // TODO: This could be refined to possible LDS global users.
          for (auto &ExternalCallRecord : *CG.getExternalCallingNode()) {
            Function *PotentialCallee =
                ExternalCallRecord.second->getFunction();
            assert(PotentialCallee);
            if (!isKernelLDS(PotentialCallee)) {
              for (StringRef Attr : FnAttrs)
                PotentialCallee->removeFnAttr(Attr);
            }
          }
        }
      } else {
        for (StringRef Attr : FnAttrs)
          Callee->removeFnAttr(Attr);
        if (Visited.insert(Callee).second)
          WorkList.push_back(Callee);
      }
    }
  }
}

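// Return true if the MemoryDef, which MemorySSA reports as a clobber of the
// location Ptr points to, really writes memory that may alias it. Fences,
// barrier intrinsics, and non-aliasing atomics are filtered out.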
bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) {
  Instruction *DefInst = Def->getMemoryInst();

  if (isa<FenceInst>(DefInst))
    return false;

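  // Barrier and scheduling intrinsics synchronize execution but do not
  // write memory, so they are not real clobbers either.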
  if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::amdgcn_s_barrier:
    case Intrinsic::amdgcn_s_barrier_signal:
    case Intrinsic::amdgcn_s_barrier_signal_var:
    case Intrinsic::amdgcn_s_barrier_signal_isfirst:
    case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
    case Intrinsic::amdgcn_s_barrier_init:
    case Intrinsic::amdgcn_s_barrier_join:
    case Intrinsic::amdgcn_s_barrier_wait:
    case Intrinsic::amdgcn_s_barrier_leave:
    case Intrinsic::amdgcn_s_get_barrier_state:
    case Intrinsic::amdgcn_s_wakeup_barrier:
    case Intrinsic::amdgcn_wave_barrier:
    case Intrinsic::amdgcn_sched_barrier:
    case Intrinsic::amdgcn_sched_group_barrier:
      return false;
    default:
      break;
    }
  }

  // Ignore atomics that do not alias the original load. Any atomic is a
  // universal MemoryDef from MemorySSA's point of view, just like a fence.
  const auto checkNoAlias = [AA, Ptr](auto I) -> bool {
    return I && AA->isNoAlias(I->getPointerOperand(), Ptr);
  };

  if (checkNoAlias(dyn_cast<AtomicCmpXchgInst>(DefInst)) ||
      checkNoAlias(dyn_cast<AtomicRMWInst>(DefInst)))
    return false;

  return true;
}

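// Return true if anything between the function entry and the load may write
// the loaded location, walking MemorySSA upwards from the load.
//
// Sketch of typical usage from a pass, assuming MemorySSA and AA results
// are available:
//   if (!AMDGPU::isClobberedInFunction(LI, &MSSA, &AA))
//     ; // safe to treat the loaded value as the function-entry value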
bool isClobberedInFunction(const LoadInst *Load, MemorySSA *MSSA,
                           AAResults *AA) {
  MemorySSAWalker *Walker = MSSA->getWalker();
  SmallVector<MemoryAccess *> WorkList{Walker->getClobberingMemoryAccess(Load)};
  SmallSet<MemoryAccess *, 8> Visited;
  MemoryLocation Loc(MemoryLocation::get(Load));

  LLVM_DEBUG(dbgs() << "Checking clobbering of: " << *Load << '\n');

  // Start with the nearest dominating clobbering access. It will be either
  // live-on-entry (nothing to do, the load is not clobbered), a MemoryDef,
  // or a MemoryPhi if several MemoryDefs can define this memory state. In
  // the MemoryPhi case, add all incoming defs to the WorkList and continue
  // walking up, checking every definition of this memory location, until
  // the root. If all the defs are exhausted and we reach the entry state,
  // there is no clobber. Along the scan, ignore barriers and fences, which
  // MemorySSA treats as clobbers but which do not actually write memory.
  while (!WorkList.empty()) {
    MemoryAccess *MA = WorkList.pop_back_val();
    if (!Visited.insert(MA).second)
      continue;

    if (MSSA->isLiveOnEntryDef(MA))
      continue;

    if (MemoryDef *Def = dyn_cast<MemoryDef>(MA)) {
      LLVM_DEBUG(dbgs() << " Def: " << *Def->getMemoryInst() << '\n');

      if (isReallyAClobber(Load->getPointerOperand(), Def, AA)) {
        LLVM_DEBUG(dbgs() << " -> load is clobbered\n");
        return true;
      }

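      // Not a real clobber; continue the walk from the next clobbering
      // access above this def, restricted to the load's location.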
      WorkList.push_back(
          Walker->getClobberingMemoryAccess(Def->getDefiningAccess(), Loc));
      continue;
    }

    const MemoryPhi *Phi = cast<MemoryPhi>(MA);
    for (const auto &Use : Phi->incoming_values())
      WorkList.push_back(cast<MemoryAccess>(&Use));
  }

  LLVM_DEBUG(dbgs() << " -> no clobber\n");
  return false;
}

} // end namespace llvm::AMDGPU