| //===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This provides a generalized class for OpenMP runtime code generation |
| // specialized by GPU targets NVPTX and AMDGCN. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "CGOpenMPRuntimeGPU.h" |
| #include "CodeGenFunction.h" |
| #include "clang/AST/Attr.h" |
| #include "clang/AST/DeclOpenMP.h" |
| #include "clang/AST/OpenMPClause.h" |
| #include "clang/AST/StmtOpenMP.h" |
| #include "clang/AST/StmtVisitor.h" |
| #include "clang/Basic/Cuda.h" |
| #include "llvm/ADT/SmallPtrSet.h" |
| #include "llvm/Frontend/OpenMP/OMPDeviceConstants.h" |
| #include "llvm/Frontend/OpenMP/OMPGridValues.h" |
| |
| using namespace clang; |
| using namespace CodeGen; |
| using namespace llvm::omp; |
| |
| namespace { |
| /// Pre(post)-action for different OpenMP constructs specialized for NVPTX. |
| class NVPTXActionTy final : public PrePostActionTy { |
| llvm::FunctionCallee EnterCallee = nullptr; |
| ArrayRef<llvm::Value *> EnterArgs; |
| llvm::FunctionCallee ExitCallee = nullptr; |
| ArrayRef<llvm::Value *> ExitArgs; |
| bool Conditional = false; |
| llvm::BasicBlock *ContBlock = nullptr; |
| |
| public: |
| NVPTXActionTy(llvm::FunctionCallee EnterCallee, |
| ArrayRef<llvm::Value *> EnterArgs, |
| llvm::FunctionCallee ExitCallee, |
| ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false) |
| : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee), |
| ExitArgs(ExitArgs), Conditional(Conditional) {} |
| void Enter(CodeGenFunction &CGF) override { |
| llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs); |
| if (Conditional) { |
| llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes); |
| auto *ThenBlock = CGF.createBasicBlock("omp_if.then"); |
| ContBlock = CGF.createBasicBlock("omp_if.end"); |
| // Generate the branch (If-stmt) |
| CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock); |
| CGF.EmitBlock(ThenBlock); |
| } |
| } |
| void Done(CodeGenFunction &CGF) { |
| // Emit the rest of blocks/branches |
| CGF.EmitBranch(ContBlock); |
| CGF.EmitBlock(ContBlock, true); |
| } |
| void Exit(CodeGenFunction &CGF) override { |
| CGF.EmitRuntimeCall(ExitCallee, ExitArgs); |
| } |
| }; |
| |
| /// A class to track the execution mode when codegening directives within |
| /// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry |
| /// to the target region and used by the directives contained in it, such |
| /// as 'parallel', to emit optimized code. |
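| /// For example, kernel emission uses |
| ///   ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_SPMD); |
| /// and the saved mode is restored when 'ModeRAII' goes out of scope (see |
| /// emitSPMDKernel and emitNonSPMDKernel below). |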
| class ExecutionRuntimeModesRAII { |
| private: |
| CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode = |
| CGOpenMPRuntimeGPU::EM_Unknown; |
| CGOpenMPRuntimeGPU::ExecutionMode &ExecMode; |
| |
| public: |
| ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode, |
| CGOpenMPRuntimeGPU::ExecutionMode EntryMode) |
| : ExecMode(ExecMode) { |
| SavedExecMode = ExecMode; |
| ExecMode = EntryMode; |
| } |
| ~ExecutionRuntimeModesRAII() { ExecMode = SavedExecMode; } |
| }; |
| |
| static const ValueDecl *getPrivateItem(const Expr *RefExpr) { |
| RefExpr = RefExpr->IgnoreParens(); |
| if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) { |
| const Expr *Base = ASE->getBase()->IgnoreParenImpCasts(); |
| while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base)) |
| Base = TempASE->getBase()->IgnoreParenImpCasts(); |
| RefExpr = Base; |
| } else if (auto *OASE = dyn_cast<ArraySectionExpr>(RefExpr)) { |
| const Expr *Base = OASE->getBase()->IgnoreParenImpCasts(); |
| while (const auto *TempOASE = dyn_cast<ArraySectionExpr>(Base)) |
| Base = TempOASE->getBase()->IgnoreParenImpCasts(); |
| while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base)) |
| Base = TempASE->getBase()->IgnoreParenImpCasts(); |
| RefExpr = Base; |
| } |
| RefExpr = RefExpr->IgnoreParenImpCasts(); |
| if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr)) |
| return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl()); |
| const auto *ME = cast<MemberExpr>(RefExpr); |
| return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl()); |
| } |
| |
| static RecordDecl *buildRecordForGlobalizedVars( |
| ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls, |
| ArrayRef<const ValueDecl *> EscapedDeclsForTeams, |
| llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> |
| &MappedDeclsFields, |
| int BufSize) { |
| using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>; |
| if (EscapedDecls.empty() && EscapedDeclsForTeams.empty()) |
| return nullptr; |
| SmallVector<VarsDataTy, 4> GlobalizedVars; |
| for (const ValueDecl *D : EscapedDecls) |
| GlobalizedVars.emplace_back(C.getDeclAlign(D), D); |
| for (const ValueDecl *D : EscapedDeclsForTeams) |
| GlobalizedVars.emplace_back(C.getDeclAlign(D), D); |
| |
| // Build struct _globalized_locals_ty { |
|   //  /* globalized vars */[WarpSize] align (decl_align) |
| // /* globalized vars */ for EscapedDeclsForTeams |
| // }; |
| RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty"); |
| GlobalizedRD->startDefinition(); |
| llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped( |
| EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end()); |
| for (const auto &Pair : GlobalizedVars) { |
| const ValueDecl *VD = Pair.second; |
| QualType Type = VD->getType(); |
| if (Type->isLValueReferenceType()) |
| Type = C.getPointerType(Type.getNonReferenceType()); |
| else |
| Type = Type.getNonReferenceType(); |
| SourceLocation Loc = VD->getLocation(); |
| FieldDecl *Field; |
| if (SingleEscaped.count(VD)) { |
| Field = FieldDecl::Create( |
| C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type, |
| C.getTrivialTypeSourceInfo(Type, SourceLocation()), |
| /*BW=*/nullptr, /*Mutable=*/false, |
| /*InitStyle=*/ICIS_NoInit); |
| Field->setAccess(AS_public); |
| if (VD->hasAttrs()) { |
| for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()), |
| E(VD->getAttrs().end()); |
| I != E; ++I) |
| Field->addAttr(*I); |
| } |
| } else { |
| if (BufSize > 1) { |
| llvm::APInt ArraySize(32, BufSize); |
| Type = C.getConstantArrayType(Type, ArraySize, nullptr, |
| ArraySizeModifier::Normal, 0); |
| } |
| Field = FieldDecl::Create( |
| C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type, |
| C.getTrivialTypeSourceInfo(Type, SourceLocation()), |
| /*BW=*/nullptr, /*Mutable=*/false, |
| /*InitStyle=*/ICIS_NoInit); |
| Field->setAccess(AS_public); |
| llvm::APInt Align(32, Pair.first.getQuantity()); |
| Field->addAttr(AlignedAttr::CreateImplicit( |
| C, /*IsAlignmentExpr=*/true, |
| IntegerLiteral::Create(C, Align, |
| C.getIntTypeForBitwidth(32, /*Signed=*/0), |
| SourceLocation()), |
| {}, AlignedAttr::GNU_aligned)); |
| } |
| GlobalizedRD->addDecl(Field); |
| MappedDeclsFields.try_emplace(VD, Field); |
| } |
| GlobalizedRD->completeDefinition(); |
| return GlobalizedRD; |
| } |
| |
| /// Get the list of variables that can escape their declaration context. |
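| /// A variable escapes its declaration context when other threads may need |
| /// to access it, e.g. when it is captured by reference by a nested parallel |
| /// region or its address is taken; such variables are globalized instead of |
| /// being kept in thread-private stack storage. |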
| class CheckVarsEscapingDeclContext final |
| : public ConstStmtVisitor<CheckVarsEscapingDeclContext> { |
| CodeGenFunction &CGF; |
| llvm::SetVector<const ValueDecl *> EscapedDecls; |
| llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls; |
| llvm::SetVector<const ValueDecl *> DelayedVariableLengthDecls; |
| llvm::SmallPtrSet<const Decl *, 4> EscapedParameters; |
| RecordDecl *GlobalizedRD = nullptr; |
| llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields; |
| bool AllEscaped = false; |
| bool IsForCombinedParallelRegion = false; |
| |
| void markAsEscaped(const ValueDecl *VD) { |
| // Do not globalize declare target variables. |
| if (!isa<VarDecl>(VD) || |
| OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD)) |
| return; |
| VD = cast<ValueDecl>(VD->getCanonicalDecl()); |
| // Use user-specified allocation. |
| if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>()) |
| return; |
| // Variables captured by value must be globalized. |
| bool IsCaptured = false; |
| if (auto *CSI = CGF.CapturedStmtInfo) { |
| if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) { |
|         // Check if we need to capture the variable that was already |
|         // captured by value in the outer region. |
| IsCaptured = true; |
| if (!IsForCombinedParallelRegion) { |
| if (!FD->hasAttrs()) |
| return; |
| const auto *Attr = FD->getAttr<OMPCaptureKindAttr>(); |
| if (!Attr) |
| return; |
| if (((Attr->getCaptureKind() != OMPC_map) && |
| !isOpenMPPrivate(Attr->getCaptureKind())) || |
| ((Attr->getCaptureKind() == OMPC_map) && |
| !FD->getType()->isAnyPointerType())) |
| return; |
| } |
| if (!FD->getType()->isReferenceType()) { |
| assert(!VD->getType()->isVariablyModifiedType() && |
| "Parameter captured by value with variably modified type"); |
| EscapedParameters.insert(VD); |
| } else if (!IsForCombinedParallelRegion) { |
| return; |
| } |
| } |
| } |
| if ((!CGF.CapturedStmtInfo || |
| (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) && |
| VD->getType()->isReferenceType()) |
| // Do not globalize variables with reference type. |
| return; |
| if (VD->getType()->isVariablyModifiedType()) { |
| // If not captured at the target region level then mark the escaped |
| // variable as delayed. |
| if (IsCaptured) |
| EscapedVariableLengthDecls.insert(VD); |
| else |
| DelayedVariableLengthDecls.insert(VD); |
| } else |
| EscapedDecls.insert(VD); |
| } |
| |
| void VisitValueDecl(const ValueDecl *VD) { |
| if (VD->getType()->isLValueReferenceType()) |
| markAsEscaped(VD); |
| if (const auto *VarD = dyn_cast<VarDecl>(VD)) { |
| if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) { |
| const bool SavedAllEscaped = AllEscaped; |
| AllEscaped = VD->getType()->isLValueReferenceType(); |
| Visit(VarD->getInit()); |
| AllEscaped = SavedAllEscaped; |
| } |
| } |
| } |
| void VisitOpenMPCapturedStmt(const CapturedStmt *S, |
| ArrayRef<OMPClause *> Clauses, |
| bool IsCombinedParallelRegion) { |
| if (!S) |
| return; |
| for (const CapturedStmt::Capture &C : S->captures()) { |
| if (C.capturesVariable() && !C.capturesVariableByCopy()) { |
| const ValueDecl *VD = C.getCapturedVar(); |
| bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion; |
| if (IsCombinedParallelRegion) { |
| // Check if the variable is privatized in the combined construct and |
| // those private copies must be shared in the inner parallel |
| // directive. |
| IsForCombinedParallelRegion = false; |
| for (const OMPClause *C : Clauses) { |
| if (!isOpenMPPrivate(C->getClauseKind()) || |
| C->getClauseKind() == OMPC_reduction || |
| C->getClauseKind() == OMPC_linear || |
| C->getClauseKind() == OMPC_private) |
| continue; |
| ArrayRef<const Expr *> Vars; |
| if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C)) |
| Vars = PC->getVarRefs(); |
| else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C)) |
| Vars = PC->getVarRefs(); |
| else |
| llvm_unreachable("Unexpected clause."); |
| for (const auto *E : Vars) { |
| const Decl *D = |
| cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl(); |
| if (D == VD->getCanonicalDecl()) { |
| IsForCombinedParallelRegion = true; |
| break; |
| } |
| } |
| if (IsForCombinedParallelRegion) |
| break; |
| } |
| } |
| markAsEscaped(VD); |
| if (isa<OMPCapturedExprDecl>(VD)) |
| VisitValueDecl(VD); |
| IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion; |
| } |
| } |
| } |
| |
| void buildRecordForGlobalizedVars(bool IsInTTDRegion) { |
| assert(!GlobalizedRD && |
| "Record for globalized variables is built already."); |
| ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams; |
| unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size; |
| if (IsInTTDRegion) |
| EscapedDeclsForTeams = EscapedDecls.getArrayRef(); |
| else |
| EscapedDeclsForParallel = EscapedDecls.getArrayRef(); |
| GlobalizedRD = ::buildRecordForGlobalizedVars( |
| CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams, |
| MappedDeclsFields, WarpSize); |
| } |
| |
| public: |
| CheckVarsEscapingDeclContext(CodeGenFunction &CGF, |
| ArrayRef<const ValueDecl *> TeamsReductions) |
| : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) { |
| } |
| virtual ~CheckVarsEscapingDeclContext() = default; |
| void VisitDeclStmt(const DeclStmt *S) { |
| if (!S) |
| return; |
| for (const Decl *D : S->decls()) |
| if (const auto *VD = dyn_cast_or_null<ValueDecl>(D)) |
| VisitValueDecl(VD); |
| } |
| void VisitOMPExecutableDirective(const OMPExecutableDirective *D) { |
| if (!D) |
| return; |
| if (!D->hasAssociatedStmt()) |
| return; |
| if (const auto *S = |
| dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) { |
| // Do not analyze directives that do not actually require capturing, |
| // like `omp for` or `omp simd` directives. |
| llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions; |
| getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind()); |
| if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) { |
| VisitStmt(S->getCapturedStmt()); |
| return; |
| } |
| VisitOpenMPCapturedStmt( |
| S, D->clauses(), |
| CaptureRegions.back() == OMPD_parallel && |
| isOpenMPDistributeDirective(D->getDirectiveKind())); |
| } |
| } |
| void VisitCapturedStmt(const CapturedStmt *S) { |
| if (!S) |
| return; |
| for (const CapturedStmt::Capture &C : S->captures()) { |
| if (C.capturesVariable() && !C.capturesVariableByCopy()) { |
| const ValueDecl *VD = C.getCapturedVar(); |
| markAsEscaped(VD); |
| if (isa<OMPCapturedExprDecl>(VD)) |
| VisitValueDecl(VD); |
| } |
| } |
| } |
| void VisitLambdaExpr(const LambdaExpr *E) { |
| if (!E) |
| return; |
| for (const LambdaCapture &C : E->captures()) { |
| if (C.capturesVariable()) { |
| if (C.getCaptureKind() == LCK_ByRef) { |
| const ValueDecl *VD = C.getCapturedVar(); |
| markAsEscaped(VD); |
| if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD)) |
| VisitValueDecl(VD); |
| } |
| } |
| } |
| } |
| void VisitBlockExpr(const BlockExpr *E) { |
| if (!E) |
| return; |
| for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) { |
| if (C.isByRef()) { |
| const VarDecl *VD = C.getVariable(); |
| markAsEscaped(VD); |
| if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture()) |
| VisitValueDecl(VD); |
| } |
| } |
| } |
| void VisitCallExpr(const CallExpr *E) { |
| if (!E) |
| return; |
| for (const Expr *Arg : E->arguments()) { |
| if (!Arg) |
| continue; |
| if (Arg->isLValue()) { |
| const bool SavedAllEscaped = AllEscaped; |
| AllEscaped = true; |
| Visit(Arg); |
| AllEscaped = SavedAllEscaped; |
| } else { |
| Visit(Arg); |
| } |
| } |
| Visit(E->getCallee()); |
| } |
| void VisitDeclRefExpr(const DeclRefExpr *E) { |
| if (!E) |
| return; |
| const ValueDecl *VD = E->getDecl(); |
| if (AllEscaped) |
| markAsEscaped(VD); |
| if (isa<OMPCapturedExprDecl>(VD)) |
| VisitValueDecl(VD); |
| else if (VD->isInitCapture()) |
| VisitValueDecl(VD); |
| } |
| void VisitUnaryOperator(const UnaryOperator *E) { |
| if (!E) |
| return; |
| if (E->getOpcode() == UO_AddrOf) { |
| const bool SavedAllEscaped = AllEscaped; |
| AllEscaped = true; |
| Visit(E->getSubExpr()); |
| AllEscaped = SavedAllEscaped; |
| } else { |
| Visit(E->getSubExpr()); |
| } |
| } |
| void VisitImplicitCastExpr(const ImplicitCastExpr *E) { |
| if (!E) |
| return; |
| if (E->getCastKind() == CK_ArrayToPointerDecay) { |
| const bool SavedAllEscaped = AllEscaped; |
| AllEscaped = true; |
| Visit(E->getSubExpr()); |
| AllEscaped = SavedAllEscaped; |
| } else { |
| Visit(E->getSubExpr()); |
| } |
| } |
| void VisitExpr(const Expr *E) { |
| if (!E) |
| return; |
| bool SavedAllEscaped = AllEscaped; |
| if (!E->isLValue()) |
| AllEscaped = false; |
| for (const Stmt *Child : E->children()) |
| if (Child) |
| Visit(Child); |
| AllEscaped = SavedAllEscaped; |
| } |
| void VisitStmt(const Stmt *S) { |
| if (!S) |
| return; |
| for (const Stmt *Child : S->children()) |
| if (Child) |
| Visit(Child); |
| } |
| |
|   /// Returns the record that handles all the escaped local variables and is |
|   /// used instead of their original storage. |
| const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) { |
| if (!GlobalizedRD) |
| buildRecordForGlobalizedVars(IsInTTDRegion); |
| return GlobalizedRD; |
| } |
| |
| /// Returns the field in the globalized record for the escaped variable. |
| const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const { |
| assert(GlobalizedRD && |
| "Record for globalized variables must be generated already."); |
| return MappedDeclsFields.lookup(VD); |
| } |
| |
| /// Returns the list of the escaped local variables/parameters. |
| ArrayRef<const ValueDecl *> getEscapedDecls() const { |
| return EscapedDecls.getArrayRef(); |
| } |
| |
| /// Checks if the escaped local variable is actually a parameter passed by |
| /// value. |
| const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const { |
| return EscapedParameters; |
| } |
| |
| /// Returns the list of the escaped variables with the variably modified |
| /// types. |
| ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const { |
| return EscapedVariableLengthDecls.getArrayRef(); |
| } |
| |
| /// Returns the list of the delayed variables with the variably modified |
| /// types. |
| ArrayRef<const ValueDecl *> getDelayedVariableLengthDecls() const { |
| return DelayedVariableLengthDecls.getArrayRef(); |
| } |
| }; |
| } // anonymous namespace |
| |
| CGOpenMPRuntimeGPU::ExecutionMode |
| CGOpenMPRuntimeGPU::getExecutionMode() const { |
| return CurrentExecutionMode; |
| } |
| |
| CGOpenMPRuntimeGPU::DataSharingMode |
| CGOpenMPRuntimeGPU::getDataSharingMode() const { |
| return CurrentDataSharingMode; |
| } |
| |
| /// Check for an inner (nested) SPMD construct, if any. |
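| /// For example, a 'target' (or 'target teams') region whose single nested |
| /// directive is a parallel construct, such as |
| ///   #pragma omp target |
| ///   #pragma omp parallel for |
| /// can be emitted in SPMD mode. |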
| static bool hasNestedSPMDDirective(ASTContext &Ctx, |
| const OMPExecutableDirective &D) { |
| const auto *CS = D.getInnermostCapturedStmt(); |
| const auto *Body = |
| CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true); |
| const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body); |
| |
| if (const auto *NestedDir = |
| dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) { |
| OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind(); |
| switch (D.getDirectiveKind()) { |
| case OMPD_target: |
| if (isOpenMPParallelDirective(DKind)) |
| return true; |
| if (DKind == OMPD_teams) { |
| Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers( |
| /*IgnoreCaptured=*/true); |
| if (!Body) |
| return false; |
| ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body); |
| if (const auto *NND = |
| dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) { |
| DKind = NND->getDirectiveKind(); |
| if (isOpenMPParallelDirective(DKind)) |
| return true; |
| } |
| } |
| return false; |
| case OMPD_target_teams: |
| return isOpenMPParallelDirective(DKind); |
| case OMPD_target_simd: |
| case OMPD_target_parallel: |
| case OMPD_target_parallel_for: |
| case OMPD_target_parallel_for_simd: |
| case OMPD_target_teams_distribute: |
| case OMPD_target_teams_distribute_simd: |
| case OMPD_target_teams_distribute_parallel_for: |
| case OMPD_target_teams_distribute_parallel_for_simd: |
| case OMPD_parallel: |
| case OMPD_for: |
| case OMPD_parallel_for: |
| case OMPD_parallel_master: |
| case OMPD_parallel_sections: |
| case OMPD_for_simd: |
| case OMPD_parallel_for_simd: |
| case OMPD_cancel: |
| case OMPD_cancellation_point: |
| case OMPD_ordered: |
| case OMPD_threadprivate: |
| case OMPD_allocate: |
| case OMPD_task: |
| case OMPD_simd: |
| case OMPD_sections: |
| case OMPD_section: |
| case OMPD_single: |
| case OMPD_master: |
| case OMPD_critical: |
| case OMPD_taskyield: |
| case OMPD_barrier: |
| case OMPD_taskwait: |
| case OMPD_taskgroup: |
| case OMPD_atomic: |
| case OMPD_flush: |
| case OMPD_depobj: |
| case OMPD_scan: |
| case OMPD_teams: |
| case OMPD_target_data: |
| case OMPD_target_exit_data: |
| case OMPD_target_enter_data: |
| case OMPD_distribute: |
| case OMPD_distribute_simd: |
| case OMPD_distribute_parallel_for: |
| case OMPD_distribute_parallel_for_simd: |
| case OMPD_teams_distribute: |
| case OMPD_teams_distribute_simd: |
| case OMPD_teams_distribute_parallel_for: |
| case OMPD_teams_distribute_parallel_for_simd: |
| case OMPD_target_update: |
| case OMPD_declare_simd: |
| case OMPD_declare_variant: |
| case OMPD_begin_declare_variant: |
| case OMPD_end_declare_variant: |
| case OMPD_declare_target: |
| case OMPD_end_declare_target: |
| case OMPD_declare_reduction: |
| case OMPD_declare_mapper: |
| case OMPD_taskloop: |
| case OMPD_taskloop_simd: |
| case OMPD_master_taskloop: |
| case OMPD_master_taskloop_simd: |
| case OMPD_parallel_master_taskloop: |
| case OMPD_parallel_master_taskloop_simd: |
| case OMPD_requires: |
| case OMPD_unknown: |
| default: |
| llvm_unreachable("Unexpected directive."); |
| } |
| } |
| |
| return false; |
| } |
| |
| static bool supportsSPMDExecutionMode(ASTContext &Ctx, |
| const OMPExecutableDirective &D) { |
| OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind(); |
| switch (DirectiveKind) { |
| case OMPD_target: |
| case OMPD_target_teams: |
| return hasNestedSPMDDirective(Ctx, D); |
| case OMPD_target_parallel_loop: |
| case OMPD_target_parallel: |
| case OMPD_target_parallel_for: |
| case OMPD_target_parallel_for_simd: |
| case OMPD_target_teams_distribute_parallel_for: |
| case OMPD_target_teams_distribute_parallel_for_simd: |
| case OMPD_target_simd: |
| case OMPD_target_teams_distribute_simd: |
| return true; |
| case OMPD_target_teams_distribute: |
| return false; |
| case OMPD_target_teams_loop: |
| // Whether this is true or not depends on how the directive will |
| // eventually be emitted. |
| if (auto *TTLD = dyn_cast<OMPTargetTeamsGenericLoopDirective>(&D)) |
| return TTLD->canBeParallelFor(); |
| return false; |
| case OMPD_parallel: |
| case OMPD_for: |
| case OMPD_parallel_for: |
| case OMPD_parallel_master: |
| case OMPD_parallel_sections: |
| case OMPD_for_simd: |
| case OMPD_parallel_for_simd: |
| case OMPD_cancel: |
| case OMPD_cancellation_point: |
| case OMPD_ordered: |
| case OMPD_threadprivate: |
| case OMPD_allocate: |
| case OMPD_task: |
| case OMPD_simd: |
| case OMPD_sections: |
| case OMPD_section: |
| case OMPD_single: |
| case OMPD_master: |
| case OMPD_critical: |
| case OMPD_taskyield: |
| case OMPD_barrier: |
| case OMPD_taskwait: |
| case OMPD_taskgroup: |
| case OMPD_atomic: |
| case OMPD_flush: |
| case OMPD_depobj: |
| case OMPD_scan: |
| case OMPD_teams: |
| case OMPD_target_data: |
| case OMPD_target_exit_data: |
| case OMPD_target_enter_data: |
| case OMPD_distribute: |
| case OMPD_distribute_simd: |
| case OMPD_distribute_parallel_for: |
| case OMPD_distribute_parallel_for_simd: |
| case OMPD_teams_distribute: |
| case OMPD_teams_distribute_simd: |
| case OMPD_teams_distribute_parallel_for: |
| case OMPD_teams_distribute_parallel_for_simd: |
| case OMPD_target_update: |
| case OMPD_declare_simd: |
| case OMPD_declare_variant: |
| case OMPD_begin_declare_variant: |
| case OMPD_end_declare_variant: |
| case OMPD_declare_target: |
| case OMPD_end_declare_target: |
| case OMPD_declare_reduction: |
| case OMPD_declare_mapper: |
| case OMPD_taskloop: |
| case OMPD_taskloop_simd: |
| case OMPD_master_taskloop: |
| case OMPD_master_taskloop_simd: |
| case OMPD_parallel_master_taskloop: |
| case OMPD_parallel_master_taskloop_simd: |
| case OMPD_requires: |
| case OMPD_unknown: |
| default: |
| break; |
| } |
| llvm_unreachable( |
| "Unknown programming model for OpenMP directive on NVPTX target."); |
| } |
| |
| void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D, |
| StringRef ParentName, |
| llvm::Function *&OutlinedFn, |
| llvm::Constant *&OutlinedFnID, |
| bool IsOffloadEntry, |
| const RegionCodeGenTy &CodeGen) { |
| ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_NonSPMD); |
| EntryFunctionState EST; |
| WrapperFunctionsMap.clear(); |
| |
| [[maybe_unused]] bool IsBareKernel = D.getSingleClause<OMPXBareClause>(); |
| assert(!IsBareKernel && "bare kernel should not be at generic mode"); |
| |
| // Emit target region as a standalone region. |
| class NVPTXPrePostActionTy : public PrePostActionTy { |
| CGOpenMPRuntimeGPU::EntryFunctionState &EST; |
| const OMPExecutableDirective &D; |
| |
| public: |
| NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST, |
| const OMPExecutableDirective &D) |
| : EST(EST), D(D) {} |
| void Enter(CodeGenFunction &CGF) override { |
| auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); |
| RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ false); |
| // Skip target region initialization. |
| RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true); |
| } |
| void Exit(CodeGenFunction &CGF) override { |
| auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); |
| RT.clearLocThreadIdInsertPt(CGF); |
| RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false); |
| } |
| } Action(EST, D); |
| CodeGen.setAction(Action); |
| IsInTTDRegion = true; |
| emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, |
| IsOffloadEntry, CodeGen); |
| IsInTTDRegion = false; |
| } |
| |
| void CGOpenMPRuntimeGPU::emitKernelInit(const OMPExecutableDirective &D, |
| CodeGenFunction &CGF, |
| EntryFunctionState &EST, bool IsSPMD) { |
| llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs Attrs; |
| Attrs.ExecFlags = |
| IsSPMD ? llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD |
| : llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC; |
| computeMinAndMaxThreadsAndTeams(D, CGF, Attrs); |
| |
| CGBuilderTy &Bld = CGF.Builder; |
| Bld.restoreIP(OMPBuilder.createTargetInit(Bld, Attrs)); |
| if (!IsSPMD) |
| emitGenericVarsProlog(CGF, EST.Loc); |
| } |
| |
| void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF, |
| EntryFunctionState &EST, |
| bool IsSPMD) { |
| if (!IsSPMD) |
| emitGenericVarsEpilog(CGF); |
| |
| // This is temporary until we remove the fixed sized buffer. |
| ASTContext &C = CGM.getContext(); |
| RecordDecl *StaticRD = C.buildImplicitRecord( |
| "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::Union); |
| StaticRD->startDefinition(); |
| for (const RecordDecl *TeamReductionRec : TeamsReductions) { |
| QualType RecTy = C.getRecordType(TeamReductionRec); |
| auto *Field = FieldDecl::Create( |
| C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy, |
| C.getTrivialTypeSourceInfo(RecTy, SourceLocation()), |
| /*BW=*/nullptr, /*Mutable=*/false, |
| /*InitStyle=*/ICIS_NoInit); |
| Field->setAccess(AS_public); |
| StaticRD->addDecl(Field); |
| } |
| StaticRD->completeDefinition(); |
| QualType StaticTy = C.getRecordType(StaticRD); |
| llvm::Type *LLVMReductionsBufferTy = |
| CGM.getTypes().ConvertTypeForMem(StaticTy); |
| const auto &DL = CGM.getModule().getDataLayout(); |
| uint64_t ReductionDataSize = |
| TeamsReductions.empty() |
| ? 0 |
| : DL.getTypeAllocSize(LLVMReductionsBufferTy).getFixedValue(); |
| CGBuilderTy &Bld = CGF.Builder; |
| OMPBuilder.createTargetDeinit(Bld, ReductionDataSize, |
| C.getLangOpts().OpenMPCUDAReductionBufNum); |
| TeamsReductions.clear(); |
| } |
| |
| void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D, |
| StringRef ParentName, |
| llvm::Function *&OutlinedFn, |
| llvm::Constant *&OutlinedFnID, |
| bool IsOffloadEntry, |
| const RegionCodeGenTy &CodeGen) { |
| ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_SPMD); |
| EntryFunctionState EST; |
| |
| bool IsBareKernel = D.getSingleClause<OMPXBareClause>(); |
| |
| // Emit target region as a standalone region. |
| class NVPTXPrePostActionTy : public PrePostActionTy { |
| CGOpenMPRuntimeGPU &RT; |
| CGOpenMPRuntimeGPU::EntryFunctionState &EST; |
| bool IsBareKernel; |
| DataSharingMode Mode; |
| const OMPExecutableDirective &D; |
| |
| public: |
| NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT, |
| CGOpenMPRuntimeGPU::EntryFunctionState &EST, |
| bool IsBareKernel, const OMPExecutableDirective &D) |
| : RT(RT), EST(EST), IsBareKernel(IsBareKernel), |
| Mode(RT.CurrentDataSharingMode), D(D) {} |
| void Enter(CodeGenFunction &CGF) override { |
| if (IsBareKernel) { |
| RT.CurrentDataSharingMode = DataSharingMode::DS_CUDA; |
| return; |
| } |
| RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ true); |
| // Skip target region initialization. |
| RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true); |
| } |
| void Exit(CodeGenFunction &CGF) override { |
| if (IsBareKernel) { |
| RT.CurrentDataSharingMode = Mode; |
| return; |
| } |
| RT.clearLocThreadIdInsertPt(CGF); |
| RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true); |
| } |
| } Action(*this, EST, IsBareKernel, D); |
| CodeGen.setAction(Action); |
| IsInTTDRegion = true; |
| emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, |
| IsOffloadEntry, CodeGen); |
| IsInTTDRegion = false; |
| } |
| |
| void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction( |
| const OMPExecutableDirective &D, StringRef ParentName, |
| llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID, |
| bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) { |
| if (!IsOffloadEntry) // Nothing to do. |
| return; |
| |
| assert(!ParentName.empty() && "Invalid target region parent name!"); |
| |
| bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D); |
| bool IsBareKernel = D.getSingleClause<OMPXBareClause>(); |
| if (Mode || IsBareKernel) |
| emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, |
| CodeGen); |
| else |
| emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, |
| CodeGen); |
| } |
| |
| CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM) |
| : CGOpenMPRuntime(CGM) { |
| llvm::OpenMPIRBuilderConfig Config( |
| CGM.getLangOpts().OpenMPIsTargetDevice, isGPU(), |
| CGM.getLangOpts().OpenMPOffloadMandatory, |
| /*HasRequiresReverseOffload*/ false, /*HasRequiresUnifiedAddress*/ false, |
| hasRequiresUnifiedSharedMemory(), /*HasRequiresDynamicAllocators*/ false); |
| OMPBuilder.setConfig(Config); |
| |
| if (!CGM.getLangOpts().OpenMPIsTargetDevice) |
| llvm_unreachable("OpenMP can only handle device code."); |
| |
| if (CGM.getLangOpts().OpenMPCUDAMode) |
| CurrentDataSharingMode = CGOpenMPRuntimeGPU::DS_CUDA; |
| |
| llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder(); |
| if (CGM.getLangOpts().NoGPULib || CGM.getLangOpts().OMPHostIRFile.empty()) |
| return; |
| |
| OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTargetDebug, |
| "__omp_rtl_debug_kind"); |
| OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTeamSubscription, |
| "__omp_rtl_assume_teams_oversubscription"); |
| OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPThreadSubscription, |
| "__omp_rtl_assume_threads_oversubscription"); |
| OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoThreadState, |
| "__omp_rtl_assume_no_thread_state"); |
| OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoNestedParallelism, |
| "__omp_rtl_assume_no_nested_parallelism"); |
| } |
| |
| void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF, |
| ProcBindKind ProcBind, |
| SourceLocation Loc) { |
| // Nothing to do. |
| } |
| |
| void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF, |
| llvm::Value *NumThreads, |
| SourceLocation Loc) { |
| // Nothing to do. |
| } |
| |
| void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF, |
| const Expr *NumTeams, |
| const Expr *ThreadLimit, |
| SourceLocation Loc) {} |
| |
| llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction( |
| CodeGenFunction &CGF, const OMPExecutableDirective &D, |
| const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, |
| const RegionCodeGenTy &CodeGen) { |
| // Emit target region as a standalone region. |
| bool PrevIsInTTDRegion = IsInTTDRegion; |
| IsInTTDRegion = false; |
| auto *OutlinedFun = |
| cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction( |
| CGF, D, ThreadIDVar, InnermostKind, CodeGen)); |
| IsInTTDRegion = PrevIsInTTDRegion; |
| if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) { |
| llvm::Function *WrapperFun = |
| createParallelDataSharingWrapper(OutlinedFun, D); |
| WrapperFunctionsMap[OutlinedFun] = WrapperFun; |
| } |
| |
| return OutlinedFun; |
| } |
| |
| /// Get the list of lastprivate variables from the teams distribute ... or |
| /// teams {distribute ...} directives. |
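| /// For example, for 'teams distribute parallel for lastprivate(x)', the |
| /// variable 'x' is collected. |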
| static void |
| getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D, |
| llvm::SmallVectorImpl<const ValueDecl *> &Vars) { |
| assert(isOpenMPTeamsDirective(D.getDirectiveKind()) && |
| "expected teams directive."); |
| const OMPExecutableDirective *Dir = &D; |
| if (!isOpenMPDistributeDirective(D.getDirectiveKind())) { |
| if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild( |
| Ctx, |
| D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers( |
| /*IgnoreCaptured=*/true))) { |
| Dir = dyn_cast_or_null<OMPExecutableDirective>(S); |
| if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind())) |
| Dir = nullptr; |
| } |
| } |
| if (!Dir) |
| return; |
| for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) { |
| for (const Expr *E : C->getVarRefs()) |
| Vars.push_back(getPrivateItem(E)); |
| } |
| } |
| |
| /// Get the list of reduction variables from the teams ... directives. |
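| /// For example, for 'target teams reduction(+:sum)', the private copy of |
| /// 'sum' is collected. |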
| static void |
| getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D, |
| llvm::SmallVectorImpl<const ValueDecl *> &Vars) { |
| assert(isOpenMPTeamsDirective(D.getDirectiveKind()) && |
| "expected teams directive."); |
| for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) { |
| for (const Expr *E : C->privates()) |
| Vars.push_back(getPrivateItem(E)); |
| } |
| } |
| |
| llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction( |
| CodeGenFunction &CGF, const OMPExecutableDirective &D, |
| const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, |
| const RegionCodeGenTy &CodeGen) { |
| SourceLocation Loc = D.getBeginLoc(); |
| |
| const RecordDecl *GlobalizedRD = nullptr; |
| llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions; |
| llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields; |
| unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size; |
| // Globalize team reductions variable unconditionally in all modes. |
| if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) |
| getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions); |
| if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) { |
| getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions); |
| if (!LastPrivatesReductions.empty()) { |
| GlobalizedRD = ::buildRecordForGlobalizedVars( |
| CGM.getContext(), {}, LastPrivatesReductions, MappedDeclsFields, |
| WarpSize); |
| } |
| } else if (!LastPrivatesReductions.empty()) { |
| assert(!TeamAndReductions.first && |
| "Previous team declaration is not expected."); |
| TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl(); |
| std::swap(TeamAndReductions.second, LastPrivatesReductions); |
| } |
| |
| // Emit target region as a standalone region. |
| class NVPTXPrePostActionTy : public PrePostActionTy { |
| SourceLocation &Loc; |
| const RecordDecl *GlobalizedRD; |
| llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> |
| &MappedDeclsFields; |
| |
| public: |
| NVPTXPrePostActionTy( |
| SourceLocation &Loc, const RecordDecl *GlobalizedRD, |
| llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> |
| &MappedDeclsFields) |
| : Loc(Loc), GlobalizedRD(GlobalizedRD), |
| MappedDeclsFields(MappedDeclsFields) {} |
| void Enter(CodeGenFunction &CGF) override { |
| auto &Rt = |
| static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); |
| if (GlobalizedRD) { |
| auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first; |
| I->getSecond().MappedParams = |
| std::make_unique<CodeGenFunction::OMPMapVars>(); |
| DeclToAddrMapTy &Data = I->getSecond().LocalVarData; |
| for (const auto &Pair : MappedDeclsFields) { |
| assert(Pair.getFirst()->isCanonicalDecl() && |
| "Expected canonical declaration"); |
| Data.insert(std::make_pair(Pair.getFirst(), MappedVarData())); |
| } |
| } |
| Rt.emitGenericVarsProlog(CGF, Loc); |
| } |
| void Exit(CodeGenFunction &CGF) override { |
| static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()) |
| .emitGenericVarsEpilog(CGF); |
| } |
| } Action(Loc, GlobalizedRD, MappedDeclsFields); |
| CodeGen.setAction(Action); |
| llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction( |
| CGF, D, ThreadIDVar, InnermostKind, CodeGen); |
| |
| return OutlinedFun; |
| } |
| |
| void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF, |
| SourceLocation Loc) { |
| if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic) |
| return; |
| |
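|   // For each escaped variable we emit, roughly: |
|   //   %ptr = call ptr @__kmpc_alloc_shared(i64 <sizeof(var)>) |
|   // and redirect subsequent uses of the variable to that allocation; the |
|   // matching __kmpc_free_shared calls are emitted in emitGenericVarsEpilog. |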
| CGBuilderTy &Bld = CGF.Builder; |
| |
| const auto I = FunctionGlobalizedDecls.find(CGF.CurFn); |
| if (I == FunctionGlobalizedDecls.end()) |
| return; |
| |
| for (auto &Rec : I->getSecond().LocalVarData) { |
| const auto *VD = cast<VarDecl>(Rec.first); |
| bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first); |
| QualType VarTy = VD->getType(); |
| |
| // Get the local allocation of a firstprivate variable before sharing |
| llvm::Value *ParValue; |
| if (EscapedParam) { |
| LValue ParLVal = |
| CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType()); |
| ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc); |
| } |
| |
| // Allocate space for the variable to be globalized |
| llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())}; |
| llvm::CallBase *VoidPtr = |
| CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( |
| CGM.getModule(), OMPRTL___kmpc_alloc_shared), |
| AllocArgs, VD->getName()); |
|     // FIXME: We should use the variable's actual alignment as an argument. |
| VoidPtr->addRetAttr(llvm::Attribute::get( |
| CGM.getLLVMContext(), llvm::Attribute::Alignment, |
| CGM.getContext().getTargetInfo().getNewAlign() / 8)); |
| |
| // Cast the void pointer and get the address of the globalized variable. |
| llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( |
| VoidPtr, Bld.getPtrTy(0), VD->getName() + "_on_stack"); |
| LValue VarAddr = |
| CGF.MakeNaturalAlignPointeeRawAddrLValue(CastedVoidPtr, VarTy); |
| Rec.second.PrivateAddr = VarAddr.getAddress(); |
| Rec.second.GlobalizedVal = VoidPtr; |
| |
| // Assign the local allocation to the newly globalized location. |
| if (EscapedParam) { |
| CGF.EmitStoreOfScalar(ParValue, VarAddr); |
| I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress()); |
| } |
| if (auto *DI = CGF.getDebugInfo()) |
| VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->getLocation())); |
| } |
| |
| for (const auto *ValueD : I->getSecond().EscapedVariableLengthDecls) { |
| const auto *VD = cast<VarDecl>(ValueD); |
| std::pair<llvm::Value *, llvm::Value *> AddrSizePair = |
| getKmpcAllocShared(CGF, VD); |
| I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(AddrSizePair); |
| LValue Base = CGF.MakeAddrLValue(AddrSizePair.first, VD->getType(), |
| CGM.getContext().getDeclAlign(VD), |
| AlignmentSource::Decl); |
| I->getSecond().MappedParams->setVarAddr(CGF, VD, Base.getAddress()); |
| } |
| I->getSecond().MappedParams->apply(CGF); |
| } |
| |
| bool CGOpenMPRuntimeGPU::isDelayedVariableLengthDecl(CodeGenFunction &CGF, |
| const VarDecl *VD) const { |
| const auto I = FunctionGlobalizedDecls.find(CGF.CurFn); |
| if (I == FunctionGlobalizedDecls.end()) |
| return false; |
| |
|   // Check if the variable declaration is delayed: |
| return llvm::is_contained(I->getSecond().DelayedVariableLengthDecls, VD); |
| } |
| |
| std::pair<llvm::Value *, llvm::Value *> |
| CGOpenMPRuntimeGPU::getKmpcAllocShared(CodeGenFunction &CGF, |
| const VarDecl *VD) { |
| CGBuilderTy &Bld = CGF.Builder; |
| |
| // Compute size and alignment. |
| llvm::Value *Size = CGF.getTypeSize(VD->getType()); |
| CharUnits Align = CGM.getContext().getDeclAlign(VD); |
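|   // Round the size up to a multiple of the alignment so the shared-memory |
|   // allocation stays properly aligned: |
|   //   Size = ((Size + Align - 1) / Align) * Align |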
| Size = Bld.CreateNUWAdd( |
| Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1)); |
| llvm::Value *AlignVal = |
| llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity()); |
| Size = Bld.CreateUDiv(Size, AlignVal); |
| Size = Bld.CreateNUWMul(Size, AlignVal); |
| |
| // Allocate space for this VLA object to be globalized. |
| llvm::Value *AllocArgs[] = {Size}; |
| llvm::CallBase *VoidPtr = |
| CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( |
| CGM.getModule(), OMPRTL___kmpc_alloc_shared), |
| AllocArgs, VD->getName()); |
| VoidPtr->addRetAttr(llvm::Attribute::get( |
| CGM.getLLVMContext(), llvm::Attribute::Alignment, Align.getQuantity())); |
| |
| return std::make_pair(VoidPtr, Size); |
| } |
| |
| void CGOpenMPRuntimeGPU::getKmpcFreeShared( |
| CodeGenFunction &CGF, |
| const std::pair<llvm::Value *, llvm::Value *> &AddrSizePair) { |
| // Deallocate the memory for each globalized VLA object |
| CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( |
| CGM.getModule(), OMPRTL___kmpc_free_shared), |
| {AddrSizePair.first, AddrSizePair.second}); |
| } |
| |
| void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF) { |
| if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic) |
| return; |
| |
| const auto I = FunctionGlobalizedDecls.find(CGF.CurFn); |
| if (I != FunctionGlobalizedDecls.end()) { |
| // Deallocate the memory for each globalized VLA object that was |
| // globalized in the prolog (i.e. emitGenericVarsProlog). |
| for (const auto &AddrSizePair : |
| llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) { |
| CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( |
| CGM.getModule(), OMPRTL___kmpc_free_shared), |
| {AddrSizePair.first, AddrSizePair.second}); |
| } |
| // Deallocate the memory for each globalized value |
| for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) { |
| const auto *VD = cast<VarDecl>(Rec.first); |
| I->getSecond().MappedParams->restore(CGF); |
| |
| llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal, |
| CGF.getTypeSize(VD->getType())}; |
| CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( |
| CGM.getModule(), OMPRTL___kmpc_free_shared), |
| FreeArgs); |
| } |
| } |
| } |
| |
| void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF, |
| const OMPExecutableDirective &D, |
| SourceLocation Loc, |
| llvm::Function *OutlinedFn, |
| ArrayRef<llvm::Value *> CapturedVars) { |
| if (!CGF.HaveInsertPoint()) |
| return; |
| |
| bool IsBareKernel = D.getSingleClause<OMPXBareClause>(); |
| |
| RawAddress ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, |
| /*Name=*/".zero.addr"); |
| CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr); |
| llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs; |
|   // We don't emit any thread id function call in a bare kernel, but because |
|   // the outlined function has a pointer argument, we emit a nullptr here. |
| if (IsBareKernel) |
| OutlinedFnArgs.push_back(llvm::ConstantPointerNull::get(CGM.VoidPtrTy)); |
| else |
| OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).emitRawPointer(CGF)); |
| OutlinedFnArgs.push_back(ZeroAddr.getPointer()); |
| OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end()); |
| emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs); |
| } |
| |
| void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF, |
| SourceLocation Loc, |
| llvm::Function *OutlinedFn, |
| ArrayRef<llvm::Value *> CapturedVars, |
| const Expr *IfCond, |
| llvm::Value *NumThreads) { |
| if (!CGF.HaveInsertPoint()) |
| return; |
| |
| auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond, |
| NumThreads](CodeGenFunction &CGF, |
| PrePostActionTy &Action) { |
| CGBuilderTy &Bld = CGF.Builder; |
| llvm::Value *NumThreadsVal = NumThreads; |
| llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn]; |
| llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy); |
| if (WFn) |
| ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy); |
| llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy); |
| |
| // Create a private scope that will globalize the arguments |
| // passed from the outside of the target region. |
| // TODO: Is that needed? |
| CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF); |
| |
| Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca( |
| llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()), |
| "captured_vars_addrs"); |
| // There's something to share. |
| if (!CapturedVars.empty()) { |
| // Prepare for parallel region. Indicate the outlined function. |
| ASTContext &Ctx = CGF.getContext(); |
| unsigned Idx = 0; |
| for (llvm::Value *V : CapturedVars) { |
| Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx); |
| llvm::Value *PtrV; |
| if (V->getType()->isIntegerTy()) |
| PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy); |
| else |
| PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy); |
| CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false, |
| Ctx.getPointerType(Ctx.VoidPtrTy)); |
| ++Idx; |
| } |
| } |
| |
| llvm::Value *IfCondVal = nullptr; |
| if (IfCond) |
| IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty, |
| /* isSigned */ false); |
| else |
| IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1); |
| |
| if (!NumThreadsVal) |
| NumThreadsVal = llvm::ConstantInt::get(CGF.Int32Ty, -1); |
| else |
|       NumThreadsVal = Bld.CreateZExtOrTrunc(NumThreadsVal, CGF.Int32Ty); |
| |
| assert(IfCondVal && "Expected a value"); |
| llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); |
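|     // The arguments of __kmpc_parallel_51, in order: source location, |
|     // global thread id, if-condition, number of threads, proc_bind (-1 when |
|     // not specified), the outlined function, its data-sharing wrapper (null |
|     // if none was created), the captured variable addresses, and their |
|     // count. |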
| llvm::Value *Args[] = { |
| RTLoc, |
| getThreadID(CGF, Loc), |
| IfCondVal, |
| NumThreadsVal, |
| llvm::ConstantInt::get(CGF.Int32Ty, -1), |
| FnPtr, |
| ID, |
| Bld.CreateBitOrPointerCast(CapturedVarsAddrs.emitRawPointer(CGF), |
| CGF.VoidPtrPtrTy), |
| llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())}; |
| CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( |
| CGM.getModule(), OMPRTL___kmpc_parallel_51), |
| Args); |
| }; |
| |
| RegionCodeGenTy RCG(ParallelGen); |
| RCG(CGF); |
| } |
| |
| void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) { |
| // Always emit simple barriers! |
| if (!CGF.HaveInsertPoint()) |
| return; |
| // Build call __kmpc_barrier_simple_spmd(nullptr, 0); |
| // This function does not use parameters, so we can emit just default values. |
| llvm::Value *Args[] = { |
| llvm::ConstantPointerNull::get( |
| cast<llvm::PointerType>(getIdentTyPointerTy())), |
| llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)}; |
| CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( |
| CGM.getModule(), OMPRTL___kmpc_barrier_simple_spmd), |
| Args); |
| } |
| |
| void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF, |
| SourceLocation Loc, |
| OpenMPDirectiveKind Kind, bool, |
| bool) { |
| // Always emit simple barriers! |
| if (!CGF.HaveInsertPoint()) |
| return; |
|   // Build call __kmpc_barrier(loc, thread_id); |
| unsigned Flags = getDefaultFlagsForBarriers(Kind); |
| llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags), |
| getThreadID(CGF, Loc)}; |
| |
| CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( |
| CGM.getModule(), OMPRTL___kmpc_barrier), |
| Args); |
| } |
| |
| void CGOpenMPRuntimeGPU::emitCriticalRegion( |
| CodeGenFunction &CGF, StringRef CriticalName, |
| const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc, |
| const Expr *Hint) { |
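|   // The threads of the team take turns executing the critical region: the |
|   // emitted control flow is roughly |
|   //   for (counter = 0; counter < <team width>; ++counter) { |
|   //     if (<thread id in team> == counter) |
|   //       <critical region body>; |
|   //     __kmpc_syncwarp(<active mask>); |
|   //   } |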
| llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop"); |
| llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test"); |
| llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync"); |
| llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body"); |
| llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit"); |
| |
| auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); |
| |
| // Get the mask of active threads in the warp. |
| llvm::Value *Mask = CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( |
| CGM.getModule(), OMPRTL___kmpc_warp_active_thread_mask)); |
| // Fetch team-local id of the thread. |
| llvm::Value *ThreadID = RT.getGPUThreadID(CGF); |
| |
| // Get the width of the team. |
| llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF); |
| |
| // Initialize the counter variable for the loop. |
| QualType Int32Ty = |
| CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0); |
| Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter"); |
| LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty); |
| CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal, |
| /*isInit=*/true); |
| |
| // Block checks if loop counter exceeds upper bound. |
| CGF.EmitBlock(LoopBB); |
| llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc); |
| llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth); |
| CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB); |
| |
|   // Block tests which single thread should execute the region, and which |
|   // threads should go straight to the synchronisation point. |
| CGF.EmitBlock(TestBB); |
| CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc); |
| llvm::Value *CmpThreadToCounter = |
| CGF.Builder.CreateICmpEQ(ThreadID, CounterVal); |
| CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB); |
| |
| // Block emits the body of the critical region. |
| CGF.EmitBlock(BodyBB); |
| |
| // Output the critical statement. |
| CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc, |
| Hint); |
| |
| // After the body surrounded by the critical region, the single executing |
| // thread will jump to the synchronisation point. |
| // Block waits for all threads in current team to finish then increments the |
| // counter variable and returns to the loop. |
| CGF.EmitBlock(SyncBB); |
| // Reconverge active threads in the warp. |
| (void)CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( |
| CGM.getModule(), OMPRTL___kmpc_syncwarp), |
| Mask); |
| |
| llvm::Value *IncCounterVal = |
| CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1)); |
| CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal); |
| CGF.EmitBranch(LoopBB); |
| |
| // Block that is reached when all threads in the team complete the region. |
| CGF.EmitBlock(ExitBB, /*IsFinished=*/true); |
| } |
| |
| /// Cast value to the specified type. |
| static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val, |
| QualType ValTy, QualType CastTy, |
| SourceLocation Loc) { |
|   assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() && |
|          "Cast type must be sized."); |
|   assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() && |
|          "Val type must be sized."); |
| llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy); |
| if (ValTy == CastTy) |
| return Val; |
| if (CGF.getContext().getTypeSizeInChars(ValTy) == |
| CGF.getContext().getTypeSizeInChars(CastTy)) |
| return CGF.Builder.CreateBitCast(Val, LLVMCastTy); |
| if (CastTy->isIntegerType() && ValTy->isIntegerType()) |
| return CGF.Builder.CreateIntCast(Val, LLVMCastTy, |
| CastTy->hasSignedIntegerRepresentation()); |
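|   // No direct cast applies; go through memory instead: store the value with |
|   // its original type into a temporary and load it back with the cast type. |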
| Address CastItem = CGF.CreateMemTemp(CastTy); |
| Address ValCastItem = CastItem.withElementType(Val->getType()); |
| CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy, |
| LValueBaseInfo(AlignmentSource::Type), |
| TBAAAccessInfo()); |
| return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc, |
| LValueBaseInfo(AlignmentSource::Type), |
| TBAAAccessInfo()); |
| } |
| |
| /// |
| /// Design of OpenMP reductions on the GPU |
| /// |
| /// Consider a typical OpenMP program with one or more reduction |
| /// clauses: |
| /// |
| /// float foo; |
| /// double bar; |
| /// #pragma omp target teams distribute parallel for \ |
| /// reduction(+:foo) reduction(*:bar) |
| /// for (int i = 0; i < N; i++) { |
| /// foo += A[i]; bar *= B[i]; |
| /// } |
| /// |
| /// where 'foo' and 'bar' are reduced across all OpenMP threads in |
| /// all teams. In our OpenMP implementation on the NVPTX device an |
| /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads |
| /// within a team are mapped to CUDA threads within a threadblock. |
| /// Our goal is to efficiently aggregate values across all OpenMP |
| /// threads such that: |
| /// |
| /// - the compiler and runtime are logically concise, and |
| /// - the reduction is performed efficiently in a hierarchical |
| /// manner as follows: within OpenMP threads in the same warp, |
| /// across warps in a threadblock, and finally across teams on |
| /// the NVPTX device. |
| /// |
| /// Introduction to Decoupling |
| /// |
| /// We would like to decouple the compiler and the runtime so that the |
| /// latter is ignorant of the reduction variables (number, data types) |
| /// and the reduction operators. This allows a simpler interface |
| /// and implementation while still attaining good performance. |
| /// |
| /// Pseudocode for the aforementioned OpenMP program generated by the |
| /// compiler is as follows: |
| /// |
| /// 1. Create private copies of reduction variables on each OpenMP |
| /// thread: 'foo_private', 'bar_private' |
| /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned |
| /// to it and writes the result in 'foo_private' and 'bar_private' |
| /// respectively. |
| /// 3. Call the OpenMP runtime on the GPU to reduce within a team |
| /// and store the result on the team master: |
| /// |
| /// __kmpc_nvptx_parallel_reduce_nowait_v2(..., |
| /// reduceData, shuffleReduceFn, interWarpCpyFn) |
| /// |
| /// where: |
| /// struct ReduceData { |
| /// double *foo; |
| /// double *bar; |
| /// } reduceData |
| /// reduceData.foo = &foo_private |
| /// reduceData.bar = &bar_private |
| /// |
| /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two |
| /// auxiliary functions generated by the compiler that operate on |
| /// variables of type 'ReduceData'. They help the runtime perform |
| /// algorithmic steps in a data-agnostic manner. |
| /// |
| /// 'shuffleReduceFn' is a pointer to a function that reduces data |
| /// of type 'ReduceData' across two OpenMP threads (lanes) in the |
| /// same warp. It takes the following arguments as input: |
| /// |
| /// a. variable of type 'ReduceData' on the calling lane, |
| /// b. its lane_id, |
| /// c. an offset relative to the current lane_id to generate a |
| /// remote_lane_id. The remote lane contains the second |
| /// variable of type 'ReduceData' that is to be reduced. |
| /// d. an algorithm version parameter determining which reduction |
| /// algorithm to use. |
| /// |
| /// 'shuffleReduceFn' retrieves data from the remote lane using |
| /// efficient GPU shuffle intrinsics and reduces, using the |
| /// algorithm specified by the 4th parameter, the two operands |
| /// element-wise. The result is written to the first operand. |
| /// |
| /// Different reduction algorithms are implemented in different |
| /// runtime functions, all calling 'shuffleReduceFn' to perform |
| /// the essential reduction step. Therefore, based on the 4th |
| /// parameter, this function behaves slightly differently to |
| /// cooperate with the runtime to ensure correctness under |
| /// different circumstances. |
| /// |
| /// 'InterWarpCpyFn' is a pointer to a function that transfers |
| /// reduced variables across warps. It tunnels, through CUDA |
| /// shared memory, the thread-private data of type 'ReduceData' |
| /// from lane 0 of each warp to a lane in the first warp. |
| /// 4. Call the OpenMP runtime on the GPU to reduce across teams. |
| /// The last team writes the global reduced value to memory. |
| /// |
| /// ret = __kmpc_nvptx_teams_reduce_nowait(..., |
| /// reduceData, shuffleReduceFn, interWarpCpyFn, |
| /// scratchpadCopyFn, loadAndReduceFn) |
| /// |
| /// 'scratchpadCopyFn' is a helper that stores reduced |
| /// data from the team master to a scratchpad array in |
| /// global memory. |
| /// |
| /// 'loadAndReduceFn' is a helper that loads data from |
| /// the scratchpad array and reduces it with the input |
| /// operand. |
| /// |
| /// These compiler generated functions hide address |
| /// calculation and alignment information from the runtime. |
| /// 5. if ret == 1: |
| /// The team master of the last team stores the reduced |
| /// result to the globals in memory. |
| /// foo += reduceData.foo; bar *= reduceData.bar |
| /// |
| /// |
| /// Warp Reduction Algorithms |
| /// |
| /// On the warp level, we have three algorithms implemented in the |
| /// OpenMP runtime depending on the number of active lanes: |
| /// |
| /// Full Warp Reduction |
| /// |
| /// The reduce algorithm within a warp where all lanes are active |
| /// is implemented in the runtime as follows: |
| /// |
| /// full_warp_reduce(void *reduce_data, |
| /// kmp_ShuffleReductFctPtr ShuffleReduceFn) { |
| /// for (int offset = WARPSIZE/2; offset > 0; offset /= 2) |
| /// ShuffleReduceFn(reduce_data, 0, offset, 0); |
| /// } |
| /// |
| /// The algorithm completes in log(2, WARPSIZE) steps. |
| /// |
| /// 'ShuffleReduceFn' is called here with lane_id set to 0 because the
| /// lane_id is not used; we therefore save instructions by not retrieving
| /// it from the corresponding special registers. The 4th parameter, which
| /// represents the version of the algorithm being used, is set to 0 to
| /// signify full warp reduction.
| /// |
| /// In this version, 'ShuffleReduceFn' behaves, per element, as follows: |
| /// |
| /// #reduce_elem refers to an element in the local lane's data structure |
| /// #remote_elem is retrieved from a remote lane |
| /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); |
| /// reduce_elem = reduce_elem REDUCE_OP remote_elem; |
| /// |
| /// Contiguous Partial Warp Reduction |
| /// |
| /// This reduce algorithm is used within a warp where only the first |
| /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the |
| /// number of OpenMP threads in a parallel region is not a multiple of |
| /// WARPSIZE. The algorithm is implemented in the runtime as follows: |
| /// |
| /// void |
| /// contiguous_partial_reduce(void *reduce_data, |
| /// kmp_ShuffleReductFctPtr ShuffleReduceFn, |
| /// int size, int lane_id) { |
| /// int curr_size; |
| /// int offset; |
| /// curr_size = size; |
| ///     offset = curr_size/2;
| ///     while (offset > 0) {
| /// ShuffleReduceFn(reduce_data, lane_id, offset, 1); |
| /// curr_size = (curr_size+1)/2; |
| /// offset = curr_size/2; |
| /// } |
| /// } |
| /// |
| /// In this version, 'ShuffleReduceFn' behaves, per element, as follows: |
| /// |
| /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); |
| /// if (lane_id < offset) |
| /// reduce_elem = reduce_elem REDUCE_OP remote_elem |
| /// else |
| /// reduce_elem = remote_elem |
| /// |
| /// This algorithm assumes that the data to be reduced are located in a |
| /// contiguous subset of lanes starting from the first. When there is |
| /// an odd number of active lanes, the data in the last lane is not |
| /// aggregated with any other lane's data but is instead copied over.
| /// |
| /// Dispersed Partial Warp Reduction |
| /// |
| /// This algorithm is used within a warp when any discontiguous subset of |
| /// lanes is active. It is used to implement the reduction operation
| /// across lanes in an OpenMP simd region or in a nested parallel region. |
| /// |
| /// void |
| /// dispersed_partial_reduce(void *reduce_data, |
| /// kmp_ShuffleReductFctPtr ShuffleReduceFn) { |
| /// int size, remote_id; |
| /// int logical_lane_id = number_of_active_lanes_before_me() * 2; |
| /// do { |
| /// remote_id = next_active_lane_id_right_after_me(); |
| ///       # the above function returns 0 if no active lane
| /// # is present right after the current lane. |
| /// size = number_of_active_lanes_in_this_warp(); |
| /// logical_lane_id /= 2; |
| /// ShuffleReduceFn(reduce_data, logical_lane_id, |
| /// remote_id-1-threadIdx.x, 2); |
| /// } while (logical_lane_id % 2 == 0 && size > 1); |
| /// } |
| /// |
| /// There is no assumption made about the initial state of the reduction. |
| /// Any number of lanes (>=1) could be active at any position. The reduction |
| /// result is returned in the first active lane. |
| /// |
| /// In this version, 'ShuffleReduceFn' behaves, per element, as follows: |
| /// |
| /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); |
| /// if (lane_id % 2 == 0 && offset > 0) |
| /// reduce_elem = reduce_elem REDUCE_OP remote_elem |
| /// else |
| /// reduce_elem = remote_elem |
| /// |
| /// |
| /// Intra-Team Reduction |
| /// |
| /// This function, as implemented in the runtime call |
| /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP |
| /// threads in a team. It first reduces within each warp using the
| /// aforementioned algorithms, and then gathers all such warp-reduced
| /// values at the first warp.
| /// |
| /// The runtime makes use of the function 'InterWarpCpyFn', which copies |
| /// data from each of the warp masters (the zeroth lane of each warp, where
| /// warp-reduced data is held) to the zeroth warp. This step reduces (in |
| /// a mathematical sense) the problem of reduction across warp masters in |
| /// a block to the problem of warp reduction. |
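| ///
| /// As an illustrative sketch only (helpers such as 'warp_reduce',
| /// 'warp_id' and 'number_of_warps_with_data' are placeholders, not actual
| /// runtime entry points), the flow is roughly:
| ///
| ///   intra_team_reduce(void *reduce_data,
| ///                     ShuffleReduceFn, InterWarpCpyFn) {
| ///     // Reduce within each warp using one of the warp algorithms above.
| ///     warp_reduce(reduce_data, ShuffleReduceFn);
| ///     // Copy each warp master's partial result into the first warp
| ///     // through shared memory.
| ///     InterWarpCpyFn(reduce_data, number_of_warps_with_data());
| ///     // The first warp reduces the gathered partial results.
| ///     if (warp_id() == 0)
| ///       warp_reduce(reduce_data, ShuffleReduceFn);
| ///   }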
| /// |
| /// |
| /// Inter-Team Reduction |
| /// |
| /// Once a team has reduced its data to a single value, it is stored in |
| /// a global scratchpad array. Since each team has a distinct slot, this |
| /// can be done without locking. |
| /// |
| /// The last team to write to the scratchpad array proceeds to reduce the |
| /// scratchpad array. One or more workers in the last team use the helper |
| /// 'loadAndReduceFn' to load and reduce values from the array, i.e.,
| /// the k'th worker reduces every k'th element. |
| /// |
| /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to |
| /// reduce across workers and compute a globally reduced value. |
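| ///
| /// An illustrative sketch of this flow (helpers such as 'team_id',
| /// 'is_last_team_to_arrive', 'num_teams' and 'num_threads' are
| /// placeholders, not actual runtime entry points):
| ///
| ///   inter_team_reduce(void *reduce_data, ...) {
| ///     // Each team master writes its team-reduced value into its own
| ///     // slot of the global scratchpad; no locking is needed.
| ///     scratchpadCopyFn(scratchpad, reduce_data, team_id());
| ///     if (!is_last_team_to_arrive())
| ///       return;
| ///     // Workers of the last team load and reduce the scratchpad slots
| ///     // in a strided fashion.
| ///     for (int i = thread_id(); i < num_teams(); i += num_threads())
| ///       loadAndReduceFn(reduce_data, scratchpad, i);
| ///     // Finally, reduce across the workers of the last team.
| ///     __kmpc_nvptx_parallel_reduce_nowait_v2(
| ///         ..., reduce_data, shuffleReduceFn, interWarpCpyFn);
| ///   }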
| /// |
| void CGOpenMPRuntimeGPU::emitReduction( |
| CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates, |
| ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs, |
| ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) { |
| if (!CGF.HaveInsertPoint()) |
| return; |
| |
| bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind); |
| bool DistributeReduction = isOpenMPDistributeDirective(Options.ReductionKind); |
| bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind); |
| |
| ASTContext &C = CGM.getContext(); |
| |
| if (Options.SimpleReduction) { |
| assert(!TeamsReduction && !ParallelReduction && |
| "Invalid reduction selection in emitReduction."); |
| (void)ParallelReduction; |
| CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs, |
| ReductionOps, Options); |
| return; |
| } |
| |
| llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap; |
| llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size()); |
| int Cnt = 0; |
| for (const Expr *DRE : Privates) { |
| PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl(); |
| ++Cnt; |
| } |
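|   // Pack the private reduction variables into a record type that describes
|   // the layout of the reduction data.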
| const RecordDecl *ReductionRec = ::buildRecordForGlobalizedVars( |
| CGM.getContext(), PrivatesReductions, {}, VarFieldMap, 1); |
| |
| if (TeamsReduction) |
| TeamsReductions.push_back(ReductionRec); |
| |
| // Source location for the ident struct |
| llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); |
| |
| using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy; |
| InsertPointTy AllocaIP(CGF.AllocaInsertPt->getParent(), |
| CGF.AllocaInsertPt->getIterator()); |
| InsertPointTy CodeGenIP(CGF.Builder.GetInsertBlock(), |
| CGF.Builder.GetInsertPoint()); |
| llvm::OpenMPIRBuilder::LocationDescription OmpLoc( |
| CodeGenIP, CGF.SourceLocToDebugLoc(Loc)); |
| llvm::SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos; |
| |
| CodeGenFunction::OMPPrivateScope Scope(CGF); |
| unsigned Idx = 0; |
| for (const Expr *Private : Privates) { |
| llvm::Type *ElementType; |
| llvm::Value *Variable; |
| llvm::Value *PrivateVariable; |
| llvm::OpenMPIRBuilder::ReductionGenAtomicCBTy AtomicReductionGen = nullptr; |
| ElementType = CGF.ConvertTypeForMem(Private->getType()); |
| const auto *RHSVar = |
| cast<VarDecl>(cast<DeclRefExpr>(RHSExprs[Idx])->getDecl()); |
| PrivateVariable = CGF.GetAddrOfLocalVar(RHSVar).emitRawPointer(CGF); |
| const auto *LHSVar = |
| cast<VarDecl>(cast<DeclRefExpr>(LHSExprs[Idx])->getDecl()); |
| Variable = CGF.GetAddrOfLocalVar(LHSVar).emitRawPointer(CGF); |
| llvm::OpenMPIRBuilder::EvalKind EvalKind; |
| switch (CGF.getEvaluationKind(Private->getType())) { |
| case TEK_Scalar: |
| EvalKind = llvm::OpenMPIRBuilder::EvalKind::Scalar; |
| break; |
| case TEK_Complex: |
| EvalKind = llvm::OpenMPIRBuilder::EvalKind::Complex; |
| break; |
| case TEK_Aggregate: |
| EvalKind = llvm::OpenMPIRBuilder::EvalKind::Aggregate; |
| break; |
| } |
| auto ReductionGen = [&](InsertPointTy CodeGenIP, unsigned I, |
| llvm::Value **LHSPtr, llvm::Value **RHSPtr, |
| llvm::Function *NewFunc) { |
| CGF.Builder.restoreIP(CodeGenIP); |
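|       // The combiner below is emitted into the helper function created by
|       // the OpenMPIRBuilder, so temporarily retarget CurFn at that function
|       // while generating it; it is restored after the combiner is emitted.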
| auto *CurFn = CGF.CurFn; |
| CGF.CurFn = NewFunc; |
| |
| *LHSPtr = CGF.GetAddrOfLocalVar( |
| cast<VarDecl>(cast<DeclRefExpr>(LHSExprs[I])->getDecl())) |
| .emitRawPointer(CGF); |
| *RHSPtr = CGF.GetAddrOfLocalVar( |
| cast<VarDecl>(cast<DeclRefExpr>(RHSExprs[I])->getDecl())) |
| .emitRawPointer(CGF); |
| |
| emitSingleReductionCombiner(CGF, ReductionOps[I], Privates[I], |
| cast<DeclRefExpr>(LHSExprs[I]), |
| cast<DeclRefExpr>(RHSExprs[I])); |
| |
| CGF.CurFn = CurFn; |
| |
| return InsertPointTy(CGF.Builder.GetInsertBlock(), |
| CGF.Builder.GetInsertPoint()); |
| }; |
| ReductionInfos.emplace_back(llvm::OpenMPIRBuilder::ReductionInfo( |
| ElementType, Variable, PrivateVariable, EvalKind, |
| /*ReductionGen=*/nullptr, ReductionGen, AtomicReductionGen)); |
| Idx++; |
| } |
| |
| llvm::OpenMPIRBuilder::InsertPointTy AfterIP = |
| cantFail(OMPBuilder.createReductionsGPU( |
| OmpLoc, AllocaIP, CodeGenIP, ReductionInfos, false, TeamsReduction, |
| DistributeReduction, llvm::OpenMPIRBuilder::ReductionGenCBKind::Clang, |
| CGF.getTarget().getGridValue(), |
| C.getLangOpts().OpenMPCUDAReductionBufNum, RTLoc)); |
| CGF.Builder.restoreIP(AfterIP); |
| return; |
| } |
| |
| const VarDecl * |
| CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD, |
| const VarDecl *NativeParam) const { |
| if (!NativeParam->getType()->isReferenceType()) |
| return NativeParam; |
| QualType ArgType = NativeParam->getType(); |
| QualifierCollector QC; |
| const Type *NonQualTy = QC.strip(ArgType); |
| QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType(); |
| if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) { |
| if (Attr->getCaptureKind() == OMPC_map) { |
| PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy, |
| LangAS::opencl_global); |
| } |
| } |
| ArgType = CGM.getContext().getPointerType(PointeeTy); |
| QC.addRestrict(); |
| enum { NVPTX_local_addr = 5 }; |
| QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr)); |
| ArgType = QC.apply(CGM.getContext(), ArgType); |
| if (isa<ImplicitParamDecl>(NativeParam)) |
| return ImplicitParamDecl::Create( |
| CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(), |
| NativeParam->getIdentifier(), ArgType, ImplicitParamKind::Other); |
| return ParmVarDecl::Create( |
| CGM.getContext(), |
| const_cast<DeclContext *>(NativeParam->getDeclContext()), |
| NativeParam->getBeginLoc(), NativeParam->getLocation(), |
| NativeParam->getIdentifier(), ArgType, |
| /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr); |
| } |
| |
| Address |
| CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF, |
| const VarDecl *NativeParam, |
| const VarDecl *TargetParam) const { |
| assert(NativeParam != TargetParam && |
| NativeParam->getType()->isReferenceType() && |
| "Native arg must not be the same as target arg."); |
| Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam); |
| QualType NativeParamType = NativeParam->getType(); |
| QualifierCollector QC; |
| const Type *NonQualTy = QC.strip(NativeParamType); |
| QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType(); |
| unsigned NativePointeeAddrSpace = |
| CGF.getTypes().getTargetAddressSpace(NativePointeeTy); |
| QualType TargetTy = TargetParam->getType(); |
| llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(LocalAddr, /*Volatile=*/false, |
| TargetTy, SourceLocation()); |
| // Cast to native address space. |
| TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( |
| TargetAddr, |
| llvm::PointerType::get(CGF.getLLVMContext(), NativePointeeAddrSpace)); |
| Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType); |
| CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false, |
| NativeParamType); |
| return NativeParamAddr; |
| } |
| |
| void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall( |
| CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn, |
| ArrayRef<llvm::Value *> Args) const { |
| SmallVector<llvm::Value *, 4> TargetArgs; |
| TargetArgs.reserve(Args.size()); |
| auto *FnType = OutlinedFn.getFunctionType(); |
| for (unsigned I = 0, E = Args.size(); I < E; ++I) { |
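|     // For variadic callees, forward any arguments beyond the declared
|     // parameters unchanged.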
| if (FnType->isVarArg() && FnType->getNumParams() <= I) { |
| TargetArgs.append(std::next(Args.begin(), I), Args.end()); |
| break; |
| } |
| llvm::Type *TargetType = FnType->getParamType(I); |
| llvm::Value *NativeArg = Args[I]; |
| if (!TargetType->isPointerTy()) { |
| TargetArgs.emplace_back(NativeArg); |
| continue; |
| } |
| TargetArgs.emplace_back( |
| CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(NativeArg, TargetType)); |
| } |
| CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs); |
| } |
| |
| /// Emit a function that wraps the outlined parallel region and controls
| /// the arguments that are passed to it. The wrapper ensures that the
| /// outlined function is called with the correct arguments when data is
| /// shared.
| llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper( |
| llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) { |
| ASTContext &Ctx = CGM.getContext(); |
| const auto &CS = *D.getCapturedStmt(OMPD_parallel); |
| |
| // Create a function that takes as argument the source thread. |
| FunctionArgList WrapperArgs; |
| QualType Int16QTy = |
| Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false); |
| QualType Int32QTy = |
| Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false); |
| ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(), |
| /*Id=*/nullptr, Int16QTy, |
| ImplicitParamKind::Other); |
| ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(), |
| /*Id=*/nullptr, Int32QTy, |
| ImplicitParamKind::Other); |
| WrapperArgs.emplace_back(&ParallelLevelArg); |
| WrapperArgs.emplace_back(&WrapperArg); |
| |
| const CGFunctionInfo &CGFI = |
| CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs); |
| |
| auto *Fn = llvm::Function::Create( |
| CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, |
| Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule()); |
| |
|   // Ensure we do not inline the function. This is trivially true for the ones
|   // passed to __kmpc_fork_call, but the ones called in serialized regions
|   // could be inlined. This is not perfect, but it is closer to the invariant
|   // we want, namely, that every data environment starts with a new function.
|   // TODO: We should pass the if condition to the runtime function and do the
|   // handling there. That would be much cleaner code.
| Fn->addFnAttr(llvm::Attribute::NoInline); |
| |
| CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); |
| Fn->setLinkage(llvm::GlobalValue::InternalLinkage); |
| Fn->setDoesNotRecurse(); |
| |
| CodeGenFunction CGF(CGM, /*suppressNewContext=*/true); |
| CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs, |
| D.getBeginLoc(), D.getBeginLoc()); |
| |
| const auto *RD = CS.getCapturedRecordDecl(); |
| auto CurField = RD->field_begin(); |
| |
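|   // The outlined parallel function expects a pointer to the bound thread id
|   // as its second argument; materialize a zero-initialized temporary for it.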
| Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, |
| /*Name=*/".zero.addr"); |
| CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr); |
| // Get the array of arguments. |
| SmallVector<llvm::Value *, 8> Args; |
| |
| Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).emitRawPointer(CGF)); |
| Args.emplace_back(ZeroAddr.emitRawPointer(CGF)); |
| |
| CGBuilderTy &Bld = CGF.Builder; |
| auto CI = CS.capture_begin(); |
| |
| // Use global memory for data sharing. |
| // Handle passing of global args to workers. |
| RawAddress GlobalArgs = |
| CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args"); |
| llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer(); |
| llvm::Value *DataSharingArgs[] = {GlobalArgsPtr}; |
| CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( |
| CGM.getModule(), OMPRTL___kmpc_get_shared_variables), |
| DataSharingArgs); |
| |
| // Retrieve the shared variables from the list of references returned |
| // by the runtime. Pass the variables to the outlined function. |
| Address SharedArgListAddress = Address::invalid(); |
| if (CS.capture_size() > 0 || |
| isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) { |
| SharedArgListAddress = CGF.EmitLoadOfPointer( |
| GlobalArgs, CGF.getContext() |
| .getPointerType(CGF.getContext().VoidPtrTy) |
| .castAs<PointerType>()); |
| } |
| unsigned Idx = 0; |
| if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) { |
| Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx); |
| Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast( |
| Src, Bld.getPtrTy(0), CGF.SizeTy); |
| llvm::Value *LB = CGF.EmitLoadOfScalar( |
| TypedAddress, |
| /*Volatile=*/false, |
| CGF.getContext().getPointerType(CGF.getContext().getSizeType()), |
| cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc()); |
| Args.emplace_back(LB); |
| ++Idx; |
| Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx); |
| TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(Src, Bld.getPtrTy(0), |
| CGF.SizeTy); |
| llvm::Value *UB = CGF.EmitLoadOfScalar( |
| TypedAddress, |
| /*Volatile=*/false, |
| CGF.getContext().getPointerType(CGF.getContext().getSizeType()), |
| cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc()); |
| Args.emplace_back(UB); |
| ++Idx; |
| } |
| if (CS.capture_size() > 0) { |
| ASTContext &CGFContext = CGF.getContext(); |
| for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) { |
| QualType ElemTy = CurField->getType(); |
| Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx); |
| Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast( |
| Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)), |
| CGF.ConvertTypeForMem(ElemTy)); |
| llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress, |
| /*Volatile=*/false, |
| CGFContext.getPointerType(ElemTy), |
| CI->getLocation()); |
| if (CI->capturesVariableByCopy() && |
| !CI->getCapturedVar()->getType()->isAnyPointerType()) { |
| Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(), |
| CI->getLocation()); |
| } |
| Args.emplace_back(Arg); |
| } |
| } |
| |
| emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args); |
| CGF.FinishFunction(); |
| return Fn; |
| } |
| |
| void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF, |
| const Decl *D) { |
| if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic) |
| return; |
| |
| assert(D && "Expected function or captured|block decl."); |
| assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 && |
| "Function is registered already."); |
| assert((!TeamAndReductions.first || TeamAndReductions.first == D) && |
| "Team is set but not processed."); |
| const Stmt *Body = nullptr; |
| bool NeedToDelayGlobalization = false; |
| if (const auto *FD = dyn_cast<FunctionDecl>(D)) { |
| Body = FD->getBody(); |
| } else if (const auto *BD = dyn_cast<BlockDecl>(D)) { |
| Body = BD->getBody(); |
| } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) { |
| Body = CD->getBody(); |
| NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP; |
| if (NeedToDelayGlobalization && |
| getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) |
| return; |
| } |
| if (!Body) |
| return; |
| CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second); |
| VarChecker.Visit(Body); |
| const RecordDecl *GlobalizedVarsRecord = |
| VarChecker.getGlobalizedRecord(IsInTTDRegion); |
| TeamAndReductions.first = nullptr; |
| TeamAndReductions.second.clear(); |
| ArrayRef<const ValueDecl *> EscapedVariableLengthDecls = |
| VarChecker.getEscapedVariableLengthDecls(); |
| ArrayRef<const ValueDecl *> DelayedVariableLengthDecls = |
| VarChecker.getDelayedVariableLengthDecls(); |
| if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty() && |
| DelayedVariableLengthDecls.empty()) |
| return; |
| auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first; |
| I->getSecond().MappedParams = |
| std::make_unique<CodeGenFunction::OMPMapVars>(); |
| I->getSecond().EscapedParameters.insert( |
| VarChecker.getEscapedParameters().begin(), |
| VarChecker.getEscapedParameters().end()); |
| I->getSecond().EscapedVariableLengthDecls.append( |
| EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end()); |
| I->getSecond().DelayedVariableLengthDecls.append( |
| DelayedVariableLengthDecls.begin(), DelayedVariableLengthDecls.end()); |
| DeclToAddrMapTy &Data = I->getSecond().LocalVarData; |
| for (const ValueDecl *VD : VarChecker.getEscapedDecls()) { |
| assert(VD->isCanonicalDecl() && "Expected canonical declaration"); |
| Data.insert(std::make_pair(VD, MappedVarData())); |
| } |
| if (!NeedToDelayGlobalization) { |
| emitGenericVarsProlog(CGF, D->getBeginLoc()); |
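|     // Register a cleanup so the globalized variables are released on both
|     // normal and exceptional exits from the function.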
| struct GlobalizationScope final : EHScopeStack::Cleanup { |
| GlobalizationScope() = default; |
| |
| void Emit(CodeGenFunction &CGF, Flags flags) override { |
| static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()) |
| .emitGenericVarsEpilog(CGF); |
| } |
| }; |
| CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup); |
| } |
| } |
| |
| Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF, |
| const VarDecl *VD) { |
| if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) { |
| const auto *A = VD->getAttr<OMPAllocateDeclAttr>(); |
| auto AS = LangAS::Default; |
| switch (A->getAllocatorType()) { |
| case OMPAllocateDeclAttr::OMPNullMemAlloc: |
| case OMPAllocateDeclAttr::OMPDefaultMemAlloc: |
| case OMPAllocateDeclAttr::OMPHighBWMemAlloc: |
| case OMPAllocateDeclAttr::OMPLowLatMemAlloc: |
| break; |
| case OMPAllocateDeclAttr::OMPThreadMemAlloc: |
| return Address::invalid(); |
| case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc: |
|       // TODO: implement support for user-defined allocators.
| return Address::invalid(); |
| case OMPAllocateDeclAttr::OMPConstMemAlloc: |
| AS = LangAS::cuda_constant; |
| break; |
| case OMPAllocateDeclAttr::OMPPTeamMemAlloc: |
| AS = LangAS::cuda_shared; |
| break; |
| case OMPAllocateDeclAttr::OMPLargeCapMemAlloc: |
| case OMPAllocateDeclAttr::OMPCGroupMemAlloc: |
| break; |
| } |
| llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType()); |
| auto *GV = new llvm::GlobalVariable( |
| CGM.getModule(), VarTy, /*isConstant=*/false, |
| llvm::GlobalValue::InternalLinkage, llvm::PoisonValue::get(VarTy), |
| VD->getName(), |
| /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal, |
| CGM.getContext().getTargetAddressSpace(AS)); |
| CharUnits Align = CGM.getContext().getDeclAlign(VD); |
| GV->setAlignment(Align.getAsAlign()); |
| return Address( |
| CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( |
| GV, CGF.Builder.getPtrTy(CGM.getContext().getTargetAddressSpace( |
| VD->getType().getAddressSpace()))), |
| VarTy, Align); |
| } |
| |
| if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic) |
| return Address::invalid(); |
| |
| VD = VD->getCanonicalDecl(); |
| auto I = FunctionGlobalizedDecls.find(CGF.CurFn); |
| if (I == FunctionGlobalizedDecls.end()) |
| return Address::invalid(); |
| auto VDI = I->getSecond().LocalVarData.find(VD); |
| if (VDI != I->getSecond().LocalVarData.end()) |
| return VDI->second.PrivateAddr; |
| if (VD->hasAttrs()) { |
| for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()), |
| E(VD->attr_end()); |
| IT != E; ++IT) { |
| auto VDI = I->getSecond().LocalVarData.find( |
| cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl()) |
| ->getCanonicalDecl()); |
| if (VDI != I->getSecond().LocalVarData.end()) |
| return VDI->second.PrivateAddr; |
| } |
| } |
| |
| return Address::invalid(); |
| } |
| |
| void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) { |
| FunctionGlobalizedDecls.erase(CGF.CurFn); |
| CGOpenMPRuntime::functionFinished(CGF); |
| } |
| |
| void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk( |
| CodeGenFunction &CGF, const OMPLoopDirective &S, |
| OpenMPDistScheduleClauseKind &ScheduleKind, |
| llvm::Value *&Chunk) const { |
| auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); |
| if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) { |
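|     // In SPMD mode, default to dist_schedule(static, <threads-per-team>) so
|     // each team's distribute chunk matches the number of threads in the team.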
| ScheduleKind = OMPC_DIST_SCHEDULE_static; |
| Chunk = CGF.EmitScalarConversion( |
| RT.getGPUNumThreads(CGF), |
| CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0), |
| S.getIterationVariable()->getType(), S.getBeginLoc()); |
| return; |
| } |
| CGOpenMPRuntime::getDefaultDistScheduleAndChunk( |
| CGF, S, ScheduleKind, Chunk); |
| } |
| |
| void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk( |
| CodeGenFunction &CGF, const OMPLoopDirective &S, |
| OpenMPScheduleClauseKind &ScheduleKind, |
| const Expr *&ChunkExpr) const { |
| ScheduleKind = OMPC_SCHEDULE_static; |
| // Chunk size is 1 in this case. |
| llvm::APInt ChunkSize(32, 1); |
| ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize, |
| CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0), |
| SourceLocation()); |
| } |
| |
| void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas( |
| CodeGenFunction &CGF, const OMPExecutableDirective &D) const { |
|   assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
|          "Expected target-based directive.");
| const CapturedStmt *CS = D.getCapturedStmt(OMPD_target); |
| for (const CapturedStmt::Capture &C : CS->captures()) { |
| // Capture variables captured by reference in lambdas for target-based |
| // directives. |
| if (!C.capturesVariable()) |
| continue; |
| const VarDecl *VD = C.getCapturedVar(); |
| const auto *RD = VD->getType() |
| .getCanonicalType() |
| .getNonReferenceType() |
| ->getAsCXXRecordDecl(); |
| if (!RD || !RD->isLambda()) |
| continue; |
| Address VDAddr = CGF.GetAddrOfLocalVar(VD); |
| LValue VDLVal; |
| if (VD->getType().getCanonicalType()->isReferenceType()) |
| VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType()); |
| else |
| VDLVal = CGF.MakeAddrLValue( |
| VDAddr, VD->getType().getCanonicalType().getNonReferenceType()); |
| llvm::DenseMap<const ValueDecl *, FieldDecl *> Captures; |
| FieldDecl *ThisCapture = nullptr; |
| RD->getCaptureFields(Captures, ThisCapture); |
| if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) { |
| LValue ThisLVal = |
| CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture); |
| llvm::Value *CXXThis = CGF.LoadCXXThis(); |
| CGF.EmitStoreOfScalar(CXXThis, ThisLVal); |
| } |
| for (const LambdaCapture &LC : RD->captures()) { |
| if (LC.getCaptureKind() != LCK_ByRef) |
| continue; |
| const ValueDecl *VD = LC.getCapturedVar(); |
| // FIXME: For now VD is always a VarDecl because OpenMP does not support |
| // capturing structured bindings in lambdas yet. |
| if (!CS->capturesVariable(cast<VarDecl>(VD))) |
| continue; |
| auto It = Captures.find(VD); |
| assert(It != Captures.end() && "Found lambda capture without field."); |
| LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second); |
| Address VDAddr = CGF.GetAddrOfLocalVar(cast<VarDecl>(VD)); |
| if (VD->getType().getCanonicalType()->isReferenceType()) |
| VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr, |
| VD->getType().getCanonicalType()) |
| .getAddress(); |
| CGF.EmitStoreOfScalar(VDAddr.emitRawPointer(CGF), VarLVal); |
| } |
| } |
| } |
| |
| bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD, |
| LangAS &AS) { |
| if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>()) |
| return false; |
| const auto *A = VD->getAttr<OMPAllocateDeclAttr>(); |
|   switch (A->getAllocatorType()) {
| case OMPAllocateDeclAttr::OMPNullMemAlloc: |
| case OMPAllocateDeclAttr::OMPDefaultMemAlloc: |
| // Not supported, fallback to the default mem space. |
| case OMPAllocateDeclAttr::OMPLargeCapMemAlloc: |
| case OMPAllocateDeclAttr::OMPCGroupMemAlloc: |
| case OMPAllocateDeclAttr::OMPHighBWMemAlloc: |
| case OMPAllocateDeclAttr::OMPLowLatMemAlloc: |
| case OMPAllocateDeclAttr::OMPThreadMemAlloc: |
| AS = LangAS::Default; |
| return true; |
| case OMPAllocateDeclAttr::OMPConstMemAlloc: |
| AS = LangAS::cuda_constant; |
| return true; |
| case OMPAllocateDeclAttr::OMPPTeamMemAlloc: |
| AS = LangAS::cuda_shared; |
| return true; |
| case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc: |
| llvm_unreachable("Expected predefined allocator for the variables with the " |
| "static storage."); |
| } |
| return false; |
| } |
| |
| // Get the current OffloadArch and ignore any unknown values.
| static OffloadArch getOffloadArch(CodeGenModule &CGM) { |
| if (!CGM.getTarget().hasFeature("ptx")) |
| return OffloadArch::UNKNOWN; |
| for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) { |
| if (Feature.getValue()) { |
| OffloadArch Arch = StringToOffloadArch(Feature.getKey()); |
| if (Arch != OffloadArch::UNKNOWN) |
| return Arch; |
| } |
| } |
| return OffloadArch::UNKNOWN; |
| } |
| |
| /// Check whether the target architecture supports unified addressing, which
| /// is a restriction for the OpenMP requires clause "unified_shared_memory".
| void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) { |
| for (const OMPClause *Clause : D->clauselists()) { |
| if (Clause->getClauseKind() == OMPC_unified_shared_memory) { |
| OffloadArch Arch = getOffloadArch(CGM); |
| switch (Arch) { |
| case OffloadArch::SM_20: |
| case OffloadArch::SM_21: |
| case OffloadArch::SM_30: |
| case OffloadArch::SM_32_: |
| case OffloadArch::SM_35: |
| case OffloadArch::SM_37: |
| case OffloadArch::SM_50: |
| case OffloadArch::SM_52: |
| case OffloadArch::SM_53: { |
| SmallString<256> Buffer; |
| llvm::raw_svector_ostream Out(Buffer); |
| Out << "Target architecture " << OffloadArchToString(Arch) |
| << " does not support unified addressing"; |
| CGM.Error(Clause->getBeginLoc(), Out.str()); |
| return; |
| } |
| case OffloadArch::SM_60: |
| case OffloadArch::SM_61: |
| case OffloadArch::SM_62: |
| case OffloadArch::SM_70: |
| case OffloadArch::SM_72: |
| case OffloadArch::SM_75: |
| case OffloadArch::SM_80: |
| case OffloadArch::SM_86: |
| case OffloadArch::SM_87: |
| case OffloadArch::SM_89: |
| case OffloadArch::SM_90: |
| case OffloadArch::SM_90a: |
| case OffloadArch::SM_100: |
| case OffloadArch::SM_100a: |
| case OffloadArch::GFX600: |
| case OffloadArch::GFX601: |
| case OffloadArch::GFX602: |
| case OffloadArch::GFX700: |
| case OffloadArch::GFX701: |
| case OffloadArch::GFX702: |
| case OffloadArch::GFX703: |
| case OffloadArch::GFX704: |
| case OffloadArch::GFX705: |
| case OffloadArch::GFX801: |
| case OffloadArch::GFX802: |
| case OffloadArch::GFX803: |
| case OffloadArch::GFX805: |
| case OffloadArch::GFX810: |
| case OffloadArch::GFX9_GENERIC: |
| case OffloadArch::GFX900: |
| case OffloadArch::GFX902: |
| case OffloadArch::GFX904: |
| case OffloadArch::GFX906: |
| case OffloadArch::GFX908: |
| case OffloadArch::GFX909: |
| case OffloadArch::GFX90a: |
| case OffloadArch::GFX90c: |
| case OffloadArch::GFX9_4_GENERIC: |
| case OffloadArch::GFX940: |
| case OffloadArch::GFX941: |
| case OffloadArch::GFX942: |
| case OffloadArch::GFX950: |
| case OffloadArch::GFX10_1_GENERIC: |
| case OffloadArch::GFX1010: |
| case OffloadArch::GFX1011: |
| case OffloadArch::GFX1012: |
| case OffloadArch::GFX1013: |
| case OffloadArch::GFX10_3_GENERIC: |
| case OffloadArch::GFX1030: |
| case OffloadArch::GFX1031: |
| case OffloadArch::GFX1032: |
| case OffloadArch::GFX1033: |
| case OffloadArch::GFX1034: |
| case OffloadArch::GFX1035: |
| case OffloadArch::GFX1036: |
| case OffloadArch::GFX11_GENERIC: |
| case OffloadArch::GFX1100: |
| case OffloadArch::GFX1101: |
| case OffloadArch::GFX1102: |
| case OffloadArch::GFX1103: |
| case OffloadArch::GFX1150: |
| case OffloadArch::GFX1151: |
| case OffloadArch::GFX1152: |
| case OffloadArch::GFX1153: |
| case OffloadArch::GFX12_GENERIC: |
| case OffloadArch::GFX1200: |
| case OffloadArch::GFX1201: |
| case OffloadArch::AMDGCNSPIRV: |
| case OffloadArch::Generic: |
| case OffloadArch::UNUSED: |
| case OffloadArch::UNKNOWN: |
| break; |
| case OffloadArch::LAST: |
| llvm_unreachable("Unexpected GPU arch."); |
| } |
| } |
| } |
| CGOpenMPRuntime::processRequiresDirective(D); |
| } |
| |
| llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) { |
| CGBuilderTy &Bld = CGF.Builder; |
| llvm::Module *M = &CGF.CGM.getModule(); |
| const char *LocSize = "__kmpc_get_hardware_num_threads_in_block"; |
| llvm::Function *F = M->getFunction(LocSize); |
| if (!F) { |
| F = llvm::Function::Create(llvm::FunctionType::get(CGF.Int32Ty, {}, false), |
| llvm::GlobalVariable::ExternalLinkage, LocSize, |
| &CGF.CGM.getModule()); |
| } |
| return Bld.CreateCall(F, {}, "nvptx_num_threads"); |
| } |
| |
| llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) { |
| ArrayRef<llvm::Value *> Args{}; |
| return CGF.EmitRuntimeCall( |
| OMPBuilder.getOrCreateRuntimeFunction( |
| CGM.getModule(), OMPRTL___kmpc_get_hardware_thread_id_in_block), |
| Args); |
| } |