llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp - toolchain/llvm-project - Git at Google

 //===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 ///
 /// \file
 /// This file contains implementations for different VPlan recipes.
 ///
 //===----------------------------------------------------------------------===//

 #include "VPlan.h"
 #include "VPlanAnalysis.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Analysis/IVDescriptors.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
 #include <cassert>

 using namespace llvm;

 using VectorParts = SmallVector<Value *, 2>;

 namespace llvm {
 extern cl::opt<bool> EnableVPlanNativePath;
 }
 extern cl::opt<unsigned> ForceTargetInstructionCost;

 #define LV_NAME "loop-vectorize"
 #define DEBUG_TYPE LV_NAME

 bool VPRecipeBase::mayWriteToMemory() const {
   switch (getVPDefID()) {
   case VPInterleaveSC:
     return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
   case VPWidenStoreEVLSC:
   case VPWidenStoreSC:
     return true;
   case VPReplicateSC:
     return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
         ->mayWriteToMemory();
   case VPWidenCallSC:
     return !cast<VPWidenCallRecipe>(this)
                 ->getCalledScalarFunction()
                 ->onlyReadsMemory();
   case VPBranchOnMaskSC:
   case VPScalarIVStepsSC:
   case VPPredInstPHISC:
     return false;
   case VPBlendSC:
   case VPReductionEVLSC:
   case VPReductionSC:
   case VPWidenCanonicalIVSC:
   case VPWidenCastSC:
   case VPWidenGEPSC:
   case VPWidenIntOrFpInductionSC:
   case VPWidenLoadEVLSC:
   case VPWidenLoadSC:
   case VPWidenPHISC:
   case VPWidenSC:
   case VPWidenSelectSC: {
     const Instruction *I =
         dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
     (void)I;
     assert((!I || !I->mayWriteToMemory()) &&
            "underlying instruction may write to memory");
     return false;
   }
   default:
     return true;
   }
 }

 bool VPRecipeBase::mayReadFromMemory() const {
   switch (getVPDefID()) {
   case VPWidenLoadEVLSC:
   case VPWidenLoadSC:
     return true;
   case VPReplicateSC:
     return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
         ->mayReadFromMemory();
   case VPWidenCallSC:
     return !cast<VPWidenCallRecipe>(this)
                 ->getCalledScalarFunction()
                 ->onlyWritesMemory();
   case VPBranchOnMaskSC:
   case VPPredInstPHISC:
   case VPScalarIVStepsSC:
   case VPWidenStoreEVLSC:
   case VPWidenStoreSC:
     return false;
   case VPBlendSC:
   case VPReductionEVLSC:
   case VPReductionSC:
   case VPWidenCanonicalIVSC:
   case VPWidenCastSC:
   case VPWidenGEPSC:
   case VPWidenIntOrFpInductionSC:
   case VPWidenPHISC:
   case VPWidenSC:
   case VPWidenSelectSC: {
     const Instruction *I =
         dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
     (void)I;
     assert((!I || !I->mayReadFromMemory()) &&
            "underlying instruction may read from memory");
     return false;
   }
   default:
     return true;
   }
 }

 bool VPRecipeBase::mayHaveSideEffects() const {
   switch (getVPDefID()) {
   case VPDerivedIVSC:
   case VPPredInstPHISC:
   case VPScalarCastSC:
     return false;
   case VPInstructionSC:
     switch (cast<VPInstruction>(this)->getOpcode()) {
     case Instruction::Or:
     case Instruction::ICmp:
     case Instruction::Select:
     case VPInstruction::Not:
     case VPInstruction::CalculateTripCountMinusVF:
     case VPInstruction::CanonicalIVIncrementForPart:
     case VPInstruction::ExtractFromEnd:
     case VPInstruction::FirstOrderRecurrenceSplice:
     case VPInstruction::LogicalAnd:
     case VPInstruction::PtrAdd:
       return false;
     default:
       return true;
     }
   case VPWidenCallSC: {
     Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
     return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
   }
   case VPBlendSC:
   case VPReductionEVLSC:
   case VPReductionSC:
   case VPScalarIVStepsSC:
   case VPWidenCanonicalIVSC:
   case VPWidenCastSC:
   case VPWidenGEPSC:
   case VPWidenIntOrFpInductionSC:
   case VPWidenPHISC:
   case VPWidenPointerInductionSC:
   case VPWidenSC:
   case VPWidenSelectSC: {
     const Instruction *I =
         dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
     (void)I;
     assert((!I || !I->mayHaveSideEffects()) &&
            "underlying instruction has side-effects");
     return false;
   }
   case VPInterleaveSC:
     return mayWriteToMemory();
   case VPWidenLoadEVLSC:
   case VPWidenLoadSC:
   case VPWidenStoreEVLSC:
   case VPWidenStoreSC:
     assert(
         cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
             mayWriteToMemory() &&
         "mayHaveSideffects result for ingredient differs from this "
         "implementation");
     return mayWriteToMemory();
   case VPReplicateSC: {
     auto *R = cast<VPReplicateRecipe>(this);
     return R->getUnderlyingInstr()->mayHaveSideEffects();
   }
   default:
     return true;
   }
 }

 void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
   VPValue *ExitValue = getOperand(0);
   VPBasicBlock *MiddleVPBB =
       cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
   VPRecipeBase *ExitingRecipe = ExitValue->getDefiningRecipe();
   auto *ExitingVPBB = ExitingRecipe ? ExitingRecipe->getParent() : nullptr;
   // Values leaving the vector loop reach live out phi's in the exiting block
   // via middle block.
   auto *PredVPBB = !ExitingVPBB || ExitingVPBB->getEnclosingLoopRegion()
                        ? MiddleVPBB
                        : ExitingVPBB;
   BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
   Value *V = State.get(ExitValue, VPIteration(0, 0));
   if (Phi->getBasicBlockIndex(PredBB) != -1)
     Phi->setIncomingValueForBlock(PredBB, V);
   else
     Phi->addIncoming(V, PredBB);
 }

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
   O << "Live-out ";
   getPhi()->printAsOperand(O);
   O << " = ";
   getOperand(0)->printAsOperand(O, SlotTracker);
   O << "\n";
 }
 #endif

 void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
   assert(!Parent && "Recipe already in some VPBasicBlock");
   assert(InsertPos->getParent() &&
          "Insertion position not in any VPBasicBlock");
   InsertPos->getParent()->insert(this, InsertPos->getIterator());
 }

 void VPRecipeBase::insertBefore(VPBasicBlock &BB,
                                 iplist<VPRecipeBase>::iterator I) {
   assert(!Parent && "Recipe already in some VPBasicBlock");
   assert(I == BB.end() || I->getParent() == &BB);
   BB.insert(this, I);
 }

 void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
   assert(!Parent && "Recipe already in some VPBasicBlock");
   assert(InsertPos->getParent() &&
          "Insertion position not in any VPBasicBlock");
   InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
 }

 void VPRecipeBase::removeFromParent() {
   assert(getParent() && "Recipe not in any VPBasicBlock");
   getParent()->getRecipeList().remove(getIterator());
   Parent = nullptr;
 }

 iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
   assert(getParent() && "Recipe not in any VPBasicBlock");
   return getParent()->getRecipeList().erase(getIterator());
 }

 void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
   removeFromParent();
   insertAfter(InsertPos);
 }

 void VPRecipeBase::moveBefore(VPBasicBlock &BB,
                               iplist<VPRecipeBase>::iterator I) {
   removeFromParent();
   insertBefore(BB, I);
 }

 /// Return the underlying instruction to be used for computing \p R's cost via
 /// the legacy cost model. Return nullptr if there's no suitable instruction.
 static Instruction *getInstructionForCost(const VPRecipeBase *R) {
   if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
     return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
   if (auto *IG = dyn_cast<VPInterleaveRecipe>(R))
     return IG->getInsertPos();
   if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
     return &WidenMem->getIngredient();
   return nullptr;
 }

 InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
   auto *UI = getInstructionForCost(this);
   if (UI && Ctx.skipCostComputation(UI, VF.isVector()))
     return 0;

   InstructionCost RecipeCost = computeCost(VF, Ctx);
   if (UI && ForceTargetInstructionCost.getNumOccurrences() > 0 &&
       RecipeCost.isValid())
     RecipeCost = InstructionCost(ForceTargetInstructionCost);

   LLVM_DEBUG({
     dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
     dump();
   });
   return RecipeCost;
 }

 InstructionCost VPRecipeBase::computeCost(ElementCount VF,
                                           VPCostContext &Ctx) const {
   // Compute the cost for the recipe falling back to the legacy cost model using
   // the underlying instruction. If there is no underlying instruction, returns
   // 0.
   Instruction *UI = getInstructionForCost(this);
   if (UI && isa<VPReplicateRecipe>(this)) {
     // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
     // transform, avoid computing their cost multiple times for now.
     Ctx.SkipCostComputation.insert(UI);
   }
   return UI ? Ctx.getLegacyCost(UI, VF) : 0;
 }

 FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
   assert(OpType == OperationType::FPMathOp &&
          "recipe doesn't have fast math flags");
   FastMathFlags Res;
   Res.setAllowReassoc(FMFs.AllowReassoc);
   Res.setNoNaNs(FMFs.NoNaNs);
   Res.setNoInfs(FMFs.NoInfs);
   Res.setNoSignedZeros(FMFs.NoSignedZeros);
   Res.setAllowReciprocal(FMFs.AllowReciprocal);
   Res.setAllowContract(FMFs.AllowContract);
   Res.setApproxFunc(FMFs.ApproxFunc);
   return Res;
 }

 VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
                              VPValue *A, VPValue *B, DebugLoc DL,
                              const Twine &Name)
     : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}),
                           Pred, DL),
       Opcode(Opcode), Name(Name.str()) {
   assert(Opcode == Instruction::ICmp &&
          "only ICmp predicates supported at the moment");
 }

 VPInstruction::VPInstruction(unsigned Opcode,
                              std::initializer_list<VPValue *> Operands,
                              FastMathFlags FMFs, DebugLoc DL, const Twine &Name)
     : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL),
       Opcode(Opcode), Name(Name.str()) {
   // Make sure the VPInstruction is a floating-point operation.
   assert(isFPMathOp() && "this op can't take fast-math flags");
 }

 bool VPInstruction::doesGeneratePerAllLanes() const {
   return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this);
 }

 bool VPInstruction::canGenerateScalarForFirstLane() const {
   if (Instruction::isBinaryOp(getOpcode()))
     return true;
   if (isSingleScalar() || isVectorToScalar())
     return true;
   switch (Opcode) {
   case Instruction::ICmp:
   case VPInstruction::BranchOnCond:
   case VPInstruction::BranchOnCount:
   case VPInstruction::CalculateTripCountMinusVF:
   case VPInstruction::CanonicalIVIncrementForPart:
   case VPInstruction::PtrAdd:
   case VPInstruction::ExplicitVectorLength:
     return true;
   default:
     return false;
   }
 }

 Value *VPInstruction::generatePerLane(VPTransformState &State,
                                       const VPIteration &Lane) {
   IRBuilderBase &Builder = State.Builder;

   assert(getOpcode() == VPInstruction::PtrAdd &&
          "only PtrAdd opcodes are supported for now");
   return Builder.CreatePtrAdd(State.get(getOperand(0), Lane),
                               State.get(getOperand(1), Lane), Name);
 }

 Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
   IRBuilderBase &Builder = State.Builder;

   if (Instruction::isBinaryOp(getOpcode())) {
     bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
     Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed);
     Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed);
     auto *Res =
         Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
     if (auto *I = dyn_cast<Instruction>(Res))
       setFlags(I);
     return Res;
   }

   switch (getOpcode()) {
   case VPInstruction::Not: {
     Value *A = State.get(getOperand(0), Part);
     return Builder.CreateNot(A, Name);
   }
   case Instruction::ICmp: {
     bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
     Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed);
     Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed);
     return Builder.CreateCmp(getPredicate(), A, B, Name);
   }
   case Instruction::Select: {
     Value *Cond = State.get(getOperand(0), Part);
     Value *Op1 = State.get(getOperand(1), Part);
     Value *Op2 = State.get(getOperand(2), Part);
     return Builder.CreateSelect(Cond, Op1, Op2, Name);
   }
   case VPInstruction::ActiveLaneMask: {
     // Get first lane of vector induction variable.
     Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
     // Get the original loop tripcount.
     Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0));

     // If this part of the active lane mask is scalar, generate the CMP directly
     // to avoid unnecessary extracts.
     if (State.VF.isScalar())
       return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
                                Name);

     auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
     auto *PredTy = VectorType::get(Int1Ty, State.VF);
     return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                    {PredTy, ScalarTC->getType()},
                                    {VIVElem0, ScalarTC}, nullptr, Name);
   }
   case VPInstruction::FirstOrderRecurrenceSplice: {
     // Generate code to combine the previous and current values in vector v3.
     //
     //   vector.ph:
     //     v_init = vector(..., ..., ..., a[-1])
     //     br vector.body
     //
     //   vector.body
     //     i = phi [0, vector.ph], [i+4, vector.body]
     //     v1 = phi [v_init, vector.ph], [v2, vector.body]
     //     v2 = a[i, i+1, i+2, i+3];
     //     v3 = vector(v1(3), v2(0, 1, 2))

     // For the first part, use the recurrence phi (v1), otherwise v2.
     auto *V1 = State.get(getOperand(0), 0);
     Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1);
     if (!PartMinus1->getType()->isVectorTy())
       return PartMinus1;
     Value *V2 = State.get(getOperand(1), Part);
     return Builder.CreateVectorSplice(PartMinus1, V2, -1, Name);
   }
   case VPInstruction::CalculateTripCountMinusVF: {
     if (Part != 0)
       return State.get(this, 0, /*IsScalar*/ true);

     Value *ScalarTC = State.get(getOperand(0), {0, 0});
     Value *Step =
         createStepForVF(Builder, ScalarTC->getType(), State.VF, State.UF);
     Value *Sub = Builder.CreateSub(ScalarTC, Step);
     Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
     Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
     return Builder.CreateSelect(Cmp, Sub, Zero);
   }
   case VPInstruction::ExplicitVectorLength: {
     // Compute EVL
     auto GetEVL = [=](VPTransformState &State, Value *AVL) {
       assert(AVL->getType()->isIntegerTy() &&
              "Requested vector length should be an integer.");

       // TODO: Add support for MaxSafeDist for correct loop emission.
       assert(State.VF.isScalable() && "Expected scalable vector factor.");
       Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());

       Value *EVL = State.Builder.CreateIntrinsic(
           State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
           {AVL, VFArg, State.Builder.getTrue()});
       return EVL;
     };
     // TODO: Restructure this code with an explicit remainder loop, vsetvli can
     // be outside of the main loop.
     assert(Part == 0 && "No unrolling expected for predicated vectorization.");
     // Compute VTC - IV as the AVL (requested vector length).
     Value *Index = State.get(getOperand(0), VPIteration(0, 0));
     Value *TripCount = State.get(getOperand(1), VPIteration(0, 0));
     Value *AVL = State.Builder.CreateSub(TripCount, Index);
     Value *EVL = GetEVL(State, AVL);
     return EVL;
   }
   case VPInstruction::CanonicalIVIncrementForPart: {
     auto *IV = State.get(getOperand(0), VPIteration(0, 0));
     if (Part == 0)
       return IV;

     // The canonical IV is incremented by the vectorization factor (num of SIMD
     // elements) times the unroll part.
     Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
     return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
                              hasNoSignedWrap());
   }
   case VPInstruction::BranchOnCond: {
     if (Part != 0)
       return nullptr;

     Value *Cond = State.get(getOperand(0), VPIteration(Part, 0));
     // Replace the temporary unreachable terminator with a new conditional
     // branch, hooking it up to backward destination for exiting blocks now and
     // to forward destination(s) later when they are created.
     BranchInst *CondBr =
         Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);
     CondBr->setSuccessor(0, nullptr);
     Builder.GetInsertBlock()->getTerminator()->eraseFromParent();

     if (!getParent()->isExiting())
       return CondBr;

     VPRegionBlock *ParentRegion = getParent()->getParent();
     VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();
     CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);
     return CondBr;
   }
   case VPInstruction::BranchOnCount: {
     if (Part != 0)
       return nullptr;
     // First create the compare.
     Value *IV = State.get(getOperand(0), Part, /*IsScalar*/ true);
     Value *TC = State.get(getOperand(1), Part, /*IsScalar*/ true);
     Value *Cond = Builder.CreateICmpEQ(IV, TC);

     // Now create the branch.
     auto *Plan = getParent()->getPlan();
     VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
     VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();

     // Replace the temporary unreachable terminator with a new conditional
     // branch, hooking it up to backward destination (the header) now and to the
     // forward destination (the exit/middle block) later when it is created.
     // Note that CreateCondBr expects a valid BB as first argument, so we need
     // to set it to nullptr later.
     BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
                                               State.CFG.VPBB2IRBB[Header]);
     CondBr->setSuccessor(0, nullptr);
     Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
     return CondBr;
   }
   case VPInstruction::ComputeReductionResult: {
     if (Part != 0)
       return State.get(this, 0, /*IsScalar*/ true);

     // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
     // and will be removed by breaking up the recipe further.
     auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
     auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
     // Get its reduction variable descriptor.
     const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();

     RecurKind RK = RdxDesc.getRecurrenceKind();

     VPValue *LoopExitingDef = getOperand(1);
     Type *PhiTy = OrigPhi->getType();
     VectorParts RdxParts(State.UF);
     for (unsigned Part = 0; Part < State.UF; ++Part)
       RdxParts[Part] = State.get(LoopExitingDef, Part, PhiR->isInLoop());

     // If the vector reduction can be performed in a smaller type, we truncate
     // then extend the loop exit value to enable InstCombine to evaluate the
     // entire expression in the smaller type.
     // TODO: Handle this in truncateToMinBW.
     if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
       Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
       for (unsigned Part = 0; Part < State.UF; ++Part)
         RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
     }
     // Reduce all of the unrolled parts into a single vector.
     Value *ReducedPartRdx = RdxParts[0];
     unsigned Op = RecurrenceDescriptor::getOpcode(RK);
     if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK))
       Op = Instruction::Or;

     if (PhiR->isOrdered()) {
       ReducedPartRdx = RdxParts[State.UF - 1];
     } else {
       // Floating-point operations should have some FMF to enable the reduction.
       IRBuilderBase::FastMathFlagGuard FMFG(Builder);
       Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
       for (unsigned Part = 1; Part < State.UF; ++Part) {
         Value *RdxPart = RdxParts[Part];
         if (Op != Instruction::ICmp && Op != Instruction::FCmp)
           ReducedPartRdx = Builder.CreateBinOp(
               (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
         else
           ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
       }
     }

     // Create the reduction after the loop. Note that inloop reductions create
     // the target reduction in the loop using a Reduction recipe.
     if ((State.VF.isVector() ||
          RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) &&
         !PhiR->isInLoop()) {
       ReducedPartRdx =
           createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
       // If the reduction can be performed in a smaller type, we need to extend
       // the reduction to the wider type before we branch to the original loop.
       if (PhiTy != RdxDesc.getRecurrenceType())
         ReducedPartRdx = RdxDesc.isSigned()
                              ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
                              : Builder.CreateZExt(ReducedPartRdx, PhiTy);
     }

     // If there were stores of the reduction value to a uniform memory address
     // inside the loop, create the final store here.
     if (StoreInst *SI = RdxDesc.IntermediateStore) {
       auto *NewSI = Builder.CreateAlignedStore(
           ReducedPartRdx, SI->getPointerOperand(), SI->getAlign());
       propagateMetadata(NewSI, SI);
     }

     return ReducedPartRdx;
   }
   case VPInstruction::ExtractFromEnd: {
     if (Part != 0)
       return State.get(this, 0, /*IsScalar*/ true);

     auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
     unsigned Offset = CI->getZExtValue();
     assert(Offset > 0 && "Offset from end must be positive");
     Value *Res;
     if (State.VF.isVector()) {
       assert(Offset <= State.VF.getKnownMinValue() &&
              "invalid offset to extract from");
       // Extract lane VF - Offset from the operand.
       Res = State.get(
           getOperand(0),
           VPIteration(State.UF - 1, VPLane::getLaneFromEnd(State.VF, Offset)));
     } else {
       assert(Offset <= State.UF && "invalid offset to extract from");
       // When loop is unrolled without vectorizing, retrieve UF - Offset.
       Res = State.get(getOperand(0), State.UF - Offset);
     }
     if (isa<ExtractElementInst>(Res))
       Res->setName(Name);
     return Res;
   }
   case VPInstruction::LogicalAnd: {
     Value *A = State.get(getOperand(0), Part);
     Value *B = State.get(getOperand(1), Part);
     return Builder.CreateLogicalAnd(A, B, Name);
   }
   case VPInstruction::PtrAdd: {
     assert(vputils::onlyFirstLaneUsed(this) &&
            "can only generate first lane for PtrAdd");
     Value *Ptr = State.get(getOperand(0), Part, /* IsScalar */ true);
     Value *Addend = State.get(getOperand(1), Part, /* IsScalar */ true);
     return Builder.CreatePtrAdd(Ptr, Addend, Name);
   }
   case VPInstruction::ResumePhi: {
     if (Part != 0)
       return State.get(this, 0, /*IsScalar*/ true);
     Value *IncomingFromVPlanPred =
         State.get(getOperand(0), Part, /* IsScalar */ true);
     Value *IncomingFromOtherPreds =
         State.get(getOperand(1), Part, /* IsScalar */ true);
     auto *NewPhi =
         Builder.CreatePHI(IncomingFromOtherPreds->getType(), 2, Name);
     BasicBlock *VPlanPred =
         State.CFG
             .VPBB2IRBB[cast<VPBasicBlock>(getParent()->getSinglePredecessor())];
     NewPhi->addIncoming(IncomingFromVPlanPred, VPlanPred);
     for (auto *OtherPred : predecessors(Builder.GetInsertBlock())) {
       assert(OtherPred != VPlanPred &&
              "VPlan predecessors should not be connected yet");
       NewPhi->addIncoming(IncomingFromOtherPreds, OtherPred);
     }
     return NewPhi;
   }

   default:
     llvm_unreachable("Unsupported opcode for instruction");
   }
 }

 bool VPInstruction::isVectorToScalar() const {
   return getOpcode() == VPInstruction::ExtractFromEnd ||
          getOpcode() == VPInstruction::ComputeReductionResult;
 }

 bool VPInstruction::isSingleScalar() const {
   return getOpcode() == VPInstruction::ResumePhi;
 }

 #if !defined(NDEBUG)
 bool VPInstruction::isFPMathOp() const {
   // Inspired by FPMathOperator::classof. Notable differences are that we don't
   // support Call, PHI and Select opcodes here yet.
   return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
          Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
          Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
          Opcode == Instruction::FCmp || Opcode == Instruction::Select;
 }
 #endif

 void VPInstruction::execute(VPTransformState &State) {
   assert(!State.Instance && "VPInstruction executing an Instance");
   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
   assert((hasFastMathFlags() == isFPMathOp() ||
           getOpcode() == Instruction::Select) &&
          "Recipe not a FPMathOp but has fast-math flags?");
   if (hasFastMathFlags())
     State.Builder.setFastMathFlags(getFastMathFlags());
   State.setDebugLocFrom(getDebugLoc());
   bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
                                    (vputils::onlyFirstLaneUsed(this) ||
                                     isVectorToScalar() || isSingleScalar());
   bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
   bool OnlyFirstPartUsed = vputils::onlyFirstPartUsed(this);
   for (unsigned Part = 0; Part < State.UF; ++Part) {
     if (GeneratesPerAllLanes) {
       for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue();
            Lane != NumLanes; ++Lane) {
         Value *GeneratedValue = generatePerLane(State, VPIteration(Part, Lane));
         assert(GeneratedValue && "generatePerLane must produce a value");
         State.set(this, GeneratedValue, VPIteration(Part, Lane));
       }
       continue;
     }

     if (Part != 0 && OnlyFirstPartUsed && hasResult()) {
       Value *Part0 = State.get(this, 0, /*IsScalar*/ GeneratesPerFirstLaneOnly);
       State.set(this, Part0, Part,
                 /*IsScalar*/ GeneratesPerFirstLaneOnly);
       continue;
     }

     Value *GeneratedValue = generatePerPart(State, Part);
     if (!hasResult())
       continue;
     assert(GeneratedValue && "generatePerPart must produce a value");
     assert((GeneratedValue->getType()->isVectorTy() ==
                 !GeneratesPerFirstLaneOnly ||
             State.VF.isScalar()) &&
            "scalar value but not only first lane defined");
     State.set(this, GeneratedValue, Part,
               /*IsScalar*/ GeneratesPerFirstLaneOnly);
   }
 }

 bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
   assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
   if (Instruction::isBinaryOp(getOpcode()))
     return vputils::onlyFirstLaneUsed(this);

   switch (getOpcode()) {
   default:
     return false;
   case Instruction::ICmp:
   case VPInstruction::PtrAdd:
     // TODO: Cover additional opcodes.
     return vputils::onlyFirstLaneUsed(this);
   case VPInstruction::ActiveLaneMask:
   case VPInstruction::ExplicitVectorLength:
   case VPInstruction::CalculateTripCountMinusVF:
   case VPInstruction::CanonicalIVIncrementForPart:
   case VPInstruction::BranchOnCount:
   case VPInstruction::BranchOnCond:
   case VPInstruction::ResumePhi:
     return true;
   };
   llvm_unreachable("switch should return");
 }

 bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const {
   assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
   if (Instruction::isBinaryOp(getOpcode()))
     return vputils::onlyFirstPartUsed(this);

   switch (getOpcode()) {
   default:
     return false;
   case Instruction::ICmp:
   case Instruction::Select:
     return vputils::onlyFirstPartUsed(this);
   case VPInstruction::BranchOnCount:
   case VPInstruction::BranchOnCond:
   case VPInstruction::CanonicalIVIncrementForPart:
     return true;
   };
   llvm_unreachable("switch should return");
 }

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPInstruction::dump() const {
   VPSlotTracker SlotTracker(getParent()->getPlan());
   print(dbgs(), "", SlotTracker);
 }

 void VPInstruction::print(raw_ostream &O, const Twine &Indent,
                           VPSlotTracker &SlotTracker) const {
   O << Indent << "EMIT ";

   if (hasResult()) {
     printAsOperand(O, SlotTracker);
     O << " = ";
   }

   switch (getOpcode()) {
   case VPInstruction::Not:
     O << "not";
     break;
   case VPInstruction::SLPLoad:
     O << "combined load";
     break;
   case VPInstruction::SLPStore:
     O << "combined store";
     break;
   case VPInstruction::ActiveLaneMask:
     O << "active lane mask";
     break;
   case VPInstruction::ResumePhi:
     O << "resume-phi";
     break;
   case VPInstruction::ExplicitVectorLength:
     O << "EXPLICIT-VECTOR-LENGTH";
     break;
   case VPInstruction::FirstOrderRecurrenceSplice:
     O << "first-order splice";
     break;
   case VPInstruction::BranchOnCond:
     O << "branch-on-cond";
     break;
   case VPInstruction::CalculateTripCountMinusVF:
     O << "TC > VF ? TC - VF : 0";
     break;
   case VPInstruction::CanonicalIVIncrementForPart:
     O << "VF * Part +";
     break;
   case VPInstruction::BranchOnCount:
     O << "branch-on-count";
     break;
   case VPInstruction::ExtractFromEnd:
     O << "extract-from-end";
     break;
   case VPInstruction::ComputeReductionResult:
     O << "compute-reduction-result";
     break;
   case VPInstruction::LogicalAnd:
     O << "logical-and";
     break;
   case VPInstruction::PtrAdd:
     O << "ptradd";
     break;
   default:
     O << Instruction::getOpcodeName(getOpcode());
   }

   printFlags(O);
   printOperands(O, SlotTracker);

   if (auto DL = getDebugLoc()) {
     O << ", !dbg ";
     DL.print(O);
   }
 }
 #endif

 void VPWidenCallRecipe::execute(VPTransformState &State) {
   assert(State.VF.isVector() && "not widening");
   Function *CalledScalarFn = getCalledScalarFunction();
   assert(!isDbgInfoIntrinsic(CalledScalarFn->getIntrinsicID()) &&
          "DbgInfoIntrinsic should have been dropped during VPlan construction");
   State.setDebugLocFrom(getDebugLoc());

   bool UseIntrinsic = VectorIntrinsicID != Intrinsic::not_intrinsic;
   FunctionType *VFTy = nullptr;
   if (Variant)
     VFTy = Variant->getFunctionType();
   for (unsigned Part = 0; Part < State.UF; ++Part) {
     SmallVector<Type *, 2> TysForDecl;
     // Add return type if intrinsic is overloaded on it.
     if (UseIntrinsic &&
         isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1))
       TysForDecl.push_back(VectorType::get(
           CalledScalarFn->getReturnType()->getScalarType(), State.VF));
     SmallVector<Value *, 4> Args;
     for (const auto &I : enumerate(arg_operands())) {
       // Some intrinsics have a scalar argument - don't replace it with a
       // vector.
       Value *Arg;
       if (UseIntrinsic &&
           isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
         Arg = State.get(I.value(), VPIteration(0, 0));
       // Some vectorized function variants may also take a scalar argument,
       // e.g. linear parameters for pointers. This needs to be the scalar value
       // from the start of the respective part when interleaving.
       else if (VFTy && !VFTy->getParamType(I.index())->isVectorTy())
         Arg = State.get(I.value(), VPIteration(Part, 0));
       else
         Arg = State.get(I.value(), Part);
       if (UseIntrinsic &&
           isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index()))
         TysForDecl.push_back(Arg->getType());
       Args.push_back(Arg);
     }

     Function *VectorF;
     if (UseIntrinsic) {
       // Use vector version of the intrinsic.
       Module *M = State.Builder.GetInsertBlock()->getModule();
       VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);
       assert(VectorF && "Can't retrieve vector intrinsic.");
     } else {
 #ifndef NDEBUG
       assert(Variant != nullptr && "Can't create vector function.");
 #endif
       VectorF = Variant;
     }

     auto *CI = cast_or_null<CallInst>(getUnderlyingInstr());
     SmallVector<OperandBundleDef, 1> OpBundles;
     if (CI)
       CI->getOperandBundlesAsDefs(OpBundles);

     CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);

     if (isa<FPMathOperator>(V))
       V->copyFastMathFlags(CI);

     if (!V->getType()->isVoidTy())
       State.set(this, V, Part);
     State.addMetadata(V, CI);
   }
 }

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
   O << Indent << "WIDEN-CALL ";

   Function *CalledFn = getCalledScalarFunction();
   if (CalledFn->getReturnType()->isVoidTy())
     O << "void ";
   else {
     printAsOperand(O, SlotTracker);
     O << " = ";
   }

   O << "call @" << CalledFn->getName() << "(";
   interleaveComma(arg_operands(), O, [&O, &SlotTracker](VPValue *Op) {
     Op->printAsOperand(O, SlotTracker);
   });
   O << ")";

   if (VectorIntrinsicID)
     O << " (using vector intrinsic)";
   else {
     O << " (using library function";
     if (Variant->hasName())
       O << ": " << Variant->getName();
     O << ")";
   }
 }

 void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
   O << Indent << "WIDEN-SELECT ";
   printAsOperand(O, SlotTracker);
   O << " = select ";
   getOperand(0)->printAsOperand(O, SlotTracker);
   O << ", ";
   getOperand(1)->printAsOperand(O, SlotTracker);
   O << ", ";
   getOperand(2)->printAsOperand(O, SlotTracker);
   O << (isInvariantCond() ? " (condition is loop invariant)" : "");
 }
 #endif

 void VPWidenSelectRecipe::execute(VPTransformState &State) {
   State.setDebugLocFrom(getDebugLoc());

   // The condition can be loop invariant but still defined inside the
   // loop. This means that we can't just use the original 'cond' value.
   // We have to take the 'vectorized' value and pick the first lane.
   // Instcombine will make this a no-op.
   auto *InvarCond =
       isInvariantCond() ? State.get(getCond(), VPIteration(0, 0)) : nullptr;

   for (unsigned Part = 0; Part < State.UF; ++Part) {
     Value *Cond = InvarCond ? InvarCond : State.get(getCond(), Part);
     Value *Op0 = State.get(getOperand(1), Part);
     Value *Op1 = State.get(getOperand(2), Part);
     Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
     State.set(this, Sel, Part);
     State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
   }
 }

 VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
     const FastMathFlags &FMF) {
   AllowReassoc = FMF.allowReassoc();
   NoNaNs = FMF.noNaNs();
   NoInfs = FMF.noInfs();
   NoSignedZeros = FMF.noSignedZeros();
   AllowReciprocal = FMF.allowReciprocal();
   AllowContract = FMF.allowContract();
   ApproxFunc = FMF.approxFunc();
 }

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
   switch (OpType) {
   case OperationType::Cmp:
     O << " " << CmpInst::getPredicateName(getPredicate());
     break;
   case OperationType::DisjointOp:
     if (DisjointFlags.IsDisjoint)
       O << " disjoint";
     break;
   case OperationType::PossiblyExactOp:
     if (ExactFlags.IsExact)
       O << " exact";
     break;
   case OperationType::OverflowingBinOp:
     if (WrapFlags.HasNUW)
       O << " nuw";
     if (WrapFlags.HasNSW)
       O << " nsw";
     break;
   case OperationType::FPMathOp:
     getFastMathFlags().print(O);
     break;
   case OperationType::GEPOp:
     if (GEPFlags.IsInBounds)
       O << " inbounds";
     break;
   case OperationType::NonNegOp:
     if (NonNegFlags.NonNeg)
       O << " nneg";
     break;
   case OperationType::Other:
     break;
   }
   if (getNumOperands() > 0)
     O << " ";
 }
 #endif

 void VPWidenRecipe::execute(VPTransformState &State) {
   State.setDebugLocFrom(getDebugLoc());
   auto &Builder = State.Builder;
   switch (Opcode) {
   case Instruction::Call:
   case Instruction::Br:
   case Instruction::PHI:
   case Instruction::GetElementPtr:
   case Instruction::Select:
     llvm_unreachable("This instruction is handled by a different recipe.");
   case Instruction::UDiv:
   case Instruction::SDiv:
   case Instruction::SRem:
   case Instruction::URem:
   case Instruction::Add:
   case Instruction::FAdd:
   case Instruction::Sub:
   case Instruction::FSub:
   case Instruction::FNeg:
   case Instruction::Mul:
   case Instruction::FMul:
   case Instruction::FDiv:
   case Instruction::FRem:
   case Instruction::Shl:
   case Instruction::LShr:
   case Instruction::AShr:
   case Instruction::And:
   case Instruction::Or:
   case Instruction::Xor: {
     // Just widen unops and binops.
     for (unsigned Part = 0; Part < State.UF; ++Part) {
       SmallVector<Value *, 2> Ops;
       for (VPValue *VPOp : operands())
         Ops.push_back(State.get(VPOp, Part));

       Value *V = Builder.CreateNAryOp(Opcode, Ops);

       if (auto *VecOp = dyn_cast<Instruction>(V))
         setFlags(VecOp);

       // Use this vector value for all users of the original instruction.
       State.set(this, V, Part);
       State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
     }

     break;
   }
   case Instruction::Freeze: {
     for (unsigned Part = 0; Part < State.UF; ++Part) {
       Value *Op = State.get(getOperand(0), Part);

       Value *Freeze = Builder.CreateFreeze(Op);
       State.set(this, Freeze, Part);
     }
     break;
   }
   case Instruction::ICmp:
   case Instruction::FCmp: {
     // Widen compares. Generate vector compares.
     bool FCmp = Opcode == Instruction::FCmp;
     for (unsigned Part = 0; Part < State.UF; ++Part) {
       Value *A = State.get(getOperand(0), Part);
       Value *B = State.get(getOperand(1), Part);
       Value *C = nullptr;
       if (FCmp) {
         // Propagate fast math flags.
         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
         if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue()))
           Builder.setFastMathFlags(I->getFastMathFlags());
         C = Builder.CreateFCmp(getPredicate(), A, B);
       } else {
         C = Builder.CreateICmp(getPredicate(), A, B);
       }
       State.set(this, C, Part);
       State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
     }

     break;
   }
   default:
     // This instruction is not vectorized by simple widening.
     LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
                       << Instruction::getOpcodeName(Opcode));
     llvm_unreachable("Unhandled instruction!");
   } // end of switch.

 #if !defined(NDEBUG)
   // Verify that VPlan type inference results agree with the type of the
   // generated values.
   for (unsigned Part = 0; Part < State.UF; ++Part) {
     assert(VectorType::get(State.TypeAnalysis.inferScalarType(this),
                            State.VF) == State.get(this, Part)->getType() &&
            "inferred type and type from generated instructions do not match");
   }
 #endif
 }

 InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
                                            VPCostContext &Ctx) const {
   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   switch (Opcode) {
   case Instruction::FNeg: {
     Type *VectorTy =
         ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
     return Ctx.TTI.getArithmeticInstrCost(
         Opcode, VectorTy, CostKind,
         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None});
   }

   case Instruction::UDiv:
   case Instruction::SDiv:
   case Instruction::SRem:
   case Instruction::URem:
     // More complex computation, let the legacy cost-model handle this for now.
     return Ctx.getLegacyCost(cast<Instruction>(getUnderlyingValue()), VF);
   case Instruction::Add:
   case Instruction::FAdd:
   case Instruction::Sub:
   case Instruction::FSub:
   case Instruction::Mul:
   case Instruction::FMul:
   case Instruction::FDiv:
   case Instruction::FRem:
   case Instruction::Shl:
   case Instruction::LShr:
   case Instruction::AShr:
   case Instruction::And:
   case Instruction::Or:
   case Instruction::Xor: {
     VPValue *RHS = getOperand(1);
     // Certain instructions can be cheaper to vectorize if they have a constant
     // second vector operand. One example of this are shifts on x86.
     TargetTransformInfo::OperandValueInfo RHSInfo = {
         TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None};
     if (RHS->isLiveIn())
       RHSInfo = Ctx.TTI.getOperandInfo(RHS->getLiveInIRValue());

     if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
         getOperand(1)->isDefinedOutsideVectorRegions())
       RHSInfo.Kind = TargetTransformInfo::OK_UniformValue;
     Type *VectorTy =
         ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
     Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());

     SmallVector<const Value *, 4> Operands;
     if (CtxI)
       Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
     return Ctx.TTI.getArithmeticInstrCost(
         Opcode, VectorTy, CostKind,
         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
         RHSInfo, Operands, CtxI, &Ctx.TLI);
   }
   case Instruction::Freeze: {
     // This opcode is unknown. Assume that it is the same as 'mul'.
     Type *VectorTy =
         ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
     return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
   }
   case Instruction::ICmp:
   case Instruction::FCmp: {
     Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
     Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
     return Ctx.TTI.getCmpSelInstrCost(Opcode, VectorTy, nullptr, getPredicate(),
                                       CostKind, CtxI);
   }
   default:
     llvm_unreachable("Unsupported opcode for instruction");
   }
 }

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                           VPSlotTracker &SlotTracker) const {
   O << Indent << "WIDEN ";
   printAsOperand(O, SlotTracker);
   O << " = " << Instruction::getOpcodeName(Opcode);
   printFlags(O);
   printOperands(O, SlotTracker);
 }
 #endif

 void VPWidenCastRecipe::execute(VPTransformState &State) {
   State.setDebugLocFrom(getDebugLoc());
   auto &Builder = State.Builder;
   /// Vectorize casts.
   assert(State.VF.isVector() && "Not vectorizing?");
   Type *DestTy = VectorType::get(getResultType(), State.VF);
   VPValue *Op = getOperand(0);
   for (unsigned Part = 0; Part < State.UF; ++Part) {
     if (Part > 0 && Op->isLiveIn()) {
       // FIXME: Remove once explicit unrolling is implemented using VPlan.
       State.set(this, State.get(this, 0), Part);
       continue;
     }
     Value *A = State.get(Op, Part);
     Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
     State.set(this, Cast, Part);
     State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
   }
 }

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
   O << Indent << "WIDEN-CAST ";
   printAsOperand(O, SlotTracker);
   O << " = " << Instruction::getOpcodeName(Opcode) << " ";
   printFlags(O);
   printOperands(O, SlotTracker);
   O << " to " << *getResultType();
 }
 #endif

 /// This function adds
 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
 /// to each vector element of Val. The sequence starts at StartIndex.
 /// \p Opcode is relevant for FP induction variable.
 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
                             Instruction::BinaryOps BinOp, ElementCount VF,
                             IRBuilderBase &Builder) {
   assert(VF.isVector() && "only vector VFs are supported");

   // Create and check the types.
   auto *ValVTy = cast<VectorType>(Val->getType());
   ElementCount VLen = ValVTy->getElementCount();

   Type *STy = Val->getType()->getScalarType();
   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
          "Induction Step must be an integer or FP");
   assert(Step->getType() == STy && "Step has wrong type");

   SmallVector<Constant *, 8> Indices;

   // Create a vector of consecutive numbers from zero to VF.
   VectorType *InitVecValVTy = ValVTy;
   if (STy->isFloatingPointTy()) {
     Type *InitVecValSTy =
         IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
     InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
   }
   Value *InitVec = Builder.CreateStepVector(InitVecValVTy);

   // Splat the StartIdx
   Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);

   if (STy->isIntegerTy()) {
     InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
     Step = Builder.CreateVectorSplat(VLen, Step);
     assert(Step->getType() == Val->getType() && "Invalid step vec");
     // FIXME: The newly created binary instructions should contain nsw/nuw
     // flags, which can be found from the original scalar operations.
     Step = Builder.CreateMul(InitVec, Step);
     return Builder.CreateAdd(Val, Step, "induction");
   }

   // Floating point induction.
   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
          "Binary Opcode should be specified for FP induction");
   InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
   InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);

   Step = Builder.CreateVectorSplat(VLen, Step);
   Value *MulOp = Builder.CreateFMul(InitVec, Step);
   return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
 }

 /// A helper function that returns an integer or floating-point constant with
 /// value C.
 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                            : ConstantFP::get(Ty, C);
 }

 static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
                                   ElementCount VF) {
   assert(FTy->isFloatingPointTy() && "Expected floating point type!");
   Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
   Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
   return B.CreateUIToFP(RuntimeVF, FTy);
 }

 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
   assert(!State.Instance && "Int or FP induction being replicated.");

   Value *Start = getStartValue()->getLiveInIRValue();
   const InductionDescriptor &ID = getInductionDescriptor();
   TruncInst *Trunc = getTruncInst();
   IRBuilderBase &Builder = State.Builder;
   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
   assert(State.VF.isVector() && "must have vector VF");

   // The value from the original loop to which we are mapping the new induction
   // variable.
   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

   // Fast-math-flags propagate from the original induction instruction.
   IRBuilder<>::FastMathFlagGuard FMFG(Builder);
   if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
     Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());

   // Now do the actual transformations, and start with fetching the step value.
   Value *Step = State.get(getStepValue(), VPIteration(0, 0));

   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
          "Expected either an induction phi-node or a truncate of it!");

   // Construct the initial value of the vector IV in the vector loop preheader
   auto CurrIP = Builder.saveIP();
   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
   Builder.SetInsertPoint(VectorPH->getTerminator());
   if (isa<TruncInst>(EntryVal)) {
     assert(Start->getType()->isIntegerTy() &&
            "Truncation requires an integer type");
     auto *TruncType = cast<IntegerType>(EntryVal->getType());
     Step = Builder.CreateTrunc(Step, TruncType);
     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
   }

   Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
   Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
   Value *SteppedStart = getStepVector(
       SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);

   // We create vector phi nodes for both integer and floating-point induction
   // variables. Here, we determine the kind of arithmetic we will perform.
   Instruction::BinaryOps AddOp;
   Instruction::BinaryOps MulOp;
   if (Step->getType()->isIntegerTy()) {
     AddOp = Instruction::Add;
     MulOp = Instruction::Mul;
   } else {
     AddOp = ID.getInductionOpcode();
     MulOp = Instruction::FMul;
   }

   // Multiply the vectorization factor by the step using integer or
   // floating-point arithmetic as appropriate.
   Type *StepType = Step->getType();
   Value *RuntimeVF;
   if (Step->getType()->isFloatingPointTy())
     RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
   else
     RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
   Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);

   // Create a vector splat to use in the induction update.
   //
   // FIXME: If the step is non-constant, we create the vector splat with
   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
   //        handle a constant vector splat.
   Value *SplatVF = isa<Constant>(Mul)
                        ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
                        : Builder.CreateVectorSplat(State.VF, Mul);
   Builder.restoreIP(CurrIP);

   // We may need to add the step a number of times, depending on the unroll
   // factor. The last of those goes into the PHI.
   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind");
   VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
   VecInd->setDebugLoc(EntryVal->getDebugLoc());
   Instruction *LastInduction = VecInd;
   for (unsigned Part = 0; Part < State.UF; ++Part) {
     State.set(this, LastInduction, Part);

     if (isa<TruncInst>(EntryVal))
       State.addMetadata(LastInduction, EntryVal);

     LastInduction = cast<Instruction>(
         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
   }

   LastInduction->setName("vec.ind.next");
   VecInd->addIncoming(SteppedStart, VectorPH);
   // Add induction update using an incorrect block temporarily. The phi node
   // will be fixed after VPlan execution. Note that at this point the latch
   // block cannot be used, as it does not exist yet.
   // TODO: Model increment value in VPlan, by turning the recipe into a
   // multi-def and a subclass of VPHeaderPHIRecipe.
   VecInd->addIncoming(LastInduction, VectorPH);
 }

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                           VPSlotTracker &SlotTracker) const {
   O << Indent << "WIDEN-INDUCTION";
   if (getTruncInst()) {
     O << "\\l\"";
     O << " +\n" << Indent << "\"  " << VPlanIngredient(IV) << "\\l\"";
     O << " +\n" << Indent << "\"  ";
     getVPValue(0)->printAsOperand(O, SlotTracker);
   } else
     O << " " << VPlanIngredient(IV);

   O << ", ";
   getStepValue()->printAsOperand(O, SlotTracker);
 }
 #endif

 bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
   // The step may be defined by a recipe in the preheader (e.g. if it requires
   // SCEV expansion), but for the canonical induction the step is required to be
   // 1, which is represented as live-in.
   if (getStepValue()->getDefiningRecipe())
     return false;
   auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
   auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
   auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin());
   return StartC && StartC->isZero() && StepC && StepC->isOne() &&
          getScalarType() == CanIV->getScalarType();
 }

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
   O << Indent;
   printAsOperand(O, SlotTracker);
   O << Indent << "= DERIVED-IV ";
   getStartValue()->printAsOperand(O, SlotTracker);
   O << " + ";
   getOperand(1)->printAsOperand(O, SlotTracker);
   O << " * ";
   getStepValue()->printAsOperand(O, SlotTracker);
 }
 #endif

 void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
   // Fast-math-flags propagate from the original induction instruction.
   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
   if (hasFastMathFlags())
     State.Builder.setFastMathFlags(getFastMathFlags());

   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
   /// variable on which to base the steps, \p Step is the size of the step.

   Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0));
   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
   IRBuilderBase &Builder = State.Builder;

   // Ensure step has the same type as that of scalar IV.
   Type *BaseIVTy = BaseIV->getType()->getScalarType();
   assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");

   // We build scalar steps for both integer and floating-point induction
   // variables. Here, we determine the kind of arithmetic we will perform.
   Instruction::BinaryOps AddOp;
   Instruction::BinaryOps MulOp;
   if (BaseIVTy->isIntegerTy()) {
     AddOp = Instruction::Add;
     MulOp = Instruction::Mul;
   } else {
     AddOp = InductionOpcode;
     MulOp = Instruction::FMul;
   }

   // Determine the number of scalars we need to generate for each unroll
   // iteration.
   bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
   // Compute the scalar steps and save the results in State.
   Type *IntStepTy =
       IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
   Type *VecIVTy = nullptr;
   Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
   if (!FirstLaneOnly && State.VF.isScalable()) {
     VecIVTy = VectorType::get(BaseIVTy, State.VF);
     UnitStepVec =
         Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
     SplatStep = Builder.CreateVectorSplat(State.VF, Step);
     SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV);
   }

   unsigned StartPart = 0;
   unsigned EndPart = State.UF;
   unsigned StartLane = 0;
   unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
   if (State.Instance) {
     StartPart = State.Instance->Part;
     EndPart = StartPart + 1;
     StartLane = State.Instance->Lane.getKnownLane();
     EndLane = StartLane + 1;
   }
   for (unsigned Part = StartPart; Part < EndPart; ++Part) {
     Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);

     if (!FirstLaneOnly && State.VF.isScalable()) {
       auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
       auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
       if (BaseIVTy->isFloatingPointTy())
         InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
       auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
       auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
       State.set(this, Add, Part);
       // It's useful to record the lane values too for the known minimum number
       // of elements so we do those below. This improves the code quality when
       // trying to extract the first element, for example.
     }

     if (BaseIVTy->isFloatingPointTy())
       StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);

     for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
       Value *StartIdx = Builder.CreateBinOp(
           AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
       // The step returned by `createStepForVF` is a runtime-evaluated value
       // when VF is scalable. Otherwise, it should be folded into a Constant.
       assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
              "Expected StartIdx to be folded to a constant when VF is not "
              "scalable");
       auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
       auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
       State.set(this, Add, VPIteration(Part, Lane));
     }
   }
 }

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
   O << Indent;
   printAsOperand(O, SlotTracker);
   O << " = SCALAR-STEPS ";
   printOperands(O, SlotTracker);
 }
 #endif

 void VPWidenGEPRecipe::execute(VPTransformState &State) {
   assert(State.VF.isVector() && "not widening");
   auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
   // Construct a vector GEP by widening the operands of the scalar GEP as
   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
   // results in a vector of pointers when at least one operand of the GEP
   // is vector-typed. Thus, to keep the representation compact, we only use
   // vector-typed operands for loop-varying values.

   if (areAllOperandsInvariant()) {
     // If we are vectorizing, but the GEP has only loop-invariant operands,
     // the GEP we build (by only using vector-typed operands for
     // loop-varying values) would be a scalar pointer. Thus, to ensure we
     // produce a vector of pointers, we need to either arbitrarily pick an
     // operand to broadcast, or broadcast a clone of the original GEP.
     // Here, we broadcast a clone of the original.
     //
     // TODO: If at some point we decide to scalarize instructions having
     //       loop-invariant operands, this special case will no longer be
     //       required. We would add the scalarization decision to
     //       collectLoopScalars() and teach getVectorValue() to broadcast
     //       the lane-zero scalar value.
     SmallVector<Value *> Ops;
     for (unsigned I = 0, E = getNumOperands(); I != E; I++)
       Ops.push_back(State.get(getOperand(I), VPIteration(0, 0)));

     auto *NewGEP =
         State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0],
                                 ArrayRef(Ops).drop_front(), "", isInBounds());
     for (unsigned Part = 0; Part < State.UF; ++Part) {
       Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, NewGEP);
       State.set(this, EntryPart, Part);
       State.addMetadata(EntryPart, GEP);
     }
   } else {
     // If the GEP has at least one loop-varying operand, we are sure to
     // produce a vector of pointers. But if we are only unrolling, we want
     // to produce a scalar GEP for each unroll part. Thus, the GEP we
     // produce with the code below will be scalar (if VF == 1) or vector
     // (otherwise). Note that for the unroll-only case, we still maintain
     // values in the vector mapping with initVector, as we do for other
     // instructions.
     for (unsigned Part = 0; Part < State.UF; ++Part) {
       // The pointer operand of the new GEP. If it's loop-invariant, we
       // won't broadcast it.
       auto *Ptr = isPointerLoopInvariant()
                       ? State.get(getOperand(0), VPIteration(0, 0))
                       : State.get(getOperand(0), Part);

       // Collect all the indices for the new GEP. If any index is
       // loop-invariant, we won't broadcast it.
       SmallVector<Value *, 4> Indices;
       for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
         VPValue *Operand = getOperand(I);
         if (isIndexLoopInvariant(I - 1))
           Indices.push_back(State.get(Operand, VPIteration(0, 0)));
         else
           Indices.push_back(State.get(Operand, Part));
       }

       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
       // but it should be a vector, otherwise.
       auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
                                              Indices, "", isInBounds());
       assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
              "NewGEP is not a pointer vector");
       State.set(this, NewGEP, Part);
       State.addMetadata(NewGEP, GEP);
     }
   }
 }

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
   O << Indent << "WIDEN-GEP ";
   O << (isPointerLoopInvariant() ? "Inv" : "Var");
   for (size_t I = 0; I < getNumOperands() - 1; ++I)
     O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";

   O << " ";
   printAsOperand(O, SlotTracker);
   O << " = getelementptr";
   printFlags(O);
   printOperands(O, SlotTracker);
 }
 #endif

 void VPVectorPointerRecipe ::execute(VPTransformState &State) {
   auto &Builder = State.Builder;
   State.setDebugLocFrom(getDebugLoc());
   for (unsigned Part = 0; Part < State.UF; ++Part) {
     // Calculate the pointer for the specific unroll-part.
     Value *PartPtr = nullptr;
     // Use i32 for the gep index type when the value is constant,
     // or query DataLayout for a more suitable index type otherwise.
     const DataLayout &DL =
         Builder.GetInsertBlock()->getDataLayout();
     Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0)
                         ? DL.getIndexType(IndexedTy->getPointerTo())
                         : Builder.getInt32Ty();
     Value *Ptr = State.get(getOperand(0), VPIteration(0, 0));
     bool InBounds = isInBounds();
     if (IsReverse) {
       // If the address is consecutive but reversed, then the
       // wide store needs to start at the last vector element.
       // RunTimeVF =  VScale * VF.getKnownMinValue()
       // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
       Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
       // NumElt = -Part * RunTimeVF
       Value *NumElt = Builder.CreateMul(
           ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF);
       // LastLane = 1 - RunTimeVF
       Value *LastLane =
           Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
       PartPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds);
       PartPtr = Builder.CreateGEP(IndexedTy, PartPtr, LastLane, "", InBounds);
     } else {
       Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);
       PartPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds);
     }

     State.set(this, PartPtr, Part, /*IsScalar*/ true);
   }
 }

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
   O << Indent;
   printAsOperand(O, SlotTracker);
   O << " = vector-pointer ";
   if (IsReverse)
     O << "(reverse) ";

   printOperands(O, SlotTracker);
 }
 #endif

 void VPBlendRecipe::execute(VPTransformState &State) {
   assert(isNormalized() && "Expected blend to be normalized!");
   State.setDebugLocFrom(getDebugLoc());
   // We know that all PHIs in non-header blocks are converted into
   // selects, so we don't have to worry about the insertion order and we
   // can just use the builder.
   // At this point we generate the predication tree. There may be
   // duplications since this is a simple recursive scan, but future
   // optimizations will clean it up.

   unsigned NumIncoming = getNumIncomingValues();

   // Generate a sequence of selects of the form:
   // SELECT(Mask3, In3,
   //        SELECT(Mask2, In2,
   //               SELECT(Mask1, In1,
   //                      In0)))
   // Note that Mask0 is never used: lanes for which no path reaches this phi and
   // are essentially undef are taken from In0.
   VectorParts Entry(State.UF);
   bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
   for (unsigned In = 0; In < NumIncoming; ++In) {
     for (unsigned Part = 0; Part < State.UF; ++Part) {
       // We might have single edge PHIs (blocks) - use an identity
       // 'select' for the first PHI operand.
       Value *In0 = State.get(getIncomingValue(In), Part, OnlyFirstLaneUsed);
       if (In == 0)
         Entry[Part] = In0; // Initialize with the first incoming value.
       else {
         // Select between the current value and the previous incoming edge
         // based on the incoming mask.
         Value *Cond = State.get(getMask(In), Part, OnlyFirstLaneUsed);
         Entry[Part] =
             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
       }
     }
   }

   for (unsigned Part = 0; Part < State.UF; ++Part)
     State.set(this, Entry[Part], Part, OnlyFirstLaneUsed);
 }

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
                           VPSlotTracker &SlotTracker) const {
   O << Indent << "BLEND ";
   printAsOperand(O, SlotTracker);
   O << " =";
   if (getNumIncomingValues() == 1) {
     // Not a User of any mask: not really blending, this is a
     // single-predecessor phi.
     O << " ";
     getIncomingValue(0)->printAsOperand(O, SlotTracker);
   } else {
     for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
       O << " ";
       getIncomingValue(I)->printAsOperand(O, SlotTracker);
       if (I == 0)
         continue;
       O << "/";
       getMask(I)->printAsOperand(O, SlotTracker);
     }
   }
 }
 #endif

 void VPReductionRecipe::execute(VPTransformState &State) {
   assert(!State.Instance && "Reduction being replicated.");
   Value *PrevInChain = State.get(getChainOp(), 0, /*IsScalar*/ true);
   RecurKind Kind = RdxDesc.getRecurrenceKind();
   // Propagate the fast-math flags carried by the underlying instruction.
   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
   State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
   for (unsigned Part = 0; Part < State.UF; ++Part) {
     Value *NewVecOp = State.get(getVecOp(), Part);
     if (VPValue *Cond = getCondOp()) {
       Value *NewCond = State.get(Cond, Part, State.VF.isScalar());
       VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
       Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
       Value *Iden = RdxDesc.getRecurrenceIdentity(Kind, ElementTy,
                                                   RdxDesc.getFastMathFlags());
       if (State.VF.isVector()) {
         Iden = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
       }

       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Iden);
       NewVecOp = Select;
     }
     Value *NewRed;
     Value *NextInChain;
     if (IsOrdered) {
       if (State.VF.isVector())
         NewRed = createOrderedReduction(State.Builder, RdxDesc, NewVecOp,
                                         PrevInChain);
       else
         NewRed = State.Builder.CreateBinOp(
             (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain,
             NewVecOp);
       PrevInChain = NewRed;
     } else {
       PrevInChain = State.get(getChainOp(), Part, /*IsScalar*/ true);
       NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp);
     }
     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
       NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(),
                                    NewRed, PrevInChain);
     } else if (IsOrdered)
       NextInChain = NewRed;
     else
       NextInChain = State.Builder.CreateBinOp(
           (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain);
     State.set(this, NextInChain, Part, /*IsScalar*/ true);
   }
 }

 void VPReductionEVLRecipe::execute(VPTransformState &State) {
   assert(!State.Instance && "Reduction being replicated.");
   assert(State.UF == 1 &&
          "Expected only UF == 1 when vectorizing with explicit vector length.");

   auto &Builder = State.Builder;
   // Propagate the fast-math flags carried by the underlying instruction.
   IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
   const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor();
   Builder.setFastMathFlags(RdxDesc.getFastMathFlags());

   RecurKind Kind = RdxDesc.getRecurrenceKind();
   Value *Prev = State.get(getChainOp(), 0, /*IsScalar*/ true);
   Value *VecOp = State.get(getVecOp(), 0);
   Value *EVL = State.get(getEVL(), VPIteration(0, 0));

   VectorBuilder VBuilder(Builder);
   VBuilder.setEVL(EVL);
   Value *Mask;
   // TODO: move the all-true mask generation into VectorBuilder.
   if (VPValue *CondOp = getCondOp())
     Mask = State.get(CondOp, 0);
   else
     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
   VBuilder.setMask(Mask);

   Value *NewRed;
   if (isOrdered()) {
     NewRed = createOrderedReduction(VBuilder, RdxDesc, VecOp, Prev);
   } else {
     NewRed = createSimpleTargetReduction(VBuilder, VecOp, RdxDesc);
     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
       NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev);
     else
       NewRed = Builder.CreateBinOp(
           (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, Prev);
   }
   State.set(this, NewRed, 0, /*IsScalar*/ true);
 }

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
   O << Indent << "REDUCE ";
   printAsOperand(O, SlotTracker);
   O << " = ";
   getChainOp()->printAsOperand(O, SlotTracker);
   O << " +";
   if (isa<FPMathOperator>(getUnderlyingInstr()))
     O << getUnderlyingInstr()->getFastMathFlags();
   O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
   getVecOp()->printAsOperand(O, SlotTracker);
   if (isConditional()) {
     O << ", ";
     getCondOp()->printAsOperand(O, SlotTracker);
   }
   O << ")";
   if (RdxDesc.IntermediateStore)
     O << " (with final reduction value stored in invariant address sank "
          "outside of loop)";
 }

 void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
   const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor();
   O << Indent << "REDUCE ";
   printAsOperand(O, SlotTracker);
   O << " = ";
   getChainOp()->printAsOperand(O, SlotTracker);
   O << " +";
   if (isa<FPMathOperator>(getUnderlyingInstr()))
     O << getUnderlyingInstr()->getFastMathFlags();
   O << " vp.reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
   getVecOp()->printAsOperand(O, SlotTracker);
   O << ", ";
   getEVL()->printAsOperand(O, SlotTracker);
   if (isConditional()) {
     O << ", ";
     getCondOp()->printAsOperand(O, SlotTracker);
   }
   O << ")";
   if (RdxDesc.IntermediateStore)
     O << " (with final reduction value stored in invariant address sank "
          "outside of loop)";
 }
 #endif

 bool VPReplicateRecipe::shouldPack() const {
   // Find if the recipe is used by a widened recipe via an intervening
   // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
   return any_of(users(), [](const VPUser *U) {
     if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
       return any_of(PredR->users(), [PredR](const VPUser *U) {
         return !U->usesScalars(PredR);
       });
     return false;
   });
 }

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
   O << Indent << (IsUniform ? "CLONE " : "REPLICATE ");

   if (!getUnderlyingInstr()->getType()->isVoidTy()) {
     printAsOperand(O, SlotTracker);
     O << " = ";
   }
   if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
     O << "call";
     printFlags(O);
     O << "@" << CB->getCalledFunction()->getName() << "(";
     interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
                     O, [&O, &SlotTracker](VPValue *Op) {
                       Op->printAsOperand(O, SlotTracker);
                     });
     O << ")";
   } else {
     O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
     printFlags(O);
     printOperands(O, SlotTracker);
   }

   if (shouldPack())
     O << " (S->V)";
 }
 #endif

 /// Checks if \p C is uniform across all VFs and UFs. It is considered as such
 /// if it is either defined outside the vector region or its operand is known to
 /// be uniform across all VFs and UFs (e.g. VPDerivedIV or VPCanonicalIVPHI).
 /// TODO: Uniformity should be associated with a VPValue and there should be a
 /// generic way to check.
 static bool isUniformAcrossVFsAndUFs(VPScalarCastRecipe *C) {
   return C->isDefinedOutsideVectorRegions() ||
          isa<VPDerivedIVRecipe>(C->getOperand(0)) ||
          isa<VPCanonicalIVPHIRecipe>(C->getOperand(0));
 }

 Value *VPScalarCastRecipe ::generate(VPTransformState &State, unsigned Part) {
   assert(vputils::onlyFirstLaneUsed(this) &&
          "Codegen only implemented for first lane.");
   switch (Opcode) {
   case Instruction::SExt:
   case Instruction::ZExt:
   case Instruction::Trunc: {
     // Note: SExt/ZExt not used yet.
     Value *Op = State.get(getOperand(0), VPIteration(Part, 0));
     return State.Builder.CreateCast(Instruction::CastOps(Opcode), Op, ResultTy);
   }
   default:
     llvm_unreachable("opcode not implemented yet");
   }
 }

 void VPScalarCastRecipe ::execute(VPTransformState &State) {
   bool IsUniformAcrossVFsAndUFs = isUniformAcrossVFsAndUFs(this);
   for (unsigned Part = 0; Part != State.UF; ++Part) {
     Value *Res;
     // Only generate a single instance, if the recipe is uniform across UFs and
     // VFs.
     if (Part > 0 && IsUniformAcrossVFsAndUFs)
       Res = State.get(this, VPIteration(0, 0));
     else
       Res = generate(State, Part);
     State.set(this, Res, VPIteration(Part, 0));
   }
 }

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPScalarCastRecipe ::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
   O << Indent << "SCALAR-CAST ";
   printAsOperand(O, SlotTracker);
   O << " = " << Instruction::getOpcodeName(Opcode) << " ";
   printOperands(O, SlotTracker);
   O << " to " << *ResultTy;
 }
 #endif

 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
   assert(State.Instance && "Branch on Mask works only on single instance.");

   unsigned Part = State.Instance->Part;
   unsigned Lane = State.Instance->Lane.getKnownLane();

   Value *ConditionBit = nullptr;
   VPValue *BlockInMask = getMask();
   if (BlockInMask) {
     ConditionBit = State.get(BlockInMask, Part);
     if (ConditionBit->getType()->isVectorTy())
       ConditionBit = State.Builder.CreateExtractElement(
           ConditionBit, State.Builder.getInt32(Lane));
   } else // Block in mask is all-one.
     ConditionBit = State.Builder.getTrue();

   // Replace the temporary unreachable terminator with a new conditional branch,
   // whose two destinations will be set later when they are created.
   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
   assert(isa<UnreachableInst>(CurrentTerminator) &&
          "Expected to replace unreachable terminator with conditional branch.");
   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
   CondBr->setSuccessor(0, nullptr);
   ReplaceInstWithInst(CurrentTerminator, CondBr);
 }

 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
   assert(State.Instance && "Predicated instruction PHI works per instance.");
   Instruction *ScalarPredInst =
       cast<Instruction>(State.get(getOperand(0), *State.Instance));
   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
   assert(PredicatingBB && "Predicated block has no single predecessor.");
   assert(isa<VPReplicateRecipe>(getOperand(0)) &&
          "operand must be VPReplicateRecipe");

   // By current pack/unpack logic we need to generate only a single phi node: if
   // a vector value for the predicated instruction exists at this point it means
   // the instruction has vector users only, and a phi for the vector value is
   // needed. In this case the recipe of the predicated instruction is marked to
   // also do that packing, thereby "hoisting" the insert-element sequence.
   // Otherwise, a phi node for the scalar value is needed.
   unsigned Part = State.Instance->Part;
   if (State.hasVectorValue(getOperand(0), Part)) {
     Value *VectorValue = State.get(getOperand(0), Part);
     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
     if (State.hasVectorValue(this, Part))
       State.reset(this, VPhi, Part);
     else
       State.set(this, VPhi, Part);
     // NOTE: Currently we need to update the value of the operand, so the next
     // predicated iteration inserts its generated value in the correct vector.
     State.reset(getOperand(0), VPhi, Part);
   } else {
     Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
     Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
                      PredicatingBB);
     Phi->addIncoming(ScalarPredInst, PredicatedBB);
     if (State.hasScalarValue(this, *State.Instance))
       State.reset(this, Phi, *State.Instance);
     else
       State.set(this, Phi, *State.Instance);
     // NOTE: Currently we need to update the value of the operand, so the next
     // predicated iteration inserts its generated value in the correct vector.
     State.reset(getOperand(0), Phi, *State.Instance);
   }
 }

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
   O << Indent << "PHI-PREDICATED-INSTRUCTION ";
   printAsOperand(O, SlotTracker);
   O << " = ";
   printOperands(O, SlotTracker);
 }
 #endif

 void VPWidenLoadRecipe::execute(VPTransformState &State) {
   auto *LI = cast<LoadInst>(&Ingredient);

   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
   const Align Alignment = getLoadStoreAlignment(&Ingredient);
   bool CreateGather = !isConsecutive();

   auto &Builder = State.Builder;
   State.setDebugLocFrom(getDebugLoc());
   for (unsigned Part = 0; Part < State.UF; ++Part) {
     Value *NewLI;
     Value *Mask = nullptr;
     if (auto *VPMask = getMask()) {
       // Mask reversal is only needed for non-all-one (null) masks, as reverse
       // of a null all-one mask is a null mask.
       Mask = State.get(VPMask, Part);
       if (isReverse())
         Mask = Builder.CreateVectorReverse(Mask, "reverse");
     }

     Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateGather);
     if (CreateGather) {
       NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
                                          "wide.masked.gather");
     } else if (Mask) {
       NewLI = Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
                                        PoisonValue::get(DataTy),
                                        "wide.masked.load");
     } else {
       NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
     }
     // Add metadata to the load, but setVectorValue to the reverse shuffle.
     State.addMetadata(NewLI, LI);
     if (Reverse)
       NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
     State.set(this, NewLI, Part);
   }
 }

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
   O << Indent << "WIDEN ";
   printAsOperand(O, SlotTracker);
   O << " = load ";
   printOperands(O, SlotTracker);
 }

 void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
   O << Indent << "WIDEN ";
   printAsOperand(O, SlotTracker);
   O << " = vp.load ";
   printOperands(O, SlotTracker);
 }
 #endif

 void VPWidenStoreRecipe::execute(VPTransformState &State) {
   auto *SI = cast<StoreInst>(&Ingredient);

   VPValue *StoredVPValue = getStoredValue();
   bool CreateScatter = !isConsecutive();
   const Align Alignment = getLoadStoreAlignment(&Ingredient);

   auto &Builder = State.Builder;
   State.setDebugLocFrom(getDebugLoc());

   for (unsigned Part = 0; Part < State.UF; ++Part) {
     Instruction *NewSI = nullptr;
     Value *Mask = nullptr;
     if (auto *VPMask = getMask()) {
       // Mask reversal is only needed for non-all-one (null) masks, as reverse
       // of a null all-one mask is a null mask.
       Mask = State.get(VPMask, Part);
       if (isReverse())
         Mask = Builder.CreateVectorReverse(Mask, "reverse");
     }

     Value *StoredVal = State.get(StoredVPValue, Part);
     if (isReverse()) {
       // If we store to reverse consecutive memory locations, then we need
       // to reverse the order of elements in the stored value.
       StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
       // We don't want to update the value in the map as it might be used in
       // another expression. So don't call resetVectorValue(StoredVal).
     }
     Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateScatter);
     if (CreateScatter)
       NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
     else if (Mask)
       NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
     else
       NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
     State.addMetadata(NewSI, SI);
   }
 }

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
   O << Indent << "WIDEN store ";
   printOperands(O, SlotTracker);
 }

 void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
   O << Indent << "WIDEN vp.store ";
   printOperands(O, SlotTracker);
 }
 #endif

 static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V,
                                      VectorType *DstVTy, const DataLayout &DL) {
   // Verify that V is a vector type with same number of elements as DstVTy.
   auto VF = DstVTy->getElementCount();
   auto *SrcVecTy = cast<VectorType>(V->getType());
   assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
   Type *SrcElemTy = SrcVecTy->getElementType();
   Type *DstElemTy = DstVTy->getElementType();
   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
          "Vector elements must have same size");

   // Do a direct cast if element types are castable.
   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
     return Builder.CreateBitOrPointerCast(V, DstVTy);
   }
   // V cannot be directly casted to desired vector type.
   // May happen when V is a floating point vector but DstVTy is a vector of
   // pointers or vice-versa. Handle this using a two-step bitcast using an
   // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
          "Only one type should be a pointer type");
   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
          "Only one type should be a floating point type");
   Type *IntTy =
       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
   auto *VecIntTy = VectorType::get(IntTy, VF);
   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
 }

 /// Return a vector containing interleaved elements from multiple
 /// smaller input vectors.
 static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
                                 const Twine &Name) {
   unsigned Factor = Vals.size();
   assert(Factor > 1 && "Tried to interleave invalid number of vectors");

   VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
 #ifndef NDEBUG
   for (Value *Val : Vals)
     assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
 #endif

   // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
   // must use intrinsics to interleave.
   if (VecTy->isScalableTy()) {
     VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
     return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2,
                                    Vals,
                                    /*FMFSource=*/nullptr, Name);
   }

   // Fixed length. Start by concatenating all vectors into a wide vector.
   Value *WideVec = concatenateVectors(Builder, Vals);

   // Interleave the elements into the wide vector.
   const unsigned NumElts = VecTy->getElementCount().getFixedValue();
   return Builder.CreateShuffleVector(
       WideVec, createInterleaveMask(NumElts, Factor), Name);
 }

 // Try to vectorize the interleave group that \p Instr belongs to.
 //
 // E.g. Translate following interleaved load group (factor = 3):
 //   for (i = 0; i < N; i+=3) {
 //     R = Pic[i];             // Member of index 0
 //     G = Pic[i+1];           // Member of index 1
 //     B = Pic[i+2];           // Member of index 2
 //     ... // do something to R, G, B
 //   }
 // To:
 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
 //
 // Or translate following interleaved store group (factor = 3):
 //   for (i = 0; i < N; i+=3) {
 //     ... do something to R, G, B
 //     Pic[i]   = R;           // Member of index 0
 //     Pic[i+1] = G;           // Member of index 1
 //     Pic[i+2] = B;           // Member of index 2
 //   }
 // To:
 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
 void VPInterleaveRecipe::execute(VPTransformState &State) {
   assert(!State.Instance && "Interleave group being replicated.");
   const InterleaveGroup<Instruction> *Group = IG;
   Instruction *Instr = Group->getInsertPos();

   // Prepare for the vector type of the interleaved load/store.
   Type *ScalarTy = getLoadStoreType(Instr);
   unsigned InterleaveFactor = Group->getFactor();
   auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);

   // Prepare for the new pointers.
   SmallVector<Value *, 2> AddrParts;
   unsigned Index = Group->getIndex(Instr);

   // TODO: extend the masked interleaved-group support to reversed access.
   VPValue *BlockInMask = getMask();
   assert((!BlockInMask || !Group->isReverse()) &&
          "Reversed masked interleave-group not supported.");

   Value *Idx;
   // If the group is reverse, adjust the index to refer to the last vector lane
   // instead of the first. We adjust the index from the first vector lane,
   // rather than directly getting the pointer for lane VF - 1, because the
   // pointer operand of the interleaved access is supposed to be uniform. For
   // uniform instructions, we're only required to generate a value for the
   // first vector lane in each unroll iteration.
   if (Group->isReverse()) {
     Value *RuntimeVF =
         getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF);
     Idx = State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1));
     Idx = State.Builder.CreateMul(Idx,
                                   State.Builder.getInt32(Group->getFactor()));
     Idx = State.Builder.CreateAdd(Idx, State.Builder.getInt32(Index));
     Idx = State.Builder.CreateNeg(Idx);
   } else
     Idx = State.Builder.getInt32(-Index);

   VPValue *Addr = getAddr();
   for (unsigned Part = 0; Part < State.UF; Part++) {
     Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
     if (auto *I = dyn_cast<Instruction>(AddrPart))
       State.setDebugLocFrom(I->getDebugLoc());

     // Notice current instruction could be any index. Need to adjust the address
     // to the member of index 0.
     //
     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
     //       b = A[i];       // Member of index 0
     // Current pointer is pointed to A[i+1], adjust it to A[i].
     //
     // E.g.  A[i+1] = a;     // Member of index 1
     //       A[i]   = b;     // Member of index 0
     //       A[i+2] = c;     // Member of index 2 (Current instruction)
     // Current pointer is pointed to A[i+2], adjust it to A[i].

     bool InBounds = false;
     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
       InBounds = gep->isInBounds();
     AddrPart = State.Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds);
     AddrParts.push_back(AddrPart);
   }

   State.setDebugLocFrom(Instr->getDebugLoc());
   Value *PoisonVec = PoisonValue::get(VecTy);

   auto CreateGroupMask = [&BlockInMask, &State, &InterleaveFactor](
                              unsigned Part, Value *MaskForGaps) -> Value * {
     if (State.VF.isScalable()) {
       assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
       assert(InterleaveFactor == 2 &&
              "Unsupported deinterleave factor for scalable vectors");
       auto *BlockInMaskPart = State.get(BlockInMask, Part);
       SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
       auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(),
                                      State.VF.getKnownMinValue() * 2, true);
       return State.Builder.CreateIntrinsic(
           MaskTy, Intrinsic::vector_interleave2, Ops,
           /*FMFSource=*/nullptr, "interleaved.mask");
     }

     if (!BlockInMask)
       return MaskForGaps;

     Value *BlockInMaskPart = State.get(BlockInMask, Part);
     Value *ShuffledMask = State.Builder.CreateShuffleVector(
         BlockInMaskPart,
         createReplicatedMask(InterleaveFactor, State.VF.getKnownMinValue()),
         "interleaved.mask");
     return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
                                                    ShuffledMask, MaskForGaps)
                        : ShuffledMask;
   };

   const DataLayout &DL = Instr->getDataLayout();
   // Vectorize the interleaved load group.
   if (isa<LoadInst>(Instr)) {
     Value *MaskForGaps = nullptr;
     if (NeedsMaskForGaps) {
       MaskForGaps = createBitMaskForGaps(State.Builder,
                                          State.VF.getKnownMinValue(), *Group);
       assert(MaskForGaps && "Mask for Gaps is required but it is null");
     }

     // For each unroll part, create a wide load for the group.
     SmallVector<Value *, 2> NewLoads;
     for (unsigned Part = 0; Part < State.UF; Part++) {
       Instruction *NewLoad;
       if (BlockInMask || MaskForGaps) {
         Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
         NewLoad = State.Builder.CreateMaskedLoad(VecTy, AddrParts[Part],
                                                  Group->getAlign(), GroupMask,
                                                  PoisonVec, "wide.masked.vec");
       } else
         NewLoad = State.Builder.CreateAlignedLoad(
             VecTy, AddrParts[Part], Group->getAlign(), "wide.vec");
       Group->addMetadata(NewLoad);
       NewLoads.push_back(NewLoad);
     }

     ArrayRef<VPValue *> VPDefs = definedValues();
     const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
     if (VecTy->isScalableTy()) {
       assert(InterleaveFactor == 2 &&
              "Unsupported deinterleave factor for scalable vectors");

       for (unsigned Part = 0; Part < State.UF; ++Part) {
         // Scalable vectors cannot use arbitrary shufflevectors (only splats),
         // so must use intrinsics to deinterleave.
         Value *DI = State.Builder.CreateIntrinsic(
             Intrinsic::vector_deinterleave2, VecTy, NewLoads[Part],
             /*FMFSource=*/nullptr, "strided.vec");
         unsigned J = 0;
         for (unsigned I = 0; I < InterleaveFactor; ++I) {
           Instruction *Member = Group->getMember(I);

           if (!Member)
             continue;

           Value *StridedVec = State.Builder.CreateExtractValue(DI, I);
           // If this member has different type, cast the result type.
           if (Member->getType() != ScalarTy) {
             VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
             StridedVec =
                 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
           }

           if (Group->isReverse())
             StridedVec =
                 State.Builder.CreateVectorReverse(StridedVec, "reverse");

           State.set(VPDefs[J], StridedVec, Part);
           ++J;
         }
       }

       return;
     }

     // For each member in the group, shuffle out the appropriate data from the
     // wide loads.
     unsigned J = 0;
     for (unsigned I = 0; I < InterleaveFactor; ++I) {
       Instruction *Member = Group->getMember(I);

       // Skip the gaps in the group.
       if (!Member)
         continue;

       auto StrideMask =
           createStrideMask(I, InterleaveFactor, State.VF.getKnownMinValue());
       for (unsigned Part = 0; Part < State.UF; Part++) {
         Value *StridedVec = State.Builder.CreateShuffleVector(
             NewLoads[Part], StrideMask, "strided.vec");

         // If this member has different type, cast the result type.
         if (Member->getType() != ScalarTy) {
           assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
           VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
           StridedVec =
               createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
         }

         if (Group->isReverse())
           StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");

         State.set(VPDefs[J], StridedVec, Part);
       }
       ++J;
     }
     return;
   }

   // The sub vector type for current instruction.
   auto *SubVT = VectorType::get(ScalarTy, State.VF);

   // Vectorize the interleaved store group.
   Value *MaskForGaps =
       createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
   assert((!MaskForGaps || !State.VF.isScalable()) &&
          "masking gaps for scalable vectors is not yet supported.");
   ArrayRef<VPValue *> StoredValues = getStoredValues();
   for (unsigned Part = 0; Part < State.UF; Part++) {
     // Collect the stored vector from each member.
     SmallVector<Value *, 4> StoredVecs;
     unsigned StoredIdx = 0;
     for (unsigned i = 0; i < InterleaveFactor; i++) {
       assert((Group->getMember(i) || MaskForGaps) &&
              "Fail to get a member from an interleaved store group");
       Instruction *Member = Group->getMember(i);

       // Skip the gaps in the group.
       if (!Member) {
         Value *Undef = PoisonValue::get(SubVT);
         StoredVecs.push_back(Undef);
         continue;
       }

       Value *StoredVec = State.get(StoredValues[StoredIdx], Part);
       ++StoredIdx;

       if (Group->isReverse())
         StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse");

       // If this member has different type, cast it to a unified type.

       if (StoredVec->getType() != SubVT)
         StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);

       StoredVecs.push_back(StoredVec);
     }

     // Interleave all the smaller vectors into one wider vector.
     Value *IVec =
         interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
     Instruction *NewStoreInstr;
     if (BlockInMask || MaskForGaps) {
       Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
       NewStoreInstr = State.Builder.CreateMaskedStore(
           IVec, AddrParts[Part], Group->getAlign(), GroupMask);
     } else
       NewStoreInstr = State.Builder.CreateAlignedStore(IVec, AddrParts[Part],
                                                        Group->getAlign());

     Group->addMetadata(NewStoreInstr);
   }
 }

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
   O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
   IG->getInsertPos()->printAsOperand(O, false);
   O << ", ";
   getAddr()->printAsOperand(O, SlotTracker);
   VPValue *Mask = getMask();
   if (Mask) {
     O << ", ";
     Mask->printAsOperand(O, SlotTracker);
   }

   unsigned OpIdx = 0;
   for (unsigned i = 0; i < IG->getFactor(); ++i) {
     if (!IG->getMember(i))
       continue;
     if (getNumStoreOperands() > 0) {
       O << "\n" << Indent << "  store ";
       getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
       O << " to index " << i;
     } else {
       O << "\n" << Indent << "  ";
       getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
       O << " = load from index " << i;
     }
     ++OpIdx;
   }
 }
 #endif

 void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
   Value *Start = getStartValue()->getLiveInIRValue();
   PHINode *EntryPart = PHINode::Create(Start->getType(), 2, "index");
   EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());

   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
   EntryPart->addIncoming(Start, VectorPH);
   EntryPart->setDebugLoc(getDebugLoc());
   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
     State.set(this, EntryPart, Part, /*IsScalar*/ true);
 }

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                    VPSlotTracker &SlotTracker) const {
   O << Indent << "EMIT ";
   printAsOperand(O, SlotTracker);
   O << " = CANONICAL-INDUCTION ";
   printOperands(O, SlotTracker);
 }
 #endif

 bool VPCanonicalIVPHIRecipe::isCanonical(
     InductionDescriptor::InductionKind Kind, VPValue *Start,
     VPValue *Step) const {
   // Must be an integer induction.
   if (Kind != InductionDescriptor::IK_IntInduction)
     return false;
   // Start must match the start value of this canonical induction.
   if (Start != getStartValue())
     return false;

   // If the step is defined by a recipe, it is not a ConstantInt.
   if (Step->getDefiningRecipe())
     return false;

   ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
   return StepC && StepC->isOne();
 }

 bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
   return IsScalarAfterVectorization &&
          (!IsScalable || vputils::onlyFirstLaneUsed(this));
 }

 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
   assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
          "Not a pointer induction according to InductionDescriptor!");
   assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
          "Unexpected type.");
   assert(!onlyScalarsGenerated(State.VF.isScalable()) &&
          "Recipe should have been replaced");

   auto *IVR = getParent()->getPlan()->getCanonicalIV();
   PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0, /*IsScalar*/ true));
   Type *PhiType = IndDesc.getStep()->getType();

   // Build a pointer phi
   Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
   Type *ScStValueType = ScalarStartValue->getType();
   PHINode *NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi",
                                            CanonicalIV->getIterator());

   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
   NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);

   // A pointer induction, performed by using a gep
   BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();

   Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0));
   Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
   Value *NumUnrolledElems =
       State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
   Value *InductionGEP = GetElementPtrInst::Create(
       State.Builder.getInt8Ty(), NewPointerPhi,
       State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
       InductionLoc);
   // Add induction update using an incorrect block temporarily. The phi node
   // will be fixed after VPlan execution. Note that at this point the latch
   // block cannot be used, as it does not exist yet.
   // TODO: Model increment value in VPlan, by turning the recipe into a
   // multi-def and a subclass of VPHeaderPHIRecipe.
   NewPointerPhi->addIncoming(InductionGEP, VectorPH);

   // Create UF many actual address geps that use the pointer
   // phi as base and a vectorized version of the step value
   // (<step*0, ..., step*N>) as offset.
   for (unsigned Part = 0; Part < State.UF; ++Part) {
     Type *VecPhiType = VectorType::get(PhiType, State.VF);
     Value *StartOffsetScalar =
         State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
     Value *StartOffset =
         State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
     // Create a vector of consecutive numbers from zero to VF.
     StartOffset = State.Builder.CreateAdd(
         StartOffset, State.Builder.CreateStepVector(VecPhiType));

     assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) &&
            "scalar step must be the same across all parts");
     Value *GEP = State.Builder.CreateGEP(
         State.Builder.getInt8Ty(), NewPointerPhi,
         State.Builder.CreateMul(
             StartOffset,
             State.Builder.CreateVectorSplat(State.VF, ScalarStepValue),
             "vector.gep"));
     State.set(this, GEP, Part);
   }
 }

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                           VPSlotTracker &SlotTracker) const {
   O << Indent << "EMIT ";
   printAsOperand(O, SlotTracker);
   O << " = WIDEN-POINTER-INDUCTION ";
   getStartValue()->printAsOperand(O, SlotTracker);
   O << ", " << *IndDesc.getStep();
 }
 #endif

 void VPExpandSCEVRecipe::execute(VPTransformState &State) {
   assert(!State.Instance && "cannot be used in per-lane");
   const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
   SCEVExpander Exp(SE, DL, "induction");

   Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
                                  &*State.Builder.GetInsertPoint());
   assert(!State.ExpandedSCEVs.contains(Expr) &&
          "Same SCEV expanded multiple times");
   State.ExpandedSCEVs[Expr] = Res;
   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
     State.set(this, Res, {Part, 0});
 }

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
   O << Indent << "EMIT ";
   getVPSingleValue()->printAsOperand(O, SlotTracker);
   O << " = EXPAND SCEV " << *Expr;
 }
 #endif

 void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
   Value *CanonicalIV = State.get(getOperand(0), 0, /*IsScalar*/ true);
   Type *STy = CanonicalIV->getType();
   IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
   ElementCount VF = State.VF;
   Value *VStart = VF.isScalar()
                       ? CanonicalIV
                       : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
     Value *VStep = createStepForVF(Builder, STy, VF, Part);
     if (VF.isVector()) {
       VStep = Builder.CreateVectorSplat(VF, VStep);
       VStep =
           Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
     }
     Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
     State.set(this, CanonicalVectorIV, Part);
   }
 }

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
                                      VPSlotTracker &SlotTracker) const {
   O << Indent << "EMIT ";
   printAsOperand(O, SlotTracker);
   O << " = WIDEN-CANONICAL-INDUCTION ";
   printOperands(O, SlotTracker);
 }
 #endif

 void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
   auto &Builder = State.Builder;
   // Create a vector from the initial value.
   auto *VectorInit = getStartValue()->getLiveInIRValue();

   Type *VecTy = State.VF.isScalar()
                     ? VectorInit->getType()
                     : VectorType::get(VectorInit->getType(), State.VF);

   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
   if (State.VF.isVector()) {
     auto *IdxTy = Builder.getInt32Ty();
     auto *One = ConstantInt::get(IdxTy, 1);
     IRBuilder<>::InsertPointGuard Guard(Builder);
     Builder.SetInsertPoint(VectorPH->getTerminator());
     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
     VectorInit = Builder.CreateInsertElement(
         PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
   }

   // Create a phi node for the new recurrence.
   PHINode *EntryPart = PHINode::Create(VecTy, 2, "vector.recur");
   EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
   EntryPart->addIncoming(VectorInit, VectorPH);
   State.set(this, EntryPart, 0);
 }

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                             VPSlotTracker &SlotTracker) const {
   O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
   printAsOperand(O, SlotTracker);
   O << " = phi ";
   printOperands(O, SlotTracker);
 }
 #endif

 void VPReductionPHIRecipe::execute(VPTransformState &State) {
   auto &Builder = State.Builder;

   // Reductions do not have to start at zero. They can start with
   // any loop invariant values.
   VPValue *StartVPV = getStartValue();
   Value *StartV = StartVPV->getLiveInIRValue();

   // In order to support recurrences we need to be able to vectorize Phi nodes.
   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
   // this value when we vectorize all of the instructions that use the PHI.
   bool ScalarPHI = State.VF.isScalar() || IsInLoop;
   Type *VecTy = ScalarPHI ? StartV->getType()
                           : VectorType::get(StartV->getType(), State.VF);

   BasicBlock *HeaderBB = State.CFG.PrevBB;
   assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
          "recipe must be in the vector loop header");
   unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF;
   for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
     Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi");
     EntryPart->insertBefore(HeaderBB->getFirstInsertionPt());
     State.set(this, EntryPart, Part, IsInLoop);
   }

   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);

   Value *Iden = nullptr;
   RecurKind RK = RdxDesc.getRecurrenceKind();
   if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
       RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
     // MinMax and AnyOf reductions have the start value as their identity.
     if (ScalarPHI) {
       Iden = StartV;
     } else {
       IRBuilderBase::InsertPointGuard IPBuilder(Builder);
       Builder.SetInsertPoint(VectorPH->getTerminator());
       StartV = Iden =
           Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
     }
   } else {
     Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(),
                                          RdxDesc.getFastMathFlags());

     if (!ScalarPHI) {
       Iden = Builder.CreateVectorSplat(State.VF, Iden);
       IRBuilderBase::InsertPointGuard IPBuilder(Builder);
       Builder.SetInsertPoint(VectorPH->getTerminator());
       Constant *Zero = Builder.getInt32(0);
       StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
     }
   }

   for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
     Value *EntryPart = State.get(this, Part, IsInLoop);
     // Make sure to add the reduction start value only to the
     // first unroll part.
     Value *StartVal = (Part == 0) ? StartV : Iden;
     cast<PHINode>(EntryPart)->addIncoming(StartVal, VectorPH);
   }
 }

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
   O << Indent << "WIDEN-REDUCTION-PHI ";

   printAsOperand(O, SlotTracker);
   O << " = phi ";
   printOperands(O, SlotTracker);
 }
 #endif

 void VPWidenPHIRecipe::execute(VPTransformState &State) {
   assert(EnableVPlanNativePath &&
          "Non-native vplans are not expected to have VPWidenPHIRecipes.");

   Value *Op0 = State.get(getOperand(0), 0);
   Type *VecTy = Op0->getType();
   Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
   State.set(this, VecPhi, 0);
 }

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
   O << Indent << "WIDEN-PHI ";

   auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
   // Unless all incoming values are modeled in VPlan  print the original PHI
   // directly.
   // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming
   // values as VPValues.
   if (getNumOperands() != OriginalPhi->getNumOperands()) {
     O << VPlanIngredient(OriginalPhi);
     return;
   }

   printAsOperand(O, SlotTracker);
   O << " = phi ";
   printOperands(O, SlotTracker);
 }
 #endif

 // TODO: It would be good to use the existing VPWidenPHIRecipe instead and
 // remove VPActiveLaneMaskPHIRecipe.
 void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
     Value *StartMask = State.get(getOperand(0), Part);
     PHINode *EntryPart =
         State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
     EntryPart->addIncoming(StartMask, VectorPH);
     EntryPart->setDebugLoc(getDebugLoc());
     State.set(this, EntryPart, Part);
   }
 }

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                       VPSlotTracker &SlotTracker) const {
   O << Indent << "ACTIVE-LANE-MASK-PHI ";

   printAsOperand(O, SlotTracker);
   O << " = phi ";
   printOperands(O, SlotTracker);
 }
 #endif

 void VPEVLBasedIVPHIRecipe::execute(VPTransformState &State) {
   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
   assert(State.UF == 1 && "Expected unroll factor 1 for VP vectorization.");
   Value *Start = State.get(getOperand(0), VPIteration(0, 0));
   PHINode *EntryPart =
       State.Builder.CreatePHI(Start->getType(), 2, "evl.based.iv");
   EntryPart->addIncoming(Start, VectorPH);
   EntryPart->setDebugLoc(getDebugLoc());
   State.set(this, EntryPart, 0, /*IsScalar=*/true);
 }

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
   O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";

   printAsOperand(O, SlotTracker);
   O << " = phi ";
   printOperands(O, SlotTracker);
 }
 #endif