| commit 7e4b5c2e864ebd1c1a3a0203171143e311dd2a96 (HEAD) |
| Author: Peter Waller <peter.waller@arm.com> |
| Date: Mon May 16 20:59:17 2022 +0000 |
| |
| [LV] Improve register pressure estimate at high VFs |
| |
| commit 4f81e1af2d1de9d902709cbaff727ba198cd5410 |
| Author: Jingu Kang <jingu.kang@arm.com> |
| Date: Tue Apr 5 13:16:10 2022 +0100 |
| |
| [AArch64] Set maximum VF with shouldMaximizeVectorBandwidth |
| --- |
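Taken together, the two commits above make two related changes: getRegUsageForType() now returns a plain unsigned register count (the number of registers the legalized type occupies, via getNumRegisters) instead of an InstructionCost, and shouldMaximizeVectorBandwidth() is told which register kind the vectorization factor is being chosen for, so AArch64 can opt in for fixed-width NEON vectorization while declining for scalable SVE vectorization. Below is a minimal standalone sketch of the resulting policy; it is illustrative only, the enum and function names are stand-ins for the TTI hooks shown in the diff, and the numbers model a 128-bit NEON register.

    // Minimal standalone sketch; not LLVM code. The enum mirrors
    // TargetTransformInfo::RegisterKind, everything else is a stand-in.
    #include <cassert>

    enum class RegisterKind { Scalar, FixedWidthVector, ScalableVector };

    // AArch64-style answer: maximize bandwidth only for fixed-width (NEON)
    // vectorization, not for scalable (SVE) vectorization.
    bool shouldMaximizeVectorBandwidth(RegisterKind K) {
      assert(K != RegisterKind::Scalar);
      return K == RegisterKind::FixedWidthVector;
    }

    // When bandwidth is maximized, the maximum VF is derived from the
    // smallest element type in the loop instead of the widest one.
    unsigned maxVF(unsigned WidestRegBits, unsigned SmallestTypeBits,
                   unsigned WidestTypeBits, RegisterKind K) {
      unsigned ElemBits =
          shouldMaximizeVectorBandwidth(K) ? SmallestTypeBits : WidestTypeBits;
      return WidestRegBits / ElemBits;
    }

    int main() {
      // A 128-bit NEON register and a loop mixing i16 and i32 accesses:
      // 8 lanes instead of the 4 the widest type alone would give.
      assert(maxVF(128, 16, 32, RegisterKind::FixedWidthVector) == 8);
      // For scalable vectorization the hook declines, so the widest element
      // type still governs: 128 / 32 = 4 (in units of the minimum VL).
      assert(maxVF(128, 16, 32, RegisterKind::ScalableVector) == 4);
      return 0;
    }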
| diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h |
| index 7412e050322e..1179971ad13b 100644 |
| --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h |
| +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h |
| @@ -727,7 +727,7 @@ public: |
| bool isTypeLegal(Type *Ty) const; |
| |
| /// Returns the estimated number of registers required to represent \p Ty. |
| - InstructionCost getRegUsageForType(Type *Ty) const; |
| + unsigned getRegUsageForType(Type *Ty) const; |
| |
| /// Return true if switches should be turned into lookup tables for the |
| /// target. |
| @@ -934,7 +934,8 @@ public: |
| /// creating vectors that span multiple vector registers. |
| /// If false, the vectorization factor will be chosen based on the |
| /// size of the widest element type. |
| - bool shouldMaximizeVectorBandwidth() const; |
| + /// \p K Register Kind for vectorization. |
| + bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const; |
| |
| /// \return The minimum vectorization factor for types of given element |
| /// bit width, or 0 if there is no minimum VF. The returned value only |
| @@ -1571,7 +1572,7 @@ public: |
| virtual bool isProfitableToHoist(Instruction *I) = 0; |
| virtual bool useAA() = 0; |
| virtual bool isTypeLegal(Type *Ty) = 0; |
| - virtual InstructionCost getRegUsageForType(Type *Ty) = 0; |
| + virtual unsigned getRegUsageForType(Type *Ty) = 0; |
| virtual bool shouldBuildLookupTables() = 0; |
| virtual bool shouldBuildLookupTablesForConstant(Constant *C) = 0; |
| virtual bool shouldBuildRelLookupTables() = 0; |
| @@ -1618,7 +1619,8 @@ public: |
| virtual unsigned getMinVectorRegisterBitWidth() const = 0; |
| virtual Optional<unsigned> getMaxVScale() const = 0; |
| virtual Optional<unsigned> getVScaleForTuning() const = 0; |
| - virtual bool shouldMaximizeVectorBandwidth() const = 0; |
| + virtual bool |
| + shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const = 0; |
| virtual ElementCount getMinimumVF(unsigned ElemWidth, |
| bool IsScalable) const = 0; |
| virtual unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const = 0; |
| @@ -2001,7 +2003,7 @@ public: |
| } |
| bool useAA() override { return Impl.useAA(); } |
| bool isTypeLegal(Type *Ty) override { return Impl.isTypeLegal(Ty); } |
| - InstructionCost getRegUsageForType(Type *Ty) override { |
| + unsigned getRegUsageForType(Type *Ty) override { |
| return Impl.getRegUsageForType(Ty); |
| } |
| bool shouldBuildLookupTables() override { |
| @@ -2108,8 +2110,9 @@ public: |
| Optional<unsigned> getVScaleForTuning() const override { |
| return Impl.getVScaleForTuning(); |
| } |
| - bool shouldMaximizeVectorBandwidth() const override { |
| - return Impl.shouldMaximizeVectorBandwidth(); |
| + bool shouldMaximizeVectorBandwidth( |
| + TargetTransformInfo::RegisterKind K) const override { |
| + return Impl.shouldMaximizeVectorBandwidth(K); |
| } |
| ElementCount getMinimumVF(unsigned ElemWidth, |
| bool IsScalable) const override { |
| diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h |
| index a32744f8d58b..28ce1690202d 100644 |
| --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h |
| +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h |
| @@ -310,7 +310,7 @@ public: |
| |
| bool isTypeLegal(Type *Ty) const { return false; } |
| |
| - InstructionCost getRegUsageForType(Type *Ty) const { return 1; } |
| + unsigned getRegUsageForType(Type *Ty) const { return 1; } |
| |
| bool shouldBuildLookupTables() const { return true; } |
| |
| @@ -415,7 +415,10 @@ public: |
| Optional<unsigned> getMaxVScale() const { return None; } |
| Optional<unsigned> getVScaleForTuning() const { return None; } |
| |
| - bool shouldMaximizeVectorBandwidth() const { return false; } |
| + bool |
| + shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const { |
| + return false; |
| + } |
| |
| ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const { |
| return ElementCount::get(0, IsScalable); |
| diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h |
| index 0b2737628923..39c8eaf6206b 100644 |
| --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h |
| +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h |
| @@ -362,10 +362,9 @@ public: |
| return getTLI()->isTypeLegal(VT); |
| } |
| |
| - InstructionCost getRegUsageForType(Type *Ty) { |
| - InstructionCost Val = getTLI()->getTypeLegalizationCost(DL, Ty).first; |
| - assert(Val >= 0 && "Negative cost!"); |
| - return Val; |
| + unsigned getRegUsageForType(Type *Ty) { |
| + EVT ETy = getTLI()->getValueType(DL, Ty); |
| + return getTLI()->getNumRegisters(Ty->getContext(), ETy); |
| } |
| |
| InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, |
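The base implementation above now asks the target how many registers the legalized type occupies (TargetLowering::getNumRegisters) rather than reusing the type-legalization cost. The following is a rough standalone model of that question, with numbers chosen to line up with the i1 register-usage tests added later in this patch; the real breakdown rules are considerably more involved, so treat this as an assumption-laden sketch rather than the actual algorithm.

    // Rough, hypothetical model of "how many registers does this type take?"
    // for a NEON-like target with 128-bit vector registers and no legal
    // i1 vectors. Not the real getNumRegisters logic.
    #include <cassert>

    struct SimpleVT {
      unsigned NumElts;
      unsigned EltBits;
    };

    unsigned numRegistersFor(SimpleVT VT, unsigned VectorRegBits = 128) {
      unsigned TotalBits = VT.NumElts * VT.EltBits;
      if (VT.EltBits >= 8)
        return TotalBits <= VectorRegBits ? 1 : TotalBits / VectorRegBits;
      // Assumption: with no legal i1 vector type, the value decomposes to
      // one register per element, which is what makes <VF x i1> expensive.
      return VT.NumElts;
    }

    int main() {
      assert(numRegistersFor({32, 32}) == 8);  // <32 x i32> -> eight 128-bit regs
      assert(numRegistersFor({32, 1}) == 32);  // illegal <32 x i1> -> 32 registers
      return 0;
    }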
| diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp |
| index 25e9dee98e13..7ec752990620 100644 |
| --- a/llvm/lib/Analysis/TargetTransformInfo.cpp |
| +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp |
| @@ -470,7 +470,7 @@ bool TargetTransformInfo::isTypeLegal(Type *Ty) const { |
| return TTIImpl->isTypeLegal(Ty); |
| } |
| |
| -InstructionCost TargetTransformInfo::getRegUsageForType(Type *Ty) const { |
| +unsigned TargetTransformInfo::getRegUsageForType(Type *Ty) const { |
| return TTIImpl->getRegUsageForType(Ty); |
| } |
| |
| @@ -623,8 +623,9 @@ Optional<unsigned> TargetTransformInfo::getVScaleForTuning() const { |
| return TTIImpl->getVScaleForTuning(); |
| } |
| |
| -bool TargetTransformInfo::shouldMaximizeVectorBandwidth() const { |
| - return TTIImpl->shouldMaximizeVectorBandwidth(); |
| +bool TargetTransformInfo::shouldMaximizeVectorBandwidth( |
| + TargetTransformInfo::RegisterKind K) const { |
| + return TTIImpl->shouldMaximizeVectorBandwidth(K); |
| } |
| |
| ElementCount TargetTransformInfo::getMinimumVF(unsigned ElemWidth, |
| diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp |
| index b2ffdf949d8b..c245b29b6d8a 100644 |
| --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp |
| +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp |
| @@ -50,6 +50,12 @@ bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, |
| return (CallerBits & CalleeBits) == CalleeBits; |
| } |
| |
| +bool AArch64TTIImpl::shouldMaximizeVectorBandwidth( |
| + TargetTransformInfo::RegisterKind K) const { |
| + assert(K != TargetTransformInfo::RGK_Scalar); |
| + return K == TargetTransformInfo::RGK_FixedWidthVector; |
| +} |
| + |
| /// Calculate the cost of materializing a 64-bit value. This helper |
| /// method might only calculate a fraction of a larger immediate. Therefore it |
| /// is valid to return a cost of ZERO. |
| diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h |
| index a6029b9f2445..b7b11d196f1c 100644 |
| --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h |
| +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h |
| @@ -135,6 +135,8 @@ public: |
| return ST->getVScaleForTuning(); |
| } |
| |
| + bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const; |
| + |
| /// Try to return an estimate cost factor that can be used as a multiplier |
| /// when scalarizing an operation for a vector with ElementCount \p VF. |
| /// For scalable vectors this currently takes the most pessimistic view based |
| diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h |
| index 9e637dfc3e16..7bc7bbf10614 100644 |
| --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h |
| +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h |
| @@ -86,12 +86,11 @@ public: |
| unsigned getMinVectorRegisterBitWidth() const; |
| ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const; |
| |
| - bool shouldMaximizeVectorBandwidth() const { |
| + bool |
| + shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const { |
| return true; |
| } |
| - bool supportsEfficientVectorElementLoadStore() { |
| - return false; |
| - } |
| + bool supportsEfficientVectorElementLoadStore() { return false; } |
| bool hasBranchDivergence() { |
| return false; |
| } |
| diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp |
| index 99e6774a02e4..26ac8d872800 100644 |
| --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp |
| +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp |
| @@ -276,7 +276,7 @@ void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, |
| BaseT::getPeelingPreferences(L, SE, PP); |
| } |
| |
| -InstructionCost RISCVTTIImpl::getRegUsageForType(Type *Ty) { |
| +unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) { |
| TypeSize Size = Ty->getPrimitiveSizeInBits(); |
| if (Ty->isVectorTy()) { |
| if (Size.isScalable() && ST->hasVInstructions()) |
| diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h |
| index e79c4f75712b..959a1433e689 100644 |
| --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h |
| +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h |
| @@ -60,7 +60,7 @@ public: |
| |
| TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const; |
| |
| - InstructionCost getRegUsageForType(Type *Ty); |
| + unsigned getRegUsageForType(Type *Ty); |
| |
| void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, |
| TTI::UnrollingPreferences &UP, |
| diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp |
| index 46ff0994e04e..c41726b11aca 100644 |
| --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp |
| +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp |
| @@ -5560,9 +5560,12 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( |
| return ElementCount::getFixed(ClampedConstTripCount); |
| } |
| |
| + TargetTransformInfo::RegisterKind RegKind = |
| + ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector |
| + : TargetTransformInfo::RGK_FixedWidthVector; |
| ElementCount MaxVF = MaxVectorElementCount; |
| - if (TTI.shouldMaximizeVectorBandwidth() || |
| - (MaximizeBandwidth && isScalarEpilogueAllowed())) { |
| + if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 && |
| + TTI.shouldMaximizeVectorBandwidth(RegKind))) { |
| auto MaxVectorElementCountMaxBW = ElementCount::get( |
| PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), |
| ComputeScalableMaxVF); |
| @@ -6319,16 +6322,10 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { |
| |
| LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); |
| |
| - // A lambda that gets the register usage for the given type and VF. |
| - const auto &TTICapture = TTI; |
| - auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { |
| + auto GetRegUsage = [&TTI = TTI](Type *Ty, ElementCount VF) -> unsigned { |
| if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) |
| return 0; |
| - InstructionCost::CostType RegUsage = |
| - *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue(); |
| - assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() && |
| - "Nonsensical values for register usage."); |
| - return RegUsage; |
| + return TTI.getRegUsageForType(VectorType::get(Ty, VF)); |
| }; |
| |
| for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { |
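Two things change in the vectorizer itself: the register kind passed to the hook is derived from whether a scalable maximum VF is being computed, and the existing MaximizeBandwidth cl::opt (-vectorizer-maximize-bandwidth) now takes precedence, with the target hook consulted only when the option was not given on the command line. A small standalone sketch of that precedence logic follows; Opt stands in for the real cl::opt and TTIShouldMaximize for TTI.shouldMaximizeVectorBandwidth(RegKind).

    // Standalone sketch of the option/hook precedence (illustrative only).
    #include <cassert>

    struct Opt {
      bool Value;
      unsigned NumOccurrences; // times the option appeared on the command line
    };

    bool maximizeBandwidth(Opt MaximizeBandwidth, bool TTIShouldMaximize) {
      // An explicit option wins either way; the target hook is consulted only
      // when the option was left at its default.
      return MaximizeBandwidth.Value ||
             (MaximizeBandwidth.NumOccurrences == 0 && TTIShouldMaximize);
    }

    int main() {
      assert(maximizeBandwidth({false, 0}, true));   // option unset: target decides
      assert(!maximizeBandwidth({false, 1}, true));  // =false overrides the target
      assert(maximizeBandwidth({true, 1}, false));   // =true overrides the target
      return 0;
    }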
| diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll |
| index 371d209bafff..a1ca0fea7972 100644 |
| --- a/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll |
| +++ b/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll |
| @@ -4,11 +4,12 @@ |
| ; are not profitable. |
| |
| ; Test with a loop that contains memory accesses of i8 and i32 types. The |
| -; default maximum VF for NEON is 4. And while we don't have an instruction to |
| -; load 4 x i8, vectorization might still be profitable. |
| +; maximum VF for NEON is calculated as 128 / (size of the smallest type in |
| +; the loop). While we don't have an instruction to load 4 x i8, |
| +; vectorization might still be profitable. |
| define void @test_load_i8_store_i32(i8* noalias %src, i32* noalias %dst, i32 %off, i64 %N) { |
| ; CHECK-LABEL: @test_load_i8_store_i32( |
| -; CHECK: <4 x i8> |
| +; CHECK: <16 x i8> |
| ; |
| entry: |
| br label %loop |
| @@ -32,7 +33,7 @@ exit: |
| ; Same as test_load_i8_store_i32, but with types flipped for load and store. |
| define void @test_load_i32_store_i8(i32* noalias %src, i8* noalias %dst, i32 %off, i64 %N) { |
| ; CHECK-LABEL: @test_load_i32_store_i8( |
| -; CHECK: <4 x i8> |
| +; CHECK: <16 x i8> |
| ; |
| entry: |
| br label %loop |
| @@ -84,7 +85,7 @@ exit: |
| ; vectorization factor. |
| define void @test_load_i8_store_i64_large(i8* noalias %src, i64* noalias %dst, i64* noalias %dst.2, i64* noalias %dst.3, i64* noalias %dst.4, i64* noalias %dst.5, i64%off, i64 %off.2, i64 %N) { |
| ; CHECK-LABEL: @test_load_i8_store_i64_large |
| -; CHECK: <2 x i64> |
| +; CHECK: <8 x i64> |
| ; |
| entry: |
| br label %loop |
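The updated comment near the top of this test says the maximum VF for NEON is 128 divided by the size of the smallest type in the loop. Concretely, for the two i8/i32 loops whose CHECK lines changed from <4 x i8> to <16 x i8>, that works out as follows (a worked sketch under the 128-bit NEON assumption, not LLVM code):

    #include <cassert>

    int main() {
      constexpr unsigned RegBits = 128;
      constexpr unsigned SmallestTypeBits = 8;       // the i8 accesses
      constexpr unsigned MaxVF = RegBits / SmallestTypeBits;
      assert(MaxVF == 16);           // hence the <16 x i8> CHECK lines above
      // The i32 half of each loop is widened to the same lane count,
      // <16 x i32>, i.e. 16 * 32 = 512 bits over four 128-bit registers.
      assert(MaxVF * 32 / RegBits == 4);
      return 0;
    }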
| diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/i1-reg-usage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/i1-reg-usage.ll |
| new file mode 100644 |
| index 000000000000..f0dc8e502769 |
| --- /dev/null |
| +++ b/llvm/test/Transforms/LoopVectorize/AArch64/i1-reg-usage.ll |
| @@ -0,0 +1,57 @@ |
| +; RUN: opt -loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 < %s | FileCheck %s |
| +; REQUIRES: asserts |
| + |
| +target triple = "aarch64" |
| + |
| +; Test that shows how many registers the loop vectorizer thinks an illegal <VF x i1> will consume. |
| + |
| +; CHECK-LABEL: LV: Checking a loop in 'or_reduction_neon' from <stdin> |
| +; CHECK: LV(REG): VF = 32 |
| +; CHECK-NEXT: LV(REG): Found max usage: 2 item |
| +; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 72 registers |
| +; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers |
| + |
| +define i1 @or_reduction_neon(i32 %arg, ptr %ptr) { |
| +entry: |
| + br label %loop |
| +exit: |
| + ret i1 %reduction_next |
| +loop: |
| + %induction = phi i32 [ 0, %entry ], [ %induction_next, %loop ] |
| + %reduction = phi i1 [ 0, %entry ], [ %reduction_next, %loop ] |
| + %gep = getelementptr inbounds i32, ptr %ptr, i32 %induction |
| + %loaded = load i32, ptr %gep |
| + %i1 = icmp eq i32 %loaded, %induction |
| + %reduction_next = or i1 %i1, %reduction |
| + %induction_next = add nuw i32 %induction, 1 |
| + %cond = icmp eq i32 %induction_next, %arg |
| + br i1 %cond, label %exit, label %loop, !llvm.loop !32 |
| +} |
| + |
| +; CHECK-LABEL: LV: Checking a loop in 'or_reduction_sve' |
| +; CHECK: LV(REG): VF = 64 |
| +; CHECK-NEXT: LV(REG): Found max usage: 2 item |
| +; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 136 registers |
| +; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers |
| + |
| +define i1 @or_reduction_sve(i32 %arg, ptr %ptr) vscale_range(2,2) "target-features"="+sve" { |
| +entry: |
| + br label %loop |
| +exit: |
| + ret i1 %reduction_next |
| +loop: |
| + %induction = phi i32 [ 0, %entry ], [ %induction_next, %loop ] |
| + %reduction = phi i1 [ true, %entry ], [ %reduction_next, %loop ] |
| + %gep = getelementptr inbounds i32, ptr %ptr, i32 %induction |
| + %loaded = load i32, ptr %gep |
| + %i1 = icmp eq i32 %loaded, %induction |
| + %reduction_next = or i1 %i1, %reduction |
| + %induction_next = add nuw i32 %induction, 1 |
| + %cond = icmp eq i32 %induction_next, %arg |
| + br i1 %cond, label %exit, label %loop, !llvm.loop !64 |
| +} |
| + |
| +!32 = distinct !{!32, !33} |
| +!33 = !{!"llvm.loop.vectorize.width", i32 32} |
| +!64 = distinct !{!64, !65} |
| +!65 = !{!"llvm.loop.vectorize.width", i32 64} |
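One plausible reading of the CHECK'd counts, given the getNumRegisters-based estimate introduced above: the breakdown below is an inference from the register sizes, not something the test states, and the single ScalarRC register would be the scalar induction variable.

    // Hypothetical breakdown of the debug-output figures (an assumption, not
    // documented by the test itself).
    #include <cassert>

    int main() {
      // NEON, VF = 32: the <32 x i32> load is 1024/128 = 8 registers; the two
      // live <32 x i1> values (icmp result, reduction phi) have no legal fixed
      // i1 vector type, so each is counted as 32 registers.
      assert(8 + 32 + 32 == 72);
      // SVE with vscale_range(2,2) (256-bit registers) at VF = 64, and the
      // analogous X86/AVX test at the end of the patch: <64 x i32> = 8
      // registers, plus 2 * 64 for the i1 values.
      assert(8 + 64 + 64 == 136);
      return 0;
    }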
| diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll |
| index e6e43375204d..28eabe382dfb 100644 |
| --- a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll |
| +++ b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll |
| @@ -116,9 +116,9 @@ for.body: ; preds = %entry, %for.body |
| } |
| |
| ; CHECK-LABEL: @add_d( |
| -; CHECK: load <4 x i16> |
| -; CHECK: add nsw <4 x i32> |
| -; CHECK: store <4 x i32> |
| +; CHECK: load <8 x i16> |
| +; CHECK: add nsw <8 x i32> |
| +; CHECK: store <8 x i32> |
| define void @add_d(i16* noalias nocapture readonly %p, i32* noalias nocapture %q, i32 %len) #0 { |
| entry: |
| %cmp7 = icmp sgt i32 %len, 0 |
| diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll |
| index a95c0aa6f375..071255c4f4f0 100644 |
| --- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll |
| +++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll |
| @@ -123,16 +123,16 @@ for.body: |
| ; } |
| ; |
| ; CHECK: vector.body: |
| -; CHECK: phi <8 x i16> |
| -; CHECK: [[Ld1:%[a-zA-Z0-9.]+]] = load <8 x i8> |
| -; CHECK: zext <8 x i8> [[Ld1]] to <8 x i16> |
| -; CHECK: [[Ld2:%[a-zA-Z0-9.]+]] = load <8 x i8> |
| -; CHECK: zext <8 x i8> [[Ld2]] to <8 x i16> |
| -; CHECK: add <8 x i16> |
| -; CHECK: add <8 x i16> |
| +; CHECK: phi <16 x i16> |
| +; CHECK: [[Ld1:%[a-zA-Z0-9.]+]] = load <16 x i8> |
| +; CHECK: zext <16 x i8> [[Ld1]] to <16 x i16> |
| +; CHECK: [[Ld2:%[a-zA-Z0-9.]+]] = load <16 x i8> |
| +; CHECK: zext <16 x i8> [[Ld2]] to <16 x i16> |
| +; CHECK: add <16 x i16> |
| +; CHECK: add <16 x i16> |
| ; |
| ; CHECK: middle.block: |
| -; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> |
| +; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> |
| ; CHECK: zext i16 [[Rdx]] to i32 |
| ; |
| define i16 @reduction_i16_2(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) { |
| diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll |
| index 27868480c23b..262236075f7c 100644 |
| --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll |
| +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll |
| @@ -29,7 +29,7 @@ |
| ; NEOVERSE-N2: LV: Vector loop of width vscale x 4 costs: 3 (assuming a minimum vscale of 1). |
| |
| ; VF-4: <4 x i32> |
| -; VF-VSCALE4: <vscale x 4 x i32> |
| +; VF-VSCALE4: <16 x i32> |
| define void @test0(i32* %a, i8* %b, i32* %c) #0 { |
| entry: |
| br label %loop |
| diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll |
| index 9bd9c31d32d3..1d2c70db11cf 100644 |
| --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll |
| +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll |
| @@ -9,9 +9,9 @@ |
| define void @test0(i32* %a, i8* %b, i32* %c) #0 { |
| ; CHECK: LV: Checking a loop in "test0" |
| ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4 |
| -; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 4 |
| +; CHECK_SCALABLE_ON: LV: Selecting VF: 16 |
| ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF |
| -; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4 |
| +; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16 |
| ; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 16 |
| ; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: vscale x 16 |
| entry: |
| @@ -40,9 +40,9 @@ exit: |
| define void @test1(i32* %a, i8* %b) #0 { |
| ; CHECK: LV: Checking a loop in "test1" |
| ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4 |
| -; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 4 |
| +; CHECK_SCALABLE_ON: LV: Selecting VF: 16 |
| ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF |
| -; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4 |
| +; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16 |
| ; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 4 |
| ; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: 16 |
| entry: |
| @@ -72,9 +72,9 @@ exit: |
| define void @test2(i32* %a, i8* %b) #0 { |
| ; CHECK: LV: Checking a loop in "test2" |
| ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 2 |
| -; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 2 |
| +; CHECK_SCALABLE_ON: LV: Selecting VF: 16 |
| ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF |
| -; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4 |
| +; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16 |
| ; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 2 |
| ; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: 16 |
| entry: |
| @@ -104,9 +104,9 @@ exit: |
| define void @test3(i32* %a, i8* %b) #0 { |
| ; CHECK: LV: Checking a loop in "test3" |
| ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 1 |
| -; CHECK_SCALABLE_ON: LV: Selecting VF: 4 |
| +; CHECK_SCALABLE_ON: LV: Selecting VF: 16 |
| ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF |
| -; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4 |
| +; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16 |
| ; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 1 |
| ; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: 16 |
| entry: |
| diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-illegal-type.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-illegal-type.ll |
| index 4d0886f4d953..43ef43c11507 100644 |
| --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-illegal-type.ll |
| +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-illegal-type.ll |
| @@ -83,11 +83,11 @@ for.end: |
| define void @uniform_store_i1(i1* noalias %dst, i64* noalias %start, i64 %N) { |
| ; CHECK-LABEL: @uniform_store_i1 |
| ; CHECK: vector.body |
| -; CHECK: %[[GEP:.*]] = getelementptr inbounds i64, <2 x i64*> {{.*}}, i64 1 |
| -; CHECK: %[[ICMP:.*]] = icmp eq <2 x i64*> %[[GEP]], %[[SPLAT:.*]] |
| -; CHECK: %[[EXTRACT1:.*]] = extractelement <2 x i1> %[[ICMP]], i32 0 |
| +; CHECK: %[[GEP:.*]] = getelementptr inbounds i64, <64 x i64*> {{.*}}, i64 1 |
| +; CHECK: %[[ICMP:.*]] = icmp eq <64 x i64*> %[[GEP]], %[[SPLAT:.*]] |
| +; CHECK: %[[EXTRACT1:.*]] = extractelement <64 x i1> %[[ICMP]], i32 0 |
| ; CHECK: store i1 %[[EXTRACT1]], i1* %dst |
| -; CHECK: %[[EXTRACT2:.*]] = extractelement <2 x i1> %[[ICMP]], i32 1 |
| +; CHECK: %[[EXTRACT2:.*]] = extractelement <64 x i1> %[[ICMP]], i32 1 |
| ; CHECK: store i1 %[[EXTRACT2]], i1* %dst |
| ; CHECK-NOT: vscale |
| entry: |
| diff --git a/llvm/test/Transforms/LoopVectorize/X86/i1-reg-usage.ll b/llvm/test/Transforms/LoopVectorize/X86/i1-reg-usage.ll |
| new file mode 100644 |
| index 000000000000..4cab716c7544 |
| --- /dev/null |
| +++ b/llvm/test/Transforms/LoopVectorize/X86/i1-reg-usage.ll |
| @@ -0,0 +1,32 @@ |
| +; RUN: opt -loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 < %s | FileCheck %s |
| +; REQUIRES: asserts |
| + |
| +target triple = "x86_64" |
| + |
| +; Test that shows how many registers the loop vectorizer thinks an illegal <VF x i1> will consume. |
| + |
| +; CHECK-LABEL: LV: Checking a loop in 'or_reduction_avx' from <stdin> |
| +; CHECK: LV(REG): VF = 64 |
| +; CHECK-NEXT: LV(REG): Found max usage: 2 item |
| +; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 136 registers |
| +; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers |
| + |
| +define i1 @or_reduction_avx(i32 %arg, ptr %ptr) "target-features"="+avx" { |
| +entry: |
| + br label %loop |
| +exit: |
| + ret i1 %reduction_next |
| +loop: |
| + %induction = phi i32 [ 0, %entry ], [ %induction_next, %loop ] |
| + %reduction = phi i1 [ 0, %entry ], [ %reduction_next, %loop ] |
| + %gep = getelementptr inbounds i32, ptr %ptr, i32 %induction |
| + %loaded = load i32, ptr %gep |
| + %i1 = icmp eq i32 %loaded, %induction |
| + %reduction_next = or i1 %i1, %reduction |
| + %induction_next = add nuw i32 %induction, 1 |
| + %cond = icmp eq i32 %induction_next, %arg |
| + br i1 %cond, label %exit, label %loop, !llvm.loop !64 |
| +} |
| + |
| +!64 = distinct !{!64, !65} |
| +!65 = !{!"llvm.loop.vectorize.width", i32 64} |