commit 7e4b5c2e864ebd1c1a3a0203171143e311dd2a96 (HEAD)
Author: Peter Waller <peter.waller@arm.com>
Date: Mon May 16 20:59:17 2022 +0000
[LV] Improve register pressure estimate at high VFs
commit 4f81e1af2d1de9d902709cbaff727ba198cd5410
Author: Jingu Kang <jingu.kang@arm.com>
Date: Tue Apr 5 13:16:10 2022 +0100
[AArch64] Set maximum VF with shouldMaximizeVectorBandwidth
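Note (illustrative only, not part of the patch): a minimal sketch of how the changed hooks
line up after these two commits, using only the identifiers that appear in the diff below.
The target override mirrors the AArch64 hunk; the caller mirrors the LoopVectorize hunk.

    #include "llvm/Analysis/TargetTransformInfo.h"
    #include <cassert>

    using namespace llvm;

    // Target side (cf. AArch64TTIImpl): maximize vector bandwidth only for
    // fixed-width vectors; scalable-vector VF selection is left alone.
    static bool exampleShouldMaximizeVectorBandwidth(
        TargetTransformInfo::RegisterKind K) {
      assert(K != TargetTransformInfo::RGK_Scalar && "unexpected scalar kind");
      return K == TargetTransformInfo::RGK_FixedWidthVector;
    }

    // Vectorizer side (cf. getMaximizedVFForTarget): pick the register kind
    // that matches the max VF being computed, then ask the target.
    static bool exampleWantsMaxBandwidth(const TargetTransformInfo &TTI,
                                         bool ComputeScalableMaxVF) {
      TargetTransformInfo::RegisterKind RegKind =
          ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
                               : TargetTransformInfo::RGK_FixedWidthVector;
      return TTI.shouldMaximizeVectorBandwidth(RegKind);
    }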
---
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 7412e050322e..1179971ad13b 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -727,7 +727,7 @@ public:
bool isTypeLegal(Type *Ty) const;
/// Returns the estimated number of registers required to represent \p Ty.
- InstructionCost getRegUsageForType(Type *Ty) const;
+ unsigned getRegUsageForType(Type *Ty) const;
/// Return true if switches should be turned into lookup tables for the
/// target.
@@ -934,7 +934,8 @@ public:
/// creating vectors that span multiple vector registers.
/// If false, the vectorization factor will be chosen based on the
/// size of the widest element type.
- bool shouldMaximizeVectorBandwidth() const;
+ /// \p K Register Kind for vectorization.
+ bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const;
/// \return The minimum vectorization factor for types of given element
/// bit width, or 0 if there is no minimum VF. The returned value only
@@ -1571,7 +1572,7 @@ public:
virtual bool isProfitableToHoist(Instruction *I) = 0;
virtual bool useAA() = 0;
virtual bool isTypeLegal(Type *Ty) = 0;
- virtual InstructionCost getRegUsageForType(Type *Ty) = 0;
+ virtual unsigned getRegUsageForType(Type *Ty) = 0;
virtual bool shouldBuildLookupTables() = 0;
virtual bool shouldBuildLookupTablesForConstant(Constant *C) = 0;
virtual bool shouldBuildRelLookupTables() = 0;
@@ -1618,7 +1619,8 @@ public:
virtual unsigned getMinVectorRegisterBitWidth() const = 0;
virtual Optional<unsigned> getMaxVScale() const = 0;
virtual Optional<unsigned> getVScaleForTuning() const = 0;
- virtual bool shouldMaximizeVectorBandwidth() const = 0;
+ virtual bool
+ shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const = 0;
virtual ElementCount getMinimumVF(unsigned ElemWidth,
bool IsScalable) const = 0;
virtual unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const = 0;
@@ -2001,7 +2003,7 @@ public:
}
bool useAA() override { return Impl.useAA(); }
bool isTypeLegal(Type *Ty) override { return Impl.isTypeLegal(Ty); }
- InstructionCost getRegUsageForType(Type *Ty) override {
+ unsigned getRegUsageForType(Type *Ty) override {
return Impl.getRegUsageForType(Ty);
}
bool shouldBuildLookupTables() override {
@@ -2108,8 +2110,9 @@ public:
Optional<unsigned> getVScaleForTuning() const override {
return Impl.getVScaleForTuning();
}
- bool shouldMaximizeVectorBandwidth() const override {
- return Impl.shouldMaximizeVectorBandwidth();
+ bool shouldMaximizeVectorBandwidth(
+ TargetTransformInfo::RegisterKind K) const override {
+ return Impl.shouldMaximizeVectorBandwidth(K);
}
ElementCount getMinimumVF(unsigned ElemWidth,
bool IsScalable) const override {
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index a32744f8d58b..28ce1690202d 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -310,7 +310,7 @@ public:
bool isTypeLegal(Type *Ty) const { return false; }
- InstructionCost getRegUsageForType(Type *Ty) const { return 1; }
+ unsigned getRegUsageForType(Type *Ty) const { return 1; }
bool shouldBuildLookupTables() const { return true; }
@@ -415,7 +415,10 @@ public:
Optional<unsigned> getMaxVScale() const { return None; }
Optional<unsigned> getVScaleForTuning() const { return None; }
- bool shouldMaximizeVectorBandwidth() const { return false; }
+ bool
+ shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const {
+ return false;
+ }
ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const {
return ElementCount::get(0, IsScalable);
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 0b2737628923..39c8eaf6206b 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -362,10 +362,9 @@ public:
return getTLI()->isTypeLegal(VT);
}
- InstructionCost getRegUsageForType(Type *Ty) {
- InstructionCost Val = getTLI()->getTypeLegalizationCost(DL, Ty).first;
- assert(Val >= 0 && "Negative cost!");
- return Val;
+ unsigned getRegUsageForType(Type *Ty) {
+ EVT ETy = getTLI()->getValueType(DL, Ty);
+ return getTLI()->getNumRegisters(Ty->getContext(), ETy);
}
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr,
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 25e9dee98e13..7ec752990620 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -470,7 +470,7 @@ bool TargetTransformInfo::isTypeLegal(Type *Ty) const {
return TTIImpl->isTypeLegal(Ty);
}
-InstructionCost TargetTransformInfo::getRegUsageForType(Type *Ty) const {
+unsigned TargetTransformInfo::getRegUsageForType(Type *Ty) const {
return TTIImpl->getRegUsageForType(Ty);
}
@@ -623,8 +623,9 @@ Optional<unsigned> TargetTransformInfo::getVScaleForTuning() const {
return TTIImpl->getVScaleForTuning();
}
-bool TargetTransformInfo::shouldMaximizeVectorBandwidth() const {
- return TTIImpl->shouldMaximizeVectorBandwidth();
+bool TargetTransformInfo::shouldMaximizeVectorBandwidth(
+ TargetTransformInfo::RegisterKind K) const {
+ return TTIImpl->shouldMaximizeVectorBandwidth(K);
}
ElementCount TargetTransformInfo::getMinimumVF(unsigned ElemWidth,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index b2ffdf949d8b..c245b29b6d8a 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -50,6 +50,12 @@ bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
return (CallerBits & CalleeBits) == CalleeBits;
}
+bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
+ TargetTransformInfo::RegisterKind K) const {
+ assert(K != TargetTransformInfo::RGK_Scalar);
+ return K == TargetTransformInfo::RGK_FixedWidthVector;
+}
+
/// Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index a6029b9f2445..b7b11d196f1c 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -135,6 +135,8 @@ public:
return ST->getVScaleForTuning();
}
+ bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const;
+
/// Try to return an estimate cost factor that can be used as a multiplier
/// when scalarizing an operation for a vector with ElementCount \p VF.
/// For scalable vectors this currently takes the most pessimistic view based
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
index 9e637dfc3e16..7bc7bbf10614 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -86,12 +86,11 @@ public:
unsigned getMinVectorRegisterBitWidth() const;
ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const;
- bool shouldMaximizeVectorBandwidth() const {
+ bool
+ shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const {
return true;
}
- bool supportsEfficientVectorElementLoadStore() {
- return false;
- }
+ bool supportsEfficientVectorElementLoadStore() { return false; }
bool hasBranchDivergence() {
return false;
}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 99e6774a02e4..26ac8d872800 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -276,7 +276,7 @@ void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
BaseT::getPeelingPreferences(L, SE, PP);
}
-InstructionCost RISCVTTIImpl::getRegUsageForType(Type *Ty) {
+unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
TypeSize Size = Ty->getPrimitiveSizeInBits();
if (Ty->isVectorTy()) {
if (Size.isScalable() && ST->hasVInstructions())
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index e79c4f75712b..959a1433e689 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -60,7 +60,7 @@ public:
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const;
- InstructionCost getRegUsageForType(Type *Ty);
+ unsigned getRegUsageForType(Type *Ty);
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP,
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 46ff0994e04e..c41726b11aca 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5560,9 +5560,12 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
return ElementCount::getFixed(ClampedConstTripCount);
}
+ TargetTransformInfo::RegisterKind RegKind =
+ ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
+ : TargetTransformInfo::RGK_FixedWidthVector;
ElementCount MaxVF = MaxVectorElementCount;
- if (TTI.shouldMaximizeVectorBandwidth() ||
- (MaximizeBandwidth && isScalarEpilogueAllowed())) {
+ if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
+ TTI.shouldMaximizeVectorBandwidth(RegKind))) {
auto MaxVectorElementCountMaxBW = ElementCount::get(
PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
ComputeScalableMaxVF);
@@ -6319,16 +6322,10 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
- // A lambda that gets the register usage for the given type and VF.
- const auto &TTICapture = TTI;
- auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
+ auto GetRegUsage = [&TTI = TTI](Type *Ty, ElementCount VF) -> unsigned {
if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
return 0;
- InstructionCost::CostType RegUsage =
- *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue();
- assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() &&
- "Nonsensical values for register usage.");
- return RegUsage;
+ return TTI.getRegUsageForType(VectorType::get(Ty, VF));
};
for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll
index 371d209bafff..a1ca0fea7972 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll
@@ -4,11 +4,12 @@
; are not profitable.
; Test with a loop that contains memory accesses of i8 and i32 types. The
-; default maximum VF for NEON is 4. And while we don't have an instruction to
-; load 4 x i8, vectorization might still be profitable.
+; maximum VF for NEON is calculated as 128 divided by the size in bits of the
+; smallest type in the loop. And while we don't have an instruction to load
+; 4 x i8, vectorization might still be profitable.
define void @test_load_i8_store_i32(i8* noalias %src, i32* noalias %dst, i32 %off, i64 %N) {
; CHECK-LABEL: @test_load_i8_store_i32(
-; CHECK: <4 x i8>
+; CHECK: <16 x i8>
;
entry:
br label %loop
@@ -32,7 +33,7 @@ exit:
; Same as test_load_i8_store_i32, but with types flipped for load and store.
define void @test_load_i32_store_i8(i32* noalias %src, i8* noalias %dst, i32 %off, i64 %N) {
; CHECK-LABEL: @test_load_i32_store_i8(
-; CHECK: <4 x i8>
+; CHECK: <16 x i8>
;
entry:
br label %loop
@@ -84,7 +85,7 @@ exit:
; vectorization factor.
define void @test_load_i8_store_i64_large(i8* noalias %src, i64* noalias %dst, i64* noalias %dst.2, i64* noalias %dst.3, i64* noalias %dst.4, i64* noalias %dst.5, i64%off, i64 %off.2, i64 %N) {
; CHECK-LABEL: @test_load_i8_store_i64_large
-; CHECK: <2 x i64>
+; CHECK: <8 x i64>
;
entry:
br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/i1-reg-usage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/i1-reg-usage.ll
new file mode 100644
index 000000000000..f0dc8e502769
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/i1-reg-usage.ll
@@ -0,0 +1,57 @@
+; RUN: opt -loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 < %s | FileCheck %s
+; REQUIRES: asserts
+
+target triple = "aarch64"
+
+; Test that shows how many registers the loop vectorizer thinks an illegal <VF x i1> will consume.
+
+; CHECK-LABEL: LV: Checking a loop in 'or_reduction_neon' from <stdin>
+; CHECK: LV(REG): VF = 32
+; CHECK-NEXT: LV(REG): Found max usage: 2 item
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 72 registers
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers
+
+define i1 @or_reduction_neon(i32 %arg, ptr %ptr) {
+entry:
+ br label %loop
+exit:
+ ret i1 %reduction_next
+loop:
+ %induction = phi i32 [ 0, %entry ], [ %induction_next, %loop ]
+ %reduction = phi i1 [ 0, %entry ], [ %reduction_next, %loop ]
+ %gep = getelementptr inbounds i32, ptr %ptr, i32 %induction
+ %loaded = load i32, ptr %gep
+ %i1 = icmp eq i32 %loaded, %induction
+ %reduction_next = or i1 %i1, %reduction
+ %induction_next = add nuw i32 %induction, 1
+ %cond = icmp eq i32 %induction_next, %arg
+ br i1 %cond, label %exit, label %loop, !llvm.loop !32
+}
+
+; CHECK-LABEL: LV: Checking a loop in 'or_reduction_sve'
+; CHECK: LV(REG): VF = 64
+; CHECK-NEXT: LV(REG): Found max usage: 2 item
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 136 registers
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers
+
+define i1 @or_reduction_sve(i32 %arg, ptr %ptr) vscale_range(2,2) "target-features"="+sve" {
+entry:
+ br label %loop
+exit:
+ ret i1 %reduction_next
+loop:
+ %induction = phi i32 [ 0, %entry ], [ %induction_next, %loop ]
+ %reduction = phi i1 [ true, %entry ], [ %reduction_next, %loop ]
+ %gep = getelementptr inbounds i32, ptr %ptr, i32 %induction
+ %loaded = load i32, ptr %gep
+ %i1 = icmp eq i32 %loaded, %induction
+ %reduction_next = or i1 %i1, %reduction
+ %induction_next = add nuw i32 %induction, 1
+ %cond = icmp eq i32 %induction_next, %arg
+ br i1 %cond, label %exit, label %loop, !llvm.loop !64
+}
+
+!32 = distinct !{!32, !33}
+!33 = !{!"llvm.loop.vectorize.width", i32 32}
+!64 = distinct !{!64, !65}
+!65 = !{!"llvm.loop.vectorize.width", i32 64}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
index e6e43375204d..28eabe382dfb 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
@@ -116,9 +116,9 @@ for.body: ; preds = %entry, %for.body
}
; CHECK-LABEL: @add_d(
-; CHECK: load <4 x i16>
-; CHECK: add nsw <4 x i32>
-; CHECK: store <4 x i32>
+; CHECK: load <8 x i16>
+; CHECK: add nsw <8 x i32>
+; CHECK: store <8 x i32>
define void @add_d(i16* noalias nocapture readonly %p, i32* noalias nocapture %q, i32 %len) #0 {
entry:
%cmp7 = icmp sgt i32 %len, 0
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll
index a95c0aa6f375..071255c4f4f0 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll
@@ -123,16 +123,16 @@ for.body:
; }
;
; CHECK: vector.body:
-; CHECK: phi <8 x i16>
-; CHECK: [[Ld1:%[a-zA-Z0-9.]+]] = load <8 x i8>
-; CHECK: zext <8 x i8> [[Ld1]] to <8 x i16>
-; CHECK: [[Ld2:%[a-zA-Z0-9.]+]] = load <8 x i8>
-; CHECK: zext <8 x i8> [[Ld2]] to <8 x i16>
-; CHECK: add <8 x i16>
-; CHECK: add <8 x i16>
+; CHECK: phi <16 x i16>
+; CHECK: [[Ld1:%[a-zA-Z0-9.]+]] = load <16 x i8>
+; CHECK: zext <16 x i8> [[Ld1]] to <16 x i16>
+; CHECK: [[Ld2:%[a-zA-Z0-9.]+]] = load <16 x i8>
+; CHECK: zext <16 x i8> [[Ld2]] to <16 x i16>
+; CHECK: add <16 x i16>
+; CHECK: add <16 x i16>
;
; CHECK: middle.block:
-; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16>
+; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16>
; CHECK: zext i16 [[Rdx]] to i32
;
define i16 @reduction_i16_2(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll
index 27868480c23b..262236075f7c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll
@@ -29,7 +29,7 @@
; NEOVERSE-N2: LV: Vector loop of width vscale x 4 costs: 3 (assuming a minimum vscale of 1).
; VF-4: <4 x i32>
-; VF-VSCALE4: <vscale x 4 x i32>
+; VF-VSCALE4: <16 x i32>
define void @test0(i32* %a, i8* %b, i32* %c) #0 {
entry:
br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll
index 9bd9c31d32d3..1d2c70db11cf 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll
@@ -9,9 +9,9 @@
define void @test0(i32* %a, i8* %b, i32* %c) #0 {
; CHECK: LV: Checking a loop in "test0"
; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4
-; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 4
+; CHECK_SCALABLE_ON: LV: Selecting VF: 16
; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
-; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4
+; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16
; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 16
; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: vscale x 16
entry:
@@ -40,9 +40,9 @@ exit:
define void @test1(i32* %a, i8* %b) #0 {
; CHECK: LV: Checking a loop in "test1"
; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4
-; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 4
+; CHECK_SCALABLE_ON: LV: Selecting VF: 16
; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
-; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4
+; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16
; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 4
; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: 16
entry:
@@ -72,9 +72,9 @@ exit:
define void @test2(i32* %a, i8* %b) #0 {
; CHECK: LV: Checking a loop in "test2"
; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 2
-; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 2
+; CHECK_SCALABLE_ON: LV: Selecting VF: 16
; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
-; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4
+; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16
; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 2
; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: 16
entry:
@@ -104,9 +104,9 @@ exit:
define void @test3(i32* %a, i8* %b) #0 {
; CHECK: LV: Checking a loop in "test3"
; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 1
-; CHECK_SCALABLE_ON: LV: Selecting VF: 4
+; CHECK_SCALABLE_ON: LV: Selecting VF: 16
; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
-; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4
+; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16
; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 1
; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: 16
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-illegal-type.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-illegal-type.ll
index 4d0886f4d953..43ef43c11507 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-illegal-type.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-illegal-type.ll
@@ -83,11 +83,11 @@ for.end:
define void @uniform_store_i1(i1* noalias %dst, i64* noalias %start, i64 %N) {
; CHECK-LABEL: @uniform_store_i1
; CHECK: vector.body
-; CHECK: %[[GEP:.*]] = getelementptr inbounds i64, <2 x i64*> {{.*}}, i64 1
-; CHECK: %[[ICMP:.*]] = icmp eq <2 x i64*> %[[GEP]], %[[SPLAT:.*]]
-; CHECK: %[[EXTRACT1:.*]] = extractelement <2 x i1> %[[ICMP]], i32 0
+; CHECK: %[[GEP:.*]] = getelementptr inbounds i64, <64 x i64*> {{.*}}, i64 1
+; CHECK: %[[ICMP:.*]] = icmp eq <64 x i64*> %[[GEP]], %[[SPLAT:.*]]
+; CHECK: %[[EXTRACT1:.*]] = extractelement <64 x i1> %[[ICMP]], i32 0
; CHECK: store i1 %[[EXTRACT1]], i1* %dst
-; CHECK: %[[EXTRACT2:.*]] = extractelement <2 x i1> %[[ICMP]], i32 1
+; CHECK: %[[EXTRACT2:.*]] = extractelement <64 x i1> %[[ICMP]], i32 1
; CHECK: store i1 %[[EXTRACT2]], i1* %dst
; CHECK-NOT: vscale
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/i1-reg-usage.ll b/llvm/test/Transforms/LoopVectorize/X86/i1-reg-usage.ll
new file mode 100644
index 000000000000..4cab716c7544
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/i1-reg-usage.ll
@@ -0,0 +1,32 @@
+; RUN: opt -loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 < %s | FileCheck %s
+; REQUIRES: asserts
+
+target triple = "x86_64"
+
+; Test that shows how many registers the loop vectorizer thinks an illegal <VF x i1> will consume.
+
+; CHECK-LABEL: LV: Checking a loop in 'or_reduction_avx' from <stdin>
+; CHECK: LV(REG): VF = 64
+; CHECK-NEXT: LV(REG): Found max usage: 2 item
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 136 registers
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers
+
+define i1 @or_reduction_avx(i32 %arg, ptr %ptr) "target-features"="+avx" {
+entry:
+ br label %loop
+exit:
+ ret i1 %reduction_next
+loop:
+ %induction = phi i32 [ 0, %entry ], [ %induction_next, %loop ]
+ %reduction = phi i1 [ 0, %entry ], [ %reduction_next, %loop ]
+ %gep = getelementptr inbounds i32, ptr %ptr, i32 %induction
+ %loaded = load i32, ptr %gep
+ %i1 = icmp eq i32 %loaded, %induction
+ %reduction_next = or i1 %i1, %reduction
+ %induction_next = add nuw i32 %induction, 1
+ %cond = icmp eq i32 %induction_next, %arg
+ br i1 %cond, label %exit, label %loop, !llvm.loop !64
+}
+
+!64 = distinct !{!64, !65}
+!65 = !{!"llvm.loop.vectorize.width", i32 64}