| commit 7e4b5c2e864ebd1c1a3a0203171143e311dd2a96 (HEAD) |
| Author: Peter Waller <peter.waller@arm.com> |
| Date: Mon May 16 20:59:17 2022 +0000 |
| |
| [LV] Improve register pressure estimate at high VFs |
| |
| commit 4f81e1af2d1de9d902709cbaff727ba198cd5410 |
| Author: Jingu Kang <jingu.kang@arm.com> |
| Date: Tue Apr 5 13:16:10 2022 +0100 |
| |
| [AArch64] Set maximum VF with shouldMaximizeVectorBandwidth |
| --- |
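Taken together, the two commits above make two related changes: getRegUsageForType() now returns a plain unsigned register count (the number of registers the legalized type occupies, via getNumRegisters) instead of an InstructionCost, and shouldMaximizeVectorBandwidth() is told which register kind the vectorization factor is being chosen for, so AArch64 can opt in for fixed-width NEON vectorization while declining for scalable SVE vectorization. Below is a minimal standalone sketch of the resulting policy; it is illustrative only, the enum and function names are stand-ins for the TTI hooks shown in the diff, and the numbers model a 128-bit NEON register.

    // Minimal standalone sketch; not LLVM code. The enum mirrors
    // TargetTransformInfo::RegisterKind, everything else is a stand-in.
    #include <cassert>

    enum class RegisterKind { Scalar, FixedWidthVector, ScalableVector };

    // AArch64-style answer: maximize bandwidth only for fixed-width (NEON)
    // vectorization, not for scalable (SVE) vectorization.
    bool shouldMaximizeVectorBandwidth(RegisterKind K) {
      assert(K != RegisterKind::Scalar);
      return K == RegisterKind::FixedWidthVector;
    }

    // When bandwidth is maximized, the maximum VF is derived from the
    // smallest element type in the loop instead of the widest one.
    unsigned maxVF(unsigned WidestRegBits, unsigned SmallestTypeBits,
                   unsigned WidestTypeBits, RegisterKind K) {
      unsigned ElemBits =
          shouldMaximizeVectorBandwidth(K) ? SmallestTypeBits : WidestTypeBits;
      return WidestRegBits / ElemBits;
    }

    int main() {
      // A 128-bit NEON register and a loop mixing i16 and i32 accesses:
      // 8 lanes instead of the 4 the widest type alone would give.
      assert(maxVF(128, 16, 32, RegisterKind::FixedWidthVector) == 8);
      // For scalable vectorization the hook declines, so the widest element
      // type still governs: 128 / 32 = 4 (in units of the minimum VL).
      assert(maxVF(128, 16, 32, RegisterKind::ScalableVector) == 4);
      return 0;
    }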
| diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h |
| index 7412e050322e..1179971ad13b 100644 |
| --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h |
| +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h |
| @@ -727,7 +727,7 @@ public: |
| bool isTypeLegal(Type *Ty) const; |
| |
| /// Returns the estimated number of registers required to represent \p Ty. |
| - InstructionCost getRegUsageForType(Type *Ty) const; |
| + unsigned getRegUsageForType(Type *Ty) const; |
| |
| /// Return true if switches should be turned into lookup tables for the |
| /// target. |
| @@ -934,7 +934,8 @@ public: |
| /// creating vectors that span multiple vector registers. |
| /// If false, the vectorization factor will be chosen based on the |
| /// size of the widest element type. |
| - bool shouldMaximizeVectorBandwidth() const; |
| + /// \p K Register Kind for vectorization. |
| + bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const; |
| |
| /// \return The minimum vectorization factor for types of given element |
| /// bit width, or 0 if there is no minimum VF. The returned value only |
| @@ -1571,7 +1572,7 @@ public: |
| virtual bool isProfitableToHoist(Instruction *I) = 0; |
| virtual bool useAA() = 0; |
| virtual bool isTypeLegal(Type *Ty) = 0; |
| - virtual InstructionCost getRegUsageForType(Type *Ty) = 0; |
| + virtual unsigned getRegUsageForType(Type *Ty) = 0; |
| virtual bool shouldBuildLookupTables() = 0; |
| virtual bool shouldBuildLookupTablesForConstant(Constant *C) = 0; |
| virtual bool shouldBuildRelLookupTables() = 0; |
| @@ -1618,7 +1619,8 @@ public: |
| virtual unsigned getMinVectorRegisterBitWidth() const = 0; |
| virtual Optional<unsigned> getMaxVScale() const = 0; |
| virtual Optional<unsigned> getVScaleForTuning() const = 0; |
| - virtual bool shouldMaximizeVectorBandwidth() const = 0; |
| + virtual bool |
| + shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const = 0; |
| virtual ElementCount getMinimumVF(unsigned ElemWidth, |
| bool IsScalable) const = 0; |
| virtual unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const = 0; |
| @@ -2001,7 +2003,7 @@ public: |
| } |
| bool useAA() override { return Impl.useAA(); } |
| bool isTypeLegal(Type *Ty) override { return Impl.isTypeLegal(Ty); } |
| - InstructionCost getRegUsageForType(Type *Ty) override { |
| + unsigned getRegUsageForType(Type *Ty) override { |
| return Impl.getRegUsageForType(Ty); |
| } |
| bool shouldBuildLookupTables() override { |
| @@ -2108,8 +2110,9 @@ public: |
| Optional<unsigned> getVScaleForTuning() const override { |
| return Impl.getVScaleForTuning(); |
| } |
| - bool shouldMaximizeVectorBandwidth() const override { |
| - return Impl.shouldMaximizeVectorBandwidth(); |
| + bool shouldMaximizeVectorBandwidth( |
| + TargetTransformInfo::RegisterKind K) const override { |
| + return Impl.shouldMaximizeVectorBandwidth(K); |
| } |
| ElementCount getMinimumVF(unsigned ElemWidth, |
| bool IsScalable) const override { |
| diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h |
| index a32744f8d58b..28ce1690202d 100644 |
| --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h |
| +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h |
| @@ -310,7 +310,7 @@ public: |
| |
| bool isTypeLegal(Type *Ty) const { return false; } |
| |
| - InstructionCost getRegUsageForType(Type *Ty) const { return 1; } |
| + unsigned getRegUsageForType(Type *Ty) const { return 1; } |
| |
| bool shouldBuildLookupTables() const { return true; } |
| |
| @@ -415,7 +415,10 @@ public: |
| Optional<unsigned> getMaxVScale() const { return None; } |
| Optional<unsigned> getVScaleForTuning() const { return None; } |
| |
| - bool shouldMaximizeVectorBandwidth() const { return false; } |
| + bool |
| + shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const { |
| + return false; |
| + } |
| |
| ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const { |
| return ElementCount::get(0, IsScalable); |
| diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h |
| index 0b2737628923..39c8eaf6206b 100644 |
| --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h |
| +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h |
| @@ -362,10 +362,9 @@ public: |
| return getTLI()->isTypeLegal(VT); |
| } |
| |
| - InstructionCost getRegUsageForType(Type *Ty) { |
| - InstructionCost Val = getTLI()->getTypeLegalizationCost(DL, Ty).first; |
| - assert(Val >= 0 && "Negative cost!"); |
| - return Val; |
| + unsigned getRegUsageForType(Type *Ty) { |
| + EVT ETy = getTLI()->getValueType(DL, Ty); |
| + return getTLI()->getNumRegisters(Ty->getContext(), ETy); |
| } |
| |
| InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, |
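The base implementation above now asks the target how many registers the legalized type occupies (TargetLowering::getNumRegisters) rather than reusing the type-legalization cost. The following is a rough standalone model of that question, with numbers chosen to line up with the i1 register-usage tests added later in this patch; the real breakdown rules are considerably more involved, so treat this as an assumption-laden sketch rather than the actual algorithm.

    // Rough, hypothetical model of "how many registers does this type take?"
    // for a NEON-like target with 128-bit vector registers and no legal
    // i1 vectors. Not the real getNumRegisters logic.
    #include <cassert>

    struct SimpleVT {
      unsigned NumElts;
      unsigned EltBits;
    };

    unsigned numRegistersFor(SimpleVT VT, unsigned VectorRegBits = 128) {
      unsigned TotalBits = VT.NumElts * VT.EltBits;
      if (VT.EltBits >= 8)
        return TotalBits <= VectorRegBits ? 1 : TotalBits / VectorRegBits;
      // Assumption: with no legal i1 vector type, the value decomposes to
      // one register per element, which is what makes <VF x i1> expensive.
      return VT.NumElts;
    }

    int main() {
      assert(numRegistersFor({32, 32}) == 8);  // <32 x i32> -> eight 128-bit regs
      assert(numRegistersFor({32, 1}) == 32);  // illegal <32 x i1> -> 32 registers
      return 0;
    }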
| diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp |
| index 25e9dee98e13..7ec752990620 100644 |
| --- a/llvm/lib/Analysis/TargetTransformInfo.cpp |
| +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp |
| @@ -470,7 +470,7 @@ bool TargetTransformInfo::isTypeLegal(Type *Ty) const { |
| return TTIImpl->isTypeLegal(Ty); |
| } |
| |
| -InstructionCost TargetTransformInfo::getRegUsageForType(Type *Ty) const { |
| +unsigned TargetTransformInfo::getRegUsageForType(Type *Ty) const { |
| return TTIImpl->getRegUsageForType(Ty); |
| } |
| |
| @@ -623,8 +623,9 @@ Optional<unsigned> TargetTransformInfo::getVScaleForTuning() const { |
| return TTIImpl->getVScaleForTuning(); |
| } |
| |
| -bool TargetTransformInfo::shouldMaximizeVectorBandwidth() const { |
| - return TTIImpl->shouldMaximizeVectorBandwidth(); |
| +bool TargetTransformInfo::shouldMaximizeVectorBandwidth( |
| + TargetTransformInfo::RegisterKind K) const { |
| + return TTIImpl->shouldMaximizeVectorBandwidth(K); |
| } |
| |
| ElementCount TargetTransformInfo::getMinimumVF(unsigned ElemWidth, |
| diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp |
| index b2ffdf949d8b..c245b29b6d8a 100644 |
| --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp |
| +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp |
| @@ -50,6 +50,12 @@ bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, |
| return (CallerBits & CalleeBits) == CalleeBits; |
| } |
| |
| +bool AArch64TTIImpl::shouldMaximizeVectorBandwidth( |
| + TargetTransformInfo::RegisterKind K) const { |
| + assert(K != TargetTransformInfo::RGK_Scalar); |
| + return K == TargetTransformInfo::RGK_FixedWidthVector; |
| +} |
| + |
| /// Calculate the cost of materializing a 64-bit value. This helper |
| /// method might only calculate a fraction of a larger immediate. Therefore it |
| /// is valid to return a cost of ZERO. |
| diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h |
| index a6029b9f2445..b7b11d196f1c 100644 |
| --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h |
| +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h |
| @@ -135,6 +135,8 @@ public: |
| return ST->getVScaleForTuning(); |
| } |
| |
| + bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const; |
| + |
| /// Try to return an estimate cost factor that can be used as a multiplier |
| /// when scalarizing an operation for a vector with ElementCount \p VF. |
| /// For scalable vectors this currently takes the most pessimistic view based |
| diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h |
| index 9e637dfc3e16..7bc7bbf10614 100644 |
| --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h |
| +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h |
| @@ -86,12 +86,11 @@ public: |
| unsigned getMinVectorRegisterBitWidth() const; |
| ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const; |
| |
| - bool shouldMaximizeVectorBandwidth() const { |
| + bool |
| + shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const { |
| return true; |
| } |
| - bool supportsEfficientVectorElementLoadStore() { |
| - return false; |
| - } |
| + bool supportsEfficientVectorElementLoadStore() { return false; } |
| bool hasBranchDivergence() { |
| return false; |
| } |
| diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp |
| index 99e6774a02e4..26ac8d872800 100644 |
| --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp |
| +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp |
| @@ -276,7 +276,7 @@ void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, |
| BaseT::getPeelingPreferences(L, SE, PP); |
| } |
| |
| -InstructionCost RISCVTTIImpl::getRegUsageForType(Type *Ty) { |
| +unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) { |
| TypeSize Size = Ty->getPrimitiveSizeInBits(); |
| if (Ty->isVectorTy()) { |
| if (Size.isScalable() && ST->hasVInstructions()) |
| diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h |
| index e79c4f75712b..959a1433e689 100644 |
| --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h |
| +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h |
| @@ -60,7 +60,7 @@ public: |
| |
| TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const; |
| |
| - InstructionCost getRegUsageForType(Type *Ty); |
| + unsigned getRegUsageForType(Type *Ty); |
| |
| void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, |
| TTI::UnrollingPreferences &UP, |
| diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp |
| index 46ff0994e04e..c41726b11aca 100644 |
| --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp |
| +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp |
| @@ -5560,9 +5560,12 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( |
| return ElementCount::getFixed(ClampedConstTripCount); |
| } |
| |
| + TargetTransformInfo::RegisterKind RegKind = |
| + ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector |
| + : TargetTransformInfo::RGK_FixedWidthVector; |
| ElementCount MaxVF = MaxVectorElementCount; |
| - if (TTI.shouldMaximizeVectorBandwidth() || |
| - (MaximizeBandwidth && isScalarEpilogueAllowed())) { |
| + if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 && |
| + TTI.shouldMaximizeVectorBandwidth(RegKind))) { |
| auto MaxVectorElementCountMaxBW = ElementCount::get( |
| PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), |
| ComputeScalableMaxVF); |
| @@ -6319,16 +6322,10 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { |
| |
| LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); |
| |
| - // A lambda that gets the register usage for the given type and VF. |
| - const auto &TTICapture = TTI; |
| - auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { |
| + auto GetRegUsage = [&TTI = TTI](Type *Ty, ElementCount VF) -> unsigned { |
| if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) |
| return 0; |
| - InstructionCost::CostType RegUsage = |
| - *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue(); |
| - assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() && |
| - "Nonsensical values for register usage."); |
| - return RegUsage; |
| + return TTI.getRegUsageForType(VectorType::get(Ty, VF)); |
| }; |
| |
| for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { |
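Two things change in the vectorizer itself: the register kind passed to the hook is derived from whether a scalable maximum VF is being computed, and the existing MaximizeBandwidth cl::opt (-vectorizer-maximize-bandwidth) now takes precedence, with the target hook consulted only when the option was not given on the command line. A small standalone sketch of that precedence logic follows; Opt stands in for the real cl::opt and TTIShouldMaximize for TTI.shouldMaximizeVectorBandwidth(RegKind).

    // Standalone sketch of the option/hook precedence (illustrative only).
    #include <cassert>

    struct Opt {
      bool Value;
      unsigned NumOccurrences; // times the option appeared on the command line
    };

    bool maximizeBandwidth(Opt MaximizeBandwidth, bool TTIShouldMaximize) {
      // An explicit option wins either way; the target hook is consulted only
      // when the option was left at its default.
      return MaximizeBandwidth.Value ||
             (MaximizeBandwidth.NumOccurrences == 0 && TTIShouldMaximize);
    }

    int main() {
      assert(maximizeBandwidth({false, 0}, true));   // option unset: target decides
      assert(!maximizeBandwidth({false, 1}, true));  // =false overrides the target
      assert(maximizeBandwidth({true, 1}, false));   // =true overrides the target
      return 0;
    }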
| diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll |
| index 371d209bafff..a1ca0fea7972 100644 |
| --- a/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll |
| +++ b/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll |
| @@ -4,11 +4,12 @@ |
| ; are not profitable. |
| |
| ; Test with a loop that contains memory accesses of i8 and i32 types. The |
| -; default maximum VF for NEON is 4. And while we don't have an instruction to |
| -; load 4 x i8, vectorization might still be profitable. |
| +; maximum VF for NEON is calculated as 128 / (size of the smallest type in |
| +; the loop). While we don't have an instruction to load 4 x i8, |
| +; vectorization might still be profitable. |
| define void @test_load_i8_store_i32(i8* noalias %src, i32* noalias %dst, i32 %off, i64 %N) { |
| ; CHECK-LABEL: @test_load_i8_store_i32( |
| -; CHECK: <4 x i8> |
| +; CHECK: <16 x i8> |
| ; |
| entry: |
| br label %loop |
| @@ -32,7 +33,7 @@ exit: |
| ; Same as test_load_i8_store_i32, but with types flipped for load and store. |
| define void @test_load_i32_store_i8(i32* noalias %src, i8* noalias %dst, i32 %off, i64 %N) { |
| ; CHECK-LABEL: @test_load_i32_store_i8( |
| -; CHECK: <4 x i8> |
| +; CHECK: <16 x i8> |
| ; |
| entry: |
| br label %loop |
| @@ -84,7 +85,7 @@ exit: |
| ; vectorization factor. |
| define void @test_load_i8_store_i64_large(i8* noalias %src, i64* noalias %dst, i64* noalias %dst.2, i64* noalias %dst.3, i64* noalias %dst.4, i64* noalias %dst.5, i64%off, i64 %off.2, i64 %N) { |
| ; CHECK-LABEL: @test_load_i8_store_i64_large |
| -; CHECK: <2 x i64> |
| +; CHECK: <8 x i64> |
| ; |
| entry: |
| br label %loop |
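The updated comment near the top of this test says the maximum VF for NEON is 128 divided by the size of the smallest type in the loop. Concretely, for the two i8/i32 loops whose CHECK lines changed from <4 x i8> to <16 x i8>, that works out as follows (a worked sketch under the 128-bit NEON assumption, not LLVM code):

    #include <cassert>

    int main() {
      constexpr unsigned RegBits = 128;
      constexpr unsigned SmallestTypeBits = 8;       // the i8 accesses
      constexpr unsigned MaxVF = RegBits / SmallestTypeBits;
      assert(MaxVF == 16);           // hence the <16 x i8> CHECK lines above
      // The i32 half of each loop is widened to the same lane count,
      // <16 x i32>, i.e. 16 * 32 = 512 bits over four 128-bit registers.
      assert(MaxVF * 32 / RegBits == 4);
      return 0;
    }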
| diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/i1-reg-usage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/i1-reg-usage.ll |
| new file mode 100644 |
| index 000000000000..f0dc8e502769 |
| --- /dev/null |
| +++ b/llvm/test/Transforms/LoopVectorize/AArch64/i1-reg-usage.ll |
| @@ -0,0 +1,57 @@ |
| +; RUN: opt -loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 < %s | FileCheck %s |
| +; REQUIRES: asserts |
| + |
| +target triple = "aarch64" |
| + |
| +; Test that shows how many registers the loop vectorizer thinks an illegal <VF x i1> will consume. |
| + |
| +; CHECK-LABEL: LV: Checking a loop in 'or_reduction_neon' from <stdin> |
| +; CHECK: LV(REG): VF = 32 |
| +; CHECK-NEXT: LV(REG): Found max usage: 2 item |
| +; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 72 registers |
| +; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers |
| + |
| +define i1 @or_reduction_neon(i32 %arg, ptr %ptr) { |
| +entry: |
| + br label %loop |
| +exit: |
| + ret i1 %reduction_next |
| +loop: |
| + %induction = phi i32 [ 0, %entry ], [ %induction_next, %loop ] |
| + %reduction = phi i1 [ 0, %entry ], [ %reduction_next, %loop ] |
| + %gep = getelementptr inbounds i32, ptr %ptr, i32 %induction |
| + %loaded = load i32, ptr %gep |
| + %i1 = icmp eq i32 %loaded, %induction |
| + %reduction_next = or i1 %i1, %reduction |
| + %induction_next = add nuw i32 %induction, 1 |
| + %cond = icmp eq i32 %induction_next, %arg |
| + br i1 %cond, label %exit, label %loop, !llvm.loop !32 |
| +} |
| + |
| +; CHECK-LABEL: LV: Checking a loop in 'or_reduction_sve' |
| +; CHECK: LV(REG): VF = 64 |
| +; CHECK-NEXT: LV(REG): Found max usage: 2 item |
| +; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 136 registers |
| +; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers |
| + |
| +define i1 @or_reduction_sve(i32 %arg, ptr %ptr) vscale_range(2,2) "target-features"="+sve" { |
| +entry: |
| + br label %loop |
| +exit: |
| + ret i1 %reduction_next |
| +loop: |
| + %induction = phi i32 [ 0, %entry ], [ %induction_next, %loop ] |
| + %reduction = phi i1 [ true, %entry ], [ %reduction_next, %loop ] |
| + %gep = getelementptr inbounds i32, ptr %ptr, i32 %induction |
| + %loaded = load i32, ptr %gep |
| + %i1 = icmp eq i32 %loaded, %induction |
| + %reduction_next = or i1 %i1, %reduction |
| + %induction_next = add nuw i32 %induction, 1 |
| + %cond = icmp eq i32 %induction_next, %arg |
| + br i1 %cond, label %exit, label %loop, !llvm.loop !64 |
| +} |
| + |
| +!32 = distinct !{!32, !33} |
| +!33 = !{!"llvm.loop.vectorize.width", i32 32} |
| +!64 = distinct !{!64, !65} |
| +!65 = !{!"llvm.loop.vectorize.width", i32 64} |
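One plausible reading of the CHECK'd counts, given the getNumRegisters-based estimate introduced above: the breakdown below is an inference from the register sizes, not something the test states, and the single ScalarRC register would be the scalar induction variable.

    // Hypothetical breakdown of the debug-output figures (an assumption, not
    // documented by the test itself).
    #include <cassert>

    int main() {
      // NEON, VF = 32: the <32 x i32> load is 1024/128 = 8 registers; the two
      // live <32 x i1> values (icmp result, reduction phi) have no legal fixed
      // i1 vector type, so each is counted as 32 registers.
      assert(8 + 32 + 32 == 72);
      // SVE with vscale_range(2,2) (256-bit registers) at VF = 64, and the
      // analogous X86/AVX test at the end of the patch: <64 x i32> = 8
      // registers, plus 2 * 64 for the i1 values.
      assert(8 + 64 + 64 == 136);
      return 0;
    }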
| diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll |
| index e6e43375204d..28eabe382dfb 100644 |
| --- a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll |
| +++ b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll |
| @@ -116,9 +116,9 @@ for.body: ; preds = %entry, %for.body |
| } |
| |
| ; CHECK-LABEL: @add_d( |
| -; CHECK: load <4 x i16> |
| -; CHECK: add nsw <4 x i32> |
| -; CHECK: store <4 x i32> |
| +; CHECK: load <8 x i16> |
| +; CHECK: add nsw <8 x i32> |
| +; CHECK: store <8 x i32> |
| define void @add_d(i16* noalias nocapture readonly %p, i32* noalias nocapture %q, i32 %len) #0 { |
| entry: |
| %cmp7 = icmp sgt i32 %len, 0 |
| diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll |
| index a95c0aa6f375..071255c4f4f0 100644 |
| --- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll |
| +++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll |
| @@ -123,16 +123,16 @@ for.body: |
| ; } |
| ; |
| ; CHECK: vector.body: |
| -; CHECK: phi <8 x i16> |
| -; CHECK: [[Ld1:%[a-zA-Z0-9.]+]] = load <8 x i8> |
| -; CHECK: zext <8 x i8> [[Ld1]] to <8 x i16> |
| -; CHECK: [[Ld2:%[a-zA-Z0-9.]+]] = load <8 x i8> |
| -; CHECK: zext <8 x i8> [[Ld2]] to <8 x i16> |
| -; CHECK: add <8 x i16> |
| -; CHECK: add <8 x i16> |
| +; CHECK: phi <16 x i16> |
| +; CHECK: [[Ld1:%[a-zA-Z0-9.]+]] = load <16 x i8> |
| +; CHECK: zext <16 x i8> [[Ld1]] to <16 x i16> |
| +; CHECK: [[Ld2:%[a-zA-Z0-9.]+]] = load <16 x i8> |
| +; CHECK: zext <16 x i8> [[Ld2]] to <16 x i16> |
| +; CHECK: add <16 x i16> |
| +; CHECK: add <16 x i16> |
| ; |
| ; CHECK: middle.block: |
| -; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> |
| +; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> |
| ; CHECK: zext i16 [[Rdx]] to i32 |
| ; |
| define i16 @reduction_i16_2(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) { |
| diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll |
| index 27868480c23b..262236075f7c 100644 |
| --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll |
| +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll |
| @@ -29,7 +29,7 @@ |
| ; NEOVERSE-N2: LV: Vector loop of width vscale x 4 costs: 3 (assuming a minimum vscale of 1). |
| |
| ; VF-4: <4 x i32> |
| -; VF-VSCALE4: <vscale x 4 x i32> |
| +; VF-VSCALE4: <16 x i32> |
| define void @test0(i32* %a, i8* %b, i32* %c) #0 { |
| entry: |
| br label %loop |
| diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll |
| index 9bd9c31d32d3..1d2c70db11cf 100644 |
| --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll |
| +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll |
| @@ -9,9 +9,9 @@ |
| define void @test0(i32* %a, i8* %b, i32* %c) #0 { |
| ; CHECK: LV: Checking a loop in "test0" |
| ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4 |
| -; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 4 |
| +; CHECK_SCALABLE_ON: LV: Selecting VF: 16 |
| ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF |
| -; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4 |
| +; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16 |
| ; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 16 |
| ; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: vscale x 16 |
| entry: |
| @@ -40,9 +40,9 @@ exit: |
| define void @test1(i32* %a, i8* %b) #0 { |
| ; CHECK: LV: Checking a loop in "test1" |
| ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4 |
| -; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 4 |
| +; CHECK_SCALABLE_ON: LV: Selecting VF: 16 |
| ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF |
| -; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4 |
| +; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16 |
| ; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 4 |
| ; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: 16 |
| entry: |
| @@ -72,9 +72,9 @@ exit: |
| define void @test2(i32* %a, i8* %b) #0 { |
| ; CHECK: LV: Checking a loop in "test2" |
| ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 2 |
| -; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 2 |
| +; CHECK_SCALABLE_ON: LV: Selecting VF: 16 |
| ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF |
| -; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4 |
| +; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16 |
| ; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 2 |
| ; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: 16 |
| entry: |
| @@ -104,9 +104,9 @@ exit: |
| define void @test3(i32* %a, i8* %b) #0 { |
| ; CHECK: LV: Checking a loop in "test3" |
| ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 1 |
| -; CHECK_SCALABLE_ON: LV: Selecting VF: 4 |
| +; CHECK_SCALABLE_ON: LV: Selecting VF: 16 |
| ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF |
| -; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4 |
| +; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16 |
| ; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 1 |
| ; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: 16 |
| entry: |
| diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-illegal-type.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-illegal-type.ll |
| index 4d0886f4d953..43ef43c11507 100644 |
| --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-illegal-type.ll |
| +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-illegal-type.ll |
| @@ -83,11 +83,11 @@ for.end: |
| define void @uniform_store_i1(i1* noalias %dst, i64* noalias %start, i64 %N) { |
| ; CHECK-LABEL: @uniform_store_i1 |
| ; CHECK: vector.body |
| -; CHECK: %[[GEP:.*]] = getelementptr inbounds i64, <2 x i64*> {{.*}}, i64 1 |
| -; CHECK: %[[ICMP:.*]] = icmp eq <2 x i64*> %[[GEP]], %[[SPLAT:.*]] |
| -; CHECK: %[[EXTRACT1:.*]] = extractelement <2 x i1> %[[ICMP]], i32 0 |
| +; CHECK: %[[GEP:.*]] = getelementptr inbounds i64, <64 x i64*> {{.*}}, i64 1 |
| +; CHECK: %[[ICMP:.*]] = icmp eq <64 x i64*> %[[GEP]], %[[SPLAT:.*]] |
| +; CHECK: %[[EXTRACT1:.*]] = extractelement <64 x i1> %[[ICMP]], i32 0 |
| ; CHECK: store i1 %[[EXTRACT1]], i1* %dst |
| -; CHECK: %[[EXTRACT2:.*]] = extractelement <2 x i1> %[[ICMP]], i32 1 |
| +; CHECK: %[[EXTRACT2:.*]] = extractelement <64 x i1> %[[ICMP]], i32 1 |
| ; CHECK: store i1 %[[EXTRACT2]], i1* %dst |
| ; CHECK-NOT: vscale |
| entry: |
| diff --git a/llvm/test/Transforms/LoopVectorize/X86/i1-reg-usage.ll b/llvm/test/Transforms/LoopVectorize/X86/i1-reg-usage.ll |
| new file mode 100644 |
| index 000000000000..4cab716c7544 |
| --- /dev/null |
| +++ b/llvm/test/Transforms/LoopVectorize/X86/i1-reg-usage.ll |
| @@ -0,0 +1,32 @@ |
| +; RUN: opt -loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 < %s | FileCheck %s |
| +; REQUIRES: asserts |
| + |
| +target triple = "x86_64" |
| + |
| +; Test that shows how many registers the loop vectorizer thinks an illegal <VF x i1> will consume. |
| + |
| +; CHECK-LABEL: LV: Checking a loop in 'or_reduction_avx' from <stdin> |
| +; CHECK: LV(REG): VF = 64 |
| +; CHECK-NEXT: LV(REG): Found max usage: 2 item |
| +; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 136 registers |
| +; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers |
| + |
| +define i1 @or_reduction_avx(i32 %arg, ptr %ptr) "target-features"="+avx" { |
| +entry: |
| + br label %loop |
| +exit: |
| + ret i1 %reduction_next |
| +loop: |
| + %induction = phi i32 [ 0, %entry ], [ %induction_next, %loop ] |
| + %reduction = phi i1 [ 0, %entry ], [ %reduction_next, %loop ] |
| + %gep = getelementptr inbounds i32, ptr %ptr, i32 %induction |
| + %loaded = load i32, ptr %gep |
| + %i1 = icmp eq i32 %loaded, %induction |
| + %reduction_next = or i1 %i1, %reduction |
| + %induction_next = add nuw i32 %induction, 1 |
| + %cond = icmp eq i32 %induction_next, %arg |
| + br i1 %cond, label %exit, label %loop, !llvm.loop !64 |
| +} |
| + |
| +!64 = distinct !{!64, !65} |
| +!65 = !{!"llvm.loop.vectorize.width", i32 64} |